//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation ------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that X86 uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//
#include "X86ISelLowering.h"
#include "Utils/X86ShuffleDecode.h"
#include "X86CallingConv.h"
#include "X86FrameLowering.h"
#include "X86InstrBuilder.h"
#include "X86MachineFunctionInfo.h"
#include "X86TargetMachine.h"
#include "X86TargetObjectFile.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/VariadicFunction.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetOptions.h"
#include "X86IntrinsicsInfo.h"

using namespace llvm;
#define DEBUG_TYPE "x86-isel"

STATISTIC(NumTailCalls, "Number of tail calls");

static cl::opt<bool> ExperimentalVectorWideningLegalization(
    "x86-experimental-vector-widening-legalization", cl::init(false),
    cl::desc("Enable an experimental vector type legalization through widening "
             "rather than promotion."),
    cl::Hidden);

static cl::opt<bool> ExperimentalVectorShuffleLowering(
    "x86-experimental-vector-shuffle-lowering", cl::init(true),
    cl::desc("Enable an experimental vector shuffle lowering code path."),
    cl::Hidden);

static cl::opt<bool> ExperimentalVectorShuffleLegality(
    "x86-experimental-vector-shuffle-legality", cl::init(false),
    cl::desc("Enable experimental shuffle legality based on the experimental "
             "shuffle lowering. Should only be used with the experimental "
             "shuffle lowering."),
    cl::Hidden);

static cl::opt<int> ReciprocalEstimateRefinementSteps(
    "x86-recip-refinement-steps", cl::init(1),
    cl::desc("Specify the number of Newton-Raphson iterations applied to the "
             "result of the hardware reciprocal estimate instruction."),
    cl::NotHidden);
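
// Note: one Newton-Raphson refinement step for a reciprocal estimate x ~= 1/d
// computes
//   x' = x * (2 - d * x),
// roughly doubling the number of correct bits per step, which is why a small
// default step count is normally enough on top of the hardware estimate.
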
// Forward declarations.
static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
                       SDValue V2);

static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal,
                                SelectionDAG &DAG, SDLoc dl,
                                unsigned vectorWidth) {
  assert((vectorWidth == 128 || vectorWidth == 256) &&
         "Unsupported vector width");
  EVT VT = Vec.getValueType();
  EVT ElVT = VT.getVectorElementType();
  unsigned Factor = VT.getSizeInBits()/vectorWidth;
  EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
                                  VT.getVectorNumElements()/Factor);

  // Extract from UNDEF is UNDEF.
  if (Vec.getOpcode() == ISD::UNDEF)
    return DAG.getUNDEF(ResultVT);

  // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR.
  unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();

  // This is the index of the first element of the vectorWidth-bit chunk
  // we want.
  unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / vectorWidth)
                               * ElemsPerChunk);

  // If the input is a buildvector just emit a smaller one.
  if (Vec.getOpcode() == ISD::BUILD_VECTOR)
    return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
                       makeArrayRef(Vec->op_begin() + NormalizedIdxVal,
                                    ElemsPerChunk));

  SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
}
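
// Worked example (illustrative): extracting the 128-bit chunk containing
// element 5 of a v8i32 input has IdxVal = 5, ElVT = i32 and
// ElemsPerChunk = 128/32 = 4, so
//   NormalizedIdxVal = ((5 * 32) / 128) * 4 = 1 * 4 = 4,
// i.e. the extract starts at element 4, the first element of the upper half.
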
/// Generate a DAG to grab 128-bits from a vector > 128 bits.  This
/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
/// instructions or a simple subregister reference. Idx is an index in the
/// 128 bits we want.  It need not be aligned to a 128-bit boundary.  That
/// makes lowering EXTRACT_VECTOR_ELT operations easier.
static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
                                   SelectionDAG &DAG, SDLoc dl) {
  assert((Vec.getValueType().is256BitVector() ||
          Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
  return ExtractSubVector(Vec, IdxVal, DAG, dl, 128);
}

/// Generate a DAG to grab 256-bits from a 512-bit vector.
static SDValue Extract256BitVector(SDValue Vec, unsigned IdxVal,
                                   SelectionDAG &DAG, SDLoc dl) {
  assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
  return ExtractSubVector(Vec, IdxVal, DAG, dl, 256);
}
static SDValue InsertSubVector(SDValue Result, SDValue Vec,
                               unsigned IdxVal, SelectionDAG &DAG,
                               SDLoc dl, unsigned vectorWidth) {
  assert((vectorWidth == 128 || vectorWidth == 256) &&
         "Unsupported vector width");
  // Inserting UNDEF is Result.
  if (Vec.getOpcode() == ISD::UNDEF)
    return Result;
  EVT VT = Vec.getValueType();
  EVT ElVT = VT.getVectorElementType();
  EVT ResultVT = Result.getValueType();

  // Insert the relevant vectorWidth bits.
  unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();

  // This is the index of the first element of the vectorWidth-bit chunk
  // we want.
  unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/vectorWidth)
                               * ElemsPerChunk);

  SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
}
/// Generate a DAG to put 128-bits into a vector > 128 bits.  This
/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
/// simple superregister reference.  Idx is an index in the 128 bits
/// we want.  It need not be aligned to a 128-bit boundary.  That makes
/// lowering INSERT_VECTOR_ELT operations easier.
static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
                                  SelectionDAG &DAG, SDLoc dl) {
  assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
  return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
}

static SDValue Insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
                                  SelectionDAG &DAG, SDLoc dl) {
  assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
  return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
}
/// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128
/// instructions. This is used because creating CONCAT_VECTOR nodes of
/// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
/// large BUILD_VECTORS.
static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
                                   unsigned NumElems, SelectionDAG &DAG,
                                   SDLoc dl) {
  SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
  return Insert128BitVector(V, V2, NumElems/2, DAG, dl);
}
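
// Illustration: for two v4i32 inputs and VT = v8i32 this builds
//   INSERT_SUBVECTOR(INSERT_SUBVECTOR(undef:v8i32, V1, 0), V2, 4),
// which is the form matched by the VINSERTF128/VINSERTI128-style patterns
// mentioned above.
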
static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
                                   unsigned NumElems, SelectionDAG &DAG,
                                   SDLoc dl) {
  SDValue V = Insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
  return Insert256BitVector(V, V2, NumElems/2, DAG, dl);
}
X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM)
    : TargetLowering(TM) {
  Subtarget = &TM.getSubtarget<X86Subtarget>();
  X86ScalarSSEf64 = Subtarget->hasSSE2();
  X86ScalarSSEf32 = Subtarget->hasSSE1();
  TD = getDataLayout();

  // Set up the TargetLowering object.
  static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };

  // X86 is weird. It always uses i8 for shift amounts and setcc results.
  setBooleanContents(ZeroOrOneBooleanContent);
  // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
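  // Concretely: a scalar i8 SETCC result is either 0 or 1, while a vector
  // compare such as PCMPEQD produces all-ones (-1) or all-zeros in each lane,
  // which is exactly what the two contents settings above describe.
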
  // For 64-bit, since we have so many registers, use the ILP scheduler.
  // For 32-bit, use the register pressure specific scheduling.
  // For Atom, always use ILP scheduling.
  if (Subtarget->isAtom())
    setSchedulingPreference(Sched::ILP);
  else if (Subtarget->is64Bit())
    setSchedulingPreference(Sched::ILP);
  else
    setSchedulingPreference(Sched::RegPressure);
  const X86RegisterInfo *RegInfo =
      TM.getSubtarget<X86Subtarget>().getRegisterInfo();
  setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());

  // Bypass expensive divides on Atom when compiling with O2.
  if (TM.getOptLevel() >= CodeGenOpt::Default) {
    if (Subtarget->hasSlowDivide32())
      addBypassSlowDiv(32, 8);
    if (Subtarget->hasSlowDivide64() && Subtarget->is64Bit())
      addBypassSlowDiv(64, 16);
  }
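
  // For example, addBypassSlowDiv(32, 8) asks the slow-division bypass pass
  // to emit a run-time check and use an 8-bit divide when both operands of a
  // 32-bit divide actually fit in 8 bits, which is much cheaper on Atom.
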
  if (Subtarget->isTargetKnownWindowsMSVC()) {
    // Setup Windows compiler runtime calls.
    setLibcallName(RTLIB::SDIV_I64, "_alldiv");
    setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
    setLibcallName(RTLIB::SREM_I64, "_allrem");
    setLibcallName(RTLIB::UREM_I64, "_aullrem");
    setLibcallName(RTLIB::MUL_I64, "_allmul");
    setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);

    // The _ftol2 runtime function has an unusual calling conv, which
    // is modeled by a special pseudo-instruction.
    setLibcallName(RTLIB::FPTOUINT_F64_I64, nullptr);
    setLibcallName(RTLIB::FPTOUINT_F32_I64, nullptr);
    setLibcallName(RTLIB::FPTOUINT_F64_I32, nullptr);
    setLibcallName(RTLIB::FPTOUINT_F32_I32, nullptr);
  }

  if (Subtarget->isTargetDarwin()) {
    // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
    setUseUnderscoreSetJmp(false);
    setUseUnderscoreLongJmp(false);
  } else if (Subtarget->isTargetWindowsGNU()) {
    // MS runtime is weird: it exports _setjmp, but longjmp!
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(false);
  } else {
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(true);
  }
  // Set up the register classes.
  addRegisterClass(MVT::i8, &X86::GR8RegClass);
  addRegisterClass(MVT::i16, &X86::GR16RegClass);
  addRegisterClass(MVT::i32, &X86::GR32RegClass);
  if (Subtarget->is64Bit())
    addRegisterClass(MVT::i64, &X86::GR64RegClass);

  for (MVT VT : MVT::integer_valuetypes())
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);

  // We don't accept any truncstore of integer registers.
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
  setTruncStoreAction(MVT::i64, MVT::i8, Expand);
  setTruncStoreAction(MVT::i32, MVT::i16, Expand);
  setTruncStoreAction(MVT::i32, MVT::i8, Expand);
  setTruncStoreAction(MVT::i16, MVT::i8, Expand);

  setTruncStoreAction(MVT::f64, MVT::f32, Expand);

  // SETOEQ and SETUNE require checking two conditions.
  setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
  // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
  // operation.
  setOperationAction(ISD::UINT_TO_FP, MVT::i1, Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);

  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Promote);
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
  } else if (!TM.Options.UseSoftFloat) {
    // We have an algorithm for SSE2->double, and we turn this into a
    // 64-bit FILD followed by conditional FADD for other targets.
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
    // We have an algorithm for SSE2, and we turn this into a 64-bit
    // FILD for other targets.
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
  }

  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::SINT_TO_FP, MVT::i1, Promote);
  setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);
  if (!TM.Options.UseSoftFloat) {
    // SSE has no i16 to fp conversion, only i32.
    if (X86ScalarSSEf32) {
      setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
      // f32 and f64 cases are Legal, f80 case is not.
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    } else {
      setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    }
  } else {
    setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Promote);
  }

  // In 32-bit mode these are custom lowered.  In 64-bit mode F32 and F64
  // are Legal, f80 is custom lowered.
  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTs, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::FP_TO_SINT, MVT::i1, Promote);
  setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);

  if (X86ScalarSSEf32) {
    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
    // f32 and f64 cases are Legal, f80 case is not.
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
  } else {
    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
  }

  // Handle FP_TO_UINT by promoting the destination to a larger signed
  // conversion.
  setOperationAction(ISD::FP_TO_UINT, MVT::i1, Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
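
  // For example, an f64 -> i16 unsigned conversion is performed as a signed
  // conversion to a wider integer and then truncated: every u16 value is
  // exactly representable as a non-negative i32, so no information is lost.
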
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Promote);
  } else if (!TM.Options.UseSoftFloat) {
    // Since AVX is a superset of SSE3, only check for SSE here.
    if (Subtarget->hasSSE1() && !Subtarget->hasSSE3())
      // Expand FP_TO_UINT into a select.
      // FIXME: We would like to use a Custom expander here eventually to do
      // the optimal thing for SSE vs. the default expansion in the legalizer.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
    else
      // With SSE3 we can use fisttpll to convert to a signed i64; without
      // SSE, we're stuck with a fistpll.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
  }

  if (isTargetFTOL()) {
    // Use the _ftol2 runtime function, which has a pseudo-instruction
    // to handle its weird calling convention.
    setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
  }
  // TODO: when we have SSE, these could be more efficient, by using movd/movq.
  if (!X86ScalarSSEf64) {
    setOperationAction(ISD::BITCAST, MVT::f32, Expand);
    setOperationAction(ISD::BITCAST, MVT::i32, Expand);
    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::BITCAST, MVT::f64, Expand);
      // Without SSE, i64->f64 goes through memory.
      setOperationAction(ISD::BITCAST, MVT::i64, Expand);
    }
  }

  // Scalar integer divide and remainder are lowered to use operations that
  // produce two results, to match the available instructions. This exposes
  // the two-result form to trivial CSE, which is able to combine x/y and x%y
  // into a single instruction.

  // Scalar integer multiply-high is also lowered to use two-result
  // operations, to match the available instructions. However, plain multiply
  // (low) operations are left as Legal, as there are single-result
  // instructions for this in x86. Using the two-result multiply instructions
  // when both high and low results are needed must be arranged by dagcombine.
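  //
  // Concretely, the x86 DIV/IDIV instructions already produce both results at
  // once (quotient in EAX/RAX, remainder in EDX/RDX), so
  //   unsigned q = x / y;  unsigned r = x % y;
  // can be satisfied by a single divide once both uses are CSE'd onto the
  // two-result node.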
  for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
    MVT VT = IntVTs[i];
    setOperationAction(ISD::MULHS, VT, Expand);
    setOperationAction(ISD::MULHU, VT, Expand);
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);

    // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences.
    setOperationAction(ISD::ADDC, VT, Custom);
    setOperationAction(ISD::ADDE, VT, Custom);
    setOperationAction(ISD::SUBC, VT, Custom);
    setOperationAction(ISD::SUBE, VT, Custom);
  }
  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  setOperationAction(ISD::BR_CC, MVT::f32, Expand);
  setOperationAction(ISD::BR_CC, MVT::f64, Expand);
  setOperationAction(ISD::BR_CC, MVT::f80, Expand);
  setOperationAction(ISD::BR_CC, MVT::i8, Expand);
  setOperationAction(ISD::BR_CC, MVT::i16, Expand);
  setOperationAction(ISD::BR_CC, MVT::i32, Expand);
  setOperationAction(ISD::BR_CC, MVT::i64, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::f80, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::i8, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::i16, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
  setOperationAction(ISD::FP_ROUND_INREG, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f64, Expand);
  setOperationAction(ISD::FREM, MVT::f80, Expand);
  setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);
  // Promote the i8 variants and force them on up to i32 which has a shorter
  // encoding.
  setOperationAction(ISD::CTTZ, MVT::i8, Promote);
  AddPromotedToType(ISD::CTTZ, MVT::i8, MVT::i32);
  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i8, Promote);
  AddPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8, MVT::i32);
  if (Subtarget->hasBMI()) {
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Expand);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand);
    if (Subtarget->is64Bit())
      setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
  } else {
    setOperationAction(ISD::CTTZ, MVT::i16, Custom);
    setOperationAction(ISD::CTTZ, MVT::i32, Custom);
    if (Subtarget->is64Bit())
      setOperationAction(ISD::CTTZ, MVT::i64, Custom);
  }
  if (Subtarget->hasLZCNT()) {
    // When promoting the i8 variants, force them to i32 for a shorter
    // encoding.
    setOperationAction(ISD::CTLZ, MVT::i8, Promote);
    AddPromotedToType(ISD::CTLZ, MVT::i8, MVT::i32);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8, Promote);
    AddPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8, MVT::i32);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Expand);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand);
    if (Subtarget->is64Bit())
      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
  } else {
    setOperationAction(ISD::CTLZ, MVT::i8, Custom);
    setOperationAction(ISD::CTLZ, MVT::i16, Custom);
    setOperationAction(ISD::CTLZ, MVT::i32, Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8, Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);
    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::CTLZ, MVT::i64, Custom);
      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
    }
  }
  // Special handling for half-precision floating point conversions.
  // If we don't have F16C support, then lower half float conversions
  // into library calls.
  if (TM.Options.UseSoftFloat || !Subtarget->hasF16C()) {
    setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
    setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
  }

  // There's never any support for operations beyond MVT::f32.
  setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
  setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
  setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
  setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);

  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
  setTruncStoreAction(MVT::f80, MVT::f16, Expand);
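
  // Note: when F16C is available the f32 <-> f16 conversions keep their
  // default action and can be selected to VCVTPH2PS / VCVTPS2PH; the Expand
  // entries above are what turn them into library calls otherwise.
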
  if (Subtarget->hasPOPCNT()) {
    setOperationAction(ISD::CTPOP, MVT::i8, Promote);
  } else {
    setOperationAction(ISD::CTPOP, MVT::i8, Expand);
    setOperationAction(ISD::CTPOP, MVT::i16, Expand);
    setOperationAction(ISD::CTPOP, MVT::i32, Expand);
    if (Subtarget->is64Bit())
      setOperationAction(ISD::CTPOP, MVT::i64, Expand);
  }

  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);

  if (!Subtarget->hasMOVBE())
    setOperationAction(ISD::BSWAP, MVT::i16, Expand);
  // These should be promoted to a larger select which is supported.
  setOperationAction(ISD::SELECT, MVT::i1, Promote);
  // X86 wants to expand cmov itself.
  setOperationAction(ISD::SELECT, MVT::i8, Custom);
  setOperationAction(ISD::SELECT, MVT::i16, Custom);
  setOperationAction(ISD::SELECT, MVT::i32, Custom);
  setOperationAction(ISD::SELECT, MVT::f32, Custom);
  setOperationAction(ISD::SELECT, MVT::f64, Custom);
  setOperationAction(ISD::SELECT, MVT::f80, Custom);
  setOperationAction(ISD::SETCC, MVT::i8, Custom);
  setOperationAction(ISD::SETCC, MVT::i16, Custom);
  setOperationAction(ISD::SETCC, MVT::i32, Custom);
  setOperationAction(ISD::SETCC, MVT::f32, Custom);
  setOperationAction(ISD::SETCC, MVT::f64, Custom);
  setOperationAction(ISD::SETCC, MVT::f80, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SELECT, MVT::i64, Custom);
    setOperationAction(ISD::SETCC, MVT::i64, Custom);
  }
  setOperationAction(ISD::EH_RETURN, MVT::Other, Custom);
  // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support
  // SjLj exception handling but a light-weight setjmp/longjmp replacement to
  // support continuation, user-level threading, etc. As a result, no other
  // SjLj exception interfaces are implemented; please don't build your own
  // exception handling based on them.
  // LLVM/Clang supports zero-cost DWARF exception handling.
  setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
  setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
  setOperationAction(ISD::ConstantPool, MVT::i32, Custom);
  setOperationAction(ISD::JumpTable, MVT::i32, Custom);
  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
  setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
  setOperationAction(ISD::ExternalSymbol, MVT::i32, Custom);
  setOperationAction(ISD::BlockAddress, MVT::i32, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
    setOperationAction(ISD::JumpTable, MVT::i64, Custom);
    setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
    setOperationAction(ISD::ExternalSymbol, MVT::i64, Custom);
    setOperationAction(ISD::BlockAddress, MVT::i64, Custom);
  }
  // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
  setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
    setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
    setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);
  }

  if (Subtarget->hasSSE1())
    setOperationAction(ISD::PREFETCH, MVT::Other, Legal);

  setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
  // Expand certain atomics.
  for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
    MVT VT = IntVTs[i];
    setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
    setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
  }

  if (Subtarget->hasCmpxchg16b()) {
    setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
  }

  // FIXME - use subtarget debug flags
  if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetELF() &&
      !Subtarget->isTargetCygMing() && !Subtarget->isTargetWin64()) {
    setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
  }

  if (Subtarget->is64Bit()) {
    setExceptionPointerRegister(X86::RAX);
    setExceptionSelectorRegister(X86::RDX);
  } else {
    setExceptionPointerRegister(X86::EAX);
    setExceptionSelectorRegister(X86::EDX);
  }
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);

  setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
  setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);

  setOperationAction(ISD::TRAP, MVT::Other, Legal);
  setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
  // VASTART needs to be custom lowered to use the VarArgsFrameIndex.
  setOperationAction(ISD::VASTART, MVT::Other, Custom);
  setOperationAction(ISD::VAEND, MVT::Other, Expand);
  if (Subtarget->is64Bit() && !Subtarget->isTargetWin64()) {
    // TargetInfo::X86_64ABIBuiltinVaList
    setOperationAction(ISD::VAARG, MVT::Other, Custom);
    setOperationAction(ISD::VACOPY, MVT::Other, Custom);
  } else {
    // TargetInfo::CharPtrBuiltinVaList
    setOperationAction(ISD::VAARG, MVT::Other, Expand);
    setOperationAction(ISD::VACOPY, MVT::Other, Expand);
  }

  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);

  setOperationAction(ISD::DYNAMIC_STACKALLOC, getPointerTy(), Custom);
  if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) {
    // f32 and f64 use SSE.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, &X86::FR32RegClass);
    addRegisterClass(MVT::f64, &X86::FR64RegClass);

    // Use ANDPD to simulate FABS.
    setOperationAction(ISD::FABS, MVT::f64, Custom);
    setOperationAction(ISD::FABS, MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG, MVT::f64, Custom);
    setOperationAction(ISD::FNEG, MVT::f32, Custom);

    // Use ANDPD and ORPD to simulate FCOPYSIGN.
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
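
    // These all reduce to sign-bit arithmetic: for f64, fabs(x) clears the
    // top bit (x & 0x7FFFFFFFFFFFFFFF), fneg(x) flips it
    // (x ^ 0x8000000000000000), and copysign(x, y) combines
    // (x & 0x7FFF...) | (y & 0x8000...), which the ANDPD/XORPD/ORPD sequences
    // mentioned above implement with bitmask constants.
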
    // Lower this to FGETSIGNx86 plus an AND.
    setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
    setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);

    // We don't support sin/cos/fmod.
    setOperationAction(ISD::FSIN, MVT::f64, Expand);
    setOperationAction(ISD::FCOS, MVT::f64, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
    setOperationAction(ISD::FSIN, MVT::f32, Expand);
    setOperationAction(ISD::FCOS, MVT::f32, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f32, Expand);

    // Expand FP immediates into loads from the stack, except for the special
    // cases we handle.
    addLegalFPImmediate(APFloat(+0.0));  // xorpd
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
  } else if (!TM.Options.UseSoftFloat && X86ScalarSSEf32) {
    // Use SSE for f32, x87 for f64.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, &X86::FR32RegClass);
    addRegisterClass(MVT::f64, &X86::RFP64RegClass);

    // Use ANDPS to simulate FABS.
    setOperationAction(ISD::FABS, MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG, MVT::f32, Custom);

    setOperationAction(ISD::UNDEF, MVT::f64, Expand);

    // Use ANDPS and ORPS to simulate FCOPYSIGN.
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // We don't support sin/cos/fmod.
    setOperationAction(ISD::FSIN, MVT::f32, Expand);
    setOperationAction(ISD::FCOS, MVT::f32, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f32, Expand);

    // Special cases we handle for FP constants.
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
    addLegalFPImmediate(APFloat(+0.0));  // FLD0
    addLegalFPImmediate(APFloat(+1.0));  // FLD1
    addLegalFPImmediate(APFloat(-0.0));  // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0));  // FLD1/FCHS

    if (!TM.Options.UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f64, Expand);
      setOperationAction(ISD::FCOS, MVT::f64, Expand);
      setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
    }
  } else if (!TM.Options.UseSoftFloat) {
    // f32 and f64 in x87.
    // Set up the FP register classes.
    addRegisterClass(MVT::f64, &X86::RFP64RegClass);
    addRegisterClass(MVT::f32, &X86::RFP32RegClass);

    setOperationAction(ISD::UNDEF, MVT::f64, Expand);
    setOperationAction(ISD::UNDEF, MVT::f32, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);

    if (!TM.Options.UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f64, Expand);
      setOperationAction(ISD::FSIN, MVT::f32, Expand);
      setOperationAction(ISD::FCOS, MVT::f64, Expand);
      setOperationAction(ISD::FCOS, MVT::f32, Expand);
      setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
      setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
    }
    addLegalFPImmediate(APFloat(+0.0));  // FLD0
    addLegalFPImmediate(APFloat(+1.0));  // FLD1
    addLegalFPImmediate(APFloat(-0.0));  // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0));  // FLD1/FCHS
    addLegalFPImmediate(APFloat(+0.0f)); // FLD0
    addLegalFPImmediate(APFloat(+1.0f)); // FLD1
    addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
  }
  // We don't support FMA.
  setOperationAction(ISD::FMA, MVT::f64, Expand);
  setOperationAction(ISD::FMA, MVT::f32, Expand);

  // Long double always uses X87.
  if (!TM.Options.UseSoftFloat) {
    addRegisterClass(MVT::f80, &X86::RFP80RegClass);
    setOperationAction(ISD::UNDEF, MVT::f80, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
    {
      APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended);
      addLegalFPImmediate(TmpFlt);  // FLD0
      TmpFlt.changeSign();
      addLegalFPImmediate(TmpFlt);  // FLD0/FCHS

      bool ignored;
      APFloat TmpFlt2(+1.0);
      TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
                      &ignored);
      addLegalFPImmediate(TmpFlt2);  // FLD1
      TmpFlt2.changeSign();
      addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
    }

    if (!TM.Options.UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f80, Expand);
      setOperationAction(ISD::FCOS, MVT::f80, Expand);
      setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
    }

    setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
    setOperationAction(ISD::FCEIL, MVT::f80, Expand);
    setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
    setOperationAction(ISD::FRINT, MVT::f80, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
    setOperationAction(ISD::FMA, MVT::f80, Expand);
  }
  // Always use a library call for pow.
  setOperationAction(ISD::FPOW, MVT::f32, Expand);
  setOperationAction(ISD::FPOW, MVT::f64, Expand);
  setOperationAction(ISD::FPOW, MVT::f80, Expand);

  setOperationAction(ISD::FLOG, MVT::f80, Expand);
  setOperationAction(ISD::FLOG2, MVT::f80, Expand);
  setOperationAction(ISD::FLOG10, MVT::f80, Expand);
  setOperationAction(ISD::FEXP, MVT::f80, Expand);
  setOperationAction(ISD::FEXP2, MVT::f80, Expand);
  setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
  setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
  // First set operation action for all vector types to either promote
  // (for widening) or expand (for scalarization). Then we will selectively
  // turn on ones that can be effectively codegen'd.
  for (MVT VT : MVT::vector_valuetypes()) {
    setOperationAction(ISD::ADD, VT, Expand);
    setOperationAction(ISD::SUB, VT, Expand);
    setOperationAction(ISD::FADD, VT, Expand);
    setOperationAction(ISD::FNEG, VT, Expand);
    setOperationAction(ISD::FSUB, VT, Expand);
    setOperationAction(ISD::MUL, VT, Expand);
    setOperationAction(ISD::FMUL, VT, Expand);
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::FDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);
    setOperationAction(ISD::LOAD, VT, Expand);
    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Expand);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
    setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Expand);
    setOperationAction(ISD::INSERT_SUBVECTOR, VT, Expand);
    setOperationAction(ISD::FABS, VT, Expand);
    setOperationAction(ISD::FSIN, VT, Expand);
    setOperationAction(ISD::FSINCOS, VT, Expand);
    setOperationAction(ISD::FCOS, VT, Expand);
    setOperationAction(ISD::FSINCOS, VT, Expand);
    setOperationAction(ISD::FREM, VT, Expand);
    setOperationAction(ISD::FMA, VT, Expand);
    setOperationAction(ISD::FPOWI, VT, Expand);
    setOperationAction(ISD::FSQRT, VT, Expand);
    setOperationAction(ISD::FCOPYSIGN, VT, Expand);
    setOperationAction(ISD::FFLOOR, VT, Expand);
    setOperationAction(ISD::FCEIL, VT, Expand);
    setOperationAction(ISD::FTRUNC, VT, Expand);
    setOperationAction(ISD::FRINT, VT, Expand);
    setOperationAction(ISD::FNEARBYINT, VT, Expand);
    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
    setOperationAction(ISD::MULHS, VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, VT, Expand);
    setOperationAction(ISD::MULHU, VT, Expand);
    setOperationAction(ISD::SDIVREM, VT, Expand);
    setOperationAction(ISD::UDIVREM, VT, Expand);
    setOperationAction(ISD::FPOW, VT, Expand);
    setOperationAction(ISD::CTPOP, VT, Expand);
    setOperationAction(ISD::CTTZ, VT, Expand);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
    setOperationAction(ISD::CTLZ, VT, Expand);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
    setOperationAction(ISD::SHL, VT, Expand);
    setOperationAction(ISD::SRA, VT, Expand);
    setOperationAction(ISD::SRL, VT, Expand);
    setOperationAction(ISD::ROTL, VT, Expand);
    setOperationAction(ISD::ROTR, VT, Expand);
    setOperationAction(ISD::BSWAP, VT, Expand);
    setOperationAction(ISD::SETCC, VT, Expand);
    setOperationAction(ISD::FLOG, VT, Expand);
    setOperationAction(ISD::FLOG2, VT, Expand);
    setOperationAction(ISD::FLOG10, VT, Expand);
    setOperationAction(ISD::FEXP, VT, Expand);
    setOperationAction(ISD::FEXP2, VT, Expand);
    setOperationAction(ISD::FP_TO_UINT, VT, Expand);
    setOperationAction(ISD::FP_TO_SINT, VT, Expand);
    setOperationAction(ISD::UINT_TO_FP, VT, Expand);
    setOperationAction(ISD::SINT_TO_FP, VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
    setOperationAction(ISD::TRUNCATE, VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
    setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
    setOperationAction(ISD::ANY_EXTEND, VT, Expand);
    setOperationAction(ISD::VSELECT, VT, Expand);
    setOperationAction(ISD::SELECT_CC, VT, Expand);
    for (MVT InnerVT : MVT::vector_valuetypes()) {
      setTruncStoreAction(InnerVT, VT, Expand);

      setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
      setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);

      // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
      // types; we have to deal with them whether we ask for Expansion or not.
      // Setting Expand causes its own optimisation problems though, so leave
      // them alone.
      if (VT.getVectorElementType() == MVT::i1)
        setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
    }
  }
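
  // The pattern here is "Expand everything, then re-enable selectively": for
  // example ISD::ADD on v4i32 starts out Expand in the loop above and is
  // flipped back to Legal further down once the SSE2 register class for
  // v4i32 has been added.
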
  // FIXME: In order to prevent SSE instructions being expanded to MMX ones
  // with -msoft-float, disable use of MMX as well.
  if (!TM.Options.UseSoftFloat && Subtarget->hasMMX()) {
    addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
    // No operations on x86mmx supported, everything uses intrinsics.
  }
  // MMX-sized vectors (other than x86mmx) are expected to be expanded
  // into smaller operations.
  setOperationAction(ISD::MULHS, MVT::v8i8, Expand);
  setOperationAction(ISD::MULHS, MVT::v4i16, Expand);
  setOperationAction(ISD::MULHS, MVT::v2i32, Expand);
  setOperationAction(ISD::MULHS, MVT::v1i64, Expand);
  setOperationAction(ISD::AND, MVT::v8i8, Expand);
  setOperationAction(ISD::AND, MVT::v4i16, Expand);
  setOperationAction(ISD::AND, MVT::v2i32, Expand);
  setOperationAction(ISD::AND, MVT::v1i64, Expand);
  setOperationAction(ISD::OR, MVT::v8i8, Expand);
  setOperationAction(ISD::OR, MVT::v4i16, Expand);
  setOperationAction(ISD::OR, MVT::v2i32, Expand);
  setOperationAction(ISD::OR, MVT::v1i64, Expand);
  setOperationAction(ISD::XOR, MVT::v8i8, Expand);
  setOperationAction(ISD::XOR, MVT::v4i16, Expand);
  setOperationAction(ISD::XOR, MVT::v2i32, Expand);
  setOperationAction(ISD::XOR, MVT::v1i64, Expand);
  setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i8, Expand);
  setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i16, Expand);
  setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2i32, Expand);
  setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v1i64, Expand);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v1i64, Expand);
  setOperationAction(ISD::SELECT, MVT::v8i8, Expand);
  setOperationAction(ISD::SELECT, MVT::v4i16, Expand);
  setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
  setOperationAction(ISD::SELECT, MVT::v1i64, Expand);
  setOperationAction(ISD::BITCAST, MVT::v8i8, Expand);
  setOperationAction(ISD::BITCAST, MVT::v4i16, Expand);
  setOperationAction(ISD::BITCAST, MVT::v2i32, Expand);
  setOperationAction(ISD::BITCAST, MVT::v1i64, Expand);
  if (!TM.Options.UseSoftFloat && Subtarget->hasSSE1()) {
    addRegisterClass(MVT::v4f32, &X86::VR128RegClass);

    setOperationAction(ISD::FADD, MVT::v4f32, Legal);
    setOperationAction(ISD::FSUB, MVT::v4f32, Legal);
    setOperationAction(ISD::FMUL, MVT::v4f32, Legal);
    setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
    setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);
    setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
    setOperationAction(ISD::FABS, MVT::v4f32, Custom);
    setOperationAction(ISD::LOAD, MVT::v4f32, Legal);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
    setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
  }
  if (!TM.Options.UseSoftFloat && Subtarget->hasSSE2()) {
    addRegisterClass(MVT::v2f64, &X86::VR128RegClass);

    // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
    // registers cannot be used even for integer operations.
    addRegisterClass(MVT::v16i8, &X86::VR128RegClass);
    addRegisterClass(MVT::v8i16, &X86::VR128RegClass);
    addRegisterClass(MVT::v4i32, &X86::VR128RegClass);
    addRegisterClass(MVT::v2i64, &X86::VR128RegClass);

    setOperationAction(ISD::ADD, MVT::v16i8, Legal);
    setOperationAction(ISD::ADD, MVT::v8i16, Legal);
    setOperationAction(ISD::ADD, MVT::v4i32, Legal);
    setOperationAction(ISD::ADD, MVT::v2i64, Legal);
    setOperationAction(ISD::MUL, MVT::v4i32, Custom);
    setOperationAction(ISD::MUL, MVT::v2i64, Custom);
    setOperationAction(ISD::UMUL_LOHI, MVT::v4i32, Custom);
    setOperationAction(ISD::SMUL_LOHI, MVT::v4i32, Custom);
    setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
    setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
    setOperationAction(ISD::SUB, MVT::v16i8, Legal);
    setOperationAction(ISD::SUB, MVT::v8i16, Legal);
    setOperationAction(ISD::SUB, MVT::v4i32, Legal);
    setOperationAction(ISD::SUB, MVT::v2i64, Legal);
    setOperationAction(ISD::MUL, MVT::v8i16, Legal);
    setOperationAction(ISD::FADD, MVT::v2f64, Legal);
    setOperationAction(ISD::FSUB, MVT::v2f64, Legal);
    setOperationAction(ISD::FMUL, MVT::v2f64, Legal);
    setOperationAction(ISD::FDIV, MVT::v2f64, Legal);
    setOperationAction(ISD::FSQRT, MVT::v2f64, Legal);
    setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
    setOperationAction(ISD::FABS, MVT::v2f64, Custom);

    setOperationAction(ISD::SETCC, MVT::v2i64, Custom);
    setOperationAction(ISD::SETCC, MVT::v16i8, Custom);
    setOperationAction(ISD::SETCC, MVT::v8i16, Custom);
    setOperationAction(ISD::SETCC, MVT::v4i32, Custom);

    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
    // Only provide customized ctpop vector bit twiddling for vector types we
    // know to perform better than using the popcnt instructions on each vector
    // element. If popcnt isn't supported, always provide the custom version.
    if (!Subtarget->hasPOPCNT()) {
      setOperationAction(ISD::CTPOP, MVT::v4i32, Custom);
      setOperationAction(ISD::CTPOP, MVT::v2i64, Custom);
    }
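
    // For reference, the classic bit-twiddling population count for one
    // 32-bit lane (the custom lowering referenced above applies this kind of
    // bit twiddling across all lanes at once) is:
    //   v = v - ((v >> 1) & 0x55555555);
    //   v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
    //   v = (((v + (v >> 4)) & 0x0F0F0F0F) * 0x01010101) >> 24;
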
    // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
    for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
      MVT VT = (MVT::SimpleValueType)i;
      // Do not attempt to custom lower non-power-of-2 vectors.
      if (!isPowerOf2_32(VT.getVectorNumElements()))
        continue;
      // Do not attempt to custom lower non-128-bit vectors.
      if (!VT.is128BitVector())
        continue;
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }
    // We support custom legalizing of sext and anyext loads for specific
    // memory vector types which we can load as a scalar (or sequence of
    // scalars) and extend in-register to a legal 128-bit vector type. For sext
    // loads these must work with a single scalar load.
    for (MVT VT : MVT::integer_vector_valuetypes()) {
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom);
    }
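
    // For instance, a sextload of v4i8 can be implemented as one 32-bit
    // scalar load that brings all four bytes into an XMM register, followed
    // by an in-register sign extension to the wider element type, rather
    // than four separate byte loads.
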
    setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);

    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
    }
    // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
    for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
      MVT VT = (MVT::SimpleValueType)i;

      // Do not attempt to promote non-128-bit vectors.
      if (!VT.is128BitVector())
        continue;

      setOperationAction(ISD::AND, VT, Promote);
      AddPromotedToType(ISD::AND, VT, MVT::v2i64);
      setOperationAction(ISD::OR, VT, Promote);
      AddPromotedToType(ISD::OR, VT, MVT::v2i64);
      setOperationAction(ISD::XOR, VT, Promote);
      AddPromotedToType(ISD::XOR, VT, MVT::v2i64);
      setOperationAction(ISD::LOAD, VT, Promote);
      AddPromotedToType(ISD::LOAD, VT, MVT::v2i64);
      setOperationAction(ISD::SELECT, VT, Promote);
      AddPromotedToType(ISD::SELECT, VT, MVT::v2i64);
    }
    // Custom lower v2i64 and v2f64 selects.
    setOperationAction(ISD::LOAD, MVT::v2f64, Legal);
    setOperationAction(ISD::LOAD, MVT::v2i64, Legal);
    setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v2i64, Custom);

    setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);

    setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
    // As there is no 64-bit GPR available, we need to build a special custom
    // sequence to convert from v2i32 to v2f32.
    if (!Subtarget->is64Bit())
      setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);

    setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
    setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);

    for (MVT VT : MVT::fp_vector_valuetypes())
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);

    setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
    setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
    setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);
  }
  if (!TM.Options.UseSoftFloat && Subtarget->hasSSE41()) {
    setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
    setOperationAction(ISD::FCEIL, MVT::f32, Legal);
    setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
    setOperationAction(ISD::FRINT, MVT::f32, Legal);
    setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
    setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
    setOperationAction(ISD::FCEIL, MVT::f64, Legal);
    setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
    setOperationAction(ISD::FRINT, MVT::f64, Legal);
    setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);

    setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal);
    setOperationAction(ISD::FCEIL, MVT::v4f32, Legal);
    setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal);
    setOperationAction(ISD::FRINT, MVT::v4f32, Legal);
    setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal);
    setOperationAction(ISD::FFLOOR, MVT::v2f64, Legal);
    setOperationAction(ISD::FCEIL, MVT::v2f64, Legal);
    setOperationAction(ISD::FTRUNC, MVT::v2f64, Legal);
    setOperationAction(ISD::FRINT, MVT::v2f64, Legal);
    setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Legal);

    // FIXME: Do we need to handle scalar-to-vector here?
    setOperationAction(ISD::MUL, MVT::v4i32, Legal);

    setOperationAction(ISD::VSELECT, MVT::v2f64, Custom);
    setOperationAction(ISD::VSELECT, MVT::v2i64, Custom);
    setOperationAction(ISD::VSELECT, MVT::v4i32, Custom);
    setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
    setOperationAction(ISD::VSELECT, MVT::v8i16, Custom);
    // There is no BLENDI for byte vectors. We don't need to custom lower
    // some vselects for now.
    setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);
    // SSE41 brings specific instructions for doing vector sign extend even in
    // cases where we don't have SRA.
    for (MVT VT : MVT::integer_vector_valuetypes()) {
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
    }

    // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X.
    setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, MVT::v8i8, Legal);
    setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Legal);
    setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i8, Legal);
    setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
    setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
    setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);

    setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i16, MVT::v8i8, Legal);
    setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Legal);
    setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i8, Legal);
    setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
    setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
    setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
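
    // As an example, the (v4i32 sextload v4i8) entry above is what allows a
    // sign-extending load of four bytes to be selected directly to PMOVSXBD
    // with a memory operand instead of a separate load plus extend.
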
    // i8 and i16 vectors are custom because the source register and source
    // memory operand types are not the same width. f32 vectors are
    // custom since the immediate controlling the insert encodes additional
    // information.
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);

    // FIXME: these should be Legal, but that's only for the case where
    // the index is constant.  For now custom expand to deal with that.
    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
    }
  }
  if (Subtarget->hasSSE2()) {
    setOperationAction(ISD::SRL, MVT::v8i16, Custom);
    setOperationAction(ISD::SRL, MVT::v16i8, Custom);

    setOperationAction(ISD::SHL, MVT::v8i16, Custom);
    setOperationAction(ISD::SHL, MVT::v16i8, Custom);

    setOperationAction(ISD::SRA, MVT::v8i16, Custom);
    setOperationAction(ISD::SRA, MVT::v16i8, Custom);

    // In the customized shift lowering, the legal cases in AVX2 will be
    // recognized.
    setOperationAction(ISD::SRL, MVT::v2i64, Custom);
    setOperationAction(ISD::SRL, MVT::v4i32, Custom);

    setOperationAction(ISD::SHL, MVT::v2i64, Custom);
    setOperationAction(ISD::SHL, MVT::v4i32, Custom);

    setOperationAction(ISD::SRA, MVT::v4i32, Custom);
  }
  if (!TM.Options.UseSoftFloat && Subtarget->hasFp256()) {
    addRegisterClass(MVT::v32i8, &X86::VR256RegClass);
    addRegisterClass(MVT::v16i16, &X86::VR256RegClass);
    addRegisterClass(MVT::v8i32, &X86::VR256RegClass);
    addRegisterClass(MVT::v8f32, &X86::VR256RegClass);
    addRegisterClass(MVT::v4i64, &X86::VR256RegClass);
    addRegisterClass(MVT::v4f64, &X86::VR256RegClass);

    setOperationAction(ISD::LOAD, MVT::v8f32, Legal);
    setOperationAction(ISD::LOAD, MVT::v4f64, Legal);
    setOperationAction(ISD::LOAD, MVT::v4i64, Legal);

    setOperationAction(ISD::FADD, MVT::v8f32, Legal);
    setOperationAction(ISD::FSUB, MVT::v8f32, Legal);
    setOperationAction(ISD::FMUL, MVT::v8f32, Legal);
    setOperationAction(ISD::FDIV, MVT::v8f32, Legal);
    setOperationAction(ISD::FSQRT, MVT::v8f32, Legal);
    setOperationAction(ISD::FFLOOR, MVT::v8f32, Legal);
    setOperationAction(ISD::FCEIL, MVT::v8f32, Legal);
    setOperationAction(ISD::FTRUNC, MVT::v8f32, Legal);
    setOperationAction(ISD::FRINT, MVT::v8f32, Legal);
    setOperationAction(ISD::FNEARBYINT, MVT::v8f32, Legal);
    setOperationAction(ISD::FNEG, MVT::v8f32, Custom);
    setOperationAction(ISD::FABS, MVT::v8f32, Custom);

    setOperationAction(ISD::FADD, MVT::v4f64, Legal);
    setOperationAction(ISD::FSUB, MVT::v4f64, Legal);
    setOperationAction(ISD::FMUL, MVT::v4f64, Legal);
    setOperationAction(ISD::FDIV, MVT::v4f64, Legal);
    setOperationAction(ISD::FSQRT, MVT::v4f64, Legal);
    setOperationAction(ISD::FFLOOR, MVT::v4f64, Legal);
    setOperationAction(ISD::FCEIL, MVT::v4f64, Legal);
    setOperationAction(ISD::FTRUNC, MVT::v4f64, Legal);
    setOperationAction(ISD::FRINT, MVT::v4f64, Legal);
    setOperationAction(ISD::FNEARBYINT, MVT::v4f64, Legal);
    setOperationAction(ISD::FNEG, MVT::v4f64, Custom);
    setOperationAction(ISD::FABS, MVT::v4f64, Custom);
    // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
    // even though v8i16 is a legal type.
    setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Promote);
    setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Promote);
    setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);

    setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
    setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal);

    setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);

    for (MVT VT : MVT::fp_vector_valuetypes())
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);

    setOperationAction(ISD::SRL, MVT::v16i16, Custom);
    setOperationAction(ISD::SRL, MVT::v32i8, Custom);

    setOperationAction(ISD::SHL, MVT::v16i16, Custom);
    setOperationAction(ISD::SHL, MVT::v32i8, Custom);

    setOperationAction(ISD::SRA, MVT::v16i16, Custom);
    setOperationAction(ISD::SRA, MVT::v32i8, Custom);

    setOperationAction(ISD::SETCC, MVT::v32i8, Custom);
    setOperationAction(ISD::SETCC, MVT::v16i16, Custom);
    setOperationAction(ISD::SETCC, MVT::v8i32, Custom);
    setOperationAction(ISD::SETCC, MVT::v4i64, Custom);

    setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
    setOperationAction(ISD::SELECT, MVT::v8f32, Custom);

    setOperationAction(ISD::VSELECT, MVT::v4f64, Custom);
    setOperationAction(ISD::VSELECT, MVT::v4i64, Custom);
    setOperationAction(ISD::VSELECT, MVT::v8i32, Custom);
    setOperationAction(ISD::VSELECT, MVT::v8f32, Custom);

    setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v4i64, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v16i16, Custom);
    setOperationAction(ISD::ANY_EXTEND, MVT::v4i64, Custom);
    setOperationAction(ISD::ANY_EXTEND, MVT::v8i32, Custom);
    setOperationAction(ISD::ANY_EXTEND, MVT::v16i16, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
    if (Subtarget->hasFMA() || Subtarget->hasFMA4()) {
      setOperationAction(ISD::FMA, MVT::v8f32, Legal);
      setOperationAction(ISD::FMA, MVT::v4f64, Legal);
      setOperationAction(ISD::FMA, MVT::v4f32, Legal);
      setOperationAction(ISD::FMA, MVT::v2f64, Legal);
      setOperationAction(ISD::FMA, MVT::f32, Legal);
      setOperationAction(ISD::FMA, MVT::f64, Legal);
    }
1276 if (Subtarget->hasInt256()) {
1277 setOperationAction(ISD::ADD, MVT::v4i64, Legal);
1278 setOperationAction(ISD::ADD, MVT::v8i32, Legal);
1279 setOperationAction(ISD::ADD, MVT::v16i16, Legal);
1280 setOperationAction(ISD::ADD, MVT::v32i8, Legal);
1282 setOperationAction(ISD::SUB, MVT::v4i64, Legal);
1283 setOperationAction(ISD::SUB, MVT::v8i32, Legal);
1284 setOperationAction(ISD::SUB, MVT::v16i16, Legal);
1285 setOperationAction(ISD::SUB, MVT::v32i8, Legal);
1287 setOperationAction(ISD::MUL, MVT::v4i64, Custom);
1288 setOperationAction(ISD::MUL, MVT::v8i32, Legal);
1289 setOperationAction(ISD::MUL, MVT::v16i16, Legal);
1290 // Don't lower v32i8 because there is no 128-bit byte mul
1292 setOperationAction(ISD::UMUL_LOHI, MVT::v8i32, Custom);
1293 setOperationAction(ISD::SMUL_LOHI, MVT::v8i32, Custom);
1294 setOperationAction(ISD::MULHU, MVT::v16i16, Legal);
1295 setOperationAction(ISD::MULHS, MVT::v16i16, Legal);
1297 setOperationAction(ISD::VSELECT, MVT::v16i16, Custom);
1298 setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
1300 // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1301 // when we have a 256-bit-wide blend with immediate.
1302 setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
1304 // Only provide customized ctpop vector bit twiddling for vector types we
1305 // know will perform better than using the popcnt instructions on each
1306 // vector element. If popcnt isn't supported, always provide the custom
1307 // version.
1308 if (!Subtarget->hasPOPCNT())
1309 setOperationAction(ISD::CTPOP, MVT::v4i64, Custom);
1311 // Custom CTPOP always performs better on natively supported v8i32
1312 setOperationAction(ISD::CTPOP, MVT::v8i32, Custom);
1314 // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1315 setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
1316 setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32, MVT::v8i8, Legal);
1317 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i8, Legal);
1318 setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32, MVT::v8i16, Legal);
1319 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i16, Legal);
1320 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i32, Legal);
1322 setLoadExtAction(ISD::ZEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
1323 setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32, MVT::v8i8, Legal);
1324 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i8, Legal);
1325 setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32, MVT::v8i16, Legal);
1326 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i16, Legal);
1327 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i32, Legal);
1329 setOperationAction(ISD::ADD, MVT::v4i64, Custom);
1330 setOperationAction(ISD::ADD, MVT::v8i32, Custom);
1331 setOperationAction(ISD::ADD, MVT::v16i16, Custom);
1332 setOperationAction(ISD::ADD, MVT::v32i8, Custom);
1334 setOperationAction(ISD::SUB, MVT::v4i64, Custom);
1335 setOperationAction(ISD::SUB, MVT::v8i32, Custom);
1336 setOperationAction(ISD::SUB, MVT::v16i16, Custom);
1337 setOperationAction(ISD::SUB, MVT::v32i8, Custom);
1339 setOperationAction(ISD::MUL, MVT::v4i64, Custom);
1340 setOperationAction(ISD::MUL, MVT::v8i32, Custom);
1341 setOperationAction(ISD::MUL, MVT::v16i16, Custom);
1342 // Don't lower v32i8 because there is no 128-bit byte mul
1345 // In the customized shift lowering, the legal cases in AVX2 will be
1346 // recognized.
1347 setOperationAction(ISD::SRL, MVT::v4i64, Custom);
1348 setOperationAction(ISD::SRL, MVT::v8i32, Custom);
1350 setOperationAction(ISD::SHL, MVT::v4i64, Custom);
1351 setOperationAction(ISD::SHL, MVT::v8i32, Custom);
1353 setOperationAction(ISD::SRA, MVT::v8i32, Custom);
1355 // Custom lower several nodes for 256-bit types.
1356 for (MVT VT : MVT::vector_valuetypes()) {
1357 if (VT.getScalarSizeInBits() >= 32) {
1358 setOperationAction(ISD::MLOAD, VT, Legal);
1359 setOperationAction(ISD::MSTORE, VT, Legal);
1361 // Extract subvector is special because the value type
1362 // (result) is 128-bit but the source is 256-bit wide.
1363 if (VT.is128BitVector()) {
1364 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1366 // Do not attempt to custom lower other non-256-bit vectors
1367 if (!VT.is256BitVector())
1368 continue;
1370 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1371 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1372 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1373 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1374 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1375 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1376 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1379 // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
1380 for (int i = MVT::v32i8; i != MVT::v4i64; ++i) {
1381 MVT VT = (MVT::SimpleValueType)i;
1383 // Do not attempt to promote non-256-bit vectors
1384 if (!VT.is256BitVector())
1385 continue;
1387 setOperationAction(ISD::AND, VT, Promote);
1388 AddPromotedToType (ISD::AND, VT, MVT::v4i64);
1389 setOperationAction(ISD::OR, VT, Promote);
1390 AddPromotedToType (ISD::OR, VT, MVT::v4i64);
1391 setOperationAction(ISD::XOR, VT, Promote);
1392 AddPromotedToType (ISD::XOR, VT, MVT::v4i64);
1393 setOperationAction(ISD::LOAD, VT, Promote);
1394 AddPromotedToType (ISD::LOAD, VT, MVT::v4i64);
1395 setOperationAction(ISD::SELECT, VT, Promote);
1396 AddPromotedToType (ISD::SELECT, VT, MVT::v4i64);
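// Illustrative note (not from the original source): promoting these operations
// means a 256-bit integer logical op is rewritten in terms of v4i64 and executed
// as a single 256-bit instruction. For example, assuming AVX is available,
//   %r = and <8 x i32> %a, %b
// is bitcast to v4i64, handled by one 256-bit AND, and bitcast back, rather than
// being split into two 128-bit halves.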
1400 if (!TM.Options.UseSoftFloat && Subtarget->hasAVX512()) {
1401 addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1402 addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1403 addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
1404 addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
1406 addRegisterClass(MVT::i1, &X86::VK1RegClass);
1407 addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
1408 addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
1410 for (MVT VT : MVT::fp_vector_valuetypes())
1411 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);
1413 setOperationAction(ISD::BR_CC, MVT::i1, Expand);
1414 setOperationAction(ISD::SETCC, MVT::i1, Custom);
1415 setOperationAction(ISD::XOR, MVT::i1, Legal);
1416 setOperationAction(ISD::OR, MVT::i1, Legal);
1417 setOperationAction(ISD::AND, MVT::i1, Legal);
1418 setOperationAction(ISD::LOAD, MVT::v16f32, Legal);
1419 setOperationAction(ISD::LOAD, MVT::v8f64, Legal);
1420 setOperationAction(ISD::LOAD, MVT::v8i64, Legal);
1421 setOperationAction(ISD::LOAD, MVT::v16i32, Legal);
1422 setOperationAction(ISD::LOAD, MVT::v16i1, Legal);
1424 setOperationAction(ISD::FADD, MVT::v16f32, Legal);
1425 setOperationAction(ISD::FSUB, MVT::v16f32, Legal);
1426 setOperationAction(ISD::FMUL, MVT::v16f32, Legal);
1427 setOperationAction(ISD::FDIV, MVT::v16f32, Legal);
1428 setOperationAction(ISD::FSQRT, MVT::v16f32, Legal);
1429 setOperationAction(ISD::FNEG, MVT::v16f32, Custom);
1431 setOperationAction(ISD::FADD, MVT::v8f64, Legal);
1432 setOperationAction(ISD::FSUB, MVT::v8f64, Legal);
1433 setOperationAction(ISD::FMUL, MVT::v8f64, Legal);
1434 setOperationAction(ISD::FDIV, MVT::v8f64, Legal);
1435 setOperationAction(ISD::FSQRT, MVT::v8f64, Legal);
1436 setOperationAction(ISD::FNEG, MVT::v8f64, Custom);
1437 setOperationAction(ISD::FMA, MVT::v8f64, Legal);
1438 setOperationAction(ISD::FMA, MVT::v16f32, Legal);
1440 setOperationAction(ISD::FP_TO_SINT, MVT::i32, Legal);
1441 setOperationAction(ISD::FP_TO_UINT, MVT::i32, Legal);
1442 setOperationAction(ISD::SINT_TO_FP, MVT::i32, Legal);
1443 setOperationAction(ISD::UINT_TO_FP, MVT::i32, Legal);
1444 if (Subtarget->is64Bit()) {
1445 setOperationAction(ISD::FP_TO_UINT, MVT::i64, Legal);
1446 setOperationAction(ISD::FP_TO_SINT, MVT::i64, Legal);
1447 setOperationAction(ISD::SINT_TO_FP, MVT::i64, Legal);
1448 setOperationAction(ISD::UINT_TO_FP, MVT::i64, Legal);
1450 setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
1451 setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
1452 setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
1453 setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
1454 setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
1455 setOperationAction(ISD::SINT_TO_FP, MVT::v8i1, Custom);
1456 setOperationAction(ISD::SINT_TO_FP, MVT::v16i1, Custom);
1457 setOperationAction(ISD::SINT_TO_FP, MVT::v16i8, Promote);
1458 setOperationAction(ISD::SINT_TO_FP, MVT::v16i16, Promote);
1459 setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);
1460 setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
1461 setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
1462 setOperationAction(ISD::FP_ROUND, MVT::v8f32, Legal);
1463 setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal);
1465 setOperationAction(ISD::TRUNCATE, MVT::i1, Custom);
1466 setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
1467 setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
1468 setOperationAction(ISD::TRUNCATE, MVT::v8i1, Custom);
1469 setOperationAction(ISD::TRUNCATE, MVT::v16i1, Custom);
1470 setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom);
1471 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
1472 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
1473 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
1474 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
1475 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i8, Custom);
1476 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i16, Custom);
1477 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom);
1479 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom);
1480 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom);
1481 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom);
1482 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom);
1483 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom);
1484 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Legal);
1486 setOperationAction(ISD::SETCC, MVT::v16i1, Custom);
1487 setOperationAction(ISD::SETCC, MVT::v8i1, Custom);
1489 setOperationAction(ISD::MUL, MVT::v8i64, Custom);
1491 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1, Custom);
1492 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom);
1493 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i1, Custom);
1494 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i1, Custom);
1495 setOperationAction(ISD::BUILD_VECTOR, MVT::v8i1, Custom);
1496 setOperationAction(ISD::BUILD_VECTOR, MVT::v16i1, Custom);
1497 setOperationAction(ISD::SELECT, MVT::v8f64, Custom);
1498 setOperationAction(ISD::SELECT, MVT::v8i64, Custom);
1499 setOperationAction(ISD::SELECT, MVT::v16f32, Custom);
1501 setOperationAction(ISD::ADD, MVT::v8i64, Legal);
1502 setOperationAction(ISD::ADD, MVT::v16i32, Legal);
1504 setOperationAction(ISD::SUB, MVT::v8i64, Legal);
1505 setOperationAction(ISD::SUB, MVT::v16i32, Legal);
1507 setOperationAction(ISD::MUL, MVT::v16i32, Legal);
1509 setOperationAction(ISD::SRL, MVT::v8i64, Custom);
1510 setOperationAction(ISD::SRL, MVT::v16i32, Custom);
1512 setOperationAction(ISD::SHL, MVT::v8i64, Custom);
1513 setOperationAction(ISD::SHL, MVT::v16i32, Custom);
1515 setOperationAction(ISD::SRA, MVT::v8i64, Custom);
1516 setOperationAction(ISD::SRA, MVT::v16i32, Custom);
1518 setOperationAction(ISD::AND, MVT::v8i64, Legal);
1519 setOperationAction(ISD::OR, MVT::v8i64, Legal);
1520 setOperationAction(ISD::XOR, MVT::v8i64, Legal);
1521 setOperationAction(ISD::AND, MVT::v16i32, Legal);
1522 setOperationAction(ISD::OR, MVT::v16i32, Legal);
1523 setOperationAction(ISD::XOR, MVT::v16i32, Legal);
1525 if (Subtarget->hasCDI()) {
1526 setOperationAction(ISD::CTLZ, MVT::v8i64, Legal);
1527 setOperationAction(ISD::CTLZ, MVT::v16i32, Legal);
1530 // Custom lower several nodes.
1531 for (MVT VT : MVT::vector_valuetypes()) {
1532 unsigned EltSize = VT.getVectorElementType().getSizeInBits();
1533 // Extract subvector is special because the value type
1534 // (result) is 256/128-bit but the source is 512-bit wide.
1535 if (VT.is128BitVector() || VT.is256BitVector()) {
1536 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1538 if (VT.getVectorElementType() == MVT::i1)
1539 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1541 // Do not attempt to custom lower other non-512-bit vectors
1542 if (!VT.is512BitVector())
1543 continue;
1545 if (EltSize >= 32) {
1546 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1547 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1548 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1549 setOperationAction(ISD::VSELECT, VT, Legal);
1550 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1551 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1552 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1553 setOperationAction(ISD::MLOAD, VT, Legal);
1554 setOperationAction(ISD::MSTORE, VT, Legal);
1557 for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
1558 MVT VT = (MVT::SimpleValueType)i;
1560 // Do not attempt to promote non-512-bit vectors.
1561 if (!VT.is512BitVector())
1562 continue;
1564 setOperationAction(ISD::SELECT, VT, Promote);
1565 AddPromotedToType (ISD::SELECT, VT, MVT::v8i64);
1569 if (!TM.Options.UseSoftFloat && Subtarget->hasBWI()) {
1570 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1571 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1573 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
1574 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
1576 setOperationAction(ISD::LOAD, MVT::v32i16, Legal);
1577 setOperationAction(ISD::LOAD, MVT::v64i8, Legal);
1578 setOperationAction(ISD::SETCC, MVT::v32i1, Custom);
1579 setOperationAction(ISD::SETCC, MVT::v64i1, Custom);
1580 setOperationAction(ISD::ADD, MVT::v32i16, Legal);
1581 setOperationAction(ISD::ADD, MVT::v64i8, Legal);
1582 setOperationAction(ISD::SUB, MVT::v32i16, Legal);
1583 setOperationAction(ISD::SUB, MVT::v64i8, Legal);
1584 setOperationAction(ISD::MUL, MVT::v32i16, Legal);
1586 for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
1587 const MVT VT = (MVT::SimpleValueType)i;
1589 const unsigned EltSize = VT.getVectorElementType().getSizeInBits();
1591 // Do not attempt to promote non-512-bit vectors.
1592 if (!VT.is512BitVector())
1593 continue;
1596 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1597 setOperationAction(ISD::VSELECT, VT, Legal);
1602 if (!TM.Options.UseSoftFloat && Subtarget->hasVLX()) {
1603 addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1604 addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1606 setOperationAction(ISD::SETCC, MVT::v4i1, Custom);
1607 setOperationAction(ISD::SETCC, MVT::v2i1, Custom);
1608 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Legal);
1610 setOperationAction(ISD::AND, MVT::v8i32, Legal);
1611 setOperationAction(ISD::OR, MVT::v8i32, Legal);
1612 setOperationAction(ISD::XOR, MVT::v8i32, Legal);
1613 setOperationAction(ISD::AND, MVT::v4i32, Legal);
1614 setOperationAction(ISD::OR, MVT::v4i32, Legal);
1615 setOperationAction(ISD::XOR, MVT::v4i32, Legal);
1618 // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion
1619 // of this type with custom code.
1620 for (MVT VT : MVT::vector_valuetypes())
1621 setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Custom);
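// Illustrative note (not from the original source): SIGN_EXTEND_INREG
// sign-extends the low bits of each element in place. Conceptually,
//   sign_extend_inreg of v4i32 from i8
// behaves like shifting each lane left by 24 and then arithmetic-shifting it
// right by 24; the custom lowering chooses a suitable x86 sequence per type.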
1623 // We want to custom lower some of our intrinsics.
1624 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
1625 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
1626 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
1627 if (!Subtarget->is64Bit())
1628 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
1630 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
1631 // handle type legalization for these operations here.
1633 // FIXME: We really should do custom legalization for addition and
1634 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
1635 // than generic legalization for 64-bit multiplication-with-overflow, though.
1636 for (unsigned i = 0, e = 3+Subtarget->is64Bit(); i != e; ++i) {
1637 // Add/Sub/Mul with overflow operations are custom lowered.
1638 MVT VT = IntVTs[i];
1639 setOperationAction(ISD::SADDO, VT, Custom);
1640 setOperationAction(ISD::UADDO, VT, Custom);
1641 setOperationAction(ISD::SSUBO, VT, Custom);
1642 setOperationAction(ISD::USUBO, VT, Custom);
1643 setOperationAction(ISD::SMULO, VT, Custom);
1644 setOperationAction(ISD::UMULO, VT, Custom);
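// Illustrative sketch (not from the original source): the custom lowering maps
// the overflow intrinsics onto the x86 flags. For example,
//   %s = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b)
// becomes an ADD whose second result is the overflow flag, read back with a
// SETO-style condition instead of a separate comparison.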
1648 if (!Subtarget->is64Bit()) {
1649 // These libcalls are not available in 32-bit.
1650 setLibcallName(RTLIB::SHL_I128, nullptr);
1651 setLibcallName(RTLIB::SRL_I128, nullptr);
1652 setLibcallName(RTLIB::SRA_I128, nullptr);
1655 // Combine sin / cos into one node or libcall if possible.
1656 if (Subtarget->hasSinCos()) {
1657 setLibcallName(RTLIB::SINCOS_F32, "sincosf");
1658 setLibcallName(RTLIB::SINCOS_F64, "sincos");
1659 if (Subtarget->isTargetDarwin()) {
1660 // For MacOSX, we don't want the normal expansion of a libcall to sincos.
1661 // We want to issue a libcall to __sincos_stret to avoid memory traffic.
1662 setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
1663 setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
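// Illustrative note (not from the original source): with this in place, a pair
// of calls on the same argument, e.g.
//   %s = call double @sin(double %x)
//   %c = call double @cos(double %x)
// can be merged into a single __sincos_stret call that returns both results,
// instead of two separate libcalls.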
1667 if (Subtarget->isTargetWin64()) {
1668 setOperationAction(ISD::SDIV, MVT::i128, Custom);
1669 setOperationAction(ISD::UDIV, MVT::i128, Custom);
1670 setOperationAction(ISD::SREM, MVT::i128, Custom);
1671 setOperationAction(ISD::UREM, MVT::i128, Custom);
1672 setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
1673 setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
1676 // We have target-specific dag combine patterns for the following nodes:
1677 setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
1678 setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
1679 setTargetDAGCombine(ISD::VSELECT);
1680 setTargetDAGCombine(ISD::SELECT);
1681 setTargetDAGCombine(ISD::SHL);
1682 setTargetDAGCombine(ISD::SRA);
1683 setTargetDAGCombine(ISD::SRL);
1684 setTargetDAGCombine(ISD::OR);
1685 setTargetDAGCombine(ISD::AND);
1686 setTargetDAGCombine(ISD::ADD);
1687 setTargetDAGCombine(ISD::FADD);
1688 setTargetDAGCombine(ISD::FSUB);
1689 setTargetDAGCombine(ISD::FMA);
1690 setTargetDAGCombine(ISD::SUB);
1691 setTargetDAGCombine(ISD::LOAD);
1692 setTargetDAGCombine(ISD::MLOAD);
1693 setTargetDAGCombine(ISD::STORE);
1694 setTargetDAGCombine(ISD::MSTORE);
1695 setTargetDAGCombine(ISD::ZERO_EXTEND);
1696 setTargetDAGCombine(ISD::ANY_EXTEND);
1697 setTargetDAGCombine(ISD::SIGN_EXTEND);
1698 setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
1699 setTargetDAGCombine(ISD::TRUNCATE);
1700 setTargetDAGCombine(ISD::SINT_TO_FP);
1701 setTargetDAGCombine(ISD::SETCC);
1702 setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
1703 setTargetDAGCombine(ISD::BUILD_VECTOR);
1704 setTargetDAGCombine(ISD::MUL);
1705 setTargetDAGCombine(ISD::XOR);
1707 computeRegisterProperties();
1709 // On Darwin, -Os means optimize for size without hurting performance,
1710 // so do not reduce the limits.
1711 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
1712 MaxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8;
1713 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
1714 MaxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
1715 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
1716 MaxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
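// Illustrative note (not from the original source): these limits bound inline
// expansion. A 64-byte @llvm.memset, for example, may be expanded into a short
// sequence of wide stores as long as the store count stays within
// MaxStoresPerMemset (16 here); otherwise it becomes a library call.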
1717 setPrefLoopAlignment(4); // 2^4 bytes.
1719 // A predictable cmov doesn't hurt on Atom because it's in-order.
1720 PredictableSelectIsExpensive = !Subtarget->isAtom();
1721 EnableExtLdPromotion = true;
1722 setPrefFunctionAlignment(4); // 2^4 bytes.
1724 verifyIntrinsicTables();
1727 // This has so far only been implemented for 64-bit MachO.
1728 bool X86TargetLowering::useLoadStackGuardNode() const {
1729 return Subtarget->isTargetMachO() && Subtarget->is64Bit();
1732 TargetLoweringBase::LegalizeTypeAction
1733 X86TargetLowering::getPreferredVectorAction(EVT VT) const {
1734 if (ExperimentalVectorWideningLegalization &&
1735 VT.getVectorNumElements() != 1 &&
1736 VT.getVectorElementType().getSimpleVT() != MVT::i1)
1737 return TypeWidenVector;
1739 return TargetLoweringBase::getPreferredVectorAction(VT);
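// Illustrative note (not from the original source): under the experimental
// flag, a type such as v2i32 is widened to v4i32 instead of being promoted or
// split; single-element vectors and i1 element vectors keep the default action.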
1742 EVT X86TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
1743 if (!VT.isVector())
1744 return Subtarget->hasAVX512() ? MVT::i1 : MVT::i8;
1746 const unsigned NumElts = VT.getVectorNumElements();
1747 const EVT EltVT = VT.getVectorElementType();
1748 if (VT.is512BitVector()) {
1749 if (Subtarget->hasAVX512())
1750 if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
1751 EltVT == MVT::f32 || EltVT == MVT::f64)
1752 switch (NumElts) {
1753 case 8: return MVT::v8i1;
1754 case 16: return MVT::v16i1;
1755 }
1756 if (Subtarget->hasBWI())
1757 if (EltVT == MVT::i8 || EltVT == MVT::i16)
1758 switch (NumElts) {
1759 case 32: return MVT::v32i1;
1760 case 64: return MVT::v64i1;
1761 }
1762 }
1764 if (VT.is256BitVector() || VT.is128BitVector()) {
1765 if (Subtarget->hasVLX())
1766 if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
1767 EltVT == MVT::f32 || EltVT == MVT::f64)
1768 switch (NumElts) {
1769 case 2: return MVT::v2i1;
1770 case 4: return MVT::v4i1;
1771 case 8: return MVT::v8i1;
1772 }
1773 if (Subtarget->hasBWI() && Subtarget->hasVLX())
1774 if (EltVT == MVT::i8 || EltVT == MVT::i16)
1775 switch (NumElts) {
1776 case 8: return MVT::v8i1;
1777 case 16: return MVT::v16i1;
1778 case 32: return MVT::v32i1;
1779 }
1780 }
1782 return VT.changeVectorElementTypeToInteger();
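// Illustrative note (not from the original source): with AVX-512, a compare of
// v16f32 yields a v16i1 mask; when no mask register form applies, the fallback
// above is used, e.g. a v4f32 compare produces a v4i32 result.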
1785 /// Helper for getByValTypeAlignment to determine
1786 /// the desired ByVal argument alignment.
1787 static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
1790 if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
1791 if (VTy->getBitWidth() == 128)
1792 MaxAlign = 16;
1793 } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
1794 unsigned EltAlign = 0;
1795 getMaxByValAlign(ATy->getElementType(), EltAlign);
1796 if (EltAlign > MaxAlign)
1797 MaxAlign = EltAlign;
1798 } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
1799 for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
1800 unsigned EltAlign = 0;
1801 getMaxByValAlign(STy->getElementType(i), EltAlign);
1802 if (EltAlign > MaxAlign)
1803 MaxAlign = EltAlign;
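// Illustrative note (not from the original source): for a type such as
//   struct { <4 x float> V; int I; };
// the nested 128-bit vector raises MaxAlign to 16, whereas an aggregate of
// plain scalars leaves it untouched.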
1810 /// Return the desired alignment for ByVal aggregate
1811 /// function arguments in the caller parameter area. For X86, aggregates
1812 /// that contain SSE vectors are placed at 16-byte boundaries while the rest
1813 /// are at 4-byte boundaries.
1814 unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const {
1815 if (Subtarget->is64Bit()) {
1816 // Max of 8 and alignment of type.
1817 unsigned TyAlign = TD->getABITypeAlignment(Ty);
1818 return TyAlign > 8 ? TyAlign : 8;
1821 }
1823 unsigned Align = 4;
1824 if (Subtarget->hasSSE1())
1825 getMaxByValAlign(Ty, Align);
1826 return Align;
1827 }
1829 /// Returns the target specific optimal type for load
1830 /// and store operations as a result of memset, memcpy, and memmove
1831 /// lowering. If DstAlign is zero, it is safe to assume the destination
1832 /// alignment can satisfy any constraint. Similarly, if SrcAlign is zero it
1833 /// means there is no need to check it against the alignment requirement,
1834 /// probably because the source does not need to be loaded. If 'IsMemset' is
1835 /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
1836 /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
1837 /// source is constant so it does not need to be loaded.
1838 /// It returns EVT::Other if the type should be determined using generic
1839 /// target-independent logic.
1840 EVT
1841 X86TargetLowering::getOptimalMemOpType(uint64_t Size,
1842 unsigned DstAlign, unsigned SrcAlign,
1843 bool IsMemset, bool ZeroMemset,
1844 bool MemcpyStrSrc,
1845 MachineFunction &MF) const {
1846 const Function *F = MF.getFunction();
1847 if ((!IsMemset || ZeroMemset) &&
1848 !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
1849 Attribute::NoImplicitFloat)) {
1850 if (Size >= 16 &&
1851 (Subtarget->isUnalignedMemAccessFast() ||
1852 ((DstAlign == 0 || DstAlign >= 16) &&
1853 (SrcAlign == 0 || SrcAlign >= 16)))) {
1854 if (Size >= 32) {
1855 if (Subtarget->hasInt256())
1856 return MVT::v8i32;
1857 if (Subtarget->hasFp256())
1858 return MVT::v8f32;
1859 }
1860 if (Subtarget->hasSSE2())
1861 return MVT::v4i32;
1862 if (Subtarget->hasSSE1())
1863 return MVT::v4f32;
1864 } else if (!MemcpyStrSrc && Size >= 8 &&
1865 !Subtarget->is64Bit() &&
1866 Subtarget->hasSSE2()) {
1867 // Do not use f64 to lower memcpy if source is string constant. It's
1868 // better to use i32 to avoid the loads.
1869 return MVT::f64;
1870 }
1871 }
1872 if (Subtarget->is64Bit() && Size >= 8)
1873 return MVT::i64;
1874 return MVT::i32;
1875 }
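// Illustrative note (not from the original source): for a 32-byte memcpy where
// both sides are 16-byte aligned (or unaligned accesses are fast), an SSE2
// target gets a 128-bit vector type and the copy becomes a couple of vector
// load/store pairs; without usable vectors it falls back to i64 on 64-bit
// targets and i32 elsewhere.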
1877 bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
1878 if (VT == MVT::f32)
1879 return X86ScalarSSEf32;
1880 else if (VT == MVT::f64)
1881 return X86ScalarSSEf64;
1882 return true;
1883 }
1886 bool X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned, unsigned,
1889 bool *Fast) const {
1890 if (Fast)
1891 *Fast = Subtarget->isUnalignedMemAccessFast();
1892 return true;
1893 }
1895 /// Return the entry encoding for a jump table in the
1896 /// current function. The returned value is a member of the
1897 /// MachineJumpTableInfo::JTEntryKind enum.
1898 unsigned X86TargetLowering::getJumpTableEncoding() const {
1899 // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
1900 // symbol.
1901 if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
1902 Subtarget->isPICStyleGOT())
1903 return MachineJumpTableInfo::EK_Custom32;
1905 // Otherwise, use the normal jump table encoding heuristics.
1906 return TargetLowering::getJumpTableEncoding();
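// Illustrative note (not from the original source): with the custom encoding,
// a 32-bit PIC jump table entry is emitted roughly as
//   .long .LBB0_2@GOTOFF
// i.e. an offset from the GOT base rather than an absolute block address
// (label name chosen for illustration only).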
1910 X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
1911 const MachineBasicBlock *MBB,
1912 unsigned uid,MCContext &Ctx) const{
1913 assert(MBB->getParent()->getTarget().getRelocationModel() == Reloc::PIC_ &&
1914 Subtarget->isPICStyleGOT());
1915 // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
1916 // entries.
1917 return MCSymbolRefExpr::Create(MBB->getSymbol(),
1918 MCSymbolRefExpr::VK_GOTOFF, Ctx);
1921 /// Returns relocation base for the given PIC jumptable.
1922 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
1923 SelectionDAG &DAG) const {
1924 if (!Subtarget->is64Bit())
1925 // This doesn't have SDLoc associated with it, but is not really the
1926 // same as a Register.
1927 return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy());
1931 /// This returns the relocation base for the given PIC jumptable,
1932 /// the same as getPICJumpTableRelocBase, but as an MCExpr.
1933 const MCExpr *X86TargetLowering::
1934 getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
1935 MCContext &Ctx) const {
1936 // X86-64 uses RIP relative addressing based on the jump table label.
1937 if (Subtarget->isPICStyleRIPRel())
1938 return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
1940 // Otherwise, the reference is relative to the PIC base.
1941 return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx);
1944 // FIXME: Why is this routine here? Move to RegInfo!
1945 std::pair<const TargetRegisterClass*, uint8_t>
1946 X86TargetLowering::findRepresentativeClass(MVT VT) const{
1947 const TargetRegisterClass *RRC = nullptr;
1948 uint8_t Cost = 1;
1949 switch (VT.SimpleTy) {
1950 default:
1951 return TargetLowering::findRepresentativeClass(VT);
1952 case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
1953 RRC = Subtarget->is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
1954 break;
1955 case MVT::x86mmx:
1956 RRC = &X86::VR64RegClass;
1957 break;
1958 case MVT::f32: case MVT::f64:
1959 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
1960 case MVT::v4f32: case MVT::v2f64:
1961 case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32:
1963 RRC = &X86::VR128RegClass;
1964 break;
1965 }
1966 return std::make_pair(RRC, Cost);
1967 }
1969 bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
1970 unsigned &Offset) const {
1971 if (!Subtarget->isTargetLinux())
1974 if (Subtarget->is64Bit()) {
1975 // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs:
1977 if (getTargetMachine().getCodeModel() == CodeModel::Kernel)
1989 bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
1990 unsigned DestAS) const {
1991 assert(SrcAS != DestAS && "Expected different address spaces!");
1993 return SrcAS < 256 && DestAS < 256;
1996 //===----------------------------------------------------------------------===//
1997 // Return Value Calling Convention Implementation
1998 //===----------------------------------------------------------------------===//
2000 #include "X86GenCallingConv.inc"
2003 X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv,
2004 MachineFunction &MF, bool isVarArg,
2005 const SmallVectorImpl<ISD::OutputArg> &Outs,
2006 LLVMContext &Context) const {
2007 SmallVector<CCValAssign, 16> RVLocs;
2008 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
2009 return CCInfo.CheckReturn(Outs, RetCC_X86);
2012 const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
2013 static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
2018 X86TargetLowering::LowerReturn(SDValue Chain,
2019 CallingConv::ID CallConv, bool isVarArg,
2020 const SmallVectorImpl<ISD::OutputArg> &Outs,
2021 const SmallVectorImpl<SDValue> &OutVals,
2022 SDLoc dl, SelectionDAG &DAG) const {
2023 MachineFunction &MF = DAG.getMachineFunction();
2024 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2026 SmallVector<CCValAssign, 16> RVLocs;
2027 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
2028 CCInfo.AnalyzeReturn(Outs, RetCC_X86);
2031 SmallVector<SDValue, 6> RetOps;
2032 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2033 // Operand #1 = Bytes To Pop
2034 RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(),
2037 // Copy the result values into the output registers.
2038 for (unsigned i = 0; i != RVLocs.size(); ++i) {
2039 CCValAssign &VA = RVLocs[i];
2040 assert(VA.isRegLoc() && "Can only return in registers!");
2041 SDValue ValToCopy = OutVals[i];
2042 EVT ValVT = ValToCopy.getValueType();
2044 // Promote values to the appropriate types.
2045 if (VA.getLocInfo() == CCValAssign::SExt)
2046 ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
2047 else if (VA.getLocInfo() == CCValAssign::ZExt)
2048 ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
2049 else if (VA.getLocInfo() == CCValAssign::AExt)
2050 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
2051 else if (VA.getLocInfo() == CCValAssign::BCvt)
2052 ValToCopy = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), ValToCopy);
2054 assert(VA.getLocInfo() != CCValAssign::FPExt &&
2055 "Unexpected FP-extend for return value.");
2057 // If this is x86-64, and we disabled SSE, we can't return FP values,
2058 // or SSE or MMX vectors.
2059 if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
2060 VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
2061 (Subtarget->is64Bit() && !Subtarget->hasSSE1())) {
2062 report_fatal_error("SSE register return with SSE disabled");
2064 // Likewise we can't return F64 values with SSE1 only. gcc does so, but
2065 // llvm-gcc has never done it right and no one has noticed, so this
2066 // should be OK for now.
2067 if (ValVT == MVT::f64 &&
2068 (Subtarget->is64Bit() && !Subtarget->hasSSE2()))
2069 report_fatal_error("SSE2 register return with SSE2 disabled");
2071 // Returns in ST0/ST1 are handled specially: these are pushed as operands to
2072 // the RET instruction and handled by the FP Stackifier.
2073 if (VA.getLocReg() == X86::FP0 ||
2074 VA.getLocReg() == X86::FP1) {
2075 // If this is a copy from an xmm register to ST(0), use an FPExtend to
2076 // change the value to the FP stack register class.
2077 if (isScalarFPTypeInSSEReg(VA.getValVT()))
2078 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
2079 RetOps.push_back(ValToCopy);
2080 // Don't emit a copytoreg.
2081 continue;
2082 }
2084 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
2085 // which is returned in RAX / RDX.
2086 if (Subtarget->is64Bit()) {
2087 if (ValVT == MVT::x86mmx) {
2088 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
2089 ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ValToCopy);
2090 ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
2092 // If we don't have SSE2 available, convert to v4f32 so the generated
2093 // register is legal.
2094 if (!Subtarget->hasSSE2())
2095 ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,ValToCopy);
2100 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
2101 Flag = Chain.getValue(1);
2102 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
2105 // The x86-64 ABIs require that for returning structs by value we copy
2106 // the sret argument into %rax/%eax (depending on ABI) for the return.
2107 // Win32 requires us to put the sret argument to %eax as well.
2108 // We saved the argument into a virtual register in the entry block,
2109 // so now we copy the value out and into %rax/%eax.
2110 if (DAG.getMachineFunction().getFunction()->hasStructRetAttr() &&
2111 (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC())) {
2112 MachineFunction &MF = DAG.getMachineFunction();
2113 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2114 unsigned Reg = FuncInfo->getSRetReturnReg();
2116 "SRetReturnReg should have been set in LowerFormalArguments().");
2117 SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy());
2119 unsigned RetValReg
2120 = (Subtarget->is64Bit() && !Subtarget->isTarget64BitILP32()) ?
2121 X86::RAX : X86::EAX;
2122 Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
2123 Flag = Chain.getValue(1);
2125 // RAX/EAX now acts like a return value.
2126 RetOps.push_back(DAG.getRegister(RetValReg, getPointerTy()));
2129 RetOps[0] = Chain; // Update chain.
2131 // Add the flag if we have it.
2132 if (Flag.getNode())
2133 RetOps.push_back(Flag);
2135 return DAG.getNode(X86ISD::RET_FLAG, dl, MVT::Other, RetOps);
2138 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
2139 if (N->getNumValues() != 1)
2140 return false;
2141 if (!N->hasNUsesOfValue(1, 0))
2142 return false;
2144 SDValue TCChain = Chain;
2145 SDNode *Copy = *N->use_begin();
2146 if (Copy->getOpcode() == ISD::CopyToReg) {
2147 // If the copy has a glue operand, we conservatively assume it isn't safe to
2148 // perform a tail call.
2149 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
2150 return false;
2151 TCChain = Copy->getOperand(0);
2152 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
2153 return false;
2155 bool HasRet = false;
2156 for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
2157 UI != UE; ++UI) {
2158 if (UI->getOpcode() != X86ISD::RET_FLAG)
2159 return false;
2160 // If we are returning more than one value, we can definitely
2161 // not make a tail call; see PR19530.
2162 if (UI->getNumOperands() > 4)
2163 return false;
2164 if (UI->getNumOperands() == 4 &&
2165 UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
2166 return false;
2167 HasRet = true;
2168 }
2178 X86TargetLowering::getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT,
2179 ISD::NodeType ExtendKind) const {
2181 // TODO: Is this also valid on 32-bit?
2182 if (Subtarget->is64Bit() && VT == MVT::i1 && ExtendKind == ISD::ZERO_EXTEND)
2183 ReturnMVT = MVT::i8;
2185 ReturnMVT = MVT::i32;
2187 EVT MinVT = getRegisterType(Context, ReturnMVT);
2188 return VT.bitsLT(MinVT) ? MinVT : VT;
2191 /// Lower the result values of a call into the
2192 /// appropriate copies out of appropriate physical registers.
2195 X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
2196 CallingConv::ID CallConv, bool isVarArg,
2197 const SmallVectorImpl<ISD::InputArg> &Ins,
2198 SDLoc dl, SelectionDAG &DAG,
2199 SmallVectorImpl<SDValue> &InVals) const {
2201 // Assign locations to each value returned by this call.
2202 SmallVector<CCValAssign, 16> RVLocs;
2203 bool Is64Bit = Subtarget->is64Bit();
2204 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2206 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
2208 // Copy all of the result registers out of their specified physreg.
2209 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
2210 CCValAssign &VA = RVLocs[i];
2211 EVT CopyVT = VA.getValVT();
2213 // If this is x86-64, and we disabled SSE, we can't return FP values
2214 if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
2215 ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
2216 report_fatal_error("SSE register return with SSE disabled");
2219 // If we prefer to use the value in xmm registers, copy it out as f80 and
2220 // use a truncate to move it from fp stack reg to xmm reg.
2221 if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
2222 isScalarFPTypeInSSEReg(VA.getValVT()))
2225 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
2226 CopyVT, InFlag).getValue(1);
2227 SDValue Val = Chain.getValue(0);
2229 if (CopyVT != VA.getValVT())
2230 Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
2231 // This truncation won't change the value.
2232 DAG.getIntPtrConstant(1));
2234 InFlag = Chain.getValue(2);
2235 InVals.push_back(Val);
2241 //===----------------------------------------------------------------------===//
2242 // C & StdCall & Fast Calling Convention implementation
2243 //===----------------------------------------------------------------------===//
2244 // The StdCall calling convention seems to be standard for many Windows API
2245 // routines and friends. It differs from the C calling convention just a little:
2246 // the callee should clean up the stack, not the caller. Symbols should also
2247 // be decorated in some fancy way :) It doesn't support any vector arguments.
2248 // For info on fast calling convention see Fast Calling Convention (tail call)
2249 // implementation LowerX86_32FastCCCallTo.
2251 /// CallIsStructReturn - Determines whether a call uses struct return
2252 /// semantics.
2253 enum StructReturnType { NotStructReturn, RegStructReturn, StackStructReturn };
2258 static StructReturnType
2259 callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
2261 return NotStructReturn;
2263 const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
2264 if (!Flags.isSRet())
2265 return NotStructReturn;
2266 if (Flags.isInReg())
2267 return RegStructReturn;
2268 return StackStructReturn;
2271 /// Determines whether a function uses struct return semantics.
2272 static StructReturnType
2273 argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
2275 return NotStructReturn;
2277 const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
2278 if (!Flags.isSRet())
2279 return NotStructReturn;
2280 if (Flags.isInReg())
2281 return RegStructReturn;
2282 return StackStructReturn;
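// Illustrative note (not from the original source): given IR such as
//   define void @f(%struct.S* sret %out)
// the argument is classified as StackStructReturn, while an 'inreg sret'
// argument is classified as RegStructReturn; calls and functions with no sret
// argument are NotStructReturn.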
2285 /// Make a copy of an aggregate at address specified by "Src" to address
2286 /// "Dst" with size and alignment information specified by the specific
2287 /// parameter attribute. The copy will be passed as a byval function parameter.
2289 CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
2290 ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
2292 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);
2294 return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
2295 /*isVolatile*/false, /*AlwaysInline=*/true,
2296 MachinePointerInfo(), MachinePointerInfo());
2299 /// Return true if the calling convention is one that
2300 /// supports tail call optimization.
2301 static bool IsTailCallConvention(CallingConv::ID CC) {
2302 return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
2303 CC == CallingConv::HiPE);
2306 /// \brief Return true if the calling convention is a C calling convention.
2307 static bool IsCCallConvention(CallingConv::ID CC) {
2308 return (CC == CallingConv::C || CC == CallingConv::X86_64_Win64 ||
2309 CC == CallingConv::X86_64_SysV);
2312 bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
2313 if (!CI->isTailCall() || getTargetMachine().Options.DisableTailCalls)
2317 CallingConv::ID CalleeCC = CS.getCallingConv();
2318 if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
2324 /// Return true if the function is being made into
2325 /// a tailcall target by changing its ABI.
2326 static bool FuncIsMadeTailCallSafe(CallingConv::ID CC,
2327 bool GuaranteedTailCallOpt) {
2328 return GuaranteedTailCallOpt && IsTailCallConvention(CC);
2332 X86TargetLowering::LowerMemArgument(SDValue Chain,
2333 CallingConv::ID CallConv,
2334 const SmallVectorImpl<ISD::InputArg> &Ins,
2335 SDLoc dl, SelectionDAG &DAG,
2336 const CCValAssign &VA,
2337 MachineFrameInfo *MFI,
2339 // Create the nodes corresponding to a load from this parameter slot.
2340 ISD::ArgFlagsTy Flags = Ins[i].Flags;
2341 bool AlwaysUseMutable = FuncIsMadeTailCallSafe(
2342 CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
2343 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
2346 // If a value is passed by pointer, the address is passed instead of the
2347 // value itself.
2348 if (VA.getLocInfo() == CCValAssign::Indirect)
2349 ValVT = VA.getLocVT();
2351 ValVT = VA.getValVT();
2353 // FIXME: For now, all byval parameter objects are marked mutable. This can be
2354 // changed with more analysis.
2355 // In case of tail call optimization, mark all arguments mutable, since they
2356 // could be overwritten by the lowering of the arguments in case of a tail call.
2357 if (Flags.isByVal()) {
2358 unsigned Bytes = Flags.getByValSize();
2359 if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
2360 int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
2361 return DAG.getFrameIndex(FI, getPointerTy());
2363 int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
2364 VA.getLocMemOffset(), isImmutable);
2365 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
2366 return DAG.getLoad(ValVT, dl, Chain, FIN,
2367 MachinePointerInfo::getFixedStack(FI),
2368 false, false, false, 0);
2372 // FIXME: Get this from tablegen.
2373 static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
2374 const X86Subtarget *Subtarget) {
2375 assert(Subtarget->is64Bit());
2377 if (Subtarget->isCallingConvWin64(CallConv)) {
2378 static const MCPhysReg GPR64ArgRegsWin64[] = {
2379 X86::RCX, X86::RDX, X86::R8, X86::R9
2381 return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
2384 static const MCPhysReg GPR64ArgRegs64Bit[] = {
2385 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
2387 return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
2390 // FIXME: Get this from tablegen.
2391 static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
2392 CallingConv::ID CallConv,
2393 const X86Subtarget *Subtarget) {
2394 assert(Subtarget->is64Bit());
2395 if (Subtarget->isCallingConvWin64(CallConv)) {
2396 // The XMM registers which might contain var arg parameters are shadowed
2397 // in their paired GPR. So we only need to save the GPR to their home
2398 // slots.
2399 // TODO: __vectorcall will change this.
2400 return None;
2401 }
2403 const Function *Fn = MF.getFunction();
2404 bool NoImplicitFloatOps = Fn->getAttributes().
2405 hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat);
2406 assert(!(MF.getTarget().Options.UseSoftFloat && NoImplicitFloatOps) &&
2407 "SSE register cannot be used when SSE is disabled!");
2408 if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps ||
2409 !Subtarget->hasSSE1())
2410 // Kernel mode asks for SSE to be disabled, so there are no XMM argument
2411 // registers.
2412 return None;
2414 static const MCPhysReg XMMArgRegs64Bit[] = {
2415 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2416 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2418 return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
2422 X86TargetLowering::LowerFormalArguments(SDValue Chain,
2423 CallingConv::ID CallConv,
2425 const SmallVectorImpl<ISD::InputArg> &Ins,
2428 SmallVectorImpl<SDValue> &InVals)
2430 MachineFunction &MF = DAG.getMachineFunction();
2431 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2433 const Function* Fn = MF.getFunction();
2434 if (Fn->hasExternalLinkage() &&
2435 Subtarget->isTargetCygMing() &&
2436 Fn->getName() == "main")
2437 FuncInfo->setForceFramePointer(true);
2439 MachineFrameInfo *MFI = MF.getFrameInfo();
2440 bool Is64Bit = Subtarget->is64Bit();
2441 bool IsWin64 = Subtarget->isCallingConvWin64(CallConv);
2443 assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
2444 "Var args not supported with calling convention fastcc, ghc or hipe");
2446 // Assign locations to all of the incoming arguments.
2447 SmallVector<CCValAssign, 16> ArgLocs;
2448 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
2450 // Allocate shadow area for Win64
2452 CCInfo.AllocateStack(32, 8);
2454 CCInfo.AnalyzeFormalArguments(Ins, CC_X86);
2456 unsigned LastVal = ~0U;
2458 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2459 CCValAssign &VA = ArgLocs[i];
2460 // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
2462 assert(VA.getValNo() != LastVal &&
2463 "Don't support value assigned to multiple locs yet");
2465 LastVal = VA.getValNo();
2467 if (VA.isRegLoc()) {
2468 EVT RegVT = VA.getLocVT();
2469 const TargetRegisterClass *RC;
2470 if (RegVT == MVT::i32)
2471 RC = &X86::GR32RegClass;
2472 else if (Is64Bit && RegVT == MVT::i64)
2473 RC = &X86::GR64RegClass;
2474 else if (RegVT == MVT::f32)
2475 RC = &X86::FR32RegClass;
2476 else if (RegVT == MVT::f64)
2477 RC = &X86::FR64RegClass;
2478 else if (RegVT.is512BitVector())
2479 RC = &X86::VR512RegClass;
2480 else if (RegVT.is256BitVector())
2481 RC = &X86::VR256RegClass;
2482 else if (RegVT.is128BitVector())
2483 RC = &X86::VR128RegClass;
2484 else if (RegVT == MVT::x86mmx)
2485 RC = &X86::VR64RegClass;
2486 else if (RegVT == MVT::i1)
2487 RC = &X86::VK1RegClass;
2488 else if (RegVT == MVT::v8i1)
2489 RC = &X86::VK8RegClass;
2490 else if (RegVT == MVT::v16i1)
2491 RC = &X86::VK16RegClass;
2492 else if (RegVT == MVT::v32i1)
2493 RC = &X86::VK32RegClass;
2494 else if (RegVT == MVT::v64i1)
2495 RC = &X86::VK64RegClass;
2497 llvm_unreachable("Unknown argument type!");
2499 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
2500 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
2502 // If this is an 8 or 16-bit value, it is really passed promoted to 32
2503 // bits. Insert an assert[sz]ext to capture this, then truncate to the
2505 if (VA.getLocInfo() == CCValAssign::SExt)
2506 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
2507 DAG.getValueType(VA.getValVT()));
2508 else if (VA.getLocInfo() == CCValAssign::ZExt)
2509 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
2510 DAG.getValueType(VA.getValVT()));
2511 else if (VA.getLocInfo() == CCValAssign::BCvt)
2512 ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
2514 if (VA.isExtInLoc()) {
2515 // Handle MMX values passed in XMM regs.
2516 if (RegVT.isVector())
2517 ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
2519 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
2522 assert(VA.isMemLoc());
2523 ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i);
2526 // If value is passed via pointer - do a load.
2527 if (VA.getLocInfo() == CCValAssign::Indirect)
2528 ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue,
2529 MachinePointerInfo(), false, false, false, 0);
2531 InVals.push_back(ArgValue);
2534 if (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC()) {
2535 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2536 // The x86-64 ABIs require that for returning structs by value we copy
2537 // the sret argument into %rax/%eax (depending on ABI) for the return.
2538 // Win32 requires us to put the sret argument to %eax as well.
2539 // Save the argument into a virtual register so that we can access it
2540 // from the return points.
2541 if (Ins[i].Flags.isSRet()) {
2542 unsigned Reg = FuncInfo->getSRetReturnReg();
2544 MVT PtrTy = getPointerTy();
2545 Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
2546 FuncInfo->setSRetReturnReg(Reg);
2548 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[i]);
2549 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
2555 unsigned StackSize = CCInfo.getNextStackOffset();
2556 // Align stack specially for tail calls.
2557 if (FuncIsMadeTailCallSafe(CallConv,
2558 MF.getTarget().Options.GuaranteedTailCallOpt))
2559 StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
2561 // If the function takes a variable number of arguments, make a frame index for
2562 // the start of the first vararg value... for expansion of llvm.va_start. We
2563 // can skip this if there are no va_start calls.
2564 if (MFI->hasVAStart() &&
2565 (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
2566 CallConv != CallingConv::X86_ThisCall))) {
2567 FuncInfo->setVarArgsFrameIndex(
2568 MFI->CreateFixedObject(1, StackSize, true));
2571 // Figure out if XMM registers are in use.
2572 assert(!(MF.getTarget().Options.UseSoftFloat &&
2573 Fn->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
2574 Attribute::NoImplicitFloat)) &&
2575 "SSE register cannot be used when SSE is disabled!");
2577 // 64-bit calling conventions support varargs and register parameters, so we
2578 // have to do extra work to spill them in the prologue.
2579 if (Is64Bit && isVarArg && MFI->hasVAStart()) {
2580 // Find the first unallocated argument registers.
2581 ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
2582 ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
2583 unsigned NumIntRegs =
2584 CCInfo.getFirstUnallocated(ArgGPRs.data(), ArgGPRs.size());
2585 unsigned NumXMMRegs =
2586 CCInfo.getFirstUnallocated(ArgXMMs.data(), ArgXMMs.size());
2587 assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
2588 "SSE register cannot be used when SSE is disabled!");
2590 // Gather all the live in physical registers.
2591 SmallVector<SDValue, 6> LiveGPRs;
2592 SmallVector<SDValue, 8> LiveXMMRegs;
2594 for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
2595 unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
2597 DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
2599 if (!ArgXMMs.empty()) {
2600 unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
2601 ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
2602 for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
2603 unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
2604 LiveXMMRegs.push_back(
2605 DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
2610 const TargetFrameLowering &TFI = *MF.getSubtarget().getFrameLowering();
2611 // Get to the caller-allocated home save location. Add 8 to account
2612 // for the return address.
2613 int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
2614 FuncInfo->setRegSaveFrameIndex(
2615 MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
2616 // Fixup to set vararg frame on shadow area (4 x i64).
2618 FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
2620 // For X86-64, if there are vararg parameters that are passed via
2621 // registers, then we must store them to their spots on the stack so
2622 // they may be loaded by dereferencing the result of va_next.
2623 FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
2624 FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
2625 FuncInfo->setRegSaveFrameIndex(MFI->CreateStackObject(
2626 ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
2629 // Store the integer parameter registers.
2630 SmallVector<SDValue, 8> MemOps;
2631 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
2633 unsigned Offset = FuncInfo->getVarArgsGPOffset();
2634 for (SDValue Val : LiveGPRs) {
2635 SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
2636 DAG.getIntPtrConstant(Offset));
2638 DAG.getStore(Val.getValue(1), dl, Val, FIN,
2639 MachinePointerInfo::getFixedStack(
2640 FuncInfo->getRegSaveFrameIndex(), Offset),
2642 MemOps.push_back(Store);
2646 if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
2647 // Now store the XMM (fp + vector) parameter registers.
2648 SmallVector<SDValue, 12> SaveXMMOps;
2649 SaveXMMOps.push_back(Chain);
2650 SaveXMMOps.push_back(ALVal);
2651 SaveXMMOps.push_back(DAG.getIntPtrConstant(
2652 FuncInfo->getRegSaveFrameIndex()));
2653 SaveXMMOps.push_back(DAG.getIntPtrConstant(
2654 FuncInfo->getVarArgsFPOffset()));
2655 SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
2657 MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
2658 MVT::Other, SaveXMMOps));
2661 if (!MemOps.empty())
2662 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
2665 if (isVarArg && MFI->hasMustTailInVarArgFunc()) {
2666 // Find the largest legal vector type.
2667 MVT VecVT = MVT::Other;
2668 // FIXME: Only some x86_32 calling conventions support AVX512.
2669 if (Subtarget->hasAVX512() &&
2670 (Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
2671 CallConv == CallingConv::Intel_OCL_BI)))
2672 VecVT = MVT::v16f32;
2673 else if (Subtarget->hasAVX())
2675 else if (Subtarget->hasSSE2())
2678 // We forward some GPRs and some vector types.
2679 SmallVector<MVT, 2> RegParmTypes;
2680 MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
2681 RegParmTypes.push_back(IntVT);
2682 if (VecVT != MVT::Other)
2683 RegParmTypes.push_back(VecVT);
2685 // Compute the set of forwarded registers. The rest are scratch.
2686 SmallVectorImpl<ForwardedRegister> &Forwards =
2687 FuncInfo->getForwardedMustTailRegParms();
2688 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
2690 // Conservatively forward AL on x86_64, since it might be used for varargs.
2691 if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
2692 unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
2693 Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
2696 // Copy all forwards from physical to virtual registers.
2697 for (ForwardedRegister &F : Forwards) {
2698 // FIXME: Can we use a less constrained schedule?
2699 SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
2700 F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT));
2701 Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal);
2705 // Some CCs need callee pop.
2706 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
2707 MF.getTarget().Options.GuaranteedTailCallOpt)) {
2708 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
2710 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
2711 // If this is an sret function, the return should pop the hidden pointer.
2712 if (!Is64Bit && !IsTailCallConvention(CallConv) &&
2713 !Subtarget->getTargetTriple().isOSMSVCRT() &&
2714 argsAreStructReturn(Ins) == StackStructReturn)
2715 FuncInfo->setBytesToPopOnReturn(4);
2719 // RegSaveFrameIndex is X86-64 only.
2720 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
2721 if (CallConv == CallingConv::X86_FastCall ||
2722 CallConv == CallingConv::X86_ThisCall)
2723 // fastcc functions can't have varargs.
2724 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
2727 FuncInfo->setArgumentStackSize(StackSize);
2733 X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
2734 SDValue StackPtr, SDValue Arg,
2735 SDLoc dl, SelectionDAG &DAG,
2736 const CCValAssign &VA,
2737 ISD::ArgFlagsTy Flags) const {
2738 unsigned LocMemOffset = VA.getLocMemOffset();
2739 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
2740 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
2741 if (Flags.isByVal())
2742 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
2744 return DAG.getStore(Chain, dl, Arg, PtrOff,
2745 MachinePointerInfo::getStack(LocMemOffset),
2749 /// Emit a load of return address if tail call
2750 /// optimization is performed and it is required.
2752 X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
2753 SDValue &OutRetAddr, SDValue Chain,
2754 bool IsTailCall, bool Is64Bit,
2755 int FPDiff, SDLoc dl) const {
2756 // Adjust the Return address stack slot.
2757 EVT VT = getPointerTy();
2758 OutRetAddr = getReturnAddressFrameIndex(DAG);
2760 // Load the "old" Return address.
2761 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(),
2762 false, false, false, 0);
2763 return SDValue(OutRetAddr.getNode(), 1);
2766 /// Emit a store of the return address if tail call
2767 /// optimization is performed and it is required (FPDiff!=0).
2768 static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
2769 SDValue Chain, SDValue RetAddrFrIdx,
2770 EVT PtrVT, unsigned SlotSize,
2771 int FPDiff, SDLoc dl) {
2772 // Store the return address to the appropriate stack slot.
2773 if (!FPDiff) return Chain;
2774 // Calculate the new stack slot for the return address.
2775 int NewReturnAddrFI =
2776 MF.getFrameInfo()->CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
2778 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
2779 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
2780 MachinePointerInfo::getFixedStack(NewReturnAddrFI),
2786 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
2787 SmallVectorImpl<SDValue> &InVals) const {
2788 SelectionDAG &DAG = CLI.DAG;
2790 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
2791 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
2792 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
2793 SDValue Chain = CLI.Chain;
2794 SDValue Callee = CLI.Callee;
2795 CallingConv::ID CallConv = CLI.CallConv;
2796 bool &isTailCall = CLI.IsTailCall;
2797 bool isVarArg = CLI.IsVarArg;
2799 MachineFunction &MF = DAG.getMachineFunction();
2800 bool Is64Bit = Subtarget->is64Bit();
2801 bool IsWin64 = Subtarget->isCallingConvWin64(CallConv);
2802 StructReturnType SR = callIsStructReturn(Outs);
2803 bool IsSibcall = false;
2804 X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
2806 if (MF.getTarget().Options.DisableTailCalls)
2809 bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall();
2810 if (IsMustTail) {
2811 // Force this to be a tail call. The verifier rules are enough to ensure
2812 // that we can lower this successfully without moving the return address
2813 // around.
2814 isTailCall = true;
2815 } else if (isTailCall) {
2816 // Check if it's really possible to do a tail call.
2817 isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
2818 isVarArg, SR != NotStructReturn,
2819 MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
2820 Outs, OutVals, Ins, DAG);
2822 // Sibcalls are automatically detected tailcalls which do not require
2823 // ABI changes.
2824 if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
2825 IsSibcall = true;
2831 assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
2832 "Var args not supported with calling convention fastcc, ghc or hipe");
2834 // Analyze operands of the call, assigning locations to each operand.
2835 SmallVector<CCValAssign, 16> ArgLocs;
2836 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
2838 // Allocate shadow area for Win64
2840 CCInfo.AllocateStack(32, 8);
2842 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
2844 // Get a count of how many bytes are to be pushed on the stack.
2845 unsigned NumBytes = CCInfo.getNextStackOffset();
2847   // This is a sibcall. The memory operands are already available in the
2848   // stack frame of the caller's own caller.
2850 else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
2851 IsTailCallConvention(CallConv))
2852 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
2855 if (isTailCall && !IsSibcall && !IsMustTail) {
2856 // Lower arguments at fp - stackoffset + fpdiff.
2857 unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
2859 FPDiff = NumBytesCallerPushed - NumBytes;
2861     // Set the delta of movement of the return address stack slot.
2862 // But only set if delta is greater than previous delta.
2863 if (FPDiff < X86Info->getTCReturnAddrDelta())
2864 X86Info->setTCReturnAddrDelta(FPDiff);
2867 unsigned NumBytesToPush = NumBytes;
2868 unsigned NumBytesToPop = NumBytes;
2870 // If we have an inalloca argument, all stack space has already been allocated
2871   // for us and is right at the top of the stack. We don't support multiple
2872 // arguments passed in memory when using inalloca.
2873 if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
2875 if (!ArgLocs.back().isMemLoc())
2876 report_fatal_error("cannot use inalloca attribute on a register "
2878 if (ArgLocs.back().getLocMemOffset() != 0)
2879 report_fatal_error("any parameter with the inalloca attribute must be "
2880 "the only memory argument");
2884 Chain = DAG.getCALLSEQ_START(
2885 Chain, DAG.getIntPtrConstant(NumBytesToPush, true), dl);
2887 SDValue RetAddrFrIdx;
2888 // Load return address for tail calls.
2889 if (isTailCall && FPDiff)
2890 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
2891 Is64Bit, FPDiff, dl);
2893 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
2894 SmallVector<SDValue, 8> MemOpChains;
2897 // Walk the register/memloc assignments, inserting copies/loads. In the case
2898   // of tail call optimization, arguments are handled later.
2899 const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
2900 DAG.getSubtarget().getRegisterInfo());
2901 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2902 // Skip inalloca arguments, they have already been written.
2903 ISD::ArgFlagsTy Flags = Outs[i].Flags;
2904 if (Flags.isInAlloca())
2907 CCValAssign &VA = ArgLocs[i];
2908 EVT RegVT = VA.getLocVT();
2909 SDValue Arg = OutVals[i];
2910 bool isByVal = Flags.isByVal();
2912 // Promote the value if needed.
2913 switch (VA.getLocInfo()) {
2914 default: llvm_unreachable("Unknown loc info!");
2915 case CCValAssign::Full: break;
2916 case CCValAssign::SExt:
2917 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
2919 case CCValAssign::ZExt:
2920 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
2922 case CCValAssign::AExt:
2923 if (RegVT.is128BitVector()) {
2924 // Special case: passing MMX values in XMM registers.
2925 Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
2926 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
2927 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
2929 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
2931 case CCValAssign::BCvt:
2932 Arg = DAG.getNode(ISD::BITCAST, dl, RegVT, Arg);
2934 case CCValAssign::Indirect: {
2935 // Store the argument.
2936 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
2937 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
2938 Chain = DAG.getStore(Chain, dl, Arg, SpillSlot,
2939 MachinePointerInfo::getFixedStack(FI),
2946 if (VA.isRegLoc()) {
2947 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
2948 if (isVarArg && IsWin64) {
2949       // The Win64 ABI requires an argument passed in an XMM register to also be
2950       // copied to the corresponding integer shadow register if the callee is a
       // varargs function.
2951 unsigned ShadowReg = 0;
2952 switch (VA.getLocReg()) {
2953 case X86::XMM0: ShadowReg = X86::RCX; break;
2954 case X86::XMM1: ShadowReg = X86::RDX; break;
2955 case X86::XMM2: ShadowReg = X86::R8; break;
2956 case X86::XMM3: ShadowReg = X86::R9; break;
2959 RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
2961 } else if (!IsSibcall && (!isTailCall || isByVal)) {
2962 assert(VA.isMemLoc());
2963 if (!StackPtr.getNode())
2964 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
2966 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
2967 dl, DAG, VA, Flags));
2971 if (!MemOpChains.empty())
2972 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
2974 if (Subtarget->isPICStyleGOT()) {
2975 // ELF / PIC requires GOT in the EBX register before function calls via PLT
2978 RegsToPass.push_back(std::make_pair(unsigned(X86::EBX),
2979 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy())));
2981 // If we are tail calling and generating PIC/GOT style code load the
2982 // address of the callee into ECX. The value in ecx is used as target of
2983 // the tail jump. This is done to circumvent the ebx/callee-saved problem
2984 // for tail calls on PIC/GOT architectures. Normally we would just put the
2985 // address of GOT into ebx and then call target@PLT. But for tail calls
2986 // ebx would be restored (since ebx is callee saved) before jumping to the
2989 // Note: The actual moving to ECX is done further down.
2990 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
2991 if (G && !G->getGlobal()->hasHiddenVisibility() &&
2992 !G->getGlobal()->hasProtectedVisibility())
2993 Callee = LowerGlobalAddress(Callee, DAG);
2994 else if (isa<ExternalSymbolSDNode>(Callee))
2995 Callee = LowerExternalSymbol(Callee, DAG);
2999 if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
3000 // From AMD64 ABI document:
3001 // For calls that may call functions that use varargs or stdargs
3002 // (prototype-less calls or calls to functions containing ellipsis (...) in
3003 // the declaration) %al is used as hidden argument to specify the number
3004 // of SSE registers used. The contents of %al do not need to match exactly
3005     // the number of registers, but must be an upper bound on the number of SSE
3006 // registers used and is in the range 0 - 8 inclusive.
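    // For example (illustrative): a varargs call passing two doubles in XMM0 and
    // XMM1 could set AL to 2; any value from 2 through 8 would also satisfy the
    // ABI, since AL only needs to be an upper bound on the SSE registers used.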
3008 // Count the number of XMM registers allocated.
3009 static const MCPhysReg XMMArgRegs[] = {
3010 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3011 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3013 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
3014 assert((Subtarget->hasSSE1() || !NumXMMRegs)
3015 && "SSE registers cannot be used when SSE is disabled");
3017 RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
3018 DAG.getConstant(NumXMMRegs, MVT::i8)));
3021 if (isVarArg && IsMustTail) {
3022 const auto &Forwards = X86Info->getForwardedMustTailRegParms();
3023 for (const auto &F : Forwards) {
3024 SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
3025 RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
3029 // For tail calls lower the arguments to the 'real' stack slots. Sibcalls
3030 // don't need this because the eligibility check rejects calls that require
3031 // shuffling arguments passed in memory.
3032 if (!IsSibcall && isTailCall) {
3033 // Force all the incoming stack arguments to be loaded from the stack
3034 // before any new outgoing arguments are stored to the stack, because the
3035 // outgoing stack slots may alias the incoming argument stack slots, and
3036 // the alias isn't otherwise explicit. This is slightly more conservative
3037 // than necessary, because it means that each store effectively depends
3038 // on every argument instead of just those arguments it would clobber.
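    // For example (illustrative): if an incoming argument lives in the stack slot
    // at [SP+8] and an outgoing argument of the tail call is stored to that same
    // slot, the load of the incoming value must be chained before the store that
    // overwrites it.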
3039 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
3041 SmallVector<SDValue, 8> MemOpChains2;
3044 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3045 CCValAssign &VA = ArgLocs[i];
3048 assert(VA.isMemLoc());
3049 SDValue Arg = OutVals[i];
3050 ISD::ArgFlagsTy Flags = Outs[i].Flags;
3051 // Skip inalloca arguments. They don't require any work.
3052 if (Flags.isInAlloca())
3054 // Create frame index.
3055 int32_t Offset = VA.getLocMemOffset()+FPDiff;
3056 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
3057 FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
3058 FIN = DAG.getFrameIndex(FI, getPointerTy());
3060 if (Flags.isByVal()) {
3061 // Copy relative to framepointer.
3062 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
3063 if (!StackPtr.getNode())
3064 StackPtr = DAG.getCopyFromReg(Chain, dl,
3065 RegInfo->getStackRegister(),
3067 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
3069 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
3073 // Store relative to framepointer.
3074 MemOpChains2.push_back(
3075 DAG.getStore(ArgChain, dl, Arg, FIN,
3076 MachinePointerInfo::getFixedStack(FI),
3081 if (!MemOpChains2.empty())
3082 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
3084 // Store the return address to the appropriate stack slot.
3085 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
3086 getPointerTy(), RegInfo->getSlotSize(),
3090 // Build a sequence of copy-to-reg nodes chained together with token chain
3091 // and flag operands which copy the outgoing args into registers.
3093 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
3094 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
3095 RegsToPass[i].second, InFlag);
3096 InFlag = Chain.getValue(1);
3099 if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
3100 assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
3101 // In the 64-bit large code model, we have to make all calls
3102 // through a register, since the call instruction's 32-bit
3103 // pc-relative offset may not be large enough to hold the whole
3105 } else if (Callee->getOpcode() == ISD::GlobalAddress) {
3106 // If the callee is a GlobalAddress node (quite common, every direct call
3107 // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
3109 GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee);
3111 // We should use extra load for direct calls to dllimported functions in
3113 const GlobalValue *GV = G->getGlobal();
3114 if (!GV->hasDLLImportStorageClass()) {
3115 unsigned char OpFlags = 0;
3116 bool ExtraLoad = false;
3117 unsigned WrapperKind = ISD::DELETED_NODE;
3119 // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
3120       // external symbols must go through the PLT in PIC mode. If the symbol
3121 // has hidden or protected visibility, or if it is static or local, then
3122 // we don't need to use the PLT - we can directly call it.
3123 if (Subtarget->isTargetELF() &&
3124 DAG.getTarget().getRelocationModel() == Reloc::PIC_ &&
3125 GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
3126 OpFlags = X86II::MO_PLT;
3127 } else if (Subtarget->isPICStyleStubAny() &&
3128 (GV->isDeclaration() || GV->isWeakForLinker()) &&
3129 (!Subtarget->getTargetTriple().isMacOSX() ||
3130 Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
3131 // PC-relative references to external symbols should go through $stub,
3132 // unless we're building with the leopard linker or later, which
3133 // automatically synthesizes these stubs.
3134 OpFlags = X86II::MO_DARWIN_STUB;
3135 } else if (Subtarget->isPICStyleRIPRel() &&
3136 isa<Function>(GV) &&
3137 cast<Function>(GV)->getAttributes().
3138 hasAttribute(AttributeSet::FunctionIndex,
3139 Attribute::NonLazyBind)) {
3140 // If the function is marked as non-lazy, generate an indirect call
3141 // which loads from the GOT directly. This avoids runtime overhead
3142 // at the cost of eager binding (and one extra byte of encoding).
3143 OpFlags = X86II::MO_GOTPCREL;
3144 WrapperKind = X86ISD::WrapperRIP;
3148 Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(),
3149 G->getOffset(), OpFlags);
3151 // Add a wrapper if needed.
3152 if (WrapperKind != ISD::DELETED_NODE)
3153 Callee = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Callee);
3154 // Add extra indirection if needed.
3156 Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee,
3157 MachinePointerInfo::getGOT(),
3158 false, false, false, 0);
3160 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
3161 unsigned char OpFlags = 0;
3163 // On ELF targets, in either X86-64 or X86-32 mode, direct calls to
3164 // external symbols should go through the PLT.
3165 if (Subtarget->isTargetELF() &&
3166 DAG.getTarget().getRelocationModel() == Reloc::PIC_) {
3167 OpFlags = X86II::MO_PLT;
3168 } else if (Subtarget->isPICStyleStubAny() &&
3169 (!Subtarget->getTargetTriple().isMacOSX() ||
3170 Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
3171 // PC-relative references to external symbols should go through $stub,
3172 // unless we're building with the leopard linker or later, which
3173 // automatically synthesizes these stubs.
3174 OpFlags = X86II::MO_DARWIN_STUB;
3177 Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
3179 } else if (Subtarget->isTarget64BitILP32() && Callee->getValueType(0) == MVT::i32) {
3180     // Zero-extend the 32-bit Callee address to 64 bits, as required by the x32 ABI
3181 Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
3184 // Returns a chain & a flag for retval copy to use.
3185 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3186 SmallVector<SDValue, 8> Ops;
3188 if (!IsSibcall && isTailCall) {
3189 Chain = DAG.getCALLSEQ_END(Chain,
3190 DAG.getIntPtrConstant(NumBytesToPop, true),
3191 DAG.getIntPtrConstant(0, true), InFlag, dl);
3192 InFlag = Chain.getValue(1);
3195 Ops.push_back(Chain);
3196 Ops.push_back(Callee);
3199 Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));
3201 // Add argument registers to the end of the list so that they are known live
3203 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
3204 Ops.push_back(DAG.getRegister(RegsToPass[i].first,
3205 RegsToPass[i].second.getValueType()));
3207 // Add a register mask operand representing the call-preserved registers.
3208 const TargetRegisterInfo *TRI = DAG.getSubtarget().getRegisterInfo();
3209 const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
3210 assert(Mask && "Missing call preserved mask for calling convention");
3211 Ops.push_back(DAG.getRegisterMask(Mask));
3213 if (InFlag.getNode())
3214 Ops.push_back(InFlag);
3218 //// If this is the first return lowered for this function, add the regs
3219 //// to the liveout set for the function.
3220 // This isn't right, although it's probably harmless on x86; liveouts
3221 // should be computed from returns not tail calls. Consider a void
3222 // function making a tail call to a function returning int.
3223 return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
3226 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
3227 InFlag = Chain.getValue(1);
3229 // Create the CALLSEQ_END node.
3230 unsigned NumBytesForCalleeToPop;
3231 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
3232 DAG.getTarget().Options.GuaranteedTailCallOpt))
3233 NumBytesForCalleeToPop = NumBytes; // Callee pops everything
3234 else if (!Is64Bit && !IsTailCallConvention(CallConv) &&
3235 !Subtarget->getTargetTriple().isOSMSVCRT() &&
3236 SR == StackStructReturn)
3237 // If this is a call to a struct-return function, the callee
3238 // pops the hidden struct pointer, so we have to push it back.
3239 // This is common for Darwin/X86, Linux & Mingw32 targets.
3240 // For MSVC Win32 targets, the caller pops the hidden struct pointer.
3241 NumBytesForCalleeToPop = 4;
3243 NumBytesForCalleeToPop = 0; // Callee pops nothing.
3245 // Returns a flag for retval copy to use.
3247 Chain = DAG.getCALLSEQ_END(Chain,
3248 DAG.getIntPtrConstant(NumBytesToPop, true),
3249 DAG.getIntPtrConstant(NumBytesForCalleeToPop,
3252 InFlag = Chain.getValue(1);
3255 // Handle result values, copying them out of physregs into vregs that we
3257 return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
3258 Ins, dl, DAG, InVals);
3261 //===----------------------------------------------------------------------===//
3262 // Fast Calling Convention (tail call) implementation
3263 //===----------------------------------------------------------------------===//
3265 //  Like the stdcall convention, the callee cleans up the arguments, except that
3266 //  ECX is reserved for storing the address of the tail-called function. Only 2
3267 //  registers are free for argument passing (inreg). Tail call optimization is performed when:
3269 // * tailcallopt is enabled
3270 // * caller/callee are fastcc
3271 // On X86_64 architecture with GOT-style position independent code only local
3272 // (within module) calls are supported at the moment.
3273 //  To keep the stack aligned according to the platform ABI, the function
3274 //  GetAlignedArgumentStackSize ensures that the argument delta is always a
3275 //  multiple of the stack alignment. (Dynamic linkers need this - darwin's dyld for example.)
3276 //  If a tail-called callee has more arguments than the caller, the caller needs
3277 //  to make sure that there is room to move the RETADDR to. This is achieved by
3278 //  reserving an area the size of the argument delta right after the original
3279 //  RETADDR, but before the saved frame pointer or the spilled registers,
3280 // e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
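//  (Illustrative, assuming 4-byte argument slots: callee needs 8 more bytes of
//  argument space than caller, so the return address is moved 8 bytes lower on
//  the stack to make room for arg3 and arg4 before the jump.)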
3292 /// GetAlignedArgumentStackSize - Align the stack size, e.g. to 16n + 12 for a
3293 /// 16-byte alignment requirement.
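/// For example (illustrative arithmetic, assuming a 16-byte stack alignment and a
/// 4-byte slot size): a stack size of 20 is rounded up to 28 and a stack size of
/// 30 is rounded up to 44, both of the form 16n + 12.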
3295 X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
3296 SelectionDAG& DAG) const {
3297 MachineFunction &MF = DAG.getMachineFunction();
3298 const TargetMachine &TM = MF.getTarget();
3299 const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
3300 TM.getSubtargetImpl()->getRegisterInfo());
3301 const TargetFrameLowering &TFI = *TM.getSubtargetImpl()->getFrameLowering();
3302 unsigned StackAlignment = TFI.getStackAlignment();
3303 uint64_t AlignMask = StackAlignment - 1;
3304 int64_t Offset = StackSize;
3305 unsigned SlotSize = RegInfo->getSlotSize();
3306 if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
3307 // Number smaller than 12 so just add the difference.
3308 Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
3310 // Mask out lower bits, add stackalignment once plus the 12 bytes.
3311 Offset = ((~AlignMask) & Offset) + StackAlignment +
3312 (StackAlignment-SlotSize);
3317 /// MatchingStackOffset - Return true if the given stack call argument is
3318 /// already available in the same position (relatively) of the caller's
3319 /// incoming argument stack.
3321 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
3322 MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
3323 const X86InstrInfo *TII) {
3324 unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
3326 if (Arg.getOpcode() == ISD::CopyFromReg) {
3327 unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
3328 if (!TargetRegisterInfo::isVirtualRegister(VR))
3330 MachineInstr *Def = MRI->getVRegDef(VR);
3333 if (!Flags.isByVal()) {
3334 if (!TII->isLoadFromStackSlot(Def, FI))
3337 unsigned Opcode = Def->getOpcode();
3338 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
3339 Opcode == X86::LEA64_32r) &&
3340 Def->getOperand(1).isFI()) {
3341 FI = Def->getOperand(1).getIndex();
3342 Bytes = Flags.getByValSize();
3346 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
3347 if (Flags.isByVal())
3348 // ByVal argument is passed in as a pointer but it's now being
3349 // dereferenced. e.g.
3350 // define @foo(%struct.X* %A) {
3351 // tail call @bar(%struct.X* byval %A)
3354 SDValue Ptr = Ld->getBasePtr();
3355 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
3358 FI = FINode->getIndex();
3359 } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
3360 FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
3361 FI = FINode->getIndex();
3362 Bytes = Flags.getByValSize();
3366 assert(FI != INT_MAX);
3367 if (!MFI->isFixedObjectIndex(FI))
3369 return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI);
3372 /// IsEligibleForTailCallOptimization - Check whether the call is eligible
3373 /// for tail call optimization. Targets which want to do tail call
3374 /// optimization should implement this function.
3376 X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
3377 CallingConv::ID CalleeCC,
3379 bool isCalleeStructRet,
3380 bool isCallerStructRet,
3382 const SmallVectorImpl<ISD::OutputArg> &Outs,
3383 const SmallVectorImpl<SDValue> &OutVals,
3384 const SmallVectorImpl<ISD::InputArg> &Ins,
3385 SelectionDAG &DAG) const {
3386 if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
3389 // If -tailcallopt is specified, make fastcc functions tail-callable.
3390 const MachineFunction &MF = DAG.getMachineFunction();
3391 const Function *CallerF = MF.getFunction();
3393 // If the function return type is x86_fp80 and the callee return type is not,
3394 // then the FP_EXTEND of the call result is not a nop. It's not safe to
3395 // perform a tailcall optimization here.
3396 if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
3399 CallingConv::ID CallerCC = CallerF->getCallingConv();
3400 bool CCMatch = CallerCC == CalleeCC;
3401 bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC);
3402 bool IsCallerWin64 = Subtarget->isCallingConvWin64(CallerCC);
3404 if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
3405 if (IsTailCallConvention(CalleeCC) && CCMatch)
3410 // Look for obvious safe cases to perform tail call optimization that do not
3411 // require ABI changes. This is what gcc calls sibcall.
3413 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
3414 // emit a special epilogue.
3415 const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
3416 DAG.getSubtarget().getRegisterInfo());
3417 if (RegInfo->needsStackRealignment(MF))
3420 // Also avoid sibcall optimization if either caller or callee uses struct
3421 // return semantics.
3422 if (isCalleeStructRet || isCallerStructRet)
3425 // An stdcall/thiscall caller is expected to clean up its arguments; the
3426 // callee isn't going to do that.
3427 // FIXME: this is more restrictive than needed. We could produce a tailcall
3428 // when the stack adjustment matches. For example, with a thiscall that takes
3429 // only one argument.
3430 if (!CCMatch && (CallerCC == CallingConv::X86_StdCall ||
3431 CallerCC == CallingConv::X86_ThisCall))
3434 // Do not sibcall optimize vararg calls unless all arguments are passed via
3436 if (isVarArg && !Outs.empty()) {
3438 // Optimizing for varargs on Win64 is unlikely to be safe without
3439 // additional testing.
3440 if (IsCalleeWin64 || IsCallerWin64)
3443 SmallVector<CCValAssign, 16> ArgLocs;
3444 CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
3447 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
3448 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
3449 if (!ArgLocs[i].isRegLoc())
3453 // If the call result is in ST0 / ST1, it needs to be popped off the x87
3454 // stack. Therefore, if it's not used by the call it is not safe to optimize
3455 // this into a sibcall.
3456 bool Unused = false;
3457 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
3464 SmallVector<CCValAssign, 16> RVLocs;
3465 CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(), RVLocs,
3467 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
3468 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
3469 CCValAssign &VA = RVLocs[i];
3470 if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
3475 // If the calling conventions do not match, then we'd better make sure the
3476 // results are returned in the same way as what the caller expects.
3478 SmallVector<CCValAssign, 16> RVLocs1;
3479 CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), RVLocs1,
3481 CCInfo1.AnalyzeCallResult(Ins, RetCC_X86);
3483 SmallVector<CCValAssign, 16> RVLocs2;
3484 CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), RVLocs2,
3486 CCInfo2.AnalyzeCallResult(Ins, RetCC_X86);
3488 if (RVLocs1.size() != RVLocs2.size())
3490 for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
3491 if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
3493 if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
3495 if (RVLocs1[i].isRegLoc()) {
3496 if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
3499 if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
3505 // If the callee takes no arguments then go on to check the results of the
3507 if (!Outs.empty()) {
3508 // Check if stack adjustment is needed. For now, do not do this if any
3509 // argument is passed on the stack.
3510 SmallVector<CCValAssign, 16> ArgLocs;
3511 CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
3514 // Allocate shadow area for Win64
3516 CCInfo.AllocateStack(32, 8);
3518 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
3519 if (CCInfo.getNextStackOffset()) {
3520 MachineFunction &MF = DAG.getMachineFunction();
3521 if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn())
3524     // Check if the arguments are already laid out in the same way as
3525     // the caller's incoming fixed stack objects.
3526 MachineFrameInfo *MFI = MF.getFrameInfo();
3527 const MachineRegisterInfo *MRI = &MF.getRegInfo();
3528 const X86InstrInfo *TII =
3529 static_cast<const X86InstrInfo *>(DAG.getSubtarget().getInstrInfo());
3530 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3531 CCValAssign &VA = ArgLocs[i];
3532 SDValue Arg = OutVals[i];
3533 ISD::ArgFlagsTy Flags = Outs[i].Flags;
3534 if (VA.getLocInfo() == CCValAssign::Indirect)
3536 if (!VA.isRegLoc()) {
3537 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
3544 // If the tailcall address may be in a register, then make sure it's
3545 // possible to register allocate for it. In 32-bit, the call address can
3546 // only target EAX, EDX, or ECX since the tail call must be scheduled after
3547 // callee-saved registers are restored. These happen to be the same
3548 // registers used to pass 'inreg' arguments so watch out for those.
3549 if (!Subtarget->is64Bit() &&
3550 ((!isa<GlobalAddressSDNode>(Callee) &&
3551 !isa<ExternalSymbolSDNode>(Callee)) ||
3552 DAG.getTarget().getRelocationModel() == Reloc::PIC_)) {
3553 unsigned NumInRegs = 0;
3554 // In PIC we need an extra register to formulate the address computation
3556 unsigned MaxInRegs =
3557 (DAG.getTarget().getRelocationModel() == Reloc::PIC_) ? 2 : 3;
3559 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3560 CCValAssign &VA = ArgLocs[i];
3563 unsigned Reg = VA.getLocReg();
3566 case X86::EAX: case X86::EDX: case X86::ECX:
3567 if (++NumInRegs == MaxInRegs)
3579 X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
3580 const TargetLibraryInfo *libInfo) const {
3581 return X86::createFastISel(funcInfo, libInfo);
3584 //===----------------------------------------------------------------------===//
3585 // Other Lowering Hooks
3586 //===----------------------------------------------------------------------===//
3588 static bool MayFoldLoad(SDValue Op) {
3589 return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
3592 static bool MayFoldIntoStore(SDValue Op) {
3593 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
3596 static bool isTargetShuffle(unsigned Opcode) {
3598 default: return false;
3599 case X86ISD::BLENDI:
3600 case X86ISD::PSHUFB:
3601 case X86ISD::PSHUFD:
3602 case X86ISD::PSHUFHW:
3603 case X86ISD::PSHUFLW:
3605 case X86ISD::PALIGNR:
3606 case X86ISD::MOVLHPS:
3607 case X86ISD::MOVLHPD:
3608 case X86ISD::MOVHLPS:
3609 case X86ISD::MOVLPS:
3610 case X86ISD::MOVLPD:
3611 case X86ISD::MOVSHDUP:
3612 case X86ISD::MOVSLDUP:
3613 case X86ISD::MOVDDUP:
3616 case X86ISD::UNPCKL:
3617 case X86ISD::UNPCKH:
3618 case X86ISD::VPERMILPI:
3619 case X86ISD::VPERM2X128:
3620 case X86ISD::VPERMI:
3625 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
3626 SDValue V1, SelectionDAG &DAG) {
3628 default: llvm_unreachable("Unknown x86 shuffle node");
3629 case X86ISD::MOVSHDUP:
3630 case X86ISD::MOVSLDUP:
3631 case X86ISD::MOVDDUP:
3632 return DAG.getNode(Opc, dl, VT, V1);
3636 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
3637 SDValue V1, unsigned TargetMask,
3638 SelectionDAG &DAG) {
3640 default: llvm_unreachable("Unknown x86 shuffle node");
3641 case X86ISD::PSHUFD:
3642 case X86ISD::PSHUFHW:
3643 case X86ISD::PSHUFLW:
3644 case X86ISD::VPERMILPI:
3645 case X86ISD::VPERMI:
3646 return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8));
3650 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
3651 SDValue V1, SDValue V2, unsigned TargetMask,
3652 SelectionDAG &DAG) {
3654 default: llvm_unreachable("Unknown x86 shuffle node");
3655 case X86ISD::PALIGNR:
3656 case X86ISD::VALIGN:
3658 case X86ISD::VPERM2X128:
3659 return DAG.getNode(Opc, dl, VT, V1, V2,
3660 DAG.getConstant(TargetMask, MVT::i8));
3664 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
3665 SDValue V1, SDValue V2, SelectionDAG &DAG) {
3667 default: llvm_unreachable("Unknown x86 shuffle node");
3668 case X86ISD::MOVLHPS:
3669 case X86ISD::MOVLHPD:
3670 case X86ISD::MOVHLPS:
3671 case X86ISD::MOVLPS:
3672 case X86ISD::MOVLPD:
3675 case X86ISD::UNPCKL:
3676 case X86ISD::UNPCKH:
3677 return DAG.getNode(Opc, dl, VT, V1, V2);
3681 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
3682 MachineFunction &MF = DAG.getMachineFunction();
3683 const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
3684 DAG.getSubtarget().getRegisterInfo());
3685 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
3686 int ReturnAddrIndex = FuncInfo->getRAIndex();
3688 if (ReturnAddrIndex == 0) {
3689 // Set up a frame object for the return address.
3690 unsigned SlotSize = RegInfo->getSlotSize();
3691 ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize,
3694 FuncInfo->setRAIndex(ReturnAddrIndex);
3697 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
3700 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
3701 bool hasSymbolicDisplacement) {
3702 // Offset should fit into 32 bit immediate field.
3703 if (!isInt<32>(Offset))
3706 // If we don't have a symbolic displacement - we don't have any extra
3708 if (!hasSymbolicDisplacement)
3711 // FIXME: Some tweaks might be needed for medium code model.
3712 if (M != CodeModel::Small && M != CodeModel::Kernel)
3715   // For the small code model we assume that the latest object is within 16MB of
3716   // the end of the 31-bit boundary. We may also accept pretty large negative
3717   // constants, knowing that all objects are in the positive half of the address space.
3718 if (M == CodeModel::Small && Offset < 16*1024*1024)
3721   // For the kernel code model we know that all objects reside in the negative half
3722   // of the 32-bit address space. We do not accept negative offsets, since they may
3723   // push an object out of that range, but we may accept pretty large positive ones.
3724 if (M == CodeModel::Kernel && Offset >= 0)
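  // For example (illustrative): under the small code model a symbolic offset of
  // 8 MiB is accepted (it is below the 16MB bound above), while the kernel code
  // model rejects any negative offset.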
3730 /// isCalleePop - Determines whether the callee is required to pop its
3731 /// own arguments. Callee pop is necessary to support tail calls.
3732 bool X86::isCalleePop(CallingConv::ID CallingConv,
3733 bool is64Bit, bool IsVarArg, bool TailCallOpt) {
3734 switch (CallingConv) {
3737 case CallingConv::X86_StdCall:
3738 case CallingConv::X86_FastCall:
3739 case CallingConv::X86_ThisCall:
3741 case CallingConv::Fast:
3742 case CallingConv::GHC:
3743 case CallingConv::HiPE:
3750 /// \brief Return true if the condition is an unsigned comparison operation.
3751 static bool isX86CCUnsigned(unsigned X86CC) {
3753 default: llvm_unreachable("Invalid integer condition!");
3754 case X86::COND_E: return true;
3755 case X86::COND_G: return false;
3756 case X86::COND_GE: return false;
3757 case X86::COND_L: return false;
3758 case X86::COND_LE: return false;
3759 case X86::COND_NE: return true;
3760 case X86::COND_B: return true;
3761 case X86::COND_A: return true;
3762 case X86::COND_BE: return true;
3763 case X86::COND_AE: return true;
3765 llvm_unreachable("covered switch fell through?!");
3768 /// TranslateX86CC - do a one-to-one translation of an ISD::CondCode to the X86
3769 /// specific condition code, returning the condition code and the LHS/RHS of the
3770 /// comparison to make.
3771 static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
3772 SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
3774 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
3775 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
3776 // X > -1 -> X == 0, jump !sign.
3777 RHS = DAG.getConstant(0, RHS.getValueType());
3778 return X86::COND_NS;
3780 if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
3781 // X < 0 -> X == 0, jump on sign.
3784 if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
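      // X < 1 -> X <= 0: rewrite as a comparison against zero using COND_LE.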
3786 RHS = DAG.getConstant(0, RHS.getValueType());
3787 return X86::COND_LE;
3791 switch (SetCCOpcode) {
3792 default: llvm_unreachable("Invalid integer condition!");
3793 case ISD::SETEQ: return X86::COND_E;
3794 case ISD::SETGT: return X86::COND_G;
3795 case ISD::SETGE: return X86::COND_GE;
3796 case ISD::SETLT: return X86::COND_L;
3797 case ISD::SETLE: return X86::COND_LE;
3798 case ISD::SETNE: return X86::COND_NE;
3799 case ISD::SETULT: return X86::COND_B;
3800 case ISD::SETUGT: return X86::COND_A;
3801 case ISD::SETULE: return X86::COND_BE;
3802 case ISD::SETUGE: return X86::COND_AE;
3806 // First determine if it is required or is profitable to flip the operands.
3808 // If LHS is a foldable load, but RHS is not, flip the condition.
3809 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
3810 !ISD::isNON_EXTLoad(RHS.getNode())) {
3811 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
3812 std::swap(LHS, RHS);
3815 switch (SetCCOpcode) {
3821 std::swap(LHS, RHS);
3825   // On a floating point condition, the flags are set as follows:
   //  ZF | PF | CF | op
3827 // 0 | 0 | 0 | X > Y
3828 // 0 | 0 | 1 | X < Y
3829 // 1 | 0 | 0 | X == Y
3830 // 1 | 1 | 1 | unordered
3831 switch (SetCCOpcode) {
3832 default: llvm_unreachable("Condcode should be pre-legalized away");
3834 case ISD::SETEQ: return X86::COND_E;
3835 case ISD::SETOLT: // flipped
3837 case ISD::SETGT: return X86::COND_A;
3838 case ISD::SETOLE: // flipped
3840 case ISD::SETGE: return X86::COND_AE;
3841 case ISD::SETUGT: // flipped
3843 case ISD::SETLT: return X86::COND_B;
3844 case ISD::SETUGE: // flipped
3846 case ISD::SETLE: return X86::COND_BE;
3848 case ISD::SETNE: return X86::COND_NE;
3849 case ISD::SETUO: return X86::COND_P;
3850 case ISD::SETO: return X86::COND_NP;
3852 case ISD::SETUNE: return X86::COND_INVALID;
3856 /// hasFPCMov - is there a floating point cmov for the specific X86 condition
3857 /// code. The current x86 ISA includes the following FP cmov instructions:
3858 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
3859 static bool hasFPCMov(unsigned X86CC) {
3875 /// isFPImmLegal - Returns true if the target can instruction select the
3876 /// specified FP immediate natively. If false, the legalizer will
3877 /// materialize the FP immediate as a load from a constant pool.
3878 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
3879 for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
3880 if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
3886 bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
3887 ISD::LoadExtType ExtTy,
3889 // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
3890   // relocations must target a movq or addq instruction: don't let the load shrink.
3891 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
3892 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
3893 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
3894 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
3898 /// \brief Returns true if it is beneficial to convert a load of a constant
3899 /// to just the constant itself.
3900 bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
3902 assert(Ty->isIntegerTy());
3904 unsigned BitSize = Ty->getPrimitiveSizeInBits();
3905 if (BitSize == 0 || BitSize > 64)
3910 bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT,
3911 unsigned Index) const {
3912 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
3915 return (Index == 0 || Index == ResVT.getVectorNumElements());
3918 bool X86TargetLowering::isCheapToSpeculateCttz() const {
3919 // Speculate cttz only if we can directly use TZCNT.
3920 return Subtarget->hasBMI();
3923 bool X86TargetLowering::isCheapToSpeculateCtlz() const {
3924 // Speculate ctlz only if we can directly use LZCNT.
3925 return Subtarget->hasLZCNT();
3928 /// isUndefOrInRange - Return true if Val is undef or if its value falls within
3929 /// the specified range [Low, Hi).
3930 static bool isUndefOrInRange(int Val, int Low, int Hi) {
3931 return (Val < 0) || (Val >= Low && Val < Hi);
3934 /// isUndefOrEqual - Val is either less than zero (undef) or equal to the
3935 /// specified value.
3936 static bool isUndefOrEqual(int Val, int CmpVal) {
3937 return (Val < 0 || Val == CmpVal);
3940 /// isSequentialOrUndefInRange - Return true if every element in Mask, beginning
3941 /// at position Pos and ending at Pos+Size, falls within the specified
3942 /// sequential range [Low, Low+Size) or is undef.
3943 static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
3944 unsigned Pos, unsigned Size, int Low) {
3945 for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
3946 if (!isUndefOrEqual(Mask[i], Low))
3951 /// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
3952 /// is suitable for input to PSHUFD. That is, it doesn't reference the other
3953 /// operand - by default it matches against the first operand.
3954 static bool isPSHUFDMask(ArrayRef<int> Mask, MVT VT,
3955 bool TestSecondOperand = false) {
3956 if (VT != MVT::v4f32 && VT != MVT::v4i32 &&
3957 VT != MVT::v2f64 && VT != MVT::v2i64)
3960 unsigned NumElems = VT.getVectorNumElements();
3961 unsigned Lo = TestSecondOperand ? NumElems : 0;
3962 unsigned Hi = Lo + NumElems;
3964 for (unsigned i = 0; i < NumElems; ++i)
3965 if (!isUndefOrInRange(Mask[i], (int)Lo, (int)Hi))
3971 /// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that
3972 /// is suitable for input to PSHUFHW.
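/// For example (illustrative), for v8i16 the mask <0, 1, 2, 3, 7, 6, 5, 4> is a
/// valid PSHUFHW mask: the low quadword is copied in order and only the high
/// quadword is shuffled.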
3973 static bool isPSHUFHWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
3974 if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
3977 // Lower quadword copied in order or undef.
3978 if (!isSequentialOrUndefInRange(Mask, 0, 4, 0))
3981 // Upper quadword shuffled.
3982 for (unsigned i = 4; i != 8; ++i)
3983 if (!isUndefOrInRange(Mask[i], 4, 8))
3986 if (VT == MVT::v16i16) {
3987 // Lower quadword copied in order or undef.
3988 if (!isSequentialOrUndefInRange(Mask, 8, 4, 8))
3991 // Upper quadword shuffled.
3992 for (unsigned i = 12; i != 16; ++i)
3993 if (!isUndefOrInRange(Mask[i], 12, 16))
4000 /// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that
4001 /// is suitable for input to PSHUFLW.
4002 static bool isPSHUFLWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
4003 if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
4006 // Upper quadword copied in order.
4007 if (!isSequentialOrUndefInRange(Mask, 4, 4, 4))
4010 // Lower quadword shuffled.
4011 for (unsigned i = 0; i != 4; ++i)
4012 if (!isUndefOrInRange(Mask[i], 0, 4))
4015 if (VT == MVT::v16i16) {
4016 // Upper quadword copied in order.
4017 if (!isSequentialOrUndefInRange(Mask, 12, 4, 12))
4020 // Lower quadword shuffled.
4021 for (unsigned i = 8; i != 12; ++i)
4022 if (!isUndefOrInRange(Mask[i], 8, 12))
4029 /// \brief Return true if the mask specifies a shuffle of elements that is
4030 /// suitable for input to intralane (palignr) or interlane (valign) vector
4032 static bool isAlignrMask(ArrayRef<int> Mask, MVT VT, bool InterLane) {
4033 unsigned NumElts = VT.getVectorNumElements();
4034 unsigned NumLanes = InterLane ? 1: VT.getSizeInBits()/128;
4035 unsigned NumLaneElts = NumElts/NumLanes;
4037 // Do not handle 64-bit element shuffles with palignr.
4038 if (NumLaneElts == 2)
4041 for (unsigned l = 0; l != NumElts; l+=NumLaneElts) {
4043 for (i = 0; i != NumLaneElts; ++i) {
4048 // Lane is all undef, go to next lane
4049 if (i == NumLaneElts)
4052 int Start = Mask[i+l];
4054     // Make sure it's in this lane in one of the sources
4055 if (!isUndefOrInRange(Start, l, l+NumLaneElts) &&
4056 !isUndefOrInRange(Start, l+NumElts, l+NumElts+NumLaneElts))
4059 // If not lane 0, then we must match lane 0
4060 if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Start, Mask[i]+l))
4063 // Correct second source to be contiguous with first source
4064 if (Start >= (int)NumElts)
4065 Start -= NumElts - NumLaneElts;
4067 // Make sure we're shifting in the right direction.
4068 if (Start <= (int)(i+l))
4073 // Check the rest of the elements to see if they are consecutive.
4074 for (++i; i != NumLaneElts; ++i) {
4075 int Idx = Mask[i+l];
4077       // Make sure it's in this lane
4078 if (!isUndefOrInRange(Idx, l, l+NumLaneElts) &&
4079 !isUndefOrInRange(Idx, l+NumElts, l+NumElts+NumLaneElts))
4082 // If not lane 0, then we must match lane 0
4083 if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Idx, Mask[i]+l))
4086 if (Idx >= (int)NumElts)
4087 Idx -= NumElts - NumLaneElts;
4089 if (!isUndefOrEqual(Idx, Start+i))
4098 /// \brief Return true if the node specifies a shuffle of elements that is
4099 /// suitable for input to PALIGNR.
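/// For example (illustrative), for v8i16 the mask <4, 5, 6, 7, 8, 9, 10, 11>
/// (the high half of the first vector followed by the low half of the second)
/// can be matched to a PALIGNR with a byte shift of 8, with the operand order
/// chosen by the lowering code.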
4100 static bool isPALIGNRMask(ArrayRef<int> Mask, MVT VT,
4101 const X86Subtarget *Subtarget) {
4102 if ((VT.is128BitVector() && !Subtarget->hasSSSE3()) ||
4103 (VT.is256BitVector() && !Subtarget->hasInt256()) ||
4104 VT.is512BitVector())
4105 // FIXME: Add AVX512BW.
4108 return isAlignrMask(Mask, VT, false);
4111 /// \brief Return true if the node specifies a shuffle of elements that is
4112 /// suitable for input to VALIGN.
4113 static bool isVALIGNMask(ArrayRef<int> Mask, MVT VT,
4114 const X86Subtarget *Subtarget) {
4115 // FIXME: Add AVX512VL.
4116 if (!VT.is512BitVector() || !Subtarget->hasAVX512())
4118 return isAlignrMask(Mask, VT, true);
4121 /// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
4122 /// the two vector operands have swapped position.
4123 static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask,
4124 unsigned NumElems) {
4125 for (unsigned i = 0; i != NumElems; ++i) {
4129 else if (idx < (int)NumElems)
4130 Mask[i] = idx + NumElems;
4132 Mask[i] = idx - NumElems;
4136 /// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
4137 /// specifies a shuffle of elements that is suitable for input to 128/256-bit
4138 /// SHUFPS and SHUFPD. If Commuted is true, then it checks for the sources to be
4139 /// in the reverse order of what x86 shuffles want.
4140 static bool isSHUFPMask(ArrayRef<int> Mask, MVT VT, bool Commuted = false) {
4142 unsigned NumElems = VT.getVectorNumElements();
4143 unsigned NumLanes = VT.getSizeInBits()/128;
4144 unsigned NumLaneElems = NumElems/NumLanes;
4146 if (NumLaneElems != 2 && NumLaneElems != 4)
4149 unsigned EltSize = VT.getVectorElementType().getSizeInBits();
4150 bool symetricMaskRequired =
4151 (VT.getSizeInBits() >= 256) && (EltSize == 32);
4153 // VSHUFPSY divides the resulting vector into 4 chunks.
4154   // The sources are also split into 4 chunks, and each destination
4155 // chunk must come from a different source chunk.
4157 // SRC1 => X7 X6 X5 X4 X3 X2 X1 X0
4158   //  SRC2 =>   Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0
4160 // DST => Y7..Y4, Y7..Y4, X7..X4, X7..X4,
4161 // Y3..Y0, Y3..Y0, X3..X0, X3..X0
4163 // VSHUFPDY divides the resulting vector into 4 chunks.
4164   // The sources are also split into 4 chunks, and each destination
4165 // chunk must come from a different source chunk.
4167 // SRC1 => X3 X2 X1 X0
4168 // SRC2 => Y3 Y2 Y1 Y0
4170 // DST => Y3..Y2, X3..X2, Y1..Y0, X1..X0
4172 SmallVector<int, 4> MaskVal(NumLaneElems, -1);
4173 unsigned HalfLaneElems = NumLaneElems/2;
4174 for (unsigned l = 0; l != NumElems; l += NumLaneElems) {
4175 for (unsigned i = 0; i != NumLaneElems; ++i) {
4176 int Idx = Mask[i+l];
4177 unsigned RngStart = l + ((Commuted == (i<HalfLaneElems)) ? NumElems : 0);
4178 if (!isUndefOrInRange(Idx, RngStart, RngStart+NumLaneElems))
4180 // For VSHUFPSY, the mask of the second half must be the same as the
4181 // first but with the appropriate offsets. This works in the same way as
4182 // VPERMILPS works with masks.
4183 if (!symetricMaskRequired || Idx < 0)
4185 if (MaskVal[i] < 0) {
4186 MaskVal[i] = Idx - l;
4189 if ((signed)(Idx - l) != MaskVal[i])
4197 /// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
4198 /// specifies a shuffle of elements that is suitable for input to MOVHLPS.
4199 static bool isMOVHLPSMask(ArrayRef<int> Mask, MVT VT) {
4200 if (!VT.is128BitVector())
4203 unsigned NumElems = VT.getVectorNumElements();
4208 // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3
4209 return isUndefOrEqual(Mask[0], 6) &&
4210 isUndefOrEqual(Mask[1], 7) &&
4211 isUndefOrEqual(Mask[2], 2) &&
4212 isUndefOrEqual(Mask[3], 3);
4215 /// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
4216 /// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
4218 static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, MVT VT) {
4219 if (!VT.is128BitVector())
4222 unsigned NumElems = VT.getVectorNumElements();
4227 return isUndefOrEqual(Mask[0], 2) &&
4228 isUndefOrEqual(Mask[1], 3) &&
4229 isUndefOrEqual(Mask[2], 2) &&
4230 isUndefOrEqual(Mask[3], 3);
4233 /// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
4234 /// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
4235 static bool isMOVLPMask(ArrayRef<int> Mask, MVT VT) {
4236 if (!VT.is128BitVector())
4239 unsigned NumElems = VT.getVectorNumElements();
4241 if (NumElems != 2 && NumElems != 4)
4244 for (unsigned i = 0, e = NumElems/2; i != e; ++i)
4245 if (!isUndefOrEqual(Mask[i], i + NumElems))
4248 for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
4249 if (!isUndefOrEqual(Mask[i], i))
4255 /// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand
4256 /// specifies a shuffle of elements that is suitable for input to MOVLHPS.
4257 static bool isMOVLHPSMask(ArrayRef<int> Mask, MVT VT) {
4258 if (!VT.is128BitVector())
4261 unsigned NumElems = VT.getVectorNumElements();
4263 if (NumElems != 2 && NumElems != 4)
4266 for (unsigned i = 0, e = NumElems/2; i != e; ++i)
4267 if (!isUndefOrEqual(Mask[i], i))
4270 for (unsigned i = 0, e = NumElems/2; i != e; ++i)
4271 if (!isUndefOrEqual(Mask[i + e], i + NumElems))
4277 /// isINSERTPSMask - Return true if the specified VECTOR_SHUFFLE operand
4278 /// specifies a shuffle of elements that is suitable for input to INSERTPS.
4279 /// i.e. all but one element come from the same vector.
4280 static bool isINSERTPSMask(ArrayRef<int> Mask, MVT VT) {
4281 // TODO: Deal with AVX's VINSERTPS
4282 if (!VT.is128BitVector() || (VT != MVT::v4f32 && VT != MVT::v4i32))
4285 unsigned CorrectPosV1 = 0;
4286 unsigned CorrectPosV2 = 0;
4287 for (int i = 0, e = (int)VT.getVectorNumElements(); i != e; ++i) {
4288 if (Mask[i] == -1) {
4296 else if (Mask[i] == i + 4)
4300 if (CorrectPosV1 == 3 || CorrectPosV2 == 3)
4301 // We have 3 elements (undefs count as elements from any vector) from one
4302 // vector, and one from another.
4309 // Some special combinations that can be optimized.
4312 SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp,
4313 SelectionDAG &DAG) {
4314 MVT VT = SVOp->getSimpleValueType(0);
4317 if (VT != MVT::v8i32 && VT != MVT::v8f32)
4320 ArrayRef<int> Mask = SVOp->getMask();
4322 // These are the special masks that may be optimized.
4323 static const int MaskToOptimizeEven[] = {0, 8, 2, 10, 4, 12, 6, 14};
4324 static const int MaskToOptimizeOdd[] = {1, 9, 3, 11, 5, 13, 7, 15};
4325 bool MatchEvenMask = true;
4326 bool MatchOddMask = true;
4327 for (int i=0; i<8; ++i) {
4328 if (!isUndefOrEqual(Mask[i], MaskToOptimizeEven[i]))
4329 MatchEvenMask = false;
4330 if (!isUndefOrEqual(Mask[i], MaskToOptimizeOdd[i]))
4331 MatchOddMask = false;
4334 if (!MatchEvenMask && !MatchOddMask)
4337 SDValue UndefNode = DAG.getNode(ISD::UNDEF, dl, VT);
4339 SDValue Op0 = SVOp->getOperand(0);
4340 SDValue Op1 = SVOp->getOperand(1);
4342 if (MatchEvenMask) {
4344     // Shift the second operand right by 32 bits.
4344 static const int ShiftRightMask[] = {-1, 0, -1, 2, -1, 4, -1, 6 };
4345 Op1 = DAG.getVectorShuffle(VT, dl, Op1, UndefNode, ShiftRightMask);
4348     // Shift the first operand left by 32 bits.
4348 static const int ShiftLeftMask[] = {1, -1, 3, -1, 5, -1, 7, -1 };
4349 Op0 = DAG.getVectorShuffle(VT, dl, Op0, UndefNode, ShiftLeftMask);
4351 static const int BlendMask[] = {0, 9, 2, 11, 4, 13, 6, 15};
4352 return DAG.getVectorShuffle(VT, dl, Op0, Op1, BlendMask);
4355 /// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
4356 /// specifies a shuffle of elements that is suitable for input to UNPCKL.
4357 static bool isUNPCKLMask(ArrayRef<int> Mask, MVT VT,
4358 bool HasInt256, bool V2IsSplat = false) {
4360 assert(VT.getSizeInBits() >= 128 &&
4361 "Unsupported vector type for unpckl");
4363 unsigned NumElts = VT.getVectorNumElements();
4364 if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
4365 (!HasInt256 || (NumElts != 16 && NumElts != 32)))
4368 assert((!VT.is512BitVector() || VT.getScalarType().getSizeInBits() >= 32) &&
4369 "Unsupported vector type for unpckh");
4371 // AVX defines UNPCK* to operate independently on 128-bit lanes.
4372 unsigned NumLanes = VT.getSizeInBits()/128;
4373 unsigned NumLaneElts = NumElts/NumLanes;
4375 for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
4376 for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
4377 int BitI = Mask[l+i];
4378 int BitI1 = Mask[l+i+1];
4379 if (!isUndefOrEqual(BitI, j))
4382 if (!isUndefOrEqual(BitI1, NumElts))
4385 if (!isUndefOrEqual(BitI1, j + NumElts))
4394 /// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
4395 /// specifies a shuffle of elements that is suitable for input to UNPCKH.
4396 static bool isUNPCKHMask(ArrayRef<int> Mask, MVT VT,
4397 bool HasInt256, bool V2IsSplat = false) {
4398 assert(VT.getSizeInBits() >= 128 &&
4399 "Unsupported vector type for unpckh");
4401 unsigned NumElts = VT.getVectorNumElements();
4402 if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
4403 (!HasInt256 || (NumElts != 16 && NumElts != 32)))
4406 assert((!VT.is512BitVector() || VT.getScalarType().getSizeInBits() >= 32) &&
4407 "Unsupported vector type for unpckh");
4409 // AVX defines UNPCK* to operate independently on 128-bit lanes.
4410 unsigned NumLanes = VT.getSizeInBits()/128;
4411 unsigned NumLaneElts = NumElts/NumLanes;
4413 for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
4414 for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
4415 int BitI = Mask[l+i];
4416 int BitI1 = Mask[l+i+1];
4417 if (!isUndefOrEqual(BitI, j))
4420 if (isUndefOrEqual(BitI1, NumElts))
4423 if (!isUndefOrEqual(BitI1, j+NumElts))
4431 /// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
4432 /// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
4434 static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
4435 unsigned NumElts = VT.getVectorNumElements();
4436 bool Is256BitVec = VT.is256BitVector();
4438 if (VT.is512BitVector())
4440 assert((VT.is128BitVector() || VT.is256BitVector()) &&
4441 "Unsupported vector type for unpckh");
4443 if (Is256BitVec && NumElts != 4 && NumElts != 8 &&
4444 (!HasInt256 || (NumElts != 16 && NumElts != 32)))
4447 // For 256-bit i64/f64, use MOVDDUPY instead, so reject the matching pattern
4448 // FIXME: Need a better way to get rid of this, there's no latency difference
4449   // between UNPCKLPD and MOVDDUP; the latter should always be checked first and
4450 // the former later. We should also remove the "_undef" special mask.
4451 if (NumElts == 4 && Is256BitVec)
4454 // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
4455 // independently on 128-bit lanes.
4456 unsigned NumLanes = VT.getSizeInBits()/128;
4457 unsigned NumLaneElts = NumElts/NumLanes;
4459 for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
4460 for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
4461 int BitI = Mask[l+i];
4462 int BitI1 = Mask[l+i+1];
4464 if (!isUndefOrEqual(BitI, j))
4466 if (!isUndefOrEqual(BitI1, j))
4474 /// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
4475 /// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
4477 static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
4478 unsigned NumElts = VT.getVectorNumElements();
4480 if (VT.is512BitVector())
4483 assert((VT.is128BitVector() || VT.is256BitVector()) &&
4484 "Unsupported vector type for unpckh");
4486 if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
4487 (!HasInt256 || (NumElts != 16 && NumElts != 32)))
4490 // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
4491 // independently on 128-bit lanes.
4492 unsigned NumLanes = VT.getSizeInBits()/128;
4493 unsigned NumLaneElts = NumElts/NumLanes;
4495 for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
4496 for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
4497 int BitI = Mask[l+i];
4498 int BitI1 = Mask[l+i+1];
4499 if (!isUndefOrEqual(BitI, j))
4501 if (!isUndefOrEqual(BitI1, j))
4508 // Match for INSERTI64x4/INSERTF64x4 instructions (src0[0], src1[0]) or
4509 // (src1[0], src0[1]); these manipulate 256-bit sub-vectors.
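// For example (illustrative): for v16f32 the mask
// <0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23> keeps the low 256-bit
// half of the first source and inserts the low half of the second source as the
// upper half.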
4510 static bool isINSERT64x4Mask(ArrayRef<int> Mask, MVT VT, unsigned int *Imm) {
4511 if (!VT.is512BitVector())
4514 unsigned NumElts = VT.getVectorNumElements();
4515 unsigned HalfSize = NumElts/2;
4516 if (isSequentialOrUndefInRange(Mask, 0, HalfSize, 0)) {
4517 if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, NumElts)) {
4522 if (isSequentialOrUndefInRange(Mask, 0, HalfSize, NumElts)) {
4523 if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, HalfSize)) {
4531 /// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
4532 /// specifies a shuffle of elements that is suitable for input to MOVSS,
4533 /// MOVSD, and MOVD, i.e. setting the lowest element.
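/// For example (illustrative), for v4i32 the mask <4, 1, 2, 3> takes the lowest
/// element from the second vector and the remaining elements from the first,
/// which is the MOVSS pattern.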
4534 static bool isMOVLMask(ArrayRef<int> Mask, EVT VT) {
4535 if (VT.getVectorElementType().getSizeInBits() < 32)
4537 if (!VT.is128BitVector())
4540 unsigned NumElts = VT.getVectorNumElements();
4542 if (!isUndefOrEqual(Mask[0], NumElts))
4545 for (unsigned i = 1; i != NumElts; ++i)
4546 if (!isUndefOrEqual(Mask[i], i))
4552 /// isVPERM2X128Mask - Match 256-bit shuffles where the elements are considered
4553 /// as permutations between 128-bit chunks or halves. As an example: this
4555 /// vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15>
4556 /// The first half comes from the second half of V1 and the second half from the
4557 /// second half of V2.
4558 static bool isVPERM2X128Mask(ArrayRef<int> Mask, MVT VT, bool HasFp256) {
4559 if (!HasFp256 || !VT.is256BitVector())
4562 // The shuffle result is divided into half A and half B. In total the two
4563 // sources have 4 halves, namely: C, D, E, F. The final values of A and
4564 // B must come from C, D, E or F.
4565 unsigned HalfSize = VT.getVectorNumElements()/2;
4566 bool MatchA = false, MatchB = false;
4568 // Check if A comes from one of C, D, E, F.
4569 for (unsigned Half = 0; Half != 4; ++Half) {
4570 if (isSequentialOrUndefInRange(Mask, 0, HalfSize, Half*HalfSize)) {
4576 // Check if B comes from one of C, D, E, F.
4577 for (unsigned Half = 0; Half != 4; ++Half) {
4578 if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, Half*HalfSize)) {
4584 return MatchA && MatchB;
4587 /// getShuffleVPERM2X128Immediate - Return the appropriate immediate to shuffle
4588 /// the specified VECTOR_MASK mask with VPERM2F128/VPERM2I128 instructions.
4589 static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) {
4590 MVT VT = SVOp->getSimpleValueType(0);
4592 unsigned HalfSize = VT.getVectorNumElements()/2;
4594 unsigned FstHalf = 0, SndHalf = 0;
4595 for (unsigned i = 0; i < HalfSize; ++i) {
4596 if (SVOp->getMaskElt(i) > 0) {
4597 FstHalf = SVOp->getMaskElt(i)/HalfSize;
4601 for (unsigned i = HalfSize; i < HalfSize*2; ++i) {
4602 if (SVOp->getMaskElt(i) > 0) {
4603 SndHalf = SVOp->getMaskElt(i)/HalfSize;
4608 return (FstHalf | (SndHalf << 4));
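// For example (illustrative): the v8i32 mask <4, 5, 6, 7, 12, 13, 14, 15> used in
// the comment above gives FstHalf = 1 and SndHalf = 3, i.e. an immediate of 0x31.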
4611 // Symmetric in-lane mask. Each lane has 4 elements (for imm8)
4612 static bool isPermImmMask(ArrayRef<int> Mask, MVT VT, unsigned& Imm8) {
4613 unsigned EltSize = VT.getVectorElementType().getSizeInBits();
4617 unsigned NumElts = VT.getVectorNumElements();
4619 if (VT.is128BitVector() || (VT.is256BitVector() && EltSize == 64)) {
4620 for (unsigned i = 0; i != NumElts; ++i) {
4623 Imm8 |= Mask[i] << (i*2);
4628 unsigned LaneSize = 4;
4629 SmallVector<int, 4> MaskVal(LaneSize, -1);
4631 for (unsigned l = 0; l != NumElts; l += LaneSize) {
4632 for (unsigned i = 0; i != LaneSize; ++i) {
4633 if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
4637 if (MaskVal[i] < 0) {
4638 MaskVal[i] = Mask[i+l] - l;
4639 Imm8 |= MaskVal[i] << (i*2);
4642 if (Mask[i+l] != (signed)(MaskVal[i]+l))
4649 /// isVPERMILPMask - Return true if the specified VECTOR_SHUFFLE operand
4650 /// specifies a shuffle of elements that is suitable for input to VPERMILPD*.
4651 /// Note that VPERMIL mask matching differs depending on whether the underlying
4652 /// element type is 32 or 64 bits. For VPERMILPS the high half of the mask must
4653 /// use the same in-lane indices as the low half, but refer to the higher half
4654 /// of the source. For VPERMILPD the two lanes can be shuffled independently,
4655 /// with the restriction that lanes can't be crossed. Also handles PSHUFDY.
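/// For example, the v8f32 mask <1, 0, 3, 2, 5, 4, 7, 6> is accepted because both
/// 128-bit lanes use the same in-lane pattern, while for v4f64 each lane may use
/// its own in-lane pattern, e.g. <1, 0, 2, 3>.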
4656 static bool isVPERMILPMask(ArrayRef<int> Mask, MVT VT) {
4657 unsigned EltSize = VT.getVectorElementType().getSizeInBits();
4658 if (VT.getSizeInBits() < 256 || EltSize < 32)
4660 bool symetricMaskRequired = (EltSize == 32);
4661 unsigned NumElts = VT.getVectorNumElements();
4663 unsigned NumLanes = VT.getSizeInBits()/128;
4664 unsigned LaneSize = NumElts/NumLanes;
4665 // 2 or 4 elements in one lane
4667 SmallVector<int, 4> ExpectedMaskVal(LaneSize, -1);
4668 for (unsigned l = 0; l != NumElts; l += LaneSize) {
4669 for (unsigned i = 0; i != LaneSize; ++i) {
4670 if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
4672 if (symetricMaskRequired) {
4673 if (ExpectedMaskVal[i] < 0 && Mask[i+l] >= 0) {
4674 ExpectedMaskVal[i] = Mask[i+l] - l;
4677 if (!isUndefOrEqual(Mask[i+l], ExpectedMaskVal[i]+l))
4685 /// isCommutedMOVLMask - Returns true if the shuffle mask is the reverse of
4686 /// what x86 movss wants: movss requires the lowest element to be the lowest
4687 /// element of vector 2 and the other elements to come from vector 1 in order.
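/// For example, for v4i32 the mask <0, 5, 6, 7> matches: element 0 is the lowest
/// element of vector 1 and elements 1-3 come from vector 2 in order.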
4688 static bool isCommutedMOVLMask(ArrayRef<int> Mask, MVT VT,
4689 bool V2IsSplat = false, bool V2IsUndef = false) {
4690 if (!VT.is128BitVector())
4693 unsigned NumOps = VT.getVectorNumElements();
4694 if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
4697 if (!isUndefOrEqual(Mask[0], 0))
4700 for (unsigned i = 1; i != NumOps; ++i)
4701 if (!(isUndefOrEqual(Mask[i], i+NumOps) ||
4702 (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) ||
4703 (V2IsSplat && isUndefOrEqual(Mask[i], NumOps))))
4709 /// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
4710 /// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
4711 /// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7>
4712 static bool isMOVSHDUPMask(ArrayRef<int> Mask, MVT VT,
4713 const X86Subtarget *Subtarget) {
4714 if (!Subtarget->hasSSE3())
4717 unsigned NumElems = VT.getVectorNumElements();
4719 if ((VT.is128BitVector() && NumElems != 4) ||
4720 (VT.is256BitVector() && NumElems != 8) ||
4721 (VT.is512BitVector() && NumElems != 16))
4724 // "i+1" is the value the indexed mask element must have
4725 for (unsigned i = 0; i != NumElems; i += 2)
4726 if (!isUndefOrEqual(Mask[i], i+1) ||
4727 !isUndefOrEqual(Mask[i+1], i+1))
4733 /// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
4734 /// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
4735 /// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6>
4736 static bool isMOVSLDUPMask(ArrayRef<int> Mask, MVT VT,
4737 const X86Subtarget *Subtarget) {
4738 if (!Subtarget->hasSSE3())
4741 unsigned NumElems = VT.getVectorNumElements();
4743 if ((VT.is128BitVector() && NumElems != 4) ||
4744 (VT.is256BitVector() && NumElems != 8) ||
4745 (VT.is512BitVector() && NumElems != 16))
4748 // "i" is the value the indexed mask element must have
4749 for (unsigned i = 0; i != NumElems; i += 2)
4750 if (!isUndefOrEqual(Mask[i], i) ||
4751 !isUndefOrEqual(Mask[i+1], i))
4757 /// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand
4758 /// specifies a shuffle of elements that is suitable for input to the 256-bit
4759 /// version of MOVDDUP.
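/// For example, for v4f64 the mask <0, 0, 2, 2> matches.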
4760 static bool isMOVDDUPYMask(ArrayRef<int> Mask, MVT VT, bool HasFp256) {
4761 if (!HasFp256 || !VT.is256BitVector())
4764 unsigned NumElts = VT.getVectorNumElements();
4768 for (unsigned i = 0; i != NumElts/2; ++i)
4769 if (!isUndefOrEqual(Mask[i], 0))
4771 for (unsigned i = NumElts/2; i != NumElts; ++i)
4772 if (!isUndefOrEqual(Mask[i], NumElts/2))
4777 /// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand
4778 /// specifies a shuffle of elements that is suitable for input to the 128-bit
4779 /// version of MOVDDUP.
4780 static bool isMOVDDUPMask(ArrayRef<int> Mask, MVT VT) {
4781 if (!VT.is128BitVector())
4784 unsigned e = VT.getVectorNumElements() / 2;
4785 for (unsigned i = 0; i != e; ++i)
4786 if (!isUndefOrEqual(Mask[i], i))
4788 for (unsigned i = 0; i != e; ++i)
4789 if (!isUndefOrEqual(Mask[e+i], i))
4794 /// isVEXTRACTIndex - Return true if the specified
4795 /// EXTRACT_SUBVECTOR operand specifies a vector extract that is
4796 /// suitable for instructions that extract 128- or 256-bit vectors.
4797 static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
4798 assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
4799 if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
4802 // The index should be aligned on a vecWidth-bit boundary.
4804 cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
4806 MVT VT = N->getSimpleValueType(0);
4807 unsigned ElSize = VT.getVectorElementType().getSizeInBits();
4808 bool Result = (Index * ElSize) % vecWidth == 0;
4813 /// isVINSERTIndex - Return true if the specified INSERT_SUBVECTOR
4814 /// operand specifies a subvector insert that is suitable for input to
4815 /// insertion of 128- or 256-bit subvectors.
4816 static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) {
4817 assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
4818 if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
4820 // The index should be aligned on a vecWidth-bit boundary.
4822 cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
4824 MVT VT = N->getSimpleValueType(0);
4825 unsigned ElSize = VT.getVectorElementType().getSizeInBits();
4826 bool Result = (Index * ElSize) % vecWidth == 0;
4831 bool X86::isVINSERT128Index(SDNode *N) {
4832 return isVINSERTIndex(N, 128);
4835 bool X86::isVINSERT256Index(SDNode *N) {
4836 return isVINSERTIndex(N, 256);
4839 bool X86::isVEXTRACT128Index(SDNode *N) {
4840 return isVEXTRACTIndex(N, 128);
4843 bool X86::isVEXTRACT256Index(SDNode *N) {
4844 return isVEXTRACTIndex(N, 256);
4847 /// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
4848 /// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions.
4849 /// Handles 128-bit and 256-bit.
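/// For example, for v4i32 the mask <3, 2, 1, 0> produces the immediate 0x1B (with
/// four elements per lane, mask element i is encoded in two bits at bit 2*i).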
4850 static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) {
4851 MVT VT = N->getSimpleValueType(0);
4853 assert((VT.getSizeInBits() >= 128) &&
4854 "Unsupported vector type for PSHUF/SHUFP");
4856 // Handle 128 and 256-bit vector lengths. AVX defines PSHUF/SHUFP to operate
4857 // independently on 128-bit lanes.
4858 unsigned NumElts = VT.getVectorNumElements();
4859 unsigned NumLanes = VT.getSizeInBits()/128;
4860 unsigned NumLaneElts = NumElts/NumLanes;
4862 assert((NumLaneElts == 2 || NumLaneElts == 4 || NumLaneElts == 8) &&
4863 "Only supports 2, 4 or 8 elements per lane");
4865 unsigned Shift = (NumLaneElts >= 4) ? 1 : 0;
4867 for (unsigned i = 0; i != NumElts; ++i) {
4868 int Elt = N->getMaskElt(i);
4869 if (Elt < 0) continue;
4870 Elt &= NumLaneElts - 1;
4871 unsigned ShAmt = (i << Shift) % 8;
4872 Mask |= Elt << ShAmt;
4878 /// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
4879 /// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction.
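/// For example, for v8i16 the mask <0, 1, 2, 3, 7, 6, 5, 4> produces 0x1B: only
/// the upper four elements of each lane are encoded, two bits apiece.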
4880 static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) {
4881 MVT VT = N->getSimpleValueType(0);
4883 assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
4884 "Unsupported vector type for PSHUFHW");
4886 unsigned NumElts = VT.getVectorNumElements();
4889 for (unsigned l = 0; l != NumElts; l += 8) {
4890 // 8 elements per lane, but we only care about the last 4.
4891 for (unsigned i = 0; i < 4; ++i) {
4892 int Elt = N->getMaskElt(l+i+4);
4893 if (Elt < 0) continue;
4894 Elt &= 0x3; // only 2-bits.
4895 Mask |= Elt << (i * 2);
4902 /// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
4903 /// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction.
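/// For example, for v8i16 the mask <3, 2, 1, 0, 4, 5, 6, 7> produces 0x1B: only
/// the lower four elements of each lane are encoded, two bits apiece.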
4904 static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) {
4905 MVT VT = N->getSimpleValueType(0);
4907 assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
4908 "Unsupported vector type for PSHUFHW");
4910 unsigned NumElts = VT.getVectorNumElements();
4913 for (unsigned l = 0; l != NumElts; l += 8) {
4914 // 8 elements per lane, but we only care about the first 4.
4915 for (unsigned i = 0; i < 4; ++i) {
4916 int Elt = N->getMaskElt(l+i);
4917 if (Elt < 0) continue;
4918 Elt &= 0x3; // only 2-bits
4919 Mask |= Elt << (i * 2);
4926 /// \brief Return the appropriate immediate to shuffle the specified
4927 /// VECTOR_SHUFFLE mask with the PALIGNR (if InterLane is false) or the
4928 /// VALIGN (if InterLane is true) instruction.
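/// For example, a v16i8 PALIGNR mask beginning <5, 6, 7, ...> yields the byte
/// immediate 5; with InterLane the immediate is counted in elements, not bytes.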
4929 static unsigned getShuffleAlignrImmediate(ShuffleVectorSDNode *SVOp,
4931 MVT VT = SVOp->getSimpleValueType(0);
4932 unsigned EltSize = InterLane ? 1 :
4933 VT.getVectorElementType().getSizeInBits() >> 3;
4935 unsigned NumElts = VT.getVectorNumElements();
4936 unsigned NumLanes = VT.is512BitVector() ? 1 : VT.getSizeInBits()/128;
4937 unsigned NumLaneElts = NumElts/NumLanes;
4941 for (i = 0; i != NumElts; ++i) {
4942 Val = SVOp->getMaskElt(i);
4946 if (Val >= (int)NumElts)
4947 Val -= NumElts - NumLaneElts;
4949 assert(Val - i > 0 && "PALIGNR imm should be positive");
4950 return (Val - i) * EltSize;
4953 /// \brief Return the appropriate immediate to shuffle the specified
4954 /// VECTOR_SHUFFLE mask with the PALIGNR instruction.
4955 static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) {
4956 return getShuffleAlignrImmediate(SVOp, false);
4959 /// \brief Return the appropriate immediate to shuffle the specified
4960 /// VECTOR_SHUFFLE mask with the VALIGN instruction.
4961 static unsigned getShuffleVALIGNImmediate(ShuffleVectorSDNode *SVOp) {
4962 return getShuffleAlignrImmediate(SVOp, true);
4966 static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
4967 assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
4968 if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
4969 llvm_unreachable("Illegal extract subvector for VEXTRACT");
4972 cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
4974 MVT VecVT = N->getOperand(0).getSimpleValueType();
4975 MVT ElVT = VecVT.getVectorElementType();
4977 unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
4978 return Index / NumElemsPerChunk;
4981 static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) {
4982 assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
4983 if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
4984 llvm_unreachable("Illegal insert subvector for VINSERT");
4987 cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
4989 MVT VecVT = N->getSimpleValueType(0);
4990 MVT ElVT = VecVT.getVectorElementType();
4992 unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
4993 return Index / NumElemsPerChunk;
4996 /// getExtractVEXTRACT128Immediate - Return the appropriate immediate
4997 /// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128
4998 /// and VEXTRACTI128 instructions.
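/// For example, extracting the upper v4i32 half of a v8i32 (element index 4)
/// yields the immediate 1, since each 128-bit chunk holds 128/32 = 4 elements.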
4999 unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) {
5000 return getExtractVEXTRACTImmediate(N, 128);
5003 /// getExtractVEXTRACT256Immediate - Return the appropriate immediate
5004 /// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF64x4
5005 /// and VEXTRACTI64x4 instructions.
5006 unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) {
5007 return getExtractVEXTRACTImmediate(N, 256);
5010 /// getInsertVINSERT128Immediate - Return the appropriate immediate
5011 /// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128
5012 /// and VINSERTI128 instructions.
5013 unsigned X86::getInsertVINSERT128Immediate(SDNode *N) {
5014 return getInsertVINSERTImmediate(N, 128);
5017 /// getInsertVINSERT256Immediate - Return the appropriate immediate
5018 /// to insert at the specified INSERT_SUBVECTOR index with VINSERTF64x4
5019 /// and VINSERTI64x4 instructions.
5020 unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
5021 return getInsertVINSERTImmediate(N, 256);
5024 /// isZero - Returns true if V is a constant integer zero.
5025 static bool isZero(SDValue V) {
5026 ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
5027 return C && C->isNullValue();
5030 /// isZeroNode - Returns true if Elt is a constant zero or a floating point
5032 bool X86::isZeroNode(SDValue Elt) {
5035 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Elt))
5036 return CFP->getValueAPF().isPosZero();
5040 /// ShouldXformToMOVHLPS - Return true if the node should be transformed to
5041 /// match movhlps. The lower half elements should come from the upper half of
5042 /// V1 (and in order), and the upper half elements should come from the upper
5043 /// half of V2 (and in order).
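/// For example, for v4f32 the mask <2, 3, 6, 7> matches.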
5044 static bool ShouldXformToMOVHLPS(ArrayRef<int> Mask, MVT VT) {
5045 if (!VT.is128BitVector())
5047 if (VT.getVectorNumElements() != 4)
5049 for (unsigned i = 0, e = 2; i != e; ++i)
5050 if (!isUndefOrEqual(Mask[i], i+2))
5052 for (unsigned i = 2; i != 4; ++i)
5053 if (!isUndefOrEqual(Mask[i], i+4))
5058 /// isScalarLoadToVector - Returns true if the node is a scalar load that
5059 /// is promoted to a vector. It also returns the LoadSDNode by reference if
5061 static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = nullptr) {
5062 if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
5064 N = N->getOperand(0).getNode();
5065 if (!ISD::isNON_EXTLoad(N))
5068 *LD = cast<LoadSDNode>(N);
5072 // Test whether the given value is a vector value which will be legalized
5074 static bool WillBeConstantPoolLoad(SDNode *N) {
5075 if (N->getOpcode() != ISD::BUILD_VECTOR)
5078 // Check for any non-constant elements.
5079 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
5080 switch (N->getOperand(i).getNode()->getOpcode()) {
5082 case ISD::ConstantFP:
5089 // Vectors of all-zeros and all-ones are materialized with special
5090 // instructions rather than being loaded.
5091 return !ISD::isBuildVectorAllZeros(N) &&
5092 !ISD::isBuildVectorAllOnes(N);
5095 /// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
5096 /// match movlp{s|d}. The lower half elements should come from the lower half of
5097 /// V1 (and in order), and the upper half elements should come from the upper
5098 /// half of V2 (and in order). And since V1 will become the source of the
5099 /// MOVLP, it must be either a vector load or a scalar load to vector.
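/// For example, for v4f32 the mask <0, 1, 6, 7> matches when V1 is a suitable
/// load and V2 is not.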
5100 static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
5101 ArrayRef<int> Mask, MVT VT) {
5102 if (!VT.is128BitVector())
5105 if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
5107 // If V2 is a vector load, don't do this transformation. We will try to use a
5108 // load-folding shufps op instead.
5109 if (ISD::isNON_EXTLoad(V2) || WillBeConstantPoolLoad(V2))
5112 unsigned NumElems = VT.getVectorNumElements();
5114 if (NumElems != 2 && NumElems != 4)
5116 for (unsigned i = 0, e = NumElems/2; i != e; ++i)
5117 if (!isUndefOrEqual(Mask[i], i))
5119 for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
5120 if (!isUndefOrEqual(Mask[i], i+NumElems))
5125 /// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
5126 /// to a zero vector.
5127 /// FIXME: move to dag combiner / method on ShuffleVectorSDNode
5128 static bool isZeroShuffle(ShuffleVectorSDNode *N) {
5129 SDValue V1 = N->getOperand(0);
5130 SDValue V2 = N->getOperand(1);
5131 unsigned NumElems = N->getValueType(0).getVectorNumElements();
5132 for (unsigned i = 0; i != NumElems; ++i) {
5133 int Idx = N->getMaskElt(i);
5134 if (Idx >= (int)NumElems) {
5135 unsigned Opc = V2.getOpcode();
5136 if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode()))
5138 if (Opc != ISD::BUILD_VECTOR ||
5139 !X86::isZeroNode(V2.getOperand(Idx-NumElems)))
5141 } else if (Idx >= 0) {
5142 unsigned Opc = V1.getOpcode();
5143 if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode()))
5145 if (Opc != ISD::BUILD_VECTOR ||
5146 !X86::isZeroNode(V1.getOperand(Idx)))
5153 /// getZeroVector - Returns a vector of specified type with all zero elements.
5155 static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
5156 SelectionDAG &DAG, SDLoc dl) {
5157 assert(VT.isVector() && "Expected a vector type");
5159 // Always build SSE zero vectors as <4 x i32> bitcasted
5160 // to their dest type. This ensures they get CSE'd.
5162 if (VT.is128BitVector()) { // SSE
5163 if (Subtarget->hasSSE2()) { // SSE2
5164 SDValue Cst = DAG.getConstant(0, MVT::i32);
5165 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
5167 SDValue Cst = DAG.getConstantFP(+0.0, MVT::f32);
5168 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
5170 } else if (VT.is256BitVector()) { // AVX
5171 if (Subtarget->hasInt256()) { // AVX2
5172 SDValue Cst = DAG.getConstant(0, MVT::i32);
5173 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
5174 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
5176 // 256-bit logic and arithmetic instructions in AVX are all
5177 // floating-point, no support for integer ops. Emit fp zeroed vectors.
5178 SDValue Cst = DAG.getConstantFP(+0.0, MVT::f32);
5179 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
5180 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops);
5182 } else if (VT.is512BitVector()) { // AVX-512
5183 SDValue Cst = DAG.getConstant(0, MVT::i32);
5184 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
5185 Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
5186 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops);
5187 } else if (VT.getScalarType() == MVT::i1) {
5188 assert(VT.getVectorNumElements() <= 16 && "Unexpected vector type");
5189 SDValue Cst = DAG.getConstant(0, MVT::i1);
5190 SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
5191 return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
5193 llvm_unreachable("Unexpected vector type");
5195 return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
5198 /// getOnesVector - Returns a vector of specified type with all bits set.
5199 /// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
5200 /// no AVX2 support, use two <4 x i32> inserted in a <8 x i32> appropriately.
5201 /// Then bitcast to their original type, ensuring they get CSE'd.
5202 static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG,
5204 assert(VT.isVector() && "Expected a vector type");
5206 SDValue Cst = DAG.getConstant(~0U, MVT::i32);
5208 if (VT.is256BitVector()) {
5209 if (HasInt256) { // AVX2
5210 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
5211 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
5213 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
5214 Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
5216 } else if (VT.is128BitVector()) {
5217 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
5219 llvm_unreachable("Unexpected vector type");
5221 return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
5224 /// NormalizeMask - V2 is a splat; modify the mask (if needed) so all elements
5225 /// that point to V2 point to its first element.
5226 static void NormalizeMask(SmallVectorImpl<int> &Mask, unsigned NumElems) {
5227 for (unsigned i = 0; i != NumElems; ++i) {
5228 if (Mask[i] > (int)NumElems) {
5234 /// getMOVL - Returns a vector_shuffle node for a movs{s|d}, movd
5235 /// operation of the specified width.
5236 static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
5238 unsigned NumElems = VT.getVectorNumElements();
5239 SmallVector<int, 8> Mask;
5240 Mask.push_back(NumElems);
5241 for (unsigned i = 1; i != NumElems; ++i)
5243 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
5246 /// getUnpackl - Returns a vector_shuffle node for an unpackl operation.
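/// For example, for v4i32 this builds the mask <0, 4, 1, 5>.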
5247 static SDValue getUnpackl(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
5249 unsigned NumElems = VT.getVectorNumElements();
5250 SmallVector<int, 8> Mask;
5251 for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
5253 Mask.push_back(i + NumElems);
5255 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
5258 /// getUnpackh - Returns a vector_shuffle node for an unpackh operation.
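/// For example, for v4i32 this builds the mask <2, 6, 3, 7>.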
5259 static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
5261 unsigned NumElems = VT.getVectorNumElements();
5262 SmallVector<int, 8> Mask;
5263 for (unsigned i = 0, Half = NumElems/2; i != Half; ++i) {
5264 Mask.push_back(i + Half);
5265 Mask.push_back(i + NumElems + Half);
5267 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
5270 // PromoteSplati8i16 - i16 and i8 vector types can't be used directly by
5271 // a generic shuffle instruction because the target has no such instructions.
5272 // Generate shuffles which repeat i16 and i8 several times until they can be
5273 // represented by v4f32 and then be manipulated by target supported shuffles.
5274 static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) {
5275 MVT VT = V.getSimpleValueType();
5276 int NumElems = VT.getVectorNumElements();
5279 while (NumElems > 4) {
5280 if (EltNo < NumElems/2) {
5281 V = getUnpackl(DAG, dl, VT, V, V);
5283 V = getUnpackh(DAG, dl, VT, V, V);
5284 EltNo -= NumElems/2;
5291 /// getLegalSplat - Generate a legal splat with supported x86 shuffles
5292 static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
5293 MVT VT = V.getSimpleValueType();
5296 if (VT.is128BitVector()) {
5297 V = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V);
5298 int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
5299 V = DAG.getVectorShuffle(MVT::v4f32, dl, V, DAG.getUNDEF(MVT::v4f32),
5301 } else if (VT.is256BitVector()) {
5302 // To use VPERMILPS to splat scalars, the second half of indices must
5303 // refer to the higher part, which is a duplication of the lower one,
5304 // because VPERMILPS can only handle in-lane permutations.
5305 int SplatMask[8] = { EltNo, EltNo, EltNo, EltNo,
5306 EltNo+4, EltNo+4, EltNo+4, EltNo+4 };
5308 V = DAG.getNode(ISD::BITCAST, dl, MVT::v8f32, V);
5309 V = DAG.getVectorShuffle(MVT::v8f32, dl, V, DAG.getUNDEF(MVT::v8f32),
5312 llvm_unreachable("Vector size not supported");
5314 return DAG.getNode(ISD::BITCAST, dl, VT, V);
5317 /// PromoteSplat - Splat is promoted to target supported vector shuffles.
5318 static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
5319 MVT SrcVT = SV->getSimpleValueType(0);
5320 SDValue V1 = SV->getOperand(0);
5323 int EltNo = SV->getSplatIndex();
5324 int NumElems = SrcVT.getVectorNumElements();
5325 bool Is256BitVec = SrcVT.is256BitVector();
5327 assert(((SrcVT.is128BitVector() && NumElems > 4) || Is256BitVec) &&
5328 "Unknown how to promote splat for type");
5330 // Extract the 128-bit part containing the splat element and update
5331 // the splat element index when it refers to the higher register.
5333 V1 = Extract128BitVector(V1, EltNo, DAG, dl);
5334 if (EltNo >= NumElems/2)
5335 EltNo -= NumElems/2;
5338 // i16 and i8 vector types can't be used directly by a generic shuffle
5339 // instruction because the target has no such instruction. Generate shuffles
5340 // which repeat i16 and i8 several times until they fit in i32, and then can
5341 // be manipulated by target supported shuffles.
5342 MVT EltVT = SrcVT.getVectorElementType();
5343 if (EltVT == MVT::i8 || EltVT == MVT::i16)
5344 V1 = PromoteSplati8i16(V1, DAG, EltNo);
5346 // Recreate the 256-bit vector and place the same 128-bit vector
5347 // into the low and high part. This is necessary because we want
5348 // to use VPERM* to shuffle the vectors
5350 V1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT, V1, V1);
5353 return getLegalSplat(DAG, V1, EltNo);
5356 /// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
5357 /// vector and a zero or undef vector. This produces a shuffle where the low
5358 /// element of V2 is swizzled into the zero/undef vector, landing at element
5359 /// Idx. This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
5360 static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
5362 const X86Subtarget *Subtarget,
5363 SelectionDAG &DAG) {
5364 MVT VT = V2.getSimpleValueType();
5366 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
5367 unsigned NumElems = VT.getVectorNumElements();
5368 SmallVector<int, 16> MaskVec;
5369 for (unsigned i = 0; i != NumElems; ++i)
5370 // If this is the insertion idx, put the low elt of V2 here.
5371 MaskVec.push_back(i == Idx ? NumElems : i);
5372 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, &MaskVec[0]);
5375 /// getTargetShuffleMask - Calculates the shuffle mask corresponding to the
5376 /// target specific opcode. Returns true if the Mask could be calculated. Sets
5377 /// IsUnary to true if it only uses one source. Note that this will set IsUnary
5378 /// for shuffles which use a single input multiple times, and in those cases it
5379 /// will adjust the mask to only have indices within that single input.
5380 static bool getTargetShuffleMask(SDNode *N, MVT VT,
5381 SmallVectorImpl<int> &Mask, bool &IsUnary) {
5382 unsigned NumElems = VT.getVectorNumElements();
5386 bool IsFakeUnary = false;
5387 switch(N->getOpcode()) {
5388 case X86ISD::BLENDI:
5389 ImmN = N->getOperand(N->getNumOperands()-1);
5390 DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5393 ImmN = N->getOperand(N->getNumOperands()-1);
5394 DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5395 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5397 case X86ISD::UNPCKH:
5398 DecodeUNPCKHMask(VT, Mask);
5399 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5401 case X86ISD::UNPCKL:
5402 DecodeUNPCKLMask(VT, Mask);
5403 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5405 case X86ISD::MOVHLPS:
5406 DecodeMOVHLPSMask(NumElems, Mask);
5407 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5409 case X86ISD::MOVLHPS:
5410 DecodeMOVLHPSMask(NumElems, Mask);
5411 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5413 case X86ISD::PALIGNR:
5414 ImmN = N->getOperand(N->getNumOperands()-1);
5415 DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5417 case X86ISD::PSHUFD:
5418 case X86ISD::VPERMILPI:
5419 ImmN = N->getOperand(N->getNumOperands()-1);
5420 DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5423 case X86ISD::PSHUFHW:
5424 ImmN = N->getOperand(N->getNumOperands()-1);
5425 DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5428 case X86ISD::PSHUFLW:
5429 ImmN = N->getOperand(N->getNumOperands()-1);
5430 DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5433 case X86ISD::PSHUFB: {
5435 SDValue MaskNode = N->getOperand(1);
5436 while (MaskNode->getOpcode() == ISD::BITCAST)
5437 MaskNode = MaskNode->getOperand(0);
5439 if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) {
5440 // If we have a build-vector, then things are easy.
5441 EVT VT = MaskNode.getValueType();
5442 assert(VT.isVector() &&
5443 "Can't produce a non-vector with a build_vector!");
5444 if (!VT.isInteger())
5447 int NumBytesPerElement = VT.getVectorElementType().getSizeInBits() / 8;
5449 SmallVector<uint64_t, 32> RawMask;
5450 for (int i = 0, e = MaskNode->getNumOperands(); i < e; ++i) {
5451 SDValue Op = MaskNode->getOperand(i);
5452 if (Op->getOpcode() == ISD::UNDEF) {
5453 RawMask.push_back((uint64_t)SM_SentinelUndef);
5456 auto *CN = dyn_cast<ConstantSDNode>(Op.getNode());
5459 APInt MaskElement = CN->getAPIntValue();
5461 // We now have to decode the element which could be any integer size and
5462 // extract each byte of it.
5463 for (int j = 0; j < NumBytesPerElement; ++j) {
5464 // Note that this is x86 and so always little endian: the low byte is
5465 // the first byte of the mask.
5466 RawMask.push_back(MaskElement.getLoBits(8).getZExtValue());
5467 MaskElement = MaskElement.lshr(8);
5470 DecodePSHUFBMask(RawMask, Mask);
5474 auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode);
5478 SDValue Ptr = MaskLoad->getBasePtr();
5479 if (Ptr->getOpcode() == X86ISD::Wrapper)
5480 Ptr = Ptr->getOperand(0);
5482 auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr);
5483 if (!MaskCP || MaskCP->isMachineConstantPoolEntry())
5486 if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) {
5487 DecodePSHUFBMask(C, Mask);
5493 case X86ISD::VPERMI:
5494 ImmN = N->getOperand(N->getNumOperands()-1);
5495 DecodeVPERMMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5499 case X86ISD::MOVSD: {
5500 // The index 0 always comes from the first element of the second source;
5501 // this is why MOVSS and MOVSD are used in the first place. The other
5502 // elements come from the other positions of the first source vector.
5503 Mask.push_back(NumElems);
5504 for (unsigned i = 1; i != NumElems; ++i) {
5509 case X86ISD::VPERM2X128:
5510 ImmN = N->getOperand(N->getNumOperands()-1);
5511 DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5512 if (Mask.empty()) return false;
5514 case X86ISD::MOVSLDUP:
5515 DecodeMOVSLDUPMask(VT, Mask);
5518 case X86ISD::MOVSHDUP:
5519 DecodeMOVSHDUPMask(VT, Mask);
5522 case X86ISD::MOVDDUP:
5523 DecodeMOVDDUPMask(VT, Mask);
5526 case X86ISD::MOVLHPD:
5527 case X86ISD::MOVLPD:
5528 case X86ISD::MOVLPS:
5529 // Not yet implemented
5531 default: llvm_unreachable("unknown target shuffle node");
5534 // If we have a fake unary shuffle, the shuffle mask is spread across two
5535 // inputs that are actually the same node. Re-map the mask to always point
5536 // into the first input.
5539 if (M >= (int)Mask.size())
5545 /// getShuffleScalarElt - Returns the scalar element that will make up the ith
5546 /// element of the result of the vector shuffle.
5547 static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
5550 return SDValue(); // Limit search depth.
5552 SDValue V = SDValue(N, 0);
5553 EVT VT = V.getValueType();
5554 unsigned Opcode = V.getOpcode();
5556 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
5557 if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
5558 int Elt = SV->getMaskElt(Index);
5561 return DAG.getUNDEF(VT.getVectorElementType());
5563 unsigned NumElems = VT.getVectorNumElements();
5564 SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
5565 : SV->getOperand(1);
5566 return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
5569 // Recurse into target specific vector shuffles to find scalars.
5570 if (isTargetShuffle(Opcode)) {
5571 MVT ShufVT = V.getSimpleValueType();
5572 unsigned NumElems = ShufVT.getVectorNumElements();
5573 SmallVector<int, 16> ShuffleMask;
5576 if (!getTargetShuffleMask(N, ShufVT, ShuffleMask, IsUnary))
5579 int Elt = ShuffleMask[Index];
5581 return DAG.getUNDEF(ShufVT.getVectorElementType());
5583 SDValue NewV = (Elt < (int)NumElems) ? N->getOperand(0)
5585 return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
5589 // Actual nodes that may contain scalar elements
5590 if (Opcode == ISD::BITCAST) {
5591 V = V.getOperand(0);
5592 EVT SrcVT = V.getValueType();
5593 unsigned NumElems = VT.getVectorNumElements();
5595 if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
5599 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
5600 return (Index == 0) ? V.getOperand(0)
5601 : DAG.getUNDEF(VT.getVectorElementType());
5603 if (V.getOpcode() == ISD::BUILD_VECTOR)
5604 return V.getOperand(Index);
5609 /// getNumOfConsecutiveZeros - Return the number of consecutive elements of a
5610 /// vector shuffle operation that are zero. The
5611 /// search can start in two different directions, from left or right.
5612 /// We count undefs as zeros until PreferredNum is reached.
5613 static unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp,
5614 unsigned NumElems, bool ZerosFromLeft,
5616 unsigned PreferredNum = -1U) {
5617 unsigned NumZeros = 0;
5618 for (unsigned i = 0; i != NumElems; ++i) {
5619 unsigned Index = ZerosFromLeft ? i : NumElems - i - 1;
5620 SDValue Elt = getShuffleScalarElt(SVOp, Index, DAG, 0);
5624 if (X86::isZeroNode(Elt))
5626 else if (Elt.getOpcode() == ISD::UNDEF) // Undef as zero up to PreferredNum.
5627 NumZeros = std::min(NumZeros + 1, PreferredNum);
5635 /// isShuffleMaskConsecutive - Check if the shuffle mask indices [MaskI, MaskE)
5636 /// correspond consecutively to elements from one of the vector operands,
5637 /// starting from its index OpIdx. Also sets OpNum to the source vector operand.
5639 bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp,
5640 unsigned MaskI, unsigned MaskE, unsigned OpIdx,
5641 unsigned NumElems, unsigned &OpNum) {
5642 bool SeenV1 = false;
5643 bool SeenV2 = false;
5645 for (unsigned i = MaskI; i != MaskE; ++i, ++OpIdx) {
5646 int Idx = SVOp->getMaskElt(i);
5647 // Ignore undef indices.
5651 if (Idx < (int)NumElems)
5656 // Only accept consecutive elements from the same vector
5657 if ((Idx % NumElems != OpIdx) || (SeenV1 && SeenV2))
5661 OpNum = SeenV1 ? 0 : 1;
5665 /// isVectorShiftRight - Returns true if the shuffle can be implemented as a
5666 /// logical right shift of a vector.
5667 static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
5668 bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
5670 SVOp->getSimpleValueType(0).getVectorNumElements();
5671 unsigned NumZeros = getNumOfConsecutiveZeros(
5672 SVOp, NumElems, false /* check zeros from right */, DAG,
5673 SVOp->getMaskElt(0));
5679 // Considering the elements in the mask that are not consecutive zeros,
5680 // check if they consecutively come from only one of the source vectors.
5682 // V1 = {X, A, B, C} 0
5684 // vector_shuffle V1, V2 <1, 2, 3, X>
5686 if (!isShuffleMaskConsecutive(SVOp,
5687 0, // Mask Start Index
5688 NumElems-NumZeros, // Mask End Index(exclusive)
5689 NumZeros, // Where to start looking in the src vector
5690 NumElems, // Number of elements in vector
5691 OpSrc)) // Which source operand ?
5696 ShVal = SVOp->getOperand(OpSrc);
5700 /// isVectorShiftLeft - Returns true if the shuffle can be implemented as a
5701 /// logical left shift of a vector.
5702 static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
5703 bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
5705 SVOp->getSimpleValueType(0).getVectorNumElements();
5706 unsigned NumZeros = getNumOfConsecutiveZeros(
5707 SVOp, NumElems, true /* check zeros from left */, DAG,
5708 NumElems - SVOp->getMaskElt(NumElems - 1) - 1);
5714 // Considering the elements in the mask that are not consecutive zeros,
5715 // check if they consecutively come from only one of the source vectors.
5717 // 0 { A, B, X, X } = V2
5719 // vector_shuffle V1, V2 <X, X, 4, 5>
5721 if (!isShuffleMaskConsecutive(SVOp,
5722 NumZeros, // Mask Start Index
5723 NumElems, // Mask End Index(exclusive)
5724 0, // Where to start looking in the src vector
5725 NumElems, // Number of elements in vector
5726 OpSrc)) // Which source operand ?
5731 ShVal = SVOp->getOperand(OpSrc);
5735 /// isVectorShift - Returns true if the shuffle can be implemented as a
5736 /// logical left or right shift of a vector.
5737 static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
5738 bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
5739 // Although the logic below supports any bitwidth, there are no
5740 // shift instructions which handle more than 128-bit vectors.
5741 if (!SVOp->getSimpleValueType(0).is128BitVector())
5744 if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) ||
5745 isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt))
5751 /// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
5753 static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
5754 unsigned NumNonZero, unsigned NumZero,
5756 const X86Subtarget* Subtarget,
5757 const TargetLowering &TLI) {
5764 for (unsigned i = 0; i < 16; ++i) {
5765 bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
5766 if (ThisIsNonZero && First) {
5768 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
5770 V = DAG.getUNDEF(MVT::v8i16);
5775 SDValue ThisElt, LastElt;
5776 bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
5777 if (LastIsNonZero) {
5778 LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
5779 MVT::i16, Op.getOperand(i-1));
5781 if (ThisIsNonZero) {
5782 ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
5783 ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
5784 ThisElt, DAG.getConstant(8, MVT::i8));
5786 ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
5790 if (ThisElt.getNode())
5791 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
5792 DAG.getIntPtrConstant(i/2));
5796 return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V);
5799 /// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16.
5801 static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
5802 unsigned NumNonZero, unsigned NumZero,
5804 const X86Subtarget* Subtarget,
5805 const TargetLowering &TLI) {
5812 for (unsigned i = 0; i < 8; ++i) {
5813 bool isNonZero = (NonZeros & (1 << i)) != 0;
5817 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
5819 V = DAG.getUNDEF(MVT::v8i16);
5822 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
5823 MVT::v8i16, V, Op.getOperand(i),
5824 DAG.getIntPtrConstant(i));
5831 /// LowerBuildVectorv4x32 - Custom lower build_vector of v4i32 or v4f32.
5832 static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
5833 const X86Subtarget *Subtarget,
5834 const TargetLowering &TLI) {
5835 // Find all zeroable elements.
5837 for (int i=0; i < 4; ++i) {
5838 SDValue Elt = Op->getOperand(i);
5839 Zeroable[i] = (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt));
5841 assert(std::count_if(&Zeroable[0], &Zeroable[4],
5842 [](bool M) { return !M; }) > 1 &&
5843 "We expect at least two non-zero elements!");
5845 // We only know how to deal with build_vector nodes where elements are either
5846 // zeroable or extract_vector_elt with constant index.
5847 SDValue FirstNonZero;
5848 unsigned FirstNonZeroIdx;
5849 for (unsigned i=0; i < 4; ++i) {
5852 SDValue Elt = Op->getOperand(i);
5853 if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
5854 !isa<ConstantSDNode>(Elt.getOperand(1)))
5856 // Make sure that this node is extracting from a 128-bit vector.
5857 MVT VT = Elt.getOperand(0).getSimpleValueType();
5858 if (!VT.is128BitVector())
5860 if (!FirstNonZero.getNode()) {
5862 FirstNonZeroIdx = i;
5866 assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
5867 SDValue V1 = FirstNonZero.getOperand(0);
5868 MVT VT = V1.getSimpleValueType();
5870 // See if this build_vector can be lowered as a blend with zero.
5872 unsigned EltMaskIdx, EltIdx;
5874 for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
5875 if (Zeroable[EltIdx]) {
5876 // The zero vector will be on the right hand side.
5877 Mask[EltIdx] = EltIdx+4;
5881 Elt = Op->getOperand(EltIdx);
5882 // By construction, Elt is an EXTRACT_VECTOR_ELT with constant index.
5883 EltMaskIdx = cast<ConstantSDNode>(Elt.getOperand(1))->getZExtValue();
5884 if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
5886 Mask[EltIdx] = EltIdx;
5890 // Let the shuffle legalizer deal with blend operations.
5891 SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
5892 if (V1.getSimpleValueType() != VT)
5893 V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), VT, V1);
5894 return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, &Mask[0]);
5897 // See if we can lower this build_vector to an INSERTPS.
5898 if (!Subtarget->hasSSE41())
5901 SDValue V2 = Elt.getOperand(0);
5902 if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
5905 bool CanFold = true;
5906 for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
5910 SDValue Current = Op->getOperand(i);
5911 SDValue SrcVector = Current->getOperand(0);
5914 CanFold = SrcVector == V1 &&
5915 cast<ConstantSDNode>(Current.getOperand(1))->getZExtValue() == i;
5921 assert(V1.getNode() && "Expected at least two non-zero elements!");
5922 if (V1.getSimpleValueType() != MVT::v4f32)
5923 V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), MVT::v4f32, V1);
5924 if (V2.getSimpleValueType() != MVT::v4f32)
5925 V2 = DAG.getNode(ISD::BITCAST, SDLoc(V2), MVT::v4f32, V2);
5927 // Ok, we can emit an INSERTPS instruction.
5929 for (int i = 0; i < 4; ++i)
5933 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
5934 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
5935 SDValue Result = DAG.getNode(X86ISD::INSERTPS, SDLoc(Op), MVT::v4f32, V1, V2,
5936 DAG.getIntPtrConstant(InsertPSMask));
5937 return DAG.getNode(ISD::BITCAST, SDLoc(Op), VT, Result);
5940 /// Return a vector logical shift node.
5941 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
5942 unsigned NumBits, SelectionDAG &DAG,
5943 const TargetLowering &TLI, SDLoc dl) {
5944 assert(VT.is128BitVector() && "Unknown type for VShift");
5945 MVT ShVT = MVT::v2i64;
5946 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
5947 SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp);
5948 MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(SrcOp.getValueType());
5949 SDValue ShiftVal = DAG.getConstant(NumBits, ScalarShiftTy);
5950 return DAG.getNode(ISD::BITCAST, dl, VT,
5951 DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
5955 LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) {
5957 // Check if the scalar load can be widened into a vector load. And if
5958 // the address is "base + cst" see if the cst can be "absorbed" into
5959 // the shuffle mask.
5960 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
5961 SDValue Ptr = LD->getBasePtr();
5962 if (!ISD::isNormalLoad(LD) || LD->isVolatile())
5964 EVT PVT = LD->getValueType(0);
5965 if (PVT != MVT::i32 && PVT != MVT::f32)
5970 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
5971 FI = FINode->getIndex();
5973 } else if (DAG.isBaseWithConstantOffset(Ptr) &&
5974 isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
5975 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
5976 Offset = Ptr.getConstantOperandVal(1);
5977 Ptr = Ptr.getOperand(0);
5982 // FIXME: 256-bit vector instructions don't require a strict alignment,
5983 // improve this code to support it better.
5984 unsigned RequiredAlign = VT.getSizeInBits()/8;
5985 SDValue Chain = LD->getChain();
5986 // Make sure the stack object alignment is at least 16 or 32.
5987 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
5988 if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
5989 if (MFI->isFixedObjectIndex(FI)) {
5990 // Can't change the alignment. FIXME: It's possible to compute
5991 // the exact stack offset and reference FI + adjust offset instead.
5992 // If someone *really* cares about this, that's the way to implement it.
5995 MFI->setObjectAlignment(FI, RequiredAlign);
5999 // (Offset % 16 or 32) must be a multiple of 4. The address is then
6000 // Ptr + (Offset & ~15) (or & ~31 for 32-byte alignment).
6003 if ((Offset % RequiredAlign) & 3)
6005 int64_t StartOffset = Offset & ~(RequiredAlign-1);
6007 Ptr = DAG.getNode(ISD::ADD, SDLoc(Ptr), Ptr.getValueType(),
6008 Ptr,DAG.getConstant(StartOffset, Ptr.getValueType()));
6010 int EltNo = (Offset - StartOffset) >> 2;
6011 unsigned NumElems = VT.getVectorNumElements();
6013 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
6014 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
6015 LD->getPointerInfo().getWithOffset(StartOffset),
6016 false, false, false, 0);
6018 SmallVector<int, 8> Mask;
6019 for (unsigned i = 0; i != NumElems; ++i)
6020 Mask.push_back(EltNo);
6022 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]);
6028 /// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a
6029 /// vector of type 'VT', see if the elements can be replaced by a single large
6030 /// load which has the same value as a build_vector whose operands are 'elts'.
6032 /// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a
6034 /// FIXME: we'd also like to handle the case where the last elements are zero
6035 /// rather than undef via VZEXT_LOAD, but we do not detect that case today.
6036 /// There's even a handy isZeroNode for that purpose.
6037 static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
6038 SDLoc &DL, SelectionDAG &DAG,
6039 bool isAfterLegalize) {
6040 EVT EltVT = VT.getVectorElementType();
6041 unsigned NumElems = Elts.size();
6043 LoadSDNode *LDBase = nullptr;
6044 unsigned LastLoadedElt = -1U;
6046 // For each element in the initializer, see if we've found a load or an undef.
6047 // If we don't find an initial load element, or later load elements are
6048 // non-consecutive, bail out.
6049 for (unsigned i = 0; i < NumElems; ++i) {
6050 SDValue Elt = Elts[i];
6052 if (!Elt.getNode() ||
6053 (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
6056 if (Elt.getNode()->getOpcode() == ISD::UNDEF)
6058 LDBase = cast<LoadSDNode>(Elt.getNode());
6062 if (Elt.getOpcode() == ISD::UNDEF)
6065 LoadSDNode *LD = cast<LoadSDNode>(Elt);
6066 if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i))
6071 // If we have found an entire vector of loads and undefs, then return a large
6072 // load of the entire vector width starting at the base pointer. If we found
6073 // consecutive loads for the low half, generate a vzext_load node.
6074 if (LastLoadedElt == NumElems - 1) {
6076 if (isAfterLegalize &&
6077 !DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, VT))
6080 SDValue NewLd = SDValue();
6082 NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
6083 LDBase->getPointerInfo(), LDBase->isVolatile(),
6084 LDBase->isNonTemporal(), LDBase->isInvariant(),
6085 LDBase->getAlignment());
6087 if (LDBase->hasAnyUseOfValue(1)) {
6088 SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
6090 SDValue(NewLd.getNode(), 1));
6091 DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
6092 DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
6093 SDValue(NewLd.getNode(), 1));
6099 // TODO: The code below fires only for loading the low v2i32 / v2f32
6100 // of a v4i32 / v4f32. It's probably worth generalizing.
6101 if (NumElems == 4 && LastLoadedElt == 1 && (EltVT.getSizeInBits() == 32) &&
6102 DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
6103 SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
6104 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
6106 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, MVT::i64,
6107 LDBase->getPointerInfo(),
6108 LDBase->getAlignment(),
6109 false/*isVolatile*/, true/*ReadMem*/,
6112 // Make sure the newly-created LOAD is in the same position as LDBase in
6113 // terms of dependency. We create a TokenFactor for LDBase and ResNode, and
6114 // update uses of LDBase's output chain to use the TokenFactor.
6115 if (LDBase->hasAnyUseOfValue(1)) {
6116 SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
6117 SDValue(LDBase, 1), SDValue(ResNode.getNode(), 1));
6118 DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
6119 DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
6120 SDValue(ResNode.getNode(), 1));
6123 return DAG.getNode(ISD::BITCAST, DL, VT, ResNode);
6128 /// LowerVectorBroadcast - Attempt to use the vbroadcast instruction
6129 /// to generate a splat value for the following cases:
6130 /// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant.
6131 /// 2. A splat shuffle which uses a scalar_to_vector node which comes from
6132 /// a scalar load, or a constant.
6133 /// The VBROADCAST node is returned when a pattern is found,
6134 /// or SDValue() otherwise.
6135 static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
6136 SelectionDAG &DAG) {
6137 // VBROADCAST requires AVX.
6138 // TODO: Splats could be generated for non-AVX CPUs using SSE
6139 // instructions, but there's less potential gain for only 128-bit vectors.
6140 if (!Subtarget->hasAVX())
6143 MVT VT = Op.getSimpleValueType();
6146 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
6147 "Unsupported vector type for broadcast.");
6152 switch (Op.getOpcode()) {
6154 // Unknown pattern found.
6157 case ISD::BUILD_VECTOR: {
6158 auto *BVOp = cast<BuildVectorSDNode>(Op.getNode());
6159 BitVector UndefElements;
6160 SDValue Splat = BVOp->getSplatValue(&UndefElements);
6162 // We need a splat of a single value to use broadcast, and it doesn't
6163 // make any sense if the value is only in one element of the vector.
6164 if (!Splat || (VT.getVectorNumElements() - UndefElements.count()) <= 1)
6168 ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
6169 Ld.getOpcode() == ISD::ConstantFP);
6171 // Make sure that all of the users of a non-constant load are from the
6172 // BUILD_VECTOR node.
6173 if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
6178 case ISD::VECTOR_SHUFFLE: {
6179 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
6181 // Shuffles must have a splat mask where the first element is
6183 if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0)
6186 SDValue Sc = Op.getOperand(0);
6187 if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR &&
6188 Sc.getOpcode() != ISD::BUILD_VECTOR) {
6190 if (!Subtarget->hasInt256())
6193 // Use the register form of the broadcast instruction available on AVX2.
6194 if (VT.getSizeInBits() >= 256)
6195 Sc = Extract128BitVector(Sc, 0, DAG, dl);
6196 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc);
6199 Ld = Sc.getOperand(0);
6200 ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
6201 Ld.getOpcode() == ISD::ConstantFP);
6203 // The scalar_to_vector node and the suspected
6204 // load node must have exactly one user.
6205 // Constants may have multiple users.
6207 // AVX-512 has a register version of the broadcast.
6208 bool hasRegVer = Subtarget->hasAVX512() && VT.is512BitVector() &&
6209 Ld.getValueType().getSizeInBits() >= 32;
6210 if (!ConstSplatVal && ((!Sc.hasOneUse() || !Ld.hasOneUse()) &&
6217 unsigned ScalarSize = Ld.getValueType().getSizeInBits();
6218 bool IsGE256 = (VT.getSizeInBits() >= 256);
6220 // When optimizing for size, generate up to 5 extra bytes for a broadcast
6221 // instruction to save 8 or more bytes of constant pool data.
6222 // TODO: If multiple splats are generated to load the same constant,
6223 // it may be detrimental to overall size. There needs to be a way to detect
6224 // that condition to know if this is truly a size win.
6225 const Function *F = DAG.getMachineFunction().getFunction();
6226 bool OptForSize = F->getAttributes().
6227 hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
6229 // Handle broadcasting a single constant scalar from the constant pool
6231 // On Sandybridge (no AVX2), it is still better to load a constant vector
6232 // from the constant pool and not to broadcast it from a scalar.
6233 // But override that restriction when optimizing for size.
6234 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
6235 if (ConstSplatVal && (Subtarget->hasAVX2() || OptForSize)) {
6236 EVT CVT = Ld.getValueType();
6237 assert(!CVT.isVector() && "Must not broadcast a vector type");
6239 // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
6240 // For size optimization, also splat v2f64 and v2i64, and for size opt
6241 // with AVX2, also splat i8 and i16.
6242 // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
6243 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
6244 (OptForSize && (ScalarSize == 64 || Subtarget->hasAVX2()))) {
6245 const Constant *C = nullptr;
6246 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
6247 C = CI->getConstantIntValue();
6248 else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
6249 C = CF->getConstantFPValue();
6251 assert(C && "Invalid constant type");
6253 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6254 SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
6255 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
6256 Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP,
6257 MachinePointerInfo::getConstantPool(),
6258 false, false, false, Alignment);
6260 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6264 bool IsLoad = ISD::isNormalLoad(Ld.getNode());
6266 // Handle AVX2 in-register broadcasts.
6267 if (!IsLoad && Subtarget->hasInt256() &&
6268 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
6269 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6271 // The scalar source must be a normal load.
6275 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
6276 (Subtarget->hasVLX() && ScalarSize == 64))
6277 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6279 // The integer check is needed for the 64-bit into 128-bit case, so it doesn't
6280 // match double, since there is no vbroadcastsd xmm.
6281 if (Subtarget->hasInt256() && Ld.getValueType().isInteger()) {
6282 if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
6283 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6286 // Unsupported broadcast.
6290 /// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real
6291 /// underlying vector and index.
6293 /// Modifies \p ExtractedFromVec to the real vector and returns the real
6295 static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
6297 int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
6298 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
6301 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
6303 // (extract_vector_elt (v8f32 %vreg1), Constant<6>)
6305 // (extract_vector_elt (vector_shuffle<2,u,u,u>
6306 // (extract_subvector (v8f32 %vreg0), Constant<4>),
6309 // In this case the vector is the extract_subvector expression and the index
6310 // is 2, as specified by the shuffle.
6311 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
6312 SDValue ShuffleVec = SVOp->getOperand(0);
6313 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
6314 assert(ShuffleVecVT.getVectorElementType() ==
6315 ExtractedFromVec.getSimpleValueType().getVectorElementType());
6317 int ShuffleIdx = SVOp->getMaskElt(Idx);
6318 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
6319 ExtractedFromVec = ShuffleVec;
6325 static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
6326 MVT VT = Op.getSimpleValueType();
6328 // Skip if insert_vec_elt is not supported.
6329 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6330 if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
6334 unsigned NumElems = Op.getNumOperands();
6338 SmallVector<unsigned, 4> InsertIndices;
6339 SmallVector<int, 8> Mask(NumElems, -1);
6341 for (unsigned i = 0; i != NumElems; ++i) {
6342 unsigned Opc = Op.getOperand(i).getOpcode();
6344 if (Opc == ISD::UNDEF)
6347 if (Opc != ISD::EXTRACT_VECTOR_ELT) {
6348 // Quit if more than 1 elements need inserting.
6349 if (InsertIndices.size() > 1)
6352 InsertIndices.push_back(i);
6356 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
6357 SDValue ExtIdx = Op.getOperand(i).getOperand(1);
6358 // Quit if non-constant index.
6359 if (!isa<ConstantSDNode>(ExtIdx))
6361 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
6363 // Quit if extracted from vector of different type.
6364 if (ExtractedFromVec.getValueType() != VT)
6367 if (!VecIn1.getNode())
6368 VecIn1 = ExtractedFromVec;
6369 else if (VecIn1 != ExtractedFromVec) {
6370 if (!VecIn2.getNode())
6371 VecIn2 = ExtractedFromVec;
6372 else if (VecIn2 != ExtractedFromVec)
6373 // Quit if more than 2 vectors to shuffle
6377 if (ExtractedFromVec == VecIn1)
6379 else if (ExtractedFromVec == VecIn2)
6380 Mask[i] = Idx + NumElems;
6383 if (!VecIn1.getNode())
6386 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
6387 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, &Mask[0]);
6388 for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) {
6389 unsigned Idx = InsertIndices[i];
6390 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
6391 DAG.getIntPtrConstant(Idx));
6397 // Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
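// For example (an illustrative sketch of the all-constant path below), the
// v8i1 constant vector <1,0,1,1,0,0,0,0> is collected into the immediate
// 0b00001101, materialized as an i16 constant, bitcast to v16i1, and the low
// v8i1 subvector is extracted as the result.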
6399 X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
6401 MVT VT = Op.getSimpleValueType();
6402 assert((VT.getVectorElementType() == MVT::i1) && (VT.getSizeInBits() <= 16) &&
6403 "Unexpected type in LowerBUILD_VECTORvXi1!");
6406 if (ISD::isBuildVectorAllZeros(Op.getNode())) {
6407 SDValue Cst = DAG.getTargetConstant(0, MVT::i1);
6408 SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
6409 return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
6412 if (ISD::isBuildVectorAllOnes(Op.getNode())) {
6413 SDValue Cst = DAG.getTargetConstant(1, MVT::i1);
6414 SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
6415 return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
  bool AllConstants = true;
6419 uint64_t Immediate = 0;
6420 int NonConstIdx = -1;
6421 bool IsSplat = true;
6422 unsigned NumNonConsts = 0;
6423 unsigned NumConsts = 0;
6424 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
6425 SDValue In = Op.getOperand(idx);
6426 if (In.getOpcode() == ISD::UNDEF)
6428 if (!isa<ConstantSDNode>(In)) {
      AllConstants = false;
6434 if (cast<ConstantSDNode>(In)->getZExtValue())
6435 Immediate |= (1ULL << idx);
6437 if (In != Op.getOperand(0))
6442 SDValue FullMask = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1,
6443 DAG.getConstant(Immediate, MVT::i16));
6444 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, FullMask,
6445 DAG.getIntPtrConstant(0));
6448 if (NumNonConsts == 1 && NonConstIdx != 0) {
6451 SDValue VecAsImm = DAG.getConstant(Immediate,
6452 MVT::getIntegerVT(VT.getSizeInBits()));
6453 DstVec = DAG.getNode(ISD::BITCAST, dl, VT, VecAsImm);
6456 DstVec = DAG.getUNDEF(VT);
6457 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
6458 Op.getOperand(NonConstIdx),
6459 DAG.getIntPtrConstant(NonConstIdx));
6461 if (!IsSplat && (NonConstIdx != 0))
6462 llvm_unreachable("Unsupported BUILD_VECTOR operation");
6463 MVT SelectVT = (VT == MVT::v16i1)? MVT::i16 : MVT::i8;
6466 Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0),
6467 DAG.getConstant(-1, SelectVT),
6468 DAG.getConstant(0, SelectVT));
6470 Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0),
6471 DAG.getConstant((Immediate | 1), SelectVT),
6472 DAG.getConstant(Immediate, SelectVT));
6473 return DAG.getNode(ISD::BITCAST, dl, VT, Select);
/// \brief Return true if \p N implements a horizontal binop, and return the
/// operands of the horizontal binop in V0 and V1.
///
/// This is a helper function of PerformBUILD_VECTORCombine.
/// This function checks that the input build_vector \p N implements a
/// horizontal operation. Parameter \p Opcode defines the kind of horizontal
/// operation to match.
/// For example, if \p Opcode is equal to ISD::ADD, then this function
/// checks whether \p N implements a horizontal arithmetic add; if instead
/// \p Opcode is equal to ISD::SUB, then this function checks whether \p N
/// implements a horizontal arithmetic sub.
///
6488 /// This function only analyzes elements of \p N whose indices are
6489 /// in range [BaseIdx, LastIdx).
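///
/// For example, with BaseIdx == 0 and LastIdx == 4, a v4f32 build_vector of
/// the form
///   (build_vector (fadd (extractelt A, 0), (extractelt A, 1)),
///                 (fadd (extractelt A, 2), (extractelt A, 3)),
///                 (fadd (extractelt B, 0), (extractelt B, 1)),
///                 (fadd (extractelt B, 2), (extractelt B, 3)))
/// is recognized as a horizontal FADD, with V0 == A and V1 == B.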
6490 static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
6492 unsigned BaseIdx, unsigned LastIdx,
6493 SDValue &V0, SDValue &V1) {
6494 EVT VT = N->getValueType(0);
6496 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
6497 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
6498 "Invalid Vector in input!");
6500 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
6501 bool CanFold = true;
6502 unsigned ExpectedVExtractIdx = BaseIdx;
6503 unsigned NumElts = LastIdx - BaseIdx;
6504 V0 = DAG.getUNDEF(VT);
6505 V1 = DAG.getUNDEF(VT);
6507 // Check if N implements a horizontal binop.
6508 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
6509 SDValue Op = N->getOperand(i + BaseIdx);
6512 if (Op->getOpcode() == ISD::UNDEF) {
6513 // Update the expected vector extract index.
6514 if (i * 2 == NumElts)
6515 ExpectedVExtractIdx = BaseIdx;
6516 ExpectedVExtractIdx += 2;
6520 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
6525 SDValue Op0 = Op.getOperand(0);
6526 SDValue Op1 = Op.getOperand(1);
6528 // Try to match the following pattern:
6529 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
6530 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
6531 Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
6532 Op0.getOperand(0) == Op1.getOperand(0) &&
6533 isa<ConstantSDNode>(Op0.getOperand(1)) &&
6534 isa<ConstantSDNode>(Op1.getOperand(1)));
6538 unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
6539 unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
6541 if (i * 2 < NumElts) {
6542 if (V0.getOpcode() == ISD::UNDEF)
6543 V0 = Op0.getOperand(0);
6545 if (V1.getOpcode() == ISD::UNDEF)
6546 V1 = Op0.getOperand(0);
6547 if (i * 2 == NumElts)
6548 ExpectedVExtractIdx = BaseIdx;
6551 SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
6552 if (I0 == ExpectedVExtractIdx)
6553 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
6554 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
6555 // Try to match the following dag sequence:
6556 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
6557 CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
6561 ExpectedVExtractIdx += 2;
6567 /// \brief Emit a sequence of two 128-bit horizontal add/sub followed by
6568 /// a concat_vector.
6570 /// This is a helper function of PerformBUILD_VECTORCombine.
6571 /// This function expects two 256-bit vectors called V0 and V1.
6572 /// At first, each vector is split into two separate 128-bit vectors.
6573 /// Then, the resulting 128-bit vectors are used to implement two
6574 /// horizontal binary operations.
6576 /// The kind of horizontal binary operation is defined by \p X86Opcode.
/// \p Mode specifies how the 128-bit parts of V0 and V1 are passed as inputs
/// to the two new horizontal binops.
/// When Mode is set, the first horizontal binop dag node takes as input the
/// lower 128 bits of V0 and the upper 128 bits of V0, and the second
/// horizontal binop dag node takes as input the lower 128 bits of V1 and the
/// upper 128 bits of V1:
///   HADD V0_LO, V0_HI
///   HADD V1_LO, V1_HI
/// Otherwise, the first horizontal binop dag node takes as input the lower
/// 128 bits of V0 and the lower 128 bits of V1, and the second horizontal
/// binop dag node takes the upper 128 bits of V0 and the upper 128 bits of V1:
6592 /// HADD V0_LO, V1_LO
6593 /// HADD V0_HI, V1_HI
6595 /// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
6596 /// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
6597 /// the upper 128-bits of the result.
6598 static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
6599 SDLoc DL, SelectionDAG &DAG,
6600 unsigned X86Opcode, bool Mode,
6601 bool isUndefLO, bool isUndefHI) {
6602 EVT VT = V0.getValueType();
6603 assert(VT.is256BitVector() && VT == V1.getValueType() &&
6604 "Invalid nodes in input!");
6606 unsigned NumElts = VT.getVectorNumElements();
6607 SDValue V0_LO = Extract128BitVector(V0, 0, DAG, DL);
6608 SDValue V0_HI = Extract128BitVector(V0, NumElts/2, DAG, DL);
6609 SDValue V1_LO = Extract128BitVector(V1, 0, DAG, DL);
6610 SDValue V1_HI = Extract128BitVector(V1, NumElts/2, DAG, DL);
6611 EVT NewVT = V0_LO.getValueType();
6613 SDValue LO = DAG.getUNDEF(NewVT);
6614 SDValue HI = DAG.getUNDEF(NewVT);
  if (Mode) {
    // Don't emit a horizontal binop if the result is expected to be UNDEF.
    if (!isUndefLO && V0->getOpcode() != ISD::UNDEF)
      LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
    if (!isUndefHI && V1->getOpcode() != ISD::UNDEF)
      HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
  } else {
    // Don't emit a horizontal binop if the result is expected to be UNDEF.
    if (!isUndefLO && (V0_LO->getOpcode() != ISD::UNDEF ||
                       V1_LO->getOpcode() != ISD::UNDEF))
      LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);

    if (!isUndefHI && (V0_HI->getOpcode() != ISD::UNDEF ||
                       V1_HI->getOpcode() != ISD::UNDEF))
      HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
  }
6633 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
6636 /// \brief Try to fold a build_vector that performs an 'addsub' into the
6637 /// sequence of 'vadd + vsub + blendi'.
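///
/// For example, a v4f32 build_vector of the form
///   <(fsub A0, B0), (fadd A1, B1), (fsub A2, B2), (fadd A3, B3)>
/// where each Ai/Bi is the extract_vector_elt of A/B at index i, is matched
/// as a single (X86ISD::ADDSUB A, B) node.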
6638 static SDValue matchAddSub(const BuildVectorSDNode *BV, SelectionDAG &DAG,
6639 const X86Subtarget *Subtarget) {
6641 EVT VT = BV->getValueType(0);
6642 unsigned NumElts = VT.getVectorNumElements();
6643 SDValue InVec0 = DAG.getUNDEF(VT);
6644 SDValue InVec1 = DAG.getUNDEF(VT);
6646 assert((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
6647 VT == MVT::v2f64) && "build_vector with an invalid type found!");
6649 // Odd-numbered elements in the input build vector are obtained from
6650 // adding two integer/float elements.
6651 // Even-numbered elements in the input build vector are obtained from
6652 // subtracting two integer/float elements.
6653 unsigned ExpectedOpcode = ISD::FSUB;
6654 unsigned NextExpectedOpcode = ISD::FADD;
6655 bool AddFound = false;
6656 bool SubFound = false;
6658 for (unsigned i = 0, e = NumElts; i != e; i++) {
6659 SDValue Op = BV->getOperand(i);
6661 // Skip 'undef' values.
6662 unsigned Opcode = Op.getOpcode();
6663 if (Opcode == ISD::UNDEF) {
6664 std::swap(ExpectedOpcode, NextExpectedOpcode);
6668 // Early exit if we found an unexpected opcode.
6669 if (Opcode != ExpectedOpcode)
6672 SDValue Op0 = Op.getOperand(0);
6673 SDValue Op1 = Op.getOperand(1);
6675 // Try to match the following pattern:
6676 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
6677 // Early exit if we cannot match that sequence.
6678 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6679 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6680 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
6681 !isa<ConstantSDNode>(Op1.getOperand(1)) ||
6682 Op0.getOperand(1) != Op1.getOperand(1))
6685 unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
6689 // We found a valid add/sub node. Update the information accordingly.
6695 // Update InVec0 and InVec1.
6696 if (InVec0.getOpcode() == ISD::UNDEF)
6697 InVec0 = Op0.getOperand(0);
6698 if (InVec1.getOpcode() == ISD::UNDEF)
6699 InVec1 = Op1.getOperand(0);
    // Make sure that the operands of each add/sub node always come from the
    // same pair of vectors.
6703 if (InVec0 != Op0.getOperand(0)) {
6704 if (ExpectedOpcode == ISD::FSUB)
6707 // FADD is commutable. Try to commute the operands
6708 // and then test again.
6709 std::swap(Op0, Op1);
6710 if (InVec0 != Op0.getOperand(0))
6714 if (InVec1 != Op1.getOperand(0))
6717 // Update the pair of expected opcodes.
6718 std::swap(ExpectedOpcode, NextExpectedOpcode);
6721 // Don't try to fold this build_vector into an ADDSUB if the inputs are undef.
6722 if (AddFound && SubFound && InVec0.getOpcode() != ISD::UNDEF &&
6723 InVec1.getOpcode() != ISD::UNDEF)
6724 return DAG.getNode(X86ISD::ADDSUB, DL, VT, InVec0, InVec1);
6729 static SDValue PerformBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG,
6730 const X86Subtarget *Subtarget) {
6732 EVT VT = N->getValueType(0);
6733 unsigned NumElts = VT.getVectorNumElements();
6734 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(N);
6735 SDValue InVec0, InVec1;
6737 // Try to match an ADDSUB.
6738 if ((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
6739 (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
6740 SDValue Value = matchAddSub(BV, DAG, Subtarget);
6741 if (Value.getNode())
6745 // Try to match horizontal ADD/SUB.
6746 unsigned NumUndefsLO = 0;
6747 unsigned NumUndefsHI = 0;
6748 unsigned Half = NumElts/2;
  // Count the number of UNDEF operands in the input build_vector.
6751 for (unsigned i = 0, e = Half; i != e; ++i)
6752 if (BV->getOperand(i)->getOpcode() == ISD::UNDEF)
6755 for (unsigned i = Half, e = NumElts; i != e; ++i)
6756 if (BV->getOperand(i)->getOpcode() == ISD::UNDEF)
6759 // Early exit if this is either a build_vector of all UNDEFs or all the
6760 // operands but one are UNDEF.
6761 if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
6764 if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget->hasSSE3()) {
6765 // Try to match an SSE3 float HADD/HSUB.
6766 if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
6767 return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
6769 if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
6770 return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
6771 } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget->hasSSSE3()) {
6772 // Try to match an SSSE3 integer HADD/HSUB.
6773 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
6774 return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);
6776 if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
6777 return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
6780 if (!Subtarget->hasAVX())
6783 if ((VT == MVT::v8f32 || VT == MVT::v4f64)) {
6784 // Try to match an AVX horizontal add/sub of packed single/double
6785 // precision floating point values from 256-bit vectors.
6786 SDValue InVec2, InVec3;
6787 if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
6788 isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
6789 ((InVec0.getOpcode() == ISD::UNDEF ||
6790 InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
6791 ((InVec1.getOpcode() == ISD::UNDEF ||
6792 InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
6793 return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
6795 if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
6796 isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
6797 ((InVec0.getOpcode() == ISD::UNDEF ||
6798 InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
6799 ((InVec1.getOpcode() == ISD::UNDEF ||
6800 InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
6801 return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
6802 } else if (VT == MVT::v8i32 || VT == MVT::v16i16) {
6803 // Try to match an AVX2 horizontal add/sub of signed integers.
6804 SDValue InVec2, InVec3;
6806 bool CanFold = true;
6808 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
6809 isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
6810 ((InVec0.getOpcode() == ISD::UNDEF ||
6811 InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
6812 ((InVec1.getOpcode() == ISD::UNDEF ||
6813 InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
6814 X86Opcode = X86ISD::HADD;
6815 else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
6816 isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) &&
6817 ((InVec0.getOpcode() == ISD::UNDEF ||
6818 InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
6819 ((InVec1.getOpcode() == ISD::UNDEF ||
6820 InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
6821 X86Opcode = X86ISD::HSUB;
6826 // Fold this build_vector into a single horizontal add/sub.
6827 // Do this only if the target has AVX2.
6828 if (Subtarget->hasAVX2())
6829 return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);
6831 // Do not try to expand this build_vector into a pair of horizontal
6832 // add/sub if we can emit a pair of scalar add/sub.
6833 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
      // Convert this build_vector into a pair of horizontal binops followed by
      // a concat vector.
6838 bool isUndefLO = NumUndefsLO == Half;
6839 bool isUndefHI = NumUndefsHI == Half;
6840 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
6841 isUndefLO, isUndefHI);
6845 if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
6846 VT == MVT::v16i16) && Subtarget->hasAVX()) {
6848 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
6849 X86Opcode = X86ISD::HADD;
6850 else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
6851 X86Opcode = X86ISD::HSUB;
6852 else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
6853 X86Opcode = X86ISD::FHADD;
6854 else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
6855 X86Opcode = X86ISD::FHSUB;
6859 // Don't try to expand this build_vector into a pair of horizontal add/sub
6860 // if we can simply emit a pair of scalar add/sub.
6861 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
    // Convert this build_vector into two horizontal add/sub nodes followed by
    // a concat vector.
6866 bool isUndefLO = NumUndefsLO == Half;
6867 bool isUndefHI = NumUndefsHI == Half;
6868 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
6869 isUndefLO, isUndefHI);
6876 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
6879 MVT VT = Op.getSimpleValueType();
6880 MVT ExtVT = VT.getVectorElementType();
6881 unsigned NumElems = Op.getNumOperands();
6883 // Generate vectors for predicate vectors.
6884 if (VT.getScalarType() == MVT::i1 && Subtarget->hasAVX512())
6885 return LowerBUILD_VECTORvXi1(Op, DAG);
6887 // Vectors containing all zeros can be matched by pxor and xorps later
6888 if (ISD::isBuildVectorAllZeros(Op.getNode())) {
6889 // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
6890 // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
6891 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
6894 return getZeroVector(VT, Subtarget, DAG, dl);
6897 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
6898 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
6899 // vpcmpeqd on 256-bit vectors.
6900 if (Subtarget->hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
6901 if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasInt256()))
6904 if (!VT.is512BitVector())
6905 return getOnesVector(VT, Subtarget->hasInt256(), DAG, dl);
6908 SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG);
6909 if (Broadcast.getNode())
6912 unsigned EVTBits = ExtVT.getSizeInBits();
6914 unsigned NumZero = 0;
6915 unsigned NumNonZero = 0;
6916 unsigned NonZeros = 0;
6917 bool IsAllConstants = true;
6918 SmallSet<SDValue, 8> Values;
6919 for (unsigned i = 0; i < NumElems; ++i) {
6920 SDValue Elt = Op.getOperand(i);
6921 if (Elt.getOpcode() == ISD::UNDEF)
6924 if (Elt.getOpcode() != ISD::Constant &&
6925 Elt.getOpcode() != ISD::ConstantFP)
6926 IsAllConstants = false;
6927 if (X86::isZeroNode(Elt))
6930 NonZeros |= (1 << i);
6935 // All undef vector. Return an UNDEF. All zero vectors were handled above.
6936 if (NumNonZero == 0)
6937 return DAG.getUNDEF(VT);
6939 // Special case for single non-zero, non-undef, element.
6940 if (NumNonZero == 1) {
6941 unsigned Idx = countTrailingZeros(NonZeros);
6942 SDValue Item = Op.getOperand(Idx);
6944 // If this is an insertion of an i64 value on x86-32, and if the top bits of
6945 // the value are obviously zero, truncate the value to i32 and do the
6946 // insertion that way. Only do this if the value is non-constant or if the
6947 // value is a constant being inserted into element 0. It is cheaper to do
6948 // a constant pool load than it is to do a movd + shuffle.
6949 if (ExtVT == MVT::i64 && !Subtarget->is64Bit() &&
6950 (!IsAllConstants || Idx == 0)) {
6951 if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
6953 assert(VT == MVT::v2i64 && "Expected an SSE value type!");
6954 EVT VecVT = MVT::v4i32;
6955 unsigned VecElts = 4;
6957 // Truncate the value (which may itself be a constant) to i32, and
6958 // convert it to a vector with movd (S2V+shuffle to zero extend).
6959 Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
6960 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
6962 // If using the new shuffle lowering, just directly insert this.
6963 if (ExperimentalVectorShuffleLowering)
6965 ISD::BITCAST, dl, VT,
6966 getShuffleVectorZeroOrUndef(Item, Idx * 2, true, Subtarget, DAG));
6968 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
6970 // Now we have our 32-bit value zero extended in the low element of
6971 // a vector. If Idx != 0, swizzle it into place.
6973 SmallVector<int, 4> Mask;
6974 Mask.push_back(Idx);
6975 for (unsigned i = 1; i != VecElts; ++i)
6977 Item = DAG.getVectorShuffle(VecVT, dl, Item, DAG.getUNDEF(VecVT),
6980 return DAG.getNode(ISD::BITCAST, dl, VT, Item);
6984 // If we have a constant or non-constant insertion into the low element of
6985 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
6986 // the rest of the elements. This will be matched as movd/movq/movss/movsd
6987 // depending on what the source datatype is.
6990 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
6992 if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
6993 (ExtVT == MVT::i64 && Subtarget->is64Bit())) {
6994 if (VT.is256BitVector() || VT.is512BitVector()) {
6995 SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl);
6996 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec,
6997 Item, DAG.getIntPtrConstant(0));
6999 assert(VT.is128BitVector() && "Expected an SSE value type!");
7000 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
7001 // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
7002 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
7005 if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
7006 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
7007 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
7008 if (VT.is256BitVector()) {
7009 SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl);
7010 Item = Insert128BitVector(ZeroVec, Item, 0, DAG, dl);
7012 assert(VT.is128BitVector() && "Expected an SSE value type!");
7013 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
7015 return DAG.getNode(ISD::BITCAST, dl, VT, Item);
7019 // Is it a vector logical left shift?
7020 if (NumElems == 2 && Idx == 1 &&
7021 X86::isZeroNode(Op.getOperand(0)) &&
7022 !X86::isZeroNode(Op.getOperand(1))) {
7023 unsigned NumBits = VT.getSizeInBits();
7024 return getVShift(true, VT,
7025 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
7026 VT, Op.getOperand(1)),
7027 NumBits/2, DAG, *this, dl);
7030 if (IsAllConstants) // Otherwise, it's better to do a constpool load.
7033 // Otherwise, if this is a vector with i32 or f32 elements, and the element
7034 // is a non-constant being inserted into an element other than the low one,
7035 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
    // movd/movss) to move this into the low element, then shuffle it into
    // place.
7038 if (EVTBits == 32) {
7039 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
7041 // If using the new shuffle lowering, just directly insert this.
7042 if (ExperimentalVectorShuffleLowering)
7043 return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
7045 // Turn it into a shuffle of zero and zero-extended scalar to vector.
7046 Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, Subtarget, DAG);
7047 SmallVector<int, 8> MaskVec;
7048 for (unsigned i = 0; i != NumElems; ++i)
7049 MaskVec.push_back(i == Idx ? 0 : 1);
7050 return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]);
7054 // Splat is obviously ok. Let legalizer expand it to a shuffle.
7055 if (Values.size() == 1) {
7056 if (EVTBits == 32) {
7057 // Instead of a shuffle like this:
7058 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
7059 // Check if it's possible to issue this instead.
7060 // shuffle (vload ptr)), undef, <1, 1, 1, 1>
7061 unsigned Idx = countTrailingZeros(NonZeros);
7062 SDValue Item = Op.getOperand(Idx);
7063 if (Op.getNode()->isOnlyUserOf(Item.getNode()))
7064 return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
7069 // A vector full of immediates; various special cases are already
7070 // handled, so this is best done with a single constant-pool load.
7074 // For AVX-length vectors, see if we can use a vector load to get all of the
7075 // elements, otherwise build the individual 128-bit pieces and use
7076 // shuffles to put them in place.
7077 if (VT.is256BitVector() || VT.is512BitVector()) {
7078 SmallVector<SDValue, 64> V;
7079 for (unsigned i = 0; i != NumElems; ++i)
7080 V.push_back(Op.getOperand(i));
7082 // Check for a build vector of consecutive loads.
7083 if (SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false))
7086 EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
7088 // Build both the lower and upper subvector.
7089 SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
7090 makeArrayRef(&V[0], NumElems/2));
7091 SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
7092 makeArrayRef(&V[NumElems / 2], NumElems/2));
7094 // Recreate the wider vector with the lower and upper part.
7095 if (VT.is256BitVector())
7096 return Concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
7097 return Concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
7100 // Let legalizer expand 2-wide build_vectors.
7101 if (EVTBits == 64) {
7102 if (NumNonZero == 1) {
7103 // One half is zero or undef.
7104 unsigned Idx = countTrailingZeros(NonZeros);
7105 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
7106 Op.getOperand(Idx));
7107 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
7112 // If element VT is < 32 bits, convert it to inserts into a zero vector.
7113 if (EVTBits == 8 && NumElems == 16) {
7114 SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG,
7116 if (V.getNode()) return V;
7119 if (EVTBits == 16 && NumElems == 8) {
7120 SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG,
7122 if (V.getNode()) return V;
7125 // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
7126 if (EVTBits == 32 && NumElems == 4) {
7127 SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget, *this);
7132 // If element VT is == 32 bits, turn it into a number of shuffles.
7133 SmallVector<SDValue, 8> V(NumElems);
7134 if (NumElems == 4 && NumZero > 0) {
7135 for (unsigned i = 0; i < 4; ++i) {
7136 bool isZero = !(NonZeros & (1 << i));
7138 V[i] = getZeroVector(VT, Subtarget, DAG, dl);
7140 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
7143 for (unsigned i = 0; i < 2; ++i) {
7144 switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
7147 V[i] = V[i*2]; // Must be a zero vector.
7150 V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]);
7153 V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]);
7156 V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]);
7161 bool Reverse1 = (NonZeros & 0x3) == 2;
7162 bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
7166 static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
7167 static_cast<int>(Reverse2 ? NumElems : NumElems+1)
7169 return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]);
7172 if (Values.size() > 1 && VT.is128BitVector()) {
7173 // Check for a build vector of consecutive loads.
7174 for (unsigned i = 0; i < NumElems; ++i)
7175 V[i] = Op.getOperand(i);
7177 // Check for elements which are consecutive loads.
7178 SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false);
7182 // Check for a build vector from mostly shuffle plus few inserting.
7183 SDValue Sh = buildFromShuffleMostly(Op, DAG);
7187 // For SSE 4.1, use insertps to put the high elements into the low element.
7188 if (getSubtarget()->hasSSE41()) {
7190 if (Op.getOperand(0).getOpcode() != ISD::UNDEF)
7191 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
7193 Result = DAG.getUNDEF(VT);
7195 for (unsigned i = 1; i < NumElems; ++i) {
7196 if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue;
7197 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
7198 Op.getOperand(i), DAG.getIntPtrConstant(i));
7203 // Otherwise, expand into a number of unpckl*, start by extending each of
7204 // our (non-undef) elements to the full vector width with the element in the
7205 // bottom slot of the vector (which generates no code for SSE).
7206 for (unsigned i = 0; i < NumElems; ++i) {
7207 if (Op.getOperand(i).getOpcode() != ISD::UNDEF)
7208 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
7210 V[i] = DAG.getUNDEF(VT);
7213 // Next, we iteratively mix elements, e.g. for v4f32:
7214 // Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
7215 // : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
7216 // Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
7217 unsigned EltStride = NumElems >> 1;
7218 while (EltStride != 0) {
7219 for (unsigned i = 0; i < EltStride; ++i) {
7220 // If V[i+EltStride] is undef and this is the first round of mixing,
7221 // then it is safe to just drop this shuffle: V[i] is already in the
7222 // right place, the one element (since it's the first round) being
7223 // inserted as undef can be dropped. This isn't safe for successive
7224 // rounds because they will permute elements within both vectors.
7225 if (V[i+EltStride].getOpcode() == ISD::UNDEF &&
7226 EltStride == NumElems/2)
7229 V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]);
7238 // LowerAVXCONCAT_VECTORS - 256-bit AVX can use the vinsertf128 instruction
7239 // to create 256-bit vectors from two other 128-bit ones.
7240 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
7242 MVT ResVT = Op.getSimpleValueType();
7244 assert((ResVT.is256BitVector() ||
7245 ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
7247 SDValue V1 = Op.getOperand(0);
7248 SDValue V2 = Op.getOperand(1);
7249 unsigned NumElems = ResVT.getVectorNumElements();
7250 if(ResVT.is256BitVector())
7251 return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
7253 if (Op.getNumOperands() == 4) {
7254 MVT HalfVT = MVT::getVectorVT(ResVT.getScalarType(),
7255 ResVT.getVectorNumElements()/2);
7256 SDValue V3 = Op.getOperand(2);
7257 SDValue V4 = Op.getOperand(3);
7258 return Concat256BitVectors(Concat128BitVectors(V1, V2, HalfVT, NumElems/2, DAG, dl),
7259 Concat128BitVectors(V3, V4, HalfVT, NumElems/2, DAG, dl), ResVT, NumElems, DAG, dl);
7261 return Concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
7264 static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
7265 MVT LLVM_ATTRIBUTE_UNUSED VT = Op.getSimpleValueType();
7266 assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
7267 (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
7268 Op.getNumOperands() == 4)));
7270 // AVX can use the vinsertf128 instruction to create 256-bit vectors
7271 // from two other 128-bit ones.
  // A 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors.
7274 return LowerAVXCONCAT_VECTORS(Op, DAG);
7278 //===----------------------------------------------------------------------===//
7279 // Vector shuffle lowering
7281 // This is an experimental code path for lowering vector shuffles on x86. It is
7282 // designed to handle arbitrary vector shuffles and blends, gracefully
7283 // degrading performance as necessary. It works hard to recognize idiomatic
7284 // shuffles and lower them to optimal instruction patterns without leaving
// a framework that allows reasonably efficient handling of all vector shuffle
// patterns.
//
7287 //===----------------------------------------------------------------------===//
7289 /// \brief Tiny helper function to identify a no-op mask.
7291 /// This is a somewhat boring predicate function. It checks whether the mask
7292 /// array input, which is assumed to be a single-input shuffle mask of the kind
7293 /// used by the X86 shuffle instructions (not a fully general
7294 /// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
7295 /// in-place shuffle are 'no-op's.
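///
/// For example, <0, -1, 2, 3> is a no-op mask, while <0, 0, 2, 3> is not.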
7296 static bool isNoopShuffleMask(ArrayRef<int> Mask) {
7297 for (int i = 0, Size = Mask.size(); i < Size; ++i)
7298 if (Mask[i] != -1 && Mask[i] != i)
7303 /// \brief Helper function to classify a mask as a single-input mask.
7305 /// This isn't a generic single-input test because in the vector shuffle
7306 /// lowering we canonicalize single inputs to be the first input operand. This
7307 /// means we can more quickly test for a single input by only checking whether
/// an input from the second operand exists. We also assume that the size of
/// the mask corresponds to the size of the input vectors, which isn't true in
/// the fully general case.
7311 static bool isSingleInputShuffleMask(ArrayRef<int> Mask) {
7313 if (M >= (int)Mask.size())
/// \brief Test whether there are elements crossing 128-bit lanes in this
/// shuffle mask.
///
7321 /// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
7322 /// and we routinely test for these.
7323 static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
7324 int LaneSize = 128 / VT.getScalarSizeInBits();
7325 int Size = Mask.size();
7326 for (int i = 0; i < Size; ++i)
7327 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
7332 /// \brief Test whether a shuffle mask is equivalent within each 128-bit lane.
7334 /// This checks a shuffle mask to see if it is performing the same
7335 /// 128-bit lane-relative shuffle in each 128-bit lane. This trivially implies
7336 /// that it is also not lane-crossing. It may however involve a blend from the
7337 /// same lane of a second vector.
7339 /// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
7340 /// non-trivial to compute in the face of undef lanes. The representation is
7341 /// *not* suitable for use with existing 128-bit shuffles as it will contain
7342 /// entries from both V1 and V2 inputs to the wider mask.
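///
/// For example, the v8f32 mask <0, 9, 2, 11, 4, 13, 6, 15> repeats the same
/// pattern in both 128-bit lanes, so \p RepeatedMask is populated as
/// <0, 9, 2, 11> (entries greater than or equal to 8 refer to the V2 input of
/// the wider mask).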
7344 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
7345 SmallVectorImpl<int> &RepeatedMask) {
7346 int LaneSize = 128 / VT.getScalarSizeInBits();
7347 RepeatedMask.resize(LaneSize, -1);
7348 int Size = Mask.size();
7349 for (int i = 0; i < Size; ++i) {
7352 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
7353 // This entry crosses lanes, so there is no way to model this shuffle.
7356 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
7357 if (RepeatedMask[i % LaneSize] == -1)
7358 // This is the first non-undef entry in this slot of a 128-bit lane.
7359 RepeatedMask[i % LaneSize] =
7360 Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + Size;
7361 else if (RepeatedMask[i % LaneSize] + (i / LaneSize) * LaneSize != Mask[i])
7362 // Found a mismatch with the repeated mask.
7368 // Hide this symbol with an anonymous namespace instead of 'static' so that MSVC
7369 // 2013 will allow us to use it as a non-type template parameter.
7372 /// \brief Implementation of the \c isShuffleEquivalent variadic functor.
7374 /// See its documentation for details.
7375 bool isShuffleEquivalentImpl(ArrayRef<int> Mask, ArrayRef<const int *> Args) {
7376 if (Mask.size() != Args.size())
7378 for (int i = 0, e = Mask.size(); i < e; ++i) {
7379 assert(*Args[i] >= 0 && "Arguments must be positive integers!");
7380 if (Mask[i] != -1 && Mask[i] != *Args[i])
7388 /// \brief Checks whether a shuffle mask is equivalent to an explicit list of
7391 /// This is a fast way to test a shuffle mask against a fixed pattern:
7393 /// if (isShuffleEquivalent(Mask, 3, 2, 1, 0)) { ... }
7395 /// It returns true if the mask is exactly as wide as the argument list, and
7396 /// each element of the mask is either -1 (signifying undef) or the value given
7397 /// in the argument.
7398 static const VariadicFunction1<
7399 bool, ArrayRef<int>, int, isShuffleEquivalentImpl> isShuffleEquivalent = {};
7401 /// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
7403 /// This helper function produces an 8-bit shuffle immediate corresponding to
7404 /// the ubiquitous shuffle encoding scheme used in x86 instructions for
/// shuffling 4 lanes. It can be used with most of the PSHUF instructions, for
/// example.
7408 /// NB: We rely heavily on "undef" masks preserving the input lane.
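///
/// For example, the mask <3, 2, 1, 0> produces the immediate 0b00011011
/// (0x1B): two bits per result lane, with the lowest result lane encoded in
/// the two least-significant bits.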
7409 static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask,
7410 SelectionDAG &DAG) {
7411 assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
7412 assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
7413 assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
7414 assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
7415 assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
7418 Imm |= (Mask[0] == -1 ? 0 : Mask[0]) << 0;
7419 Imm |= (Mask[1] == -1 ? 1 : Mask[1]) << 2;
7420 Imm |= (Mask[2] == -1 ? 2 : Mask[2]) << 4;
7421 Imm |= (Mask[3] == -1 ? 3 : Mask[3]) << 6;
7422 return DAG.getConstant(Imm, MVT::i8);
7425 /// \brief Try to emit a blend instruction for a shuffle.
7427 /// This doesn't do any checks for the availability of instructions for blending
7428 /// these values. It relies on the availability of the X86ISD::BLENDI pattern to
7429 /// be matched in the backend with the type given. What it does check for is
7430 /// that the shuffle mask is in fact a blend.
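///
/// For example, the v4f32 mask <0, 5, 2, 7> is a blend: elements 1 and 3 are
/// taken from V2, so the computed blend immediate is 0b1010.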
7431 static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
7432 SDValue V2, ArrayRef<int> Mask,
7433 const X86Subtarget *Subtarget,
7434 SelectionDAG &DAG) {
7436 unsigned BlendMask = 0;
7437 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7438 if (Mask[i] >= Size) {
7439 if (Mask[i] != i + Size)
7440 return SDValue(); // Shuffled V2 input!
7441 BlendMask |= 1u << i;
7444 if (Mask[i] >= 0 && Mask[i] != i)
7445 return SDValue(); // Shuffled V1 input!
7447 switch (VT.SimpleTy) {
7452 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
7453 DAG.getConstant(BlendMask, MVT::i8));
7457 assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
7461 // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
7462 // that instruction.
7463 if (Subtarget->hasAVX2()) {
7464 // Scale the blend by the number of 32-bit dwords per element.
7465 int Scale = VT.getScalarSizeInBits() / 32;
7467 for (int i = 0, Size = Mask.size(); i < Size; ++i)
7468 if (Mask[i] >= Size)
7469 for (int j = 0; j < Scale; ++j)
7470 BlendMask |= 1u << (i * Scale + j);
7472 MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
7473 V1 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V1);
7474 V2 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V2);
7475 return DAG.getNode(ISD::BITCAST, DL, VT,
7476 DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
7477 DAG.getConstant(BlendMask, MVT::i8)));
7481 // For integer shuffles we need to expand the mask and cast the inputs to
7482 // v8i16s prior to blending.
7483 int Scale = 8 / VT.getVectorNumElements();
7485 for (int i = 0, Size = Mask.size(); i < Size; ++i)
7486 if (Mask[i] >= Size)
7487 for (int j = 0; j < Scale; ++j)
7488 BlendMask |= 1u << (i * Scale + j);
7490 V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1);
7491 V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V2);
7492 return DAG.getNode(ISD::BITCAST, DL, VT,
7493 DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
7494 DAG.getConstant(BlendMask, MVT::i8)));
7498 assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
7499 SmallVector<int, 8> RepeatedMask;
7500 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
7501 // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
7502 assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
7504 for (int i = 0; i < 8; ++i)
7505 if (RepeatedMask[i] >= 16)
7506 BlendMask |= 1u << i;
7507 return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
7508 DAG.getConstant(BlendMask, MVT::i8));
7513 assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
7514 // Scale the blend by the number of bytes per element.
7515 int Scale = VT.getScalarSizeInBits() / 8;
7516 assert(Mask.size() * Scale == 32 && "Not a 256-bit vector!");
7518 // Compute the VSELECT mask. Note that VSELECT is really confusing in the
7519 // mix of LLVM's code generator and the x86 backend. We tell the code
7520 // generator that boolean values in the elements of an x86 vector register
7521 // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
7522 // mapping a select to operand #1, and 'false' mapping to operand #2. The
7523 // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
7524 // of the element (the remaining are ignored) and 0 in that high bit would
7525 // mean operand #1 while 1 in the high bit would mean operand #2. So while
7526 // the LLVM model for boolean values in vector elements gets the relevant
    // bit set, it is set backwards and over-constrained relative to x86's
    // actual semantics.
7529 SDValue VSELECTMask[32];
7530 for (int i = 0, Size = Mask.size(); i < Size; ++i)
7531 for (int j = 0; j < Scale; ++j)
7532 VSELECTMask[Scale * i + j] =
7533 Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
7534 : DAG.getConstant(Mask[i] < Size ? -1 : 0, MVT::i8);
7536 V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V1);
7537 V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V2);
7539 ISD::BITCAST, DL, VT,
7540 DAG.getNode(ISD::VSELECT, DL, MVT::v32i8,
7541 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, VSELECTMask),
7546 llvm_unreachable("Not a supported integer vector type!");
7550 /// \brief Generic routine to lower a shuffle and blend as a decomposed set of
7551 /// unblended shuffles followed by an unshuffled blend.
7553 /// This matches the extremely common pattern for handling combined
/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
/// operations.
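///
/// For example, the v4i32 mask <2, 7, 0, 5> is decomposed into a shuffle of V1
/// with mask <2, u, 0, u>, a shuffle of V2 with mask <u, 3, u, 1>, and a final
/// blend with mask <0, 5, 2, 7>.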
7556 static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(SDLoc DL, MVT VT,
7560 SelectionDAG &DAG) {
7561 // Shuffle the input elements into the desired positions in V1 and V2 and
7562 // blend them together.
7563 SmallVector<int, 32> V1Mask(Mask.size(), -1);
7564 SmallVector<int, 32> V2Mask(Mask.size(), -1);
7565 SmallVector<int, 32> BlendMask(Mask.size(), -1);
7566 for (int i = 0, Size = Mask.size(); i < Size; ++i)
7567 if (Mask[i] >= 0 && Mask[i] < Size) {
7568 V1Mask[i] = Mask[i];
7570 } else if (Mask[i] >= Size) {
7571 V2Mask[i] = Mask[i] - Size;
7572 BlendMask[i] = i + Size;
7575 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
7576 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
7577 return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
7580 /// \brief Try to lower a vector shuffle as a byte rotation.
7582 /// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
7583 /// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
7584 /// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
/// try to generically lower a vector shuffle through such a pattern. It
7586 /// does not check for the profitability of lowering either as PALIGNR or
7587 /// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
7588 /// This matches shuffle vectors that look like:
7590 /// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
7592 /// Essentially it concatenates V1 and V2, shifts right by some number of
7593 /// elements, and takes the low elements as the result. Note that while this is
7594 /// specified as a *right shift* because x86 is little-endian, it is a *left
7595 /// rotate* of the vector lanes.
7597 /// Note that this only handles 128-bit vector widths currently.
7598 static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1,
7601 const X86Subtarget *Subtarget,
7602 SelectionDAG &DAG) {
7603 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
7605 // We need to detect various ways of spelling a rotation:
7606 // [11, 12, 13, 14, 15, 0, 1, 2]
7607 // [-1, 12, 13, 14, -1, -1, 1, -1]
7608 // [-1, -1, -1, -1, -1, -1, 1, 2]
7609 // [ 3, 4, 5, 6, 7, 8, 9, 10]
7610 // [-1, 4, 5, 6, -1, -1, 9, -1]
7611 // [-1, 4, 5, 6, -1, -1, -1, -1]
7614 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7617 assert(Mask[i] >= 0 && "Only -1 is a valid negative mask element!");
7619 // Based on the mod-Size value of this mask element determine where
7620 // a rotated vector would have started.
7621 int StartIdx = i - (Mask[i] % Size);
7623 // The identity rotation isn't interesting, stop.
7626 // If we found the tail of a vector the rotation must be the missing
7627 // front. If we found the head of a vector, it must be how much of the head.
7628 int CandidateRotation = StartIdx < 0 ? -StartIdx : Size - StartIdx;
7631 Rotation = CandidateRotation;
7632 else if (Rotation != CandidateRotation)
7633 // The rotations don't match, so we can't match this mask.
7636 // Compute which value this mask is pointing at.
7637 SDValue MaskV = Mask[i] < Size ? V1 : V2;
7639 // Compute which of the two target values this index should be assigned to.
      // This reflects whether the high elements are remaining or the low
      // elements are remaining.
7642 SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
7644 // Either set up this value if we've not encountered it before, or check
7645 // that it remains consistent.
7648 else if (TargetV != MaskV)
7649 // This may be a rotation, but it pulls from the inputs in some
7650 // unsupported interleaving.
7654 // Check that we successfully analyzed the mask, and normalize the results.
7655 assert(Rotation != 0 && "Failed to locate a viable rotation!");
7656 assert((Lo || Hi) && "Failed to find a rotated input vector!");
7662 assert(VT.getSizeInBits() == 128 &&
7663 "Rotate-based lowering only supports 128-bit lowering!");
7664 assert(Mask.size() <= 16 &&
7665 "Can shuffle at most 16 bytes in a 128-bit vector!");
7667 // The actual rotate instruction rotates bytes, so we need to scale the
7668 // rotation based on how many bytes are in the vector.
7669 int Scale = 16 / Mask.size();
7671 // SSSE3 targets can use the palignr instruction
7672 if (Subtarget->hasSSSE3()) {
7673 // Cast the inputs to v16i8 to match PALIGNR.
7674 Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Lo);
7675 Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Hi);
7677 return DAG.getNode(ISD::BITCAST, DL, VT,
7678 DAG.getNode(X86ISD::PALIGNR, DL, MVT::v16i8, Hi, Lo,
7679 DAG.getConstant(Rotation * Scale, MVT::i8)));
7682 // Default SSE2 implementation
7683 int LoByteShift = 16 - Rotation * Scale;
7684 int HiByteShift = Rotation * Scale;
7686 // Cast the inputs to v2i64 to match PSLLDQ/PSRLDQ.
7687 Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Lo);
7688 Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Hi);
7690 SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, Lo,
7691 DAG.getConstant(8 * LoByteShift, MVT::i8));
7692 SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, Hi,
7693 DAG.getConstant(8 * HiByteShift, MVT::i8));
7694 return DAG.getNode(ISD::BITCAST, DL, VT,
7695 DAG.getNode(ISD::OR, DL, MVT::v2i64, LoShift, HiShift));
7698 /// \brief Compute whether each element of a shuffle is zeroable.
7700 /// A "zeroable" vector shuffle element is one which can be lowered to zero.
7701 /// Either it is an undef element in the shuffle mask, the element of the input
7702 /// referenced is undef, or the element of the input referenced is known to be
7703 /// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
/// as many lanes with this technique as possible to simplify the remaining
/// shuffle.
7706 static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
7707 SDValue V1, SDValue V2) {
7708 SmallBitVector Zeroable(Mask.size(), false);
7710 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
7711 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
7713 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7715 // Handle the easy cases.
7716 if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
    // If this is an index into a build_vector node, dig out the input value
    // and check it.
7723 SDValue V = M < Size ? V1 : V2;
7724 if (V.getOpcode() != ISD::BUILD_VECTOR)
7727 SDValue Input = V.getOperand(M % Size);
7728 // The UNDEF opcode check really should be dead code here, but not quite
7729 // worth asserting on (it isn't invalid, just unexpected).
7730 if (Input.getOpcode() == ISD::UNDEF || X86::isZeroNode(Input))
7737 /// \brief Try to lower a vector shuffle as a byte shift (shifts in zeros).
7739 /// Attempts to match a shuffle mask against the PSRLDQ and PSLLDQ SSE2
7740 /// byte-shift instructions. The mask must consist of a shifted sequential
7741 /// shuffle from one of the input vectors and zeroable elements for the
7742 /// remaining 'shifted in' elements.
7744 /// Note that this only handles 128-bit vector widths currently.
7745 static SDValue lowerVectorShuffleAsByteShift(SDLoc DL, MVT VT, SDValue V1,
7746 SDValue V2, ArrayRef<int> Mask,
7747 SelectionDAG &DAG) {
7748 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
7750 SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
7752 int Size = Mask.size();
7753 int Scale = 16 / Size;
7755 for (int Shift = 1; Shift < Size; Shift++) {
7756 int ByteShift = Shift * Scale;
7758 // PSRLDQ : (little-endian) right byte shift
7759 // [ 5, 6, 7, zz, zz, zz, zz, zz]
7760 // [ -1, 5, 6, 7, zz, zz, zz, zz]
7761 // [ 1, 2, -1, -1, -1, -1, zz, zz]
7762 bool ZeroableRight = true;
7763 for (int i = Size - Shift; i < Size; i++) {
7764 ZeroableRight &= Zeroable[i];
7767 if (ZeroableRight) {
7768 bool ValidShiftRight1 =
7769 isSequentialOrUndefInRange(Mask, 0, Size - Shift, Shift);
7770 bool ValidShiftRight2 =
7771 isSequentialOrUndefInRange(Mask, 0, Size - Shift, Size + Shift);
7773 if (ValidShiftRight1 || ValidShiftRight2) {
7774 // Cast the inputs to v2i64 to match PSRLDQ.
7775 SDValue &TargetV = ValidShiftRight1 ? V1 : V2;
7776 SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, TargetV);
7777 SDValue Shifted = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, V,
7778 DAG.getConstant(ByteShift * 8, MVT::i8));
7779 return DAG.getNode(ISD::BITCAST, DL, VT, Shifted);
7783 // PSLLDQ : (little-endian) left byte shift
7784 // [ zz, 0, 1, 2, 3, 4, 5, 6]
7785 // [ zz, zz, -1, -1, 2, 3, 4, -1]
7786 // [ zz, zz, zz, zz, zz, zz, -1, 1]
7787 bool ZeroableLeft = true;
7788 for (int i = 0; i < Shift; i++) {
7789 ZeroableLeft &= Zeroable[i];
7793 bool ValidShiftLeft1 =
7794 isSequentialOrUndefInRange(Mask, Shift, Size - Shift, 0);
7795 bool ValidShiftLeft2 =
7796 isSequentialOrUndefInRange(Mask, Shift, Size - Shift, Size);
7798 if (ValidShiftLeft1 || ValidShiftLeft2) {
7799 // Cast the inputs to v2i64 to match PSLLDQ.
7800 SDValue &TargetV = ValidShiftLeft1 ? V1 : V2;
7801 SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, TargetV);
7802 SDValue Shifted = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, V,
7803 DAG.getConstant(ByteShift * 8, MVT::i8));
7804 return DAG.getNode(ISD::BITCAST, DL, VT, Shifted);
7812 /// \brief Lower a vector shuffle as a zero or any extension.
7814 /// Given a specific number of elements, element bit width, and extension
7815 /// stride, produce either a zero or any extension based on the available
7816 /// features of the subtarget.
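///
/// For example, zero extending the low four i16 elements of a v8i16 into a
/// v4i32 uses Scale == 2; with SSE4.1 this is emitted as a single
/// X86ISD::VZEXT (pmovzxwd), and otherwise as an unpack against a zero vector.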
7817 static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
7818 SDLoc DL, MVT VT, int NumElements, int Scale, bool AnyExt, SDValue InputV,
7819 const X86Subtarget *Subtarget, SelectionDAG &DAG) {
7820 assert(Scale > 1 && "Need a scale to extend.");
7821 int EltBits = VT.getSizeInBits() / NumElements;
7822 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
7823 "Only 8, 16, and 32 bit elements can be extended.");
7824 assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
7826 // Found a valid zext mask! Try various lowering strategies based on the
7827 // input type and available ISA extensions.
7828 if (Subtarget->hasSSE41()) {
7829 MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
7830 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
7831 NumElements / Scale);
7832 InputV = DAG.getNode(ISD::BITCAST, DL, InputVT, InputV);
7833 return DAG.getNode(ISD::BITCAST, DL, VT,
7834 DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV));
  // For any-extends we can cheat for larger element sizes and use shuffle
7838 // instructions that can fold with a load and/or copy.
7839 if (AnyExt && EltBits == 32) {
7840 int PSHUFDMask[4] = {0, -1, 1, -1};
7842 ISD::BITCAST, DL, VT,
7843 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
7844 DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, InputV),
7845 getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
7847 if (AnyExt && EltBits == 16 && Scale > 2) {
7848 int PSHUFDMask[4] = {0, -1, 0, -1};
7849 InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
7850 DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, InputV),
7851 getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG));
7852 int PSHUFHWMask[4] = {1, -1, -1, -1};
7854 ISD::BITCAST, DL, VT,
7855 DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16,
7856 DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, InputV),
7857 getV4X86ShuffleImm8ForMask(PSHUFHWMask, DAG)));
  // If this would require more than 2 unpack instructions to expand, use
  // pshufb when available. We only need more than 2 unpack instructions when
  // zero-extending i8 elements, which also makes it easier to use pshufb.
7863 if (Scale > 4 && EltBits == 8 && Subtarget->hasSSSE3()) {
7864 assert(NumElements == 16 && "Unexpected byte vector width!");
7865 SDValue PSHUFBMask[16];
7866 for (int i = 0; i < 16; ++i)
7868 DAG.getConstant((i % Scale == 0) ? i / Scale : 0x80, MVT::i8);
7869 InputV = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, InputV);
7870 return DAG.getNode(ISD::BITCAST, DL, VT,
7871 DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
7872 DAG.getNode(ISD::BUILD_VECTOR, DL,
7873 MVT::v16i8, PSHUFBMask)));
7876 // Otherwise emit a sequence of unpacks.
7878 MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
7879 SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
7880 : getZeroVector(InputVT, Subtarget, DAG, DL);
7881 InputV = DAG.getNode(ISD::BITCAST, DL, InputVT, InputV);
7882 InputV = DAG.getNode(X86ISD::UNPCKL, DL, InputVT, InputV, Ext);
7886 } while (Scale > 1);
7887 return DAG.getNode(ISD::BITCAST, DL, VT, InputV);
/// \brief Try to lower a vector shuffle as a zero extension on any microarch.
7892 /// This routine will try to do everything in its power to cleverly lower
7893 /// a shuffle which happens to match the pattern of a zero extend. It doesn't
7894 /// check for the profitability of this lowering, it tries to aggressively
7895 /// match this pattern. It will use all of the micro-architectural details it
7896 /// can to emit an efficient lowering. It handles both blends with all-zero
7897 /// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
7898 /// masking out later).
7900 /// The reason we have dedicated lowering for zext-style shuffles is that they
7901 /// are both incredibly common and often quite performance sensitive.
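///
/// For example, with an all-zero V2 the v8i16 mask <0, 8, 1, 8, 2, 8, 3, 8>
/// is a zero extension of the low four i16 elements of V1 into four i32
/// lanes, and is matched here with Scale == 2.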
7902 static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
7903 SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
7904 const X86Subtarget *Subtarget, SelectionDAG &DAG) {
7905 SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
7907 int Bits = VT.getSizeInBits();
7908 int NumElements = Mask.size();
  // Define a helper function to check a particular ext-scale and lower to it
  // if valid.
7912 auto Lower = [&](int Scale) -> SDValue {
7915 for (int i = 0; i < NumElements; ++i) {
7917 continue; // Valid anywhere but doesn't tell us anything.
7918 if (i % Scale != 0) {
        // Each of the extended elements needs to be zeroable.
7923 // We no longer are in the anyext case.
      // The base elements need to be consecutive indices into the same input
      // vector.
7930 SDValue V = Mask[i] < NumElements ? V1 : V2;
7933 else if (InputV != V)
7934 return SDValue(); // Flip-flopping inputs.
7936 if (Mask[i] % NumElements != i / Scale)
7937 return SDValue(); // Non-consecutive strided elements.
7940 // If we fail to find an input, we have a zero-shuffle which should always
7941 // have already been handled.
7942 // FIXME: Maybe handle this here in case during blending we end up with one?
7946 return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
7947 DL, VT, NumElements, Scale, AnyExt, InputV, Subtarget, DAG);
7950 // The widest scale possible for extending is to a 64-bit integer.
7951 assert(Bits % 64 == 0 &&
7952 "The number of bits in a vector must be divisible by 64 on x86!");
7953 int NumExtElements = Bits / 64;
  // Each iteration, try extending the elements half as much, but into twice as
  // many elements.
7957 for (; NumExtElements < NumElements; NumExtElements *= 2) {
7958 assert(NumElements % NumExtElements == 0 &&
7959 "The input vector size must be divisible by the extended size.");
7960 if (SDValue V = Lower(NumElements / NumExtElements))
7961 return V;
7962 }
7964 // No viable ext lowering found.
7965 return SDValue();
7966 }
7968 /// \brief Try to get a scalar value for a specific element of a vector.
7970 /// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
7971 static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
7972 SelectionDAG &DAG) {
7973 MVT VT = V.getSimpleValueType();
7974 MVT EltVT = VT.getVectorElementType();
7975 while (V.getOpcode() == ISD::BITCAST)
7976 V = V.getOperand(0);
7977 // If the bitcasts shift the element size, we can't extract an equivalent
7978 // element from it.
7979 MVT NewVT = V.getSimpleValueType();
7980 if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
7981 return SDValue();
7983 if (V.getOpcode() == ISD::BUILD_VECTOR ||
7984 (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR))
7985 return DAG.getNode(ISD::BITCAST, SDLoc(V), EltVT, V.getOperand(Idx));
7987 return SDValue();
7988 }
7990 /// \brief Helper to test for a load that can be folded with x86 shuffles.
7992 /// This is particularly important because the set of instructions varies
7993 /// significantly based on whether the operand is a load or not.
7994 static bool isShuffleFoldableLoad(SDValue V) {
7995 while (V.getOpcode() == ISD::BITCAST)
7996 V = V.getOperand(0);
7998 return ISD::isNON_EXTLoad(V.getNode());
8001 /// \brief Try to lower insertion of a single element into a zero vector.
8003 /// This is a common pattern for which we have especially efficient lowerings
8004 /// across all subtarget feature sets.
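/// For example (illustrative), a v4f32 mask such as <4, -1, -1, -1> whose
/// other lanes are zeroable is handled here by moving V2's low element into a
/// zeroed vector rather than by emitting a general two-input shuffle.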
8005 static SDValue lowerVectorShuffleAsElementInsertion(
8006 MVT VT, SDLoc DL, SDValue V1, SDValue V2, ArrayRef<int> Mask,
8007 const X86Subtarget *Subtarget, SelectionDAG &DAG) {
8008 SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
8009 MVT ExtVT = VT;
8010 MVT EltVT = VT.getVectorElementType();
8012 int V2Index = std::find_if(Mask.begin(), Mask.end(),
8013 [&Mask](int M) { return M >= (int)Mask.size(); }) -
8014 Mask.begin();
8015 bool IsV1Zeroable = true;
8016 for (int i = 0, Size = Mask.size(); i < Size; ++i)
8017 if (i != V2Index && !Zeroable[i]) {
8018 IsV1Zeroable = false;
8019 break;
8020 }
8022 // Check for a single input from a SCALAR_TO_VECTOR node.
8023 // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
8024 // all the smarts here sunk into that routine. However, the current
8025 // lowering of BUILD_VECTOR makes that nearly impossible until the old
8026 // vector shuffle lowering is dead.
8027 if (SDValue V2S = getScalarValueForVectorElement(
8028 V2, Mask[V2Index] - Mask.size(), DAG)) {
8029 // We need to zext the scalar if it is smaller than an i32.
8030 V2S = DAG.getNode(ISD::BITCAST, DL, EltVT, V2S);
8031 if (EltVT == MVT::i8 || EltVT == MVT::i16) {
8032 // Using zext to expand a narrow element won't work for non-zero
8033 // insertions.
8034 if (!IsV1Zeroable)
8035 return SDValue();
8037 // Zero-extend directly to i32.
8038 ExtVT = MVT::v4i32;
8039 V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
8040 }
8041 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
8042 } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
8043 EltVT == MVT::i16) {
8044 // Either not inserting from the low element of the input or the input
8045 // element size is too small to use VZEXT_MOVL to clear the high bits.
8046 return SDValue();
8047 }
8049 if (!IsV1Zeroable) {
8050 // If V1 can't be treated as a zero vector we have fewer options to lower
8051 // this. We can't support integer vectors or non-zero targets cheaply, and
8052 // the V1 elements can't be permuted in any way.
8053 assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
8054 if (!VT.isFloatingPoint() || V2Index != 0)
8055 return SDValue();
8056 SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
8057 V1Mask[V2Index] = -1;
8058 if (!isNoopShuffleMask(V1Mask))
8059 return SDValue();
8060 // This is essentially a special case blend operation, but if we have
8061 // general purpose blend operations, they are always faster. Bail and let
8062 // the rest of the lowering handle these as blends.
8063 if (Subtarget->hasSSE41())
8064 return SDValue();
8066 // Otherwise, use MOVSD or MOVSS.
8067 assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
8068 "Only two types of floating point element types to handle!");
8069 return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
8070 ExtVT, V1, V2);
8071 }
8073 V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
8075 V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2);
8078 // If we have 4 or fewer lanes we can cheaply shuffle the element into
8079 // the desired position. Otherwise it is more efficient to do a vector
8080 // shift left. We know that we can do a vector shift left because all
8081 // the inputs are zero.
8082 if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
8083 SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
8084 V2Shuffle[V2Index] = 0;
8085 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
8086 } else {
8087 V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, V2);
8088 V2 = DAG.getNode(
8089 X86ISD::VSHLDQ, DL, MVT::v2i64, V2,
8090 DAG.getConstant(
8091 V2Index * EltVT.getSizeInBits(),
8092 DAG.getTargetLoweringInfo().getScalarShiftAmountTy(MVT::v2i64)));
8093 V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2);
8094 }
8096 return V2;
8097 }
8099 /// \brief Try to lower broadcast of a single element.
8101 /// For convenience, this code also bundles all of the subtarget feature set
8102 /// filtering. While a little annoying to re-dispatch on type here, there isn't
8103 /// a convenient way to factor it out.
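/// For example (illustrative), with AVX2 the v4i32 mask <0, 0, 0, 0> is
/// matched here and becomes a single VBROADCAST of V1's first element.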
8104 static SDValue lowerVectorShuffleAsBroadcast(MVT VT, SDLoc DL, SDValue V,
8106 const X86Subtarget *Subtarget,
8107 SelectionDAG &DAG) {
8108 if (!Subtarget->hasAVX())
8110 if (VT.isInteger() && !Subtarget->hasAVX2())
8113 // Check that the mask is a broadcast.
8114 int BroadcastIdx = -1;
8115 for (int M : Mask)
8116 if (M >= 0 && BroadcastIdx == -1)
8117 BroadcastIdx = M;
8118 else if (M >= 0 && M != BroadcastIdx)
8119 return SDValue();
8121 assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
8122 "a sorted mask where the broadcast "
8125 // Go up the chain of (vector) values to try and find a scalar load that
8126 // we can combine with the broadcast.
8128 switch (V.getOpcode()) {
8129 case ISD::CONCAT_VECTORS: {
8130 int OperandSize = Mask.size() / V.getNumOperands();
8131 V = V.getOperand(BroadcastIdx / OperandSize);
8132 BroadcastIdx %= OperandSize;
8136 case ISD::INSERT_SUBVECTOR: {
8137 SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
8138 auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
8142 int BeginIdx = (int)ConstantIdx->getZExtValue();
8144 BeginIdx + (int)VInner.getValueType().getVectorNumElements();
8145 if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) {
8146 BroadcastIdx -= BeginIdx;
8157 // Check if this is a broadcast of a scalar. We special case lowering
8158 // for scalars so that we can more effectively fold with loads.
8159 if (V.getOpcode() == ISD::BUILD_VECTOR ||
8160 (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
8161 V = V.getOperand(BroadcastIdx);
8163 // If the scalar isn't a load we can't broadcast from it in AVX1, only with
8164 // AVX2.
8165 if (!Subtarget->hasAVX2() && !isShuffleFoldableLoad(V))
8166 return SDValue();
8167 } else if (BroadcastIdx != 0 || !Subtarget->hasAVX2()) {
8168 // We can't broadcast from a vector register w/o AVX2, and we can only
8169 // broadcast from the zero-element of a vector register.
8170 return SDValue();
8171 }
8173 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, V);
8174 }
8176 // Check for whether we can use INSERTPS to perform the shuffle. We only use
8177 // INSERTPS when the V1 elements are already in the correct locations
8178 // because otherwise we can just always use two SHUFPS instructions which
8179 // are much smaller to encode than a SHUFPS and an INSERTPS. We can also
8180 // perform INSERTPS if a single V1 element is out of place and all V2
8181 // elements are zeroable.
8182 static SDValue lowerVectorShuffleAsInsertPS(SDValue Op, SDValue V1, SDValue V2,
8184 SelectionDAG &DAG) {
8185 assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!");
8186 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
8187 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
8188 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
8190 SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
8192 unsigned ZMask = 0;
8193 int V1DstIndex = -1;
8194 int V2DstIndex = -1;
8195 bool V1UsedInPlace = false;
8197 for (int i = 0; i < 4; i++) {
8198 // Synthesize a zero mask from the zeroable elements (includes undefs).
8199 if (Zeroable[i]) {
8200 ZMask |= 1 << i;
8201 continue;
8202 }
8204 // Flag if we use any V1 inputs in place.
8205 if (i == Mask[i]) {
8206 V1UsedInPlace = true;
8207 continue;
8208 }
8210 // We can only insert a single non-zeroable element.
8211 if (V1DstIndex != -1 || V2DstIndex != -1)
8212 return SDValue();
8214 if (Mask[i] < 4) {
8215 // V1 input out of place for insertion.
8216 V1DstIndex = i;
8217 } else {
8218 // V2 input for insertion.
8219 V2DstIndex = i;
8220 }
8221 }
8223 // Don't bother if we have no (non-zeroable) element for insertion.
8224 if (V1DstIndex == -1 && V2DstIndex == -1)
8225 return SDValue();
8227 // Determine element insertion src/dst indices. The src index is from the
8228 // start of the inserted vector, not the start of the concatenated vector.
8229 unsigned V2SrcIndex = 0;
8230 if (V1DstIndex != -1) {
8231 // If we have a V1 input out of place, we use V1 as the V2 element insertion
8232 // and don't use the original V2 at all.
8233 V2SrcIndex = Mask[V1DstIndex];
8234 V2DstIndex = V1DstIndex;
8235 V2 = V1;
8236 } else {
8237 V2SrcIndex = Mask[V2DstIndex] - 4;
8238 }
8240 // If no V1 inputs are used in place, then the result is created only from
8241 // the zero mask and the V2 insertion - so remove V1 dependency.
8242 if (!V1UsedInPlace)
8243 V1 = DAG.getUNDEF(MVT::v4f32);
8245 unsigned InsertPSMask = V2SrcIndex << 6 | V2DstIndex << 4 | ZMask;
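// For example (illustrative): inserting V2 lane 1 (V2SrcIndex == 1) into lane
// 2 (V2DstIndex == 2) while zeroing lane 3 (ZMask == 0b1000) encodes as 0x68.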
8246 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
8248 // Insert the V2 element into the desired position.
8250 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
8251 DAG.getConstant(InsertPSMask, MVT::i8));
8254 /// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
8256 /// This is the basis function for the 2-lane 64-bit shuffles as we have full
8257 /// support for floating point shuffles but not integer shuffles. These
8258 /// instructions will incur a domain crossing penalty on some chips though so
8259 /// it is better to avoid lowering through this for integer vectors where
8260 /// possible.
8261 static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
8262 const X86Subtarget *Subtarget,
8263 SelectionDAG &DAG) {
8265 assert(Op.getSimpleValueType() == MVT::v2f64 && "Bad shuffle type!");
8266 assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
8267 assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
8268 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
8269 ArrayRef<int> Mask = SVOp->getMask();
8270 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
8272 if (isSingleInputShuffleMask(Mask)) {
8273 // Use low duplicate instructions for masks that match their pattern.
8274 if (Subtarget->hasSSE3())
8275 if (isShuffleEquivalent(Mask, 0, 0))
8276 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, V1);
8278 // Straight shuffle of a single input vector. Simulate this by using the
8279 // single input as both of the "inputs" to this instruction.
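// For example (illustrative), the single-input mask <1, 0> yields the
// immediate 0b01: bit 0 selects element 1 of the first operand for lane 0,
// and bit 1 selects element 0 of the second operand (the same vector here)
// for lane 1.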
8280 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
8282 if (Subtarget->hasAVX()) {
8283 // If we have AVX, we can use VPERMILPS which will allow folding a load
8284 // into the shuffle.
8285 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
8286 DAG.getConstant(SHUFPDMask, MVT::i8));
8289 return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V1,
8290 DAG.getConstant(SHUFPDMask, MVT::i8));
8292 assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!");
8293 assert(Mask[1] >= 2 && "Non-canonicalized blend!");
8295 // Use dedicated unpack instructions for masks that match their pattern.
8296 if (isShuffleEquivalent(Mask, 0, 2))
8297 return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2f64, V1, V2);
8298 if (isShuffleEquivalent(Mask, 1, 3))
8299 return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2f64, V1, V2);
8301 // If we have a single input, insert that into V1 if we can do so cheaply.
8302 if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
8303 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
8304 MVT::v2f64, DL, V1, V2, Mask, Subtarget, DAG))
8305 return Insertion;
8306 // Try inverting the insertion since for v2 masks it is easy to do and we
8307 // can't reliably sort the mask one way or the other.
8308 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
8309 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
8310 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
8311 MVT::v2f64, DL, V2, V1, InverseMask, Subtarget, DAG))
8312 return Insertion;
8313 }
8315 // Try to use one of the special instruction patterns to handle two common
8316 // blend patterns if a zero-blend above didn't work.
8317 if (isShuffleEquivalent(Mask, 0, 3) || isShuffleEquivalent(Mask, 1, 3))
8318 if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
8319 // We can either use a special instruction to load over the low double or
8320 // to move just the low double.
8321 return DAG.getNode(
8322 isShuffleFoldableLoad(V1S) ? X86ISD::MOVLPD : X86ISD::MOVSD,
8323 DL, MVT::v2f64, V2,
8324 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
8326 if (Subtarget->hasSSE41())
8327 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
8328 Subtarget, DAG))
8329 return Blend;
8331 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
8332 return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V2,
8333 DAG.getConstant(SHUFPDMask, MVT::i8));
8336 /// \brief Handle lowering of 2-lane 64-bit integer shuffles.
8338 /// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
8339 /// the integer unit to minimize domain crossing penalties. However, for blends
8340 /// it falls back to the floating point shuffle operation with appropriate bit
8341 /// casting.
8342 static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
8343 const X86Subtarget *Subtarget,
8344 SelectionDAG &DAG) {
8346 assert(Op.getSimpleValueType() == MVT::v2i64 && "Bad shuffle type!");
8347 assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
8348 assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
8349 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
8350 ArrayRef<int> Mask = SVOp->getMask();
8351 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
8353 if (isSingleInputShuffleMask(Mask)) {
8354 // Check for being able to broadcast a single element.
8355 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v2i64, DL, V1,
8356 Mask, Subtarget, DAG))
8357 return Broadcast;
8359 // Straight shuffle of a single input vector. For everything from SSE2
8360 // onward this has a single fast instruction with no scary immediates.
8361 // We have to map the mask as it is actually a v4i32 shuffle instruction.
8362 V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V1);
8363 int WidenedMask[4] = {
8364 std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
8365 std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
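// For example (illustrative), the v2i64 mask <1, 0> widens to the v4i32 mask
// <2, 3, 0, 1> fed to the PSHUFD below.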
8366 return DAG.getNode(
8367 ISD::BITCAST, DL, MVT::v2i64,
8368 DAG.getNode(X86ISD::PSHUFD, SDLoc(Op), MVT::v4i32, V1,
8369 getV4X86ShuffleImm8ForMask(WidenedMask, DAG)));
8372 // Try to use byte shift instructions.
8373 if (SDValue Shift = lowerVectorShuffleAsByteShift(
8374 DL, MVT::v2i64, V1, V2, Mask, DAG))
8375 return Shift;
8377 // If we have a single input from V2 insert that into V1 if we can do so
8378 // cheaply.
8379 if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
8380 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
8381 MVT::v2i64, DL, V1, V2, Mask, Subtarget, DAG))
8382 return Insertion;
8383 // Try inverting the insertion since for v2 masks it is easy to do and we
8384 // can't reliably sort the mask one way or the other.
8385 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
8386 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
8387 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
8388 MVT::v2i64, DL, V2, V1, InverseMask, Subtarget, DAG))
8389 return Insertion;
8390 }
8392 // Use dedicated unpack instructions for masks that match their pattern.
8393 if (isShuffleEquivalent(Mask, 0, 2))
8394 return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, V1, V2);
8395 if (isShuffleEquivalent(Mask, 1, 3))
8396 return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2i64, V1, V2);
8398 if (Subtarget->hasSSE41())
8399 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
8400 Subtarget, DAG))
8401 return Blend;
8403 // Try to use byte rotation instructions.
8404 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
8405 if (Subtarget->hasSSSE3())
8406 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
8407 DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
8408 return Rotate;
8410 // We implement this with SHUFPD which is pretty lame because it will likely
8411 // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
8412 // However, all the alternatives are still more cycles and newer chips don't
8413 // have this problem. It would be really nice if x86 had better shuffles here.
8414 V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, V1);
8415 V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, V2);
8416 return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64,
8417 DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
8420 /// \brief Lower a vector shuffle using the SHUFPS instruction.
8422 /// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
8423 /// It makes no assumptions about whether this is the *best* lowering, it
8424 /// simply uses it.
8425 static SDValue lowerVectorShuffleWithSHUFPS(SDLoc DL, MVT VT,
8426 ArrayRef<int> Mask, SDValue V1,
8427 SDValue V2, SelectionDAG &DAG) {
8428 SDValue LowV = V1, HighV = V2;
8429 int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};
8431 int NumV2Elements =
8432 std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
8434 if (NumV2Elements == 1) {
8435 int V2Index =
8436 std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) -
8437 Mask.begin();
8439 // Compute the index adjacent to V2Index and in the same half by toggling
8440 // the low bit.
8441 int V2AdjIndex = V2Index ^ 1;
8443 if (Mask[V2AdjIndex] == -1) {
8444 // Handles all the cases where we have a single V2 element and an undef.
8445 // This will only ever happen in the high lanes because we commute the
8446 // vector otherwise.
8447 if (V2Index < 2)
8448 std::swap(LowV, HighV);
8449 NewMask[V2Index] -= 4;
8450 } else {
8451 // Handle the case where the V2 element ends up adjacent to a V1 element.
8452 // To make this work, blend them together as the first step.
8453 int V1Index = V2AdjIndex;
8454 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
8455 V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
8456 getV4X86ShuffleImm8ForMask(BlendMask, DAG));
8458 // Now proceed to reconstruct the final blend as we have the necessary
8459 // high or low half formed.
8460 if (V2Index < 2) {
8461 LowV = V2;
8462 HighV = V1;
8463 } else {
8464 HighV = V2;
8465 }
8466 NewMask[V1Index] = 2; // We put the V1 element in V2[2].
8467 NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
8469 } else if (NumV2Elements == 2) {
8470 if (Mask[0] < 4 && Mask[1] < 4) {
8471 // Handle the easy case where we have V1 in the low lanes and V2 in the
8472 // high lanes.
8473 NewMask[2] -= 4;
8474 NewMask[3] -= 4;
8475 } else if (Mask[2] < 4 && Mask[3] < 4) {
8476 // We also handle the reversed case because this utility may get called
8477 // when we detect a SHUFPS pattern but can't easily commute the shuffle to
8478 // arrange things in the right direction.
8479 NewMask[0] -= 4;
8480 NewMask[1] -= 4;
8481 HighV = V1;
8482 LowV = V2;
8483 } else {
8484 // We have a mixture of V1 and V2 in both low and high lanes. Rather than
8485 // trying to place elements directly, just blend them and set up the final
8486 // shuffle to place them.
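// For example (illustrative), the mask <0, 5, 2, 7> first blends with
// BlendMask <0, 2, 1, 3> to form <V1[0], V1[2], V2[1], V2[3]>, and the final
// SHUFPS below then uses NewMask <0, 2, 1, 3> to reorder that into
// <V1[0], V2[1], V1[2], V2[3]>.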
8488 // The first two blend mask elements are for V1, the second two are for
8489 // V2.
8490 int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
8491 Mask[2] < 4 ? Mask[2] : Mask[3],
8492 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
8493 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
8494 V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
8495 getV4X86ShuffleImm8ForMask(BlendMask, DAG));
8497 // Now we do a normal shuffle of V1 by giving V1 as both operands to
8498 // the blend.
8499 LowV = HighV = V1;
8500 NewMask[0] = Mask[0] < 4 ? 0 : 2;
8501 NewMask[1] = Mask[0] < 4 ? 2 : 0;
8502 NewMask[2] = Mask[2] < 4 ? 1 : 3;
8503 NewMask[3] = Mask[2] < 4 ? 3 : 1;
8506 return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
8507 getV4X86ShuffleImm8ForMask(NewMask, DAG));
8510 /// \brief Lower 4-lane 32-bit floating point shuffles.
8512 /// Uses instructions exclusively from the floating point unit to minimize
8513 /// domain crossing penalties, as these are sufficient to implement all v4f32
8514 /// shuffles.
8515 static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
8516 const X86Subtarget *Subtarget,
8517 SelectionDAG &DAG) {
8519 assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!");
8520 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
8521 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
8522 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
8523 ArrayRef<int> Mask = SVOp->getMask();
8524 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
8526 int NumV2Elements =
8527 std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
8529 if (NumV2Elements == 0) {
8530 // Check for being able to broadcast a single element.
8531 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4f32, DL, V1,
8532 Mask, Subtarget, DAG))
8533 return Broadcast;
8535 // Use even/odd duplicate instructions for masks that match their pattern.
8536 if (Subtarget->hasSSE3()) {
8537 if (isShuffleEquivalent(Mask, 0, 0, 2, 2))
8538 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
8539 if (isShuffleEquivalent(Mask, 1, 1, 3, 3))
8540 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
8543 if (Subtarget->hasAVX()) {
8544 // If we have AVX, we can use VPERMILPS which will allow folding a load
8545 // into the shuffle.
8546 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
8547 getV4X86ShuffleImm8ForMask(Mask, DAG));
8550 // Otherwise, use a straight shuffle of a single input vector. We pass the
8551 // input vector to both operands to simulate this with a SHUFPS.
8552 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
8553 getV4X86ShuffleImm8ForMask(Mask, DAG));
8556 // Use dedicated unpack instructions for masks that match their pattern.
8557 if (isShuffleEquivalent(Mask, 0, 4, 1, 5))
8558 return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f32, V1, V2);
8559 if (isShuffleEquivalent(Mask, 2, 6, 3, 7))
8560 return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V1, V2);
8562 // There are special ways we can lower some single-element blends. However, we
8563 // have custom ways we can lower more complex single-element blends below that
8564 // we defer to if both this and BLENDPS fail to match, so restrict this to
8565 // when the V2 input is targeting element 0 of the mask -- that is the fast
8566 // case here.
8567 if (NumV2Elements == 1 && Mask[0] >= 4)
8568 if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v4f32, DL, V1, V2,
8569 Mask, Subtarget, DAG))
8570 return V;
8572 if (Subtarget->hasSSE41()) {
8573 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
8574 Subtarget, DAG))
8575 return Blend;
8577 // Use INSERTPS if we can complete the shuffle efficiently.
8578 if (SDValue V = lowerVectorShuffleAsInsertPS(Op, V1, V2, Mask, DAG))
8579 return V;
8580 }
8582 // Otherwise fall back to a SHUFPS lowering strategy.
8583 return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
8586 /// \brief Lower 4-lane i32 vector shuffles.
8588 /// We try to handle these with integer-domain shuffles where we can, but for
8589 /// blends we use the floating point domain blend instructions.
8590 static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
8591 const X86Subtarget *Subtarget,
8592 SelectionDAG &DAG) {
8594 assert(Op.getSimpleValueType() == MVT::v4i32 && "Bad shuffle type!");
8595 assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
8596 assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
8597 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
8598 ArrayRef<int> Mask = SVOp->getMask();
8599 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
8601 // Whenever we can lower this as a zext, that instruction is strictly faster
8602 // than any alternative. It also allows us to fold memory operands into the
8603 // shuffle in many cases.
8604 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2,
8605 Mask, Subtarget, DAG))
8606 return ZExt;
8608 int NumV2Elements =
8609 std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
8611 if (NumV2Elements == 0) {
8612 // Check for being able to broadcast a single element.
8613 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4i32, DL, V1,
8614 Mask, Subtarget, DAG))
8615 return Broadcast;
8617 // Straight shuffle of a single input vector. For everything from SSE2
8618 // onward this has a single fast instruction with no scary immediates.
8619 // We coerce the shuffle pattern to be compatible with UNPCK instructions
8620 // but we aren't actually going to use the UNPCK instruction because doing
8621 // so prevents folding a load into this instruction or making a copy.
8622 const int UnpackLoMask[] = {0, 0, 1, 1};
8623 const int UnpackHiMask[] = {2, 2, 3, 3};
8624 if (isShuffleEquivalent(Mask, 0, 0, 1, 1))
8625 Mask = UnpackLoMask;
8626 else if (isShuffleEquivalent(Mask, 2, 2, 3, 3))
8627 Mask = UnpackHiMask;
8629 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
8630 getV4X86ShuffleImm8ForMask(Mask, DAG));
8633 // Try to use byte shift instructions.
8634 if (SDValue Shift = lowerVectorShuffleAsByteShift(
8635 DL, MVT::v4i32, V1, V2, Mask, DAG))
8636 return Shift;
8638 // There are special ways we can lower some single-element blends.
8639 if (NumV2Elements == 1)
8640 if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v4i32, DL, V1, V2,
8641 Mask, Subtarget, DAG))
8642 return V;
8644 // Use dedicated unpack instructions for masks that match their pattern.
8645 if (isShuffleEquivalent(Mask, 0, 4, 1, 5))
8646 return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i32, V1, V2);
8647 if (isShuffleEquivalent(Mask, 2, 6, 3, 7))
8648 return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V1, V2);
8650 if (Subtarget->hasSSE41())
8651 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
8652 Subtarget, DAG))
8653 return Blend;
8655 // Try to use byte rotation instructions.
8656 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
8657 if (Subtarget->hasSSSE3())
8658 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
8659 DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
8660 return Rotate;
8662 // We implement this with SHUFPS because it can blend from two vectors.
8663 // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
8664 // up the inputs, bypassing domain shift penalties that we would incur if we
8665 // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
8666 // relevant.
8667 return DAG.getNode(ISD::BITCAST, DL, MVT::v4i32,
8668 DAG.getVectorShuffle(
8669 MVT::v4f32, DL,
8670 DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, V1),
8671 DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, V2), Mask));
8674 /// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
8675 /// shuffle lowering, and the most complex part.
8677 /// The lowering strategy is to try to form pairs of input lanes which are
8678 /// targeted at the same half of the final vector, and then use a dword shuffle
8679 /// to place them onto the right half, and finally unpack the paired lanes into
8680 /// their final position.
8682 /// The exact breakdown of how to form these dword pairs and align them on the
8683 /// correct sides is really tricky. See the comments within the function for
8684 /// more of the details.
8685 static SDValue lowerV8I16SingleInputVectorShuffle(
8686 SDLoc DL, SDValue V, MutableArrayRef<int> Mask,
8687 const X86Subtarget *Subtarget, SelectionDAG &DAG) {
8688 assert(V.getSimpleValueType() == MVT::v8i16 && "Bad input type!");
8689 MutableArrayRef<int> LoMask = Mask.slice(0, 4);
8690 MutableArrayRef<int> HiMask = Mask.slice(4, 4);
8692 SmallVector<int, 4> LoInputs;
8693 std::copy_if(LoMask.begin(), LoMask.end(), std::back_inserter(LoInputs),
8694 [](int M) { return M >= 0; });
8695 std::sort(LoInputs.begin(), LoInputs.end());
8696 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
8697 SmallVector<int, 4> HiInputs;
8698 std::copy_if(HiMask.begin(), HiMask.end(), std::back_inserter(HiInputs),
8699 [](int M) { return M >= 0; });
8700 std::sort(HiInputs.begin(), HiInputs.end());
8701 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
8702 int NumLToL =
8703 std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin();
8704 int NumHToL = LoInputs.size() - NumLToL;
8705 int NumLToH =
8706 std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin();
8707 int NumHToH = HiInputs.size() - NumLToH;
8708 MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
8709 MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
8710 MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
8711 MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
8713 // Check for being able to broadcast a single element.
8714 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8i16, DL, V,
8715 Mask, Subtarget, DAG))
8718 // Try to use byte shift instructions.
8719 if (SDValue Shift = lowerVectorShuffleAsByteShift(
8720 DL, MVT::v8i16, V, V, Mask, DAG))
8723 // Use dedicated unpack instructions for masks that match their pattern.
8724 if (isShuffleEquivalent(Mask, 0, 0, 1, 1, 2, 2, 3, 3))
8725 return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V, V);
8726 if (isShuffleEquivalent(Mask, 4, 4, 5, 5, 6, 6, 7, 7))
8727 return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V, V);
8729 // Try to use byte rotation instructions.
8730 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
8731 DL, MVT::v8i16, V, V, Mask, Subtarget, DAG))
8734 // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
8735 // such inputs we can swap two of the dwords across the half mark and end up
8736 // with <=2 inputs to each half in each half. Once there, we can fall through
8737 // to the generic code below. For example:
8739 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
8740 // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
8742 // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
8743 // and an existing 2-into-2 on the other half. In this case we may have to
8744 // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
8745 // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
8746 // Fortunately, we don't have to handle anything but a 2-into-2 pattern
8747 // because any other situation (including a 3-into-1 or 1-into-3 in the other
8748 // half than the one we target for fixing) will be fixed when we re-enter this
8749 // path. We will also combine away any sequence of PSHUFD instructions that
8750 // result into a single instruction. Here is an example of the tricky case:
8752 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
8753 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
8755 // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
8757 // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
8758 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
8760 // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
8761 // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
8763 // The result is fine to be handled by the generic logic.
8764 auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
8765 ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
8766 int AOffset, int BOffset) {
8767 assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
8768 "Must call this with A having 3 or 1 inputs from the A half.");
8769 assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
8770 "Must call this with B having 1 or 3 inputs from the B half.");
8771 assert(AToAInputs.size() + BToAInputs.size() == 4 &&
8772 "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
8774 // Compute the index of dword with only one word among the three inputs in
8775 // a half by taking the sum of the half with three inputs and subtracting
8776 // the sum of the actual three inputs. The difference is the remaining
8777 // slot.
8778 int ADWord, BDWord;
8779 int &TripleDWord = AToAInputs.size() == 3 ? ADWord : BDWord;
8780 int &OneInputDWord = AToAInputs.size() == 3 ? BDWord : ADWord;
8781 int TripleInputOffset = AToAInputs.size() == 3 ? AOffset : BOffset;
8782 ArrayRef<int> TripleInputs = AToAInputs.size() == 3 ? AToAInputs : BToAInputs;
8783 int OneInput = AToAInputs.size() == 3 ? BToAInputs[0] : AToAInputs[0];
8784 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
8785 int TripleNonInputIdx =
8786 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
8787 TripleDWord = TripleNonInputIdx / 2;
8789 // We use xor with one to compute the adjacent DWord to whichever one the
8790 // OneInput is in.
8791 OneInputDWord = (OneInput / 2) ^ 1;
8793 // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
8794 // and BToA inputs. If there is also such a problem with the BToB and AToB
8795 // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
8796 // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
8797 // is essential that we don't *create* a 3<-1 as then we might oscillate.
8798 if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
8799 // Compute how many inputs will be flipped by swapping these DWords. We need
8800 // to balance this to ensure we don't form a 3-1 shuffle in the other half.
8803 int NumFlippedAToBInputs =
8804 std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
8805 std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
8806 int NumFlippedBToBInputs =
8807 std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
8808 std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
8809 if ((NumFlippedAToBInputs == 1 &&
8810 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
8811 (NumFlippedBToBInputs == 1 &&
8812 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
8813 // We choose whether to fix the A half or B half based on whether that
8814 // half has zero flipped inputs. At zero, we may not be able to fix it
8815 // with that half. We also bias towards fixing the B half because that
8816 // will more commonly be the high half, and we have to bias one way.
8817 auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
8818 ArrayRef<int> Inputs) {
8819 int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
8820 bool IsFixIdxInput = std::find(Inputs.begin(), Inputs.end(),
8821 PinnedIdx ^ 1) != Inputs.end();
8822 // Determine whether the free index is in the flipped dword or the
8823 // unflipped dword based on where the pinned index is. We use this bit
8824 // in an xor to conditionally select the adjacent dword.
8825 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
8826 bool IsFixFreeIdxInput = std::find(Inputs.begin(), Inputs.end(),
8827 FixFreeIdx) != Inputs.end();
8828 if (IsFixIdxInput == IsFixFreeIdxInput)
8830 IsFixFreeIdxInput = std::find(Inputs.begin(), Inputs.end(),
8831 FixFreeIdx) != Inputs.end();
8832 assert(IsFixIdxInput != IsFixFreeIdxInput &&
8833 "We need to be changing the number of flipped inputs!");
8834 int PSHUFHalfMask[] = {0, 1, 2, 3};
8835 std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
8836 V = DAG.getNode(FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
8838 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DAG));
8841 if (M != -1 && M == FixIdx)
8843 else if (M != -1 && M == FixFreeIdx)
8846 if (NumFlippedBToBInputs != 0) {
8848 BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
8849 FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
8851 assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
8853 AToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
8854 FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
8859 int PSHUFDMask[] = {0, 1, 2, 3};
8860 PSHUFDMask[ADWord] = BDWord;
8861 PSHUFDMask[BDWord] = ADWord;
8862 V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
8863 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
8864 DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V),
8865 getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
8867 // Adjust the mask to match the new locations of A and B.
8869 if (M != -1 && M/2 == ADWord)
8870 M = 2 * BDWord + M % 2;
8871 else if (M != -1 && M/2 == BDWord)
8872 M = 2 * ADWord + M % 2;
8874 // Recurse back into this routine to re-compute state now that this isn't
8875 // a 3 and 1 problem.
8876 return DAG.getVectorShuffle(MVT::v8i16, DL, V, DAG.getUNDEF(MVT::v8i16),
8877 Mask);
8878 };
8879 if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
8880 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
8881 else if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
8882 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
8884 // At this point there are at most two inputs to the low and high halves from
8885 // each half. That means the inputs can always be grouped into dwords and
8886 // those dwords can then be moved to the correct half with a dword shuffle.
8887 // We use at most one low and one high word shuffle to collect these paired
8888 // inputs into dwords, and finally a dword shuffle to place them.
8889 int PSHUFLMask[4] = {-1, -1, -1, -1};
8890 int PSHUFHMask[4] = {-1, -1, -1, -1};
8891 int PSHUFDMask[4] = {-1, -1, -1, -1};
8893 // First fix the masks for all the inputs that are staying in their
8894 // original halves. This will then dictate the targets of the cross-half
8896 auto fixInPlaceInputs =
8897 [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
8898 MutableArrayRef<int> SourceHalfMask,
8899 MutableArrayRef<int> HalfMask, int HalfOffset) {
8900 if (InPlaceInputs.empty())
8902 if (InPlaceInputs.size() == 1) {
8903 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
8904 InPlaceInputs[0] - HalfOffset;
8905 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
8908 if (IncomingInputs.empty()) {
8909 // Just fix all of the in place inputs.
8910 for (int Input : InPlaceInputs) {
8911 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
8912 PSHUFDMask[Input / 2] = Input / 2;
8917 assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
8918 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
8919 InPlaceInputs[0] - HalfOffset;
8920 // Put the second input next to the first so that they are packed into
8921 // a dword. We find the adjacent index by toggling the low bit.
8922 int AdjIndex = InPlaceInputs[0] ^ 1;
8923 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
8924 std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
8925 PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
8927 fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
8928 fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
8930 // Now gather the cross-half inputs and place them into a free dword of
8931 // their target half.
8932 // FIXME: This operation could almost certainly be simplified dramatically to
8933 // look more like the 3-1 fixing operation.
8934 auto moveInputsToRightHalf = [&PSHUFDMask](
8935 MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
8936 MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
8937 MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
8939 auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
8940 return SourceHalfMask[Word] != -1 && SourceHalfMask[Word] != Word;
8942 auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
8944 int LowWord = Word & ~1;
8945 int HighWord = Word | 1;
8946 return isWordClobbered(SourceHalfMask, LowWord) ||
8947 isWordClobbered(SourceHalfMask, HighWord);
8950 if (IncomingInputs.empty())
8953 if (ExistingInputs.empty()) {
8954 // Map any dwords with inputs from them into the right half.
8955 for (int Input : IncomingInputs) {
8956 // If the source half mask maps over the inputs, turn those into
8957 // swaps and use the swapped lane.
8958 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
8959 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] == -1) {
8960 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
8961 Input - SourceOffset;
8962 // We have to swap the uses in our half mask in one sweep.
8963 for (int &M : HalfMask)
8964 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
8966 else if (M == Input)
8967 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
8969 assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
8970 Input - SourceOffset &&
8971 "Previous placement doesn't match!");
8973 // Note that this correctly re-maps both when we do a swap and when
8974 // we observe the other side of the swap above. We rely on that to
8975 // avoid swapping the members of the input list directly.
8976 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
8979 // Map the input's dword into the correct half.
8980 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] == -1)
8981 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
8983 assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
8985 "Previous placement doesn't match!");
8988 // And just directly shift any other-half mask elements to be same-half
8989 // as we will have mirrored the dword containing the element into the
8990 // same position within that half.
8991 for (int &M : HalfMask)
8992 if (M >= SourceOffset && M < SourceOffset + 4) {
8993 M = M - SourceOffset + DestOffset;
8994 assert(M >= 0 && "This should never wrap below zero!");
8999 // Ensure we have the input in a viable dword of its current half. This
9000 // is particularly tricky because the original position may be clobbered
9001 // by inputs being moved and *staying* in that half.
9002 if (IncomingInputs.size() == 1) {
9003 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
9004 int InputFixed = std::find(std::begin(SourceHalfMask),
9005 std::end(SourceHalfMask), -1) -
9006 std::begin(SourceHalfMask) + SourceOffset;
9007 SourceHalfMask[InputFixed - SourceOffset] =
9008 IncomingInputs[0] - SourceOffset;
9009 std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
9011 IncomingInputs[0] = InputFixed;
9013 } else if (IncomingInputs.size() == 2) {
9014 if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
9015 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
9016 // We have two non-adjacent or clobbered inputs we need to extract from
9017 // the source half. To do this, we need to map them into some adjacent
9018 // dword slot in the source mask.
9019 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
9020 IncomingInputs[1] - SourceOffset};
9022 // If there is a free slot in the source half mask adjacent to one of
9023 // the inputs, place the other input in it. We use (Index XOR 1) to
9024 // compute an adjacent index.
9025 if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
9026 SourceHalfMask[InputsFixed[0] ^ 1] == -1) {
9027 SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
9028 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
9029 InputsFixed[1] = InputsFixed[0] ^ 1;
9030 } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
9031 SourceHalfMask[InputsFixed[1] ^ 1] == -1) {
9032 SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
9033 SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
9034 InputsFixed[0] = InputsFixed[1] ^ 1;
9035 } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] == -1 &&
9036 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] == -1) {
9037 // The two inputs are in the same DWord but it is clobbered and the
9038 // adjacent DWord isn't used at all. Move both inputs to the free
9039 // slot.
9040 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
9041 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
9042 InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
9043 InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
9045 // The only way we hit this point is if there is no clobbering
9046 // (because there are no off-half inputs to this half) and there is no
9047 // free slot adjacent to one of the inputs. In this case, we have to
9048 // swap an input with a non-input.
9049 for (int i = 0; i < 4; ++i)
9050 assert((SourceHalfMask[i] == -1 || SourceHalfMask[i] == i) &&
9051 "We can't handle any clobbers here!");
9052 assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
9053 "Cannot have adjacent inputs here!");
9055 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
9056 SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
9058 // We also have to update the final source mask in this case because
9059 // it may need to undo the above swap.
9060 for (int &M : FinalSourceHalfMask)
9061 if (M == (InputsFixed[0] ^ 1) + SourceOffset)
9062 M = InputsFixed[1] + SourceOffset;
9063 else if (M == InputsFixed[1] + SourceOffset)
9064 M = (InputsFixed[0] ^ 1) + SourceOffset;
9066 InputsFixed[1] = InputsFixed[0] ^ 1;
9069 // Point everything at the fixed inputs.
9070 for (int &M : HalfMask)
9071 if (M == IncomingInputs[0])
9072 M = InputsFixed[0] + SourceOffset;
9073 else if (M == IncomingInputs[1])
9074 M = InputsFixed[1] + SourceOffset;
9076 IncomingInputs[0] = InputsFixed[0] + SourceOffset;
9077 IncomingInputs[1] = InputsFixed[1] + SourceOffset;
9080 llvm_unreachable("Unhandled input size!");
9083 // Now hoist the DWord down to the right half.
9084 int FreeDWord = (PSHUFDMask[DestOffset / 2] == -1 ? 0 : 1) + DestOffset / 2;
9085 assert(PSHUFDMask[FreeDWord] == -1 && "DWord not free");
9086 PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
9087 for (int &M : HalfMask)
9088 for (int Input : IncomingInputs)
9090 M = FreeDWord * 2 + Input % 2;
9092 moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
9093 /*SourceOffset*/ 4, /*DestOffset*/ 0);
9094 moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
9095 /*SourceOffset*/ 0, /*DestOffset*/ 4);
9097 // Now enact all the shuffles we've computed to move the inputs into their
9098 // target half.
9099 if (!isNoopShuffleMask(PSHUFLMask))
9100 V = DAG.getNode(X86ISD::PSHUFLW, DL, MVT::v8i16, V,
9101 getV4X86ShuffleImm8ForMask(PSHUFLMask, DAG));
9102 if (!isNoopShuffleMask(PSHUFHMask))
9103 V = DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16, V,
9104 getV4X86ShuffleImm8ForMask(PSHUFHMask, DAG));
9105 if (!isNoopShuffleMask(PSHUFDMask))
9106 V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
9107 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
9108 DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V),
9109 getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
9111 // At this point, each half should contain all its inputs, and we can then
9112 // just shuffle them into their final position.
9113 assert(std::count_if(LoMask.begin(), LoMask.end(),
9114 [](int M) { return M >= 4; }) == 0 &&
9115 "Failed to lift all the high half inputs to the low mask!");
9116 assert(std::count_if(HiMask.begin(), HiMask.end(),
9117 [](int M) { return M >= 0 && M < 4; }) == 0 &&
9118 "Failed to lift all the low half inputs to the high mask!");
9120 // Do a half shuffle for the low mask.
9121 if (!isNoopShuffleMask(LoMask))
9122 V = DAG.getNode(X86ISD::PSHUFLW, DL, MVT::v8i16, V,
9123 getV4X86ShuffleImm8ForMask(LoMask, DAG));
9125 // Do a half shuffle with the high mask after shifting its values down.
9126 for (int &M : HiMask)
9127 if (M >= 0)
9128 M -= 4;
9129 if (!isNoopShuffleMask(HiMask))
9130 V = DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16, V,
9131 getV4X86ShuffleImm8ForMask(HiMask, DAG));
9133 return V;
9134 }
9136 /// \brief Detect whether the mask pattern should be lowered through
9137 /// interleaving.
9139 /// This essentially tests whether viewing the mask as an interleaving of two
9140 /// sub-sequences reduces the cross-input traffic of a blend operation. If so,
9141 /// lowering it through interleaving is a significantly better strategy.
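/// For example (illustrative), the v8i16 mask <0, 8, 1, 9, 2, 10, 3, 11> takes
/// all of its even results from V1 and all of its odd results from V2, so an
/// interleaving (unpack) of the two inputs requires no cross-input movement,
/// whereas a lo/hi split would.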
9142 static bool shouldLowerAsInterleaving(ArrayRef<int> Mask) {
9143 int NumEvenInputs[2] = {0, 0};
9144 int NumOddInputs[2] = {0, 0};
9145 int NumLoInputs[2] = {0, 0};
9146 int NumHiInputs[2] = {0, 0};
9147 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
9148 if (Mask[i] == -1)
9149 continue;
9151 int InputIdx = Mask[i] >= Size;
9153 if (Mask[i] % Size < Size / 2)
9154 ++NumLoInputs[InputIdx];
9155 else
9156 ++NumHiInputs[InputIdx];
9158 if (i % 2 == 0)
9159 ++NumEvenInputs[InputIdx];
9160 else
9161 ++NumOddInputs[InputIdx];
9162 }
9164 // The minimum number of cross-input results for both the interleaved and
9165 // split cases. If interleaving results in fewer cross-input results, return
9167 int InterleavedCrosses = std::min(NumEvenInputs[1] + NumOddInputs[0],
9168 NumEvenInputs[0] + NumOddInputs[1]);
9169 int SplitCrosses = std::min(NumLoInputs[1] + NumHiInputs[0],
9170 NumLoInputs[0] + NumHiInputs[1]);
9171 return InterleavedCrosses < SplitCrosses;
9174 /// \brief Blend two v8i16 vectors using a naive unpack strategy.
9176 /// This strategy only works when the inputs from each vector fit into a single
9177 /// half of that vector, and generally there are not so many inputs as to leave
9178 /// the in-place shuffles required highly constrained (and thus expensive). It
9179 /// shifts all the inputs into a single side of both input vectors and then
9180 /// uses an unpack to interleave these inputs in a single vector. At that
9181 /// point, we will fall back on the generic single input shuffle lowering.
9182 static SDValue lowerV8I16BasicBlendVectorShuffle(SDLoc DL, SDValue V1,
9184 MutableArrayRef<int> Mask,
9185 const X86Subtarget *Subtarget,
9186 SelectionDAG &DAG) {
9187 assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad input type!");
9188 assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad input type!");
9189 SmallVector<int, 3> LoV1Inputs, HiV1Inputs, LoV2Inputs, HiV2Inputs;
9190 for (int i = 0; i < 8; ++i)
9191 if (Mask[i] >= 0 && Mask[i] < 4)
9192 LoV1Inputs.push_back(i);
9193 else if (Mask[i] >= 4 && Mask[i] < 8)
9194 HiV1Inputs.push_back(i);
9195 else if (Mask[i] >= 8 && Mask[i] < 12)
9196 LoV2Inputs.push_back(i);
9197 else if (Mask[i] >= 12)
9198 HiV2Inputs.push_back(i);
9200 int NumV1Inputs = LoV1Inputs.size() + HiV1Inputs.size();
9201 int NumV2Inputs = LoV2Inputs.size() + HiV2Inputs.size();
9204 assert(NumV1Inputs > 0 && NumV1Inputs <= 3 && "At most 3 inputs supported");
9205 assert(NumV2Inputs > 0 && NumV2Inputs <= 3 && "At most 3 inputs supported");
9206 assert(NumV1Inputs + NumV2Inputs <= 4 && "At most 4 combined inputs");
9208 bool MergeFromLo = LoV1Inputs.size() + LoV2Inputs.size() >=
9209 HiV1Inputs.size() + HiV2Inputs.size();
9211 auto moveInputsToHalf = [&](SDValue V, ArrayRef<int> LoInputs,
9212 ArrayRef<int> HiInputs, bool MoveToLo,
9214 ArrayRef<int> GoodInputs = MoveToLo ? LoInputs : HiInputs;
9215 ArrayRef<int> BadInputs = MoveToLo ? HiInputs : LoInputs;
9216 if (BadInputs.empty())
9219 int MoveMask[] = {-1, -1, -1, -1, -1, -1, -1, -1};
9220 int MoveOffset = MoveToLo ? 0 : 4;
9222 if (GoodInputs.empty()) {
9223 for (int BadInput : BadInputs) {
9224 MoveMask[Mask[BadInput] % 4 + MoveOffset] = Mask[BadInput] - MaskOffset;
9225 Mask[BadInput] = Mask[BadInput] % 4 + MoveOffset + MaskOffset;
9228 if (GoodInputs.size() == 2) {
9229 // If the low inputs are spread across two dwords, pack them into
9230 // a single dword.
9231 MoveMask[MoveOffset] = Mask[GoodInputs[0]] - MaskOffset;
9232 MoveMask[MoveOffset + 1] = Mask[GoodInputs[1]] - MaskOffset;
9233 Mask[GoodInputs[0]] = MoveOffset + MaskOffset;
9234 Mask[GoodInputs[1]] = MoveOffset + 1 + MaskOffset;
9236 // Otherwise pin the good inputs.
9237 for (int GoodInput : GoodInputs)
9238 MoveMask[Mask[GoodInput] - MaskOffset] = Mask[GoodInput] - MaskOffset;
9241 if (BadInputs.size() == 2) {
9242 // If we have two bad inputs then there may be either one or two good
9243 // inputs fixed in place. Find a fixed input, and then find the *other*
9244 // two adjacent indices by using modular arithmetic.
9246 std::find_if(std::begin(MoveMask) + MoveOffset, std::end(MoveMask),
9247 [](int M) { return M >= 0; }) -
9248 std::begin(MoveMask);
9250 ((((GoodMaskIdx - MoveOffset) & ~1) + 2) % 4) + MoveOffset;
9251 assert(MoveMask[MoveMaskIdx] == -1 && "Expected empty slot");
9252 assert(MoveMask[MoveMaskIdx + 1] == -1 && "Expected empty slot");
9253 MoveMask[MoveMaskIdx] = Mask[BadInputs[0]] - MaskOffset;
9254 MoveMask[MoveMaskIdx + 1] = Mask[BadInputs[1]] - MaskOffset;
9255 Mask[BadInputs[0]] = MoveMaskIdx + MaskOffset;
9256 Mask[BadInputs[1]] = MoveMaskIdx + 1 + MaskOffset;
9258 assert(BadInputs.size() == 1 && "All sizes handled");
9259 int MoveMaskIdx = std::find(std::begin(MoveMask) + MoveOffset,
9260 std::end(MoveMask), -1) -
9261 std::begin(MoveMask);
9262 MoveMask[MoveMaskIdx] = Mask[BadInputs[0]] - MaskOffset;
9263 Mask[BadInputs[0]] = MoveMaskIdx + MaskOffset;
9267 return DAG.getVectorShuffle(MVT::v8i16, DL, V, DAG.getUNDEF(MVT::v8i16),
9270 V1 = moveInputsToHalf(V1, LoV1Inputs, HiV1Inputs, MergeFromLo,
9272 V2 = moveInputsToHalf(V2, LoV2Inputs, HiV2Inputs, MergeFromLo,
9275 // FIXME: Select an interleaving of the merge of V1 and V2 that minimizes
9276 // cross-half traffic in the final shuffle.
9278 // Munge the mask to be a single-input mask after the unpack merges the
9282 M = 2 * (M % 4) + (M / 8);
9284 return DAG.getVectorShuffle(
9285 MVT::v8i16, DL, DAG.getNode(MergeFromLo ? X86ISD::UNPCKL : X86ISD::UNPCKH,
9286 DL, MVT::v8i16, V1, V2),
9287 DAG.getUNDEF(MVT::v8i16), Mask);
9290 /// \brief Generic lowering of 8-lane i16 shuffles.
9292 /// This handles both single-input shuffles and combined shuffle/blends with
9293 /// two inputs. The single input shuffles are immediately delegated to
9294 /// a dedicated lowering routine.
9296 /// The blends are lowered in one of three fundamental ways. If there are few
9297 /// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
9298 /// of the input is significantly cheaper when lowered as an interleaving of
9299 /// the two inputs, try to interleave them. Otherwise, blend the low and high
9300 /// halves of the inputs separately (making them have relatively few inputs)
9301 /// and then concatenate them.
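/// For example (illustrative), the v8i16 mask <0, 1, 2, 3, 8, 9, 10, 11> keeps
/// all V1 inputs in the low half of the result and all V2 inputs in the high
/// half, so each half is shuffled separately and the two 64-bit halves are
/// then concatenated with an unpack.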
9302 static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
9303 const X86Subtarget *Subtarget,
9304 SelectionDAG &DAG) {
9306 assert(Op.getSimpleValueType() == MVT::v8i16 && "Bad shuffle type!");
9307 assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
9308 assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
9309 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
9310 ArrayRef<int> OrigMask = SVOp->getMask();
9311 int MaskStorage[8] = {OrigMask[0], OrigMask[1], OrigMask[2], OrigMask[3],
9312 OrigMask[4], OrigMask[5], OrigMask[6], OrigMask[7]};
9313 MutableArrayRef<int> Mask(MaskStorage);
9315 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
9317 // Whenever we can lower this as a zext, that instruction is strictly faster
9318 // than any alternative.
9319 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
9320 DL, MVT::v8i16, V1, V2, OrigMask, Subtarget, DAG))
9323 auto isV1 = [](int M) { return M >= 0 && M < 8; };
9324 auto isV2 = [](int M) { return M >= 8; };
9326 int NumV1Inputs = std::count_if(Mask.begin(), Mask.end(), isV1);
9327 int NumV2Inputs = std::count_if(Mask.begin(), Mask.end(), isV2);
9329 if (NumV2Inputs == 0)
9330 return lowerV8I16SingleInputVectorShuffle(DL, V1, Mask, Subtarget, DAG);
9332 assert(NumV1Inputs > 0 && "All single-input shuffles should be canonicalized "
9333 "to be V1-input shuffles.");
9335 // Try to use byte shift instructions.
9336 if (SDValue Shift = lowerVectorShuffleAsByteShift(
9337 DL, MVT::v8i16, V1, V2, Mask, DAG))
9340 // There are special ways we can lower some single-element blends.
9341 if (NumV2Inputs == 1)
9342 if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v8i16, DL, V1, V2,
9343 Mask, Subtarget, DAG))
9346 // Use dedicated unpack instructions for masks that match their pattern.
9347 if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 2, 10, 3, 11))
9348 return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V1, V2);
9349 if (isShuffleEquivalent(Mask, 4, 12, 5, 13, 6, 14, 7, 15))
9350 return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V1, V2);
9352 if (Subtarget->hasSSE41())
9353 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
9357 // Try to use byte rotation instructions.
9358 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
9359 DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
9362 if (NumV1Inputs + NumV2Inputs <= 4)
9363 return lowerV8I16BasicBlendVectorShuffle(DL, V1, V2, Mask, Subtarget, DAG);
9365 // Check whether an interleaving lowering is likely to be more efficient.
9366 // This isn't perfect but it is a strong heuristic that tends to work well on
9367 // the kinds of shuffles that show up in practice.
9369 // FIXME: Handle 1x, 2x, and 4x interleaving.
9370 if (shouldLowerAsInterleaving(Mask)) {
9371 // FIXME: Figure out whether we should pack these into the low or high
9372 // halves.
9374 int EMask[8], OMask[8];
9375 for (int i = 0; i < 4; ++i) {
9376 EMask[i] = Mask[2*i];
9377 OMask[i] = Mask[2*i + 1];
9382 SDValue Evens = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, EMask);
9383 SDValue Odds = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, OMask);
9385 return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, Evens, Odds);
9388 int LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9389 int HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9391 for (int i = 0; i < 4; ++i) {
9392 LoBlendMask[i] = Mask[i];
9393 HiBlendMask[i] = Mask[i + 4];
9396 SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, LoBlendMask);
9397 SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, HiBlendMask);
9398 LoV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, LoV);
9399 HiV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, HiV);
9401 return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
9402 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, LoV, HiV));
9405 /// \brief Check whether a compaction lowering can be done by dropping even
9406 /// elements and compute how many times even elements must be dropped.
9408 /// This handles shuffles which take every Nth element where N is a power of
9409 /// two. Example shuffle masks:
9411 /// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
9412 /// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
9413 /// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
9414 /// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
9415 /// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
9416 /// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
9418 /// Any of these lanes can of course be undef.
9420 /// This routine only supports N <= 3.
9421 /// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
9424 /// \returns N above, or the number of times even elements must be dropped if
9425 /// there is such a number. Otherwise returns zero.
9426 static int canLowerByDroppingEvenElements(ArrayRef<int> Mask) {
9427 // Figure out whether we're looping over two inputs or just one.
9428 bool IsSingleInput = isSingleInputShuffleMask(Mask);
9430 // The modulus for the shuffle vector entries is based on whether this is
9431 // a single input or not.
9432 int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
9433 assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
9434 "We should only be called with masks with a power-of-2 size!");
9436 uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
9438 // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
9439 // and 2^3 simultaneously. This is because we may have ambiguity with
9440 // partially undef inputs.
9441 bool ViableForN[3] = {true, true, true};
9443 for (int i = 0, e = Mask.size(); i < e; ++i) {
9444 // Ignore undef lanes, we'll optimistically collapse them to the pattern we
9449 bool IsAnyViable = false;
9450 for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
9451 if (ViableForN[j]) {
9454 // The shuffle mask must be equal to (i * 2^N) % M.
9455 if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
9458 ViableForN[j] = false;
9460 // Early exit if we exhaust the possible powers of two.
9465 for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
9469 // Return 0 as there is no viable power of two.
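// Illustrative sketch only (not part of the lowering itself): the helper above
// can be exercised directly on a plain mask array, e.g.
//
//   int M[16] = {0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14};
//   int N = canLowerByDroppingEvenElements(M);
//   // N is expected to be 1 here: one mask-and-PACKUSWB round reproduces this
//   // single-input shuffle.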
9473 /// \brief Generic lowering of v16i8 shuffles.
9475 /// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
9476 /// detect any complexity reducing interleaving. If that doesn't help, it uses
9477 /// UNPCK to spread the i8 elements across two i16-element vectors, and uses
9478 /// the existing lowering for v8i16 blends on each half, finally PACK-ing them
9479 /// back together.
9480 static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
9481 const X86Subtarget *Subtarget,
9482 SelectionDAG &DAG) {
9484 assert(Op.getSimpleValueType() == MVT::v16i8 && "Bad shuffle type!");
9485 assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
9486 assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
9487 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
9488 ArrayRef<int> OrigMask = SVOp->getMask();
9489 assert(OrigMask.size() == 16 && "Unexpected mask size for v16 shuffle!");
9491 // Try to use byte shift instructions.
9492 if (SDValue Shift = lowerVectorShuffleAsByteShift(
9493 DL, MVT::v16i8, V1, V2, OrigMask, DAG))
9496 // Try to use byte rotation instructions.
9497 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
9498 DL, MVT::v16i8, V1, V2, OrigMask, Subtarget, DAG))
9501 // Try to use a zext lowering.
9502 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
9503 DL, MVT::v16i8, V1, V2, OrigMask, Subtarget, DAG))
9506 int MaskStorage[16] = {
9507 OrigMask[0], OrigMask[1], OrigMask[2], OrigMask[3],
9508 OrigMask[4], OrigMask[5], OrigMask[6], OrigMask[7],
9509 OrigMask[8], OrigMask[9], OrigMask[10], OrigMask[11],
9510 OrigMask[12], OrigMask[13], OrigMask[14], OrigMask[15]};
9511 MutableArrayRef<int> Mask(MaskStorage);
9512 MutableArrayRef<int> LoMask = Mask.slice(0, 8);
9513 MutableArrayRef<int> HiMask = Mask.slice(8, 8);
9516 std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 16; });
9518 // For single-input shuffles, there are some nicer lowering tricks we can use.
9519 if (NumV2Elements == 0) {
9520 // Check for being able to broadcast a single element.
9521 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v16i8, DL, V1,
9522 Mask, Subtarget, DAG))
9525 // Check whether we can widen this to an i16 shuffle by duplicating bytes.
9526 // Notably, this handles splat and partial-splat shuffles more efficiently.
9527 // However, it only makes sense if the pre-duplication shuffle simplifies
9528 // things significantly. Currently, this means we need to be able to
9529 // express the pre-duplication shuffle as an i16 shuffle.
9531 // FIXME: We should check for other patterns which can be widened into an
9532 // i16 shuffle as well.
9533 auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
9534 for (int i = 0; i < 16; i += 2)
9535 if (Mask[i] != -1 && Mask[i + 1] != -1 && Mask[i] != Mask[i + 1])
9540 auto tryToWidenViaDuplication = [&]() -> SDValue {
9541 if (!canWidenViaDuplication(Mask))
9543 SmallVector<int, 4> LoInputs;
9544 std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(LoInputs),
9545 [](int M) { return M >= 0 && M < 8; });
9546 std::sort(LoInputs.begin(), LoInputs.end());
9547 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
9549 SmallVector<int, 4> HiInputs;
9550 std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(HiInputs),
9551 [](int M) { return M >= 8; });
9552 std::sort(HiInputs.begin(), HiInputs.end());
9553 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
9556 bool TargetLo = LoInputs.size() >= HiInputs.size();
9557 ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
9558 ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
9560 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
9561 SmallDenseMap<int, int, 8> LaneMap;
9562 for (int I : InPlaceInputs) {
9563 PreDupI16Shuffle[I/2] = I/2;
9566 int j = TargetLo ? 0 : 4, je = j + 4;
9567 for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
9568 // Check if j is already a shuffle of this input. This happens when
9569 // there are two adjacent bytes after we move the low one.
9570 if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
9571 // If we haven't yet mapped the input, search for a slot into which
9572 // we can map it.
9573 while (j < je && PreDupI16Shuffle[j] != -1)
9577 // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
9580 // Map this input with the i16 shuffle.
9581 PreDupI16Shuffle[j] = MovingInputs[i] / 2;
9584 // Update the lane map based on the mapping we ended up with.
9585 LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
9588 ISD::BITCAST, DL, MVT::v16i8,
9589 DAG.getVectorShuffle(MVT::v8i16, DL,
9590 DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1),
9591 DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
9593 // Unpack the bytes to form the i16s that will be shuffled into place.
9594 V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
9595 MVT::v16i8, V1, V1);
9597 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9598 for (int i = 0; i < 16; ++i)
9599 if (Mask[i] != -1) {
9600 int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
9601 assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
9602 if (PostDupI16Shuffle[i / 2] == -1)
9603 PostDupI16Shuffle[i / 2] = MappedMask;
9605 assert(PostDupI16Shuffle[i / 2] == MappedMask &&
9606 "Conflicting entrties in the original shuffle!");
9609 ISD::BITCAST, DL, MVT::v16i8,
9610 DAG.getVectorShuffle(MVT::v8i16, DL,
9611 DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1),
9612 DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
9614 if (SDValue V = tryToWidenViaDuplication())
9618 // Check whether an interleaving lowering is likely to be more efficient.
9619 // This isn't perfect but it is a strong heuristic that tends to work well on
9620 // the kinds of shuffles that show up in practice.
9622 // FIXME: We need to handle other interleaving widths (i16, i32, ...).
9623 if (shouldLowerAsInterleaving(Mask)) {
9624 int NumLoHalf = std::count_if(Mask.begin(), Mask.end(), [](int M) {
9625 return (M >= 0 && M < 8) || (M >= 16 && M < 24);
9627 int NumHiHalf = std::count_if(Mask.begin(), Mask.end(), [](int M) {
9628 return (M >= 8 && M < 16) || M >= 24;
9630 int EMask[16] = {-1, -1, -1, -1, -1, -1, -1, -1,
9631 -1, -1, -1, -1, -1, -1, -1, -1};
9632 int OMask[16] = {-1, -1, -1, -1, -1, -1, -1, -1,
9633 -1, -1, -1, -1, -1, -1, -1, -1};
9634 bool UnpackLo = NumLoHalf >= NumHiHalf;
9635 MutableArrayRef<int> TargetEMask(UnpackLo ? EMask : EMask + 8, 8);
9636 MutableArrayRef<int> TargetOMask(UnpackLo ? OMask : OMask + 8, 8);
9637 for (int i = 0; i < 8; ++i) {
9638 TargetEMask[i] = Mask[2 * i];
9639 TargetOMask[i] = Mask[2 * i + 1];
9642 SDValue Evens = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, EMask);
9643 SDValue Odds = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, OMask);
9645 return DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
9646 MVT::v16i8, Evens, Odds);
9649 // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
9650 // with PSHUFB. It is important to do this before we attempt to generate any
9651 // blends but after all of the single-input lowerings. If the single input
9652 // lowerings can find an instruction sequence that is faster than a PSHUFB, we
9653 // want to preserve that and we can DAG combine any longer sequences into
9654 // a PSHUFB in the end. But once we start blending from multiple inputs,
9655 // the complexity of DAG combining bad patterns back into PSHUFB is too high,
9656 // and there are *very* few patterns that would actually be faster than the
9657 // PSHUFB approach because of its ability to zero lanes.
9659 // FIXME: The only exceptions to the above are blends which are exact
9660 // interleavings with direct instructions supporting them. We currently don't
9661 // handle those well here.
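// As a rough sketch of what the SSSE3 path below produces (assuming both
// inputs are used and no lanes are zeroable): a mask interleaving byte 0 of V1
// with byte 0 of V2 yields PSHUFB control vectors starting {0, 0x80, ...} for
// V1 and {0x80, 0, ...} for V2, one PSHUFB per input, and a single OR (POR) to
// merge the two shuffled results.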
9662 if (Subtarget->hasSSSE3()) {
9665 bool V1InUse = false;
9666 bool V2InUse = false;
9667 SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
9669 for (int i = 0; i < 16; ++i) {
9670 if (Mask[i] == -1) {
9671 V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8);
9673 const int ZeroMask = 0x80;
9674 int V1Idx = (Mask[i] < 16 ? Mask[i] : ZeroMask);
9675 int V2Idx = (Mask[i] < 16 ? ZeroMask : Mask[i] - 16);
9676 if (Zeroable[i])
9677 V1Idx = V2Idx = ZeroMask;
9678 V1Mask[i] = DAG.getConstant(V1Idx, MVT::i8);
9679 V2Mask[i] = DAG.getConstant(V2Idx, MVT::i8);
9680 V1InUse |= (ZeroMask != V1Idx);
9681 V2InUse |= (ZeroMask != V2Idx);
9686 V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, V1,
9687 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V1Mask));
9689 V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, V2,
9690 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V2Mask));
9692 // If we need shuffled inputs from both, blend the two.
9693 if (V1InUse && V2InUse)
9694 return DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
9695 if (V1InUse)
9696 return V1; // Single inputs are easy.
9697 if (V2InUse)
9698 return V2; // Single inputs are easy.
9699 // Shuffling to a zeroable vector.
9700 return getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
9703 // There are special ways we can lower some single-element blends.
9704 if (NumV2Elements == 1)
9705 if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v16i8, DL, V1, V2,
9706 Mask, Subtarget, DAG))
9709 // Check whether a compaction lowering can be done. This handles shuffles
9710 // which take every Nth element for some even N. See the helper function for
9711 // details.
9713 // We special case these as they can be particularly efficiently handled with
9714 // the PACKUSWB instruction on x86 and they show up in common patterns of
9715 // rearranging bytes to truncate wide elements.
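// An illustrative sketch (not the emitted code verbatim): with two inputs and
// NumEvenDrops == 1, the sequence below amounts to roughly
//
//   V1 = PAND V1, <8 x i16 0x00FF>   ; clear the bytes being dropped
//   V2 = PAND V2, <8 x i16 0x00FF>
//   R  = PACKUSWB V1, V2             ; pack the surviving low bytes together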
9716 if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask)) {
9717 // NumEvenDrops is the power of two stride of the elements. Another way of
9718 // thinking about it is that we need to drop the even elements this many
9719 // times to get the original input.
9720 bool IsSingleInput = isSingleInputShuffleMask(Mask);
9722 // First we need to zero all the dropped bytes.
9723 assert(NumEvenDrops <= 3 &&
9724 "No support for dropping even elements more than 3 times.");
9725 // We use the mask type to pick which bytes are preserved based on how many
9726 // elements are dropped.
9727 MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 };
9728 SDValue ByteClearMask =
9729 DAG.getNode(ISD::BITCAST, DL, MVT::v16i8,
9730 DAG.getConstant(0xFF, MaskVTs[NumEvenDrops - 1]));
9731 V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);
9733 V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);
9735 // Now pack things back together.
9736 V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1);
9737 V2 = IsSingleInput ? V1 : DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V2);
9738 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2);
9739 for (int i = 1; i < NumEvenDrops; ++i) {
9740 Result = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, Result);
9741 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
9747 int V1LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9748 int V1HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9749 int V2LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9750 int V2HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9752 auto buildBlendMasks = [](MutableArrayRef<int> HalfMask,
9753 MutableArrayRef<int> V1HalfBlendMask,
9754 MutableArrayRef<int> V2HalfBlendMask) {
9755 for (int i = 0; i < 8; ++i)
9756 if (HalfMask[i] >= 0 && HalfMask[i] < 16) {
9757 V1HalfBlendMask[i] = HalfMask[i];
9759 } else if (HalfMask[i] >= 16) {
9760 V2HalfBlendMask[i] = HalfMask[i] - 16;
9761 HalfMask[i] = i + 8;
9764 buildBlendMasks(LoMask, V1LoBlendMask, V2LoBlendMask);
9765 buildBlendMasks(HiMask, V1HiBlendMask, V2HiBlendMask);
9767 SDValue Zero = getZeroVector(MVT::v8i16, Subtarget, DAG, DL);
9769 auto buildLoAndHiV8s = [&](SDValue V, MutableArrayRef<int> LoBlendMask,
9770 MutableArrayRef<int> HiBlendMask) {
9772 // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
9773 // them out and avoid using UNPCK{L,H} to extract the elements of V as
9775 if (std::none_of(LoBlendMask.begin(), LoBlendMask.end(),
9776 [](int M) { return M >= 0 && M % 2 == 1; }) &&
9777 std::none_of(HiBlendMask.begin(), HiBlendMask.end(),
9778 [](int M) { return M >= 0 && M % 2 == 1; })) {
9779 // Use a mask to drop the high bytes.
9780 V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V);
9781 V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, V1,
9782 DAG.getConstant(0x00FF, MVT::v8i16));
9784 // This will be a single vector shuffle instead of a blend so nuke V2.
9785 V2 = DAG.getUNDEF(MVT::v8i16);
9787 // Squash the masks to point directly into V1.
9788 for (int &M : LoBlendMask)
9791 for (int &M : HiBlendMask)
9795 // Otherwise just unpack the low half of V into V1 and the high half into
9796 // V2 so that we can blend them as i16s.
9797 V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
9798 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
9799 V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
9800 DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
9803 SDValue BlendedLo = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, LoBlendMask);
9804 SDValue BlendedHi = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, HiBlendMask);
9805 return std::make_pair(BlendedLo, BlendedHi);
9807 SDValue V1Lo, V1Hi, V2Lo, V2Hi;
9808 std::tie(V1Lo, V1Hi) = buildLoAndHiV8s(V1, V1LoBlendMask, V1HiBlendMask);
9809 std::tie(V2Lo, V2Hi) = buildLoAndHiV8s(V2, V2LoBlendMask, V2HiBlendMask);
9811 SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, V1Lo, V2Lo, LoMask);
9812 SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, V1Hi, V2Hi, HiMask);
9814 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
9817 /// \brief Dispatching routine to lower various 128-bit x86 vector shuffles.
9819 /// This routine breaks down the specific type of 128-bit shuffle and
9820 /// dispatches to the lowering routines accordingly.
9821 static SDValue lower128BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
9822 MVT VT, const X86Subtarget *Subtarget,
9823 SelectionDAG &DAG) {
9824 switch (VT.SimpleTy) {
9826 return lowerV2I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
9828 return lowerV2F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
9830 return lowerV4I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
9832 return lowerV4F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
9834 return lowerV8I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
9836 return lowerV16I8VectorShuffle(Op, V1, V2, Subtarget, DAG);
9839 llvm_unreachable("Unimplemented!");
9843 /// \brief Helper function to test whether a shuffle mask could be
9844 /// simplified by widening the elements being shuffled.
9846 /// Appends the mask for wider elements in WidenedMask if valid. Otherwise
9847 /// leaves it in an unspecified state.
9849 /// NOTE: This must handle normal vector shuffle masks and *target* vector
9850 /// shuffle masks. The latter have the special property of a '-2' representing
9851 /// a zero-ed lane of a vector.
9852 static bool canWidenShuffleElements(ArrayRef<int> Mask,
9853 SmallVectorImpl<int> &WidenedMask) {
9854 for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
9855 // If both elements are undef, its trivial.
9856 if (Mask[i] == SM_SentinelUndef && Mask[i + 1] == SM_SentinelUndef) {
9857 WidenedMask.push_back(SM_SentinelUndef);
9861 // Check for an undef mask and a mask value properly aligned to fit with
9862 // a pair of values. If we find such a case, use the non-undef mask's value.
9863 if (Mask[i] == SM_SentinelUndef && Mask[i + 1] >= 0 && Mask[i + 1] % 2 == 1) {
9864 WidenedMask.push_back(Mask[i + 1] / 2);
9867 if (Mask[i + 1] == SM_SentinelUndef && Mask[i] >= 0 && Mask[i] % 2 == 0) {
9868 WidenedMask.push_back(Mask[i] / 2);
9872 // When zeroing, we need to spread the zeroing across both lanes to widen.
9873 if (Mask[i] == SM_SentinelZero || Mask[i + 1] == SM_SentinelZero) {
9874 if ((Mask[i] == SM_SentinelZero || Mask[i] == SM_SentinelUndef) &&
9875 (Mask[i + 1] == SM_SentinelZero || Mask[i + 1] == SM_SentinelUndef)) {
9876 WidenedMask.push_back(SM_SentinelZero);
9882 // Finally check if the two mask values are adjacent and aligned with
9884 if (Mask[i] != SM_SentinelUndef && Mask[i] % 2 == 0 && Mask[i] + 1 == Mask[i + 1]) {
9885 WidenedMask.push_back(Mask[i] / 2);
9889 // Otherwise we can't safely widen the elements used in this shuffle.
9892 assert(WidenedMask.size() == Mask.size() / 2 &&
9893 "Incorrect size of mask after widening the elements!");
9898 /// \brief Generic routine to split a vector shuffle into half-sized shuffles.
9900 /// This routine just extracts two subvectors, shuffles them independently, and
9901 /// then concatenates them back together. This should work effectively with all
9902 /// AVX vector shuffle types.
9903 static SDValue splitAndLowerVectorShuffle(SDLoc DL, MVT VT, SDValue V1,
9904 SDValue V2, ArrayRef<int> Mask,
9905 SelectionDAG &DAG) {
9906 assert(VT.getSizeInBits() >= 256 &&
9907 "Only for 256-bit or wider vector shuffles!");
9908 assert(V1.getSimpleValueType() == VT && "Bad operand type!");
9909 assert(V2.getSimpleValueType() == VT && "Bad operand type!");
9911 ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
9912 ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
9914 int NumElements = VT.getVectorNumElements();
9915 int SplitNumElements = NumElements / 2;
9916 MVT ScalarVT = VT.getScalarType();
9917 MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2);
9919 SDValue LoV1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V1,
9920 DAG.getIntPtrConstant(0));
9921 SDValue HiV1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V1,
9922 DAG.getIntPtrConstant(SplitNumElements));
9923 SDValue LoV2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V2,
9924 DAG.getIntPtrConstant(0));
9925 SDValue HiV2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V2,
9926 DAG.getIntPtrConstant(SplitNumElements));
9928 // Now create two 4-way blends of these half-width vectors.
9929 auto HalfBlend = [&](ArrayRef<int> HalfMask) {
9930 bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
9931 SmallVector<int, 32> V1BlendMask, V2BlendMask, BlendMask;
9932 for (int i = 0; i < SplitNumElements; ++i) {
9933 int M = HalfMask[i];
9934 if (M >= NumElements) {
9935 if (M >= NumElements + SplitNumElements)
9939 V2BlendMask.push_back(M - NumElements);
9940 V1BlendMask.push_back(-1);
9941 BlendMask.push_back(SplitNumElements + i);
9942 } else if (M >= 0) {
9943 if (M >= SplitNumElements)
9947 V2BlendMask.push_back(-1);
9948 V1BlendMask.push_back(M);
9949 BlendMask.push_back(i);
9951 V2BlendMask.push_back(-1);
9952 V1BlendMask.push_back(-1);
9953 BlendMask.push_back(-1);
9957 // Because the lowering happens after all combining takes place, we need to
9958 // manually combine these blend masks as much as possible so that we create
9959 // a minimal number of high-level vector shuffle nodes.
9961 // First try just blending the halves of V1 or V2.
9962 if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
9963 return DAG.getUNDEF(SplitVT);
9964 if (!UseLoV2 && !UseHiV2)
9965 return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
9966 if (!UseLoV1 && !UseHiV1)
9967 return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
9969 SDValue V1Blend, V2Blend;
9970 if (UseLoV1 && UseHiV1) {
9972 DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
9974 // We only use half of V1 so map the usage down into the final blend mask.
9975 V1Blend = UseLoV1 ? LoV1 : HiV1;
9976 for (int i = 0; i < SplitNumElements; ++i)
9977 if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
9978 BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
9980 if (UseLoV2 && UseHiV2) {
9982 DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
9984 // We only use half of V2 so map the usage down into the final blend mask.
9985 V2Blend = UseLoV2 ? LoV2 : HiV2;
9986 for (int i = 0; i < SplitNumElements; ++i)
9987 if (BlendMask[i] >= SplitNumElements)
9988 BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
9990 return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
9992 SDValue Lo = HalfBlend(LoMask);
9993 SDValue Hi = HalfBlend(HiMask);
9994 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
9997 /// \brief Either split a vector in halves or decompose the shuffles and the
9998 /// blend.
10000 /// This is provided as a good fallback for many lowerings of non-single-input
10001 /// shuffles with more than one 128-bit lane. In those cases, we want to select
10002 /// between splitting the shuffle into 128-bit components and stitching those
10003 /// back together vs. extracting the single-input shuffles and blending those
10004 /// results.
10005 static SDValue lowerVectorShuffleAsSplitOrBlend(SDLoc DL, MVT VT, SDValue V1,
10006 SDValue V2, ArrayRef<int> Mask,
10007 SelectionDAG &DAG) {
10008 assert(!isSingleInputShuffleMask(Mask) && "This routine must not be used to "
10009 "lower single-input shuffles as it "
10010 "could then recurse on itself.");
10011 int Size = Mask.size();
10013 // If this can be modeled as a broadcast of two elements followed by a blend,
10014 // prefer that lowering. This is especially important because broadcasts can
10015 // often fold with memory operands.
10016 auto DoBothBroadcast = [&] {
10017 int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
10020 if (V2BroadcastIdx == -1)
10021 V2BroadcastIdx = M - Size;
10022 else if (M - Size != V2BroadcastIdx)
10024 } else if (M >= 0) {
10025 if (V1BroadcastIdx == -1)
10026 V1BroadcastIdx = M;
10027 else if (M != V1BroadcastIdx)
10032 if (DoBothBroadcast())
10033 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
10036 // If the inputs all stem from a single 128-bit lane of each input, then we
10037 // split them rather than blending because the split will decompose to
10038 // unusually few instructions.
10039 int LaneCount = VT.getSizeInBits() / 128;
10040 int LaneSize = Size / LaneCount;
10041 SmallBitVector LaneInputs[2];
10042 LaneInputs[0].resize(LaneCount, false);
10043 LaneInputs[1].resize(LaneCount, false);
10044 for (int i = 0; i < Size; ++i)
10046 LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
10047 if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
10048 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
10050 // Otherwise, just fall back to decomposed shuffles and a blend. This requires
10051 // that the decomposed single-input shuffles don't end up here.
10052 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
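// To illustrate the choice above (a sketch, not an exhaustive description): a
// v4f64 mask such as <0, 0, 4, 4> broadcasts one element from each input, so
// it is decomposed into two broadcasts plus a blend, whereas <1, 0, 5, 4>
// draws only on the low 128-bit lane of each input and is therefore split into
// 128-bit shuffles.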
10055 /// \brief Lower a vector shuffle crossing multiple 128-bit lanes as
10056 /// a permutation and blend of those lanes.
10058 /// This essentially blends the out-of-lane inputs to each lane into the lane
10059 /// from a permuted copy of the vector. This lowering strategy results in four
10060 /// instructions in the worst case for a single-input cross lane shuffle which
10061 /// is lower than any other fully general cross-lane shuffle strategy I'm aware
10062 /// of. Special cases for each particular shuffle pattern should be handled
10063 /// prior to trying this lowering.
10064 static SDValue lowerVectorShuffleAsLanePermuteAndBlend(SDLoc DL, MVT VT,
10065 SDValue V1, SDValue V2,
10066 ArrayRef<int> Mask,
10067 SelectionDAG &DAG) {
10068 // FIXME: This should probably be generalized for 512-bit vectors as well.
10069 assert(VT.getSizeInBits() == 256 && "Only for 256-bit vector shuffles!");
10070 int LaneSize = Mask.size() / 2;
10072 // If there are only inputs from one 128-bit lane, splitting will in fact be
10073 // less expensive. The flags track whether the given lane contains an element
10074 // that crosses to another lane.
10075 bool LaneCrossing[2] = {false, false};
10076 for (int i = 0, Size = Mask.size(); i < Size; ++i)
10077 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
10078 LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
10079 if (!LaneCrossing[0] || !LaneCrossing[1])
10080 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
10082 if (isSingleInputShuffleMask(Mask)) {
10083 SmallVector<int, 32> FlippedBlendMask;
10084 for (int i = 0, Size = Mask.size(); i < Size; ++i)
10085 FlippedBlendMask.push_back(
10086 Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize)
10088 : Mask[i] % LaneSize +
10089 (i / LaneSize) * LaneSize + Size));
10091 // Flip the vector, and blend the results which should now be in-lane. The
10092 // VPERM2X128 mask uses the low 2 bits for the low source and bits 4 and
10093 // 5 for the high source. The value 3 selects the high half of source 2 and
10094 // the value 2 selects the low half of source 2. We only use source 2 to
10095 // allow folding it into a memory operand.
10096 unsigned PERMMask = 3 | 2 << 4;
10097 SDValue Flipped = DAG.getNode(X86ISD::VPERM2X128, DL, VT, DAG.getUNDEF(VT),
10098 V1, DAG.getConstant(PERMMask, MVT::i8));
10099 return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask);
10102 // This now reduces to two single-input shuffles of V1 and V2 which at worst
10103 // will be handled by the above logic and a blend of the results, much like
10104 // other patterns in AVX.
10105 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
10108 /// \brief Handle lowering 2-lane 128-bit shuffles.
10109 static SDValue lowerV2X128VectorShuffle(SDLoc DL, MVT VT, SDValue V1,
10110 SDValue V2, ArrayRef<int> Mask,
10111 const X86Subtarget *Subtarget,
10112 SelectionDAG &DAG) {
10113 // Blends are faster and handle all the non-lane-crossing cases.
10114 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
10118 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
10119 VT.getVectorNumElements() / 2);
10120 // Check for patterns which can be matched with a single insert of a 128-bit
10121 // subvector.
10122 if (isShuffleEquivalent(Mask, 0, 1, 0, 1) ||
10123 isShuffleEquivalent(Mask, 0, 1, 4, 5)) {
10124 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
10125 DAG.getIntPtrConstant(0));
10126 SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
10127 Mask[2] < 4 ? V1 : V2, DAG.getIntPtrConstant(0));
10128 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
10130 if (isShuffleEquivalent(Mask, 0, 1, 6, 7)) {
10131 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
10132 DAG.getIntPtrConstant(0));
10133 SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
10134 DAG.getIntPtrConstant(2));
10135 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
10138 // Otherwise form a 128-bit permutation.
10139 // FIXME: Detect zero-vector inputs and use the VPERM2X128 to zero that half.
10140 unsigned PermMask = Mask[0] / 2 | (Mask[2] / 2) << 4;
10141 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
10142 DAG.getConstant(PermMask, MVT::i8));
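// Worked example of the immediate above (illustrative only): a v4f64 mask of
// <2, 3, 4, 5> selects the high lane of V1 followed by the low lane of V2, so
// PermMask == (2 / 2) | (4 / 2) << 4 == 0x21 for the VPERM2X128 node.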
10145 /// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then
10146 /// shuffling each lane.
10148 /// This will only succeed when the result of fixing the 128-bit lanes results
10149 /// in a single-input non-lane-crossing shuffle with a repeating shuffle mask in
10150 /// each 128-bit lane. This handles many cases where we can quickly blend away
10151 /// the lane crosses early and then use simpler shuffles within each lane.
10153 /// FIXME: It might be worthwhile at some point to support this without
10154 /// requiring the 128-bit lane-relative shuffles to be repeating, but currently
10155 /// in x86 only floating point has interesting non-repeating shuffles, and even
10156 /// those are still *marginally* more expensive.
10157 static SDValue lowerVectorShuffleByMerging128BitLanes(
10158 SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
10159 const X86Subtarget *Subtarget, SelectionDAG &DAG) {
10160 assert(!isSingleInputShuffleMask(Mask) &&
10161 "This is only useful with multiple inputs.");
10163 int Size = Mask.size();
10164 int LaneSize = 128 / VT.getScalarSizeInBits();
10165 int NumLanes = Size / LaneSize;
10166 assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles.");
10168 // See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also
10169 // check whether the in-128-bit lane shuffles share a repeating pattern.
10170 SmallVector<int, 4> Lanes;
10171 Lanes.resize(NumLanes, -1);
10172 SmallVector<int, 4> InLaneMask;
10173 InLaneMask.resize(LaneSize, -1);
10174 for (int i = 0; i < Size; ++i) {
10178 int j = i / LaneSize;
10180 if (Lanes[j] < 0) {
10181 // First entry we've seen for this lane.
10182 Lanes[j] = Mask[i] / LaneSize;
10183 } else if (Lanes[j] != Mask[i] / LaneSize) {
10184 // This doesn't match the lane selected previously!
10188 // Check that within each lane we have a consistent shuffle mask.
10189 int k = i % LaneSize;
10190 if (InLaneMask[k] < 0) {
10191 InLaneMask[k] = Mask[i] % LaneSize;
10192 } else if (InLaneMask[k] != Mask[i] % LaneSize) {
10193 // This doesn't fit a repeating in-lane mask.
10198 // First shuffle the lanes into place.
10199 MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64,
10200 VT.getSizeInBits() / 64);
10201 SmallVector<int, 8> LaneMask;
10202 LaneMask.resize(NumLanes * 2, -1);
10203 for (int i = 0; i < NumLanes; ++i)
10204 if (Lanes[i] >= 0) {
10205 LaneMask[2 * i + 0] = 2*Lanes[i] + 0;
10206 LaneMask[2 * i + 1] = 2*Lanes[i] + 1;
10209 V1 = DAG.getNode(ISD::BITCAST, DL, LaneVT, V1);
10210 V2 = DAG.getNode(ISD::BITCAST, DL, LaneVT, V2);
10211 SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask);
10213 // Cast it back to the type we actually want.
10214 LaneShuffle = DAG.getNode(ISD::BITCAST, DL, VT, LaneShuffle);
10216 // Now do a simple shuffle that isn't lane crossing.
10217 SmallVector<int, 8> NewMask;
10218 NewMask.resize(Size, -1);
10219 for (int i = 0; i < Size; ++i)
10221 NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize;
10222 assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) &&
10223 "Must not introduce lane crosses at this point!");
10225 return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask);
10228 /// \brief Test whether the specified input (0 or 1) is in-place blended by the
10229 /// given mask.
10231 /// This returns true if the elements from a particular input are already in the
10232 /// slot required by the given mask and require no permutation.
10233 static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
10234 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
10235 int Size = Mask.size();
10236 for (int i = 0; i < Size; ++i)
10237 if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
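// For instance (illustrative only): the v4 mask <0, 4, 2, 6> leaves input 0 in
// place (its elements 0 and 2 already sit in slots 0 and 2) but not input 1,
// whose element 0 would have to move into slot 1.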
10243 /// \brief Handle lowering of 4-lane 64-bit floating point shuffles.
10245 /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
10246 /// isn't available.
10247 static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10248 const X86Subtarget *Subtarget,
10249 SelectionDAG &DAG) {
10251 assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
10252 assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
10253 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10254 ArrayRef<int> Mask = SVOp->getMask();
10255 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
10257 SmallVector<int, 4> WidenedMask;
10258 if (canWidenShuffleElements(Mask, WidenedMask))
10259 return lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask, Subtarget,
10262 if (isSingleInputShuffleMask(Mask)) {
10263 // Check for being able to broadcast a single element.
10264 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4f64, DL, V1,
10265 Mask, Subtarget, DAG))
10268 // Use low duplicate instructions for masks that match their pattern.
10269 if (isShuffleEquivalent(Mask, 0, 0, 2, 2))
10270 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
10272 if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
10273 // Non-half-crossing single input shuffles can be lowered with an
10274 // interleaved permutation.
10275 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
10276 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
10277 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
10278 DAG.getConstant(VPERMILPMask, MVT::i8));
10281 // With AVX2 we have direct support for this permutation.
10282 if (Subtarget->hasAVX2())
10283 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
10284 getV4X86ShuffleImm8ForMask(Mask, DAG));
10286 // Otherwise, fall back.
10287 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask,
10291 // X86 has dedicated unpack instructions that can handle specific blend
10292 // operations: UNPCKH and UNPCKL.
10293 if (isShuffleEquivalent(Mask, 0, 4, 2, 6))
10294 return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f64, V1, V2);
10295 if (isShuffleEquivalent(Mask, 1, 5, 3, 7))
10296 return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f64, V1, V2);
10298 // If we have a single input to the zero element, insert that into V1 if we
10299 // can do so cheaply.
10300 int NumV2Elements =
10301 std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
10302 if (NumV2Elements == 1 && Mask[0] >= 4)
10303 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
10304 MVT::v4f64, DL, V1, V2, Mask, Subtarget, DAG))
10307 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
10311 // Check if the blend happens to exactly fit that of SHUFPD.
10312 if ((Mask[0] == -1 || Mask[0] < 2) &&
10313 (Mask[1] == -1 || (Mask[1] >= 4 && Mask[1] < 6)) &&
10314 (Mask[2] == -1 || (Mask[2] >= 2 && Mask[2] < 4)) &&
10315 (Mask[3] == -1 || Mask[3] >= 6)) {
10316 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 5) << 1) |
10317 ((Mask[2] == 3) << 2) | ((Mask[3] == 7) << 3);
10318 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f64, V1, V2,
10319 DAG.getConstant(SHUFPDMask, MVT::i8));
10321 if ((Mask[0] == -1 || (Mask[0] >= 4 && Mask[0] < 6)) &&
10322 (Mask[1] == -1 || Mask[1] < 2) &&
10323 (Mask[2] == -1 || Mask[2] >= 6) &&
10324 (Mask[3] == -1 || (Mask[3] >= 2 && Mask[3] < 4))) {
10325 unsigned SHUFPDMask = (Mask[0] == 5) | ((Mask[1] == 1) << 1) |
10326 ((Mask[2] == 7) << 2) | ((Mask[3] == 3) << 3);
10327 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f64, V2, V1,
10328 DAG.getConstant(SHUFPDMask, MVT::i8));
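// Worked example of the first SHUFPD pattern above (illustrative only): the
// blend mask <0, 5, 2, 7> alternates in-place elements of V1 and V2, giving
// SHUFPDMask == 0b1010, i.e. a vshufpd selecting V1[0], V2[1], V1[2], V2[3].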
10331 // Try to simplify this by merging 128-bit lanes to enable a lane-based
10332 // shuffle. However, if we have AVX2 and either inputs are already in place,
10333 // we will be able to shuffle even across lanes the other input in a single
10334 // instruction so skip this pattern.
10335 if (!(Subtarget->hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
10336 isShuffleMaskInputInPlace(1, Mask))))
10337 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
10338 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
10341 // If we have AVX2 then we always want to lower with a blend because at v4 we
10342 // can fully permute the elements.
10343 if (Subtarget->hasAVX2())
10344 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,
10347 // Otherwise fall back on generic lowering.
10348 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG);
10351 /// \brief Handle lowering of 4-lane 64-bit integer shuffles.
10353 /// This routine is only called when we have AVX2 and thus a reasonable
10354 /// instruction set for v4i64 shuffling.
10355 static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10356 const X86Subtarget *Subtarget,
10357 SelectionDAG &DAG) {
10359 assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
10360 assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
10361 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10362 ArrayRef<int> Mask = SVOp->getMask();
10363 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
10364 assert(Subtarget->hasAVX2() && "We can only lower v4i64 with AVX2!");
10366 SmallVector<int, 4> WidenedMask;
10367 if (canWidenShuffleElements(Mask, WidenedMask))
10368 return lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask, Subtarget,
10371 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
10375 // Check for being able to broadcast a single element.
10376 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4i64, DL, V1,
10377 Mask, Subtarget, DAG))
10380 // When the shuffle is mirrored between the 128-bit lanes of the unit, we can
10381 // use lower latency instructions that will operate on both 128-bit lanes.
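// For example (sketch only): the single-input v4i64 mask <1, 0, 3, 2> repeats
// <1, 0> in both lanes, so it lowers to a single PSHUFD on v8i32 with the
// per-lane dword order <2, 3, 0, 1>, as built below.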
10382 SmallVector<int, 2> RepeatedMask;
10383 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
10384 if (isSingleInputShuffleMask(Mask)) {
10385 int PSHUFDMask[] = {-1, -1, -1, -1};
10386 for (int i = 0; i < 2; ++i)
10387 if (RepeatedMask[i] >= 0) {
10388 PSHUFDMask[2 * i] = 2 * RepeatedMask[i];
10389 PSHUFDMask[2 * i + 1] = 2 * RepeatedMask[i] + 1;
10391 return DAG.getNode(
10392 ISD::BITCAST, DL, MVT::v4i64,
10393 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
10394 DAG.getNode(ISD::BITCAST, DL, MVT::v8i32, V1),
10395 getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
10398 // Use dedicated unpack instructions for masks that match their pattern.
10399 if (isShuffleEquivalent(Mask, 0, 4, 2, 6))
10400 return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V1, V2);
10401 if (isShuffleEquivalent(Mask, 1, 5, 3, 7))
10402 return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V1, V2);
10405 // AVX2 provides a direct instruction for permuting a single input across
10406 // lanes.
10407 if (isSingleInputShuffleMask(Mask))
10408 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
10409 getV4X86ShuffleImm8ForMask(Mask, DAG));
10411 // Try to simplify this by merging 128-bit lanes to enable a lane-based
10412 // shuffle. However, if we have AVX2 and either inputs are already in place,
10413 // we will be able to shuffle even across lanes the other input in a single
10414 // instruction so skip this pattern.
10415 if (!(Subtarget->hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
10416 isShuffleMaskInputInPlace(1, Mask))))
10417 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
10418 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
10421 // Otherwise fall back on generic blend lowering.
10422 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2,
10426 /// \brief Handle lowering of 8-lane 32-bit floating point shuffles.
10428 /// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
10429 /// isn't available.
10430 static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10431 const X86Subtarget *Subtarget,
10432 SelectionDAG &DAG) {
10434 assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
10435 assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
10436 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10437 ArrayRef<int> Mask = SVOp->getMask();
10438 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
10440 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
10444 // Check for being able to broadcast a single element.
10445 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8f32, DL, V1,
10446 Mask, Subtarget, DAG))
10449 // If the shuffle mask is repeated in each 128-bit lane, we have many more
10450 // options to efficiently lower the shuffle.
10451 SmallVector<int, 4> RepeatedMask;
10452 if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
10453 assert(RepeatedMask.size() == 4 &&
10454 "Repeated masks must be half the mask width!");
10456 // Use even/odd duplicate instructions for masks that match their pattern.
10457 if (isShuffleEquivalent(Mask, 0, 0, 2, 2, 4, 4, 6, 6))
10458 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
10459 if (isShuffleEquivalent(Mask, 1, 1, 3, 3, 5, 5, 7, 7))
10460 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
10462 if (isSingleInputShuffleMask(Mask))
10463 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
10464 getV4X86ShuffleImm8ForMask(RepeatedMask, DAG));
10466 // Use dedicated unpack instructions for masks that match their pattern.
10467 if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 4, 12, 5, 13))
10468 return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f32, V1, V2);
10469 if (isShuffleEquivalent(Mask, 2, 10, 3, 11, 6, 14, 7, 15))
10470 return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f32, V1, V2);
10472 // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
10473 // have already handled any direct blends. We also need to squash the
10474 // repeated mask into a simulated v4f32 mask.
10475 for (int i = 0; i < 4; ++i)
10476 if (RepeatedMask[i] >= 8)
10477 RepeatedMask[i] -= 4;
10478 return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
10481 // If we have a single input shuffle with different shuffle patterns in the
10482 // two 128-bit lanes use the variable mask to VPERMILPS.
10483 if (isSingleInputShuffleMask(Mask)) {
10484 SDValue VPermMask[8];
10485 for (int i = 0; i < 8; ++i)
10486 VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32)
10487 : DAG.getConstant(Mask[i], MVT::i32);
10488 if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))
10489 return DAG.getNode(
10490 X86ISD::VPERMILPV, DL, MVT::v8f32, V1,
10491 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask));
10493 if (Subtarget->hasAVX2())
10494 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32,
10495 DAG.getNode(ISD::BITCAST, DL, MVT::v8f32,
10496 DAG.getNode(ISD::BUILD_VECTOR, DL,
10497 MVT::v8i32, VPermMask)),
10500 // Otherwise, fall back.
10501 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
10505 // Try to simplify this by merging 128-bit lanes to enable a lane-based
10506 // shuffle.
10507 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
10508 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
10511 // If we have AVX2 then we always want to lower with a blend because at v8 we
10512 // can fully permute the elements.
10513 if (Subtarget->hasAVX2())
10514 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
10517 // Otherwise fall back on generic lowering.
10518 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG);
10521 /// \brief Handle lowering of 8-lane 32-bit integer shuffles.
10523 /// This routine is only called when we have AVX2 and thus a reasonable
10524 /// instruction set for v8i32 shuffling.
10525 static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10526 const X86Subtarget *Subtarget,
10527 SelectionDAG &DAG) {
10529 assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
10530 assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
10531 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10532 ArrayRef<int> Mask = SVOp->getMask();
10533 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
10534 assert(Subtarget->hasAVX2() && "We can only lower v8i32 with AVX2!");
10536 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
10540 // Check for being able to broadcast a single element.
10541 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8i32, DL, V1,
10542 Mask, Subtarget, DAG))
10545 // If the shuffle mask is repeated in each 128-bit lane we can use more
10546 // efficient instructions that mirror the shuffles across the two 128-bit
10547 // lanes.
10548 SmallVector<int, 4> RepeatedMask;
10549 if (is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask)) {
10550 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
10551 if (isSingleInputShuffleMask(Mask))
10552 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
10553 getV4X86ShuffleImm8ForMask(RepeatedMask, DAG));
10555 // Use dedicated unpack instructions for masks that match their pattern.
10556 if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 4, 12, 5, 13))
10557 return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i32, V1, V2);
10558 if (isShuffleEquivalent(Mask, 2, 10, 3, 11, 6, 14, 7, 15))
10559 return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i32, V1, V2);
10562 // If the shuffle patterns aren't repeated but it is a single input, directly
10563 // generate a cross-lane VPERMD instruction.
10564 if (isSingleInputShuffleMask(Mask)) {
10565 SDValue VPermMask[8];
10566 for (int i = 0; i < 8; ++i)
10567 VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32)
10568 : DAG.getConstant(Mask[i], MVT::i32);
10569 return DAG.getNode(
10570 X86ISD::VPERMV, DL, MVT::v8i32,
10571 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask), V1);
10574 // Try to simplify this by merging 128-bit lanes to enable a lane-based
10575 // shuffle.
10576 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
10577 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
10580 // Otherwise fall back on generic blend lowering.
10581 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2,
10585 /// \brief Handle lowering of 16-lane 16-bit integer shuffles.
10587 /// This routine is only called when we have AVX2 and thus a reasonable
10588 /// instruction set for v16i16 shuffling.
10589 static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10590 const X86Subtarget *Subtarget,
10591 SelectionDAG &DAG) {
10593 assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
10594 assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
10595 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10596 ArrayRef<int> Mask = SVOp->getMask();
10597 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
10598 assert(Subtarget->hasAVX2() && "We can only lower v16i16 with AVX2!");
10600 // Check for being able to broadcast a single element.
10601 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v16i16, DL, V1,
10602 Mask, Subtarget, DAG))
10605 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
10609 // Use dedicated unpack instructions for masks that match their pattern.
10610 if (isShuffleEquivalent(Mask,
10611 // First 128-bit lane:
10612 0, 16, 1, 17, 2, 18, 3, 19,
10613 // Second 128-bit lane:
10614 8, 24, 9, 25, 10, 26, 11, 27))
10615 return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i16, V1, V2);
10616 if (isShuffleEquivalent(Mask,
10617 // First 128-bit lane:
10618 4, 20, 5, 21, 6, 22, 7, 23,
10619 // Second 128-bit lane:
10620 12, 28, 13, 29, 14, 30, 15, 31))
10621 return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i16, V1, V2);
10623 if (isSingleInputShuffleMask(Mask)) {
10624 // There are no generalized cross-lane shuffle operations available on i16
10625 // element types.
10626 if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
10627 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
10630 SDValue PSHUFBMask[32];
10631 for (int i = 0; i < 16; ++i) {
10632 if (Mask[i] == -1) {
10633 PSHUFBMask[2 * i] = PSHUFBMask[2 * i + 1] = DAG.getUNDEF(MVT::i8);
10637 int M = i < 8 ? Mask[i] : Mask[i] - 8;
10638 assert(M >= 0 && M < 8 && "Invalid single-input mask!");
10639 PSHUFBMask[2 * i] = DAG.getConstant(2 * M, MVT::i8);
10640 PSHUFBMask[2 * i + 1] = DAG.getConstant(2 * M + 1, MVT::i8);
10642 return DAG.getNode(
10643 ISD::BITCAST, DL, MVT::v16i16,
10645 X86ISD::PSHUFB, DL, MVT::v32i8,
10646 DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V1),
10647 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask)));
10650 // Try to simplify this by merging 128-bit lanes to enable a lane-based
10651 // shuffle.
10652 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
10653 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
10656 // Otherwise fall back on generic lowering.
10657 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG);
10660 /// \brief Handle lowering of 32-lane 8-bit integer shuffles.
10662 /// This routine is only called when we have AVX2 and thus a reasonable
10663 /// instruction set for v32i8 shuffling.
10664 static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10665 const X86Subtarget *Subtarget,
10666 SelectionDAG &DAG) {
10668 assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
10669 assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
10670 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10671 ArrayRef<int> Mask = SVOp->getMask();
10672 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
10673 assert(Subtarget->hasAVX2() && "We can only lower v32i8 with AVX2!");
10675 // Check for being able to broadcast a single element.
10676 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v32i8, DL, V1,
10677 Mask, Subtarget, DAG))
10680 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
10684 // Use dedicated unpack instructions for masks that match their pattern.
10685 // Note that these are repeated 128-bit lane unpacks, not unpacks across all
10686 // 256-bits.
10687 if (isShuffleEquivalent(
10689 // First 128-bit lane:
10690 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39,
10691 // Second 128-bit lane:
10692 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55))
10693 return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v32i8, V1, V2);
10694 if (isShuffleEquivalent(
10696 // First 128-bit lane:
10697 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47,
10698 // Second 128-bit lane:
10699 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63))
10700 return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v32i8, V1, V2);
10702 if (isSingleInputShuffleMask(Mask)) {
10703 // There are no generalized cross-lane shuffle operations available on i8
10704 // element types.
10705 if (is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
10706 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2,
10709 SDValue PSHUFBMask[32];
10710 for (int i = 0; i < 32; ++i)
10713 ? DAG.getUNDEF(MVT::i8)
10714 : DAG.getConstant(Mask[i] < 16 ? Mask[i] : Mask[i] - 16, MVT::i8);
10716 return DAG.getNode(
10717 X86ISD::PSHUFB, DL, MVT::v32i8, V1,
10718 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask));
10721 // Try to simplify this by merging 128-bit lanes to enable a lane-based
10722 // shuffle.
10723 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
10724 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
10727 // Otherwise fall back on generic lowering.
10728 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG);
10731 /// \brief High-level routine to lower various 256-bit x86 vector shuffles.
10733 /// This routine either breaks down the specific type of a 256-bit x86 vector
10734 /// shuffle or splits it into two 128-bit shuffles and fuses the results back
10735 /// together based on the available instructions.
10736 static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10737 MVT VT, const X86Subtarget *Subtarget,
10738 SelectionDAG &DAG) {
10740 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10741 ArrayRef<int> Mask = SVOp->getMask();
10743 // There is a really nice hard cut-over between AVX1 and AVX2 that means we can
10744 // check for those subtargets here and avoid much of the subtarget querying in
10745 // the per-vector-type lowering routines. With AVX1 we have essentially *zero*
10746 // ability to manipulate a 256-bit vector with integer types. Since we'll use
10747 // floating point types there eventually, just immediately cast everything to
10748 // a float and operate entirely in that domain.
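// Concretely (illustrative only): on AVX1, a v8i32 shuffle is bitcast to
// v8f32, shuffled in the floating point domain, and bitcast back, while a
// v16i16 or v32i8 shuffle (element width below 32 bits) is simply split into
// two 128-bit shuffles.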
10749 if (VT.isInteger() && !Subtarget->hasAVX2()) {
10750 int ElementBits = VT.getScalarSizeInBits();
10751 if (ElementBits < 32)
10752 // No floating point type available, decompose into 128-bit vectors.
10753 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
10755 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
10756 VT.getVectorNumElements());
10757 V1 = DAG.getNode(ISD::BITCAST, DL, FpVT, V1);
10758 V2 = DAG.getNode(ISD::BITCAST, DL, FpVT, V2);
10759 return DAG.getNode(ISD::BITCAST, DL, VT,
10760 DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
10763 switch (VT.SimpleTy) {
10765 return lowerV4F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
10767 return lowerV4I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
10769 return lowerV8F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
10771 return lowerV8I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
10773 return lowerV16I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
10775 return lowerV32I8VectorShuffle(Op, V1, V2, Subtarget, DAG);
10778 llvm_unreachable("Not a valid 256-bit x86 vector type!");
10782 /// \brief Handle lowering of 8-lane 64-bit floating point shuffles.
10783 static SDValue lowerV8F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10784 const X86Subtarget *Subtarget,
10785 SelectionDAG &DAG) {
10787 assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
10788 assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
10789 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10790 ArrayRef<int> Mask = SVOp->getMask();
10791 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
10793 // X86 has dedicated unpack instructions that can handle specific blend
10794 // operations: UNPCKH and UNPCKL.
10795 if (isShuffleEquivalent(Mask, 0, 8, 2, 10, 4, 12, 6, 14))
10796 return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f64, V1, V2);
10797 if (isShuffleEquivalent(Mask, 1, 9, 3, 11, 5, 13, 7, 15))
10798 return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f64, V1, V2);
10800 // FIXME: Implement direct support for this type!
10801 return splitAndLowerVectorShuffle(DL, MVT::v8f64, V1, V2, Mask, DAG);
10804 /// \brief Handle lowering of 16-lane 32-bit floating point shuffles.
10805 static SDValue lowerV16F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10806 const X86Subtarget *Subtarget,
10807 SelectionDAG &DAG) {
10809 assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
10810 assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
10811 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10812 ArrayRef<int> Mask = SVOp->getMask();
10813 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
10815 // Use dedicated unpack instructions for masks that match their pattern.
10816 if (isShuffleEquivalent(Mask,
10817 0, 16, 1, 17, 4, 20, 5, 21,
10818 8, 24, 9, 25, 12, 28, 13, 29))
10819 return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16f32, V1, V2);
10820 if (isShuffleEquivalent(Mask,
10821 2, 18, 3, 19, 6, 22, 7, 23,
10822 10, 26, 11, 27, 14, 30, 15, 31))
10823 return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16f32, V1, V2);
10825 // FIXME: Implement direct support for this type!
10826 return splitAndLowerVectorShuffle(DL, MVT::v16f32, V1, V2, Mask, DAG);
10829 /// \brief Handle lowering of 8-lane 64-bit integer shuffles.
10830 static SDValue lowerV8I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10831 const X86Subtarget *Subtarget,
10832 SelectionDAG &DAG) {
10834 assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
10835 assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
10836 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10837 ArrayRef<int> Mask = SVOp->getMask();
10838 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
10840 // X86 has dedicated unpack instructions that can handle specific blend
10841 // operations: UNPCKH and UNPCKL.
10842 if (isShuffleEquivalent(Mask, 0, 8, 2, 10, 4, 12, 6, 14))
10843 return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i64, V1, V2);
10844 if (isShuffleEquivalent(Mask, 1, 9, 3, 11, 5, 13, 7, 15))
10845 return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i64, V1, V2);
10847 // FIXME: Implement direct support for this type!
10848 return splitAndLowerVectorShuffle(DL, MVT::v8i64, V1, V2, Mask, DAG);
10851 /// \brief Handle lowering of 16-lane 32-bit integer shuffles.
10852 static SDValue lowerV16I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10853 const X86Subtarget *Subtarget,
10854 SelectionDAG &DAG) {
10856 assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
10857 assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
10858 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10859 ArrayRef<int> Mask = SVOp->getMask();
10860 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
10862 // Use dedicated unpack instructions for masks that match their pattern.
10863 if (isShuffleEquivalent(Mask,
10864 0, 16, 1, 17, 4, 20, 5, 21,
10865 8, 24, 9, 25, 12, 28, 13, 29))
10866 return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i32, V1, V2);
10867 if (isShuffleEquivalent(Mask,
10868 2, 18, 3, 19, 6, 22, 7, 23,
10869 10, 26, 11, 27, 14, 30, 15, 31))
10870 return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i32, V1, V2);
10872 // FIXME: Implement direct support for this type!
10873 return splitAndLowerVectorShuffle(DL, MVT::v16i32, V1, V2, Mask, DAG);
10876 /// \brief Handle lowering of 32-lane 16-bit integer shuffles.
10877 static SDValue lowerV32I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10878 const X86Subtarget *Subtarget,
10879 SelectionDAG &DAG) {
10881 assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
10882 assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
10883 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10884 ArrayRef<int> Mask = SVOp->getMask();
10885 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
10886 assert(Subtarget->hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
10888 // FIXME: Implement direct support for this type!
10889 return splitAndLowerVectorShuffle(DL, MVT::v32i16, V1, V2, Mask, DAG);
10892 /// \brief Handle lowering of 64-lane 8-bit integer shuffles.
10893 static SDValue lowerV64I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10894 const X86Subtarget *Subtarget,
10895 SelectionDAG &DAG) {
10897 assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
10898 assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
10899 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10900 ArrayRef<int> Mask = SVOp->getMask();
10901 assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
10902 assert(Subtarget->hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
10904 // FIXME: Implement direct support for this type!
10905 return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
10908 /// \brief High-level routine to lower various 512-bit x86 vector shuffles.
10910 /// This routine either breaks down the specific type of a 512-bit x86 vector
10911 /// shuffle or splits it into two 256-bit shuffles and fuses the results back
10912 /// together based on the available instructions.
10913 static SDValue lower512BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10914 MVT VT, const X86Subtarget *Subtarget,
10915 SelectionDAG &DAG) {
10917 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10918 ArrayRef<int> Mask = SVOp->getMask();
10919 assert(Subtarget->hasAVX512() &&
10920 "Cannot lower 512-bit vectors w/ basic ISA!");
10922 // Check for being able to broadcast a single element.
10923 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(VT.SimpleTy, DL, V1,
10924 Mask, Subtarget, DAG))
10927 // Dispatch to each element type for lowering. If we don't have support for
10928 // specific element type shuffles at 512 bits, immediately split them and
10929 // lower them. Each lowering routine of a given type is allowed to assume that
10930 // the requisite ISA extensions for that element type are available.
10931 switch (VT.SimpleTy) {
10933 return lowerV8F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
10935 return lowerV16F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
10937 return lowerV8I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
10939 return lowerV16I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
10941 if (Subtarget->hasBWI())
10942 return lowerV32I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
10945 if (Subtarget->hasBWI())
10946 return lowerV64I8VectorShuffle(Op, V1, V2, Subtarget, DAG);
10950 llvm_unreachable("Not a valid 512-bit x86 vector type!");
10953 // Otherwise fall back on splitting.
10954 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
10957 /// \brief Top-level lowering for x86 vector shuffles.
10959 /// This handles decomposition, canonicalization, and lowering of all x86
10960 /// vector shuffles. Most of the specific lowering strategies are encapsulated
10961 /// above in helper routines. The canonicalization attempts to widen shuffles
10962 /// to involve fewer lanes of wider elements, consolidate symmetric patterns
10963 /// s.t. only one of the two inputs needs to be tested, etc.
10964 static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
10965 SelectionDAG &DAG) {
10966 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10967 ArrayRef<int> Mask = SVOp->getMask();
10968 SDValue V1 = Op.getOperand(0);
10969 SDValue V2 = Op.getOperand(1);
10970 MVT VT = Op.getSimpleValueType();
10971 int NumElements = VT.getVectorNumElements();
10974 assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles");
10976 bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
10977 bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
10978 if (V1IsUndef && V2IsUndef)
10979 return DAG.getUNDEF(VT);
10981 // When we create a shuffle node we put the UNDEF node in the second operand,
10982 // but in some cases the first operand may be transformed to UNDEF.
10983 // In this case we should just commute the node.
10985 return DAG.getCommutedVectorShuffle(*SVOp);
10987 // Check for non-undef masks pointing at an undef vector and make the masks
10988 // undef as well. This makes it easier to match the shuffle based solely on the mask.
10992 if (M >= NumElements) {
10993 SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
10994 for (int &M : NewMask)
10995 if (M >= NumElements)
10997 return DAG.getVectorShuffle(VT, dl, V1, V2, NewMask);
11000 // Try to collapse shuffles into using a vector type with fewer elements but
11001 // wider element types. We cap this to not form integers or floating point
11002 // elements wider than 64 bits, but it might be interesting to form i128
11003 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
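// For example, the v4i32 mask <0, 1, 4, 5> pairs up into the v2i64 mask
// <0, 2>, so the shuffle is redone with 64-bit elements when v2i64 is legal.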
11004 SmallVector<int, 16> WidenedMask;
11005 if (VT.getScalarSizeInBits() < 64 &&
11006 canWidenShuffleElements(Mask, WidenedMask)) {
11007 MVT NewEltVT = VT.isFloatingPoint()
11008 ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
11009 : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
11010 MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
11011 // Make sure that the new vector type is legal. For example, v2f64 isn't legal on SSE1.
11013 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
11014 V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, V1);
11015 V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, V2);
11016 return DAG.getNode(ISD::BITCAST, dl, VT,
11017 DAG.getVectorShuffle(NewVT, dl, V1, V2, WidenedMask));
11021 int NumV1Elements = 0, NumUndefElements = 0, NumV2Elements = 0;
11022 for (int M : SVOp->getMask())
11024 ++NumUndefElements;
11025 else if (M < NumElements)
11030 // Commute the shuffle as needed such that more elements come from V1 than
11031 // V2. This allows us to match the shuffle pattern strictly on how many
11032 // elements come from V1 without handling the symmetric cases.
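// For example, the v4i32 mask <4, 5, 0, 6> takes three elements from V2;
// commuting swaps the operands and rewrites the mask to <0, 1, 4, 2>.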
11033 if (NumV2Elements > NumV1Elements)
11034 return DAG.getCommutedVectorShuffle(*SVOp);
11036 // When the number of V1 and V2 elements are the same, try to minimize the
11037 // number of uses of V2 in the low half of the vector. When that is tied,
11038 // ensure that the sum of indices for V1 is equal to or lower than the sum
11039 // of indices for V2. When those are equal, try to ensure that the number of odd
11040 // indices for V1 is lower than the number of odd indices for V2.
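// For example, the v4i32 mask <4, 5, 0, 1> draws both low-half elements from
// V2, so it is commuted to <0, 1, 4, 5> even though the element counts tie.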
11041 if (NumV1Elements == NumV2Elements) {
11042 int LowV1Elements = 0, LowV2Elements = 0;
11043 for (int M : SVOp->getMask().slice(0, NumElements / 2))
11044 if (M >= NumElements)
11048 if (LowV2Elements > LowV1Elements) {
11049 return DAG.getCommutedVectorShuffle(*SVOp);
11050 } else if (LowV2Elements == LowV1Elements) {
11051 int SumV1Indices = 0, SumV2Indices = 0;
11052 for (int i = 0, Size = SVOp->getMask().size(); i < Size; ++i)
11053 if (SVOp->getMask()[i] >= NumElements)
11055 else if (SVOp->getMask()[i] >= 0)
11057 if (SumV2Indices < SumV1Indices) {
11058 return DAG.getCommutedVectorShuffle(*SVOp);
11059 } else if (SumV2Indices == SumV1Indices) {
11060 int NumV1OddIndices = 0, NumV2OddIndices = 0;
11061 for (int i = 0, Size = SVOp->getMask().size(); i < Size; ++i)
11062 if (SVOp->getMask()[i] >= NumElements)
11063 NumV2OddIndices += i % 2;
11064 else if (SVOp->getMask()[i] >= 0)
11065 NumV1OddIndices += i % 2;
11066 if (NumV2OddIndices < NumV1OddIndices)
11067 return DAG.getCommutedVectorShuffle(*SVOp);
11072 // For each vector width, delegate to a specialized lowering routine.
11073 if (VT.getSizeInBits() == 128)
11074 return lower128BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
11076 if (VT.getSizeInBits() == 256)
11077 return lower256BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
11080 // 512-bit AVX-512 vectors get their own lowering routine.
11081 // FIXME: Several element types there still fall back to splitting.
11081 if (VT.getSizeInBits() == 512)
11082 return lower512BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
11084 llvm_unreachable("Unimplemented!");
11088 //===----------------------------------------------------------------------===//
11089 // Legacy vector shuffle lowering
11091 // This code is the legacy code handling vector shuffles until the above
11092 // replaces its functionality and performance.
11093 //===----------------------------------------------------------------------===//
11095 static bool isBlendMask(ArrayRef<int> MaskVals, MVT VT, bool hasSSE41,
11096 bool hasInt256, unsigned *MaskOut = nullptr) {
11097 MVT EltVT = VT.getVectorElementType();
11099 // There is no blend with immediate in AVX-512.
11100 if (VT.is512BitVector())
11103 if (!hasSSE41 || EltVT == MVT::i8)
11105 if (!hasInt256 && VT == MVT::v16i16)
11108 unsigned MaskValue = 0;
11109 unsigned NumElems = VT.getVectorNumElements();
11110 // There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
11111 unsigned NumLanes = (NumElems - 1) / 8 + 1;
11112 unsigned NumElemsInLane = NumElems / NumLanes;
11114 // Blend for v16i16 should be symmetric for both lanes.
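// For example, the v8i16 mask <0, 9, 2, 11, 4, 13, 6, 15> takes the odd
// elements from V2, so MaskValue ends up as 0b10101010 (0xAA).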
11115 for (unsigned i = 0; i < NumElemsInLane; ++i) {
11117 int SndLaneEltIdx = (NumLanes == 2) ? MaskVals[i + NumElemsInLane] : -1;
11118 int EltIdx = MaskVals[i];
11120 if ((EltIdx < 0 || EltIdx == (int)i) &&
11121 (SndLaneEltIdx < 0 || SndLaneEltIdx == (int)(i + NumElemsInLane)))
11124 if (((unsigned)EltIdx == (i + NumElems)) &&
11125 (SndLaneEltIdx < 0 ||
11126 (unsigned)SndLaneEltIdx == i + NumElems + NumElemsInLane))
11127 MaskValue |= (1 << i);
11133 *MaskOut = MaskValue;
11137 // Try to lower a shuffle node into a simple blend instruction.
11138 // This function assumes isBlendMask returns true for this
11139 // ShuffleVectorSDNode.
11140 static SDValue LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp,
11141 unsigned MaskValue,
11142 const X86Subtarget *Subtarget,
11143 SelectionDAG &DAG) {
11144 MVT VT = SVOp->getSimpleValueType(0);
11145 MVT EltVT = VT.getVectorElementType();
11146 assert(isBlendMask(SVOp->getMask(), VT, Subtarget->hasSSE41(),
11147 Subtarget->hasInt256()) &&
11148 "Trying to lower a VECTOR_SHUFFLE to a Blend but "
11149 "with the wrong mask");
11150 SDValue V1 = SVOp->getOperand(0);
11151 SDValue V2 = SVOp->getOperand(1);
11153 unsigned NumElems = VT.getVectorNumElements();
11155 // Convert i32 vectors to floating point if it is not AVX2.
11156 // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors.
11158 if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) {
11159 BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()),
11161 V1 = DAG.getNode(ISD::BITCAST, dl, BlendVT, V1);
11162 V2 = DAG.getNode(ISD::BITCAST, dl, BlendVT, V2);
11165 SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, V1, V2,
11166 DAG.getConstant(MaskValue, MVT::i32));
11167 return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
11170 /// In vector type \p VT, return true if the element at index \p InputIdx
11171 /// falls on a different 128-bit lane than \p OutputIdx.
11172 static bool ShuffleCrosses128bitLane(MVT VT, unsigned InputIdx,
11173 unsigned OutputIdx) {
11174 unsigned EltSize = VT.getVectorElementType().getSizeInBits();
11175 return InputIdx * EltSize / 128 != OutputIdx * EltSize / 128;
11178 /// Generate a PSHUFB if possible. Selects elements from \p V1 according to
11179 /// \p MaskVals. MaskVals[OutputIdx] = InputIdx specifies that we want to
11180 /// shuffle the element at InputIdx in V1 to OutputIdx in the result. If \p
11181 /// MaskVals refers to elements outside of \p V1 or is undef (-1), insert a zero in the result.
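/// For example, for a v8i16 shuffle, MaskVals[0] == 3 produces the control
/// bytes 6 and 7 for the first output word, while an undef or out-of-range
/// index produces 0x80, 0x80, which zeroes that word.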
11183 static SDValue getPSHUFB(ArrayRef<int> MaskVals, SDValue V1, SDLoc &dl,
11184 SelectionDAG &DAG) {
11185 MVT VT = V1.getSimpleValueType();
11186 assert(VT.is128BitVector() || VT.is256BitVector());
11188 MVT EltVT = VT.getVectorElementType();
11189 unsigned EltSizeInBytes = EltVT.getSizeInBits() / 8;
11190 unsigned NumElts = VT.getVectorNumElements();
11192 SmallVector<SDValue, 32> PshufbMask;
11193 for (unsigned OutputIdx = 0; OutputIdx < NumElts; ++OutputIdx) {
11194 int InputIdx = MaskVals[OutputIdx];
11195 unsigned InputByteIdx;
11197 if (InputIdx < 0 || NumElts <= (unsigned)InputIdx)
11198 InputByteIdx = 0x80;
11200 // Crossing 128-bit lanes is not allowed.
11201 if (ShuffleCrosses128bitLane(VT, InputIdx, OutputIdx))
11203 InputByteIdx = InputIdx * EltSizeInBytes;
11204 // Index is a byte offset within the 128-bit lane.
11205 InputByteIdx &= 0xf;
11208 for (unsigned j = 0; j < EltSizeInBytes; ++j) {
11209 PshufbMask.push_back(DAG.getConstant(InputByteIdx, MVT::i8));
11210 if (InputByteIdx != 0x80)
11215 MVT ShufVT = MVT::getVectorVT(MVT::i8, PshufbMask.size());
11217 V1 = DAG.getNode(ISD::BITCAST, dl, ShufVT, V1);
11218 return DAG.getNode(X86ISD::PSHUFB, dl, ShufVT, V1,
11219 DAG.getNode(ISD::BUILD_VECTOR, dl, ShufVT, PshufbMask));
11222 // v8i16 shuffles - Prefer shuffles in the following order:
11223 // 1. [all] pshuflw, pshufhw, optional move
11224 // 2. [ssse3] 1 x pshufb
11225 // 3. [ssse3] 2 x pshufb + 1 x por
11226 // 4. [all] mov + pshuflw + pshufhw + N x (pextrw + pinsrw)
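// For example, with V2 undef, the mask <3, 2, 1, 0, 7, 6, 5, 4> keeps every
// word inside its own quadword and is lowered as one pshuflw followed by one
// pshufhw (case 1).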
11228 LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget,
11229 SelectionDAG &DAG) {
11230 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
11231 SDValue V1 = SVOp->getOperand(0);
11232 SDValue V2 = SVOp->getOperand(1);
11234 SmallVector<int, 8> MaskVals;
11236 // Determine if more than 1 of the words in each of the low and high quadwords
11237 // of the result come from the same quadword of one of the two inputs. Undef
11238 // mask values count as coming from any quadword, for better codegen.
11240 // LoQuad[i] / HiQuad[i] counts how many words of the low / high half of the
11241 // result come from quad i of the inputs. For i, 0 and 1 refer to V1; 2 and 3 refer to V2.
11242 unsigned LoQuad[] = { 0, 0, 0, 0 };
11243 unsigned HiQuad[] = { 0, 0, 0, 0 };
11244 // Indices of quads used.
11245 std::bitset<4> InputQuads;
11246 for (unsigned i = 0; i < 8; ++i) {
11247 unsigned *Quad = i < 4 ? LoQuad : HiQuad;
11248 int EltIdx = SVOp->getMaskElt(i);
11249 MaskVals.push_back(EltIdx);
11257 ++Quad[EltIdx / 4];
11258 InputQuads.set(EltIdx / 4);
11261 int BestLoQuad = -1;
11262 unsigned MaxQuad = 1;
11263 for (unsigned i = 0; i < 4; ++i) {
11264 if (LoQuad[i] > MaxQuad) {
11266 MaxQuad = LoQuad[i];
11270 int BestHiQuad = -1;
11272 for (unsigned i = 0; i < 4; ++i) {
11273 if (HiQuad[i] > MaxQuad) {
11275 MaxQuad = HiQuad[i];
11279 // For SSSE3, if all 8 words of the result come from only 1 quadword of each
11280 // of the two input vectors, shuffle them into one input vector so only a
11281 // single pshufb instruction is necessary. If there are more than 2 input
11282 // quads, disable the next transformation since it does not help SSSE3.
11283 bool V1Used = InputQuads[0] || InputQuads[1];
11284 bool V2Used = InputQuads[2] || InputQuads[3];
11285 if (Subtarget->hasSSSE3()) {
11286 if (InputQuads.count() == 2 && V1Used && V2Used) {
11287 BestLoQuad = InputQuads[0] ? 0 : 1;
11288 BestHiQuad = InputQuads[2] ? 2 : 3;
11290 if (InputQuads.count() > 2) {
11296 // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update
11297 // the shuffle mask. If a quad is scored as -1, that means that it contains
11298 // words from all 4 input quadwords.
11300 if (BestLoQuad >= 0 || BestHiQuad >= 0) {
11302 BestLoQuad < 0 ? 0 : BestLoQuad,
11303 BestHiQuad < 0 ? 1 : BestHiQuad
11305 NewV = DAG.getVectorShuffle(MVT::v2i64, dl,
11306 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1),
11307 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2), &MaskV[0]);
11308 NewV = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, NewV);
11310 // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the
11311 // source words for the shuffle, to aid later transformations.
11312 bool AllWordsInNewV = true;
11313 bool InOrder[2] = { true, true };
11314 for (unsigned i = 0; i != 8; ++i) {
11315 int idx = MaskVals[i];
11317 InOrder[i/4] = false;
11318 if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad)
11320 AllWordsInNewV = false;
11324 bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV;
11325 if (AllWordsInNewV) {
11326 for (int i = 0; i != 8; ++i) {
11327 int idx = MaskVals[i];
11330 idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4;
11331 if ((idx != i) && idx < 4)
11333 if ((idx != i) && idx > 3)
11342 // If we've eliminated the use of V2, and the new mask is a pshuflw or
11343 // pshufhw, that's as cheap as it gets. Return the new shuffle.
11344 if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) {
11345 unsigned Opc = pshufhw ? X86ISD::PSHUFHW : X86ISD::PSHUFLW;
11346 unsigned TargetMask = 0;
11347 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV,
11348 DAG.getUNDEF(MVT::v8i16), &MaskVals[0]);
11349 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
11350 TargetMask = pshufhw ? getShufflePSHUFHWImmediate(SVOp):
11351 getShufflePSHUFLWImmediate(SVOp);
11352 V1 = NewV.getOperand(0);
11353 return getTargetShuffleNode(Opc, dl, MVT::v8i16, V1, TargetMask, DAG);
11357 // Promote splats to a larger type which usually leads to more efficient code.
11358 // FIXME: Is this true if pshufb is available?
11359 if (SVOp->isSplat())
11360 return PromoteSplat(SVOp, DAG);
11362 // If we have SSSE3, and all words of the result are from 1 input vector,
11363 // case 2 is generated, otherwise case 3 is generated. If no SSSE3
11364 // is present, fall back to case 4.
11365 if (Subtarget->hasSSSE3()) {
11366 SmallVector<SDValue,16> pshufbMask;
11368 // If we have elements from both input vectors, set the high bit of the
11369 // shuffle mask element to zero out elements that come from V2 in the V1
11370 // mask, and elements that come from V1 in the V2 mask, so that the two
11371 // results can be OR'd together.
11372 bool TwoInputs = V1Used && V2Used;
11373 V1 = getPSHUFB(MaskVals, V1, dl, DAG);
11375 return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
11377 // Calculate the shuffle mask for the second input, shuffle it, and
11378 // OR it with the first shuffled input.
11379 CommuteVectorShuffleMask(MaskVals, 8);
11380 V2 = getPSHUFB(MaskVals, V2, dl, DAG);
11381 V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
11382 return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
11385 // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order,
11386 // and update MaskVals with new element order.
11387 std::bitset<8> InOrder;
11388 if (BestLoQuad >= 0) {
11389 int MaskV[] = { -1, -1, -1, -1, 4, 5, 6, 7 };
11390 for (int i = 0; i != 4; ++i) {
11391 int idx = MaskVals[i];
11394 } else if ((idx / 4) == BestLoQuad) {
11395 MaskV[i] = idx & 3;
11399 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
11402 if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSE2()) {
11403 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
11404 NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16,
11405 NewV.getOperand(0),
11406 getShufflePSHUFLWImmediate(SVOp), DAG);
11410 // If BestHiQuad >= 0, generate a pshufhw to put the high elements in order,
11411 // and update MaskVals with the new element order.
11412 if (BestHiQuad >= 0) {
11413 int MaskV[] = { 0, 1, 2, 3, -1, -1, -1, -1 };
11414 for (unsigned i = 4; i != 8; ++i) {
11415 int idx = MaskVals[i];
11418 } else if ((idx / 4) == BestHiQuad) {
11419 MaskV[i] = (idx & 3) + 4;
11423 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
11426 if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSE2()) {
11427 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
11428 NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16,
11429 NewV.getOperand(0),
11430 getShufflePSHUFHWImmediate(SVOp), DAG);
11434 // In case BestHi & BestLo were both -1, which means each quadword has a word
11435 // from each of the four input quadwords, calculate the InOrder bitvector now
11436 // before falling through to the insert/extract cleanup.
11437 if (BestLoQuad == -1 && BestHiQuad == -1) {
11439 for (int i = 0; i != 8; ++i)
11440 if (MaskVals[i] < 0 || MaskVals[i] == i)
11444 // The other elements are put in the right place using pextrw and pinsrw.
11445 for (unsigned i = 0; i != 8; ++i) {
11448 int EltIdx = MaskVals[i];
11451 SDValue ExtOp = (EltIdx < 8) ?
11452 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1,
11453 DAG.getIntPtrConstant(EltIdx)) :
11454 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2,
11455 DAG.getIntPtrConstant(EltIdx - 8));
11456 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp,
11457 DAG.getIntPtrConstant(i));
11462 /// \brief v16i16 shuffles
11464 /// FIXME: We only support generation of a single pshufb currently. We can
11465 /// generalize the other applicable cases from LowerVECTOR_SHUFFLEv8i16 as
11466 /// well (e.g. 2 x pshufb + 1 x por).
11468 LowerVECTOR_SHUFFLEv16i16(SDValue Op, SelectionDAG &DAG) {
11469 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
11470 SDValue V1 = SVOp->getOperand(0);
11471 SDValue V2 = SVOp->getOperand(1);
11474 if (V2.getOpcode() != ISD::UNDEF)
11477 SmallVector<int, 16> MaskVals(SVOp->getMask().begin(), SVOp->getMask().end());
11478 return getPSHUFB(MaskVals, V1, dl, DAG);
11481 // v16i8 shuffles - Prefer shuffles in the following order:
11482 // 1. [ssse3] 1 x pshufb
11483 // 2. [ssse3] 2 x pshufb + 1 x por
11484 // 3. [all] v8i16 shuffle + N x pextrw + rotate + pinsrw
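// For example, with SSSE3 and all result bytes taken from V1, the
// byte-reverse mask <15, 14, ..., 0> becomes a single pshufb whose control
// vector is that same mask (case 1).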
11485 static SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
11486 const X86Subtarget* Subtarget,
11487 SelectionDAG &DAG) {
11488 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
11489 SDValue V1 = SVOp->getOperand(0);
11490 SDValue V2 = SVOp->getOperand(1);
11492 ArrayRef<int> MaskVals = SVOp->getMask();
11494 // Promote splats to a larger type which usually leads to more efficient code.
11495 // FIXME: Is this true if pshufb is available?
11496 if (SVOp->isSplat())
11497 return PromoteSplat(SVOp, DAG);
11499 // If we have SSSE3, case 1 is generated when all result bytes come from
11500 // one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is
11501 // present, fall back to case 3.
11503 // With SSSE3, use one pshufb per input vector that contributes elements to the result.
11504 if (Subtarget->hasSSSE3()) {
11505 SmallVector<SDValue,16> pshufbMask;
11507 // If all result elements are from one input vector, then only translate
11508 // undef mask values to 0x80 (zero out result) in the pshufb mask.
11510 // Otherwise, we have elements from both input vectors, and must zero out
11511 // elements that come from V2 in the first mask, and V1 in the second mask
11512 // so that we can OR them together.
11513 for (unsigned i = 0; i != 16; ++i) {
11514 int EltIdx = MaskVals[i];
11515 if (EltIdx < 0 || EltIdx >= 16)
11517 pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
11519 V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
11520 DAG.getNode(ISD::BUILD_VECTOR, dl,
11521 MVT::v16i8, pshufbMask));
11523 // As PSHUFB will zero elements with negative indices, it's safe to ignore
11524 // the 2nd operand if it's undefined or zero.
11525 if (V2.getOpcode() == ISD::UNDEF ||
11526 ISD::isBuildVectorAllZeros(V2.getNode()))
11529 // Calculate the shuffle mask for the second input, shuffle it, and
11530 // OR it with the first shuffled input.
11531 pshufbMask.clear();
11532 for (unsigned i = 0; i != 16; ++i) {
11533 int EltIdx = MaskVals[i];
11534 EltIdx = (EltIdx < 16) ? 0x80 : EltIdx - 16;
11535 pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
11537 V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
11538 DAG.getNode(ISD::BUILD_VECTOR, dl,
11539 MVT::v16i8, pshufbMask));
11540 return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
11543 // No SSSE3 - calculate the in-place words, then fix all out-of-place words
11544 // with 0-16 extracts & inserts. Worst case is 16 bytes out of order from
11545 // the 16 different words that comprise the two doublequadword input vectors.
11546 V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
11547 V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2);
11549 for (int i = 0; i != 8; ++i) {
11550 int Elt0 = MaskVals[i*2];
11551 int Elt1 = MaskVals[i*2+1];
11553 // This word of the result is all undef, skip it.
11554 if (Elt0 < 0 && Elt1 < 0)
11557 // This word of the result is already in the correct place, skip it.
11558 if ((Elt0 == i*2) && (Elt1 == i*2+1))
11561 SDValue Elt0Src = Elt0 < 16 ? V1 : V2;
11562 SDValue Elt1Src = Elt1 < 16 ? V1 : V2;
11565 // If Elt0 and Elt1 are defined, are consecutive, and can be loaded
11566 // together using a single extract, load it and store it.
11567 if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) {
11568 InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
11569 DAG.getIntPtrConstant(Elt1 / 2));
11570 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
11571 DAG.getIntPtrConstant(i));
11575 // If Elt1 is defined, extract it from the appropriate source. If the
11576 // source byte is not also odd, shift the extracted word left 8 bits;
11577 // otherwise clear the bottom 8 bits if we need to do an OR.
11579 InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
11580 DAG.getIntPtrConstant(Elt1 / 2));
11581 if ((Elt1 & 1) == 0)
11582 InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt,
11584 TLI.getShiftAmountTy(InsElt.getValueType())));
11585 else if (Elt0 >= 0)
11586 InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt,
11587 DAG.getConstant(0xFF00, MVT::i16));
11589 // If Elt0 is defined, extract it from the appropriate source. If the
11590 // source byte is not also even, shift the extracted word right 8 bits. If
11591 // Elt1 was also defined, OR the extracted values together before
11592 // inserting them in the result.
11594 SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
11595 Elt0Src, DAG.getIntPtrConstant(Elt0 / 2));
11596 if ((Elt0 & 1) != 0)
11597 InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0,
11599 TLI.getShiftAmountTy(InsElt0.getValueType())));
11600 else if (Elt1 >= 0)
11601 InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0,
11602 DAG.getConstant(0x00FF, MVT::i16));
11603 InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0)
11606 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
11607 DAG.getIntPtrConstant(i));
11609 return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, NewV);
11612 // v32i8 shuffles - Translate to VPSHUFB if possible.
11614 SDValue LowerVECTOR_SHUFFLEv32i8(ShuffleVectorSDNode *SVOp,
11615 const X86Subtarget *Subtarget,
11616 SelectionDAG &DAG) {
11617 MVT VT = SVOp->getSimpleValueType(0);
11618 SDValue V1 = SVOp->getOperand(0);
11619 SDValue V2 = SVOp->getOperand(1);
11621 SmallVector<int, 32> MaskVals(SVOp->getMask().begin(), SVOp->getMask().end());
11623 bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
11624 bool V1IsAllZero = ISD::isBuildVectorAllZeros(V1.getNode());
11625 bool V2IsAllZero = ISD::isBuildVectorAllZeros(V2.getNode());
11627 // VPSHUFB may be generated if
11628 // (1) one of the input vectors is undefined or zeroinitializer.
11629 // The mask value 0x80 puts 0 in the corresponding slot of the vector.
11630 // And (2) the mask indexes don't cross 128-bit lanes.
11631 if (VT != MVT::v32i8 || !Subtarget->hasInt256() ||
11632 (!V2IsUndef && !V2IsAllZero && !V1IsAllZero))
11635 if (V1IsAllZero && !V2IsAllZero) {
11636 CommuteVectorShuffleMask(MaskVals, 32);
11639 return getPSHUFB(MaskVals, V1, dl, DAG);
11642 /// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide
11643 /// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be
11644 /// done when every pair / quad of shuffle mask elements point to elements in
11645 /// the right sequence. e.g.
11646 /// vector_shuffle X, Y, <2, 3, | 10, 11, | 0, 1, | 14, 15>
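/// which can be rewritten as the v4i32 shuffle <1, 5, 0, 7>.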
11648 SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
11649 SelectionDAG &DAG) {
11650 MVT VT = SVOp->getSimpleValueType(0);
11652 unsigned NumElems = VT.getVectorNumElements();
11655 switch (VT.SimpleTy) {
11656 default: llvm_unreachable("Unexpected!");
11659 return SDValue(SVOp, 0);
11660 case MVT::v4f32: NewVT = MVT::v2f64; Scale = 2; break;
11661 case MVT::v4i32: NewVT = MVT::v2i64; Scale = 2; break;
11662 case MVT::v8i16: NewVT = MVT::v4i32; Scale = 2; break;
11663 case MVT::v16i8: NewVT = MVT::v4i32; Scale = 4; break;
11664 case MVT::v16i16: NewVT = MVT::v8i32; Scale = 2; break;
11665 case MVT::v32i8: NewVT = MVT::v8i32; Scale = 4; break;
11668 SmallVector<int, 8> MaskVec;
11669 for (unsigned i = 0; i != NumElems; i += Scale) {
11671 for (unsigned j = 0; j != Scale; ++j) {
11672 int EltIdx = SVOp->getMaskElt(i+j);
11676 StartIdx = (EltIdx / Scale);
11677 if (EltIdx != (int)(StartIdx*Scale + j))
11680 MaskVec.push_back(StartIdx);
11683 SDValue V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(0));
11684 SDValue V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(1));
11685 return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]);
11688 /// getVZextMovL - Return a zero-extending vector move low node.
11690 static SDValue getVZextMovL(MVT VT, MVT OpVT,
11691 SDValue SrcOp, SelectionDAG &DAG,
11692 const X86Subtarget *Subtarget, SDLoc dl) {
11693 if (VT == MVT::v2f64 || VT == MVT::v4f32) {
11694 LoadSDNode *LD = nullptr;
11695 if (!isScalarLoadToVector(SrcOp.getNode(), &LD))
11696 LD = dyn_cast<LoadSDNode>(SrcOp);
11698 // movssrr and movsdrr do not clear top bits. Try to use movd, movq
11700 MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32;
11701 if ((ExtVT != MVT::i64 || Subtarget->is64Bit()) &&
11702 SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR &&
11703 SrcOp.getOperand(0).getOpcode() == ISD::BITCAST &&
11704 SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) {
11706 OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32;
11707 return DAG.getNode(ISD::BITCAST, dl, VT,
11708 DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
11709 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
11711 SrcOp.getOperand(0)
11717 return DAG.getNode(ISD::BITCAST, dl, VT,
11718 DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
11719 DAG.getNode(ISD::BITCAST, dl,
11723 /// LowerVECTOR_SHUFFLE_256 - Handle all 256-bit wide vector shuffles
11724 /// which could not be matched by any known target-specific shuffle
11726 LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
11728 SDValue NewOp = Compact8x32ShuffleNode(SVOp, DAG);
11729 if (NewOp.getNode())
11732 MVT VT = SVOp->getSimpleValueType(0);
11734 unsigned NumElems = VT.getVectorNumElements();
11735 unsigned NumLaneElems = NumElems / 2;
11738 MVT EltVT = VT.getVectorElementType();
11739 MVT NVT = MVT::getVectorVT(EltVT, NumLaneElems);
11742 SmallVector<int, 16> Mask;
11743 for (unsigned l = 0; l < 2; ++l) {
11744 // Build a shuffle mask for the output, discovering on the fly which
11745 // input vectors to use as shuffle operands (recorded in InputUsed).
11746 // If building a suitable shuffle vector proves too hard, then bail
11747 // out with UseBuildVector set.
11748 bool UseBuildVector = false;
11749 int InputUsed[2] = { -1, -1 }; // Not yet discovered.
11750 unsigned LaneStart = l * NumLaneElems;
11751 for (unsigned i = 0; i != NumLaneElems; ++i) {
11752 // The mask element. This indexes into the input.
11753 int Idx = SVOp->getMaskElt(i+LaneStart);
11755 // the mask element does not index into any input vector.
11756 Mask.push_back(-1);
11760 // The input vector this mask element indexes into.
11761 int Input = Idx / NumLaneElems;
11763 // Turn the index into an offset from the start of the input vector.
11764 Idx -= Input * NumLaneElems;
11766 // Find or create a shuffle vector operand to hold this input.
11768 for (OpNo = 0; OpNo < array_lengthof(InputUsed); ++OpNo) {
11769 if (InputUsed[OpNo] == Input)
11770 // This input vector is already an operand.
11772 if (InputUsed[OpNo] < 0) {
11773 // Create a new operand for this input vector.
11774 InputUsed[OpNo] = Input;
11779 if (OpNo >= array_lengthof(InputUsed)) {
11780 // More than two input vectors used! Give up on trying to create a
11781 // shuffle vector. Insert all elements into a BUILD_VECTOR instead.
11782 UseBuildVector = true;
11786 // Add the mask index for the new shuffle vector.
11787 Mask.push_back(Idx + OpNo * NumLaneElems);
11790 if (UseBuildVector) {
11791 SmallVector<SDValue, 16> SVOps;
11792 for (unsigned i = 0; i != NumLaneElems; ++i) {
11793 // The mask element. This indexes into the input.
11794 int Idx = SVOp->getMaskElt(i+LaneStart);
11796 SVOps.push_back(DAG.getUNDEF(EltVT));
11800 // The input vector this mask element indexes into.
11801 int Input = Idx / NumElems;
11803 // Turn the index into an offset from the start of the input vector.
11804 Idx -= Input * NumElems;
11806 // Extract the vector element by hand.
11807 SVOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
11808 SVOp->getOperand(Input),
11809 DAG.getIntPtrConstant(Idx)));
11812 // Construct the output using a BUILD_VECTOR.
11813 Output[l] = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, SVOps);
11814 } else if (InputUsed[0] < 0) {
11815 // No input vectors were used! The result is undefined.
11816 Output[l] = DAG.getUNDEF(NVT);
11818 SDValue Op0 = Extract128BitVector(SVOp->getOperand(InputUsed[0] / 2),
11819 (InputUsed[0] % 2) * NumLaneElems,
11821 // If only one input was used, use an undefined vector for the other.
11822 SDValue Op1 = (InputUsed[1] < 0) ? DAG.getUNDEF(NVT) :
11823 Extract128BitVector(SVOp->getOperand(InputUsed[1] / 2),
11824 (InputUsed[1] % 2) * NumLaneElems, DAG, dl);
11825 // At least one input vector was used. Create a new shuffle vector.
11826 Output[l] = DAG.getVectorShuffle(NVT, dl, Op0, Op1, &Mask[0]);
11832 // Concatenate the result back
11833 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Output[0], Output[1]);
11836 /// LowerVECTOR_SHUFFLE_128v4 - Handle all 128-bit wide vectors with
11837 /// 4 elements, and match them with several different shuffle types.
11839 LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
11840 SDValue V1 = SVOp->getOperand(0);
11841 SDValue V2 = SVOp->getOperand(1);
11843 MVT VT = SVOp->getSimpleValueType(0);
11845 assert(VT.is128BitVector() && "Unsupported vector size");
11847 std::pair<int, int> Locs[4];
11848 int Mask1[] = { -1, -1, -1, -1 };
11849 SmallVector<int, 8> PermMask(SVOp->getMask().begin(), SVOp->getMask().end());
11851 unsigned NumHi = 0;
11852 unsigned NumLo = 0;
11853 for (unsigned i = 0; i != 4; ++i) {
11854 int Idx = PermMask[i];
11856 Locs[i] = std::make_pair(-1, -1);
11858 assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!");
11860 Locs[i] = std::make_pair(0, NumLo);
11861 Mask1[NumLo] = Idx;
11864 Locs[i] = std::make_pair(1, NumHi);
11866 Mask1[2+NumHi] = Idx;
11872 if (NumLo <= 2 && NumHi <= 2) {
11873 // If no more than two elements come from either vector, this can be
11874 // implemented with two shuffles. The first shuffle gathers the elements.
11875 // The second shuffle, which takes the first shuffle as both of its
11876 // vector operands, puts the elements into the right order.
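// For example, for the v4f32 mask <0, 4, 1, 5>, the first shuffle uses mask
// <0, 1, 4, 5> to gather <V1[0], V1[1], V2[0], V2[1]>, and the second shuffle
// of that result with itself uses mask <0, 2, 5, 7> to produce the final
// order.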
11877 V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
11879 int Mask2[] = { -1, -1, -1, -1 };
11881 for (unsigned i = 0; i != 4; ++i)
11882 if (Locs[i].first != -1) {
11883 unsigned Idx = (i < 2) ? 0 : 4;
11884 Idx += Locs[i].first * 2 + Locs[i].second;
11888 return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]);
11891 if (NumLo == 3 || NumHi == 3) {
11892 // Otherwise, we must have three elements from one vector, call it X, and
11893 // one element from the other, call it Y. First, use a shufps to build an
11894 // intermediate vector with the one element from Y and the element from X
11895 // that will be in the same half in the final destination (the indexes don't
11896 // matter). Then, use a shufps to build the final vector, taking the half
11897 // containing the element from Y from the intermediate, and the other half from X.
11900 // Normalize it so the 3 elements come from V1.
11901 CommuteVectorShuffleMask(PermMask, 4);
11905 // Find the element from V2.
11907 for (HiIndex = 0; HiIndex < 3; ++HiIndex) {
11908 int Val = PermMask[HiIndex];
11915 Mask1[0] = PermMask[HiIndex];
11917 Mask1[2] = PermMask[HiIndex^1];
11919 V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
11921 if (HiIndex >= 2) {
11922 Mask1[0] = PermMask[0];
11923 Mask1[1] = PermMask[1];
11924 Mask1[2] = HiIndex & 1 ? 6 : 4;
11925 Mask1[3] = HiIndex & 1 ? 4 : 6;
11926 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
11929 Mask1[0] = HiIndex & 1 ? 2 : 0;
11930 Mask1[1] = HiIndex & 1 ? 0 : 2;
11931 Mask1[2] = PermMask[2];
11932 Mask1[3] = PermMask[3];
11937 return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]);
11940 // Break it into (shuffle shuffle_hi, shuffle_lo).
11941 int LoMask[] = { -1, -1, -1, -1 };
11942 int HiMask[] = { -1, -1, -1, -1 };
11944 int *MaskPtr = LoMask;
11945 unsigned MaskIdx = 0;
11946 unsigned LoIdx = 0;
11947 unsigned HiIdx = 2;
11948 for (unsigned i = 0; i != 4; ++i) {
11955 int Idx = PermMask[i];
11957 Locs[i] = std::make_pair(-1, -1);
11958 } else if (Idx < 4) {
11959 Locs[i] = std::make_pair(MaskIdx, LoIdx);
11960 MaskPtr[LoIdx] = Idx;
11963 Locs[i] = std::make_pair(MaskIdx, HiIdx);
11964 MaskPtr[HiIdx] = Idx;
11969 SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]);
11970 SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]);
11971 int MaskOps[] = { -1, -1, -1, -1 };
11972 for (unsigned i = 0; i != 4; ++i)
11973 if (Locs[i].first != -1)
11974 MaskOps[i] = Locs[i].first * 4 + Locs[i].second;
11975 return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]);
11978 static bool MayFoldVectorLoad(SDValue V) {
11979 while (V.hasOneUse() && V.getOpcode() == ISD::BITCAST)
11980 V = V.getOperand(0);
11982 if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR)
11983 V = V.getOperand(0);
11984 if (V.hasOneUse() && V.getOpcode() == ISD::BUILD_VECTOR &&
11985 V.getNumOperands() == 2 && V.getOperand(1).getOpcode() == ISD::UNDEF)
11986 // BUILD_VECTOR (load), undef
11987 V = V.getOperand(0);
11989 return MayFoldLoad(V);
11993 SDValue getMOVDDup(SDValue &Op, SDLoc &dl, SDValue V1, SelectionDAG &DAG) {
11994 MVT VT = Op.getSimpleValueType();
11996 // Canonicalize to v2f64.
11997 V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1);
11998 return DAG.getNode(ISD::BITCAST, dl, VT,
11999 getTargetShuffleNode(X86ISD::MOVDDUP, dl, MVT::v2f64,
12004 SDValue getMOVLowToHigh(SDValue &Op, SDLoc &dl, SelectionDAG &DAG,
12006 SDValue V1 = Op.getOperand(0);
12007 SDValue V2 = Op.getOperand(1);
12008 MVT VT = Op.getSimpleValueType();
12010 assert(VT != MVT::v2i64 && "unsupported shuffle type");
12012 if (HasSSE2 && VT == MVT::v2f64)
12013 return getTargetShuffleNode(X86ISD::MOVLHPD, dl, VT, V1, V2, DAG);
12015 // v4f32 or v4i32: canonicalized to v4f32 (which is legal for SSE1)
12016 return DAG.getNode(ISD::BITCAST, dl, VT,
12017 getTargetShuffleNode(X86ISD::MOVLHPS, dl, MVT::v4f32,
12018 DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V1),
12019 DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V2), DAG));
12023 SDValue getMOVHighToLow(SDValue &Op, SDLoc &dl, SelectionDAG &DAG) {
12024 SDValue V1 = Op.getOperand(0);
12025 SDValue V2 = Op.getOperand(1);
12026 MVT VT = Op.getSimpleValueType();
12028 assert((VT == MVT::v4i32 || VT == MVT::v4f32) &&
12029 "unsupported shuffle type");
12031 if (V2.getOpcode() == ISD::UNDEF)
12035 return getTargetShuffleNode(X86ISD::MOVHLPS, dl, VT, V1, V2, DAG);
12039 SDValue getMOVLP(SDValue &Op, SDLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
12040 SDValue V1 = Op.getOperand(0);
12041 SDValue V2 = Op.getOperand(1);
12042 MVT VT = Op.getSimpleValueType();
12043 unsigned NumElems = VT.getVectorNumElements();
12045 // Use MOVLPS and MOVLPD in case V1 or V2 are loads. During isel, the second
12046 // operand of these instructions is only memory, so check if there's a
12047 // potential load folding here, otherwise use SHUFPS or MOVSD to match the
12049 bool CanFoldLoad = false;
12051 // Trivial case, when V2 comes from a load.
12052 if (MayFoldVectorLoad(V2))
12053 CanFoldLoad = true;
12055 // When V1 is a load, it can be folded later into a store in isel, example:
12056 // (store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), addr:$src1)
12058 // (MOVLPSmr addr:$src1, VR128:$src2)
12059 // So, recognize this potential and also use MOVLPS or MOVLPD
12060 else if (MayFoldVectorLoad(V1) && MayFoldIntoStore(Op))
12061 CanFoldLoad = true;
12063 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
12065 if (HasSSE2 && NumElems == 2)
12066 return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG);
12069 // If we don't care about the second element, proceed to use movss.
12070 if (SVOp->getMaskElt(1) != -1)
12071 return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG);
12074 // movl and movlp will both match v2i64, but v2i64 is never matched by
12075 // movl earlier because we make it strict to avoid messing with the movlp load
12076 // folding logic (see the code above getMOVLP call). Match it here then,
12077 // this is horrible, but will stay like this until we move all shuffle
12078 // matching to x86 specific nodes. Note that for the 1st condition all
12079 // types are matched with movsd.
12081 // FIXME: isMOVLMask should be checked and matched before getMOVLP,
12082 // so as to remove this logic from here as much as possible.
12083 if (NumElems == 2 || !isMOVLMask(SVOp->getMask(), VT))
12084 return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
12085 return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
12088 assert(VT != MVT::v4i32 && "unsupported shuffle type");
12090 // Invert the operand order and use SHUFPS to match it.
12091 return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V2, V1,
12092 getShuffleSHUFImmediate(SVOp), DAG);
12095 static SDValue NarrowVectorLoadToElement(LoadSDNode *Load, unsigned Index,
12096 SelectionDAG &DAG) {
12098 MVT VT = Load->getSimpleValueType(0);
12099 MVT EVT = VT.getVectorElementType();
12100 SDValue Addr = Load->getOperand(1);
12101 SDValue NewAddr = DAG.getNode(
12102 ISD::ADD, dl, Addr.getSimpleValueType(), Addr,
12103 DAG.getConstant(Index * EVT.getStoreSize(), Addr.getSimpleValueType()));
12106 DAG.getLoad(EVT, dl, Load->getChain(), NewAddr,
12107 DAG.getMachineFunction().getMachineMemOperand(
12108 Load->getMemOperand(), 0, EVT.getStoreSize()));
12112 // It is only safe to call this function if isINSERTPSMask is true for
12113 // this shufflevector mask.
12114 static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl,
12115 SelectionDAG &DAG) {
12116 // Generate an insertps instruction when inserting an f32 from memory onto a
12117 // v4f32 or when copying a member from one v4f32 to another.
12118 // We also use it for transferring i32 from one register to another,
12119 // since it simply copies the same bits.
12120 // If we're transferring an i32 from memory to a specific element in a
12121 // register, we output a generic DAG that will match the PINSRD
12123 MVT VT = SVOp->getSimpleValueType(0);
12124 MVT EVT = VT.getVectorElementType();
12125 SDValue V1 = SVOp->getOperand(0);
12126 SDValue V2 = SVOp->getOperand(1);
12127 auto Mask = SVOp->getMask();
12128 assert((VT == MVT::v4f32 || VT == MVT::v4i32) &&
12129 "unsupported vector type for insertps/pinsrd");
12131 auto FromV1Predicate = [](const int &i) { return i < 4 && i > -1; };
12132 auto FromV2Predicate = [](const int &i) { return i >= 4; };
12133 int FromV1 = std::count_if(Mask.begin(), Mask.end(), FromV1Predicate);
12137 unsigned DestIndex;
12141 DestIndex = std::find_if(Mask.begin(), Mask.end(), FromV1Predicate) -
12144 // If we have 1 element from each vector, we have to check if we're
12145 // changing V1's element's place. If so, we're done. Otherwise, we
12146 // should assume we're changing V2's element's place and behave accordingly.
12148 int FromV2 = std::count_if(Mask.begin(), Mask.end(), FromV2Predicate);
12149 assert(DestIndex <= INT32_MAX && "truncated destination index");
12150 if (FromV1 == FromV2 &&
12151 static_cast<int>(DestIndex) == Mask[DestIndex] % 4) {
12155 std::find_if(Mask.begin(), Mask.end(), FromV2Predicate) - Mask.begin();
12158 assert(std::count_if(Mask.begin(), Mask.end(), FromV2Predicate) == 1 &&
12159 "More than one element from V1 and from V2, or no elements from one "
12160 "of the vectors. This case should not have returned true from "
12165 std::find_if(Mask.begin(), Mask.end(), FromV2Predicate) - Mask.begin();
12168 // Get an index into the source vector in the range [0,4) (the mask is
12169 // in the range [0,8) because it can address V1 and V2)
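// For example, Mask[DestIndex] == 6 addresses element 2 of V2 (SrcIndex ==
// 2); the register-to-register insertps below encodes this as the immediate
// (DestIndex << 4) | (SrcIndex << 6).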
12170 unsigned SrcIndex = Mask[DestIndex] % 4;
12171 if (MayFoldLoad(From)) {
12172 // Trivial case, when From comes from a load and is only used by the
12173 // shuffle. Make it use insertps from the vector that we need from that
12176 NarrowVectorLoadToElement(cast<LoadSDNode>(From), SrcIndex, DAG);
12177 if (!NewLoad.getNode())
12180 if (EVT == MVT::f32) {
12181 // Create this as a scalar to vector to match the instruction pattern.
12182 SDValue LoadScalarToVector =
12183 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, NewLoad);
12184 SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4);
12185 return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, LoadScalarToVector,
12187 } else { // EVT == MVT::i32
12188 // If we're getting an i32 from memory, use an INSERT_VECTOR_ELT
12189 // instruction, to match the PINSRD instruction, which loads an i32 to a
12190 // certain vector element.
12191 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, To, NewLoad,
12192 DAG.getConstant(DestIndex, MVT::i32));
12196 // Vector-element-to-vector
12197 SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4 | SrcIndex << 6);
12198 return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, From, InsertpsMask);
12201 // Reduce a vector shuffle to zext.
12202 static SDValue LowerVectorIntExtend(SDValue Op, const X86Subtarget *Subtarget,
12203 SelectionDAG &DAG) {
12204 // PMOVZX is only available from SSE41.
12205 if (!Subtarget->hasSSE41())
12208 MVT VT = Op.getSimpleValueType();
12210 // Only AVX2 supports 256-bit vector integer extension.
12211 if (!Subtarget->hasInt256() && VT.is256BitVector())
12214 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
12216 SDValue V1 = Op.getOperand(0);
12217 SDValue V2 = Op.getOperand(1);
12218 unsigned NumElems = VT.getVectorNumElements();
12220 // Extending is a unary operation, and the element type of the source vector
12221 // must be smaller than i64.
12222 if (V2.getOpcode() != ISD::UNDEF || !VT.isInteger() ||
12223 VT.getVectorElementType() == MVT::i64)
12226 // Find the expansion ratio, e.g. expanding from i8 to i32 has a ratio of 4.
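// For example, the v8i16 mask <0, -1, 1, -1, 2, -1, 3, -1> (V2 undef) has a
// ratio of 2 and is lowered as a VZEXT of the low four words of V1 to v4i32.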
12227 unsigned Shift = 1; // Start from 2, i.e. 1 << 1.
12228 while ((1U << Shift) < NumElems) {
12229 if (SVOp->getMaskElt(1U << Shift) == 1)
12232 // The maximal ratio is 8, i.e. from i8 to i64.
12237 // Check the shuffle mask.
12238 unsigned Mask = (1U << Shift) - 1;
12239 for (unsigned i = 0; i != NumElems; ++i) {
12240 int EltIdx = SVOp->getMaskElt(i);
12241 if ((i & Mask) != 0 && EltIdx != -1)
12243 if ((i & Mask) == 0 && (unsigned)EltIdx != (i >> Shift))
12247 unsigned NBits = VT.getVectorElementType().getSizeInBits() << Shift;
12248 MVT NeVT = MVT::getIntegerVT(NBits);
12249 MVT NVT = MVT::getVectorVT(NeVT, NumElems >> Shift);
12251 if (!DAG.getTargetLoweringInfo().isTypeLegal(NVT))
12254 return DAG.getNode(ISD::BITCAST, DL, VT,
12255 DAG.getNode(X86ISD::VZEXT, DL, NVT, V1));
12258 static SDValue NormalizeVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
12259 SelectionDAG &DAG) {
12260 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
12261 MVT VT = Op.getSimpleValueType();
12263 SDValue V1 = Op.getOperand(0);
12264 SDValue V2 = Op.getOperand(1);
12266 if (isZeroShuffle(SVOp))
12267 return getZeroVector(VT, Subtarget, DAG, dl);
12269 // Handle splat operations
12270 if (SVOp->isSplat()) {
12271 // Use vbroadcast whenever the splat comes from a foldable load
12272 SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG);
12273 if (Broadcast.getNode())
12277 // Check integer expanding shuffles.
12278 SDValue NewOp = LowerVectorIntExtend(Op, Subtarget, DAG);
12279 if (NewOp.getNode())
12282 // If the shuffle can be profitably rewritten as a narrower shuffle, then do it.
12284 if (VT == MVT::v8i16 || VT == MVT::v16i8 || VT == MVT::v16i16 ||
12285 VT == MVT::v32i8) {
12286 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
12287 if (NewOp.getNode())
12288 return DAG.getNode(ISD::BITCAST, dl, VT, NewOp);
12289 } else if (VT.is128BitVector() && Subtarget->hasSSE2()) {
12290 // FIXME: Figure out a cleaner way to do this.
12291 if (ISD::isBuildVectorAllZeros(V2.getNode())) {
12292 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
12293 if (NewOp.getNode()) {
12294 MVT NewVT = NewOp.getSimpleValueType();
12295 if (isCommutedMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(),
12296 NewVT, true, false))
12297 return getVZextMovL(VT, NewVT, NewOp.getOperand(0), DAG, Subtarget,
12300 } else if (ISD::isBuildVectorAllZeros(V1.getNode())) {
12301 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
12302 if (NewOp.getNode()) {
12303 MVT NewVT = NewOp.getSimpleValueType();
12304 if (isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), NewVT))
12305 return getVZextMovL(VT, NewVT, NewOp.getOperand(1), DAG, Subtarget,
12314 X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
12315 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
12316 SDValue V1 = Op.getOperand(0);
12317 SDValue V2 = Op.getOperand(1);
12318 MVT VT = Op.getSimpleValueType();
12320 unsigned NumElems = VT.getVectorNumElements();
12321 bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
12322 bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
12323 bool V1IsSplat = false;
12324 bool V2IsSplat = false;
12325 bool HasSSE2 = Subtarget->hasSSE2();
12326 bool HasFp256 = Subtarget->hasFp256();
12327 bool HasInt256 = Subtarget->hasInt256();
12328 MachineFunction &MF = DAG.getMachineFunction();
12329 bool OptForSize = MF.getFunction()->getAttributes().
12330 hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
12332 // Check if we should use the experimental vector shuffle lowering. If so,
12333 // delegate completely to that code path.
12334 if (ExperimentalVectorShuffleLowering)
12335 return lowerVectorShuffle(Op, Subtarget, DAG);
12337 assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles");
12339 if (V1IsUndef && V2IsUndef)
12340 return DAG.getUNDEF(VT);
12342 // When we create a shuffle node we put the UNDEF node in the second operand,
12343 // but in some cases the first operand may be transformed to UNDEF.
12344 // In this case we should just commute the node.
12346 return DAG.getCommutedVectorShuffle(*SVOp);
12348 // Vector shuffle lowering takes 3 steps:
12350 // 1) Normalize the input vectors. Here splats, zeroed vectors, profitable
12351 // narrowing and commutation of operands should be handled.
12352 // 2) Matching of shuffles with known shuffle masks to x86 target specific
12354 // 3) Rewriting of unmatched masks into new generic shuffle operations,
12355 // so the shuffle can be broken into other shuffles and the legalizer can
12356 // try the lowering again.
12358 // The general idea is that no vector_shuffle operation should be left to
12359 // be matched during isel, all of them must be converted to a target specific
12362 // Normalize the input vectors. Here splats, zeroed vectors, profitable
12363 // narrowing and commutation of operands should be handled. The actual code
12364 // doesn't include all of those, work in progress...
12365 SDValue NewOp = NormalizeVectorShuffle(Op, Subtarget, DAG);
12366 if (NewOp.getNode())
12369 SmallVector<int, 8> M(SVOp->getMask().begin(), SVOp->getMask().end());
12371 // NOTE: isPSHUFDMask can also match both masks below (unpckl_undef and
12372 // unpckh_undef). Only use pshufd if speed is more important than size.
12373 if (OptForSize && isUNPCKL_v_undef_Mask(M, VT, HasInt256))
12374 return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
12375 if (OptForSize && isUNPCKH_v_undef_Mask(M, VT, HasInt256))
12376 return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
12378 if (isMOVDDUPMask(M, VT) && Subtarget->hasSSE3() &&
12379 V2IsUndef && MayFoldVectorLoad(V1))
12380 return getMOVDDup(Op, dl, V1, DAG);
12382 if (isMOVHLPS_v_undef_Mask(M, VT))
12383 return getMOVHighToLow(Op, dl, DAG);
12385 // Used to match splats.
12386 if (HasSSE2 && isUNPCKHMask(M, VT, HasInt256) && V2IsUndef &&
12387 (VT == MVT::v2f64 || VT == MVT::v2i64))
12388 return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
12390 if (isPSHUFDMask(M, VT)) {
12391 // The actual implementation will match the mask in the if above and then
12392 // during isel it can match several different instructions, not only pshufd
12393 // as its name says, sad but true, emulate the behavior for now...
12394 if (isMOVDDUPMask(M, VT) && ((VT == MVT::v4f32 || VT == MVT::v2i64)))
12395 return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V1, DAG);
12397 unsigned TargetMask = getShuffleSHUFImmediate(SVOp);
12399 if (HasSSE2 && (VT == MVT::v4f32 || VT == MVT::v4i32))
12400 return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG);
12402 if (HasFp256 && (VT == MVT::v4f32 || VT == MVT::v2f64))
12403 return getTargetShuffleNode(X86ISD::VPERMILPI, dl, VT, V1, TargetMask,
12406 return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V1,
12410 if (isPALIGNRMask(M, VT, Subtarget))
12411 return getTargetShuffleNode(X86ISD::PALIGNR, dl, VT, V1, V2,
12412 getShufflePALIGNRImmediate(SVOp),
12415 if (isVALIGNMask(M, VT, Subtarget))
12416 return getTargetShuffleNode(X86ISD::VALIGN, dl, VT, V1, V2,
12417 getShuffleVALIGNImmediate(SVOp),
12420 // Check if this can be converted into a logical shift.
12421 bool isLeft = false;
12422 unsigned ShAmt = 0;
12424 bool isShift = HasSSE2 && isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt);
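  // For illustration: with a zeroed second operand, the v4i32 mask <1, 2, 3, 4>
  // moves every element down by one and shifts in a zero, i.e. a single
  // "psrldq $4"; likewise <3, 4, 5, 6> with a zeroed first operand is a
  // "pslldq $4" of the second operand.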
12425 if (isShift && ShVal.hasOneUse()) {
12426 // If the shifted value has multiple uses, it may be cheaper to use
12427 // v_set0 + movlhps or movhlps, etc.
12428 MVT EltVT = VT.getVectorElementType();
12429 ShAmt *= EltVT.getSizeInBits();
12430 return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
12433 if (isMOVLMask(M, VT)) {
12434 if (ISD::isBuildVectorAllZeros(V1.getNode()))
12435 return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl);
12436 if (!isMOVLPMask(M, VT)) {
12437 if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64))
12438 return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
12440 if (VT == MVT::v4i32 || VT == MVT::v4f32)
12441 return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
12445 // FIXME: fold these into legal mask.
12446 if (isMOVLHPSMask(M, VT) && !isUNPCKLMask(M, VT, HasInt256))
12447 return getMOVLowToHigh(Op, dl, DAG, HasSSE2);
12449 if (isMOVHLPSMask(M, VT))
12450 return getMOVHighToLow(Op, dl, DAG);
12452 if (V2IsUndef && isMOVSHDUPMask(M, VT, Subtarget))
12453 return getTargetShuffleNode(X86ISD::MOVSHDUP, dl, VT, V1, DAG);
12455 if (V2IsUndef && isMOVSLDUPMask(M, VT, Subtarget))
12456 return getTargetShuffleNode(X86ISD::MOVSLDUP, dl, VT, V1, DAG);
12458 if (isMOVLPMask(M, VT))
12459 return getMOVLP(Op, dl, DAG, HasSSE2);
12461 if (ShouldXformToMOVHLPS(M, VT) ||
12462 ShouldXformToMOVLP(V1.getNode(), V2.getNode(), M, VT))
12463 return DAG.getCommutedVectorShuffle(*SVOp);
12466 // No better options. Use a vshldq / vsrldq.
12467 MVT EltVT = VT.getVectorElementType();
12468 ShAmt *= EltVT.getSizeInBits();
12469 return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
12472 bool Commuted = false;
12473 // FIXME: This should also accept a bitcast of a splat? Be careful, not
12474 // 1,1,1,1 -> v8i16 though.
12475 BitVector UndefElements;
12476 if (auto *BVOp = dyn_cast<BuildVectorSDNode>(V1.getNode()))
12477 if (BVOp->getConstantSplatNode(&UndefElements) && UndefElements.none())
12479 if (auto *BVOp = dyn_cast<BuildVectorSDNode>(V2.getNode()))
12480 if (BVOp->getConstantSplatNode(&UndefElements) && UndefElements.none())
12483 // Canonicalize the splat or undef, if present, to be on the RHS.
12484 if (!V2IsUndef && V1IsSplat && !V2IsSplat) {
12485 CommuteVectorShuffleMask(M, NumElems);
12487 std::swap(V1IsSplat, V2IsSplat);
12491 if (isCommutedMOVLMask(M, VT, V2IsSplat, V2IsUndef)) {
12492 // Shuffling low element of v1 into undef, just return v1.
12495 // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which
12496 // the instruction selector will not match, so get a canonical MOVL with
12497 // swapped operands to undo the commute.
12498 return getMOVL(DAG, dl, VT, V2, V1);
12501 if (isUNPCKLMask(M, VT, HasInt256))
12502 return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
12504 if (isUNPCKHMask(M, VT, HasInt256))
12505 return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
12508   // Normalize the mask so all entries that point to V2 point to its first
12509   // element, then try to match unpck{h|l} again. If a match is found, return a
12510   // new vector_shuffle with the corrected mask.
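  // For illustration: with 4 elements, <0, 5, 1, 7> is normalized to
  // <0, 4, 1, 4>, which then matches the unpckl pattern <0, 4, 1, 5> under the
  // "V2 is splat" relaxation used below.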
12511 SmallVector<int, 8> NewMask(M.begin(), M.end());
12512 NormalizeMask(NewMask, NumElems);
12513 if (isUNPCKLMask(NewMask, VT, HasInt256, true))
12514 return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
12515 if (isUNPCKHMask(NewMask, VT, HasInt256, true))
12516 return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
12520   // Commute it back and try unpck* again.
12521 // FIXME: this seems wrong.
12522 CommuteVectorShuffleMask(M, NumElems);
12524 std::swap(V1IsSplat, V2IsSplat);
12526 if (isUNPCKLMask(M, VT, HasInt256))
12527 return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
12529 if (isUNPCKHMask(M, VT, HasInt256))
12530 return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
12533 // Normalize the node to match x86 shuffle ops if needed
12534 if (!V2IsUndef && (isSHUFPMask(M, VT, /* Commuted */ true)))
12535 return DAG.getCommutedVectorShuffle(*SVOp);
12537   // The checks below are all present in isShuffleMaskLegal, but they are
12538   // inlined here right now to enable us to directly emit target specific
12539   // nodes; they will be removed one by one until they no longer return Op.
12541 if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) &&
12542 SVOp->getSplatIndex() == 0 && V2IsUndef) {
12543 if (VT == MVT::v2f64 || VT == MVT::v2i64)
12544 return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
12547 if (isPSHUFHWMask(M, VT, HasInt256))
12548 return getTargetShuffleNode(X86ISD::PSHUFHW, dl, VT, V1,
12549 getShufflePSHUFHWImmediate(SVOp),
12552 if (isPSHUFLWMask(M, VT, HasInt256))
12553 return getTargetShuffleNode(X86ISD::PSHUFLW, dl, VT, V1,
12554 getShufflePSHUFLWImmediate(SVOp),
12557 unsigned MaskValue;
12558 if (isBlendMask(M, VT, Subtarget->hasSSE41(), Subtarget->hasInt256(),
12560 return LowerVECTOR_SHUFFLEtoBlend(SVOp, MaskValue, Subtarget, DAG);
12562 if (isSHUFPMask(M, VT))
12563 return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V2,
12564 getShuffleSHUFImmediate(SVOp), DAG);
12566 if (isUNPCKL_v_undef_Mask(M, VT, HasInt256))
12567 return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
12568 if (isUNPCKH_v_undef_Mask(M, VT, HasInt256))
12569 return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
12571 //===--------------------------------------------------------------------===//
12572 // Generate target specific nodes for 128 or 256-bit shuffles only
12573 // supported in the AVX instruction set.
12576 // Handle VMOVDDUPY permutations
12577 if (V2IsUndef && isMOVDDUPYMask(M, VT, HasFp256))
12578 return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG);
12580 // Handle VPERMILPS/D* permutations
12581 if (isVPERMILPMask(M, VT)) {
12582 if ((HasInt256 && VT == MVT::v8i32) || VT == MVT::v16i32)
12583 return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1,
12584 getShuffleSHUFImmediate(SVOp), DAG);
12585 return getTargetShuffleNode(X86ISD::VPERMILPI, dl, VT, V1,
12586 getShuffleSHUFImmediate(SVOp), DAG);
12590 if (VT.is512BitVector() && isINSERT64x4Mask(M, VT, &Idx))
12591 return Insert256BitVector(V1, Extract256BitVector(V2, 0, DAG, dl),
12592 Idx*(NumElems/2), DAG, dl);
12594 // Handle VPERM2F128/VPERM2I128 permutations
12595 if (isVPERM2X128Mask(M, VT, HasFp256))
12596 return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1,
12597 V2, getShuffleVPERM2X128Immediate(SVOp), DAG);
12599 if (Subtarget->hasSSE41() && isINSERTPSMask(M, VT))
12600 return getINSERTPS(SVOp, dl, DAG);
12603 if (V2IsUndef && HasInt256 && isPermImmMask(M, VT, Imm8))
12604 return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1, Imm8, DAG);
12606 if ((V2IsUndef && HasInt256 && VT.is256BitVector() && NumElems == 8) ||
12607 VT.is512BitVector()) {
12608 MVT MaskEltVT = MVT::getIntegerVT(VT.getVectorElementType().getSizeInBits());
12609 MVT MaskVectorVT = MVT::getVectorVT(MaskEltVT, NumElems);
12610 SmallVector<SDValue, 16> permclMask;
12611 for (unsigned i = 0; i != NumElems; ++i) {
12612 permclMask.push_back(DAG.getConstant((M[i]>=0) ? M[i] : 0, MaskEltVT));
12615 SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVectorVT, permclMask);
12617 // Bitcast is for VPERMPS since mask is v8i32 but node takes v8f32
12618 return DAG.getNode(X86ISD::VPERMV, dl, VT,
12619 DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1);
12620 return DAG.getNode(X86ISD::VPERMV3, dl, VT, V1,
12621 DAG.getNode(ISD::BITCAST, dl, VT, Mask), V2);
12624 //===--------------------------------------------------------------------===//
12625 // Since no target specific shuffle was selected for this generic one,
12626 // lower it into other known shuffles. FIXME: this isn't true yet, but
12627 // this is the plan.
12630 // Handle v8i16 specifically since SSE can do byte extraction and insertion.
12631 if (VT == MVT::v8i16) {
12632 SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(Op, Subtarget, DAG);
12633 if (NewOp.getNode())
12637 if (VT == MVT::v16i16 && Subtarget->hasInt256()) {
12638 SDValue NewOp = LowerVECTOR_SHUFFLEv16i16(Op, DAG);
12639 if (NewOp.getNode())
12643 if (VT == MVT::v16i8) {
12644 SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, Subtarget, DAG);
12645 if (NewOp.getNode())
12649 if (VT == MVT::v32i8) {
12650 SDValue NewOp = LowerVECTOR_SHUFFLEv32i8(SVOp, Subtarget, DAG);
12651 if (NewOp.getNode())
12655 // Handle all 128-bit wide vectors with 4 elements, and match them with
12656 // several different shuffle types.
12657 if (NumElems == 4 && VT.is128BitVector())
12658 return LowerVECTOR_SHUFFLE_128v4(SVOp, DAG);
12660 // Handle general 256-bit shuffles
12661 if (VT.is256BitVector())
12662 return LowerVECTOR_SHUFFLE_256(SVOp, DAG);
12667 // This function assumes its argument is a BUILD_VECTOR of constants or
12668 // undef SDNodes, i.e. ISD::isBuildVectorOfConstantSDNodes(BuildVector) is true.
12670 static bool BUILD_VECTORtoBlendMask(BuildVectorSDNode *BuildVector,
12671 unsigned &MaskValue) {
12673 unsigned NumElems = BuildVector->getNumOperands();
12674 // There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
12675 unsigned NumLanes = (NumElems - 1) / 8 + 1;
12676 unsigned NumElemsInLane = NumElems / NumLanes;
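  // For illustration: a v4i32 condition of <true, false, true, false> yields
  // MaskValue == 0b1010; bit i is set exactly when element i should come from
  // the second blend operand.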
12678   // Blend for v16i16 should be symmetric for both lanes.
12679 for (unsigned i = 0; i < NumElemsInLane; ++i) {
12680 SDValue EltCond = BuildVector->getOperand(i);
12681 SDValue SndLaneEltCond =
12682 (NumLanes == 2) ? BuildVector->getOperand(i + NumElemsInLane) : EltCond;
12684 int Lane1Cond = -1, Lane2Cond = -1;
12685 if (isa<ConstantSDNode>(EltCond))
12686 Lane1Cond = !isZero(EltCond);
12687 if (isa<ConstantSDNode>(SndLaneEltCond))
12688 Lane2Cond = !isZero(SndLaneEltCond);
12690 if (Lane1Cond == Lane2Cond || Lane2Cond < 0)
12691       // Lane1Cond != 0 means we want the first argument.
12692       // Lane1Cond == 0 means we want the second argument.
12693       // The encoding of this argument is 0 for the first argument, 1
12694       // for the second. Therefore, invert the condition.
12695 MaskValue |= !Lane1Cond << i;
12696 else if (Lane1Cond < 0)
12697 MaskValue |= !Lane2Cond << i;
12704 /// \brief Try to lower a VSELECT instruction to an immediate-controlled blend instruction.
12706 static SDValue lowerVSELECTtoBLENDI(SDValue Op, const X86Subtarget *Subtarget,
12707 SelectionDAG &DAG) {
12708 SDValue Cond = Op.getOperand(0);
12709 SDValue LHS = Op.getOperand(1);
12710 SDValue RHS = Op.getOperand(2);
12712 MVT VT = Op.getSimpleValueType();
12713 MVT EltVT = VT.getVectorElementType();
12714 unsigned NumElems = VT.getVectorNumElements();
12716 // There is no blend with immediate in AVX-512.
12717 if (VT.is512BitVector())
12720 if (!Subtarget->hasSSE41() || EltVT == MVT::i8)
12722 if (!Subtarget->hasInt256() && VT == MVT::v16i16)
12725 if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
12728 // Check the mask for BLEND and build the value.
12729 unsigned MaskValue = 0;
12730 if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue))
12733   // Convert i32 vectors to floating point if AVX2 is not available.
12734   // AVX2 introduced the VPBLENDD instruction for 128- and 256-bit vectors.
12736 if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) {
12737 BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()),
12739 LHS = DAG.getNode(ISD::BITCAST, dl, VT, LHS);
12740 RHS = DAG.getNode(ISD::BITCAST, dl, VT, RHS);
12743 SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, LHS, RHS,
12744 DAG.getConstant(MaskValue, MVT::i32));
12745 return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
12748 SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
12749 // A vselect where all conditions and data are constants can be optimized into
12750 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
12751 if (ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(0).getNode()) &&
12752 ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(1).getNode()) &&
12753 ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode()))
12756 SDValue BlendOp = lowerVSELECTtoBLENDI(Op, Subtarget, DAG);
12757 if (BlendOp.getNode())
12760 // Some types for vselect were previously set to Expand, not Legal or
12761 // Custom. Return an empty SDValue so we fall-through to Expand, after
12762 // the Custom lowering phase.
12763 MVT VT = Op.getSimpleValueType();
12764 switch (VT.SimpleTy) {
12769 if (Subtarget->hasBWI() && Subtarget->hasVLX())
12774 // We couldn't create a "Blend with immediate" node.
12775   // This node should still be legal, but we'll have to emit a blendv* instruction.
12780 static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
12781 MVT VT = Op.getSimpleValueType();
12784 if (!Op.getOperand(0).getSimpleValueType().is128BitVector())
12787 if (VT.getSizeInBits() == 8) {
12788 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
12789 Op.getOperand(0), Op.getOperand(1));
12790 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
12791 DAG.getValueType(VT));
12792 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
12795 if (VT.getSizeInBits() == 16) {
12796 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
12797 // If Idx is 0, it's cheaper to do a move instead of a pextrw.
12799 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
12800 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
12801 DAG.getNode(ISD::BITCAST, dl,
12804 Op.getOperand(1)));
12805 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
12806 Op.getOperand(0), Op.getOperand(1));
12807 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
12808 DAG.getValueType(VT));
12809 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
12812 if (VT == MVT::f32) {
12813 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
12814 // the result back to FR32 register. It's only worth matching if the
12815 // result has a single use which is a store or a bitcast to i32. And in
12816 // the case of a store, it's not worth it if the index is a constant 0,
12817 // because a MOVSSmr can be used instead, which is smaller and faster.
12818 if (!Op.hasOneUse())
12820 SDNode *User = *Op.getNode()->use_begin();
12821 if ((User->getOpcode() != ISD::STORE ||
12822 (isa<ConstantSDNode>(Op.getOperand(1)) &&
12823 cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) &&
12824 (User->getOpcode() != ISD::BITCAST ||
12825 User->getValueType(0) != MVT::i32))
12827 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
12828 DAG.getNode(ISD::BITCAST, dl, MVT::v4i32,
12831 return DAG.getNode(ISD::BITCAST, dl, MVT::f32, Extract);
12834 if (VT == MVT::i32 || VT == MVT::i64) {
12835 // ExtractPS/pextrq works with constant index.
12836 if (isa<ConstantSDNode>(Op.getOperand(1)))
12842 /// Extract one bit from mask vector, like v16i1 or v8i1.
12843 /// AVX-512 feature.
12845 X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const {
12846 SDValue Vec = Op.getOperand(0);
12848 MVT VecVT = Vec.getSimpleValueType();
12849 SDValue Idx = Op.getOperand(1);
12850 MVT EltVT = Op.getSimpleValueType();
12852 assert((EltVT == MVT::i1) && "Unexpected operands in ExtractBitFromMaskVector");
12853 assert((VecVT.getVectorNumElements() <= 16 || Subtarget->hasBWI()) &&
12854 "Unexpected vector type in ExtractBitFromMaskVector");
12856   // A variable index can't be handled in mask registers;
12857   // extend the vector to VR512.
12858 if (!isa<ConstantSDNode>(Idx)) {
12859 MVT ExtVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32);
12860 SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Vec);
12861 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
12862 ExtVT.getVectorElementType(), Ext, Idx);
12863 return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
12866 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
12867 const TargetRegisterClass* rc = getRegClassFor(VecVT);
12868 if (!Subtarget->hasDQI() && (VecVT.getVectorNumElements() <= 8))
12869 rc = getRegClassFor(MVT::v16i1);
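  // For illustration: for a v16i1 mask (2-byte register class, shift base 15),
  // extracting bit 3 shifts left by 15 - 3 == 12 so the wanted bit becomes the
  // MSB, then shifts right by 15 so it lands in bit 0.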
12870 unsigned MaxShift = rc->getSize()*8 - 1;
12871 Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
12872 DAG.getConstant(MaxShift - IdxVal, MVT::i8));
12873 Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec,
12874 DAG.getConstant(MaxShift, MVT::i8));
12875 return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i1, Vec,
12876 DAG.getIntPtrConstant(0));
12880 X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
12881 SelectionDAG &DAG) const {
12883 SDValue Vec = Op.getOperand(0);
12884 MVT VecVT = Vec.getSimpleValueType();
12885 SDValue Idx = Op.getOperand(1);
12887 if (Op.getSimpleValueType() == MVT::i1)
12888 return ExtractBitFromMaskVector(Op, DAG);
12890 if (!isa<ConstantSDNode>(Idx)) {
12891 if (VecVT.is512BitVector() ||
12892 (VecVT.is256BitVector() && Subtarget->hasInt256() &&
12893 VecVT.getVectorElementType().getSizeInBits() == 32)) {
12896 MVT::getIntegerVT(VecVT.getVectorElementType().getSizeInBits());
12897 MVT MaskVT = MVT::getVectorVT(MaskEltVT, VecVT.getSizeInBits() /
12898 MaskEltVT.getSizeInBits());
12900 Idx = DAG.getZExtOrTrunc(Idx, dl, MaskEltVT);
12901 SDValue Mask = DAG.getNode(X86ISD::VINSERT, dl, MaskVT,
12902 getZeroVector(MaskVT, Subtarget, DAG, dl),
12903 Idx, DAG.getConstant(0, getPointerTy()));
12904 SDValue Perm = DAG.getNode(X86ISD::VPERMV, dl, VecVT, Mask, Vec);
12905 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(),
12906 Perm, DAG.getConstant(0, getPointerTy()));
12911 // If this is a 256-bit vector result, first extract the 128-bit vector and
12912 // then extract the element from the 128-bit vector.
12913 if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
12915 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
12916 // Get the 128-bit vector.
12917 Vec = Extract128BitVector(Vec, IdxVal, DAG, dl);
12918 MVT EltVT = VecVT.getVectorElementType();
12920 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
12922 //if (IdxVal >= NumElems/2)
12923 // IdxVal -= NumElems/2;
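  // For illustration: extracting element 5 of a v8i32 grabs the upper 128-bit
  // half (elements 4..7) and then extracts element 5 % 4 == 1 from that v4i32.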
12924 IdxVal -= (IdxVal/ElemsPerChunk)*ElemsPerChunk;
12925 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
12926 DAG.getConstant(IdxVal, MVT::i32));
12929 assert(VecVT.is128BitVector() && "Unexpected vector length");
12931 if (Subtarget->hasSSE41()) {
12932 SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG);
12937 MVT VT = Op.getSimpleValueType();
12938 // TODO: handle v16i8.
12939 if (VT.getSizeInBits() == 16) {
12940 SDValue Vec = Op.getOperand(0);
12941 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
12943 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
12944 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
12945 DAG.getNode(ISD::BITCAST, dl,
12947 Op.getOperand(1)));
12948     // Transform it so it matches pextrw, which produces a 32-bit result.
12949 MVT EltVT = MVT::i32;
12950 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT,
12951 Op.getOperand(0), Op.getOperand(1));
12952 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract,
12953 DAG.getValueType(VT));
12954 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
12957 if (VT.getSizeInBits() == 32) {
12958 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
12962 // SHUFPS the element to the lowest double word, then movss.
12963 int Mask[4] = { static_cast<int>(Idx), -1, -1, -1 };
12964 MVT VVT = Op.getOperand(0).getSimpleValueType();
12965 SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
12966 DAG.getUNDEF(VVT), Mask);
12967 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
12968 DAG.getIntPtrConstant(0));
12971 if (VT.getSizeInBits() == 64) {
12972 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
12973 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
12974 // to match extract_elt for f64.
12975 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
12979 // UNPCKHPD the element to the lowest double word, then movsd.
12980     // Note if the lower 64 bits of the result of the UNPCKHPD are then stored
12981 // to a f64mem, the whole operation is folded into a single MOVHPDmr.
12982 int Mask[2] = { 1, -1 };
12983 MVT VVT = Op.getOperand(0).getSimpleValueType();
12984 SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
12985 DAG.getUNDEF(VVT), Mask);
12986 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
12987 DAG.getIntPtrConstant(0));
12993 /// Insert one bit to mask vector, like v16i1 or v8i1.
12994 /// AVX-512 feature.
12996 X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
12998 SDValue Vec = Op.getOperand(0);
12999 SDValue Elt = Op.getOperand(1);
13000 SDValue Idx = Op.getOperand(2);
13001 MVT VecVT = Vec.getSimpleValueType();
13003 if (!isa<ConstantSDNode>(Idx)) {
13004     // Non-constant index. Extend source and destination,
13005 // insert element and then truncate the result.
13006 MVT ExtVecVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32);
13007 MVT ExtEltVT = (VecVT == MVT::v8i1 ? MVT::i64 : MVT::i32);
13008 SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
13009 DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec),
13010 DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx);
13011 return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
13014 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
13015 SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt);
13016 if (Vec.getOpcode() == ISD::UNDEF)
13017 return DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
13018 DAG.getConstant(IdxVal, MVT::i8));
13019 const TargetRegisterClass* rc = getRegClassFor(VecVT);
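  // For illustration: inserting into bit 5 of a v16i1 mask moves the scalar into
  // bit 0, shifts it left by 15 to discard any garbage above it, shifts it right
  // by 15 - 5 == 10 so the bit lands in position 5, and ORs it into the mask.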
13020 unsigned MaxShift = rc->getSize()*8 - 1;
13021 EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
13022 DAG.getConstant(MaxShift, MVT::i8));
13023 EltInVec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, EltInVec,
13024 DAG.getConstant(MaxShift - IdxVal, MVT::i8));
13025 return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
13028 SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
13029 SelectionDAG &DAG) const {
13030 MVT VT = Op.getSimpleValueType();
13031 MVT EltVT = VT.getVectorElementType();
13033 if (EltVT == MVT::i1)
13034 return InsertBitToMaskVector(Op, DAG);
13037 SDValue N0 = Op.getOperand(0);
13038 SDValue N1 = Op.getOperand(1);
13039 SDValue N2 = Op.getOperand(2);
13040 if (!isa<ConstantSDNode>(N2))
13042 auto *N2C = cast<ConstantSDNode>(N2);
13043 unsigned IdxVal = N2C->getZExtValue();
13045 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
13046 // into that, and then insert the subvector back into the result.
13047 if (VT.is256BitVector() || VT.is512BitVector()) {
13048 // Get the desired 128-bit vector half.
13049 SDValue V = Extract128BitVector(N0, IdxVal, DAG, dl);
13051 // Insert the element into the desired half.
13052 unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
13053 unsigned IdxIn128 = IdxVal - (IdxVal / NumEltsIn128) * NumEltsIn128;
13055 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
13056 DAG.getConstant(IdxIn128, MVT::i32));
13058     // Insert the changed part back into the full-width vector.
13059 return Insert128BitVector(N0, V, IdxVal, DAG, dl);
13061 assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
13063 if (Subtarget->hasSSE41()) {
13064 if (EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) {
13066 if (VT == MVT::v8i16) {
13067 Opc = X86ISD::PINSRW;
13069 assert(VT == MVT::v16i8);
13070 Opc = X86ISD::PINSRB;
13073     // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second argument.
13075 if (N1.getValueType() != MVT::i32)
13076 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
13077 if (N2.getValueType() != MVT::i32)
13078 N2 = DAG.getIntPtrConstant(IdxVal);
13079 return DAG.getNode(Opc, dl, VT, N0, N1, N2);
13082 if (EltVT == MVT::f32) {
13083 // Bits [7:6] of the constant are the source select. This will always be
13084 // zero here. The DAG Combiner may combine an extract_elt index into
13086     // bits. For example (insert (extract, 3), 2) could be matched by folding
13088     // the '3' into bits [7:6] of X86ISD::INSERTPS.
13089 // Bits [5:4] of the constant are the destination select. This is the
13090 // value of the incoming immediate.
13091 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
13092 // combine either bitwise AND or insert of float 0.0 to set these bits.
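      // For illustration: inserting into element 2 with no zeroing yields an
      // immediate of 2 << 4 == 0x20 (source select 0, destination select 2,
      // zero mask 0).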
13093 N2 = DAG.getIntPtrConstant(IdxVal << 4);
13094     // Create this as a scalar to vector.
13095 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
13096 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
13099 if (EltVT == MVT::i32 || EltVT == MVT::i64) {
13100 // PINSR* works with constant index.
13105 if (EltVT == MVT::i8)
13108 if (EltVT.getSizeInBits() == 16) {
13109     // Transform it so it matches pinsrw, which expects a 16-bit value in a GR32
13110 // as its second argument.
13111 if (N1.getValueType() != MVT::i32)
13112 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
13113 if (N2.getValueType() != MVT::i32)
13114 N2 = DAG.getIntPtrConstant(IdxVal);
13115 return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2);
13120 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
13122 MVT OpVT = Op.getSimpleValueType();
13124 // If this is a 256-bit vector result, first insert into a 128-bit
13125 // vector and then insert into the 256-bit vector.
13126 if (!OpVT.is128BitVector()) {
13127 // Insert into a 128-bit vector.
13128 unsigned SizeFactor = OpVT.getSizeInBits()/128;
13129 MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
13130 OpVT.getVectorNumElements() / SizeFactor);
13132 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
13134 // Insert the 128-bit vector.
13135 return Insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
13138 if (OpVT == MVT::v1i64 &&
13139 Op.getOperand(0).getValueType() == MVT::i64)
13140 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0));
13142 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
13143 assert(OpVT.is128BitVector() && "Expected an SSE type!");
13144 return DAG.getNode(ISD::BITCAST, dl, OpVT,
13145 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,AnyExt));
13148 // Lower a node with an EXTRACT_SUBVECTOR opcode. This may result in
13149 // a simple subregister reference or explicit instructions to grab
13150 // upper bits of a vector.
13151 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
13152 SelectionDAG &DAG) {
13154 SDValue In = Op.getOperand(0);
13155 SDValue Idx = Op.getOperand(1);
13156 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
13157 MVT ResVT = Op.getSimpleValueType();
13158 MVT InVT = In.getSimpleValueType();
13160 if (Subtarget->hasFp256()) {
13161 if (ResVT.is128BitVector() &&
13162 (InVT.is256BitVector() || InVT.is512BitVector()) &&
13163 isa<ConstantSDNode>(Idx)) {
13164 return Extract128BitVector(In, IdxVal, DAG, dl);
13166 if (ResVT.is256BitVector() && InVT.is512BitVector() &&
13167 isa<ConstantSDNode>(Idx)) {
13168 return Extract256BitVector(In, IdxVal, DAG, dl);
13174 // Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
13175 // simple superregister reference or explicit instructions to insert
13176 // the upper bits of a vector.
13177 static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
13178 SelectionDAG &DAG) {
13179 if (!Subtarget->hasAVX())
13183 SDValue Vec = Op.getOperand(0);
13184 SDValue SubVec = Op.getOperand(1);
13185 SDValue Idx = Op.getOperand(2);
13186 MVT OpVT = Op.getSimpleValueType();
13187 MVT SubVecVT = SubVec.getSimpleValueType();
13189 if ((OpVT.is256BitVector() || OpVT.is512BitVector()) &&
13190 SubVecVT.is128BitVector() && isa<ConstantSDNode>(Idx)) {
13191 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
13192 return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
13195 if (OpVT.is512BitVector() &&
13196 SubVecVT.is256BitVector() && isa<ConstantSDNode>(Idx)) {
13197 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
13198 return Insert256BitVector(Vec, SubVec, IdxVal, DAG, dl);
13204 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
13205 // their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
13206 // one of the above mentioned nodes. It has to be wrapped because otherwise
13207 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
13208 // be used to form an addressing mode. These wrapped nodes will be selected into MOV32ri.
13211 X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
13212 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
13214 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
13215 // global base reg.
13216 unsigned char OpFlag = 0;
13217 unsigned WrapperKind = X86ISD::Wrapper;
13218 CodeModel::Model M = DAG.getTarget().getCodeModel();
13220 if (Subtarget->isPICStyleRIPRel() &&
13221 (M == CodeModel::Small || M == CodeModel::Kernel))
13222 WrapperKind = X86ISD::WrapperRIP;
13223 else if (Subtarget->isPICStyleGOT())
13224 OpFlag = X86II::MO_GOTOFF;
13225 else if (Subtarget->isPICStyleStubPIC())
13226 OpFlag = X86II::MO_PIC_BASE_OFFSET;
13228 SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(),
13229 CP->getAlignment(),
13230 CP->getOffset(), OpFlag);
13232 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
13233 // With PIC, the address is actually $g + Offset.
13235 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
13236 DAG.getNode(X86ISD::GlobalBaseReg,
13237 SDLoc(), getPointerTy()),
13244 SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
13245 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
13247 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
13248 // global base reg.
13249 unsigned char OpFlag = 0;
13250 unsigned WrapperKind = X86ISD::Wrapper;
13251 CodeModel::Model M = DAG.getTarget().getCodeModel();
13253 if (Subtarget->isPICStyleRIPRel() &&
13254 (M == CodeModel::Small || M == CodeModel::Kernel))
13255 WrapperKind = X86ISD::WrapperRIP;
13256 else if (Subtarget->isPICStyleGOT())
13257 OpFlag = X86II::MO_GOTOFF;
13258 else if (Subtarget->isPICStyleStubPIC())
13259 OpFlag = X86II::MO_PIC_BASE_OFFSET;
13261 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(),
13264 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
13266 // With PIC, the address is actually $g + Offset.
13268 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
13269 DAG.getNode(X86ISD::GlobalBaseReg,
13270 SDLoc(), getPointerTy()),
13277 X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
13278 const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
13280 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
13281 // global base reg.
13282 unsigned char OpFlag = 0;
13283 unsigned WrapperKind = X86ISD::Wrapper;
13284 CodeModel::Model M = DAG.getTarget().getCodeModel();
13286 if (Subtarget->isPICStyleRIPRel() &&
13287 (M == CodeModel::Small || M == CodeModel::Kernel)) {
13288 if (Subtarget->isTargetDarwin() || Subtarget->isTargetELF())
13289 OpFlag = X86II::MO_GOTPCREL;
13290 WrapperKind = X86ISD::WrapperRIP;
13291 } else if (Subtarget->isPICStyleGOT()) {
13292 OpFlag = X86II::MO_GOT;
13293 } else if (Subtarget->isPICStyleStubPIC()) {
13294 OpFlag = X86II::MO_DARWIN_NONLAZY_PIC_BASE;
13295 } else if (Subtarget->isPICStyleStubNoDynamic()) {
13296 OpFlag = X86II::MO_DARWIN_NONLAZY;
13299 SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag);
13302 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
13304 // With PIC, the address is actually $g + Offset.
13305 if (DAG.getTarget().getRelocationModel() == Reloc::PIC_ &&
13306 !Subtarget->is64Bit()) {
13307 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
13308 DAG.getNode(X86ISD::GlobalBaseReg,
13309 SDLoc(), getPointerTy()),
13313   // For symbols that require a load from a stub to get the address, emit the load.
13315 if (isGlobalStubReference(OpFlag))
13316 Result = DAG.getLoad(getPointerTy(), DL, DAG.getEntryNode(), Result,
13317 MachinePointerInfo::getGOT(), false, false, false, 0);
13323 X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
13324   // Create the TargetBlockAddress node.
13325 unsigned char OpFlags =
13326 Subtarget->ClassifyBlockAddressReference();
13327 CodeModel::Model M = DAG.getTarget().getCodeModel();
13328 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
13329 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
13331 SDValue Result = DAG.getTargetBlockAddress(BA, getPointerTy(), Offset,
13334 if (Subtarget->isPICStyleRIPRel() &&
13335 (M == CodeModel::Small || M == CodeModel::Kernel))
13336 Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
13338 Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
13340 // With PIC, the address is actually $g + Offset.
13341 if (isGlobalRelativeToPICBase(OpFlags)) {
13342 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
13343 DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
13351 X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, SDLoc dl,
13352 int64_t Offset, SelectionDAG &DAG) const {
13353 // Create the TargetGlobalAddress node, folding in the constant
13354 // offset if it is legal.
13355 unsigned char OpFlags =
13356 Subtarget->ClassifyGlobalReference(GV, DAG.getTarget());
13357 CodeModel::Model M = DAG.getTarget().getCodeModel();
13359 if (OpFlags == X86II::MO_NO_FLAG &&
13360 X86::isOffsetSuitableForCodeModel(Offset, M)) {
13361 // A direct static reference to a global.
13362 Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset);
13365 Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags);
13368 if (Subtarget->isPICStyleRIPRel() &&
13369 (M == CodeModel::Small || M == CodeModel::Kernel))
13370 Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
13372 Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
13374 // With PIC, the address is actually $g + Offset.
13375 if (isGlobalRelativeToPICBase(OpFlags)) {
13376 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
13377 DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
13381   // For globals that require a load from a stub to get the address, emit the load.
13383 if (isGlobalStubReference(OpFlags))
13384 Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result,
13385 MachinePointerInfo::getGOT(), false, false, false, 0);
13387 // If there was a non-zero offset that we didn't fold, create an explicit
13388 // addition for it.
13390 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result,
13391 DAG.getConstant(Offset, getPointerTy()));
13397 X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
13398 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
13399 int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
13400 return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG);
13404 GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
13405 SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
13406 unsigned char OperandFlags, bool LocalDynamic = false) {
13407 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
13408 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
13410 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
13411 GA->getValueType(0),
13415 X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
13419 SDValue Ops[] = { Chain, TGA, *InFlag };
13420 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
13422 SDValue Ops[] = { Chain, TGA };
13423 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
13426 // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
13427 MFI->setAdjustsStack(true);
13428 MFI->setHasCalls(true);
13430 SDValue Flag = Chain.getValue(1);
13431 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
13434 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
13436 LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
13439 SDLoc dl(GA); // ? function entry point might be better
13440 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
13441 DAG.getNode(X86ISD::GlobalBaseReg,
13442 SDLoc(), PtrVT), InFlag);
13443 InFlag = Chain.getValue(1);
13445 return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
13448 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
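// Note (informal): the TLSADDR pseudo produced here is later expanded to
// roughly the canonical general-dynamic sequence
//   leaq x@tlsgd(%rip), %rdi
//   call __tls_get_addr@PLT
// (with padding so the linker can relax it), with the address returned in RAX.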
13450 LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
13452 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
13453 X86::RAX, X86II::MO_TLSGD);
13456 static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
13462 // Get the start address of the TLS block for this module.
13463 X86MachineFunctionInfo* MFI = DAG.getMachineFunction()
13464 .getInfo<X86MachineFunctionInfo>();
13465 MFI->incNumLocalDynamicTLSAccesses();
13469 Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
13470 X86II::MO_TLSLD, /*LocalDynamic=*/true);
13473 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
13474 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
13475 InFlag = Chain.getValue(1);
13476 Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
13477 X86II::MO_TLSLDM, /*LocalDynamic=*/true);
13480   // Note: the CleanupLocalDynamicTLSPass will remove redundant computations of Base.
13484 unsigned char OperandFlags = X86II::MO_DTPOFF;
13485 unsigned WrapperKind = X86ISD::Wrapper;
13486 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
13487 GA->getValueType(0),
13488 GA->getOffset(), OperandFlags);
13489 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
13491 // Add x@dtpoff with the base.
13492 return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
13495 // Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
13496 static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
13497 const EVT PtrVT, TLSModel::Model model,
13498 bool is64Bit, bool isPIC) {
13501 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
13502 Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
13503 is64Bit ? 257 : 256));
13505 SDValue ThreadPointer =
13506 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0),
13507 MachinePointerInfo(Ptr), false, false, false, 0);
13509 unsigned char OperandFlags = 0;
13510   // Most TLS accesses are not RIP relative, even on x86-64. One exception is the initial exec model.
13512 unsigned WrapperKind = X86ISD::Wrapper;
13513 if (model == TLSModel::LocalExec) {
13514 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
13515 } else if (model == TLSModel::InitialExec) {
13517 OperandFlags = X86II::MO_GOTTPOFF;
13518 WrapperKind = X86ISD::WrapperRIP;
13520 OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
13523 llvm_unreachable("Unexpected model");
13526 // emit "addl x@ntpoff,%eax" (local exec)
13527 // or "addl x@indntpoff,%eax" (initial exec)
13528 // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
13530 DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
13531 GA->getOffset(), OperandFlags);
13532 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
13534 if (model == TLSModel::InitialExec) {
13535 if (isPIC && !is64Bit) {
13536 Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
13537 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
13541 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
13542 MachinePointerInfo::getGOT(), false, false, false, 0);
13545 // The address of the thread local variable is the add of the thread
13546 // pointer with the offset of the variable.
13547 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
13551 X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
13553 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
13554 const GlobalValue *GV = GA->getGlobal();
13556 if (Subtarget->isTargetELF()) {
13557 TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
13560 case TLSModel::GeneralDynamic:
13561 if (Subtarget->is64Bit())
13562 return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy());
13563 return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy());
13564 case TLSModel::LocalDynamic:
13565 return LowerToTLSLocalDynamicModel(GA, DAG, getPointerTy(),
13566 Subtarget->is64Bit());
13567 case TLSModel::InitialExec:
13568 case TLSModel::LocalExec:
13569 return LowerToTLSExecModel(
13570 GA, DAG, getPointerTy(), model, Subtarget->is64Bit(),
13571 DAG.getTarget().getRelocationModel() == Reloc::PIC_);
13573 llvm_unreachable("Unknown TLS model.");
13576 if (Subtarget->isTargetDarwin()) {
13577 // Darwin only has one model of TLS. Lower to that.
13578 unsigned char OpFlag = 0;
13579 unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ?
13580 X86ISD::WrapperRIP : X86ISD::Wrapper;
13582 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
13583 // global base reg.
13584 bool PIC32 = (DAG.getTarget().getRelocationModel() == Reloc::PIC_) &&
13585 !Subtarget->is64Bit();
13587 OpFlag = X86II::MO_TLVP_PIC_BASE;
13589 OpFlag = X86II::MO_TLVP;
13591 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
13592 GA->getValueType(0),
13593 GA->getOffset(), OpFlag);
13594 SDValue Offset = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
13596 // With PIC32, the address is actually $g + Offset.
13598 Offset = DAG.getNode(ISD::ADD, DL, getPointerTy(),
13599 DAG.getNode(X86ISD::GlobalBaseReg,
13600 SDLoc(), getPointerTy()),
13603     // Lowering the machine isd will make sure everything is in the right place.
13605 SDValue Chain = DAG.getEntryNode();
13606 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
13607 SDValue Args[] = { Chain, Offset };
13608 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
13610 // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
13611 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
13612 MFI->setAdjustsStack(true);
13614     // And our return value (tls address) is in the standard call return value location.
13616 unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
13617 return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy(),
13618 Chain.getValue(1));
13621 if (Subtarget->isTargetKnownWindowsMSVC() ||
13622 Subtarget->isTargetWindowsGNU()) {
13623 // Just use the implicit TLS architecture
13624     // Need to generate something similar to:
13625 // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
13627 // mov ecx, dword [rel _tls_index]: Load index (from C runtime)
13628 // mov rcx, qword [rdx+rcx*8]
13629 // mov eax, .tls$:tlsvar
13630 // [rax+rcx] contains the address
13631 // Windows 64bit: gs:0x58
13632 // Windows 32bit: fs:__tls_array
13635 SDValue Chain = DAG.getEntryNode();
13637 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
13638 // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
13639 // use its literal value of 0x2C.
13640 Value *Ptr = Constant::getNullValue(Subtarget->is64Bit()
13641 ? Type::getInt8PtrTy(*DAG.getContext(),
13643 : Type::getInt32PtrTy(*DAG.getContext(),
13647 Subtarget->is64Bit()
13648 ? DAG.getIntPtrConstant(0x58)
13649 : (Subtarget->isTargetWindowsGNU()
13650 ? DAG.getIntPtrConstant(0x2C)
13651 : DAG.getExternalSymbol("_tls_array", getPointerTy()));
13653 SDValue ThreadPointer =
13654 DAG.getLoad(getPointerTy(), dl, Chain, TlsArray,
13655 MachinePointerInfo(Ptr), false, false, false, 0);
13657 // Load the _tls_index variable
13658 SDValue IDX = DAG.getExternalSymbol("_tls_index", getPointerTy());
13659 if (Subtarget->is64Bit())
13660 IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, getPointerTy(), Chain,
13661 IDX, MachinePointerInfo(), MVT::i32,
13662 false, false, false, 0);
13664 IDX = DAG.getLoad(getPointerTy(), dl, Chain, IDX, MachinePointerInfo(),
13665 false, false, false, 0);
13667 SDValue Scale = DAG.getConstant(Log2_64_Ceil(TD->getPointerSize()),
13669 IDX = DAG.getNode(ISD::SHL, dl, getPointerTy(), IDX, Scale);
13671 SDValue res = DAG.getNode(ISD::ADD, dl, getPointerTy(), ThreadPointer, IDX);
13672 res = DAG.getLoad(getPointerTy(), dl, Chain, res, MachinePointerInfo(),
13673 false, false, false, 0);
13675 // Get the offset of start of .tls section
13676 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
13677 GA->getValueType(0),
13678 GA->getOffset(), X86II::MO_SECREL);
13679 SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), TGA);
13681 // The address of the thread local variable is the add of the thread
13682 // pointer with the offset of the variable.
13683 return DAG.getNode(ISD::ADD, dl, getPointerTy(), res, Offset);
13686 llvm_unreachable("TLS not implemented for this target.");
13689 /// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values
13690 /// and take a 2 x i32 value to shift plus a shift amount.
13691 static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
13692 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
13693 MVT VT = Op.getSimpleValueType();
13694 unsigned VTBits = VT.getSizeInBits();
13696 bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
13697 SDValue ShOpLo = Op.getOperand(0);
13698 SDValue ShOpHi = Op.getOperand(1);
13699 SDValue ShAmt = Op.getOperand(2);
13700 // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the
13701 // generic ISD nodes haven't. Insert an AND to be safe, it's optimized away
13703 SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
13704 DAG.getConstant(VTBits - 1, MVT::i8));
13705 SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
13706 DAG.getConstant(VTBits - 1, MVT::i8))
13707 : DAG.getConstant(0, VT);
13709 SDValue Tmp2, Tmp3;
13710 if (Op.getOpcode() == ISD::SHL_PARTS) {
13711 Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
13712 Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);
13714 Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
13715 Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
13718   // If the shift amount is larger than or equal to the width of a part we can't
13719 // rely on the results of shld/shrd. Insert a test and select the appropriate
13720 // values for large shift amounts.
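  // For illustration: an i64 SHL by 40 on i686 gives SafeShAmt == 8, so
  // Tmp3 == Lo << 8; bit 5 of the amount is set, so the CMOVs below select
  // Hi = Lo << 8 and Lo = 0 instead of the shld results.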
13721 SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
13722 DAG.getConstant(VTBits, MVT::i8));
13723 SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
13724 AndNode, DAG.getConstant(0, MVT::i8));
13727 SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8);
13728 SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
13729 SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };
13731 if (Op.getOpcode() == ISD::SHL_PARTS) {
13732 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
13733 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
13735 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
13736 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
13739 SDValue Ops[2] = { Lo, Hi };
13740 return DAG.getMergeValues(Ops, dl);
13743 SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
13744 SelectionDAG &DAG) const {
13745 MVT SrcVT = Op.getOperand(0).getSimpleValueType();
13748 if (SrcVT.isVector()) {
13749 if (SrcVT.getVectorElementType() == MVT::i1) {
13750 MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
13751 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
13752 DAG.getNode(ISD::SIGN_EXTEND, dl, IntegerVT,
13753 Op.getOperand(0)));
13758 assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
13759 "Unknown SINT_TO_FP to lower!");
13761   // These are really Legal; return the operand so the caller accepts it as Legal.
13763 if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
13765 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
13766 Subtarget->is64Bit()) {
13770 unsigned Size = SrcVT.getSizeInBits()/8;
13771 MachineFunction &MF = DAG.getMachineFunction();
13772 int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false);
13773 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
13774 SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
13776 MachinePointerInfo::getFixedStack(SSFI),
13778 return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
13781 SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
13783 SelectionDAG &DAG) const {
13787 bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
13789 Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
13791 Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
13793 unsigned ByteSize = SrcVT.getSizeInBits()/8;
13795 FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
13796 MachineMemOperand *MMO;
13798 int SSFI = FI->getIndex();
13800 DAG.getMachineFunction()
13801 .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
13802 MachineMemOperand::MOLoad, ByteSize, ByteSize);
13804 MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
13805 StackSlot = StackSlot.getOperand(1);
13807 SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
13808 SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
13810 Tys, Ops, SrcVT, MMO);
13813 Chain = Result.getValue(1);
13814 SDValue InFlag = Result.getValue(2);
13816 // FIXME: Currently the FST is flagged to the FILD_FLAG. This
13817 // shouldn't be necessary except that RFP cannot be live across
13818 // multiple blocks. When stackifier is fixed, they can be uncoupled.
13819 MachineFunction &MF = DAG.getMachineFunction();
13820 unsigned SSFISize = Op.getValueType().getSizeInBits()/8;
13821 int SSFI = MF.getFrameInfo()->CreateStackObject(SSFISize, SSFISize, false);
13822 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
13823 Tys = DAG.getVTList(MVT::Other);
13825 Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
13827 MachineMemOperand *MMO =
13828 DAG.getMachineFunction()
13829 .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
13830 MachineMemOperand::MOStore, SSFISize, SSFISize);
13832 Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
13833 Ops, Op.getValueType(), MMO);
13834 Result = DAG.getLoad(Op.getValueType(), DL, Chain, StackSlot,
13835 MachinePointerInfo::getFixedStack(SSFI),
13836 false, false, false, 0);
13842 // LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion.
13843 SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
13844 SelectionDAG &DAG) const {
13845   // This algorithm is not obvious. Here is what we're trying to output:
13848 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
13849 subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
13851 haddpd %xmm0, %xmm0
13853 pshufd $0x4e, %xmm0, %xmm1
13859 LLVMContext *Context = DAG.getContext();
13861 // Build some magic constants.
13862 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
13863 Constant *C0 = ConstantDataVector::get(*Context, CV0);
13864 SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16);
13866 SmallVector<Constant*,2> CV1;
13868 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
13869 APInt(64, 0x4330000000000000ULL))));
13871 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
13872 APInt(64, 0x4530000000000000ULL))));
13873 Constant *C1 = ConstantVector::get(CV1);
13874 SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16);
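  // Note (informal): 0x43300000 and 0x45300000 are the high words of the
  // doubles 2^52 and 2^84. Interleaving them with the low/high 32-bit halves of
  // the input gives the doubles 2^52 + lo and 2^84 + hi * 2^32; subtracting
  // c1 = { 2^52, 2^84 } leaves { lo, hi * 2^32 }, and the horizontal add below
  // reconstructs the original unsigned value as a double.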
13876 // Load the 64-bit value into an XMM register.
13877 SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
13879 SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
13880 MachinePointerInfo::getConstantPool(),
13881 false, false, false, 16);
13882 SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32,
13883 DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, XR1),
13886 SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
13887 MachinePointerInfo::getConstantPool(),
13888 false, false, false, 16);
13889 SDValue XR2F = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Unpck1);
13890 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
13893 if (Subtarget->hasSSE3()) {
13894 // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
13895 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
13897 SDValue S2F = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Sub);
13898 SDValue Shuffle = getTargetShuffleNode(X86ISD::PSHUFD, dl, MVT::v4i32,
13900 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
13901 DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Shuffle),
13905 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
13906 DAG.getIntPtrConstant(0));
13909 // LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion.
13910 SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
13911 SelectionDAG &DAG) const {
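  // Note (informal): the double with bit pattern 0x4330000000000000 is 2^52, so
  // ORing the 32-bit value into the low word of that double yields exactly
  // 2^52 + v; subtracting the bias afterwards recovers v with no rounding.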
13913 // FP constant to bias correct the final result.
13914 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
13917 // Load the 32-bit value into an XMM register.
13918 SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
13921 // Zero out the upper parts of the register.
13922 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
13924 Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
13925 DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Load),
13926 DAG.getIntPtrConstant(0));
13928 // Or the load with the bias.
13929 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64,
13930 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
13931 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
13932 MVT::v2f64, Load)),
13933 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
13934 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
13935 MVT::v2f64, Bias)));
13936 Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
13937 DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or),
13938 DAG.getIntPtrConstant(0));
13940 // Subtract the bias.
13941 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
13943 // Handle final rounding.
13944 EVT DestVT = Op.getValueType();
13946 if (DestVT.bitsLT(MVT::f64))
13947 return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
13948 DAG.getIntPtrConstant(0));
13949 if (DestVT.bitsGT(MVT::f64))
13950 return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
13952 // Handle final rounding.
13956 static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
13957 const X86Subtarget &Subtarget) {
13958 // The algorithm is the following:
13959 // #ifdef __SSE4_1__
13960 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
13961 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
13962 // (uint4) 0x53000000, 0xaa);
13964 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
13965 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
13967 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
13968 // return (float4) lo + fhi;
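  // Note (informal): 0x4b000000 is 2^23 and 0x53000000 is 2^39 as floats, so
  // the lo and hi computed above are exactly 2^23 + (v & 0xffff) and
  // 2^39 + (v >> 16) * 2^16; subtracting (0x1.0p39f + 0x1.0p23f) from hi and
  // adding lo cancels both biases, leaving v (up to the final rounding of the add).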
13971 SDValue V = Op->getOperand(0);
13972 EVT VecIntVT = V.getValueType();
13973 bool Is128 = VecIntVT == MVT::v4i32;
13974 EVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
13975   // If we convert to something other than the supported type, e.g., to v4f64, bail out early.
13977 if (VecFloatVT != Op->getValueType(0))
13980 unsigned NumElts = VecIntVT.getVectorNumElements();
13981 assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
13982 "Unsupported custom type");
13983 assert(NumElts <= 8 && "The size of the constant array must be fixed");
13985   // In the #ifdef/#else code, we have in common:
13986 // - The vector of constants:
13992 // Create the splat vector for 0x4b000000.
13993 SDValue CstLow = DAG.getConstant(0x4b000000, MVT::i32);
13994 SDValue CstLowArray[] = {CstLow, CstLow, CstLow, CstLow,
13995 CstLow, CstLow, CstLow, CstLow};
13996 SDValue VecCstLow = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT,
13997 makeArrayRef(&CstLowArray[0], NumElts));
13998 // Create the splat vector for 0x53000000.
13999 SDValue CstHigh = DAG.getConstant(0x53000000, MVT::i32);
14000 SDValue CstHighArray[] = {CstHigh, CstHigh, CstHigh, CstHigh,
14001 CstHigh, CstHigh, CstHigh, CstHigh};
14002 SDValue VecCstHigh = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT,
14003 makeArrayRef(&CstHighArray[0], NumElts));
14005 // Create the right shift.
14006 SDValue CstShift = DAG.getConstant(16, MVT::i32);
14007 SDValue CstShiftArray[] = {CstShift, CstShift, CstShift, CstShift,
14008 CstShift, CstShift, CstShift, CstShift};
14009 SDValue VecCstShift = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT,
14010 makeArrayRef(&CstShiftArray[0], NumElts));
14011 SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
14014 if (Subtarget.hasSSE41()) {
14015 EVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
14016 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
14017 SDValue VecCstLowBitcast =
14018 DAG.getNode(ISD::BITCAST, DL, VecI16VT, VecCstLow);
14019 SDValue VecBitcast = DAG.getNode(ISD::BITCAST, DL, VecI16VT, V);
14020 // Low will be bitcasted right away, so do not bother bitcasting back to its original type.
14022 Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
14023 VecCstLowBitcast, DAG.getConstant(0xaa, MVT::i32));
14024 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
14025 // (uint4) 0x53000000, 0xaa);
14026 SDValue VecCstHighBitcast =
14027 DAG.getNode(ISD::BITCAST, DL, VecI16VT, VecCstHigh);
14028 SDValue VecShiftBitcast =
14029 DAG.getNode(ISD::BITCAST, DL, VecI16VT, HighShift);
14030 // High will be bitcasted right away, so do not bother bitcasting back to
14031 // its original type.
14032 High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
14033 VecCstHighBitcast, DAG.getConstant(0xaa, MVT::i32));
14035 SDValue CstMask = DAG.getConstant(0xffff, MVT::i32);
14036 SDValue VecCstMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT, CstMask,
14037 CstMask, CstMask, CstMask);
14038 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
14039 SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
14040 Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
14042 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
14043 High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
14046 // Create the vector constant for -(0x1.0p39f + 0x1.0p23f).
14047 SDValue CstFAdd = DAG.getConstantFP(
14048 APFloat(APFloat::IEEEsingle, APInt(32, 0xD3000080)), MVT::f32);
14049 SDValue CstFAddArray[] = {CstFAdd, CstFAdd, CstFAdd, CstFAdd,
14050 CstFAdd, CstFAdd, CstFAdd, CstFAdd};
14051 SDValue VecCstFAdd = DAG.getNode(ISD::BUILD_VECTOR, DL, VecFloatVT,
14052 makeArrayRef(&CstFAddArray[0], NumElts));
14054 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
14055 SDValue HighBitcast = DAG.getNode(ISD::BITCAST, DL, VecFloatVT, High);
14057 DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd);
14058 // return (float4) lo + fhi;
14059 SDValue LowBitcast = DAG.getNode(ISD::BITCAST, DL, VecFloatVT, Low);
14060 return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
14063 SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
14064 SelectionDAG &DAG) const {
14065 SDValue N0 = Op.getOperand(0);
14066 MVT SVT = N0.getSimpleValueType();
14069 switch (SVT.SimpleTy) {
14071 llvm_unreachable("Custom UINT_TO_FP is not supported!");
14076 MVT NVT = MVT::getVectorVT(MVT::i32, SVT.getVectorNumElements());
14077 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
14078 DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0));
14082 return lowerUINT_TO_FP_vXi32(Op, DAG, *Subtarget);
14084 llvm_unreachable(nullptr);
14087 SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
14088 SelectionDAG &DAG) const {
14089 SDValue N0 = Op.getOperand(0);
14092 if (Op.getValueType().isVector())
14093 return lowerUINT_TO_FP_vec(Op, DAG);
14095 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
14096 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
14097 // the optimization here.
14098 if (DAG.SignBitIsZero(N0))
14099 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);
14101 MVT SrcVT = N0.getSimpleValueType();
14102 MVT DstVT = Op.getSimpleValueType();
14103 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
14104 return LowerUINT_TO_FP_i64(Op, DAG);
14105 if (SrcVT == MVT::i32 && X86ScalarSSEf64)
14106 return LowerUINT_TO_FP_i32(Op, DAG);
14107 if (Subtarget->is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
14110 // Make a 64-bit buffer, and use it to build an FILD.
14111 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
14112 if (SrcVT == MVT::i32) {
14113 SDValue WordOff = DAG.getConstant(4, getPointerTy());
14114 SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl,
14115 getPointerTy(), StackSlot, WordOff);
14116 SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
14117 StackSlot, MachinePointerInfo(),
14119 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32),
14120 OffsetSlot, MachinePointerInfo(),
14122 SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
14126 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
14127 SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
14128 StackSlot, MachinePointerInfo(),
14130 // For i64 source, we need to add the appropriate power of 2 if the input
14131 // was negative. This is the same as the optimization in
14132 // DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
14133 // we must be careful to do the computation in x87 extended precision, not
14134 // in SSE. (The generic code can't know it's OK to do this, or how to.)
14135 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
14136 MachineMemOperand *MMO =
14137 DAG.getMachineFunction()
14138 .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
14139 MachineMemOperand::MOLoad, 8, 8);
14141 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
14142 SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
14143 SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
14146 APInt FF(32, 0x5F800000ULL);
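14147 // Note (added commentary): 0x5F800000 is the single-precision encoding of
14147 // 2^64.  FILD interprets the i64 as signed, so when the original unsigned
14147 // value had its top bit set the loaded result is off by exactly -2^64;
14147 // the fudge factor selected below (only when the sign bit is set) adds
14147 // 2^64 back to produce the correct unsigned value.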
14148 // Check whether the sign bit is set.
14149 SDValue SignSet = DAG.getSetCC(dl,
14150 getSetCCResultType(*DAG.getContext(), MVT::i64),
14151 Op.getOperand(0), DAG.getConstant(0, MVT::i64),
14154 // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
14155 SDValue FudgePtr = DAG.getConstantPool(
14156 ConstantInt::get(*DAG.getContext(), FF.zext(64)),
14159 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
14160 SDValue Zero = DAG.getIntPtrConstant(0);
14161 SDValue Four = DAG.getIntPtrConstant(4);
14162 SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet,
14164 FudgePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(), FudgePtr, Offset);
14166 // Load the value out, extending it from f32 to f80.
14167 // FIXME: Avoid the extend by constructing the right constant pool?
14168 SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(),
14169 FudgePtr, MachinePointerInfo::getConstantPool(),
14170 MVT::f32, false, false, false, 4);
14171 // Extend everything to 80 bits to force it to be done on x87.
14172 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
14173 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0));
14176 std::pair<SDValue,SDValue>
14177 X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
14178 bool IsSigned, bool IsReplace) const {
14181 EVT DstTy = Op.getValueType();
14183 if (!IsSigned && !isIntegerTypeFTOL(DstTy)) {
14184 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
14188 assert(DstTy.getSimpleVT() <= MVT::i64 &&
14189 DstTy.getSimpleVT() >= MVT::i16 &&
14190 "Unknown FP_TO_INT to lower!");
14192 // These are really Legal.
14193 if (DstTy == MVT::i32 &&
14194 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
14195 return std::make_pair(SDValue(), SDValue());
14196 if (Subtarget->is64Bit() &&
14197 DstTy == MVT::i64 &&
14198 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
14199 return std::make_pair(SDValue(), SDValue());
14201 // We lower FP->int64 either into FISTP64 followed by a load from a temporary
14202 // stack slot, or into the FTOL runtime function.
14203 MachineFunction &MF = DAG.getMachineFunction();
14204 unsigned MemSize = DstTy.getSizeInBits()/8;
14205 int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
14206 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
14209 if (!IsSigned && isIntegerTypeFTOL(DstTy))
14210 Opc = X86ISD::WIN_FTOL;
14212 switch (DstTy.getSimpleVT().SimpleTy) {
14213 default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
14214 case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
14215 case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
14216 case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
14219 SDValue Chain = DAG.getEntryNode();
14220 SDValue Value = Op.getOperand(0);
14221 EVT TheVT = Op.getOperand(0).getValueType();
14222 // FIXME This causes a redundant load/store if the SSE-class value is already
14223 // in memory, such as if it is on the callstack.
14224 if (isScalarFPTypeInSSEReg(TheVT)) {
14225 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
14226 Chain = DAG.getStore(Chain, DL, Value, StackSlot,
14227 MachinePointerInfo::getFixedStack(SSFI),
14229 SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
14231 Chain, StackSlot, DAG.getValueType(TheVT)
14234 MachineMemOperand *MMO =
14235 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
14236 MachineMemOperand::MOLoad, MemSize, MemSize);
14237 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO);
14238 Chain = Value.getValue(1);
14239 SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
14240 StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
14243 MachineMemOperand *MMO =
14244 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
14245 MachineMemOperand::MOStore, MemSize, MemSize);
14247 if (Opc != X86ISD::WIN_FTOL) {
14248 // Build the FP_TO_INT*_IN_MEM
14249 SDValue Ops[] = { Chain, Value, StackSlot };
14250 SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
14252 return std::make_pair(FIST, StackSlot);
14254 SDValue ftol = DAG.getNode(X86ISD::WIN_FTOL, DL,
14255 DAG.getVTList(MVT::Other, MVT::Glue),
14257 SDValue eax = DAG.getCopyFromReg(ftol, DL, X86::EAX,
14258 MVT::i32, ftol.getValue(1));
14259 SDValue edx = DAG.getCopyFromReg(eax.getValue(1), DL, X86::EDX,
14260 MVT::i32, eax.getValue(2));
14261 SDValue Ops[] = { eax, edx };
14262 SDValue pair = IsReplace
14263 ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops)
14264 : DAG.getMergeValues(Ops, DL);
14265 return std::make_pair(pair, SDValue());
14269 static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
14270 const X86Subtarget *Subtarget) {
14271 MVT VT = Op->getSimpleValueType(0);
14272 SDValue In = Op->getOperand(0);
14273 MVT InVT = In.getSimpleValueType();
14276 // Optimize vectors in AVX mode:
14279 // Use vpunpcklwd for 4 lower elements v8i16 -> v4i32.
14280 // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
14281 // Concat upper and lower parts.
14284 // Use vpunpckldq for 4 lower elements v4i32 -> v2i64.
14285 // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
14286 // Concat upper and lower parts.
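// Illustration (added commentary): for a zero-extend of v8i16 -> v8i32,
// unpacklo(v, zero) interleaves the four low i16 elements with zeros, which
// after a bitcast are the four low i32 results; unpackhi(v, zero) does the
// same for the four high elements, and the two halves are concatenated into
// the 256-bit result.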
14289 if (((VT != MVT::v16i16) || (InVT != MVT::v16i8)) &&
14290 ((VT != MVT::v8i32) || (InVT != MVT::v8i16)) &&
14291 ((VT != MVT::v4i64) || (InVT != MVT::v4i32)))
14294 if (Subtarget->hasInt256())
14295 return DAG.getNode(X86ISD::VZEXT, dl, VT, In);
14297 SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl);
14298 SDValue Undef = DAG.getUNDEF(InVT);
14299 bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND;
14300 SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
14301 SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
14303 MVT HVT = MVT::getVectorVT(VT.getVectorElementType(),
14304 VT.getVectorNumElements()/2);
14306 OpLo = DAG.getNode(ISD::BITCAST, dl, HVT, OpLo);
14307 OpHi = DAG.getNode(ISD::BITCAST, dl, HVT, OpHi);
14309 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
14312 static SDValue LowerZERO_EXTEND_AVX512(SDValue Op,
14313 SelectionDAG &DAG) {
14314 MVT VT = Op->getSimpleValueType(0);
14315 SDValue In = Op->getOperand(0);
14316 MVT InVT = In.getSimpleValueType();
14318 unsigned int NumElts = VT.getVectorNumElements();
14319 if (NumElts != 8 && NumElts != 16)
14322 if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1)
14323 return DAG.getNode(X86ISD::VZEXT, DL, VT, In);
14325 EVT ExtVT = (NumElts == 8)? MVT::v8i64 : MVT::v16i32;
14326 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14327 // Now only the mask-extension case remains.
14328 assert(InVT.getVectorElementType() == MVT::i1);
14329 SDValue Cst = DAG.getTargetConstant(1, ExtVT.getScalarType());
14330 const Constant *C = (dyn_cast<ConstantSDNode>(Cst))->getConstantIntValue();
14331 SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
14332 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
14333 SDValue Ld = DAG.getLoad(Cst.getValueType(), DL, DAG.getEntryNode(), CP,
14334 MachinePointerInfo::getConstantPool(),
14335 false, false, false, Alignment);
14337 SDValue Brcst = DAG.getNode(X86ISD::VBROADCASTM, DL, ExtVT, In, Ld);
14338 if (VT.is512BitVector())
14340 return DAG.getNode(X86ISD::VTRUNC, DL, VT, Brcst);
14343 static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
14344 SelectionDAG &DAG) {
14345 if (Subtarget->hasFp256()) {
14346 SDValue Res = LowerAVXExtend(Op, DAG, Subtarget);
14354 static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
14355 SelectionDAG &DAG) {
14357 MVT VT = Op.getSimpleValueType();
14358 SDValue In = Op.getOperand(0);
14359 MVT SVT = In.getSimpleValueType();
14361 if (VT.is512BitVector() || SVT.getVectorElementType() == MVT::i1)
14362 return LowerZERO_EXTEND_AVX512(Op, DAG);
14364 if (Subtarget->hasFp256()) {
14365 SDValue Res = LowerAVXExtend(Op, DAG, Subtarget);
14370 assert(!VT.is256BitVector() || !SVT.is128BitVector() ||
14371 VT.getVectorNumElements() != SVT.getVectorNumElements());
14375 SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
14377 MVT VT = Op.getSimpleValueType();
14378 SDValue In = Op.getOperand(0);
14379 MVT InVT = In.getSimpleValueType();
14381 if (VT == MVT::i1) {
14382 assert((InVT.isInteger() && (InVT.getSizeInBits() <= 64)) &&
14383 "Invalid scalar TRUNCATE operation");
14384 if (InVT.getSizeInBits() >= 32)
14386 In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In);
14387 return DAG.getNode(ISD::TRUNCATE, DL, VT, In);
14389 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
14390 "Invalid TRUNCATE operation");
14392 if (InVT.is512BitVector() || VT.getVectorElementType() == MVT::i1) {
14393 if (VT.getVectorElementType().getSizeInBits() >=8)
14394 return DAG.getNode(X86ISD::VTRUNC, DL, VT, In);
14396 assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type");
14397 unsigned NumElts = InVT.getVectorNumElements();
14398 assert ((NumElts == 8 || NumElts == 16) && "Unexpected vector type");
14399 if (InVT.getSizeInBits() < 512) {
14400 MVT ExtVT = (NumElts == 16)? MVT::v16i32 : MVT::v8i64;
14401 In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
14405 SDValue Cst = DAG.getTargetConstant(1, InVT.getVectorElementType());
14406 const Constant *C = (dyn_cast<ConstantSDNode>(Cst))->getConstantIntValue();
14407 SDValue CP = DAG.getConstantPool(C, getPointerTy());
14408 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
14409 SDValue Ld = DAG.getLoad(Cst.getValueType(), DL, DAG.getEntryNode(), CP,
14410 MachinePointerInfo::getConstantPool(),
14411 false, false, false, Alignment);
14412 SDValue OneV = DAG.getNode(X86ISD::VBROADCAST, DL, InVT, Ld);
14413 SDValue And = DAG.getNode(ISD::AND, DL, InVT, OneV, In);
14414 return DAG.getNode(X86ISD::TESTM, DL, VT, And, And);
14417 if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
14418 // On AVX2, v4i64 -> v4i32 becomes VPERMD.
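// Added commentary: viewed as v8i32, the even lanes of the v4i64 input hold
// the low 32 bits of each 64-bit element, so the {0, 2, 4, 6} shuffle below
// gathers exactly the truncated values into the low half of the result.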
14419 if (Subtarget->hasInt256()) {
14420 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
14421 In = DAG.getNode(ISD::BITCAST, DL, MVT::v8i32, In);
14422 In = DAG.getVectorShuffle(MVT::v8i32, DL, In, DAG.getUNDEF(MVT::v8i32),
14424 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
14425 DAG.getIntPtrConstant(0));
14428 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
14429 DAG.getIntPtrConstant(0));
14430 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
14431 DAG.getIntPtrConstant(2));
14432 OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpLo);
14433 OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpHi);
14434 static const int ShufMask[] = {0, 2, 4, 6};
14435 return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
14438 if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
14439 // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
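// Added commentary: within each 128-bit lane, the PSHUFB mask built below
// selects bytes 0,1, 4,5, 8,9, 12,13 -- the low 16 bits of each i32 element
// -- and uses 0x80 for the remaining bytes, which PSHUFB turns into zeros.
// The subsequent v4i64 shuffle {0, 2} packs the two lanes' results into the
// low 128 bits, which are then bitcast back to v8i16.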
14440 if (Subtarget->hasInt256()) {
14441 In = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, In);
14443 SmallVector<SDValue,32> pshufbMask;
14444 for (unsigned i = 0; i < 2; ++i) {
14445 pshufbMask.push_back(DAG.getConstant(0x0, MVT::i8));
14446 pshufbMask.push_back(DAG.getConstant(0x1, MVT::i8));
14447 pshufbMask.push_back(DAG.getConstant(0x4, MVT::i8));
14448 pshufbMask.push_back(DAG.getConstant(0x5, MVT::i8));
14449 pshufbMask.push_back(DAG.getConstant(0x8, MVT::i8));
14450 pshufbMask.push_back(DAG.getConstant(0x9, MVT::i8));
14451 pshufbMask.push_back(DAG.getConstant(0xc, MVT::i8));
14452 pshufbMask.push_back(DAG.getConstant(0xd, MVT::i8));
14453 for (unsigned j = 0; j < 8; ++j)
14454 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
14456 SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, pshufbMask);
14457 In = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, In, BV);
14458 In = DAG.getNode(ISD::BITCAST, DL, MVT::v4i64, In);
14460 static const int ShufMask[] = {0, 2, -1, -1};
14461 In = DAG.getVectorShuffle(MVT::v4i64, DL, In, DAG.getUNDEF(MVT::v4i64),
14463 In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
14464 DAG.getIntPtrConstant(0));
14465 return DAG.getNode(ISD::BITCAST, DL, VT, In);
14468 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
14469 DAG.getIntPtrConstant(0));
14471 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
14472 DAG.getIntPtrConstant(4));
14474 OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, OpLo);
14475 OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, OpHi);
14477 // The PSHUFB mask:
14478 static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13,
14479 -1, -1, -1, -1, -1, -1, -1, -1};
14481 SDValue Undef = DAG.getUNDEF(MVT::v16i8);
14482 OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, Undef, ShufMask1);
14483 OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, Undef, ShufMask1);
14485 OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpLo);
14486 OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpHi);
14488 // The MOVLHPS Mask:
14489 static const int ShufMask2[] = {0, 1, 4, 5};
14490 SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
14491 return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, res);
14494 // Handle truncation of V256 to V128 using shuffles.
14495 if (!VT.is128BitVector() || !InVT.is256BitVector())
14498 assert(Subtarget->hasFp256() && "256-bit vector without AVX!");
14500 unsigned NumElems = VT.getVectorNumElements();
14501 MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2);
14503 SmallVector<int, 16> MaskVec(NumElems * 2, -1);
14504 // Prepare truncation shuffle mask
14505 for (unsigned i = 0; i != NumElems; ++i)
14506 MaskVec[i] = i * 2;
14507 SDValue V = DAG.getVectorShuffle(NVT, DL,
14508 DAG.getNode(ISD::BITCAST, DL, NVT, In),
14509 DAG.getUNDEF(NVT), &MaskVec[0]);
14510 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
14511 DAG.getIntPtrConstant(0));
14514 SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op,
14515 SelectionDAG &DAG) const {
14516 assert(!Op.getSimpleValueType().isVector());
14518 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
14519 /*IsSigned=*/ true, /*IsReplace=*/ false);
14520 SDValue FIST = Vals.first, StackSlot = Vals.second;
14521 // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
14522 if (!FIST.getNode()) return Op;
14524 if (StackSlot.getNode())
14525 // Load the result.
14526 return DAG.getLoad(Op.getValueType(), SDLoc(Op),
14527 FIST, StackSlot, MachinePointerInfo(),
14528 false, false, false, 0);
14530 // The node is the result.
14534 SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op,
14535 SelectionDAG &DAG) const {
14536 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
14537 /*IsSigned=*/ false, /*IsReplace=*/ false);
14538 SDValue FIST = Vals.first, StackSlot = Vals.second;
14539 assert(FIST.getNode() && "Unexpected failure");
14541 if (StackSlot.getNode())
14542 // Load the result.
14543 return DAG.getLoad(Op.getValueType(), SDLoc(Op),
14544 FIST, StackSlot, MachinePointerInfo(),
14545 false, false, false, 0);
14547 // The node is the result.
14551 static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
14553 MVT VT = Op.getSimpleValueType();
14554 SDValue In = Op.getOperand(0);
14555 MVT SVT = In.getSimpleValueType();
14557 assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
14559 return DAG.getNode(X86ISD::VFPEXT, DL, VT,
14560 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
14561 In, DAG.getUNDEF(SVT)));
14564 /// The only differences between FABS and FNEG are the mask and the logic op.
14565 /// FNEG also has a folding opportunity for FNEG(FABS(x)).
14566 static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
14567 assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
14568 "Wrong opcode for lowering FABS or FNEG.");
14570 bool IsFABS = (Op.getOpcode() == ISD::FABS);
14572 // If this is a FABS and it has an FNEG user, bail out to fold the combination
14573 // into an FNABS. We'll lower the FABS after that if it is still in use.
14575 for (SDNode *User : Op->uses())
14576 if (User->getOpcode() == ISD::FNEG)
14579 SDValue Op0 = Op.getOperand(0);
14580 bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
14583 MVT VT = Op.getSimpleValueType();
14584 // Assume scalar op for initialization; update for vector if needed.
14585 // Note that there are no scalar bitwise logical SSE/AVX instructions, so we
14586 // generate a 16-byte vector constant and logic op even for the scalar case.
14587 // Using a 16-byte mask allows folding the load of the mask with
14588 // the logic op, so it can save (~4 bytes) on code size.
14590 unsigned NumElts = VT == MVT::f64 ? 2 : 4;
14591 // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
14592 // decide if we should generate a 16-byte constant mask when we only need 4 or
14593 // 8 bytes for the scalar case.
14594 if (VT.isVector()) {
14595 EltVT = VT.getVectorElementType();
14596 NumElts = VT.getVectorNumElements();
14599 unsigned EltBits = EltVT.getSizeInBits();
14600 LLVMContext *Context = DAG.getContext();
14601 // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
14603 IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignBit(EltBits);
14604 Constant *C = ConstantInt::get(*Context, MaskElt);
14605 C = ConstantVector::getSplat(NumElts, C);
14606 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14607 SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy());
14608 unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
14609 SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
14610 MachinePointerInfo::getConstantPool(),
14611 false, false, false, Alignment);
14613 if (VT.isVector()) {
14614 // For a vector, cast operands to a vector type, perform the logic op,
14615 // and cast the result back to the original value type.
14616 MVT VecVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
14617 SDValue MaskCasted = DAG.getNode(ISD::BITCAST, dl, VecVT, Mask);
14618 SDValue Operand = IsFNABS ?
14619 DAG.getNode(ISD::BITCAST, dl, VecVT, Op0.getOperand(0)) :
14620 DAG.getNode(ISD::BITCAST, dl, VecVT, Op0);
14621 unsigned BitOp = IsFABS ? ISD::AND : IsFNABS ? ISD::OR : ISD::XOR;
14622 return DAG.getNode(ISD::BITCAST, dl, VT,
14623 DAG.getNode(BitOp, dl, VecVT, Operand, MaskCasted));
14626 // If not vector, then scalar.
14627 unsigned BitOp = IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
14628 SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
14629 return DAG.getNode(BitOp, dl, VT, Operand, Mask);
14632 static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
14633 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14634 LLVMContext *Context = DAG.getContext();
14635 SDValue Op0 = Op.getOperand(0);
14636 SDValue Op1 = Op.getOperand(1);
14638 MVT VT = Op.getSimpleValueType();
14639 MVT SrcVT = Op1.getSimpleValueType();
14641 // If second operand is smaller, extend it first.
14642 if (SrcVT.bitsLT(VT)) {
14643 Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1);
14646 // And if it is bigger, shrink it first.
14647 if (SrcVT.bitsGT(VT)) {
14648 Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1));
14652 // At this point the operands and the result should have the same
14653 // type, and that won't be f80 since that is not custom lowered.
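// Added commentary: the lowering below computes
//   SignBit = Op1 & sign-mask     (keep only the sign of the sign operand)
//   Val     = Op0 & ~sign-mask    (clear the sign of the magnitude operand,
//                                  folded into the constant when possible)
//   result  = Val | SignBit
// using 16-byte constant-pool masks so the mask loads can fold into the
// FAND/FOR nodes.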
14655 const fltSemantics &Sem =
14656 VT == MVT::f64 ? APFloat::IEEEdouble : APFloat::IEEEsingle;
14657 const unsigned SizeInBits = VT.getSizeInBits();
14659 SmallVector<Constant *, 4> CV(
14660 VT == MVT::f64 ? 2 : 4,
14661 ConstantFP::get(*Context, APFloat(Sem, APInt(SizeInBits, 0))));
14663 // First, clear all bits but the sign bit from the second operand (sign).
14664 CV[0] = ConstantFP::get(*Context,
14665 APFloat(Sem, APInt::getHighBitsSet(SizeInBits, 1)));
14666 Constant *C = ConstantVector::get(CV);
14667 SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(), 16);
14668 SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx,
14669 MachinePointerInfo::getConstantPool(),
14670 false, false, false, 16);
14671 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1);
14673 // Next, clear the sign bit from the first operand (magnitude).
14674 // If it's a constant, we can clear it here.
14675 if (ConstantFPSDNode *Op0CN = dyn_cast<ConstantFPSDNode>(Op0)) {
14676 APFloat APF = Op0CN->getValueAPF();
14677 // If the magnitude is a positive zero, the sign bit alone is enough.
14678 if (APF.isPosZero())
14681 CV[0] = ConstantFP::get(*Context, APF);
14683 CV[0] = ConstantFP::get(
14685 APFloat(Sem, APInt::getLowBitsSet(SizeInBits, SizeInBits - 1)));
14687 C = ConstantVector::get(CV);
14688 CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(), 16);
14689 SDValue Val = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
14690 MachinePointerInfo::getConstantPool(),
14691 false, false, false, 16);
14692 // If the magnitude operand wasn't a constant, we need to AND out the sign.
14693 if (!isa<ConstantFPSDNode>(Op0))
14694 Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Val);
14696 // OR the magnitude value with the sign bit.
14697 return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit);
14700 static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
14701 SDValue N0 = Op.getOperand(0);
14703 MVT VT = Op.getSimpleValueType();
14705 // Lower ISD::FGETSIGN to (AND (X86ISD::FGETSIGNx86 ...) 1).
14706 SDValue xFGETSIGN = DAG.getNode(X86ISD::FGETSIGNx86, dl, VT, N0,
14707 DAG.getConstant(1, VT));
14708 return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, VT));
14711 // Check whether an OR'd tree is PTEST-able.
14712 static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget *Subtarget,
14713 SelectionDAG &DAG) {
14714 assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
14716 if (!Subtarget->hasSSE41())
14719 if (!Op->hasOneUse())
14722 SDNode *N = Op.getNode();
14725 SmallVector<SDValue, 8> Opnds;
14726 DenseMap<SDValue, unsigned> VecInMap;
14727 SmallVector<SDValue, 8> VecIns;
14728 EVT VT = MVT::Other;
14730 // Recognize a special case where a vector is cast into a wide integer to test all zeros.
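// (Added commentary) For example, IR along the lines of
//   %b = bitcast <2 x i64> %v to i128
//   %c = icmp eq i128 %b, 0
// typically reaches this point as an OR tree over EXTRACT_VECTOR_ELT nodes;
// if every element of each source vector is covered, the whole tree can be
// replaced by a single PTEST of the vector(s) against themselves.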
14732 Opnds.push_back(N->getOperand(0));
14733 Opnds.push_back(N->getOperand(1));
14735 for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
14736 SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
14737 // BFS traverse all OR'd operands.
14738 if (I->getOpcode() == ISD::OR) {
14739 Opnds.push_back(I->getOperand(0));
14740 Opnds.push_back(I->getOperand(1));
14741 // Re-evaluate the number of nodes to be traversed.
14742 e += 2; // 2 more nodes (LHS and RHS) are pushed.
14746 // Quit if the operand is not an EXTRACT_VECTOR_ELT.
14747 if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
14750 // Quit if the index is not a constant.
14751 SDValue Idx = I->getOperand(1);
14752 if (!isa<ConstantSDNode>(Idx))
14755 SDValue ExtractedFromVec = I->getOperand(0);
14756 DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);
14757 if (M == VecInMap.end()) {
14758 VT = ExtractedFromVec.getValueType();
14759 // Quit if not 128/256-bit vector.
14760 if (!VT.is128BitVector() && !VT.is256BitVector())
14762 // Quit if not the same type.
14763 if (VecInMap.begin() != VecInMap.end() &&
14764 VT != VecInMap.begin()->first.getValueType())
14766 M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
14767 VecIns.push_back(ExtractedFromVec);
14769 M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
14772 assert((VT.is128BitVector() || VT.is256BitVector()) &&
14773 "Not extracted from 128-/256-bit vector.");
14775 unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;
14777 for (DenseMap<SDValue, unsigned>::const_iterator
14778 I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
14779 // Quit if not all elements are used.
14780 if (I->second != FullMask)
14784 EVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
14786 // Cast all vectors into TestVT for PTEST.
14787 for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
14788 VecIns[i] = DAG.getNode(ISD::BITCAST, DL, TestVT, VecIns[i]);
14790 // If more than one full vector is evaluated, OR them together before the PTEST.
14791 for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
14792 // Each iteration will OR 2 nodes and append the result until there is only
14793 // 1 node left, i.e. the final OR'd value of all vectors.
14794 SDValue LHS = VecIns[Slot];
14795 SDValue RHS = VecIns[Slot + 1];
14796 VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
14799 return DAG.getNode(X86ISD::PTEST, DL, MVT::i32,
14800 VecIns.back(), VecIns.back());
14803 /// \brief Return true if \c Op has a use that doesn't just read flags.
14804 static bool hasNonFlagsUse(SDValue Op) {
14805 for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
14807 SDNode *User = *UI;
14808 unsigned UOpNo = UI.getOperandNo();
14809 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
14810 // Look past the truncate.
14811 UOpNo = User->use_begin().getOperandNo();
14812 User = *User->use_begin();
14815 if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
14816 !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
14822 /// Emit nodes that will be selected as "test Op0,Op0", or something equivalent.
14824 SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl,
14825 SelectionDAG &DAG) const {
14826 if (Op.getValueType() == MVT::i1)
14827 // KORTEST instruction should be selected
14828 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
14829 DAG.getConstant(0, Op.getValueType()));
14831 // CF and OF aren't always set the way we want. Determine which
14832 // of these we need.
14833 bool NeedCF = false;
14834 bool NeedOF = false;
14837 case X86::COND_A: case X86::COND_AE:
14838 case X86::COND_B: case X86::COND_BE:
14841 case X86::COND_G: case X86::COND_GE:
14842 case X86::COND_L: case X86::COND_LE:
14843 case X86::COND_O: case X86::COND_NO: {
14844 // Check if we really need to set the
14845 // Overflow flag. If NoSignedWrap is present
14846 // that is not actually needed.
14847 switch (Op->getOpcode()) {
14852 const BinaryWithFlagsSDNode *BinNode =
14853 cast<BinaryWithFlagsSDNode>(Op.getNode());
14854 if (BinNode->hasNoSignedWrap())
14864 // See if we can use the EFLAGS value from the operand instead of
14865 // doing a separate TEST. TEST always sets OF and CF to 0, so unless
14866 // we prove that the arithmetic won't overflow, we can't use OF or CF.
14867 if (Op.getResNo() != 0 || NeedOF || NeedCF) {
14868 // Emit a CMP with 0, which is the TEST pattern.
14869 //if (Op.getValueType() == MVT::i1)
14870 // return DAG.getNode(X86ISD::CMP, dl, MVT::i1, Op,
14871 // DAG.getConstant(0, MVT::i1));
14872 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
14873 DAG.getConstant(0, Op.getValueType()));
14875 unsigned Opcode = 0;
14876 unsigned NumOperands = 0;
14878 // Truncate operations may prevent the merge of the SETCC instruction
14879 // and the arithmetic instruction before it. Attempt to truncate the operands
14880 // of the arithmetic instruction and use a reduced bit-width instruction.
14881 bool NeedTruncation = false;
14882 SDValue ArithOp = Op;
14883 if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
14884 SDValue Arith = Op->getOperand(0);
14885 // Both the trunc and the arithmetic op need to have one user each.
14886 if (Arith->hasOneUse())
14887 switch (Arith.getOpcode()) {
14894 NeedTruncation = true;
14900 // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
14901 // which may be the result of a CAST. We use the variable 'Op', which is the
14902 // non-casted variable when we check for possible users.
14903 switch (ArithOp.getOpcode()) {
14905 // Due to an isel shortcoming, be conservative if this add is likely to be
14906 // selected as part of a load-modify-store instruction. When the root node
14907 // in a match is a store, isel doesn't know how to remap non-chain non-flag
14908 // uses of other nodes in the match, such as the ADD in this case. This
14909 // leads to the ADD being left around and reselected, with the result being
14910 // two adds in the output. Alas, even if none of our users are stores, that
14911 // doesn't prove we're O.K. Ergo, if we have any parents that aren't
14912 // CopyToReg or SETCC, eschew INC/DEC. A better fix seems to require
14913 // climbing the DAG back to the root, and it doesn't seem to be worth the effort.
14915 for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
14916 UE = Op.getNode()->use_end(); UI != UE; ++UI)
14917 if (UI->getOpcode() != ISD::CopyToReg &&
14918 UI->getOpcode() != ISD::SETCC &&
14919 UI->getOpcode() != ISD::STORE)
14922 if (ConstantSDNode *C =
14923 dyn_cast<ConstantSDNode>(ArithOp.getNode()->getOperand(1))) {
14924 // An add of one will be selected as an INC.
14925 if (C->getAPIntValue() == 1 && !Subtarget->slowIncDec()) {
14926 Opcode = X86ISD::INC;
14931 // An add of negative one (subtract of one) will be selected as a DEC.
14932 if (C->getAPIntValue().isAllOnesValue() && !Subtarget->slowIncDec()) {
14933 Opcode = X86ISD::DEC;
14939 // Otherwise use a regular EFLAGS-setting add.
14940 Opcode = X86ISD::ADD;
14945 // If we have a constant logical shift that's only used in a comparison
14946 // against zero turn it into an equivalent AND. This allows turning it into
14947 // a TEST instruction later.
14948 if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) && Op->hasOneUse() &&
14949 isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) {
14950 EVT VT = Op.getValueType();
14951 unsigned BitWidth = VT.getSizeInBits();
14952 unsigned ShAmt = Op->getConstantOperandVal(1);
14953 if (ShAmt >= BitWidth) // Avoid undefined shifts.
14955 APInt Mask = ArithOp.getOpcode() == ISD::SRL
14956 ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt)
14957 : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);
14958 if (!Mask.isSignedIntN(32)) // Avoid large immediates.
14960 SDValue New = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0),
14961 DAG.getConstant(Mask, VT));
14962 DAG.ReplaceAllUsesWith(Op, New);
14968 // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
14969 // because a TEST instruction will be better.
14970 if (!hasNonFlagsUse(Op))
14976 // Due to the ISEL shortcoming noted above, be conservative if this op is
14977 // likely to be selected as part of a load-modify-store instruction.
14978 for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
14979 UE = Op.getNode()->use_end(); UI != UE; ++UI)
14980 if (UI->getOpcode() == ISD::STORE)
14983 // Otherwise use a regular EFLAGS-setting instruction.
14984 switch (ArithOp.getOpcode()) {
14985 default: llvm_unreachable("unexpected operator!");
14986 case ISD::SUB: Opcode = X86ISD::SUB; break;
14987 case ISD::XOR: Opcode = X86ISD::XOR; break;
14988 case ISD::AND: Opcode = X86ISD::AND; break;
14990 if (!NeedTruncation && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
14991 SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG);
14992 if (EFLAGS.getNode())
14995 Opcode = X86ISD::OR;
15009 return SDValue(Op.getNode(), 1);
15015 // If we found that truncation is beneficial, perform the truncation and use the narrower operation below.
15017 if (NeedTruncation) {
15018 EVT VT = Op.getValueType();
15019 SDValue WideVal = Op->getOperand(0);
15020 EVT WideVT = WideVal.getValueType();
15021 unsigned ConvertedOp = 0;
15022 // Use a target machine opcode to prevent further DAGCombine
15023 // optimizations that may separate the arithmetic operations
15024 // from the setcc node.
15025 switch (WideVal.getOpcode()) {
15027 case ISD::ADD: ConvertedOp = X86ISD::ADD; break;
15028 case ISD::SUB: ConvertedOp = X86ISD::SUB; break;
15029 case ISD::AND: ConvertedOp = X86ISD::AND; break;
15030 case ISD::OR: ConvertedOp = X86ISD::OR; break;
15031 case ISD::XOR: ConvertedOp = X86ISD::XOR; break;
15035 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15036 if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
15037 SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
15038 SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
15039 Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1);
15045 // Emit a CMP with 0, which is the TEST pattern.
15046 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
15047 DAG.getConstant(0, Op.getValueType()));
15049 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
15050 SmallVector<SDValue, 4> Ops;
15051 for (unsigned i = 0; i != NumOperands; ++i)
15052 Ops.push_back(Op.getOperand(i));
15054 SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
15055 DAG.ReplaceAllUsesWith(Op, New);
15056 return SDValue(New.getNode(), 1);
15059 /// Emit nodes that will be selected as "cmp Op0,Op1", or something equivalent.
15061 SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
15062 SDLoc dl, SelectionDAG &DAG) const {
15063 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1)) {
15064 if (C->getAPIntValue() == 0)
15065 return EmitTest(Op0, X86CC, dl, DAG);
15067 if (Op0.getValueType() == MVT::i1)
15068 llvm_unreachable("Unexpected comparison operation for MVT::i1 operands");
15071 if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
15072 Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
15073 // Do the comparison at i32 if the type is smaller, except in the Atom case.
15074 // This avoids subregister aliasing issues. Keep the smaller reference
15075 // if we're optimizing for size, however, as that'll allow better folding
15076 // of memory operations.
15077 if (Op0.getValueType() != MVT::i32 && Op0.getValueType() != MVT::i64 &&
15078 !DAG.getMachineFunction().getFunction()->getAttributes().hasAttribute(
15079 AttributeSet::FunctionIndex, Attribute::MinSize) &&
15080 !Subtarget->isAtom()) {
15081 unsigned ExtendOp =
15082 isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
15083 Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0);
15084 Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1);
15086 // Use SUB instead of CMP to enable CSE between SUB and CMP.
15087 SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
15088 SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs,
15090 return SDValue(Sub.getNode(), 1);
15092 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
15095 /// Convert a comparison if required by the subtarget.
15096 SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
15097 SelectionDAG &DAG) const {
15098 // If the subtarget does not support the FUCOMI instruction, floating-point
15099 // comparisons have to be converted.
15100 if (Subtarget->hasCMov() ||
15101 Cmp.getOpcode() != X86ISD::CMP ||
15102 !Cmp.getOperand(0).getValueType().isFloatingPoint() ||
15103 !Cmp.getOperand(1).getValueType().isFloatingPoint())
15106 // The instruction selector will select an FUCOM instruction instead of
15107 // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
15108 // build an SDNode sequence that transfers the result from FPSW into EFLAGS:
15109 // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
15111 SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
15112 SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
15113 SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
15114 DAG.getConstant(8, MVT::i8));
15115 SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
15116 return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
15119 /// The minimum architected relative accuracy is 2^-12. We need one
15120 /// Newton-Raphson step to have a good float result (24 bits of precision).
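/// (Added note) A Newton-Raphson step for 1/sqrt(a) is
///   x1 = x0 * (1.5 - 0.5 * a * x0 * x0)
/// and the RefinementSteps value set below tells the caller how many such
/// steps to apply after the FRSQRT estimate.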
15121 SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op,
15122 DAGCombinerInfo &DCI,
15123 unsigned &RefinementSteps,
15124 bool &UseOneConstNR) const {
15125 // FIXME: We should use instruction latency models to calculate the cost of
15126 // each potential sequence, but this is very hard to do reliably because
15127 // at least Intel's Core* chips have variable timing based on the number of
15128 // significant digits in the divisor and/or sqrt operand.
15129 if (!Subtarget->useSqrtEst())
15132 EVT VT = Op.getValueType();
15134 // SSE1 has rsqrtss and rsqrtps.
15135 // TODO: Add support for AVX512 (v16f32).
15136 // It is likely not profitable to do this for f64 because a double-precision
15137 // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
15138 // instructions: convert to single, rsqrtss, convert back to double, refine
15139 // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
15140 // along with FMA, this could be a throughput win.
15141 if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
15142 (Subtarget->hasAVX() && VT == MVT::v8f32)) {
15143 RefinementSteps = 1;
15144 UseOneConstNR = false;
15145 return DCI.DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
15150 /// The minimum architected relative accuracy is 2^-12. We need one
15151 /// Newton-Raphson step to have a good float result (24 bits of precision).
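/// (Added note) A Newton-Raphson step for 1/a is
///   x1 = x0 * (2 - a * x0)
/// and the RefinementSteps value set below tells the caller how many such
/// steps to apply after the FRCP estimate.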
15152 SDValue X86TargetLowering::getRecipEstimate(SDValue Op,
15153 DAGCombinerInfo &DCI,
15154 unsigned &RefinementSteps) const {
15155 // FIXME: We should use instruction latency models to calculate the cost of
15156 // each potential sequence, but this is very hard to do reliably because
15157 // at least Intel's Core* chips have variable timing based on the number of
15158 // significant digits in the divisor.
15159 if (!Subtarget->useReciprocalEst())
15162 EVT VT = Op.getValueType();
15164 // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
15165 // TODO: Add support for AVX512 (v16f32).
15166 // It is likely not profitable to do this for f64 because a double-precision
15167 // reciprocal estimate with refinement on x86 prior to FMA requires
15168 // 15 instructions: convert to single, rcpss, convert back to double, refine
15169 // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
15170 // along with FMA, this could be a throughput win.
15171 if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
15172 (Subtarget->hasAVX() && VT == MVT::v8f32)) {
15173 RefinementSteps = ReciprocalEstimateRefinementSteps;
15174 return DCI.DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
15179 static bool isAllOnes(SDValue V) {
15180 ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
15181 return C && C->isAllOnesValue();
15184 /// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node
15185 /// if it's possible.
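/// (Added note) For example, (and %x, (shl 1, %n)) compared against zero
/// becomes (BT %x, %n) followed by a SETCC on the carry flag: COND_AE for
/// the == 0 case and COND_B for the != 0 case, matching the code below.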
15186 SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
15187 SDLoc dl, SelectionDAG &DAG) const {
15188 SDValue Op0 = And.getOperand(0);
15189 SDValue Op1 = And.getOperand(1);
15190 if (Op0.getOpcode() == ISD::TRUNCATE)
15191 Op0 = Op0.getOperand(0);
15192 if (Op1.getOpcode() == ISD::TRUNCATE)
15193 Op1 = Op1.getOperand(0);
15196 if (Op1.getOpcode() == ISD::SHL)
15197 std::swap(Op0, Op1);
15198 if (Op0.getOpcode() == ISD::SHL) {
15199 if (ConstantSDNode *And00C = dyn_cast<ConstantSDNode>(Op0.getOperand(0)))
15200 if (And00C->getZExtValue() == 1) {
15201 // If we looked past a truncate, check that it's only truncating away sign bits.
15203 unsigned BitWidth = Op0.getValueSizeInBits();
15204 unsigned AndBitWidth = And.getValueSizeInBits();
15205 if (BitWidth > AndBitWidth) {
15207 DAG.computeKnownBits(Op0, Zeros, Ones);
15208 if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth)
15212 RHS = Op0.getOperand(1);
15214 } else if (Op1.getOpcode() == ISD::Constant) {
15215 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
15216 uint64_t AndRHSVal = AndRHS->getZExtValue();
15217 SDValue AndLHS = Op0;
15219 if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
15220 LHS = AndLHS.getOperand(0);
15221 RHS = AndLHS.getOperand(1);
15224 // Use BT if the immediate can't be encoded in a TEST instruction.
15225 if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) {
15227 RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), LHS.getValueType());
15231 if (LHS.getNode()) {
15232 // If LHS is i8, promote it to i32 with any_extend. There is no i8 BT
15233 // instruction. Since the shift amount is in-range-or-undefined, we know
15234 // that doing a bittest on the i32 value is ok. We extend to i32 because
15235 // the encoding for the i16 version is larger than the i32 version.
15236 // Also promote i16 to i32 for performance / code size reason.
15237 if (LHS.getValueType() == MVT::i8 ||
15238 LHS.getValueType() == MVT::i16)
15239 LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);
15241 // If the operand types disagree, extend the shift amount to match. Since
15242 // BT ignores high bits (like shifts) we can use anyextend.
15243 if (LHS.getValueType() != RHS.getValueType())
15244 RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS);
15246 SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS);
15247 X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
15248 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
15249 DAG.getConstant(Cond, MVT::i8), BT);
15255 /// \brief Turn an ISD::CondCode into the immediate value used by SSE floating-point compares, swapping the operands when necessary.
15257 static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
15262 // SSE Condition code mapping: 0 - EQ, 1 - LT, 2 - LE, 3 - UNORD, 4 - NEQ, 5 - NLT, 6 - NLE, 7 - ORD.
15271 switch (SetCCOpcode) {
15272 default: llvm_unreachable("Unexpected SETCC condition");
15274 case ISD::SETEQ: SSECC = 0; break;
15276 case ISD::SETGT: Swap = true; // Fallthrough
15278 case ISD::SETOLT: SSECC = 1; break;
15280 case ISD::SETGE: Swap = true; // Fallthrough
15282 case ISD::SETOLE: SSECC = 2; break;
15283 case ISD::SETUO: SSECC = 3; break;
15285 case ISD::SETNE: SSECC = 4; break;
15286 case ISD::SETULE: Swap = true; // Fallthrough
15287 case ISD::SETUGE: SSECC = 5; break;
15288 case ISD::SETULT: Swap = true; // Fallthrough
15289 case ISD::SETUGT: SSECC = 6; break;
15290 case ISD::SETO: SSECC = 7; break;
15292 case ISD::SETONE: SSECC = 8; break;
15295 std::swap(Op0, Op1);
15300 // Lower256IntVSETCC - Break a 256-bit integer VSETCC into two new 128-bit
15301 // ones, and then concatenate the result back.
15302 static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
15303 MVT VT = Op.getSimpleValueType();
15305 assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
15306 "Unsupported value type for operation");
15308 unsigned NumElems = VT.getVectorNumElements();
15310 SDValue CC = Op.getOperand(2);
15312 // Extract the LHS vectors
15313 SDValue LHS = Op.getOperand(0);
15314 SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
15315 SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
15317 // Extract the RHS vectors
15318 SDValue RHS = Op.getOperand(1);
15319 SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl);
15320 SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl);
15322 // Issue the operation on the smaller types and concatenate the result back
15323 MVT EltVT = VT.getVectorElementType();
15324 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
15325 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
15326 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
15327 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
15330 static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG,
15331 const X86Subtarget *Subtarget) {
15332 SDValue Op0 = Op.getOperand(0);
15333 SDValue Op1 = Op.getOperand(1);
15334 SDValue CC = Op.getOperand(2);
15335 MVT VT = Op.getSimpleValueType();
15338 assert(Op0.getValueType().getVectorElementType().getSizeInBits() >= 8 &&
15339 Op.getValueType().getScalarType() == MVT::i1 &&
15340 "Cannot set masked compare for this operation");
15342 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
15344 bool Unsigned = false;
15347 switch (SetCCOpcode) {
15348 default: llvm_unreachable("Unexpected SETCC condition");
15349 case ISD::SETNE: SSECC = 4; break;
15350 case ISD::SETEQ: Opc = X86ISD::PCMPEQM; break;
15351 case ISD::SETUGT: SSECC = 6; Unsigned = true; break;
15352 case ISD::SETLT: Swap = true; //fall-through
15353 case ISD::SETGT: Opc = X86ISD::PCMPGTM; break;
15354 case ISD::SETULT: SSECC = 1; Unsigned = true; break;
15355 case ISD::SETUGE: SSECC = 5; Unsigned = true; break; //NLT
15356 case ISD::SETGE: Swap = true; SSECC = 2; break; // LE + swap
15357 case ISD::SETULE: Unsigned = true; //fall-through
15358 case ISD::SETLE: SSECC = 2; break;
15362 std::swap(Op0, Op1);
15364 return DAG.getNode(Opc, dl, VT, Op0, Op1);
15365 Opc = Unsigned ? X86ISD::CMPMU: X86ISD::CMPM;
15366 return DAG.getNode(Opc, dl, VT, Op0, Op1,
15367 DAG.getConstant(SSECC, MVT::i8));
15370 /// \brief Try to turn a VSETULT into a VSETULE by modifying its second
15371 /// operand \p Op1. If non-trivial (for example because it's not constant)
15372 /// return an empty value.
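/// (Added note) For example, x <u <i32 7, 7, 7, 7> becomes
/// x <=u <i32 6, 6, 6, 6>; the transformation is rejected if any element is
/// zero, since subtracting one from it would wrap.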
15373 static SDValue ChangeVSETULTtoVSETULE(SDLoc dl, SDValue Op1, SelectionDAG &DAG)
15375 BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1.getNode());
15379 MVT VT = Op1.getSimpleValueType();
15380 MVT EVT = VT.getVectorElementType();
15381 unsigned n = VT.getVectorNumElements();
15382 SmallVector<SDValue, 8> ULTOp1;
15384 for (unsigned i = 0; i < n; ++i) {
15385 ConstantSDNode *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
15386 if (!Elt || Elt->isOpaque() || Elt->getValueType(0) != EVT)
15389 // Avoid underflow.
15390 APInt Val = Elt->getAPIntValue();
15394 ULTOp1.push_back(DAG.getConstant(Val - 1, EVT));
15397 return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, ULTOp1);
15400 static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
15401 SelectionDAG &DAG) {
15402 SDValue Op0 = Op.getOperand(0);
15403 SDValue Op1 = Op.getOperand(1);
15404 SDValue CC = Op.getOperand(2);
15405 MVT VT = Op.getSimpleValueType();
15406 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
15407 bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
15412 MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
15413 assert(EltVT == MVT::f32 || EltVT == MVT::f64);
15416 unsigned SSECC = translateX86FSETCC(SetCCOpcode, Op0, Op1);
15417 unsigned Opc = X86ISD::CMPP;
15418 if (Subtarget->hasAVX512() && VT.getVectorElementType() == MVT::i1) {
15419 assert(VT.getVectorNumElements() <= 16);
15420 Opc = X86ISD::CMPM;
15422 // In the two special cases we can't handle, emit two comparisons.
15425 unsigned CombineOpc;
15426 if (SetCCOpcode == ISD::SETUEQ) {
15427 CC0 = 3; CC1 = 0; CombineOpc = ISD::OR;
15429 assert(SetCCOpcode == ISD::SETONE);
15430 CC0 = 7; CC1 = 4; CombineOpc = ISD::AND;
15433 SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
15434 DAG.getConstant(CC0, MVT::i8));
15435 SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
15436 DAG.getConstant(CC1, MVT::i8));
15437 return DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
15439 // Handle all other FP comparisons here.
15440 return DAG.getNode(Opc, dl, VT, Op0, Op1,
15441 DAG.getConstant(SSECC, MVT::i8));
15444 // Break 256-bit integer vector compare into smaller ones.
15445 if (VT.is256BitVector() && !Subtarget->hasInt256())
15446 return Lower256IntVSETCC(Op, DAG);
15448 bool MaskResult = (VT.getVectorElementType() == MVT::i1);
15449 EVT OpVT = Op1.getValueType();
15450 if (Subtarget->hasAVX512()) {
15451 if (Op1.getValueType().is512BitVector() ||
15452 (Subtarget->hasBWI() && Subtarget->hasVLX()) ||
15453 (MaskResult && OpVT.getVectorElementType().getSizeInBits() >= 32))
15454 return LowerIntVSETCC_AVX512(Op, DAG, Subtarget);
15456 // In the AVX-512 architecture, setcc returns a mask with i1 elements,
15457 // but there is no compare instruction for i8 and i16 elements in KNL.
15458 // We are not talking about 512-bit operands in this case; these
15459 // types are illegal.
15461 (OpVT.getVectorElementType().getSizeInBits() < 32 &&
15462 OpVT.getVectorElementType().getSizeInBits() >= 8))
15463 return DAG.getNode(ISD::TRUNCATE, dl, VT,
15464 DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC));
15467 // We are handling one of the integer comparisons here. Since SSE only has
15468 // GT and EQ comparisons for integers, swapping operands and multiple
15469 // operations may be required for some comparisons.
15471 bool Swap = false, Invert = false, FlipSigns = false, MinMax = false;
15472 bool Subus = false;
15474 switch (SetCCOpcode) {
15475 default: llvm_unreachable("Unexpected SETCC condition");
15476 case ISD::SETNE: Invert = true;
15477 case ISD::SETEQ: Opc = X86ISD::PCMPEQ; break;
15478 case ISD::SETLT: Swap = true;
15479 case ISD::SETGT: Opc = X86ISD::PCMPGT; break;
15480 case ISD::SETGE: Swap = true;
15481 case ISD::SETLE: Opc = X86ISD::PCMPGT;
15482 Invert = true; break;
15483 case ISD::SETULT: Swap = true;
15484 case ISD::SETUGT: Opc = X86ISD::PCMPGT;
15485 FlipSigns = true; break;
15486 case ISD::SETUGE: Swap = true;
15487 case ISD::SETULE: Opc = X86ISD::PCMPGT;
15488 FlipSigns = true; Invert = true; break;
15491 // Special case: Use min/max operations for SETULE/SETUGE
15492 MVT VET = VT.getVectorElementType();
15494 (Subtarget->hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32))
15495 || (Subtarget->hasSSE2() && (VET == MVT::i8));
15498 switch (SetCCOpcode) {
15500 case ISD::SETULE: Opc = X86ISD::UMIN; MinMax = true; break;
15501 case ISD::SETUGE: Opc = X86ISD::UMAX; MinMax = true; break;
15504 if (MinMax) { Swap = false; Invert = false; FlipSigns = false; }
15507 bool hasSubus = Subtarget->hasSSE2() && (VET == MVT::i8 || VET == MVT::i16);
15508 if (!MinMax && hasSubus) {
15509 // As another special case, use PSUBUS[BW] when it's profitable. E.g. for
15511 // t = psubus Op0, Op1
15512 // pcmpeq t, <0..0>
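// (Added commentary) PSUBUS is an unsigned saturating subtract, so
// (psubus a, b) is all-zero exactly in the lanes where a <=u b; comparing
// that result against zero with PCMPEQ therefore implements SETULE
// directly, and SETUGE after swapping the operands.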
15513 switch (SetCCOpcode) {
15515 case ISD::SETULT: {
15516 // If the comparison is against a constant we can turn this into a
15517 // setule. With psubus, setule does not require a swap. This is
15518 // beneficial because the constant in the register is no longer
15519 // clobbered as the destination, so it can be hoisted out of a loop.
15520 // Only do this pre-AVX since vpcmp* is no longer destructive.
15521 if (Subtarget->hasAVX())
15523 SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG);
15524 if (ULEOp1.getNode()) {
15526 Subus = true; Invert = false; Swap = false;
15530 // Psubus is better than flip-sign because it requires no inversion.
15531 case ISD::SETUGE: Subus = true; Invert = false; Swap = true; break;
15532 case ISD::SETULE: Subus = true; Invert = false; Swap = false; break;
15536 Opc = X86ISD::SUBUS;
15542 std::swap(Op0, Op1);
15544 // Check that the operation in question is available (most are plain SSE2,
15545 // but PCMPGTQ and PCMPEQQ have different requirements).
15546 if (VT == MVT::v2i64) {
15547 if (Opc == X86ISD::PCMPGT && !Subtarget->hasSSE42()) {
15548 assert(Subtarget->hasSSE2() && "Don't know how to lower!");
15550 // First cast everything to the right type.
15551 Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0);
15552 Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1);
15554 // Since SSE has no unsigned integer comparisons, we need to flip the sign
15555 // bits of the inputs before performing those operations. The lower
15556 // compare is always unsigned.
15559 SB = DAG.getConstant(0x80000000U, MVT::v4i32);
15561 SDValue Sign = DAG.getConstant(0x80000000U, MVT::i32);
15562 SDValue Zero = DAG.getConstant(0x00000000U, MVT::i32);
15563 SB = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
15564 Sign, Zero, Sign, Zero);
15566 Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB);
15567 Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB);
15569 // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
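    // Note (added for clarity; assumes the usual little-endian dword layout):
    // after the v2i64 -> v4i32 bitcasts, dwords {1, 3} hold the high halves and
    // {0, 2} the low halves of the two 64-bit lanes, which is why MaskHi/MaskLo
    // below broadcast exactly those positions.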
15570 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
15571 SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
15573 // Create masks for only the low parts/high parts of the 64 bit integers.
15574 static const int MaskHi[] = { 1, 1, 3, 3 };
15575 static const int MaskLo[] = { 0, 0, 2, 2 };
15576 SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
15577 SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
15578 SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
15580 SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
15581 Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
15584 Result = DAG.getNOT(dl, Result, MVT::v4i32);
15586 return DAG.getNode(ISD::BITCAST, dl, VT, Result);
15589 if (Opc == X86ISD::PCMPEQ && !Subtarget->hasSSE41()) {
15590 // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
15591 // pcmpeqd + pshufd + pand.
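    // Illustrative instruction sequence (a sketch added here, not from the
    // original source) for "icmp eq <2 x i64> %a, %b" without SSE4.1:
    //   pcmpeqd %xmm1, %xmm0        ; compare as four 32-bit lanes
    //   pshufd  $0xB1, %xmm0, %xmm1 ; swap dword pairs, i.e. mask {1, 0, 3, 2}
    //   pand    %xmm1, %xmm0        ; a 64-bit lane is all-ones only if both
    //                               ; of its 32-bit halves compared equal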
15592 assert(Subtarget->hasSSE2() && !FlipSigns && "Don't know how to lower!");
15594 // First cast everything to the right type.
15595 Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0);
15596 Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1);
15599 SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
15601 // Make sure the lower and upper halves are both all-ones.
15602 static const int Mask[] = { 1, 0, 3, 2 };
15603 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
15604 Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
15607 Result = DAG.getNOT(dl, Result, MVT::v4i32);
15609 return DAG.getNode(ISD::BITCAST, dl, VT, Result);
15613 // Since SSE has no unsigned integer comparisons, we need to flip the sign
15614 // bits of the inputs before performing those operations.
15616 EVT EltVT = VT.getVectorElementType();
15617 SDValue SB = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), VT);
15618 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SB);
15619 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SB);
15622 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
15624 // If the logical-not of the result is required, perform that now.
15626 Result = DAG.getNOT(dl, Result, VT);
15629 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
15632 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
15633 getZeroVector(VT, Subtarget, DAG, dl));
15638 SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
15640 MVT VT = Op.getSimpleValueType();
15642 if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
15644 assert(((!Subtarget->hasAVX512() && VT == MVT::i8) || (VT == MVT::i1))
15645 && "SetCC type must be 8-bit or 1-bit integer");
15646 SDValue Op0 = Op.getOperand(0);
15647 SDValue Op1 = Op.getOperand(1);
15649 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
15651 // Optimize to BT if possible.
15652 // Lower (X & (1 << N)) == 0 to BT(X, N).
15653 // Lower ((X >>u N) & 1) != 0 to BT(X, N).
15654 // Lower ((X >>s N) & 1) != 0 to BT(X, N).
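  // For example (illustrative, not from the original comment): "(X & 32) == 0"
  // becomes "BT X, 5" followed by a SETAE on the carry flag, avoiding the
  // AND + TEST pair.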
15655 if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() &&
15656 Op1.getOpcode() == ISD::Constant &&
15657 cast<ConstantSDNode>(Op1)->isNullValue() &&
15658 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
15659 SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG);
15660 if (NewSetCC.getNode()) {
15662 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewSetCC);
15667   // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of these.
15669 if (Op1.getOpcode() == ISD::Constant &&
15670 (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 ||
15671 cast<ConstantSDNode>(Op1)->isNullValue()) &&
15672 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
15674 // If the input is a setcc, then reuse the input setcc or use a new one with
15675 // the inverted condition.
15676 if (Op0.getOpcode() == X86ISD::SETCC) {
15677 X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
15678 bool Invert = (CC == ISD::SETNE) ^
15679 cast<ConstantSDNode>(Op1)->isNullValue();
15683 CCode = X86::GetOppositeBranchCondition(CCode);
15684 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
15685 DAG.getConstant(CCode, MVT::i8),
15686 Op0.getOperand(1));
15688 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
15692 if ((Op0.getValueType() == MVT::i1) && (Op1.getOpcode() == ISD::Constant) &&
15693 (cast<ConstantSDNode>(Op1)->getZExtValue() == 1) &&
15694 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
15696 ISD::CondCode NewCC = ISD::getSetCCInverse(CC, true);
15697 return DAG.getSetCC(dl, VT, Op0, DAG.getConstant(0, MVT::i1), NewCC);
15700 bool isFP = Op1.getSimpleValueType().isFloatingPoint();
15701 unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG);
15702 if (X86CC == X86::COND_INVALID)
15705 SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG);
15706 EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
15707 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
15708 DAG.getConstant(X86CC, MVT::i8), EFLAGS);
15710 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
15714 // isX86LogicalCmp - Return true if the opcode is an X86 logical comparison.
15715 static bool isX86LogicalCmp(SDValue Op) {
15716 unsigned Opc = Op.getNode()->getOpcode();
15717 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
15718 Opc == X86ISD::SAHF)
15720 if (Op.getResNo() == 1 &&
15721 (Opc == X86ISD::ADD ||
15722 Opc == X86ISD::SUB ||
15723 Opc == X86ISD::ADC ||
15724 Opc == X86ISD::SBB ||
15725 Opc == X86ISD::SMUL ||
15726 Opc == X86ISD::UMUL ||
15727 Opc == X86ISD::INC ||
15728 Opc == X86ISD::DEC ||
15729 Opc == X86ISD::OR ||
15730 Opc == X86ISD::XOR ||
15731 Opc == X86ISD::AND))
15734 if (Op.getResNo() == 2 && Opc == X86ISD::UMUL)
15740 static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
15741 if (V.getOpcode() != ISD::TRUNCATE)
15744 SDValue VOp0 = V.getOperand(0);
15745 unsigned InBits = VOp0.getValueSizeInBits();
15746 unsigned Bits = V.getValueSizeInBits();
15747 return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
15750 SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
15751 bool addTest = true;
15752 SDValue Cond = Op.getOperand(0);
15753 SDValue Op1 = Op.getOperand(1);
15754 SDValue Op2 = Op.getOperand(2);
15756 EVT VT = Op1.getValueType();
15759 // Lower fp selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
15760 // are available. Otherwise fp cmovs get lowered into a less efficient branch
15761 // sequence later on.
15762 if (Cond.getOpcode() == ISD::SETCC &&
15763 ((Subtarget->hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) ||
15764 (Subtarget->hasSSE1() && VT == MVT::f32)) &&
15765 VT == Cond.getOperand(0).getValueType() && Cond->hasOneUse()) {
15766 SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
15767 int SSECC = translateX86FSETCC(
15768 cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);
15771 if (Subtarget->hasAVX512()) {
15772 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CondOp0, CondOp1,
15773 DAG.getConstant(SSECC, MVT::i8));
15774 return DAG.getNode(X86ISD::SELECT, DL, VT, Cmp, Op1, Op2);
15776 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
15777 DAG.getConstant(SSECC, MVT::i8));
15778 SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
15779 SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
15780 return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
15784 if (Cond.getOpcode() == ISD::SETCC) {
15785 SDValue NewCond = LowerSETCC(Cond, DAG);
15786 if (NewCond.getNode())
15790 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
15791 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
15792 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
15793 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
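  // For instance (sketch added for clarity), "(select (x == 0), -1, y)" is
  // lowered below roughly as:
  //   cmp x, 1        ; CF is set iff x == 0 (unsigned x < 1)
  //   sbb t, t        ; t = CF ? -1 : 0
  //   or  t, y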
15794 if (Cond.getOpcode() == X86ISD::SETCC &&
15795 Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
15796 isZero(Cond.getOperand(1).getOperand(1))) {
15797 SDValue Cmp = Cond.getOperand(1);
15799 unsigned CondCode =cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
15801 if ((isAllOnes(Op1) || isAllOnes(Op2)) &&
15802 (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
15803 SDValue Y = isAllOnes(Op2) ? Op1 : Op2;
15805 SDValue CmpOp0 = Cmp.getOperand(0);
15806 // Apply further optimizations for special cases
15807 // (select (x != 0), -1, 0) -> neg & sbb
15808 // (select (x == 0), 0, -1) -> neg & sbb
15809 if (ConstantSDNode *YC = dyn_cast<ConstantSDNode>(Y))
15810 if (YC->isNullValue() &&
15811 (isAllOnes(Op1) == (CondCode == X86::COND_NE))) {
15812 SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
15813 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs,
15814 DAG.getConstant(0, CmpOp0.getValueType()),
15816 SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
15817 DAG.getConstant(X86::COND_B, MVT::i8),
15818 SDValue(Neg.getNode(), 1));
15822 Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
15823 CmpOp0, DAG.getConstant(1, CmpOp0.getValueType()));
15824 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
15826 SDValue Res = // Res = 0 or -1.
15827 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
15828 DAG.getConstant(X86::COND_B, MVT::i8), Cmp);
15830 if (isAllOnes(Op1) != (CondCode == X86::COND_E))
15831 Res = DAG.getNOT(DL, Res, Res.getValueType());
15833 ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2);
15834 if (!N2C || !N2C->isNullValue())
15835 Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
15840 // Look past (and (setcc_carry (cmp ...)), 1).
15841 if (Cond.getOpcode() == ISD::AND &&
15842 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
15843 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
15844 if (C && C->getAPIntValue() == 1)
15845 Cond = Cond.getOperand(0);
15848   // If the condition flag is set by an X86ISD::CMP, then use it as the condition
15849 // setting operand in place of the X86ISD::SETCC.
15850 unsigned CondOpcode = Cond.getOpcode();
15851 if (CondOpcode == X86ISD::SETCC ||
15852 CondOpcode == X86ISD::SETCC_CARRY) {
15853 CC = Cond.getOperand(0);
15855 SDValue Cmp = Cond.getOperand(1);
15856 unsigned Opc = Cmp.getOpcode();
15857 MVT VT = Op.getSimpleValueType();
15859 bool IllegalFPCMov = false;
15860 if (VT.isFloatingPoint() && !VT.isVector() &&
15861 !isScalarFPTypeInSSEReg(VT)) // FPStack?
15862 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
15864 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
15865 Opc == X86ISD::BT) { // FIXME
15869 } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
15870 CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
15871 ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
15872 Cond.getOperand(0).getValueType() != MVT::i8)) {
15873 SDValue LHS = Cond.getOperand(0);
15874 SDValue RHS = Cond.getOperand(1);
15875 unsigned X86Opcode;
15878 switch (CondOpcode) {
15879 case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
15880 case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
15881 case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
15882 case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
15883 case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
15884 case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
15885 default: llvm_unreachable("unexpected overflowing operator");
15887 if (CondOpcode == ISD::UMULO)
15888 VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
15891 VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
15893 SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS);
15895 if (CondOpcode == ISD::UMULO)
15896 Cond = X86Op.getValue(2);
15898 Cond = X86Op.getValue(1);
15900 CC = DAG.getConstant(X86Cond, MVT::i8);
15905   // Look past the truncate if the high bits are known zero.
15906 if (isTruncWithZeroHighBitsInput(Cond, DAG))
15907 Cond = Cond.getOperand(0);
15909   // We know the result of AND is compared against zero. Try to match it to BT.
15911 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
15912 SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG);
15913 if (NewSetCC.getNode()) {
15914 CC = NewSetCC.getOperand(0);
15915 Cond = NewSetCC.getOperand(1);
15922 CC = DAG.getConstant(X86::COND_NE, MVT::i8);
15923 Cond = EmitTest(Cond, X86::COND_NE, DL, DAG);
15926 // a < b ? -1 : 0 -> RES = ~setcc_carry
15927 // a < b ? 0 : -1 -> RES = setcc_carry
15928 // a >= b ? -1 : 0 -> RES = setcc_carry
15929 // a >= b ? 0 : -1 -> RES = ~setcc_carry
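  // For example (illustrative): an unsigned "a < b ? -1 : 0" becomes
  //   cmp a, b   ; sets CF when a < b (unsigned)
  //   sbb r, r   ; r = CF ? -1 : 0
  // with no branch and no cmov.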
15930 if (Cond.getOpcode() == X86ISD::SUB) {
15931 Cond = ConvertCmpIfNecessary(Cond, DAG);
15932 unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
15934 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
15935 (isAllOnes(Op1) || isAllOnes(Op2)) && (isZero(Op1) || isZero(Op2))) {
15936 SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
15937 DAG.getConstant(X86::COND_B, MVT::i8), Cond);
15938 if (isAllOnes(Op1) != (CondCode == X86::COND_B))
15939 return DAG.getNOT(DL, Res, Res.getValueType());
15944   // X86 doesn't have an i8 cmov. If both operands are the result of a truncate,
15945 // widen the cmov and push the truncate through. This avoids introducing a new
15946 // branch during isel and doesn't add any extensions.
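  // For example (a sketch added for clarity):
  //   select i1 %c, (trunc i32 %a to i8), (trunc i32 %b to i8)
  // is emitted as a 32-bit X86ISD::CMOV of %a/%b whose result is then truncated
  // to i8, so a real CMOVcc can be used instead of a branch.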
15947 if (Op.getValueType() == MVT::i8 &&
15948 Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
15949 SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
15950 if (T1.getValueType() == T2.getValueType() &&
15951 // Blacklist CopyFromReg to avoid partial register stalls.
15952 T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
15953 SDVTList VTs = DAG.getVTList(T1.getValueType(), MVT::Glue);
15954 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VTs, T2, T1, CC, Cond);
15955 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
15959 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
15960 // condition is true.
15961 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
15962 SDValue Ops[] = { Op2, Op1, CC, Cond };
15963 return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops);
15966 static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, const X86Subtarget *Subtarget,
15967 SelectionDAG &DAG) {
15968 MVT VT = Op->getSimpleValueType(0);
15969 SDValue In = Op->getOperand(0);
15970 MVT InVT = In.getSimpleValueType();
15971 MVT VTElt = VT.getVectorElementType();
15972 MVT InVTElt = InVT.getVectorElementType();
15976 if ((InVTElt == MVT::i1) &&
15977 (((Subtarget->hasBWI() && Subtarget->hasVLX() &&
15978 VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() <= 16)) ||
15980 ((Subtarget->hasBWI() && VT.is512BitVector() &&
15981 VTElt.getSizeInBits() <= 16)) ||
15983 ((Subtarget->hasDQI() && Subtarget->hasVLX() &&
15984 VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() >= 32)) ||
15986 ((Subtarget->hasDQI() && VT.is512BitVector() &&
15987 VTElt.getSizeInBits() >= 32))))
15988 return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
15990 unsigned int NumElts = VT.getVectorNumElements();
15992 if (NumElts != 8 && NumElts != 16)
15995 if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1) {
15996 if (In.getOpcode() == X86ISD::VSEXT || In.getOpcode() == X86ISD::VZEXT)
15997 return DAG.getNode(In.getOpcode(), dl, VT, In.getOperand(0));
15998 return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
16001 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16002 assert (InVT.getVectorElementType() == MVT::i1 && "Unexpected vector type");
16004 MVT ExtVT = (NumElts == 8) ? MVT::v8i64 : MVT::v16i32;
16005 Constant *C = ConstantInt::get(*DAG.getContext(),
16006 APInt::getAllOnesValue(ExtVT.getScalarType().getSizeInBits()));
16008 SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
16009 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
16010 SDValue Ld = DAG.getLoad(ExtVT.getScalarType(), dl, DAG.getEntryNode(), CP,
16011 MachinePointerInfo::getConstantPool(),
16012 false, false, false, Alignment);
16013 SDValue Brcst = DAG.getNode(X86ISD::VBROADCASTM, dl, ExtVT, In, Ld);
16014 if (VT.is512BitVector())
16016 return DAG.getNode(X86ISD::VTRUNC, dl, VT, Brcst);
16019 static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
16020 SelectionDAG &DAG) {
16021 MVT VT = Op->getSimpleValueType(0);
16022 SDValue In = Op->getOperand(0);
16023 MVT InVT = In.getSimpleValueType();
16026 if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
16027 return LowerSIGN_EXTEND_AVX512(Op, Subtarget, DAG);
16029 if ((VT != MVT::v4i64 || InVT != MVT::v4i32) &&
16030 (VT != MVT::v8i32 || InVT != MVT::v8i16) &&
16031 (VT != MVT::v16i16 || InVT != MVT::v16i8))
16034 if (Subtarget->hasInt256())
16035 return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
16037   // Optimize vectors in AVX mode:
16038   // sign extend v8i16 to v8i32 and v4i32 to v4i64.
16041   // Divide the input vector into two halves
16042   // (for v4i32 the shuffle masks will be { 0, 1, -1, -1 } and { 2, 3, -1, -1 }),
16043   // use a vpmovsx instruction to extend v4i32 -> v2i64 and v8i16 -> v4i32,
16044   // then concatenate the halves back into the original VT.
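  // Worked example (a sketch added for clarity): on AVX1, v8i16 -> v8i32 is
  // split with the two shuffles below into vectors holding its low and high
  // four elements, each of which is extended to v4i32 with vpmovsxwd, and the
  // two results are concatenated into a v8i32.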
16046 unsigned NumElems = InVT.getVectorNumElements();
16047 SDValue Undef = DAG.getUNDEF(InVT);
16049 SmallVector<int,8> ShufMask1(NumElems, -1);
16050 for (unsigned i = 0; i != NumElems/2; ++i)
16053 SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask1[0]);
16055 SmallVector<int,8> ShufMask2(NumElems, -1);
16056 for (unsigned i = 0; i != NumElems/2; ++i)
16057 ShufMask2[i] = i + NumElems/2;
16059 SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask2[0]);
16061 MVT HalfVT = MVT::getVectorVT(VT.getScalarType(),
16062 VT.getVectorNumElements()/2);
16064 OpLo = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpLo);
16065 OpHi = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpHi);
16067 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
16070 // Lower vector extended loads using a shuffle. If SSSE3 is not available we
16071 // may emit an illegal shuffle but the expansion is still better than scalar
16072 // code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise
16073 // we'll emit a shuffle and an arithmetic shift.
16074 // TODO: It is possible to support ZExt by zeroing the undef values during
16075 // the shuffle phase or after the shuffle.
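// Illustrative example (added; assumes SSE2 plus a legal v4i32 result): a
// sextload from <4 x i8> to <4 x i32> becomes a single scalar i32 load, a
// shuffle that places each byte in the top byte of its 32-bit lane, and an
// arithmetic shift right by 24 to complete the sign extension.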
16076 static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget,
16077 SelectionDAG &DAG) {
16078 MVT RegVT = Op.getSimpleValueType();
16079 assert(RegVT.isVector() && "We only custom lower vector sext loads.");
16080 assert(RegVT.isInteger() &&
16081 "We only custom lower integer vector sext loads.");
16083 // Nothing useful we can do without SSE2 shuffles.
16084 assert(Subtarget->hasSSE2() && "We only custom lower sext loads with SSE2.");
16086 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
16088 EVT MemVT = Ld->getMemoryVT();
16089 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16090 unsigned RegSz = RegVT.getSizeInBits();
16092 ISD::LoadExtType Ext = Ld->getExtensionType();
16094 assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)
16095 && "Only anyext and sext are currently implemented.");
16096 assert(MemVT != RegVT && "Cannot extend to the same type");
16097 assert(MemVT.isVector() && "Must load a vector from memory");
16099 unsigned NumElems = RegVT.getVectorNumElements();
16100 unsigned MemSz = MemVT.getSizeInBits();
16101 assert(RegSz > MemSz && "Register size must be greater than the mem size");
16103 if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget->hasInt256()) {
16104 // The only way in which we have a legal 256-bit vector result but not the
16105 // integer 256-bit operations needed to directly lower a sextload is if we
16106 // have AVX1 but not AVX2. In that case, we can always emit a sextload to
16107 // a 128-bit vector and a normal sign_extend to 256-bits that should get
16108 // correctly legalized. We do this late to allow the canonical form of
16109 // sextload to persist throughout the rest of the DAG combiner -- it wants
16110 // to fold together any extensions it can, and so will fuse a sign_extend
16111 // of an sextload into a sextload targeting a wider value.
16113 if (MemSz == 128) {
16114 // Just switch this to a normal load.
16115 assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, "
16116 "it must be a legal 128-bit vector "
16118 Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(),
16119 Ld->getPointerInfo(), Ld->isVolatile(), Ld->isNonTemporal(),
16120 Ld->isInvariant(), Ld->getAlignment());
16122 assert(MemSz < 128 &&
16123 "Can't extend a type wider than 128 bits to a 256 bit vector!");
16124 // Do an sext load to a 128-bit vector type. We want to use the same
16125 // number of elements, but elements half as wide. This will end up being
16126 // recursively lowered by this routine, but will succeed as we definitely
16127 // have all the necessary features if we're using AVX1.
16129 EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2);
16130 EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems);
16132 DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(),
16133 Ld->getPointerInfo(), MemVT, Ld->isVolatile(),
16134 Ld->isNonTemporal(), Ld->isInvariant(),
16135 Ld->getAlignment());
16138 // Replace chain users with the new chain.
16139 assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
16140 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
16142 // Finally, do a normal sign-extend to the desired register.
16143 return DAG.getSExtOrTrunc(Load, dl, RegVT);
16146 // All sizes must be a power of two.
16147 assert(isPowerOf2_32(RegSz * MemSz * NumElems) &&
16148 "Non-power-of-two elements are not custom lowered!");
16150 // Attempt to load the original value using scalar loads.
16151 // Find the largest scalar type that divides the total loaded size.
16152 MVT SclrLoadTy = MVT::i8;
16153 for (MVT Tp : MVT::integer_valuetypes()) {
16154 if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
16159   // On 32-bit systems we can't use 64-bit integer loads directly; try bitcasting to f64.
16160 if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
16162 SclrLoadTy = MVT::f64;
16164 // Calculate the number of scalar loads that we need to perform
16165 // in order to load our vector from memory.
16166 unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
16168 assert((Ext != ISD::SEXTLOAD || NumLoads == 1) &&
16169 "Can only lower sext loads with a single scalar load!");
16171 unsigned loadRegZize = RegSz;
16172 if (Ext == ISD::SEXTLOAD && RegSz == 256)
16175 // Represent our vector as a sequence of elements which are the
16176 // largest scalar that we can load.
16177 EVT LoadUnitVecVT = EVT::getVectorVT(
16178 *DAG.getContext(), SclrLoadTy, loadRegZize / SclrLoadTy.getSizeInBits());
16180 // Represent the data using the same element type that is stored in
16181   // memory. In practice, we "widen" MemVT.
16183 EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
16184 loadRegZize / MemVT.getScalarType().getSizeInBits());
16186 assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
16187 "Invalid vector type");
16189 // We can't shuffle using an illegal type.
16190 assert(TLI.isTypeLegal(WideVecVT) &&
16191 "We only lower types that form legal widened vector types");
16193 SmallVector<SDValue, 8> Chains;
16194 SDValue Ptr = Ld->getBasePtr();
16195 SDValue Increment =
16196 DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, TLI.getPointerTy());
16197 SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
16199 for (unsigned i = 0; i < NumLoads; ++i) {
16200 // Perform a single load.
16201 SDValue ScalarLoad =
16202 DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
16203 Ld->isVolatile(), Ld->isNonTemporal(), Ld->isInvariant(),
16204 Ld->getAlignment());
16205 Chains.push_back(ScalarLoad.getValue(1));
16206 // Create the first element type using SCALAR_TO_VECTOR in order to avoid
16207 // another round of DAGCombining.
16209 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
16211 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
16212 ScalarLoad, DAG.getIntPtrConstant(i));
16214 Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
16217 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
16219 // Bitcast the loaded value to a vector of the original element type, in
16220 // the size of the target vector type.
16221 SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Res);
16222 unsigned SizeRatio = RegSz / MemSz;
16224 if (Ext == ISD::SEXTLOAD) {
16225 // If we have SSE4.1, we can directly emit a VSEXT node.
16226 if (Subtarget->hasSSE41()) {
16227 SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec);
16228 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
16232     // Otherwise we'll shuffle the small elements into the high bits of the
16233 // larger type and perform an arithmetic shift. If the shift is not legal
16234 // it's better to scalarize.
16235 assert(TLI.isOperationLegalOrCustom(ISD::SRA, RegVT) &&
16236 "We can't implement a sext load without an arithmetic right shift!");
16238 // Redistribute the loaded elements into the different locations.
16239 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
16240 for (unsigned i = 0; i != NumElems; ++i)
16241 ShuffleVec[i * SizeRatio + SizeRatio - 1] = i;
16243 SDValue Shuff = DAG.getVectorShuffle(
16244 WideVecVT, dl, SlicedVec, DAG.getUNDEF(WideVecVT), &ShuffleVec[0]);
16246 Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
16248 // Build the arithmetic shift.
16249 unsigned Amt = RegVT.getVectorElementType().getSizeInBits() -
16250 MemVT.getVectorElementType().getSizeInBits();
16252 DAG.getNode(ISD::SRA, dl, RegVT, Shuff, DAG.getConstant(Amt, RegVT));
16254 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
16258 // Redistribute the loaded elements into the different locations.
16259 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
16260 for (unsigned i = 0; i != NumElems; ++i)
16261 ShuffleVec[i * SizeRatio] = i;
16263 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
16264 DAG.getUNDEF(WideVecVT), &ShuffleVec[0]);
16266 // Bitcast to the requested type.
16267 Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
16268 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
16272 // isAndOrOfSetCCs - Return true if node is an ISD::AND or
16273 // ISD::OR of two X86ISD::SETCC nodes each of which has no other use apart
16274 // from the AND / OR.
16275 static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
16276 Opc = Op.getOpcode();
16277 if (Opc != ISD::OR && Opc != ISD::AND)
16279 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
16280 Op.getOperand(0).hasOneUse() &&
16281 Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
16282 Op.getOperand(1).hasOneUse());
16285 // isXor1OfSetCC - Return true if node is an ISD::XOR of an X86ISD::SETCC and
16286 // 1, and that the SETCC node has a single use.
16287 static bool isXor1OfSetCC(SDValue Op) {
16288 if (Op.getOpcode() != ISD::XOR)
16290 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
16291 if (N1C && N1C->getAPIntValue() == 1) {
16292 return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
16293 Op.getOperand(0).hasOneUse();
16298 SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
16299 bool addTest = true;
16300 SDValue Chain = Op.getOperand(0);
16301 SDValue Cond = Op.getOperand(1);
16302 SDValue Dest = Op.getOperand(2);
16305 bool Inverted = false;
16307 if (Cond.getOpcode() == ISD::SETCC) {
16308 // Check for setcc([su]{add,sub,mul}o == 0).
16309 if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
16310 isa<ConstantSDNode>(Cond.getOperand(1)) &&
16311 cast<ConstantSDNode>(Cond.getOperand(1))->isNullValue() &&
16312 Cond.getOperand(0).getResNo() == 1 &&
16313 (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
16314 Cond.getOperand(0).getOpcode() == ISD::UADDO ||
16315 Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
16316 Cond.getOperand(0).getOpcode() == ISD::USUBO ||
16317 Cond.getOperand(0).getOpcode() == ISD::SMULO ||
16318 Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
16320 Cond = Cond.getOperand(0);
16322 SDValue NewCond = LowerSETCC(Cond, DAG);
16323 if (NewCond.getNode())
16328 // FIXME: LowerXALUO doesn't handle these!!
16329 else if (Cond.getOpcode() == X86ISD::ADD ||
16330 Cond.getOpcode() == X86ISD::SUB ||
16331 Cond.getOpcode() == X86ISD::SMUL ||
16332 Cond.getOpcode() == X86ISD::UMUL)
16333 Cond = LowerXALUO(Cond, DAG);
16336   // Look past (and (setcc_carry (cmp ...)), 1).
16337 if (Cond.getOpcode() == ISD::AND &&
16338 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
16339 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
16340 if (C && C->getAPIntValue() == 1)
16341 Cond = Cond.getOperand(0);
16344   // If the condition flag is set by an X86ISD::CMP, then use it as the condition
16345 // setting operand in place of the X86ISD::SETCC.
16346 unsigned CondOpcode = Cond.getOpcode();
16347 if (CondOpcode == X86ISD::SETCC ||
16348 CondOpcode == X86ISD::SETCC_CARRY) {
16349 CC = Cond.getOperand(0);
16351 SDValue Cmp = Cond.getOperand(1);
16352 unsigned Opc = Cmp.getOpcode();
16353 // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
16354 if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
16358 switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
16362 // These can only come from an arithmetic instruction with overflow,
16363 // e.g. SADDO, UADDO.
16364 Cond = Cond.getNode()->getOperand(1);
16370 CondOpcode = Cond.getOpcode();
16371 if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
16372 CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
16373 ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
16374 Cond.getOperand(0).getValueType() != MVT::i8)) {
16375 SDValue LHS = Cond.getOperand(0);
16376 SDValue RHS = Cond.getOperand(1);
16377 unsigned X86Opcode;
16380 // Keep this in sync with LowerXALUO, otherwise we might create redundant
16381     // instructions that can't be removed afterwards (i.e. X86ISD::ADD and X86ISD::INC).
16383 switch (CondOpcode) {
16384 case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
16386 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
16388 X86Opcode = X86ISD::INC; X86Cond = X86::COND_O;
16391 X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
16392 case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
16394 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
16396 X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O;
16399 X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
16400 case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
16401 case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
16402 default: llvm_unreachable("unexpected overflowing operator");
16405 X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond);
16406 if (CondOpcode == ISD::UMULO)
16407 VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
16410 VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
16412 SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS);
16414 if (CondOpcode == ISD::UMULO)
16415 Cond = X86Op.getValue(2);
16417 Cond = X86Op.getValue(1);
16419 CC = DAG.getConstant(X86Cond, MVT::i8);
16423 if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
16424 SDValue Cmp = Cond.getOperand(0).getOperand(1);
16425 if (CondOpc == ISD::OR) {
16426 // Also, recognize the pattern generated by an FCMP_UNE. We can emit
16427       // two branches instead of an explicit OR instruction with a separate test.
16429 if (Cmp == Cond.getOperand(1).getOperand(1) &&
16430 isX86LogicalCmp(Cmp)) {
16431 CC = Cond.getOperand(0).getOperand(0);
16432 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16433 Chain, Dest, CC, Cmp);
16434 CC = Cond.getOperand(1).getOperand(0);
16438 } else { // ISD::AND
16439 // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
16440 // two branches instead of an explicit AND instruction with a
16441 // separate test. However, we only do this if this block doesn't
16442 // have a fall-through edge, because this requires an explicit
16443 // jmp when the condition is false.
16444 if (Cmp == Cond.getOperand(1).getOperand(1) &&
16445 isX86LogicalCmp(Cmp) &&
16446 Op.getNode()->hasOneUse()) {
16447 X86::CondCode CCode =
16448 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
16449 CCode = X86::GetOppositeBranchCondition(CCode);
16450 CC = DAG.getConstant(CCode, MVT::i8);
16451 SDNode *User = *Op.getNode()->use_begin();
16452 // Look for an unconditional branch following this conditional branch.
16453 // We need this because we need to reverse the successors in order
16454 // to implement FCMP_OEQ.
16455 if (User->getOpcode() == ISD::BR) {
16456 SDValue FalseBB = User->getOperand(1);
16458 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
16459 assert(NewBR == User);
16463 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16464 Chain, Dest, CC, Cmp);
16465 X86::CondCode CCode =
16466 (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
16467 CCode = X86::GetOppositeBranchCondition(CCode);
16468 CC = DAG.getConstant(CCode, MVT::i8);
16474 } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
16475     // Recognize the xorb (setcc), 1 pattern. The xor inverts the condition.
16476     // It should be transformed by the DAG combiner, except when the condition
16477     // is set by an arithmetic-with-overflow node.
16478 X86::CondCode CCode =
16479 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
16480 CCode = X86::GetOppositeBranchCondition(CCode);
16481 CC = DAG.getConstant(CCode, MVT::i8);
16482 Cond = Cond.getOperand(0).getOperand(1);
16484 } else if (Cond.getOpcode() == ISD::SETCC &&
16485 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
16486 // For FCMP_OEQ, we can emit
16487 // two branches instead of an explicit AND instruction with a
16488 // separate test. However, we only do this if this block doesn't
16489 // have a fall-through edge, because this requires an explicit
16490 // jmp when the condition is false.
16491 if (Op.getNode()->hasOneUse()) {
16492 SDNode *User = *Op.getNode()->use_begin();
16493 // Look for an unconditional branch following this conditional branch.
16494 // We need this because we need to reverse the successors in order
16495 // to implement FCMP_OEQ.
16496 if (User->getOpcode() == ISD::BR) {
16497 SDValue FalseBB = User->getOperand(1);
16499 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
16500 assert(NewBR == User);
16504 SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
16505 Cond.getOperand(0), Cond.getOperand(1));
16506 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
16507 CC = DAG.getConstant(X86::COND_NE, MVT::i8);
16508 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16509 Chain, Dest, CC, Cmp);
16510 CC = DAG.getConstant(X86::COND_P, MVT::i8);
16515 } else if (Cond.getOpcode() == ISD::SETCC &&
16516 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
16517 // For FCMP_UNE, we can emit
16518 // two branches instead of an explicit AND instruction with a
16519 // separate test. However, we only do this if this block doesn't
16520 // have a fall-through edge, because this requires an explicit
16521 // jmp when the condition is false.
16522 if (Op.getNode()->hasOneUse()) {
16523 SDNode *User = *Op.getNode()->use_begin();
16524 // Look for an unconditional branch following this conditional branch.
16525 // We need this because we need to reverse the successors in order
16526 // to implement FCMP_UNE.
16527 if (User->getOpcode() == ISD::BR) {
16528 SDValue FalseBB = User->getOperand(1);
16530 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
16531 assert(NewBR == User);
16534 SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
16535 Cond.getOperand(0), Cond.getOperand(1));
16536 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
16537 CC = DAG.getConstant(X86::COND_NE, MVT::i8);
16538 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16539 Chain, Dest, CC, Cmp);
16540 CC = DAG.getConstant(X86::COND_NP, MVT::i8);
16550   // Look past the truncate if the high bits are known zero.
16551 if (isTruncWithZeroHighBitsInput(Cond, DAG))
16552 Cond = Cond.getOperand(0);
16554   // We know the result of AND is compared against zero. Try to match it to BT.
16556 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
16557 SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG);
16558 if (NewSetCC.getNode()) {
16559 CC = NewSetCC.getOperand(0);
16560 Cond = NewSetCC.getOperand(1);
16567 X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
16568 CC = DAG.getConstant(X86Cond, MVT::i8);
16569 Cond = EmitTest(Cond, X86Cond, dl, DAG);
16571 Cond = ConvertCmpIfNecessary(Cond, DAG);
16572 return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16573 Chain, Dest, CC, Cond);
16576 // Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
16577 // Calls to _alloca are needed to probe the stack when allocating more than 4K
16578 // bytes in one go. Touching the stack at 4K increments is necessary to ensure
16579 // that the guard pages used by the OS virtual memory manager are allocated in
16580 // correct sequence.
16582 X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
16583 SelectionDAG &DAG) const {
16584 MachineFunction &MF = DAG.getMachineFunction();
16585 bool SplitStack = MF.shouldSplitStack();
16586 bool Lower = (Subtarget->isOSWindows() && !Subtarget->isTargetMachO()) ||
16591 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16592 SDNode* Node = Op.getNode();
16594 unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
16595 assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
16596 " not tell us which reg is the stack pointer!");
16597 EVT VT = Node->getValueType(0);
16598 SDValue Tmp1 = SDValue(Node, 0);
16599 SDValue Tmp2 = SDValue(Node, 1);
16600 SDValue Tmp3 = Node->getOperand(2);
16601 SDValue Chain = Tmp1.getOperand(0);
16603 // Chain the dynamic stack allocation so that it doesn't modify the stack
16604 // pointer when other instructions are using the stack.
16605 Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, true),
16608 SDValue Size = Tmp2.getOperand(1);
16609 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
16610 Chain = SP.getValue(1);
16611 unsigned Align = cast<ConstantSDNode>(Tmp3)->getZExtValue();
16612 const TargetFrameLowering &TFI = *DAG.getSubtarget().getFrameLowering();
16613 unsigned StackAlign = TFI.getStackAlignment();
16614 Tmp1 = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
16615 if (Align > StackAlign)
16616 Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1,
16617 DAG.getConstant(-(uint64_t)Align, VT));
16618 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain
16620 Tmp2 = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, true),
16621 DAG.getIntPtrConstant(0, true), SDValue(),
16624 SDValue Ops[2] = { Tmp1, Tmp2 };
16625 return DAG.getMergeValues(Ops, dl);
16629 SDValue Chain = Op.getOperand(0);
16630 SDValue Size = Op.getOperand(1);
16631 unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
16632 EVT VT = Op.getNode()->getValueType(0);
16634 bool Is64Bit = Subtarget->is64Bit();
16635 EVT SPTy = getPointerTy();
16638 MachineRegisterInfo &MRI = MF.getRegInfo();
16641 // The 64 bit implementation of segmented stacks needs to clobber both r10
16642 // r11. This makes it impossible to use it along with nested parameters.
16643 const Function *F = MF.getFunction();
16645 for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
16647 if (I->hasNestAttr())
16648 report_fatal_error("Cannot use segmented stacks with functions that "
16649 "have nested arguments.");
16652 const TargetRegisterClass *AddrRegClass =
16653 getRegClassFor(getPointerTy());
16654 unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
16655 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
16656 SDValue Value = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
16657 DAG.getRegister(Vreg, SPTy));
16658 SDValue Ops1[2] = { Value, Chain };
16659 return DAG.getMergeValues(Ops1, dl);
16662 const unsigned Reg = (Subtarget->isTarget64BitLP64() ? X86::RAX : X86::EAX);
16664 Chain = DAG.getCopyToReg(Chain, dl, Reg, Size, Flag);
16665 Flag = Chain.getValue(1);
16666 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
16668 Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag);
16670 const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
16671 DAG.getSubtarget().getRegisterInfo());
16672 unsigned SPReg = RegInfo->getStackRegister();
16673 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
16674 Chain = SP.getValue(1);
16677 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
16678 DAG.getConstant(-(uint64_t)Align, VT));
16679 Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
16682 SDValue Ops1[2] = { SP, Chain };
16683 return DAG.getMergeValues(Ops1, dl);
16687 SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
16688 MachineFunction &MF = DAG.getMachineFunction();
16689 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
16691 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
16694 if (!Subtarget->is64Bit() || Subtarget->isTargetWin64()) {
16695 // vastart just stores the address of the VarArgsFrameIndex slot into the
16696 // memory location argument.
16697 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
16699 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
16700 MachinePointerInfo(SV), false, false, 0);
16704 // gp_offset (0 - 6 * 8)
16705 // fp_offset (48 - 48 + 8 * 16)
16706 // overflow_arg_area (point to parameters coming in memory).
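  // For reference (added; this matches the standard SysV x86-64 va_list layout
  // that the offsets above describe):
  //   struct __va_list_tag {
  //     unsigned int gp_offset;   // 0 .. 48, advanced in 8-byte steps
  //     unsigned int fp_offset;   // 48 .. 48 + 8 * 16, advanced in 16-byte steps
  //     void *overflow_arg_area;  // arguments passed on the stack
  //     void *reg_save_area;      // spilled register arguments
  //   };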
16708 SmallVector<SDValue, 8> MemOps;
16709 SDValue FIN = Op.getOperand(1);
16711 SDValue Store = DAG.getStore(Op.getOperand(0), DL,
16712 DAG.getConstant(FuncInfo->getVarArgsGPOffset(),
16714 FIN, MachinePointerInfo(SV), false, false, 0);
16715 MemOps.push_back(Store);
16718 FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
16719 FIN, DAG.getIntPtrConstant(4));
16720 Store = DAG.getStore(Op.getOperand(0), DL,
16721 DAG.getConstant(FuncInfo->getVarArgsFPOffset(),
16723 FIN, MachinePointerInfo(SV, 4), false, false, 0);
16724 MemOps.push_back(Store);
16726 // Store ptr to overflow_arg_area
16727 FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
16728 FIN, DAG.getIntPtrConstant(4));
16729 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
16731 Store = DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN,
16732 MachinePointerInfo(SV, 8),
16734 MemOps.push_back(Store);
16736 // Store ptr to reg_save_area.
16737 FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
16738 FIN, DAG.getIntPtrConstant(8));
16739 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
16741 Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN,
16742 MachinePointerInfo(SV, 16), false, false, 0);
16743 MemOps.push_back(Store);
16744 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
16747 SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
16748 assert(Subtarget->is64Bit() &&
16749 "LowerVAARG only handles 64-bit va_arg!");
16750 assert((Subtarget->isTargetLinux() ||
16751 Subtarget->isTargetDarwin()) &&
16752 "Unhandled target in LowerVAARG");
16753 assert(Op.getNode()->getNumOperands() == 4);
16754 SDValue Chain = Op.getOperand(0);
16755 SDValue SrcPtr = Op.getOperand(1);
16756 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
16757 unsigned Align = Op.getConstantOperandVal(3);
16760 EVT ArgVT = Op.getNode()->getValueType(0);
16761 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
16762 uint32_t ArgSize = getDataLayout()->getTypeAllocSize(ArgTy);
16765 // Decide which area this value should be read from.
16766 // TODO: Implement the AMD64 ABI in its entirety. This simple
16767 // selection mechanism works only for the basic types.
16768 if (ArgVT == MVT::f80) {
16769 llvm_unreachable("va_arg for f80 not yet implemented");
16770 } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
16771 ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
16772 } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
16773 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
16775 llvm_unreachable("Unhandled argument type in LowerVAARG");
16778 if (ArgMode == 2) {
16779 // Sanity Check: Make sure using fp_offset makes sense.
16780 assert(!DAG.getTarget().Options.UseSoftFloat &&
16781 !(DAG.getMachineFunction()
16782 .getFunction()->getAttributes()
16783 .hasAttribute(AttributeSet::FunctionIndex,
16784 Attribute::NoImplicitFloat)) &&
16785 Subtarget->hasSSE1());
16788   // Insert a VAARG_64 node into the DAG.
16789   // VAARG_64 returns two values: the variable argument address and the chain.
16790 SmallVector<SDValue, 11> InstOps;
16791 InstOps.push_back(Chain);
16792 InstOps.push_back(SrcPtr);
16793 InstOps.push_back(DAG.getConstant(ArgSize, MVT::i32));
16794 InstOps.push_back(DAG.getConstant(ArgMode, MVT::i8));
16795 InstOps.push_back(DAG.getConstant(Align, MVT::i32));
16796 SDVTList VTs = DAG.getVTList(getPointerTy(), MVT::Other);
16797 SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl,
16798 VTs, InstOps, MVT::i64,
16799 MachinePointerInfo(SV),
16801 /*Volatile=*/false,
16803 /*WriteMem=*/true);
16804 Chain = VAARG.getValue(1);
16806 // Load the next argument and return it
16807 return DAG.getLoad(ArgVT, dl,
16810 MachinePointerInfo(),
16811 false, false, false, 0);
16814 static SDValue LowerVACOPY(SDValue Op, const X86Subtarget *Subtarget,
16815 SelectionDAG &DAG) {
16816 // X86-64 va_list is a struct { i32, i32, i8*, i8* }.
16817 assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!");
16818 SDValue Chain = Op.getOperand(0);
16819 SDValue DstPtr = Op.getOperand(1);
16820 SDValue SrcPtr = Op.getOperand(2);
16821 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
16822 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
16825 return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
16826 DAG.getIntPtrConstant(24), 8, /*isVolatile*/false,
16828 MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
16831 // getTargetVShiftByConstNode - Handle vector element shifts where the shift
16832 // amount is a constant. Takes immediate version of shift as input.
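// For example (illustrative): a VSHLI of the constant vector <1, 2, 3, 4> by
// ShiftAmt = 2 is folded below into the build_vector <4, 8, 12, 16> instead of
// emitting a shift instruction.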
16833 static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, MVT VT,
16834 SDValue SrcOp, uint64_t ShiftAmt,
16835 SelectionDAG &DAG) {
16836 MVT ElementType = VT.getVectorElementType();
16838 // Fold this packed shift into its first operand if ShiftAmt is 0.
16842 // Check for ShiftAmt >= element width
16843 if (ShiftAmt >= ElementType.getSizeInBits()) {
16844 if (Opc == X86ISD::VSRAI)
16845 ShiftAmt = ElementType.getSizeInBits() - 1;
16847 return DAG.getConstant(0, VT);
16850 assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
16851 && "Unknown target vector shift-by-constant node");
16853 // Fold this packed vector shift into a build vector if SrcOp is a
16854 // vector of Constants or UNDEFs, and SrcOp valuetype is the same as VT.
16855 if (VT == SrcOp.getSimpleValueType() &&
16856 ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
16857 SmallVector<SDValue, 8> Elts;
16858 unsigned NumElts = SrcOp->getNumOperands();
16859 ConstantSDNode *ND;
16862 default: llvm_unreachable(nullptr);
16863 case X86ISD::VSHLI:
16864 for (unsigned i=0; i!=NumElts; ++i) {
16865 SDValue CurrentOp = SrcOp->getOperand(i);
16866 if (CurrentOp->getOpcode() == ISD::UNDEF) {
16867 Elts.push_back(CurrentOp);
16870 ND = cast<ConstantSDNode>(CurrentOp);
16871 const APInt &C = ND->getAPIntValue();
16872 Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), ElementType));
16875 case X86ISD::VSRLI:
16876 for (unsigned i=0; i!=NumElts; ++i) {
16877 SDValue CurrentOp = SrcOp->getOperand(i);
16878 if (CurrentOp->getOpcode() == ISD::UNDEF) {
16879 Elts.push_back(CurrentOp);
16882 ND = cast<ConstantSDNode>(CurrentOp);
16883 const APInt &C = ND->getAPIntValue();
16884 Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), ElementType));
16887 case X86ISD::VSRAI:
16888 for (unsigned i=0; i!=NumElts; ++i) {
16889 SDValue CurrentOp = SrcOp->getOperand(i);
16890 if (CurrentOp->getOpcode() == ISD::UNDEF) {
16891 Elts.push_back(CurrentOp);
16894 ND = cast<ConstantSDNode>(CurrentOp);
16895 const APInt &C = ND->getAPIntValue();
16896 Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), ElementType));
16901 return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Elts);
16904 return DAG.getNode(Opc, dl, VT, SrcOp, DAG.getConstant(ShiftAmt, MVT::i8));
16907 // getTargetVShiftNode - Handle vector element shifts where the shift amount
16908 // may or may not be a constant. Takes immediate version of shift as input.
16909 static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT,
16910 SDValue SrcOp, SDValue ShAmt,
16911 SelectionDAG &DAG) {
16912 MVT SVT = ShAmt.getSimpleValueType();
16913 assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");
16915 // Catch shift-by-constant.
16916 if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
16917 return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
16918 CShAmt->getZExtValue(), DAG);
16920 // Change opcode to non-immediate version
16922 default: llvm_unreachable("Unknown target vector shift node");
16923 case X86ISD::VSHLI: Opc = X86ISD::VSHL; break;
16924 case X86ISD::VSRLI: Opc = X86ISD::VSRL; break;
16925 case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;
16928 const X86Subtarget &Subtarget =
16929 DAG.getTarget().getSubtarget<X86Subtarget>();
16930 if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
16931 ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) {
16932 // Let the shuffle legalizer expand this shift amount node.
16933 SDValue Op0 = ShAmt.getOperand(0);
16934 Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(Op0), MVT::v8i16, Op0);
16935 ShAmt = getShuffleVectorZeroOrUndef(Op0, 0, true, &Subtarget, DAG);
16937   // Need to build a vector containing the shift amount.
16938   // SSE/AVX packed shifts only use the lower 64 bits of the shift count.
16939 SmallVector<SDValue, 4> ShOps;
16940 ShOps.push_back(ShAmt);
16941 if (SVT == MVT::i32) {
16942 ShOps.push_back(DAG.getConstant(0, SVT));
16943 ShOps.push_back(DAG.getUNDEF(SVT));
16945 ShOps.push_back(DAG.getUNDEF(SVT));
16947 MVT BVT = SVT == MVT::i32 ? MVT::v4i32 : MVT::v2i64;
16948 ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, BVT, ShOps);
16951 // The return type has to be a 128-bit type with the same element
16952 // type as the input type.
16953 MVT EltVT = VT.getVectorElementType();
16954 EVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());
16956 ShAmt = DAG.getNode(ISD::BITCAST, dl, ShVT, ShAmt);
16957 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
16960 /// \brief Return (and \p Op, \p Mask) for compare instructions or
16961 /// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
16962 /// necessary casting for \p Mask when lowering masking intrinsics.
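/// Example (added sketch): a masked arithmetic intrinsic becomes
///   (vselect Mask, (op A, B), PreservedSrc)
/// while a compare-into-mask intrinsic becomes
///   (and (pcmpeqm A, B), Mask)
/// since the compare already produces a vector of i1 results.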
16963 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
16964 SDValue PreservedSrc,
16965 const X86Subtarget *Subtarget,
16966 SelectionDAG &DAG) {
16967 EVT VT = Op.getValueType();
16968 EVT MaskVT = EVT::getVectorVT(*DAG.getContext(),
16969 MVT::i1, VT.getVectorNumElements());
16970 EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
16971 Mask.getValueType().getSizeInBits());
16974 assert(MaskVT.isSimple() && "invalid mask type");
16976 if (isAllOnes(Mask))
16979     // When MaskVT is v2i1 or v4i1, the low 2 or 4 elements
16980     // are extracted by EXTRACT_SUBVECTOR.
16981 SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
16982 DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
16983 DAG.getIntPtrConstant(0));
16985 switch (Op.getOpcode()) {
16987 case X86ISD::PCMPEQM:
16988 case X86ISD::PCMPGTM:
16990 case X86ISD::CMPMU:
16991 return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
16993 if (PreservedSrc.getOpcode() == ISD::UNDEF)
16994 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
16995 return DAG.getNode(ISD::VSELECT, dl, VT, VMask, Op, PreservedSrc);
16998 /// \brief Creates an SDNode for a predicated scalar operation.
16999 /// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
17000 /// The mask comes in as MVT::i8 and should be truncated
17001 /// to MVT::i1 while lowering masking intrinsics.
17002 /// The main difference between ScalarMaskingNode and VectorMaskingNode is that
17003 /// the former uses "X86select" instead of "vselect": we simply can't create a
17004 /// "vselect" node for a scalar instruction.
17005 static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
17006 SDValue PreservedSrc,
17007 const X86Subtarget *Subtarget,
17008 SelectionDAG &DAG) {
17009 if (isAllOnes(Mask))
17012 EVT VT = Op.getValueType();
17014 // The mask should be of type MVT::i1
17015 SDValue IMask = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Mask);
17017 if (PreservedSrc.getOpcode() == ISD::UNDEF)
17018 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
17019 return DAG.getNode(X86ISD::SELECT, dl, VT, IMask, Op, PreservedSrc);
17022 static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
17023 SelectionDAG &DAG) {
17025 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
17026 EVT VT = Op.getValueType();
17027 const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
17029 switch(IntrData->Type) {
17030 case INTR_TYPE_1OP:
17031 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1));
17032 case INTR_TYPE_2OP:
17033 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
17035 case INTR_TYPE_3OP:
17036 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
17037 Op.getOperand(2), Op.getOperand(3));
17038 case INTR_TYPE_1OP_MASK_RM: {
17039 SDValue Src = Op.getOperand(1);
17040 SDValue Src0 = Op.getOperand(2);
17041 SDValue Mask = Op.getOperand(3);
17042 SDValue RoundingMode = Op.getOperand(4);
17043 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
17045 Mask, Src0, Subtarget, DAG);
17047 case INTR_TYPE_SCALAR_MASK_RM: {
17048 SDValue Src1 = Op.getOperand(1);
17049 SDValue Src2 = Op.getOperand(2);
17050 SDValue Src0 = Op.getOperand(3);
17051 SDValue Mask = Op.getOperand(4);
17052 SDValue RoundingMode = Op.getOperand(5);
17053 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
17055 Mask, Src0, Subtarget, DAG);
17057 case INTR_TYPE_2OP_MASK: {
17058 SDValue Mask = Op.getOperand(4);
17059 SDValue PassThru = Op.getOperand(3);
17060 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
17061 if (IntrWithRoundingModeOpcode != 0) {
17062 unsigned Round = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue();
17063 if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) {
17064 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
17065 dl, Op.getValueType(),
17066 Op.getOperand(1), Op.getOperand(2),
17067 Op.getOperand(3), Op.getOperand(5)),
17068 Mask, PassThru, Subtarget, DAG);
17071 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
17074 Mask, PassThru, Subtarget, DAG);
17076 case FMA_OP_MASK: {
17077 SDValue Src1 = Op.getOperand(1);
17078 SDValue Src2 = Op.getOperand(2);
17079 SDValue Src3 = Op.getOperand(3);
17080 SDValue Mask = Op.getOperand(4);
17081 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
17082 if (IntrWithRoundingModeOpcode != 0) {
17083 SDValue Rnd = Op.getOperand(5);
17084 if (cast<ConstantSDNode>(Rnd)->getZExtValue() !=
17085 X86::STATIC_ROUNDING::CUR_DIRECTION)
17086 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
17087 dl, Op.getValueType(),
17088 Src1, Src2, Src3, Rnd),
17089 Mask, Src1, Subtarget, DAG);
17091 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
17092 dl, Op.getValueType(),
17094 Mask, Src1, Subtarget, DAG);
17097 case CMP_MASK_CC: {
17098 // Comparison intrinsics with masks.
17099 // Example of transformation:
17100 // (i8 (int_x86_avx512_mask_pcmpeq_q_128
17101 // (v2i64 %a), (v2i64 %b), (i8 %mask))) ->
17103 // (v8i1 (insert_subvector undef,
17104 // (v2i1 (and (PCMPEQM %a, %b),
17105 // (extract_subvector
17106 // (v8i1 (bitcast %mask)), 0))), 0))))
17107 EVT VT = Op.getOperand(1).getValueType();
17108 EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17109 VT.getVectorNumElements());
17110 SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3);
17111 EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17112 Mask.getValueType().getSizeInBits());
17113 SDValue Cmp;
17114 if (IntrData->Type == CMP_MASK_CC) {
17115 Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
17116 Op.getOperand(2), Op.getOperand(3));
17117 } else {
17118 assert(IntrData->Type == CMP_MASK && "Unexpected intrinsic type!");
17119 Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
17120 Op.getOperand(2));
17121 }
17122 SDValue CmpMask = getVectorMaskingNode(Cmp, Mask,
17123 DAG.getTargetConstant(0, MaskVT),
17124 Subtarget, DAG);
17125 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
17126 DAG.getUNDEF(BitcastVT), CmpMask,
17127 DAG.getIntPtrConstant(0));
17128 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
17129 }
17130 case COMI: { // Comparison intrinsics
17131 ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
17132 SDValue LHS = Op.getOperand(1);
17133 SDValue RHS = Op.getOperand(2);
17134 unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG);
17135 assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!");
17136 SDValue Cond = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
17137 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17138 DAG.getConstant(X86CC, MVT::i8), Cond);
17139 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
17140 }
17141 case VSHIFT:
17142 return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
17143 Op.getOperand(1), Op.getOperand(2), DAG);
17144 case VSHIFT_MASK:
17145 return getVectorMaskingNode(getTargetVShiftNode(IntrData->Opc0, dl,
17146 Op.getSimpleValueType(),
17147 Op.getOperand(1),
17148 Op.getOperand(2), DAG),
17149 Op.getOperand(4), Op.getOperand(3), Subtarget,
17150 DAG);
17151 case COMPRESS_EXPAND_IN_REG: {
17152 SDValue Mask = Op.getOperand(3);
17153 SDValue DataToCompress = Op.getOperand(1);
17154 SDValue PassThru = Op.getOperand(2);
17155 if (isAllOnes(Mask)) // return data as is
17156 return Op.getOperand(1);
17157 EVT VT = Op.getValueType();
17158 EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17159 VT.getVectorNumElements());
17160 EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17161 Mask.getValueType().getSizeInBits());
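// The scalar mask is reinterpreted as a vector of i1 and only the low
// VT.getVectorNumElements() bits are kept, giving one predicate bit per element.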
17163 SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
17164 DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
17165 DAG.getIntPtrConstant(0));
17167 return DAG.getNode(IntrData->Opc0, dl, VT, VMask, DataToCompress,
17168 PassThru);
17169 }
17170 case BLEND: {
17171 SDValue Mask = Op.getOperand(3);
17172 EVT VT = Op.getValueType();
17173 EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17174 VT.getVectorNumElements());
17175 EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17176 Mask.getValueType().getSizeInBits());
17178 SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
17179 DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
17180 DAG.getIntPtrConstant(0));
17181 return DAG.getNode(IntrData->Opc0, dl, VT, VMask, Op.getOperand(1),
17182 Op.getOperand(2));
17183 }
17184 default:
17185 break;
17186 }
17187 }
17189 switch (IntNo) {
17190 default: return SDValue(); // Don't custom lower most intrinsics.
17192 case Intrinsic::x86_avx512_mask_valign_q_512:
17193 case Intrinsic::x86_avx512_mask_valign_d_512:
17194 // Vector source operands are swapped.
17195 return getVectorMaskingNode(DAG.getNode(X86ISD::VALIGN, dl,
17196 Op.getValueType(), Op.getOperand(2),
17197 Op.getOperand(1),
17198 Op.getOperand(3)),
17199 Op.getOperand(5), Op.getOperand(4),
17200 Subtarget, DAG);
17202 // ptest and testp intrinsics. The intrinsic these come from are designed to
17203 // return an integer value, not just an instruction so lower it to the ptest
17204 // or testp pattern and a setcc for the result.
17205 case Intrinsic::x86_sse41_ptestz:
17206 case Intrinsic::x86_sse41_ptestc:
17207 case Intrinsic::x86_sse41_ptestnzc:
17208 case Intrinsic::x86_avx_ptestz_256:
17209 case Intrinsic::x86_avx_ptestc_256:
17210 case Intrinsic::x86_avx_ptestnzc_256:
17211 case Intrinsic::x86_avx_vtestz_ps:
17212 case Intrinsic::x86_avx_vtestc_ps:
17213 case Intrinsic::x86_avx_vtestnzc_ps:
17214 case Intrinsic::x86_avx_vtestz_pd:
17215 case Intrinsic::x86_avx_vtestc_pd:
17216 case Intrinsic::x86_avx_vtestnzc_pd:
17217 case Intrinsic::x86_avx_vtestz_ps_256:
17218 case Intrinsic::x86_avx_vtestc_ps_256:
17219 case Intrinsic::x86_avx_vtestnzc_ps_256:
17220 case Intrinsic::x86_avx_vtestz_pd_256:
17221 case Intrinsic::x86_avx_vtestc_pd_256:
17222 case Intrinsic::x86_avx_vtestnzc_pd_256: {
17223 bool IsTestPacked = false;
17224 unsigned X86CC;
17225 switch (IntNo) {
17226 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
17227 case Intrinsic::x86_avx_vtestz_ps:
17228 case Intrinsic::x86_avx_vtestz_pd:
17229 case Intrinsic::x86_avx_vtestz_ps_256:
17230 case Intrinsic::x86_avx_vtestz_pd_256:
17231 IsTestPacked = true; // Fallthrough
17232 case Intrinsic::x86_sse41_ptestz:
17233 case Intrinsic::x86_avx_ptestz_256:
17234 // ZF = 1
17235 X86CC = X86::COND_E;
17236 break;
17237 case Intrinsic::x86_avx_vtestc_ps:
17238 case Intrinsic::x86_avx_vtestc_pd:
17239 case Intrinsic::x86_avx_vtestc_ps_256:
17240 case Intrinsic::x86_avx_vtestc_pd_256:
17241 IsTestPacked = true; // Fallthrough
17242 case Intrinsic::x86_sse41_ptestc:
17243 case Intrinsic::x86_avx_ptestc_256:
17244 // CF = 1
17245 X86CC = X86::COND_B;
17246 break;
17247 case Intrinsic::x86_avx_vtestnzc_ps:
17248 case Intrinsic::x86_avx_vtestnzc_pd:
17249 case Intrinsic::x86_avx_vtestnzc_ps_256:
17250 case Intrinsic::x86_avx_vtestnzc_pd_256:
17251 IsTestPacked = true; // Fallthrough
17252 case Intrinsic::x86_sse41_ptestnzc:
17253 case Intrinsic::x86_avx_ptestnzc_256:
17254 // ZF and CF = 0
17255 X86CC = X86::COND_A;
17256 break;
17257 }
17259 SDValue LHS = Op.getOperand(1);
17260 SDValue RHS = Op.getOperand(2);
17261 unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST;
17262 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
17263 SDValue CC = DAG.getConstant(X86CC, MVT::i8);
17264 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test);
17265 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
17267 case Intrinsic::x86_avx512_kortestz_w:
17268 case Intrinsic::x86_avx512_kortestc_w: {
17269 unsigned X86CC = (IntNo == Intrinsic::x86_avx512_kortestz_w)? X86::COND_E: X86::COND_B;
17270 SDValue LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, Op.getOperand(1));
17271 SDValue RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, Op.getOperand(2));
17272 SDValue CC = DAG.getConstant(X86CC, MVT::i8);
17273 SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
17274 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i1, CC, Test);
17275 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
17278 case Intrinsic::x86_sse42_pcmpistria128:
17279 case Intrinsic::x86_sse42_pcmpestria128:
17280 case Intrinsic::x86_sse42_pcmpistric128:
17281 case Intrinsic::x86_sse42_pcmpestric128:
17282 case Intrinsic::x86_sse42_pcmpistrio128:
17283 case Intrinsic::x86_sse42_pcmpestrio128:
17284 case Intrinsic::x86_sse42_pcmpistris128:
17285 case Intrinsic::x86_sse42_pcmpestris128:
17286 case Intrinsic::x86_sse42_pcmpistriz128:
17287 case Intrinsic::x86_sse42_pcmpestriz128: {
17288 unsigned Opcode;
17289 unsigned X86CC;
17290 switch (IntNo) {
17291 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
17292 case Intrinsic::x86_sse42_pcmpistria128:
17293 Opcode = X86ISD::PCMPISTRI;
17294 X86CC = X86::COND_A;
17295 break;
17296 case Intrinsic::x86_sse42_pcmpestria128:
17297 Opcode = X86ISD::PCMPESTRI;
17298 X86CC = X86::COND_A;
17299 break;
17300 case Intrinsic::x86_sse42_pcmpistric128:
17301 Opcode = X86ISD::PCMPISTRI;
17302 X86CC = X86::COND_B;
17303 break;
17304 case Intrinsic::x86_sse42_pcmpestric128:
17305 Opcode = X86ISD::PCMPESTRI;
17306 X86CC = X86::COND_B;
17307 break;
17308 case Intrinsic::x86_sse42_pcmpistrio128:
17309 Opcode = X86ISD::PCMPISTRI;
17310 X86CC = X86::COND_O;
17311 break;
17312 case Intrinsic::x86_sse42_pcmpestrio128:
17313 Opcode = X86ISD::PCMPESTRI;
17314 X86CC = X86::COND_O;
17315 break;
17316 case Intrinsic::x86_sse42_pcmpistris128:
17317 Opcode = X86ISD::PCMPISTRI;
17318 X86CC = X86::COND_S;
17319 break;
17320 case Intrinsic::x86_sse42_pcmpestris128:
17321 Opcode = X86ISD::PCMPESTRI;
17322 X86CC = X86::COND_S;
17323 break;
17324 case Intrinsic::x86_sse42_pcmpistriz128:
17325 Opcode = X86ISD::PCMPISTRI;
17326 X86CC = X86::COND_E;
17327 break;
17328 case Intrinsic::x86_sse42_pcmpestriz128:
17329 Opcode = X86ISD::PCMPESTRI;
17330 X86CC = X86::COND_E;
17331 break;
17332 }
17333 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
17334 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
17335 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps);
17336 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17337 DAG.getConstant(X86CC, MVT::i8),
17338 SDValue(PCMP.getNode(), 1));
17339 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
17342 case Intrinsic::x86_sse42_pcmpistri128:
17343 case Intrinsic::x86_sse42_pcmpestri128: {
17345 if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
17346 Opcode = X86ISD::PCMPISTRI;
17348 Opcode = X86ISD::PCMPESTRI;
17350 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
17351 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
17352 return DAG.getNode(Opcode, dl, VTs, NewOps);
17357 static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
17358 SDValue Src, SDValue Mask, SDValue Base,
17359 SDValue Index, SDValue ScaleOp, SDValue Chain,
17360 const X86Subtarget * Subtarget) {
17362 ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
17363 assert(C && "Invalid scale type");
17364 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
17365 EVT MaskVT = MVT::getVectorVT(MVT::i1,
17366 Index.getSimpleValueType().getVectorNumElements());
17367 SDValue MaskInReg;
17368 ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
17369 if (MaskC)
17370 MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT);
17371 else
17372 MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
17373 SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
17374 SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
17375 SDValue Segment = DAG.getRegister(0, MVT::i32);
17376 if (Src.getOpcode() == ISD::UNDEF)
17377 Src = getZeroVector(Op.getValueType(), Subtarget, DAG, dl);
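// Operand order expected by the gather machine node: pass-through source, mask,
// the memory reference (base, scale, index, displacement, segment), then chain.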
17378 SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain};
17379 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
17380 SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
17381 return DAG.getMergeValues(RetOps, dl);
17384 static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
17385 SDValue Src, SDValue Mask, SDValue Base,
17386 SDValue Index, SDValue ScaleOp, SDValue Chain) {
17388 ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
17389 assert(C && "Invalid scale type");
17390 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
17391 SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
17392 SDValue Segment = DAG.getRegister(0, MVT::i32);
17393 EVT MaskVT = MVT::getVectorVT(MVT::i1,
17394 Index.getSimpleValueType().getVectorNumElements());
17395 SDValue MaskInReg;
17396 ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
17397 if (MaskC)
17398 MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT);
17399 else
17400 MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
17401 SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
17402 SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain};
17403 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
17404 return SDValue(Res, 1);
17407 static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
17408 SDValue Mask, SDValue Base, SDValue Index,
17409 SDValue ScaleOp, SDValue Chain) {
17411 ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
17412 assert(C && "Invalid scale type");
17413 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
17414 SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
17415 SDValue Segment = DAG.getRegister(0, MVT::i32);
17416 EVT MaskVT =
17417 MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
17418 SDValue MaskInReg;
17419 ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
17420 if (MaskC)
17421 MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT);
17422 else
17423 MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
17424 //SDVTList VTs = DAG.getVTList(MVT::Other);
17425 SDValue Ops[] = {MaskInReg, Base, Scale, Index, Disp, Segment, Chain};
17426 SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
17427 return SDValue(Res, 0);
17430 // getReadPerformanceCounter - Handles the lowering of builtin intrinsics that
17431 // read performance monitor counters (x86_rdpmc).
17432 static void getReadPerformanceCounter(SDNode *N, SDLoc DL,
17433 SelectionDAG &DAG, const X86Subtarget *Subtarget,
17434 SmallVectorImpl<SDValue> &Results) {
17435 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
17436 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
17437 SDValue LO, HI;
17439 // The ECX register is used to select the index of the performance counter
17440 // to read.
17441 SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX,
17442 N->getOperand(2));
17443 SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain);
17445 // Reads the content of a 64-bit performance counter and returns it in the
17446 // registers EDX:EAX.
17447 if (Subtarget->is64Bit()) {
17448 LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
17449 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
17452 LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
17453 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
17456 Chain = HI.getValue(1);
17458 if (Subtarget->is64Bit()) {
17459 // The EAX register is loaded with the low-order 32 bits. The EDX register
17460 // is loaded with the supported high-order bits of the counter.
17461 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
17462 DAG.getConstant(32, MVT::i8));
17463 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
17464 Results.push_back(Chain);
17468 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
17469 SDValue Ops[] = { LO, HI };
17470 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
17471 Results.push_back(Pair);
17472 Results.push_back(Chain);
17475 // getReadTimeStampCounter - Handles the lowering of builtin intrinsics that
17476 // read the time stamp counter (x86_rdtsc and x86_rdtscp). This function is
17477 // also used to custom lower READCYCLECOUNTER nodes.
17478 static void getReadTimeStampCounter(SDNode *N, SDLoc DL, unsigned Opcode,
17479 SelectionDAG &DAG, const X86Subtarget *Subtarget,
17480 SmallVectorImpl<SDValue> &Results) {
17481 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
17482 SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0));
17483 SDValue LO, HI;
17485 // The processor's time-stamp counter (a 64-bit MSR) is stored into the
17486 // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
17487 // and the EAX register is loaded with the low-order 32 bits.
17488 if (Subtarget->is64Bit()) {
17489 LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
17490 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
17493 LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
17494 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
17497 SDValue Chain = HI.getValue(1);
17499 if (Opcode == X86ISD::RDTSCP_DAG) {
17500 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
17502 // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
17503 // the ECX register. Add 'ecx' explicitly to the chain.
17504 SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32,
17506 // Explicitly store the content of ECX at the location passed in input
17507 // to the 'rdtscp' intrinsic.
17508 Chain = DAG.getStore(ecx.getValue(1), DL, ecx, N->getOperand(2),
17509 MachinePointerInfo(), false, false, 0);
17512 if (Subtarget->is64Bit()) {
17513 // The EDX register is loaded with the high-order 32 bits of the MSR, and
17514 // the EAX register is loaded with the low-order 32 bits.
17515 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
17516 DAG.getConstant(32, MVT::i8));
17517 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
17518 Results.push_back(Chain);
17522 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
17523 SDValue Ops[] = { LO, HI };
17524 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
17525 Results.push_back(Pair);
17526 Results.push_back(Chain);
17529 static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget,
17530 SelectionDAG &DAG) {
17531 SmallVector<SDValue, 2> Results;
17533 getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget,
17535 return DAG.getMergeValues(Results, DL);
17539 static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
17540 SelectionDAG &DAG) {
17541 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
17543 const IntrinsicData* IntrData = getIntrinsicWithChain(IntNo);
17544 if (!IntrData)
17545 return SDValue();
17547 SDLoc dl(Op);
17548 switch(IntrData->Type) {
17549 default:
17550 llvm_unreachable("Unknown Intrinsic Type");
17551 break;
17552 case RDSEED:
17553 case RDRAND: {
17554 // Emit the node with the right value type.
17555 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other);
17556 SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
17558 // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
17559 // Otherwise return the value from Rand, which is always 0, casted to i32.
17560 SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
17561 DAG.getConstant(1, Op->getValueType(1)),
17562 DAG.getConstant(X86::COND_B, MVT::i32),
17563 SDValue(Result.getNode(), 1) };
17564 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl,
17565 DAG.getVTList(Op->getValueType(1), MVT::Glue),
17566 Ops);
17568 // Return { result, isValid, chain }.
17569 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
17570 SDValue(Result.getNode(), 2));
17571 }
17572 case GATHER: {
17573 //gather(v1, mask, index, base, scale);
17574 SDValue Chain = Op.getOperand(0);
17575 SDValue Src = Op.getOperand(2);
17576 SDValue Base = Op.getOperand(3);
17577 SDValue Index = Op.getOperand(4);
17578 SDValue Mask = Op.getOperand(5);
17579 SDValue Scale = Op.getOperand(6);
17580 return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain,
17581 Subtarget);
17582 }
17583 case SCATTER: {
17584 //scatter(base, mask, index, v1, scale);
17585 SDValue Chain = Op.getOperand(0);
17586 SDValue Base = Op.getOperand(2);
17587 SDValue Mask = Op.getOperand(3);
17588 SDValue Index = Op.getOperand(4);
17589 SDValue Src = Op.getOperand(5);
17590 SDValue Scale = Op.getOperand(6);
17591 return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain);
17592 }
17593 case PREFETCH: {
17594 SDValue Hint = Op.getOperand(6);
17595 unsigned HintVal;
17596 if (dyn_cast<ConstantSDNode> (Hint) == nullptr ||
17597 (HintVal = dyn_cast<ConstantSDNode> (Hint)->getZExtValue()) > 1)
17598 llvm_unreachable("Wrong prefetch hint in intrinsic: should be 0 or 1");
17599 unsigned Opcode = (HintVal ? IntrData->Opc1 : IntrData->Opc0);
17600 SDValue Chain = Op.getOperand(0);
17601 SDValue Mask = Op.getOperand(2);
17602 SDValue Index = Op.getOperand(3);
17603 SDValue Base = Op.getOperand(4);
17604 SDValue Scale = Op.getOperand(5);
17605 return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain);
17606 }
17607 // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
17608 case RDTSC: {
17609 SmallVector<SDValue, 2> Results;
17610 getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget, Results);
17611 return DAG.getMergeValues(Results, dl);
17612 }
17613 // Read Performance Monitoring Counters.
17614 case RDPMC: {
17615 SmallVector<SDValue, 2> Results;
17616 getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results);
17617 return DAG.getMergeValues(Results, dl);
17618 }
17619 // XTEST intrinsics.
17620 case XTEST: {
17621 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
17622 SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
17623 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17624 DAG.getConstant(X86::COND_NE, MVT::i8),
17626 SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
17627 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
17628 Ret, SDValue(InTrans.getNode(), 1));
17629 }
17631 case ADX: {
17632 SmallVector<SDValue, 2> Results;
17633 SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
17634 SDVTList VTs = DAG.getVTList(Op.getOperand(3)->getValueType(0), MVT::Other);
17635 SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2),
17636 DAG.getConstant(-1, MVT::i8));
17637 SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3),
17638 Op.getOperand(4), GenCF.getValue(1));
17639 SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0),
17640 Op.getOperand(5), MachinePointerInfo(),
17642 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17643 DAG.getConstant(X86::COND_B, MVT::i8),
17645 Results.push_back(SetCC);
17646 Results.push_back(Store);
17647 return DAG.getMergeValues(Results, dl);
17649 case COMPRESS_TO_MEM: {
17651 SDValue Mask = Op.getOperand(4);
17652 SDValue DataToCompress = Op.getOperand(3);
17653 SDValue Addr = Op.getOperand(2);
17654 SDValue Chain = Op.getOperand(0);
17656 if (isAllOnes(Mask)) // return just a store
17657 return DAG.getStore(Chain, dl, DataToCompress, Addr,
17658 MachinePointerInfo(), false, false, 0);
17660 EVT VT = DataToCompress.getValueType();
17661 EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17662 VT.getVectorNumElements());
17663 EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17664 Mask.getValueType().getSizeInBits());
17665 SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
17666 DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
17667 DAG.getIntPtrConstant(0));
17669 SDValue Compressed = DAG.getNode(IntrData->Opc0, dl, VT, VMask,
17670 DataToCompress, DAG.getUNDEF(VT));
17671 return DAG.getStore(Chain, dl, Compressed, Addr,
17672 MachinePointerInfo(), false, false, 0);
17674 case EXPAND_FROM_MEM: {
17676 SDValue Mask = Op.getOperand(4);
17677 SDValue PathThru = Op.getOperand(3);
17678 SDValue Addr = Op.getOperand(2);
17679 SDValue Chain = Op.getOperand(0);
17680 EVT VT = Op.getValueType();
17682 if (isAllOnes(Mask)) // return just a load
17683 return DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(), false, false,
17685 EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17686 VT.getVectorNumElements());
17687 EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17688 Mask.getValueType().getSizeInBits());
17689 SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
17690 DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
17691 DAG.getIntPtrConstant(0));
17693 SDValue DataToExpand = DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(),
17694 false, false, false, 0);
17696 SmallVector<SDValue, 2> Results;
17697 Results.push_back(DAG.getNode(IntrData->Opc0, dl, VT, VMask, DataToExpand,
17699 Results.push_back(Chain);
17700 return DAG.getMergeValues(Results, dl);
17705 SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
17706 SelectionDAG &DAG) const {
17707 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
17708 MFI->setReturnAddressIsTaken(true);
17710 if (verifyReturnAddressArgumentIsConstant(Op, DAG))
17711 return SDValue();
17713 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
17714 SDLoc dl(Op);
17715 EVT PtrVT = getPointerTy();
17717 if (Depth > 0) {
17718 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
17719 const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
17720 DAG.getSubtarget().getRegisterInfo());
17721 SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), PtrVT);
17722 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
17723 DAG.getNode(ISD::ADD, dl, PtrVT,
17724 FrameAddr, Offset),
17725 MachinePointerInfo(), false, false, false, 0);
17726 }
17728 // Just load the return address.
17729 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
17730 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
17731 RetAddrFI, MachinePointerInfo(), false, false, false, 0);
17734 SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
17735 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
17736 MFI->setFrameAddressIsTaken(true);
17738 EVT VT = Op.getValueType();
17739 SDLoc dl(Op); // FIXME probably not meaningful
17740 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
17741 const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
17742 DAG.getSubtarget().getRegisterInfo());
17743 unsigned FrameReg = RegInfo->getPtrSizedFrameRegister(
17744 DAG.getMachineFunction());
17745 assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
17746 (FrameReg == X86::EBP && VT == MVT::i32)) &&
17747 "Invalid Frame Register!");
17748 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
17749 while (Depth--)
17750 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
17751 MachinePointerInfo(),
17752 false, false, false, 0);
17753 return FrameAddr;
17754 }
17756 // FIXME? Maybe this could be a TableGen attribute on some registers and
17757 // this table could be generated automatically from RegInfo.
17758 unsigned X86TargetLowering::getRegisterByName(const char* RegName,
17759 EVT VT) const {
17760 unsigned Reg = StringSwitch<unsigned>(RegName)
17761 .Case("esp", X86::ESP)
17762 .Case("rsp", X86::RSP)
17763 .Default(0);
17764 if (Reg)
17765 return Reg;
17766 report_fatal_error("Invalid register name global variable");
17767 }
17769 SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
17770 SelectionDAG &DAG) const {
17771 const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
17772 DAG.getSubtarget().getRegisterInfo());
17773 return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize());
17776 SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
17777 SDValue Chain = Op.getOperand(0);
17778 SDValue Offset = Op.getOperand(1);
17779 SDValue Handler = Op.getOperand(2);
17782 EVT PtrVT = getPointerTy();
17783 const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
17784 DAG.getSubtarget().getRegisterInfo());
17785 unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
17786 assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
17787 (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
17788 "Invalid Frame Register!");
17789 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
17790 unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
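// Store the handler at frame + slot size + offset (the adjusted return-address
// slot) and hand that address to the EH_RETURN pseudo in RCX/ECX.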
17792 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
17793 DAG.getIntPtrConstant(RegInfo->getSlotSize()));
17794 StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
17795 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(),
17797 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
17799 return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
17800 DAG.getRegister(StoreAddrReg, PtrVT));
17803 SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
17804 SelectionDAG &DAG) const {
17806 return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
17807 DAG.getVTList(MVT::i32, MVT::Other),
17808 Op.getOperand(0), Op.getOperand(1));
17811 SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
17812 SelectionDAG &DAG) const {
17814 return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
17815 Op.getOperand(0), Op.getOperand(1));
17818 static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
17819 return Op.getOperand(0);
17822 SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
17823 SelectionDAG &DAG) const {
17824 SDValue Root = Op.getOperand(0);
17825 SDValue Trmp = Op.getOperand(1); // trampoline
17826 SDValue FPtr = Op.getOperand(2); // nested function
17827 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
17830 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
17831 const TargetRegisterInfo *TRI = DAG.getSubtarget().getRegisterInfo();
17833 if (Subtarget->is64Bit()) {
17834 SDValue OutChains[6];
17836 // Large code-model.
17837 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
17838 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
17840 const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
17841 const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
17843 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
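// The 64-bit trampoline emitted below is:
//   movabsq <fptr>, %r11 ; movabsq <nest>, %r10 ; jmpq *%r11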
17845 // Load the pointer to the nested function into R11.
17846 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
17847 SDValue Addr = Trmp;
17848 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
17849 Addr, MachinePointerInfo(TrmpAddr),
17852 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
17853 DAG.getConstant(2, MVT::i64));
17854 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
17855 MachinePointerInfo(TrmpAddr, 2),
17858 // Load the 'nest' parameter value into R10.
17859 // R10 is specified in X86CallingConv.td
17860 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
17861 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
17862 DAG.getConstant(10, MVT::i64));
17863 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
17864 Addr, MachinePointerInfo(TrmpAddr, 10),
17867 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
17868 DAG.getConstant(12, MVT::i64));
17869 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
17870 MachinePointerInfo(TrmpAddr, 12),
17873 // Jump to the nested function.
17874 OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
17875 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
17876 DAG.getConstant(20, MVT::i64));
17877 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
17878 Addr, MachinePointerInfo(TrmpAddr, 20),
17881 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
17882 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
17883 DAG.getConstant(22, MVT::i64));
17884 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr,
17885 MachinePointerInfo(TrmpAddr, 22),
17888 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
17890 const Function *Func =
17891 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
17892 CallingConv::ID CC = Func->getCallingConv();
17897 llvm_unreachable("Unsupported calling convention");
17898 case CallingConv::C:
17899 case CallingConv::X86_StdCall: {
17900 // Pass 'nest' parameter in ECX.
17901 // Must be kept in sync with X86CallingConv.td
17902 NestReg = X86::ECX;
17904 // Check that ECX wasn't needed by an 'inreg' parameter.
17905 FunctionType *FTy = Func->getFunctionType();
17906 const AttributeSet &Attrs = Func->getAttributes();
17908 if (!Attrs.isEmpty() && !Func->isVarArg()) {
17909 unsigned InRegCount = 0;
17912 for (FunctionType::param_iterator I = FTy->param_begin(),
17913 E = FTy->param_end(); I != E; ++I, ++Idx)
17914 if (Attrs.hasAttribute(Idx, Attribute::InReg))
17915 // FIXME: should only count parameters that are lowered to integers.
17916 InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32;
17918 if (InRegCount > 2) {
17919 report_fatal_error("Nest register in use - reduce number of inreg"
17925 case CallingConv::X86_FastCall:
17926 case CallingConv::X86_ThisCall:
17927 case CallingConv::Fast:
17928 // Pass 'nest' parameter in EAX.
17929 // Must be kept in sync with X86CallingConv.td
17930 NestReg = X86::EAX;
17934 SDValue OutChains[4];
17935 SDValue Addr, Disp;
17937 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
17938 DAG.getConstant(10, MVT::i32));
17939 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
17941 // This is storing the opcode for MOV32ri.
17942 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
17943 const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
17944 OutChains[0] = DAG.getStore(Root, dl,
17945 DAG.getConstant(MOV32ri|N86Reg, MVT::i8),
17946 Trmp, MachinePointerInfo(TrmpAddr),
17949 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
17950 DAG.getConstant(1, MVT::i32));
17951 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
17952 MachinePointerInfo(TrmpAddr, 1),
17955 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
17956 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
17957 DAG.getConstant(5, MVT::i32));
17958 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr,
17959 MachinePointerInfo(TrmpAddr, 5),
17962 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
17963 DAG.getConstant(6, MVT::i32));
17964 OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
17965 MachinePointerInfo(TrmpAddr, 6),
17968 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
17972 SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
17973 SelectionDAG &DAG) const {
17974 /*
17975 The rounding mode is in bits 11:10 of FPSR, and has the following
17976 settings:
17977 00 Round to nearest
17978 01 Round to -inf
17979 10 Round to +inf
17980 11 Round to 0
17982 FLT_ROUNDS, on the other hand, expects the following:
17983 -1 Undefined
17984 0 Round to 0
17985 1 Round to nearest
17986 2 Round to +inf
17987 3 Round to -inf
17989 To perform the conversion, we do:
17990 (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3)
17991 */
17993 MachineFunction &MF = DAG.getMachineFunction();
17994 const TargetMachine &TM = MF.getTarget();
17995 const TargetFrameLowering &TFI = *TM.getSubtargetImpl()->getFrameLowering();
17996 unsigned StackAlignment = TFI.getStackAlignment();
17997 MVT VT = Op.getSimpleValueType();
18000 // Save FP Control Word to stack slot
18001 int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false);
18002 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
18004 MachineMemOperand *MMO =
18005 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
18006 MachineMemOperand::MOStore, 2, 2);
18008 SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
18009 SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
18010 DAG.getVTList(MVT::Other),
18011 Ops, MVT::i16, MMO);
18013 // Load FP Control Word from stack slot
18014 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot,
18015 MachinePointerInfo(), false, false, false, 0);
18017 // Transform as necessary
18019 DAG.getNode(ISD::SRL, DL, MVT::i16,
18020 DAG.getNode(ISD::AND, DL, MVT::i16,
18021 CWD, DAG.getConstant(0x800, MVT::i16)),
18022 DAG.getConstant(11, MVT::i8));
18024 DAG.getNode(ISD::SRL, DL, MVT::i16,
18025 DAG.getNode(ISD::AND, DL, MVT::i16,
18026 CWD, DAG.getConstant(0x400, MVT::i16)),
18027 DAG.getConstant(9, MVT::i8));
18030 DAG.getNode(ISD::AND, DL, MVT::i16,
18031 DAG.getNode(ISD::ADD, DL, MVT::i16,
18032 DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),
18033 DAG.getConstant(1, MVT::i16)),
18034 DAG.getConstant(3, MVT::i16));
18036 return DAG.getNode((VT.getSizeInBits() < 16 ?
18037 ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
18040 static SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) {
18041 MVT VT = Op.getSimpleValueType();
18042 EVT OpVT = VT;
18043 unsigned NumBits = VT.getSizeInBits();
18044 SDLoc dl(Op);
18046 Op = Op.getOperand(0);
18047 if (VT == MVT::i8) {
18048 // Zero extend to i32 since there is not an i8 bsr.
18049 OpVT = MVT::i32;
18050 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
18051 }
18053 // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
18054 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
18055 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
18057 // If src is zero (i.e. bsr sets ZF), returns NumBits.
18058 SDValue Ops[] = {
18059 Op,
18060 DAG.getConstant(NumBits+NumBits-1, OpVT),
18061 DAG.getConstant(X86::COND_E, MVT::i8),
18062 Op.getValue(1)
18063 };
18064 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
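// BSR yields the bit index of the most-significant set bit; for a zero input the
// CMOV above substitutes 2*NumBits-1, so the XOR with NumBits-1 below produces
// NumBits for zero and the leading-zero count otherwise.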
18066 // Finally xor with NumBits-1.
18067 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));
18070 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
18074 static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) {
18075 MVT VT = Op.getSimpleValueType();
18077 unsigned NumBits = VT.getSizeInBits();
18080 Op = Op.getOperand(0);
18081 if (VT == MVT::i8) {
18082 // Zero extend to i32 since there is not an i8 bsr.
18084 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
18087 // Issue a bsr (scan bits in reverse).
18088 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
18089 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
18091 // And xor with NumBits-1.
18092 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));
18095 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
18099 static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
18100 MVT VT = Op.getSimpleValueType();
18101 unsigned NumBits = VT.getSizeInBits();
18103 Op = Op.getOperand(0);
18105 // Issue a bsf (scan bits forward) which also sets EFLAGS.
18106 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
18107 Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op);
18109 // If src is zero (i.e. bsf sets ZF), returns NumBits.
18110 SDValue Ops[] = {
18111 Op,
18112 DAG.getConstant(NumBits, VT),
18113 DAG.getConstant(X86::COND_E, MVT::i8),
18114 Op.getValue(1)
18115 };
18116 return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
18119 // Lower256IntArith - Break a 256-bit integer operation into two new 128-bit
18120 // ones, and then concatenate the result back.
18121 static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
18122 MVT VT = Op.getSimpleValueType();
18124 assert(VT.is256BitVector() && VT.isInteger() &&
18125 "Unsupported value type for operation");
18127 unsigned NumElems = VT.getVectorNumElements();
18130 // Extract the LHS vectors
18131 SDValue LHS = Op.getOperand(0);
18132 SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
18133 SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
18135 // Extract the RHS vectors
18136 SDValue RHS = Op.getOperand(1);
18137 SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl);
18138 SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl);
18140 MVT EltVT = VT.getVectorElementType();
18141 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
18143 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
18144 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
18145 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
18148 static SDValue LowerADD(SDValue Op, SelectionDAG &DAG) {
18149 assert(Op.getSimpleValueType().is256BitVector() &&
18150 Op.getSimpleValueType().isInteger() &&
18151 "Only handle AVX 256-bit vector integer operation");
18152 return Lower256IntArith(Op, DAG);
18155 static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) {
18156 assert(Op.getSimpleValueType().is256BitVector() &&
18157 Op.getSimpleValueType().isInteger() &&
18158 "Only handle AVX 256-bit vector integer operation");
18159 return Lower256IntArith(Op, DAG);
18162 static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
18163 SelectionDAG &DAG) {
18165 MVT VT = Op.getSimpleValueType();
18167 // Decompose 256-bit ops into smaller 128-bit ops.
18168 if (VT.is256BitVector() && !Subtarget->hasInt256())
18169 return Lower256IntArith(Op, DAG);
18171 SDValue A = Op.getOperand(0);
18172 SDValue B = Op.getOperand(1);
18174 // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
18175 if (VT == MVT::v4i32) {
18176 assert(Subtarget->hasSSE2() && !Subtarget->hasSSE41() &&
18177 "Should not custom lower when pmuldq is available!");
18179 // Extract the odd parts.
18180 static const int UnpackMask[] = { 1, -1, 3, -1 };
18181 SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
18182 SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
18184 // Multiply the even parts.
18185 SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B);
18186 // Now multiply odd parts.
18187 SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds);
18189 Evens = DAG.getNode(ISD::BITCAST, dl, VT, Evens);
18190 Odds = DAG.getNode(ISD::BITCAST, dl, VT, Odds);
18192 // Merge the two vectors back together with a shuffle. This expands into 2
18194 static const int ShufMask[] = { 0, 4, 2, 6 };
18195 return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
18198 assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
18199 "Only know how to lower V2I64/V4I64/V8I64 multiply");
18201 // Ahi = psrlqi(a, 32);
18202 // Bhi = psrlqi(b, 32);
18204 // AloBlo = pmuludq(a, b);
18205 // AloBhi = pmuludq(a, Bhi);
18206 // AhiBlo = pmuludq(Ahi, b);
18208 // AloBhi = psllqi(AloBhi, 32);
18209 // AhiBlo = psllqi(AhiBlo, 32);
18210 // return AloBlo + AloBhi + AhiBlo;
18212 SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
18213 SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
18215 // Bit cast to 32-bit vectors for MULUDQ
18216 EVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 :
18217 (VT == MVT::v4i64) ? MVT::v8i32 : MVT::v16i32;
18218 A = DAG.getNode(ISD::BITCAST, dl, MulVT, A);
18219 B = DAG.getNode(ISD::BITCAST, dl, MulVT, B);
18220 Ahi = DAG.getNode(ISD::BITCAST, dl, MulVT, Ahi);
18221 Bhi = DAG.getNode(ISD::BITCAST, dl, MulVT, Bhi);
18223 SDValue AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
18224 SDValue AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
18225 SDValue AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
18227 AloBhi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AloBhi, 32, DAG);
18228 AhiBlo = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AhiBlo, 32, DAG);
18230 SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi);
18231 return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
18234 SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
18235 assert(Subtarget->isTargetWin64() && "Unexpected target");
18236 EVT VT = Op.getValueType();
18237 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
18238 "Unexpected return type for lowering");
18242 switch (Op->getOpcode()) {
18243 default: llvm_unreachable("Unexpected request for libcall!");
18244 case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
18245 case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
18246 case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
18247 case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
18248 case ISD::SDIVREM: isSigned = true; LC = RTLIB::SDIVREM_I128; break;
18249 case ISD::UDIVREM: isSigned = false; LC = RTLIB::UDIVREM_I128; break;
18253 SDValue InChain = DAG.getEntryNode();
18255 TargetLowering::ArgListTy Args;
18256 TargetLowering::ArgListEntry Entry;
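// Win64 has no native i128 arguments: spill each operand to a 16-byte aligned
// stack slot and pass a pointer to that slot instead.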
18257 for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
18258 EVT ArgVT = Op->getOperand(i).getValueType();
18259 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
18260 "Unexpected argument type for lowering");
18261 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
18262 Entry.Node = StackPtr;
18263 InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MachinePointerInfo(),
18265 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
18266 Entry.Ty = PointerType::get(ArgTy,0);
18267 Entry.isSExt = false;
18268 Entry.isZExt = false;
18269 Args.push_back(Entry);
18272 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
18275 TargetLowering::CallLoweringInfo CLI(DAG);
18276 CLI.setDebugLoc(dl).setChain(InChain)
18277 .setCallee(getLibcallCallingConv(LC),
18278 static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()),
18279 Callee, std::move(Args), 0)
18280 .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);
18282 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
18283 return DAG.getNode(ISD::BITCAST, dl, VT, CallInfo.first);
18286 static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget,
18287 SelectionDAG &DAG) {
18288 SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
18289 EVT VT = Op0.getValueType();
18292 assert((VT == MVT::v4i32 && Subtarget->hasSSE2()) ||
18293 (VT == MVT::v8i32 && Subtarget->hasInt256()));
18295 // PMULxD operations multiply each even value (starting at 0) of LHS with
18296 // the related value of RHS and produce a widen result.
18297 // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
18298 // => <2 x i64> <ae|cg>
18300 // In other word, to have all the results, we need to perform two PMULxD:
18301 // 1. one with the even values.
18302 // 2. one with the odd values.
18303 // To achieve #2, with need to place the odd values at an even position.
18305 // Place the odd value at an even position (basically, shift all values 1
18306 // step to the left):
18307 const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1};
18308 // <a|b|c|d> => <b|undef|d|undef>
18309 SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0, Mask);
18310 // <e|f|g|h> => <f|undef|h|undef>
18311 SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1, Mask);
18313 // Emit two multiplies, one for the lower 2 ints and one for the higher 2
18315 MVT MulVT = VT == MVT::v4i32 ? MVT::v2i64 : MVT::v4i64;
18316 bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
18318 (!IsSigned || !Subtarget->hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
18319 // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
18320 // => <2 x i64> <ae|cg>
18321 SDValue Mul1 = DAG.getNode(ISD::BITCAST, dl, VT,
18322 DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
18323 // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
18324 // => <2 x i64> <bf|dh>
18325 SDValue Mul2 = DAG.getNode(ISD::BITCAST, dl, VT,
18326 DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1));
18328 // Shuffle it back into the right order.
18329 SDValue Highs, Lows;
18330 if (VT == MVT::v8i32) {
18331 const int HighMask[] = {1, 9, 3, 11, 5, 13, 7, 15};
18332 Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
18333 const int LowMask[] = {0, 8, 2, 10, 4, 12, 6, 14};
18334 Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
18336 const int HighMask[] = {1, 5, 3, 7};
18337 Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
18338 const int LowMask[] = {0, 4, 2, 6};
18339 Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
18342 // If we have a signed multiply but no PMULDQ fix up the high parts of a
18343 // unsigned multiply.
18344 if (IsSigned && !Subtarget->hasSSE41()) {
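// The unsigned high half differs from the signed high half by
// (Op0 < 0 ? Op1 : 0) + (Op1 < 0 ? Op0 : 0); the sign-mask AND terms below
// compute exactly that correction.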
18345 SDValue ShAmt =
18346 DAG.getConstant(31, DAG.getTargetLoweringInfo().getShiftAmountTy(VT));
18347 SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
18348 DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
18349 SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
18350 DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0);
18352 SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
18353 Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
18356 // The first result of MUL_LOHI is actually the low value, followed by the
18358 SDValue Ops[] = {Lows, Highs};
18359 return DAG.getMergeValues(Ops, dl);
18362 static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
18363 const X86Subtarget *Subtarget) {
18364 MVT VT = Op.getSimpleValueType();
18366 SDValue R = Op.getOperand(0);
18367 SDValue Amt = Op.getOperand(1);
18369 // Optimize shl/srl/sra with constant shift amount.
18370 if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
18371 if (auto *ShiftConst = BVAmt->getConstantSplatNode()) {
18372 uint64_t ShiftAmt = ShiftConst->getZExtValue();
18374 if (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 ||
18375 (Subtarget->hasInt256() &&
18376 (VT == MVT::v4i64 || VT == MVT::v8i32 || VT == MVT::v16i16)) ||
18377 (Subtarget->hasAVX512() &&
18378 (VT == MVT::v8i64 || VT == MVT::v16i32))) {
18379 if (Op.getOpcode() == ISD::SHL)
18380 return getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, R, ShiftAmt,
18382 if (Op.getOpcode() == ISD::SRL)
18383 return getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt,
18385 if (Op.getOpcode() == ISD::SRA && VT != MVT::v2i64 && VT != MVT::v4i64)
18386 return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, ShiftAmt,
18390 if (VT == MVT::v16i8) {
18391 if (Op.getOpcode() == ISD::SHL) {
18392 // Make a large shift.
18393 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl,
18394 MVT::v8i16, R, ShiftAmt,
18396 SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL);
18397 // Zero out the rightmost bits.
18398 SmallVector<SDValue, 16> V(16,
18399 DAG.getConstant(uint8_t(-1U << ShiftAmt),
18401 return DAG.getNode(ISD::AND, dl, VT, SHL,
18402 DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
18404 if (Op.getOpcode() == ISD::SRL) {
18405 // Make a large shift.
18406 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl,
18407 MVT::v8i16, R, ShiftAmt,
18409 SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL);
18410 // Zero out the leftmost bits.
18411 SmallVector<SDValue, 16> V(16,
18412 DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
18414 return DAG.getNode(ISD::AND, dl, VT, SRL,
18415 DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
18417 if (Op.getOpcode() == ISD::SRA) {
18418 if (ShiftAmt == 7) {
18419 // R s>> 7 === R s< 0
18420 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
18421 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
18424 // R s>> a === ((R u>> a) ^ m) - m
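// Here m = 0x80 >> a has only the shifted-down sign bit set, so the XOR/SUB
// pair sign-extends the logically shifted value from that bit position.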
18425 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
18426 SmallVector<SDValue, 16> V(16, DAG.getConstant(128 >> ShiftAmt,
18428 SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V);
18429 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
18430 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
18433 llvm_unreachable("Unknown shift opcode.");
18436 if (Subtarget->hasInt256() && VT == MVT::v32i8) {
18437 if (Op.getOpcode() == ISD::SHL) {
18438 // Make a large shift.
18439 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl,
18440 MVT::v16i16, R, ShiftAmt,
18442 SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL);
18443 // Zero out the rightmost bits.
18444 SmallVector<SDValue, 32> V(32,
18445 DAG.getConstant(uint8_t(-1U << ShiftAmt),
18447 return DAG.getNode(ISD::AND, dl, VT, SHL,
18448 DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
18450 if (Op.getOpcode() == ISD::SRL) {
18451 // Make a large shift.
18452 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl,
18453 MVT::v16i16, R, ShiftAmt,
18455 SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL);
18456 // Zero out the leftmost bits.
18457 SmallVector<SDValue, 32> V(32,
18458 DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
18460 return DAG.getNode(ISD::AND, dl, VT, SRL,
18461 DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
18463 if (Op.getOpcode() == ISD::SRA) {
18464 if (ShiftAmt == 7) {
18465 // R s>> 7 === R s< 0
18466 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
18467 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
18470 // R s>> a === ((R u>> a) ^ m) - m
18471 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
18472 SmallVector<SDValue, 32> V(32, DAG.getConstant(128 >> ShiftAmt,
18474 SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V);
18475 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
18476 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
18479 llvm_unreachable("Unknown shift opcode.");
18484 // Special case in 32-bit mode, where i64 is expanded into high and low parts.
18485 if (!Subtarget->is64Bit() &&
18486 (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64)) &&
18487 Amt.getOpcode() == ISD::BITCAST &&
18488 Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
18489 Amt = Amt.getOperand(0);
18490 unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
18491 VT.getVectorNumElements();
18492 unsigned RatioInLog2 = Log2_32_Ceil(Ratio);
18493 uint64_t ShiftAmt = 0;
18494 for (unsigned i = 0; i != Ratio; ++i) {
18495 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i));
18499 ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2)));
18501 // Check remaining shift amounts.
18502 for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
18503 uint64_t ShAmt = 0;
18504 for (unsigned j = 0; j != Ratio; ++j) {
18505 ConstantSDNode *C =
18506 dyn_cast<ConstantSDNode>(Amt.getOperand(i + j));
18510 ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2)));
18512 if (ShAmt != ShiftAmt)
18515 switch (Op.getOpcode()) {
18517 llvm_unreachable("Unknown shift opcode!");
18519 return getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, R, ShiftAmt,
18522 return getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt,
18525 return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, ShiftAmt,
18533 static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
18534 const X86Subtarget* Subtarget) {
18535 MVT VT = Op.getSimpleValueType();
18537 SDValue R = Op.getOperand(0);
18538 SDValue Amt = Op.getOperand(1);
18540 if ((VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) ||
18541 VT == MVT::v4i32 || VT == MVT::v8i16 ||
18542 (Subtarget->hasInt256() &&
18543 ((VT == MVT::v4i64 && Op.getOpcode() != ISD::SRA) ||
18544 VT == MVT::v8i32 || VT == MVT::v16i16)) ||
18545 (Subtarget->hasAVX512() && (VT == MVT::v8i64 || VT == MVT::v16i32))) {
18547 EVT EltVT = VT.getVectorElementType();
18549 if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Amt)) {
18550 // Check if this build_vector node is doing a splat.
18551 // If so, then set BaseShAmt equal to the splat value.
18552 BaseShAmt = BV->getSplatValue();
18553 if (BaseShAmt && BaseShAmt.getOpcode() == ISD::UNDEF)
18554 BaseShAmt = SDValue();
18556 if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR)
18557 Amt = Amt.getOperand(0);
18559 ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt);
18560 if (SVN && SVN->isSplat()) {
18561 unsigned SplatIdx = (unsigned)SVN->getSplatIndex();
18562 SDValue InVec = Amt.getOperand(0);
18563 if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
18564 assert((SplatIdx < InVec.getValueType().getVectorNumElements()) &&
18565 "Unexpected shuffle index found!");
18566 BaseShAmt = InVec.getOperand(SplatIdx);
18567 } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
18568 if (ConstantSDNode *C =
18569 dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
18570 if (C->getZExtValue() == SplatIdx)
18571 BaseShAmt = InVec.getOperand(1);
18576 // Avoid introducing an extract element from a shuffle.
18577 BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InVec,
18578 DAG.getIntPtrConstant(SplatIdx));
18582 if (BaseShAmt.getNode()) {
18583 assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
18584 if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
18585 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
18586 else if (EltVT.bitsLT(MVT::i32))
18587 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
18589 switch (Op.getOpcode()) {
18591 llvm_unreachable("Unknown shift opcode!");
18593 switch (VT.SimpleTy) {
18594 default: return SDValue();
18603 return getTargetVShiftNode(X86ISD::VSHLI, dl, VT, R, BaseShAmt, DAG);
18606 switch (VT.SimpleTy) {
18607 default: return SDValue();
18614 return getTargetVShiftNode(X86ISD::VSRAI, dl, VT, R, BaseShAmt, DAG);
18617 switch (VT.SimpleTy) {
18618 default: return SDValue();
18627 return getTargetVShiftNode(X86ISD::VSRLI, dl, VT, R, BaseShAmt, DAG);
18633 // Special case in 32-bit mode, where i64 is expanded into high and low parts.
18634 if (!Subtarget->is64Bit() &&
18635 (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64) ||
18636 (Subtarget->hasAVX512() && VT == MVT::v8i64)) &&
18637 Amt.getOpcode() == ISD::BITCAST &&
18638 Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
18639 Amt = Amt.getOperand(0);
18640 unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
18641 VT.getVectorNumElements();
18642 std::vector<SDValue> Vals(Ratio);
18643 for (unsigned i = 0; i != Ratio; ++i)
18644 Vals[i] = Amt.getOperand(i);
18645 for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
18646 for (unsigned j = 0; j != Ratio; ++j)
18647 if (Vals[j] != Amt.getOperand(i + j))
18650 switch (Op.getOpcode()) {
18652 llvm_unreachable("Unknown shift opcode!");
18654 return DAG.getNode(X86ISD::VSHL, dl, VT, R, Op.getOperand(1));
18656 return DAG.getNode(X86ISD::VSRL, dl, VT, R, Op.getOperand(1));
18658 return DAG.getNode(X86ISD::VSRA, dl, VT, R, Op.getOperand(1));
18665 static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
18666 SelectionDAG &DAG) {
18667 MVT VT = Op.getSimpleValueType();
18669 SDValue R = Op.getOperand(0);
18670 SDValue Amt = Op.getOperand(1);
18673 assert(VT.isVector() && "Custom lowering only for vector shifts!");
18674 assert(Subtarget->hasSSE2() && "Only custom lower when we have SSE2!");
18676 V = LowerScalarImmediateShift(Op, DAG, Subtarget);
18680 V = LowerScalarVariableShift(Op, DAG, Subtarget);
18684 if (Subtarget->hasAVX512() && (VT == MVT::v16i32 || VT == MVT::v8i64))
18686 // AVX2 has VPSLLV/VPSRAV/VPSRLV.
18687 if (Subtarget->hasInt256()) {
18688 if (Op.getOpcode() == ISD::SRL &&
18689 (VT == MVT::v2i64 || VT == MVT::v4i32 ||
18690 VT == MVT::v4i64 || VT == MVT::v8i32))
18692 if (Op.getOpcode() == ISD::SHL &&
18693 (VT == MVT::v2i64 || VT == MVT::v4i32 ||
18694 VT == MVT::v4i64 || VT == MVT::v8i32))
18696 if (Op.getOpcode() == ISD::SRA && (VT == MVT::v4i32 || VT == MVT::v8i32))
18700 // If possible, lower this packed shift into a vector multiply instead of
18701 // expanding it into a sequence of scalar shifts.
18702 // Do this only if the vector shift count is a constant build_vector.
18703 if (Op.getOpcode() == ISD::SHL &&
18704 (VT == MVT::v8i16 || VT == MVT::v4i32 ||
18705 (Subtarget->hasInt256() && VT == MVT::v16i16)) &&
18706 ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
18707 SmallVector<SDValue, 8> Elts;
18708 EVT SVT = VT.getScalarType();
18709 unsigned SVTBits = SVT.getSizeInBits();
18710 const APInt &One = APInt(SVTBits, 1);
18711 unsigned NumElems = VT.getVectorNumElements();
18713 for (unsigned i=0; i !=NumElems; ++i) {
18714 SDValue Op = Amt->getOperand(i);
18715 if (Op->getOpcode() == ISD::UNDEF) {
18716 Elts.push_back(Op);
18720 ConstantSDNode *ND = cast<ConstantSDNode>(Op);
18721 const APInt &C = APInt(SVTBits, ND->getAPIntValue().getZExtValue());
18722 uint64_t ShAmt = C.getZExtValue();
18723 if (ShAmt >= SVTBits) {
18724 Elts.push_back(DAG.getUNDEF(SVT));
18727 Elts.push_back(DAG.getConstant(One.shl(ShAmt), SVT));
18729 SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Elts);
18730 return DAG.getNode(ISD::MUL, dl, VT, R, BV);
18733 // Lower SHL with variable shift amount.
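// Rough idea of the trick used below: place each 32-bit shift amount in the
// f32 exponent field (<< 23), add the exponent bias (0x3f800000 is the bit
// pattern of 1.0f), and convert the resulting float back to an integer to
// obtain 2^amt per lane; e.g. amt = 3 gives (3 << 23) + 0x3f800000 =
// 0x41000000 = 8.0f -> 8. Multiplying R by that power of two performs the
// variable shift.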
18734 if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
18735 Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, VT));
18737 Op = DAG.getNode(ISD::ADD, dl, VT, Op, DAG.getConstant(0x3f800000U, VT));
18738 Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, Op);
18739 Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
18740 return DAG.getNode(ISD::MUL, dl, VT, Op, R);
18743 // If possible, lower this shift as a sequence of two shifts by
18744 // constant plus a MOVSS/MOVSD instead of scalarizing it.
18746 // (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
18748 // Could be rewritten as:
18749 // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
18751 // The advantage is that the two shifts from the example would be
18752 // lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing
18753 // the vector shift into four scalar shifts plus four pairs of vector insert/extract.
18755 if ((VT == MVT::v8i16 || VT == MVT::v4i32) &&
18756 ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
18757 unsigned TargetOpcode = X86ISD::MOVSS;
18758 bool CanBeSimplified;
18759 // The splat value for the first packed shift (the 'X' from the example).
18760 SDValue Amt1 = Amt->getOperand(0);
18761 // The splat value for the second packed shift (the 'Y' from the example).
18762 SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) :
18763 Amt->getOperand(2);
18765 // See if it is possible to replace this node with a sequence of
18766 // two shifts followed by a MOVSS/MOVSD
18767 if (VT == MVT::v4i32) {
18768 // Check if it is legal to use a MOVSS.
18769 CanBeSimplified = Amt2 == Amt->getOperand(2) &&
18770 Amt2 == Amt->getOperand(3);
18771 if (!CanBeSimplified) {
18772 // Otherwise, check if we can still simplify this node using a MOVSD.
18773 CanBeSimplified = Amt1 == Amt->getOperand(1) &&
18774 Amt->getOperand(2) == Amt->getOperand(3);
18775 TargetOpcode = X86ISD::MOVSD;
18776 Amt2 = Amt->getOperand(2);
18779 // Do similar checks for the case where the machine value type is MVT::v8i16.
18781 CanBeSimplified = Amt1 == Amt->getOperand(1);
18782 for (unsigned i=3; i != 8 && CanBeSimplified; ++i)
18783 CanBeSimplified = Amt2 == Amt->getOperand(i);
18785 if (!CanBeSimplified) {
18786 TargetOpcode = X86ISD::MOVSD;
18787 CanBeSimplified = true;
18788 Amt2 = Amt->getOperand(4);
18789 for (unsigned i=0; i != 4 && CanBeSimplified; ++i)
18790 CanBeSimplified = Amt1 == Amt->getOperand(i);
18791 for (unsigned j=4; j != 8 && CanBeSimplified; ++j)
18792 CanBeSimplified = Amt2 == Amt->getOperand(j);
18796 if (CanBeSimplified && isa<ConstantSDNode>(Amt1) &&
18797 isa<ConstantSDNode>(Amt2)) {
18798 // Replace this node with two shifts followed by a MOVSS/MOVSD.
18799 EVT CastVT = MVT::v4i32;
18801 DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), VT);
18802 SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
18804 DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), VT);
18805 SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
18806 if (TargetOpcode == X86ISD::MOVSD)
18807 CastVT = MVT::v2i64;
18808 SDValue BitCast1 = DAG.getNode(ISD::BITCAST, dl, CastVT, Shift1);
18809 SDValue BitCast2 = DAG.getNode(ISD::BITCAST, dl, CastVT, Shift2);
18810 SDValue Result = getTargetShuffleNode(TargetOpcode, dl, CastVT, BitCast2,
18812 return DAG.getNode(ISD::BITCAST, dl, VT, Result);
18816 if (VT == MVT::v16i8 && Op->getOpcode() == ISD::SHL) {
18817 assert(Subtarget->hasSSE2() && "Need SSE2 for pslli/pcmpeq.");
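// Sketch of the approach used below (a common SSE2 idiom): shift the byte
// amounts left by 5 so their three meaningful bits occupy bits 7..5, then
// conditionally shift R by 4, 2 and finally 1, using the current top bit of
// the (repeatedly doubled) amount vector to drive each VSELECT.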
18820 Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(5, VT));
18821 Op = DAG.getNode(ISD::BITCAST, dl, VT, Op);
18823 // Turn 'a' into a mask suitable for VSELECT
18824 SDValue VSelM = DAG.getConstant(0x80, VT);
18825 SDValue OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
18826 OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
18828 SDValue CM1 = DAG.getConstant(0x0f, VT);
18829 SDValue CM2 = DAG.getConstant(0x3f, VT);
18831 // r = VSELECT(r, psllw(r & (char16)15, 4), a);
18832 SDValue M = DAG.getNode(ISD::AND, dl, VT, R, CM1);
18833 M = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, MVT::v8i16, M, 4, DAG);
18834 M = DAG.getNode(ISD::BITCAST, dl, VT, M);
18835 R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R);
18838 Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);
18839 OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
18840 OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
18842 // r = VSELECT(r, psllw(r & (char16)63, 2), a);
18843 M = DAG.getNode(ISD::AND, dl, VT, R, CM2);
18844 M = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, MVT::v8i16, M, 2, DAG);
18845 M = DAG.getNode(ISD::BITCAST, dl, VT, M);
18846 R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R);
18849 Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);
18850 OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
18851 OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
18853 // return VSELECT(r, r+r, a);
18854 R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel,
18855 DAG.getNode(ISD::ADD, dl, VT, R, R), R);
18859 // It's worth extending once and using the v8i32 shifts for 16-bit types, but
18860 // the extra overheads to get from v16i8 to v8i32 make the existing SSE
18861 // solution better.
18862 if (Subtarget->hasInt256() && VT == MVT::v8i16) {
18863 MVT NewVT = VT == MVT::v8i16 ? MVT::v8i32 : MVT::v16i16;
18865 Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
18866 R = DAG.getNode(ExtOpc, dl, NewVT, R);
18867 Amt = DAG.getNode(ISD::ANY_EXTEND, dl, NewVT, Amt);
18868 return DAG.getNode(ISD::TRUNCATE, dl, VT,
18869 DAG.getNode(Op.getOpcode(), dl, NewVT, R, Amt));
18872 // Decompose 256-bit shifts into smaller 128-bit shifts.
18873 if (VT.is256BitVector()) {
18874 unsigned NumElems = VT.getVectorNumElements();
18875 MVT EltVT = VT.getVectorElementType();
18876 EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
18878 // Extract the two vectors
18879 SDValue V1 = Extract128BitVector(R, 0, DAG, dl);
18880 SDValue V2 = Extract128BitVector(R, NumElems/2, DAG, dl);
18882 // Recreate the shift amount vectors
18883 SDValue Amt1, Amt2;
18884 if (Amt.getOpcode() == ISD::BUILD_VECTOR) {
18885 // Constant shift amount
18886 SmallVector<SDValue, 4> Amt1Csts;
18887 SmallVector<SDValue, 4> Amt2Csts;
18888 for (unsigned i = 0; i != NumElems/2; ++i)
18889 Amt1Csts.push_back(Amt->getOperand(i));
18890 for (unsigned i = NumElems/2; i != NumElems; ++i)
18891 Amt2Csts.push_back(Amt->getOperand(i));
18893 Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt1Csts);
18894 Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt2Csts);
18896 // Variable shift amount
18897 Amt1 = Extract128BitVector(Amt, 0, DAG, dl);
18898 Amt2 = Extract128BitVector(Amt, NumElems/2, DAG, dl);
18901 // Issue new vector shifts for the smaller types
18902 V1 = DAG.getNode(Op.getOpcode(), dl, NewVT, V1, Amt1);
18903 V2 = DAG.getNode(Op.getOpcode(), dl, NewVT, V2, Amt2);
18905 // Concatenate the result back
18906 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, V1, V2);
18912 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
18913 // Lower the "add/sub/mul with overflow" instruction into a regular instruction plus
18914 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
18915 // looks for this combo and may remove the "setcc" instruction if the "setcc"
18916 // has only one use.
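// For example (illustrative only), (saddo x, y) becomes an X86ISD::ADD that
// also produces EFLAGS, followed by a SETCC of COND_O on those flags (the
// unsigned add/sub variants test COND_B instead).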
18917 SDNode *N = Op.getNode();
18918 SDValue LHS = N->getOperand(0);
18919 SDValue RHS = N->getOperand(1);
18920 unsigned BaseOp = 0;
18923 switch (Op.getOpcode()) {
18924 default: llvm_unreachable("Unknown ovf instruction!");
18926 // An add of one will be selected as an INC. Note that INC doesn't
18927 // set CF, so we can't do this for UADDO.
18928 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
18930 BaseOp = X86ISD::INC;
18931 Cond = X86::COND_O;
18934 BaseOp = X86ISD::ADD;
18935 Cond = X86::COND_O;
18938 BaseOp = X86ISD::ADD;
18939 Cond = X86::COND_B;
18942 // A subtract of one will be selected as a DEC. Note that DEC doesn't
18943 // set CF, so we can't do this for USUBO.
18944 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
18946 BaseOp = X86ISD::DEC;
18947 Cond = X86::COND_O;
18950 BaseOp = X86ISD::SUB;
18951 Cond = X86::COND_O;
18954 BaseOp = X86ISD::SUB;
18955 Cond = X86::COND_B;
18958 BaseOp = N->getValueType(0) == MVT::i8 ? X86ISD::SMUL8 : X86ISD::SMUL;
18959 Cond = X86::COND_O;
18961 case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
18962 if (N->getValueType(0) == MVT::i8) {
18963 BaseOp = X86ISD::UMUL8;
18964 Cond = X86::COND_O;
18967 SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0),
18969 SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS);
18972 DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
18973 DAG.getConstant(X86::COND_O, MVT::i32),
18974 SDValue(Sum.getNode(), 2));
18976 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
18980 // Also sets EFLAGS.
18981 SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
18982 SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
18985 DAG.getNode(X86ISD::SETCC, DL, N->getValueType(1),
18986 DAG.getConstant(Cond, MVT::i32),
18987 SDValue(Sum.getNode(), 1));
18989 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
18992 // Sign extension of the low part of vector elements. This may be used either
18993 // when sign extend instructions are not available or if the vector element
18994 // sizes already match the sign-extended size. If the vector elements are in
18995 // their pre-extended size and sign extend instructions are available, that will
18996 // be handled by LowerSIGN_EXTEND.
18997 SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
18998 SelectionDAG &DAG) const {
19000 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
19001 MVT VT = Op.getSimpleValueType();
19003 if (!Subtarget->hasSSE2() || !VT.isVector())
19006 unsigned BitsDiff = VT.getScalarType().getSizeInBits() -
19007 ExtraVT.getScalarType().getSizeInBits();
19009 switch (VT.SimpleTy) {
19010 default: return SDValue();
19013 if (!Subtarget->hasFp256())
19015 if (!Subtarget->hasInt256()) {
19016 // needs to be split
19017 unsigned NumElems = VT.getVectorNumElements();
19019 // Extract the LHS vectors
19020 SDValue LHS = Op.getOperand(0);
19021 SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
19022 SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
19024 MVT EltVT = VT.getVectorElementType();
19025 EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
19027 EVT ExtraEltVT = ExtraVT.getVectorElementType();
19028 unsigned ExtraNumElems = ExtraVT.getVectorNumElements();
19029 ExtraVT = EVT::getVectorVT(*DAG.getContext(), ExtraEltVT,
19031 SDValue Extra = DAG.getValueType(ExtraVT);
19033 LHS1 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, Extra);
19034 LHS2 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, Extra);
19036 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LHS1, LHS2);
19041 SDValue Op0 = Op.getOperand(0);
19043 // This is a sign extension of some low part of vector elements without
19044 // changing the size of the vector elements themselves:
19045 // Shift-Left + Shift-Right-Algebraic.
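// E.g. sign-extending the low 8 bits inside each 32-bit lane is emitted as
//   (sra (shl x, 24), 24)
// where 24 is the BitsDiff computed above.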
19046 SDValue Shl = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Op0,
19048 return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Shl, BitsDiff,
19054 /// Returns true if the operand type is exactly twice the native width, and
19055 /// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
19056 /// Used to know whether to use cmpxchg8/16b when expanding atomic operations
19057 /// (otherwise we leave them alone to become __sync_fetch_and_... calls).
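/// For example, an i64 atomic operation on a 32-bit target (or an i128 one on
/// a 64-bit target) answers true here when the matching instruction exists,
/// so AtomicExpandPass expands it via a cmpxchg8b/cmpxchg16b loop instead of
/// a __sync_* library call.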
19058 bool X86TargetLowering::needsCmpXchgNb(const Type *MemType) const {
19059 const X86Subtarget &Subtarget =
19060 getTargetMachine().getSubtarget<X86Subtarget>();
19061 unsigned OpWidth = MemType->getPrimitiveSizeInBits();
19064 return !Subtarget.is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b
19065 else if (OpWidth == 128)
19066 return Subtarget.hasCmpxchg16b();
19071 bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
19072 return needsCmpXchgNb(SI->getValueOperand()->getType());
19075 // Note: this turns large loads into lock cmpxchg8b/16b.
19076 // FIXME: On 32-bit x86, fild/movq might be faster than lock cmpxchg8b.
19077 bool X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
19078 auto PTy = cast<PointerType>(LI->getPointerOperand()->getType());
19079 return needsCmpXchgNb(PTy->getElementType());
19082 bool X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
19083 const X86Subtarget &Subtarget =
19084 getTargetMachine().getSubtarget<X86Subtarget>();
19085 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
19086 const Type *MemType = AI->getType();
19088 // If the operand is too big, we must see if cmpxchg8/16b is available
19089 // and default to library calls otherwise.
19090 if (MemType->getPrimitiveSizeInBits() > NativeWidth)
19091 return needsCmpXchgNb(MemType);
19093 AtomicRMWInst::BinOp Op = AI->getOperation();
19096 llvm_unreachable("Unknown atomic operation");
19097 case AtomicRMWInst::Xchg:
19098 case AtomicRMWInst::Add:
19099 case AtomicRMWInst::Sub:
19100 // It's better to use xadd, xsub or xchg for these in all cases.
19102 case AtomicRMWInst::Or:
19103 case AtomicRMWInst::And:
19104 case AtomicRMWInst::Xor:
19105 // If the atomicrmw's result isn't actually used, we can just add a "lock"
19106 // prefix to a normal instruction for these operations.
19107 return !AI->use_empty();
19108 case AtomicRMWInst::Nand:
19109 case AtomicRMWInst::Max:
19110 case AtomicRMWInst::Min:
19111 case AtomicRMWInst::UMax:
19112 case AtomicRMWInst::UMin:
19113 // These always require a non-trivial set of data operations on x86. We must
19114 // use a cmpxchg loop.
19119 static bool hasMFENCE(const X86Subtarget& Subtarget) {
19120 // Use mfence if we have SSE2 or we're on x86-64 (even if we asked for
19121 // no-sse2). There isn't any reason to disable it if the target processor supports it.
19123 return Subtarget.hasSSE2() || Subtarget.is64Bit();
19127 X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
19128 const X86Subtarget &Subtarget =
19129 getTargetMachine().getSubtarget<X86Subtarget>();
19130 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
19131 const Type *MemType = AI->getType();
19132 // Accesses larger than the native width are turned into cmpxchg/libcalls, so
19133 // there is no benefit in turning such RMWs into loads, and it is actually
19134 // harmful as it introduces an mfence.
19135 if (MemType->getPrimitiveSizeInBits() > NativeWidth)
19138 auto Builder = IRBuilder<>(AI);
19139 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
19140 auto SynchScope = AI->getSynchScope();
19141 // We must restrict the ordering to avoid generating loads with Release or
19142 // ReleaseAcquire orderings.
19143 auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
19144 auto Ptr = AI->getPointerOperand();
19146 // Before the load we need a fence. Here is an example lifted from
19147 // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence is required:
19150 // Thread 0:  x.store(1, relaxed);
19151 //            r1 = y.fetch_add(0, release);
19153 // Thread 1:  y.fetch_add(42, acquire);
19154 //            r2 = x.load(relaxed);
19155 // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
19156 // lowered to just a load without a fence. An mfence flushes the store buffer,
19157 // making the optimization clearly correct.
19158 // FIXME: it is required if isAtLeastRelease(Order) but it is not clear
19159 // otherwise; we might be able to be more aggressive on relaxed idempotent
19160 // rmw. In practice, they do not look useful, so we don't try to be
19161 // especially clever.
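// Illustrative overall effect of this hook (IR level, roughly): an idempotent
// RMW such as
//   atomicrmw or i32* %p, i32 0 seq_cst
// is rewritten into an mfence followed by an atomic load of %p, provided a
// suitable fence can be emitted; otherwise we return nullptr and leave the
// RMW alone.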
19162 if (SynchScope == SingleThread) {
19163 // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
19164 // the IR level, so we must wrap it in an intrinsic.
19166 } else if (hasMFENCE(Subtarget)) {
19167 Function *MFence = llvm::Intrinsic::getDeclaration(M,
19168 Intrinsic::x86_sse2_mfence);
19169 Builder.CreateCall(MFence);
19171 // FIXME: it might make sense to use a locked operation here but on a
19172 // different cache-line to prevent cache-line bouncing. In practice it
19173 // is probably a small win, and x86 processors without mfence are rare
19174 // enough that we do not bother.
19178 // Finally we can emit the atomic load.
19179 LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr,
19180 AI->getType()->getPrimitiveSizeInBits());
19181 Loaded->setAtomic(Order, SynchScope);
19182 AI->replaceAllUsesWith(Loaded);
19183 AI->eraseFromParent();
19187 static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget,
19188 SelectionDAG &DAG) {
19190 AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
19191 cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
19192 SynchronizationScope FenceScope = static_cast<SynchronizationScope>(
19193 cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
19195 // The only fence that needs an instruction is a sequentially-consistent
19196 // cross-thread fence.
19197 if (FenceOrdering == SequentiallyConsistent && FenceScope == CrossThread) {
19198 if (hasMFENCE(*Subtarget))
19199 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
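// Without MFENCE (32-bit mode before SSE2), fall back to the classic
//   lock or dword ptr [esp], 0
// idiom built below: a locked RMW of the stack slot acts as a full memory
// barrier without changing any useful data.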
19201 SDValue Chain = Op.getOperand(0);
19202 SDValue Zero = DAG.getConstant(0, MVT::i32);
19204 DAG.getRegister(X86::ESP, MVT::i32), // Base
19205 DAG.getTargetConstant(1, MVT::i8), // Scale
19206 DAG.getRegister(0, MVT::i32), // Index
19207 DAG.getTargetConstant(0, MVT::i32), // Disp
19208 DAG.getRegister(0, MVT::i32), // Segment.
19212 SDNode *Res = DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops);
19213 return SDValue(Res, 0);
19216 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
19217 return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
19220 static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget,
19221 SelectionDAG &DAG) {
19222 MVT T = Op.getSimpleValueType();
19226 switch(T.SimpleTy) {
19227 default: llvm_unreachable("Invalid value type!");
19228 case MVT::i8: Reg = X86::AL; size = 1; break;
19229 case MVT::i16: Reg = X86::AX; size = 2; break;
19230 case MVT::i32: Reg = X86::EAX; size = 4; break;
19232 assert(Subtarget->is64Bit() && "Node not type legal!");
19233 Reg = X86::RAX; size = 8;
19236 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
19237 Op.getOperand(2), SDValue());
19238 SDValue Ops[] = { cpIn.getValue(0),
19241 DAG.getTargetConstant(size, MVT::i8),
19242 cpIn.getValue(1) };
19243 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
19244 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
19245 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
19249 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
19250 SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
19251 MVT::i32, cpOut.getValue(2));
19252 SDValue Success = DAG.getNode(X86ISD::SETCC, DL, Op->getValueType(1),
19253 DAG.getConstant(X86::COND_E, MVT::i8), EFLAGS);
19255 DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut);
19256 DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
19257 DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1));
19261 static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget,
19262 SelectionDAG &DAG) {
19263 MVT SrcVT = Op.getOperand(0).getSimpleValueType();
19264 MVT DstVT = Op.getSimpleValueType();
19266 if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) {
19267 assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
19268 if (DstVT != MVT::f64)
19269 // This conversion needs to be expanded.
19272 SDValue InVec = Op->getOperand(0);
19274 unsigned NumElts = SrcVT.getVectorNumElements();
19275 EVT SVT = SrcVT.getVectorElementType();
19277 // Widen the input vector in the case of MVT::v2i32.
19278 // Example: from MVT::v2i32 to MVT::v4i32.
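// The sequence built below is, in effect:
//   (f64 (extract_vector_elt (v2f64 (bitcast (v4i32 <x, y, undef, undef>))), 0))
// i.e. widen, bitcast the whole vector to v2f64, and take element 0.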
19279 SmallVector<SDValue, 16> Elts;
19280 for (unsigned i = 0, e = NumElts; i != e; ++i)
19281 Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, InVec,
19282 DAG.getIntPtrConstant(i)));
19284 // Explicitly mark the extra elements as Undef.
19285 SDValue Undef = DAG.getUNDEF(SVT);
19286 for (unsigned i = NumElts, e = NumElts * 2; i != e; ++i)
19287 Elts.push_back(Undef);
19289 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
19290 SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Elts);
19291 SDValue ToV2F64 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, BV);
19292 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64,
19293 DAG.getIntPtrConstant(0));
19296 assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() &&
19297 Subtarget->hasMMX() && "Unexpected custom BITCAST");
19298 assert((DstVT == MVT::i64 ||
19299 (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
19300 "Unexpected custom BITCAST");
19301 // i64 <=> MMX conversions are Legal.
19302 if (SrcVT==MVT::i64 && DstVT.isVector())
19304 if (DstVT==MVT::i64 && SrcVT.isVector())
19306 // MMX <=> MMX conversions are Legal.
19307 if (SrcVT.isVector() && DstVT.isVector())
19309 // All other conversions need to be expanded.
19313 static SDValue LowerCTPOP(SDValue Op, const X86Subtarget *Subtarget,
19314 SelectionDAG &DAG) {
19315 SDNode *Node = Op.getNode();
19318 Op = Op.getOperand(0);
19319 EVT VT = Op.getValueType();
19320 assert((VT.is128BitVector() || VT.is256BitVector()) &&
19321 "CTPOP lowering only implemented for 128/256-bit wide vector types");
19323 unsigned NumElts = VT.getVectorNumElements();
19324 EVT EltVT = VT.getVectorElementType();
19325 unsigned Len = EltVT.getSizeInBits();
19327 // This is the vectorized version of the "best" algorithm from
19328 // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
19329 // with a minor tweak to use a series of adds + shifts instead of vector
19330 // multiplications. Implemented for the v2i64, v4i64, v4i32, v8i32 types:
19332 // v2i64, v4i64, v4i32 => Only profitable w/ popcnt disabled
19333 // v8i32 => Always profitable
19335 // FIXME: There are a couple of possible improvements:
19337 // 1) Support for i8 and i16 vectors (needs measurements if popcnt enabled).
19338 // 2) Use strategies from http://wm.ite.pl/articles/sse-popcount.html
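// For reference, the scalar form of that algorithm for a 32-bit value v is:
//   v = v - ((v >> 1) & 0x55555555);
//   v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
//   v = (v + (v >> 4)) & 0x0F0F0F0F;
//   v = (v * 0x01010101) >> 24;   // the multiply is replaced below by adds
//                                 // and shifts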
19340 assert(EltVT.isInteger() && (Len == 32 || Len == 64) && Len % 8 == 0 &&
19341 "CTPOP not implemented for this vector element type.");
19343 // X86 canonicalizes ANDs to vXi64, so generate the appropriate bitcasts to
19344 // avoid extra legalization.
19345 bool NeedsBitcast = EltVT == MVT::i32;
19346 MVT BitcastVT = VT.is256BitVector() ? MVT::v4i64 : MVT::v2i64;
19348 SDValue Cst55 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x55)), EltVT);
19349 SDValue Cst33 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x33)), EltVT);
19350 SDValue Cst0F = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x0F)), EltVT);
19352 // v = v - ((v >> 1) & 0x55555555...)
19353 SmallVector<SDValue, 8> Ones(NumElts, DAG.getConstant(1, EltVT));
19354 SDValue OnesV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ones);
19355 SDValue Srl = DAG.getNode(ISD::SRL, dl, VT, Op, OnesV);
19357 Srl = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Srl);
19359 SmallVector<SDValue, 8> Mask55(NumElts, Cst55);
19360 SDValue M55 = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask55);
19362 M55 = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M55);
19364 SDValue And = DAG.getNode(ISD::AND, dl, Srl.getValueType(), Srl, M55);
19365 if (VT != And.getValueType())
19366 And = DAG.getNode(ISD::BITCAST, dl, VT, And);
19367 SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, Op, And);
19369 // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
19370 SmallVector<SDValue, 8> Mask33(NumElts, Cst33);
19371 SDValue M33 = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask33);
19372 SmallVector<SDValue, 8> Twos(NumElts, DAG.getConstant(2, EltVT));
19373 SDValue TwosV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Twos);
19375 Srl = DAG.getNode(ISD::SRL, dl, VT, Sub, TwosV);
19376 if (NeedsBitcast) {
19377 Srl = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Srl);
19378 M33 = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M33);
19379 Sub = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Sub);
19382 SDValue AndRHS = DAG.getNode(ISD::AND, dl, M33.getValueType(), Srl, M33);
19383 SDValue AndLHS = DAG.getNode(ISD::AND, dl, M33.getValueType(), Sub, M33);
19384 if (VT != AndRHS.getValueType()) {
19385 AndRHS = DAG.getNode(ISD::BITCAST, dl, VT, AndRHS);
19386 AndLHS = DAG.getNode(ISD::BITCAST, dl, VT, AndLHS);
19388 SDValue Add = DAG.getNode(ISD::ADD, dl, VT, AndLHS, AndRHS);
19390 // v = (v + (v >> 4)) & 0x0F0F0F0F...
19391 SmallVector<SDValue, 8> Fours(NumElts, DAG.getConstant(4, EltVT));
19392 SDValue FoursV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Fours);
19393 Srl = DAG.getNode(ISD::SRL, dl, VT, Add, FoursV);
19394 Add = DAG.getNode(ISD::ADD, dl, VT, Add, Srl);
19396 SmallVector<SDValue, 8> Mask0F(NumElts, Cst0F);
19397 SDValue M0F = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask0F);
19398 if (NeedsBitcast) {
19399 Add = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Add);
19400 M0F = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M0F);
19402 And = DAG.getNode(ISD::AND, dl, M0F.getValueType(), Add, M0F);
19403 if (VT != And.getValueType())
19404 And = DAG.getNode(ISD::BITCAST, dl, VT, And);
19406 // The algorithm mentioned above uses:
19407 // v = (v * 0x01010101...) >> (Len - 8)
19409 // Change it to use vector adds + vector shifts which yield faster results on
19410 // Haswell than using vector integer multiplication.
19412 // For i32 elements:
19413 // v = v + (v >> 8)
19414 // v = v + (v >> 16)
19416 // For i64 elements:
19417 // v = v + (v >> 8)
19418 // v = v + (v >> 16)
19419 // v = v + (v >> 32)
19422 SmallVector<SDValue, 8> Csts;
19423 for (unsigned i = 8; i <= Len/2; i *= 2) {
19424 Csts.assign(NumElts, DAG.getConstant(i, EltVT));
19425 SDValue CstsV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Csts);
19426 Srl = DAG.getNode(ISD::SRL, dl, VT, Add, CstsV);
19427 Add = DAG.getNode(ISD::ADD, dl, VT, Add, Srl);
19431 // The result is in the least significant 6 bits for i32 and 7 bits for i64.
19432 SDValue Cst3F = DAG.getConstant(APInt(Len, Len == 32 ? 0x3F : 0x7F), EltVT);
19433 SmallVector<SDValue, 8> Cst3FV(NumElts, Cst3F);
19434 SDValue M3F = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Cst3FV);
19435 if (NeedsBitcast) {
19436 Add = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Add);
19437 M3F = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M3F);
19439 And = DAG.getNode(ISD::AND, dl, M3F.getValueType(), Add, M3F);
19440 if (VT != And.getValueType())
19441 And = DAG.getNode(ISD::BITCAST, dl, VT, And);
19446 static SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) {
19447 SDNode *Node = Op.getNode();
19449 EVT T = Node->getValueType(0);
19450 SDValue negOp = DAG.getNode(ISD::SUB, dl, T,
19451 DAG.getConstant(0, T), Node->getOperand(2));
19452 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl,
19453 cast<AtomicSDNode>(Node)->getMemoryVT(),
19454 Node->getOperand(0),
19455 Node->getOperand(1), negOp,
19456 cast<AtomicSDNode>(Node)->getMemOperand(),
19457 cast<AtomicSDNode>(Node)->getOrdering(),
19458 cast<AtomicSDNode>(Node)->getSynchScope());
19461 static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
19462 SDNode *Node = Op.getNode();
19464 EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
19466 // Convert seq_cst store -> xchg
19467 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
19468 // FIXME: On 32-bit, store -> fist or movq would be more efficient
19469 // (The only way to get a 16-byte store is cmpxchg16b)
19470 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
19471 if (cast<AtomicSDNode>(Node)->getOrdering() == SequentiallyConsistent ||
19472 !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
19473 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
19474 cast<AtomicSDNode>(Node)->getMemoryVT(),
19475 Node->getOperand(0),
19476 Node->getOperand(1), Node->getOperand(2),
19477 cast<AtomicSDNode>(Node)->getMemOperand(),
19478 cast<AtomicSDNode>(Node)->getOrdering(),
19479 cast<AtomicSDNode>(Node)->getSynchScope());
19480 return Swap.getValue(1);
19482 // Other atomic stores have a simple pattern.
19486 static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
19487 EVT VT = Op.getNode()->getSimpleValueType(0);
19489 // Let legalize expand this if it isn't a legal type yet.
19490 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
19493 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
19496 bool ExtraOp = false;
19497 switch (Op.getOpcode()) {
19498 default: llvm_unreachable("Invalid code");
19499 case ISD::ADDC: Opc = X86ISD::ADD; break;
19500 case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break;
19501 case ISD::SUBC: Opc = X86ISD::SUB; break;
19502 case ISD::SUBE: Opc = X86ISD::SBB; ExtraOp = true; break;
19506 return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
19508 return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
19509 Op.getOperand(1), Op.getOperand(2));
19512 static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget *Subtarget,
19513 SelectionDAG &DAG) {
19514 assert(Subtarget->isTargetDarwin() && Subtarget->is64Bit());
19516 // For MacOSX, we want to call an alternative entry point: __sincos_stret,
19517 // which returns the values as { float, float } (in XMM0) or
19518 // { double, double } (which is returned in XMM0, XMM1).
19520 SDValue Arg = Op.getOperand(0);
19521 EVT ArgVT = Arg.getValueType();
19522 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
19524 TargetLowering::ArgListTy Args;
19525 TargetLowering::ArgListEntry Entry;
19529 Entry.isSExt = false;
19530 Entry.isZExt = false;
19531 Args.push_back(Entry);
19533 bool isF64 = ArgVT == MVT::f64;
19534 // Only optimize x86_64 for now. i386 is a bit messy. For f32,
19535 // the small struct {f32, f32} is returned in (eax, edx). For f64,
19536 // the results are returned via SRet in memory.
19537 const char *LibcallName = isF64 ? "__sincos_stret" : "__sincosf_stret";
19538 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19539 SDValue Callee = DAG.getExternalSymbol(LibcallName, TLI.getPointerTy());
19541 Type *RetTy = isF64
19542 ? (Type*)StructType::get(ArgTy, ArgTy, nullptr)
19543 : (Type*)VectorType::get(ArgTy, 4);
19545 TargetLowering::CallLoweringInfo CLI(DAG);
19546 CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
19547 .setCallee(CallingConv::C, RetTy, Callee, std::move(Args), 0);
19549 std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
19552 // Returned in xmm0 and xmm1.
19553 return CallResult.first;
19555 // Returned in bits 0:31 and 32:63 of xmm0.
19556 SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
19557 CallResult.first, DAG.getIntPtrConstant(0));
19558 SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
19559 CallResult.first, DAG.getIntPtrConstant(1));
19560 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
19561 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
19564 /// LowerOperation - Provide custom lowering hooks for some operations.
19566 SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
19567 switch (Op.getOpcode()) {
19568 default: llvm_unreachable("Should not custom lower this!");
19569 case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op,DAG);
19570 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
19571 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
19572 return LowerCMP_SWAP(Op, Subtarget, DAG);
19573 case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
19574 case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG);
19575 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op,DAG);
19576 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
19577 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
19578 case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG);
19579 case ISD::VSELECT: return LowerVSELECT(Op, DAG);
19580 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
19581 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
19582 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
19583 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
19584 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG);
19585 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
19586 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
19587 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
19588 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
19589 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
19590 case ISD::SHL_PARTS:
19591 case ISD::SRA_PARTS:
19592 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
19593 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
19594 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
19595 case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
19596 case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
19597 case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
19598 case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
19599 case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);
19600 case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG);
19601 case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
19602 case ISD::LOAD: return LowerExtendedLoad(Op, Subtarget, DAG);
19604 case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
19605 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
19606 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
19607 case ISD::SETCC: return LowerSETCC(Op, DAG);
19608 case ISD::SELECT: return LowerSELECT(Op, DAG);
19609 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
19610 case ISD::JumpTable: return LowerJumpTable(Op, DAG);
19611 case ISD::VASTART: return LowerVASTART(Op, DAG);
19612 case ISD::VAARG: return LowerVAARG(Op, DAG);
19613 case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
19614 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, Subtarget, DAG);
19615 case ISD::INTRINSIC_VOID:
19616 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
19617 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
19618 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
19619 case ISD::FRAME_TO_ARGS_OFFSET:
19620 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
19621 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
19622 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
19623 case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
19624 case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
19625 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
19626 case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
19627 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
19628 case ISD::CTLZ: return LowerCTLZ(Op, DAG);
19629 case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ_ZERO_UNDEF(Op, DAG);
19630 case ISD::CTTZ: return LowerCTTZ(Op, DAG);
19631 case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
19632 case ISD::UMUL_LOHI:
19633 case ISD::SMUL_LOHI: return LowerMUL_LOHI(Op, Subtarget, DAG);
19636 case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
19642 case ISD::UMULO: return LowerXALUO(Op, DAG);
19643 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
19644 case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
19648 case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
19649 case ISD::ADD: return LowerADD(Op, DAG);
19650 case ISD::SUB: return LowerSUB(Op, DAG);
19651 case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
19655 /// ReplaceNodeResults - Replace a node with an illegal result type
19656 /// with a new node built out of custom code.
19657 void X86TargetLowering::ReplaceNodeResults(SDNode *N,
19658 SmallVectorImpl<SDValue>&Results,
19659 SelectionDAG &DAG) const {
19661 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19662 switch (N->getOpcode()) {
19664 llvm_unreachable("Do not know how to custom type legalize this operation!");
19665 // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
19666 case X86ISD::FMINC:
19668 case X86ISD::FMAXC:
19669 case X86ISD::FMAX: {
19670 EVT VT = N->getValueType(0);
19671 if (VT != MVT::v2f32)
19672 llvm_unreachable("Unexpected type (!= v2f32) on FMIN/FMAX.");
19673 SDValue UNDEF = DAG.getUNDEF(VT);
19674 SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
19675 N->getOperand(0), UNDEF);
19676 SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
19677 N->getOperand(1), UNDEF);
19678 Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
19681 case ISD::SIGN_EXTEND_INREG:
19686 // We don't want to expand or promote these.
19693 case ISD::UDIVREM: {
19694 SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
19695 Results.push_back(V);
19698 case ISD::FP_TO_SINT:
19699 case ISD::FP_TO_UINT: {
19700 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
19702 if (!IsSigned && !isIntegerTypeFTOL(SDValue(N, 0).getValueType()))
19705 std::pair<SDValue,SDValue> Vals =
19706 FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
19707 SDValue FIST = Vals.first, StackSlot = Vals.second;
19708 if (FIST.getNode()) {
19709 EVT VT = N->getValueType(0);
19710 // Return a load from the stack slot.
19711 if (StackSlot.getNode())
19712 Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot,
19713 MachinePointerInfo(),
19714 false, false, false, 0));
19716 Results.push_back(FIST);
19720 case ISD::UINT_TO_FP: {
19721 assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
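// Sketch of the trick used below: OR-ing a zero-extended 32-bit value x into
// the low mantissa bits of the double 0x4330000000000000 (i.e. 2^52) yields
// exactly the double 2^52 + x, so subtracting the 2^52 bias recovers x as an
// f64; the two doubles are then rounded to v4f32 with VFPROUND.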
19722 if (N->getOperand(0).getValueType() != MVT::v2i32 ||
19723 N->getValueType(0) != MVT::v2f32)
19725 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64,
19727 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
19729 SDValue VBias = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2f64, Bias, Bias);
19730 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
19731 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, VBias));
19732 Or = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or);
19733 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
19734 Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
19737 case ISD::FP_ROUND: {
19738 if (!TLI.isTypeLegal(N->getOperand(0).getValueType()))
19740 SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
19741 Results.push_back(V);
19744 case ISD::INTRINSIC_W_CHAIN: {
19745 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
19747 default : llvm_unreachable("Do not know how to custom type "
19748 "legalize this intrinsic operation!");
19749 case Intrinsic::x86_rdtsc:
19750 return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
19752 case Intrinsic::x86_rdtscp:
19753 return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget,
19755 case Intrinsic::x86_rdpmc:
19756 return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results);
19759 case ISD::READCYCLECOUNTER: {
19760 return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
19763 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
19764 EVT T = N->getValueType(0);
19765 assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
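// Register convention for cmpxchg8b/cmpxchg16b, which the CopyToReg nodes
// below set up: EDX:EAX (RDX:RAX) holds the expected value and ECX:EBX
// (RCX:RBX) the desired value; the previous memory value comes back in
// EDX:EAX (RDX:RAX) and ZF reports whether the exchange happened.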
19766 bool Regs64bit = T == MVT::i128;
19767 EVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
19768 SDValue cpInL, cpInH;
19769 cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
19770 DAG.getConstant(0, HalfT));
19771 cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
19772 DAG.getConstant(1, HalfT));
19773 cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
19774 Regs64bit ? X86::RAX : X86::EAX,
19776 cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
19777 Regs64bit ? X86::RDX : X86::EDX,
19778 cpInH, cpInL.getValue(1));
19779 SDValue swapInL, swapInH;
19780 swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
19781 DAG.getConstant(0, HalfT));
19782 swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
19783 DAG.getConstant(1, HalfT));
19784 swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl,
19785 Regs64bit ? X86::RBX : X86::EBX,
19786 swapInL, cpInH.getValue(1));
19787 swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl,
19788 Regs64bit ? X86::RCX : X86::ECX,
19789 swapInH, swapInL.getValue(1));
19790 SDValue Ops[] = { swapInH.getValue(0),
19792 swapInH.getValue(1) };
19793 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
19794 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
19795 unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_DAG :
19796 X86ISD::LCMPXCHG8_DAG;
19797 SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
19798 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
19799 Regs64bit ? X86::RAX : X86::EAX,
19800 HalfT, Result.getValue(1));
19801 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
19802 Regs64bit ? X86::RDX : X86::EDX,
19803 HalfT, cpOutL.getValue(2));
19804 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
19806 SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
19807 MVT::i32, cpOutH.getValue(2));
19809 DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
19810 DAG.getConstant(X86::COND_E, MVT::i8), EFLAGS);
19811 Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
19813 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
19814 Results.push_back(Success);
19815 Results.push_back(EFLAGS.getValue(1));
19818 case ISD::ATOMIC_SWAP:
19819 case ISD::ATOMIC_LOAD_ADD:
19820 case ISD::ATOMIC_LOAD_SUB:
19821 case ISD::ATOMIC_LOAD_AND:
19822 case ISD::ATOMIC_LOAD_OR:
19823 case ISD::ATOMIC_LOAD_XOR:
19824 case ISD::ATOMIC_LOAD_NAND:
19825 case ISD::ATOMIC_LOAD_MIN:
19826 case ISD::ATOMIC_LOAD_MAX:
19827 case ISD::ATOMIC_LOAD_UMIN:
19828 case ISD::ATOMIC_LOAD_UMAX:
19829 case ISD::ATOMIC_LOAD: {
19830 // Delegate to generic TypeLegalization. Situations we can really handle
19831 // should have already been dealt with by AtomicExpandPass.cpp.
19834 case ISD::BITCAST: {
19835 assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
19836 EVT DstVT = N->getValueType(0);
19837 EVT SrcVT = N->getOperand(0)->getValueType(0);
19839 if (SrcVT != MVT::f64 ||
19840 (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8))
19843 unsigned NumElts = DstVT.getVectorNumElements();
19844 EVT SVT = DstVT.getVectorElementType();
19845 EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
19846 SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
19847 MVT::v2f64, N->getOperand(0));
19848 SDValue ToVecInt = DAG.getNode(ISD::BITCAST, dl, WiderVT, Expanded);
19850 if (ExperimentalVectorWideningLegalization) {
19851 // If we are legalizing vectors by widening, we already have the desired
19852 // legal vector type, just return it.
19853 Results.push_back(ToVecInt);
19857 SmallVector<SDValue, 8> Elts;
19858 for (unsigned i = 0, e = NumElts; i != e; ++i)
19859 Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT,
19860 ToVecInt, DAG.getIntPtrConstant(i)));
19862 Results.push_back(DAG.getNode(ISD::BUILD_VECTOR, dl, DstVT, Elts));
19867 const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
19869 default: return nullptr;
19870 case X86ISD::BSF: return "X86ISD::BSF";
19871 case X86ISD::BSR: return "X86ISD::BSR";
19872 case X86ISD::SHLD: return "X86ISD::SHLD";
19873 case X86ISD::SHRD: return "X86ISD::SHRD";
19874 case X86ISD::FAND: return "X86ISD::FAND";
19875 case X86ISD::FANDN: return "X86ISD::FANDN";
19876 case X86ISD::FOR: return "X86ISD::FOR";
19877 case X86ISD::FXOR: return "X86ISD::FXOR";
19878 case X86ISD::FSRL: return "X86ISD::FSRL";
19879 case X86ISD::FILD: return "X86ISD::FILD";
19880 case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG";
19881 case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
19882 case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
19883 case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
19884 case X86ISD::FLD: return "X86ISD::FLD";
19885 case X86ISD::FST: return "X86ISD::FST";
19886 case X86ISD::CALL: return "X86ISD::CALL";
19887 case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG";
19888 case X86ISD::RDTSCP_DAG: return "X86ISD::RDTSCP_DAG";
19889 case X86ISD::RDPMC_DAG: return "X86ISD::RDPMC_DAG";
19890 case X86ISD::BT: return "X86ISD::BT";
19891 case X86ISD::CMP: return "X86ISD::CMP";
19892 case X86ISD::COMI: return "X86ISD::COMI";
19893 case X86ISD::UCOMI: return "X86ISD::UCOMI";
19894 case X86ISD::CMPM: return "X86ISD::CMPM";
19895 case X86ISD::CMPMU: return "X86ISD::CMPMU";
19896 case X86ISD::SETCC: return "X86ISD::SETCC";
19897 case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY";
19898 case X86ISD::FSETCC: return "X86ISD::FSETCC";
19899 case X86ISD::CMOV: return "X86ISD::CMOV";
19900 case X86ISD::BRCOND: return "X86ISD::BRCOND";
19901 case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG";
19902 case X86ISD::REP_STOS: return "X86ISD::REP_STOS";
19903 case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS";
19904 case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg";
19905 case X86ISD::Wrapper: return "X86ISD::Wrapper";
19906 case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP";
19907 case X86ISD::PEXTRB: return "X86ISD::PEXTRB";
19908 case X86ISD::PEXTRW: return "X86ISD::PEXTRW";
19909 case X86ISD::INSERTPS: return "X86ISD::INSERTPS";
19910 case X86ISD::PINSRB: return "X86ISD::PINSRB";
19911 case X86ISD::PINSRW: return "X86ISD::PINSRW";
19912 case X86ISD::PSHUFB: return "X86ISD::PSHUFB";
19913 case X86ISD::ANDNP: return "X86ISD::ANDNP";
19914 case X86ISD::PSIGN: return "X86ISD::PSIGN";
19915 case X86ISD::BLENDI: return "X86ISD::BLENDI";
19916 case X86ISD::SHRUNKBLEND: return "X86ISD::SHRUNKBLEND";
19917 case X86ISD::SUBUS: return "X86ISD::SUBUS";
19918 case X86ISD::HADD: return "X86ISD::HADD";
19919 case X86ISD::HSUB: return "X86ISD::HSUB";
19920 case X86ISD::FHADD: return "X86ISD::FHADD";
19921 case X86ISD::FHSUB: return "X86ISD::FHSUB";
19922 case X86ISD::UMAX: return "X86ISD::UMAX";
19923 case X86ISD::UMIN: return "X86ISD::UMIN";
19924 case X86ISD::SMAX: return "X86ISD::SMAX";
19925 case X86ISD::SMIN: return "X86ISD::SMIN";
19926 case X86ISD::FMAX: return "X86ISD::FMAX";
19927 case X86ISD::FMIN: return "X86ISD::FMIN";
19928 case X86ISD::FMAXC: return "X86ISD::FMAXC";
19929 case X86ISD::FMINC: return "X86ISD::FMINC";
19930 case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
19931 case X86ISD::FRCP: return "X86ISD::FRCP";
19932 case X86ISD::TLSADDR: return "X86ISD::TLSADDR";
19933 case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR";
19934 case X86ISD::TLSCALL: return "X86ISD::TLSCALL";
19935 case X86ISD::EH_SJLJ_SETJMP: return "X86ISD::EH_SJLJ_SETJMP";
19936 case X86ISD::EH_SJLJ_LONGJMP: return "X86ISD::EH_SJLJ_LONGJMP";
19937 case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN";
19938 case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN";
19939 case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m";
19940 case X86ISD::FNSTSW16r: return "X86ISD::FNSTSW16r";
19941 case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG";
19942 case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG";
19943 case X86ISD::LCMPXCHG16_DAG: return "X86ISD::LCMPXCHG16_DAG";
19944 case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL";
19945 case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD";
19946 case X86ISD::VZEXT: return "X86ISD::VZEXT";
19947 case X86ISD::VSEXT: return "X86ISD::VSEXT";
19948 case X86ISD::VTRUNC: return "X86ISD::VTRUNC";
19949 case X86ISD::VTRUNCM: return "X86ISD::VTRUNCM";
19950 case X86ISD::VINSERT: return "X86ISD::VINSERT";
19951 case X86ISD::VFPEXT: return "X86ISD::VFPEXT";
19952 case X86ISD::VFPROUND: return "X86ISD::VFPROUND";
19953 case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ";
19954 case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ";
19955 case X86ISD::VSHL: return "X86ISD::VSHL";
19956 case X86ISD::VSRL: return "X86ISD::VSRL";
19957 case X86ISD::VSRA: return "X86ISD::VSRA";
19958 case X86ISD::VSHLI: return "X86ISD::VSHLI";
19959 case X86ISD::VSRLI: return "X86ISD::VSRLI";
19960 case X86ISD::VSRAI: return "X86ISD::VSRAI";
19961 case X86ISD::CMPP: return "X86ISD::CMPP";
19962 case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ";
19963 case X86ISD::PCMPGT: return "X86ISD::PCMPGT";
19964 case X86ISD::PCMPEQM: return "X86ISD::PCMPEQM";
19965 case X86ISD::PCMPGTM: return "X86ISD::PCMPGTM";
19966 case X86ISD::ADD: return "X86ISD::ADD";
19967 case X86ISD::SUB: return "X86ISD::SUB";
19968 case X86ISD::ADC: return "X86ISD::ADC";
19969 case X86ISD::SBB: return "X86ISD::SBB";
19970 case X86ISD::SMUL: return "X86ISD::SMUL";
19971 case X86ISD::UMUL: return "X86ISD::UMUL";
19972 case X86ISD::SMUL8: return "X86ISD::SMUL8";
19973 case X86ISD::UMUL8: return "X86ISD::UMUL8";
19974 case X86ISD::SDIVREM8_SEXT_HREG: return "X86ISD::SDIVREM8_SEXT_HREG";
19975 case X86ISD::UDIVREM8_ZEXT_HREG: return "X86ISD::UDIVREM8_ZEXT_HREG";
19976 case X86ISD::INC: return "X86ISD::INC";
19977 case X86ISD::DEC: return "X86ISD::DEC";
19978 case X86ISD::OR: return "X86ISD::OR";
19979 case X86ISD::XOR: return "X86ISD::XOR";
19980 case X86ISD::AND: return "X86ISD::AND";
19981 case X86ISD::BEXTR: return "X86ISD::BEXTR";
19982 case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM";
19983 case X86ISD::PTEST: return "X86ISD::PTEST";
19984 case X86ISD::TESTP: return "X86ISD::TESTP";
19985 case X86ISD::TESTM: return "X86ISD::TESTM";
19986 case X86ISD::TESTNM: return "X86ISD::TESTNM";
19987 case X86ISD::KORTEST: return "X86ISD::KORTEST";
19988 case X86ISD::PACKSS: return "X86ISD::PACKSS";
19989 case X86ISD::PACKUS: return "X86ISD::PACKUS";
19990 case X86ISD::PALIGNR: return "X86ISD::PALIGNR";
19991 case X86ISD::VALIGN: return "X86ISD::VALIGN";
19992 case X86ISD::PSHUFD: return "X86ISD::PSHUFD";
19993 case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW";
19994 case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW";
19995 case X86ISD::SHUFP: return "X86ISD::SHUFP";
19996 case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS";
19997 case X86ISD::MOVLHPD: return "X86ISD::MOVLHPD";
19998 case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS";
19999 case X86ISD::MOVLPS: return "X86ISD::MOVLPS";
20000 case X86ISD::MOVLPD: return "X86ISD::MOVLPD";
20001 case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP";
20002 case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP";
20003 case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP";
20004 case X86ISD::MOVSD: return "X86ISD::MOVSD";
20005 case X86ISD::MOVSS: return "X86ISD::MOVSS";
20006 case X86ISD::UNPCKL: return "X86ISD::UNPCKL";
20007 case X86ISD::UNPCKH: return "X86ISD::UNPCKH";
20008 case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST";
20009 case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM";
20010 case X86ISD::VEXTRACT: return "X86ISD::VEXTRACT";
20011 case X86ISD::VPERMILPI: return "X86ISD::VPERMILPI";
20012 case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128";
20013 case X86ISD::VPERMV: return "X86ISD::VPERMV";
20014 case X86ISD::VPERMV3: return "X86ISD::VPERMV3";
20015 case X86ISD::VPERMIV3: return "X86ISD::VPERMIV3";
20016 case X86ISD::VPERMI: return "X86ISD::VPERMI";
20017 case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ";
20018 case X86ISD::PMULDQ: return "X86ISD::PMULDQ";
20019 case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
20020 case X86ISD::VAARG_64: return "X86ISD::VAARG_64";
20021 case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA";
20022 case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER";
20023 case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA";
20024 case X86ISD::WIN_FTOL: return "X86ISD::WIN_FTOL";
20025 case X86ISD::SAHF: return "X86ISD::SAHF";
20026 case X86ISD::RDRAND: return "X86ISD::RDRAND";
20027 case X86ISD::RDSEED: return "X86ISD::RDSEED";
20028 case X86ISD::FMADD: return "X86ISD::FMADD";
20029 case X86ISD::FMSUB: return "X86ISD::FMSUB";
20030 case X86ISD::FNMADD: return "X86ISD::FNMADD";
20031 case X86ISD::FNMSUB: return "X86ISD::FNMSUB";
20032 case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB";
20033 case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD";
20034 case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI";
20035 case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI";
20036 case X86ISD::XTEST: return "X86ISD::XTEST";
20037 case X86ISD::COMPRESS: return "X86ISD::COMPRESS";
20038 case X86ISD::EXPAND: return "X86ISD::EXPAND";
20039 case X86ISD::SELECT: return "X86ISD::SELECT";
20043 // isLegalAddressingMode - Return true if the addressing mode represented
20044 // by AM is legal for this target, for a load/store of the specified type.
20045 bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM,
20047 // X86 supports extremely general addressing modes.
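// An x86 memory operand has the general form
//   [BaseReg + IndexReg * Scale + Disp32]   (plus an optional segment)
// e.g. movl 16(%rdi,%rcx,4), %eax; the checks below mostly reject the few
// combinations that encoding cannot express.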
20048 CodeModel::Model M = getTargetMachine().getCodeModel();
20049 Reloc::Model R = getTargetMachine().getRelocationModel();
20051 // X86 allows a sign-extended 32-bit immediate field as a displacement.
20052 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
20057 Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine());
20059 // If a reference to this global requires an extra load, we can't fold it.
20060 if (isGlobalStubReference(GVFlags))
20063 // If BaseGV requires a register for the PIC base, we cannot also have a
20064 // BaseReg specified.
20065 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
20068 // If lower 4G is not available, then we must use rip-relative addressing.
20069 if ((M != CodeModel::Small || R != Reloc::Static) &&
20070 Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1))
20074 switch (AM.Scale) {
20080 // These scales always work.
20085 // These scales are formed with basereg+scalereg. Only accept if there is no basereg yet.
20090 default: // Other stuff never works.
20097 bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
20098 unsigned Bits = Ty->getScalarSizeInBits();
20100 // 8-bit shifts are always expensive, but versions with a scalar amount aren't
20101 // particularly cheaper than those without.
20105 // On AVX2 there are new vpsllv[dq] instructions (and other shifts), that make
20106 // variable shifts just as cheap as scalar ones.
20107 if (Subtarget->hasInt256() && (Bits == 32 || Bits == 64))
20110 // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
20111 // fully general vector.
20115 bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
20116 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
20118 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
20119 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
20120 return NumBits1 > NumBits2;
20123 bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
20124 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
20127 if (!isTypeLegal(EVT::getEVT(Ty1)))
20130 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
20132 // Assuming the caller doesn't have a zeroext or signext return parameter,
20133 // truncation all the way down to i1 is valid.
20137 bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
20138 return isInt<32>(Imm);
20141 bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
20142 // Can also use sub to handle negated immediates.
20143 return isInt<32>(Imm);
20146 bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
20147 if (!VT1.isInteger() || !VT2.isInteger())
20149 unsigned NumBits1 = VT1.getSizeInBits();
20150 unsigned NumBits2 = VT2.getSizeInBits();
20151 return NumBits1 > NumBits2;
20154 bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
20155 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
20156 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit();
20159 bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
20160 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
20161 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit();
20164 bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
20165 EVT VT1 = Val.getValueType();
20166 if (isZExtFree(VT1, VT2))
20169 if (Val.getOpcode() != ISD::LOAD)
20172 if (!VT1.isSimple() || !VT1.isInteger() ||
20173 !VT2.isSimple() || !VT2.isInteger())
20176 switch (VT1.getSimpleVT().SimpleTy) {
20181 // X86 has 8, 16, and 32-bit zero-extending loads.
20189 X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
20190 if (!(Subtarget->hasFMA() || Subtarget->hasFMA4()))
20193 VT = VT.getScalarType();
20195 if (!VT.isSimple())
20198 switch (VT.getSimpleVT().SimpleTy) {
20209 bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
20210 // i16 instructions are longer (0x66 prefix) and potentially slower.
20211 return !(VT1 == MVT::i32 && VT2 == MVT::i16);
20214 /// isShuffleMaskLegal - Targets can use this to indicate that they only
20215 /// support *some* VECTOR_SHUFFLE operations, those with specific masks.
20216 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
20217 /// are assumed to be legal.
20219 X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
20221 if (!VT.isSimple())
20224 MVT SVT = VT.getSimpleVT();
20226 // Very little shuffling can be done for 64-bit vectors right now.
20227 if (VT.getSizeInBits() == 64)
20230 // This is an experimental legality test that is tailored to match the
20231 // legality test of the experimental lowering more closely. They are gated
20232 // separately to ease testing of performance differences.
20233 if (ExperimentalVectorShuffleLegality)
20234 // We only care that the types being shuffled are legal. The lowering can
20235 // handle any possible shuffle mask that results.
20236 return isTypeLegal(SVT);
20238 // If this is a single-input shuffle with no 128 bit lane crossings we can
20239 // lower it into pshufb.
20240 if ((SVT.is128BitVector() && Subtarget->hasSSSE3()) ||
20241 (SVT.is256BitVector() && Subtarget->hasInt256())) {
20242 bool isLegal = true;
20243 for (unsigned I = 0, E = M.size(); I != E; ++I) {
20244 if (M[I] >= (int)SVT.getVectorNumElements() ||
20245 ShuffleCrosses128bitLane(SVT, I, M[I])) {
20254 // FIXME: blends, shifts.
20255 return (SVT.getVectorNumElements() == 2 ||
20256 ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
20257 isMOVLMask(M, SVT) ||
20258 isCommutedMOVLMask(M, SVT) ||
20259 isMOVHLPSMask(M, SVT) ||
20260 isSHUFPMask(M, SVT) ||
20261 isSHUFPMask(M, SVT, /* Commuted */ true) ||
20262 isPSHUFDMask(M, SVT) ||
20263 isPSHUFDMask(M, SVT, /* SecondOperand */ true) ||
20264 isPSHUFHWMask(M, SVT, Subtarget->hasInt256()) ||
20265 isPSHUFLWMask(M, SVT, Subtarget->hasInt256()) ||
20266 isPALIGNRMask(M, SVT, Subtarget) ||
20267 isUNPCKLMask(M, SVT, Subtarget->hasInt256()) ||
20268 isUNPCKHMask(M, SVT, Subtarget->hasInt256()) ||
20269 isUNPCKL_v_undef_Mask(M, SVT, Subtarget->hasInt256()) ||
20270 isUNPCKH_v_undef_Mask(M, SVT, Subtarget->hasInt256()) ||
20271 isBlendMask(M, SVT, Subtarget->hasSSE41(), Subtarget->hasInt256()) ||
20272 (Subtarget->hasSSE41() && isINSERTPSMask(M, SVT)));
20276 X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
20278 if (!VT.isSimple())
20281 MVT SVT = VT.getSimpleVT();
20283 // This is an experimental legality test that is tailored to match the
20284 // legality test of the experimental lowering more closely. They are gated
20285 // separately to ease testing of performance differences.
20286 if (ExperimentalVectorShuffleLegality)
20287 // The new vector shuffle lowering is very good at managing zero-inputs.
20288 return isShuffleMaskLegal(Mask, VT);
20290 unsigned NumElts = SVT.getVectorNumElements();
20291 // FIXME: This collection of masks seems suspect.
20294 if (NumElts == 4 && SVT.is128BitVector()) {
20295 return (isMOVLMask(Mask, SVT) ||
20296 isCommutedMOVLMask(Mask, SVT, true) ||
20297 isSHUFPMask(Mask, SVT) ||
20298 isSHUFPMask(Mask, SVT, /* Commuted */ true) ||
20299 isBlendMask(Mask, SVT, Subtarget->hasSSE41(),
20300 Subtarget->hasInt256()));
20305 //===----------------------------------------------------------------------===//
20306 // X86 Scheduler Hooks
20307 //===----------------------------------------------------------------------===//
20309 /// Utility function to emit xbegin specifying the start of an RTM region.
20310 static MachineBasicBlock *EmitXBegin(MachineInstr *MI, MachineBasicBlock *MBB,
20311 const TargetInstrInfo *TII) {
20312 DebugLoc DL = MI->getDebugLoc();
20314 const BasicBlock *BB = MBB->getBasicBlock();
20315 MachineFunction::iterator I = MBB;
20318 // For the v = xbegin(), we generate
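// Roughly, the blocks created below look like this (sketch based on the
// code that follows):
//
//   thisMBB:
//     xbegin sinkMBB        # on transaction abort, control resumes at
//                           # sinkMBB with the abort status in EAX
//   mainMBB:
//     eax = -1              # transaction successfully started
//   sinkMBB:
//     v = eax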
20329 MachineBasicBlock *thisMBB = MBB;
20330 MachineFunction *MF = MBB->getParent();
20331 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
20332 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
20333 MF->insert(I, mainMBB);
20334 MF->insert(I, sinkMBB);
20336 // Transfer the remainder of BB and its successor edges to sinkMBB.
20337 sinkMBB->splice(sinkMBB->begin(), MBB,
20338 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
20339 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
20343 // # fallthrough to mainMBB
20344 // # abort path to sinkMBB
20345 BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(sinkMBB);
20346 thisMBB->addSuccessor(mainMBB);
20347 thisMBB->addSuccessor(sinkMBB);
20351 BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), X86::EAX).addImm(-1);
20352 mainMBB->addSuccessor(sinkMBB);
20355 // EAX is live into the sinkMBB
20356 sinkMBB->addLiveIn(X86::EAX);
20357 BuildMI(*sinkMBB, sinkMBB->begin(), DL,
20358 TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
20361 MI->eraseFromParent();
20365 // FIXME: When we get size-specific XMM0 registers, i.e. XMM0_V16I8
20366 // or XMM0_V32I8 in AVX, all of this code can be replaced with that
20367 // in the .td file.
20368 static MachineBasicBlock *EmitPCMPSTRM(MachineInstr *MI, MachineBasicBlock *BB,
20369 const TargetInstrInfo *TII) {
20371 switch (MI->getOpcode()) {
20372 default: llvm_unreachable("illegal opcode!");
20373 case X86::PCMPISTRM128REG: Opc = X86::PCMPISTRM128rr; break;
20374 case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break;
20375 case X86::PCMPISTRM128MEM: Opc = X86::PCMPISTRM128rm; break;
20376 case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break;
20377 case X86::PCMPESTRM128REG: Opc = X86::PCMPESTRM128rr; break;
20378 case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break;
20379 case X86::PCMPESTRM128MEM: Opc = X86::PCMPESTRM128rm; break;
20380 case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break;
20383 DebugLoc dl = MI->getDebugLoc();
20384 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
20386 unsigned NumArgs = MI->getNumOperands();
20387 for (unsigned i = 1; i < NumArgs; ++i) {
20388 MachineOperand &Op = MI->getOperand(i);
20389 if (!(Op.isReg() && Op.isImplicit()))
20390 MIB.addOperand(Op);
20392 if (MI->hasOneMemOperand())
20393 MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
20395 BuildMI(*BB, MI, dl,
20396 TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
20397 .addReg(X86::XMM0);
20399 MI->eraseFromParent();
20403 // FIXME: Custom handling because TableGen doesn't support multiple implicit
20404 // defs in an instruction pattern
20405 static MachineBasicBlock *EmitPCMPSTRI(MachineInstr *MI, MachineBasicBlock *BB,
20406 const TargetInstrInfo *TII) {
20408 switch (MI->getOpcode()) {
20409 default: llvm_unreachable("illegal opcode!");
20410 case X86::PCMPISTRIREG: Opc = X86::PCMPISTRIrr; break;
20411 case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break;
20412 case X86::PCMPISTRIMEM: Opc = X86::PCMPISTRIrm; break;
20413 case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break;
20414 case X86::PCMPESTRIREG: Opc = X86::PCMPESTRIrr; break;
20415 case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break;
20416 case X86::PCMPESTRIMEM: Opc = X86::PCMPESTRIrm; break;
20417 case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break;
20420 DebugLoc dl = MI->getDebugLoc();
20421 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
20423 unsigned NumArgs = MI->getNumOperands(); // remove the results
20424 for (unsigned i = 1; i < NumArgs; ++i) {
20425 MachineOperand &Op = MI->getOperand(i);
20426 if (!(Op.isReg() && Op.isImplicit()))
20427 MIB.addOperand(Op);
20429 if (MI->hasOneMemOperand())
20430 MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
20432 BuildMI(*BB, MI, dl,
20433 TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
20436 MI->eraseFromParent();
20440 static MachineBasicBlock * EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB,
20441 const TargetInstrInfo *TII,
20442 const X86Subtarget* Subtarget) {
20443 DebugLoc dl = MI->getDebugLoc();
20445 // Address into RAX/EAX, other two args into ECX, EDX.
20446 unsigned MemOpc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r;
20447 unsigned MemReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
20448 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
20449 for (int i = 0; i < X86::AddrNumOperands; ++i)
20450 MIB.addOperand(MI->getOperand(i));
20452 unsigned ValOps = X86::AddrNumOperands;
20453 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
20454 .addReg(MI->getOperand(ValOps).getReg());
20455 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
20456 .addReg(MI->getOperand(ValOps+1).getReg());
20458 // The instruction doesn't actually take any operands though.
20459 BuildMI(*BB, MI, dl, TII->get(X86::MONITORrrr));
20461 MI->eraseFromParent(); // The pseudo is gone now.
20465 MachineBasicBlock *
20466 X86TargetLowering::EmitVAARG64WithCustomInserter(
20468 MachineBasicBlock *MBB) const {
20469 // Emit va_arg instruction on X86-64.
20471 // Operands to this pseudo-instruction:
20472 // 0 ) Output : destination address (reg)
20473 // 1-5) Input : va_list address (addr, i64mem)
20474 // 6 ) ArgSize : Size (in bytes) of vararg type
20475 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
20476 // 8 ) Align : Alignment of type
20477 // 9 ) EFLAGS (implicit-def)
20479 assert(MI->getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
20480 assert(X86::AddrNumOperands == 5 && "VAARG_64 assumes 5 address operands");
20482 unsigned DestReg = MI->getOperand(0).getReg();
20483 MachineOperand &Base = MI->getOperand(1);
20484 MachineOperand &Scale = MI->getOperand(2);
20485 MachineOperand &Index = MI->getOperand(3);
20486 MachineOperand &Disp = MI->getOperand(4);
20487 MachineOperand &Segment = MI->getOperand(5);
20488 unsigned ArgSize = MI->getOperand(6).getImm();
20489 unsigned ArgMode = MI->getOperand(7).getImm();
20490 unsigned Align = MI->getOperand(8).getImm();
20492 // Memory Reference
20493 assert(MI->hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
20494 MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
20495 MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
20497 // Machine Information
20498 const TargetInstrInfo *TII = MBB->getParent()->getSubtarget().getInstrInfo();
20499 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
20500 const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
20501 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
20502 DebugLoc DL = MI->getDebugLoc();
20504 // struct va_list {
20507 // i64 overflow_area (address)
20508 // i64 reg_save_area (address)
20510 // sizeof(va_list) = 24
20511 // alignment(va_list) = 8
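// For reference, this corresponds to the x86-64 System V va_list, roughly:
//
//   typedef struct {
//     unsigned int gp_offset;    // byte offset of next GPR arg in reg_save_area
//     unsigned int fp_offset;    // byte offset of next XMM arg in reg_save_area
//     void *overflow_arg_area;   // stack area for args that did not fit in regs
//     void *reg_save_area;       // register save area spilled in the prologue
//   } va_list[1];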
20513 unsigned TotalNumIntRegs = 6;
20514 unsigned TotalNumXMMRegs = 8;
20515 bool UseGPOffset = (ArgMode == 1);
20516 bool UseFPOffset = (ArgMode == 2);
20517 unsigned MaxOffset = TotalNumIntRegs * 8 +
20518 (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
20520 /* Align ArgSize to a multiple of 8 */
20521 unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
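// For example, ArgSize = 12 rounds up to ArgSizeA8 = 16, and ArgSize = 8 stays
// at 8; the overflow area is always advanced in 8-byte steps.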
20522 bool NeedsAlign = (Align > 8);
20524 MachineBasicBlock *thisMBB = MBB;
20525 MachineBasicBlock *overflowMBB;
20526 MachineBasicBlock *offsetMBB;
20527 MachineBasicBlock *endMBB;
20529 unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB
20530 unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB
20531 unsigned OffsetReg = 0;
20533 if (!UseGPOffset && !UseFPOffset) {
20534 // If we only pull from the overflow region, we don't create a branch.
20535 // We don't need to alter control flow.
20536 OffsetDestReg = 0; // unused
20537 OverflowDestReg = DestReg;
20539 offsetMBB = nullptr;
20540 overflowMBB = thisMBB;
20543 // First emit code to check if gp_offset (or fp_offset) is below the bound.
20544 // If so, pull the argument from reg_save_area. (branch to offsetMBB)
20545 // If not, pull from overflow_area. (branch to overflowMBB)
20550 //        thisMBB
//        /         \
//   offsetMBB    overflowMBB
//        \         /
//          endMBB
20555 // Registers for the PHI in endMBB
20556 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
20557 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
20559 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
20560 MachineFunction *MF = MBB->getParent();
20561 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
20562 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
20563 endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
20565 MachineFunction::iterator MBBIter = MBB;
20568 // Insert the new basic blocks
20569 MF->insert(MBBIter, offsetMBB);
20570 MF->insert(MBBIter, overflowMBB);
20571 MF->insert(MBBIter, endMBB);
20573 // Transfer the remainder of MBB and its successor edges to endMBB.
20574 endMBB->splice(endMBB->begin(), thisMBB,
20575 std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
20576 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
20578 // Make offsetMBB and overflowMBB successors of thisMBB
20579 thisMBB->addSuccessor(offsetMBB);
20580 thisMBB->addSuccessor(overflowMBB);
20582 // endMBB is a successor of both offsetMBB and overflowMBB
20583 offsetMBB->addSuccessor(endMBB);
20584 overflowMBB->addSuccessor(endMBB);
20586 // Load the offset value into a register
20587 OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
20588 BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
20592 .addDisp(Disp, UseFPOffset ? 4 : 0)
20593 .addOperand(Segment)
20594 .setMemRefs(MMOBegin, MMOEnd);
20596 // Check if there is enough room left to pull this argument.
20597 BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
20599 .addImm(MaxOffset + 8 - ArgSizeA8);
20601 // Branch to "overflowMBB" if offset >= max
20602 // Fall through to "offsetMBB" otherwise
20603 BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
20604 .addMBB(overflowMBB);
20607 // In offsetMBB, emit code to use the reg_save_area.
20609 assert(OffsetReg != 0);
20611 // Read the reg_save_area address.
20612 unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
20613 BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
20618 .addOperand(Segment)
20619 .setMemRefs(MMOBegin, MMOEnd);
20621 // Zero-extend the offset
20622 unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
20623 BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
20626 .addImm(X86::sub_32bit);
20628 // Add the offset to the reg_save_area to get the final address.
20629 BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
20630 .addReg(OffsetReg64)
20631 .addReg(RegSaveReg);
20633 // Compute the offset for the next argument
20634 unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
20635 BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
20637 .addImm(UseFPOffset ? 16 : 8);
20639 // Store it back into the va_list.
20640 BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
20644 .addDisp(Disp, UseFPOffset ? 4 : 0)
20645 .addOperand(Segment)
20646 .addReg(NextOffsetReg)
20647 .setMemRefs(MMOBegin, MMOEnd);
20650 BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
20655 // Emit code to use overflow area
20658 // Load the overflow_area address into a register.
20659 unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
20660 BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
20665 .addOperand(Segment)
20666 .setMemRefs(MMOBegin, MMOEnd);
20668 // If we need to align it, do so. Otherwise, just copy the address
20669 // to OverflowDestReg.
20671 // Align the overflow address
20672 assert((Align & (Align-1)) == 0 && "Alignment must be a power of 2");
20673 unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);
20675 // aligned_addr = (addr + (align-1)) & ~(align-1)
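// e.g. with Align = 16 and addr = 0x1008: (0x1008 + 15) & ~15 = 0x1010.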
20676 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
20677 .addReg(OverflowAddrReg)
20680 BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
20682 .addImm(~(uint64_t)(Align-1));
20684 BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
20685 .addReg(OverflowAddrReg);
20688 // Compute the next overflow address after this argument.
20689 // (the overflow address should be kept 8-byte aligned)
20690 unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
20691 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
20692 .addReg(OverflowDestReg)
20693 .addImm(ArgSizeA8);
20695 // Store the new overflow address.
20696 BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
20701 .addOperand(Segment)
20702 .addReg(NextAddrReg)
20703 .setMemRefs(MMOBegin, MMOEnd);
20705 // If we branched, emit the PHI to the front of endMBB.
20707 BuildMI(*endMBB, endMBB->begin(), DL,
20708 TII->get(X86::PHI), DestReg)
20709 .addReg(OffsetDestReg).addMBB(offsetMBB)
20710 .addReg(OverflowDestReg).addMBB(overflowMBB);
20713 // Erase the pseudo instruction
20714 MI->eraseFromParent();
20719 MachineBasicBlock *
20720 X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
20722 MachineBasicBlock *MBB) const {
20723 // Emit code to save XMM registers to the stack. The ABI says that the
20724 // number of registers to save is given in %al, so it's theoretically
20725 // possible to do an indirect jump trick to avoid saving all of them,
20726 // however this code takes a simpler approach and just executes all
20727 // of the stores if %al is non-zero. It's less code, and it's probably
20728 // easier on the hardware branch predictor, and stores aren't all that
20729 // expensive anyway.
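// Roughly, the emitted structure is:
//
//   MBB:
//     test %al, %al            # %al holds the XMM arg count (non-Win64 only)
//     je EndMBB
//   XMMSaveMBB:
//     movaps %xmm0, VarArgsFPOffset +  0(RegSaveFrameIndex)
//     movaps %xmm1, VarArgsFPOffset + 16(RegSaveFrameIndex)
//     ...
//   EndMBB: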
20731 // Create the new basic blocks. One block contains all the XMM stores,
20732 // and one block is the final destination regardless of whether any
20733 // stores were performed.
20734 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
20735 MachineFunction *F = MBB->getParent();
20736 MachineFunction::iterator MBBIter = MBB;
20738 MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
20739 MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
20740 F->insert(MBBIter, XMMSaveMBB);
20741 F->insert(MBBIter, EndMBB);
20743 // Transfer the remainder of MBB and its successor edges to EndMBB.
20744 EndMBB->splice(EndMBB->begin(), MBB,
20745 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
20746 EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
20748 // The original block will now fall through to the XMM save block.
20749 MBB->addSuccessor(XMMSaveMBB);
20750 // The XMMSaveMBB will fall through to the end block.
20751 XMMSaveMBB->addSuccessor(EndMBB);
20753 // Now add the instructions.
20754 const TargetInstrInfo *TII = MBB->getParent()->getSubtarget().getInstrInfo();
20755 DebugLoc DL = MI->getDebugLoc();
20757 unsigned CountReg = MI->getOperand(0).getReg();
20758 int64_t RegSaveFrameIndex = MI->getOperand(1).getImm();
20759 int64_t VarArgsFPOffset = MI->getOperand(2).getImm();
20761 if (!Subtarget->isTargetWin64()) {
20762 // If %al is 0, branch around the XMM save block.
20763 BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
20764 BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB);
20765 MBB->addSuccessor(EndMBB);
20768 // Make sure the last operand is EFLAGS, which gets clobbered by the branch
20769 // that was just emitted, but clearly shouldn't be "saved".
20770 assert((MI->getNumOperands() <= 3 ||
20771 !MI->getOperand(MI->getNumOperands() - 1).isReg() ||
20772 MI->getOperand(MI->getNumOperands() - 1).getReg() == X86::EFLAGS)
20773 && "Expected last argument to be EFLAGS");
20774 unsigned MOVOpc = Subtarget->hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr;
20775 // In the XMM save block, save all the XMM argument registers.
20776 for (int i = 3, e = MI->getNumOperands() - 1; i != e; ++i) {
20777 int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
20778 MachineMemOperand *MMO =
20779 F->getMachineMemOperand(
20780 MachinePointerInfo::getFixedStack(RegSaveFrameIndex, Offset),
20781 MachineMemOperand::MOStore,
20782 /*Size=*/16, /*Align=*/16);
20783 BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
20784 .addFrameIndex(RegSaveFrameIndex)
20785 .addImm(/*Scale=*/1)
20786 .addReg(/*IndexReg=*/0)
20787 .addImm(/*Disp=*/Offset)
20788 .addReg(/*Segment=*/0)
20789 .addReg(MI->getOperand(i).getReg())
20790 .addMemOperand(MMO);
20793 MI->eraseFromParent(); // The pseudo instruction is gone now.
20798 // The EFLAGS operand of SelectItr might be missing a kill marker
20799 // because there were multiple uses of EFLAGS, and ISel didn't know
20800 // which to mark. Figure out whether SelectItr should have had a
20801 // kill marker, and set it if it should. Returns the correct kill marker
// value.
20803 static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
20804 MachineBasicBlock* BB,
20805 const TargetRegisterInfo* TRI) {
20806 // Scan forward through BB for a use/def of EFLAGS.
20807 MachineBasicBlock::iterator miI(std::next(SelectItr));
20808 for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
20809 const MachineInstr& mi = *miI;
20810 if (mi.readsRegister(X86::EFLAGS))
20812 if (mi.definesRegister(X86::EFLAGS))
20813 break; // Should have kill-flag - update below.
20816 // If we hit the end of the block, check whether EFLAGS is live into a
// successor.
20818 if (miI == BB->end()) {
20819 for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
20820 sEnd = BB->succ_end();
20821 sItr != sEnd; ++sItr) {
20822 MachineBasicBlock* succ = *sItr;
20823 if (succ->isLiveIn(X86::EFLAGS))
20828 // We found a def, or hit the end of the basic block and EFLAGS wasn't live
20829 // out. SelectMI should have a kill flag on EFLAGS.
20830 SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
20834 MachineBasicBlock *
20835 X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
20836 MachineBasicBlock *BB) const {
20837 const TargetInstrInfo *TII = BB->getParent()->getSubtarget().getInstrInfo();
20838 DebugLoc DL = MI->getDebugLoc();
20840 // To "insert" a SELECT_CC instruction, we actually have to insert the
20841 // diamond control-flow pattern. The incoming instruction knows the
20842 // destination vreg to set, the condition code register to branch on, the
20843 // true/false values to select between, and a branch opcode to use.
20844 const BasicBlock *LLVM_BB = BB->getBasicBlock();
20845 MachineFunction::iterator It = BB;
20851 // cmpTY ccX, r1, r2
20853 // fallthrough --> copy0MBB
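// The resulting diamond, as built below:
//
//   thisMBB:
//     ...
//     jCC sinkMBB              # condition code taken from operand 3
//     # fallthrough to copy0MBB
//   copy0MBB:
//     # provides the "false" value, falls through to sinkMBB
//   sinkMBB:
//     %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]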
20854 MachineBasicBlock *thisMBB = BB;
20855 MachineFunction *F = BB->getParent();
20856 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
20857 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
20858 F->insert(It, copy0MBB);
20859 F->insert(It, sinkMBB);
20861 // If the EFLAGS register isn't dead in the terminator, then claim that it's
20862 // live into the sink and copy blocks.
20863 const TargetRegisterInfo *TRI =
20864 BB->getParent()->getSubtarget().getRegisterInfo();
20865 if (!MI->killsRegister(X86::EFLAGS) &&
20866 !checkAndUpdateEFLAGSKill(MI, BB, TRI)) {
20867 copy0MBB->addLiveIn(X86::EFLAGS);
20868 sinkMBB->addLiveIn(X86::EFLAGS);
20871 // Transfer the remainder of BB and its successor edges to sinkMBB.
20872 sinkMBB->splice(sinkMBB->begin(), BB,
20873 std::next(MachineBasicBlock::iterator(MI)), BB->end());
20874 sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
20876 // Add the true and fallthrough blocks as its successors.
20877 BB->addSuccessor(copy0MBB);
20878 BB->addSuccessor(sinkMBB);
20880 // Create the conditional branch instruction.
20882 X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm());
20883 BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);
20886 // %FalseValue = ...
20887 // # fallthrough to sinkMBB
20888 copy0MBB->addSuccessor(sinkMBB);
20891 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
20893 BuildMI(*sinkMBB, sinkMBB->begin(), DL,
20894 TII->get(X86::PHI), MI->getOperand(0).getReg())
20895 .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
20896 .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
20898 MI->eraseFromParent(); // The pseudo instruction is gone now.
20902 MachineBasicBlock *
20903 X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI,
20904 MachineBasicBlock *BB) const {
20905 MachineFunction *MF = BB->getParent();
20906 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
20907 DebugLoc DL = MI->getDebugLoc();
20908 const BasicBlock *LLVM_BB = BB->getBasicBlock();
20910 assert(MF->shouldSplitStack());
20912 const bool Is64Bit = Subtarget->is64Bit();
20913 const bool IsLP64 = Subtarget->isTarget64BitLP64();
20915 const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
20916 const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
20919 // ... [Till the alloca]
20920 // If stacklet is not large enough, jump to mallocMBB
20923 // Allocate by subtracting from RSP
20924 // Jump to continueMBB
20927 // Allocate by call to runtime
20931 // [rest of original BB]
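// The resulting control flow, using the block names created below:
//
//   BB:            compare (stack pointer - size) against the TLS stack limit;
//                  jg mallocMBB, otherwise fall through to bumpMBB
//   bumpMBB:       allocate by moving the stack pointer down; jmp continueMBB
//   mallocMBB:     call __morestack_allocate_stack_space; jmp continueMBB
//   continueMBB:   phi of the two resulting pointers, then the rest of BB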
20934 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
20935 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
20936 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
20938 MachineRegisterInfo &MRI = MF->getRegInfo();
20939 const TargetRegisterClass *AddrRegClass =
20940 getRegClassFor(getPointerTy());
20942 unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
20943 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
20944 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
20945 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
20946 sizeVReg = MI->getOperand(1).getReg(),
20947 physSPReg = IsLP64 || Subtarget->isTargetNaCl64() ? X86::RSP : X86::ESP;
20949 MachineFunction::iterator MBBIter = BB;
20952 MF->insert(MBBIter, bumpMBB);
20953 MF->insert(MBBIter, mallocMBB);
20954 MF->insert(MBBIter, continueMBB);
20956 continueMBB->splice(continueMBB->begin(), BB,
20957 std::next(MachineBasicBlock::iterator(MI)), BB->end());
20958 continueMBB->transferSuccessorsAndUpdatePHIs(BB);
20960 // Add code to the main basic block to check if the stack limit has been hit,
20961 // and if so, jump to mallocMBB, otherwise to bumpMBB.
20962 BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
20963 BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
20964 .addReg(tmpSPVReg).addReg(sizeVReg);
20965 BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
20966 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
20967 .addReg(SPLimitVReg);
20968 BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB);
20970 // bumpMBB simply decreases the stack pointer, since we know the current
20971 // stacklet has enough space.
20972 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
20973 .addReg(SPLimitVReg);
20974 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
20975 .addReg(SPLimitVReg);
20976 BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
20978 // Calls into a routine in libgcc to allocate more space from the heap.
20979 const uint32_t *RegMask = MF->getTarget()
20980 .getSubtargetImpl()
20981 ->getRegisterInfo()
20982 ->getCallPreservedMask(CallingConv::C);
20984 BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
20986 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
20987 .addExternalSymbol("__morestack_allocate_stack_space")
20988 .addRegMask(RegMask)
20989 .addReg(X86::RDI, RegState::Implicit)
20990 .addReg(X86::RAX, RegState::ImplicitDefine);
20991 } else if (Is64Bit) {
20992 BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
20994 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
20995 .addExternalSymbol("__morestack_allocate_stack_space")
20996 .addRegMask(RegMask)
20997 .addReg(X86::EDI, RegState::Implicit)
20998 .addReg(X86::EAX, RegState::ImplicitDefine);
21000 BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
21002 BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
21003 BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
21004 .addExternalSymbol("__morestack_allocate_stack_space")
21005 .addRegMask(RegMask)
21006 .addReg(X86::EAX, RegState::ImplicitDefine);
21010 BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
21013 BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
21014 .addReg(IsLP64 ? X86::RAX : X86::EAX);
21015 BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
21017 // Set up the CFG correctly.
21018 BB->addSuccessor(bumpMBB);
21019 BB->addSuccessor(mallocMBB);
21020 mallocMBB->addSuccessor(continueMBB);
21021 bumpMBB->addSuccessor(continueMBB);
21023 // Take care of the PHI nodes.
21024 BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
21025 MI->getOperand(0).getReg())
21026 .addReg(mallocPtrVReg).addMBB(mallocMBB)
21027 .addReg(bumpSPPtrVReg).addMBB(bumpMBB);
21029 // Delete the original pseudo instruction.
21030 MI->eraseFromParent();
21033 return continueMBB;
21036 MachineBasicBlock *
21037 X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI,
21038 MachineBasicBlock *BB) const {
21039 DebugLoc DL = MI->getDebugLoc();
21041 assert(!Subtarget->isTargetMachO());
21043 X86FrameLowering::emitStackProbeCall(*BB->getParent(), *BB, MI, DL);
21045 MI->eraseFromParent(); // The pseudo instruction is gone now.
21049 MachineBasicBlock *
21050 X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI,
21051 MachineBasicBlock *BB) const {
21052 // This is pretty easy. We're taking the value that we received from
21053 // our load from the relocation, sticking it in either RDI (x86-64)
21054 // or EAX and doing an indirect call. The return value will then
21055 // be in the normal return register.
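// For example, on x86-64 this becomes roughly:
//
//   movq _var@TLVP(%rip), %rdi    # load the TLV descriptor address
//   callq *(%rdi)                 # call the descriptor's accessor function
//   # the variable's address comes back in %rax
//
// The 32-bit paths below do the same through %eax.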
21056 MachineFunction *F = BB->getParent();
21057 const X86InstrInfo *TII =
21058 static_cast<const X86InstrInfo *>(F->getSubtarget().getInstrInfo());
21059 DebugLoc DL = MI->getDebugLoc();
21061 assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?");
21062 assert(MI->getOperand(3).isGlobal() && "This should be a global");
21064 // Get a register mask for the lowered call.
21065 // FIXME: The 32-bit calls have non-standard calling conventions. Use a
21066 // proper register mask.
21067 const uint32_t *RegMask = F->getTarget()
21068 .getSubtargetImpl()
21069 ->getRegisterInfo()
21070 ->getCallPreservedMask(CallingConv::C);
21071 if (Subtarget->is64Bit()) {
21072 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
21073 TII->get(X86::MOV64rm), X86::RDI)
21075 .addImm(0).addReg(0)
21076 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
21077 MI->getOperand(3).getTargetFlags())
21079 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
21080 addDirectMem(MIB, X86::RDI);
21081 MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
21082 } else if (F->getTarget().getRelocationModel() != Reloc::PIC_) {
21083 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
21084 TII->get(X86::MOV32rm), X86::EAX)
21086 .addImm(0).addReg(0)
21087 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
21088 MI->getOperand(3).getTargetFlags())
21090 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
21091 addDirectMem(MIB, X86::EAX);
21092 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
21094 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
21095 TII->get(X86::MOV32rm), X86::EAX)
21096 .addReg(TII->getGlobalBaseReg(F))
21097 .addImm(0).addReg(0)
21098 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
21099 MI->getOperand(3).getTargetFlags())
21101 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
21102 addDirectMem(MIB, X86::EAX);
21103 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
21106 MI->eraseFromParent(); // The pseudo instruction is gone now.
21110 MachineBasicBlock *
21111 X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
21112 MachineBasicBlock *MBB) const {
21113 DebugLoc DL = MI->getDebugLoc();
21114 MachineFunction *MF = MBB->getParent();
21115 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
21116 MachineRegisterInfo &MRI = MF->getRegInfo();
21118 const BasicBlock *BB = MBB->getBasicBlock();
21119 MachineFunction::iterator I = MBB;
21122 // Memory Reference
21123 MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
21124 MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
21127 unsigned MemOpndSlot = 0;
21129 unsigned CurOp = 0;
21131 DstReg = MI->getOperand(CurOp++).getReg();
21132 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
21133 assert(RC->hasType(MVT::i32) && "Invalid destination!");
21134 unsigned mainDstReg = MRI.createVirtualRegister(RC);
21135 unsigned restoreDstReg = MRI.createVirtualRegister(RC);
21137 MemOpndSlot = CurOp;
21139 MVT PVT = getPointerTy();
21140 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
21141 "Invalid Pointer Size!");
21143 // For v = setjmp(buf), we generate
21146 // buf[LabelOffset] = restoreMBB
21147 // SjLjSetup restoreMBB
21153 // v = phi(main, restore)
21156 // if base pointer being used, load it from frame
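// Expanded, the blocks created below behave like:
//
//   thisMBB:
//     buf[LabelOffset] = restoreMBB   # address longjmp will resume at
//     SjLjSetup restoreMBB
//   mainMBB:
//     v_main = 0                      # direct (first) return from setjmp
//   sinkMBB:
//     v = phi(v_main from mainMBB, v_restore from restoreMBB)
//   restoreMBB:
//     reload the base pointer from the frame if one is in use
//     v_restore = 1                   # returning via longjmp
//     jmp sinkMBB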
21159 MachineBasicBlock *thisMBB = MBB;
21160 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
21161 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
21162 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
21163 MF->insert(I, mainMBB);
21164 MF->insert(I, sinkMBB);
21165 MF->push_back(restoreMBB);
21167 MachineInstrBuilder MIB;
21169 // Transfer the remainder of BB and its successor edges to sinkMBB.
21170 sinkMBB->splice(sinkMBB->begin(), MBB,
21171 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
21172 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
21175 unsigned PtrStoreOpc = 0;
21176 unsigned LabelReg = 0;
21177 const int64_t LabelOffset = 1 * PVT.getStoreSize();
21178 Reloc::Model RM = MF->getTarget().getRelocationModel();
21179 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
21180 (RM == Reloc::Static || RM == Reloc::DynamicNoPIC);
21182 // Prepare IP either in reg or imm.
21183 if (!UseImmLabel) {
21184 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
21185 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
21186 LabelReg = MRI.createVirtualRegister(PtrRC);
21187 if (Subtarget->is64Bit()) {
21188 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
21192 .addMBB(restoreMBB)
21195 const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
21196 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
21197 .addReg(XII->getGlobalBaseReg(MF))
21200 .addMBB(restoreMBB, Subtarget->ClassifyBlockAddressReference())
21204 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
21206 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
21207 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
21208 if (i == X86::AddrDisp)
21209 MIB.addDisp(MI->getOperand(MemOpndSlot + i), LabelOffset);
21211 MIB.addOperand(MI->getOperand(MemOpndSlot + i));
21214 MIB.addReg(LabelReg);
21216 MIB.addMBB(restoreMBB);
21217 MIB.setMemRefs(MMOBegin, MMOEnd);
21219 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
21220 .addMBB(restoreMBB);
21222 const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
21223 MF->getSubtarget().getRegisterInfo());
21224 MIB.addRegMask(RegInfo->getNoPreservedMask());
21225 thisMBB->addSuccessor(mainMBB);
21226 thisMBB->addSuccessor(restoreMBB);
21230 BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
21231 mainMBB->addSuccessor(sinkMBB);
21234 BuildMI(*sinkMBB, sinkMBB->begin(), DL,
21235 TII->get(X86::PHI), DstReg)
21236 .addReg(mainDstReg).addMBB(mainMBB)
21237 .addReg(restoreDstReg).addMBB(restoreMBB);
21240 if (RegInfo->hasBasePointer(*MF)) {
21241 const X86Subtarget &STI = MF->getTarget().getSubtarget<X86Subtarget>();
21242 const bool Uses64BitFramePtr = STI.isTarget64BitLP64() || STI.isTargetNaCl64();
21243 X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
21244 X86FI->setRestoreBasePointer(MF);
21245 unsigned FramePtr = RegInfo->getFrameRegister(*MF);
21246 unsigned BasePtr = RegInfo->getBaseRegister();
21247 unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
21248 addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
21249 FramePtr, true, X86FI->getRestoreBasePointerOffset())
21250 .setMIFlag(MachineInstr::FrameSetup);
21252 BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
21253 BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
21254 restoreMBB->addSuccessor(sinkMBB);
21256 MI->eraseFromParent();
21260 MachineBasicBlock *
21261 X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
21262 MachineBasicBlock *MBB) const {
21263 DebugLoc DL = MI->getDebugLoc();
21264 MachineFunction *MF = MBB->getParent();
21265 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
21266 MachineRegisterInfo &MRI = MF->getRegInfo();
21268 // Memory Reference
21269 MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
21270 MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
21272 MVT PVT = getPointerTy();
21273 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
21274 "Invalid Pointer Size!");
21276 const TargetRegisterClass *RC =
21277 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
21278 unsigned Tmp = MRI.createVirtualRegister(RC);
21279 // Since FP is only updated here but NOT referenced, it's treated as GPR.
21280 const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
21281 MF->getSubtarget().getRegisterInfo());
21282 unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
21283 unsigned SP = RegInfo->getStackRegister();
21285 MachineInstrBuilder MIB;
21287 const int64_t LabelOffset = 1 * PVT.getStoreSize();
21288 const int64_t SPOffset = 2 * PVT.getStoreSize();
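// The jump buffer layout assumed here, in pointer-sized slots, is
// buf[0] = frame pointer, buf[1] = resume label, buf[2] = stack pointer;
// the loads below restore FP, the label, and SP at those displacements and
// then jump through the label.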
21290 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
21291 unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
21294 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP);
21295 for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
21296 MIB.addOperand(MI->getOperand(i));
21297 MIB.setMemRefs(MMOBegin, MMOEnd);
21299 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
21300 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
21301 if (i == X86::AddrDisp)
21302 MIB.addDisp(MI->getOperand(i), LabelOffset);
21304 MIB.addOperand(MI->getOperand(i));
21306 MIB.setMemRefs(MMOBegin, MMOEnd);
21308 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP);
21309 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
21310 if (i == X86::AddrDisp)
21311 MIB.addDisp(MI->getOperand(i), SPOffset);
21313 MIB.addOperand(MI->getOperand(i));
21315 MIB.setMemRefs(MMOBegin, MMOEnd);
21317 BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
21319 MI->eraseFromParent();
21323 // Replace 213-type (isel default) FMA3 instructions with 231-type for
21324 // accumulator loops. Writing back to the accumulator allows the coalescer
21325 // to remove extra copies in the loop.
21326 MachineBasicBlock *
21327 X86TargetLowering::emitFMA3Instr(MachineInstr *MI,
21328 MachineBasicBlock *MBB) const {
21329 MachineOperand &AddendOp = MI->getOperand(3);
21331 // Bail out early if the addend isn't a register - we can't switch these.
21332 if (!AddendOp.isReg())
21335 MachineFunction &MF = *MBB->getParent();
21336 MachineRegisterInfo &MRI = MF.getRegInfo();
21338 // Check whether the addend is defined by a PHI:
21339 assert(MRI.hasOneDef(AddendOp.getReg()) && "Multiple defs in SSA?");
21340 MachineInstr &AddendDef = *MRI.def_instr_begin(AddendOp.getReg());
21341 if (!AddendDef.isPHI())
21344 // Look for the following pattern:
21346 // %addend = phi [%entry, 0], [%loop, %result]
21348 // %result<tied1> = FMA213 %m2<tied0>, %m1, %addend
//   Replace with:
21352 // %addend = phi [%entry, 0], [%loop, %result]
21354 // %result<tied1> = FMA231 %addend<tied0>, %m1, %m2
21356 for (unsigned i = 1, e = AddendDef.getNumOperands(); i < e; i += 2) {
21357 assert(AddendDef.getOperand(i).isReg());
21358 MachineOperand PHISrcOp = AddendDef.getOperand(i);
21359 MachineInstr &PHISrcInst = *MRI.def_instr_begin(PHISrcOp.getReg());
21360 if (&PHISrcInst == MI) {
21361 // Found a matching instruction.
21362 unsigned NewFMAOpc = 0;
21363 switch (MI->getOpcode()) {
21364 case X86::VFMADDPDr213r: NewFMAOpc = X86::VFMADDPDr231r; break;
21365 case X86::VFMADDPSr213r: NewFMAOpc = X86::VFMADDPSr231r; break;
21366 case X86::VFMADDSDr213r: NewFMAOpc = X86::VFMADDSDr231r; break;
21367 case X86::VFMADDSSr213r: NewFMAOpc = X86::VFMADDSSr231r; break;
21368 case X86::VFMSUBPDr213r: NewFMAOpc = X86::VFMSUBPDr231r; break;
21369 case X86::VFMSUBPSr213r: NewFMAOpc = X86::VFMSUBPSr231r; break;
21370 case X86::VFMSUBSDr213r: NewFMAOpc = X86::VFMSUBSDr231r; break;
21371 case X86::VFMSUBSSr213r: NewFMAOpc = X86::VFMSUBSSr231r; break;
21372 case X86::VFNMADDPDr213r: NewFMAOpc = X86::VFNMADDPDr231r; break;
21373 case X86::VFNMADDPSr213r: NewFMAOpc = X86::VFNMADDPSr231r; break;
21374 case X86::VFNMADDSDr213r: NewFMAOpc = X86::VFNMADDSDr231r; break;
21375 case X86::VFNMADDSSr213r: NewFMAOpc = X86::VFNMADDSSr231r; break;
21376 case X86::VFNMSUBPDr213r: NewFMAOpc = X86::VFNMSUBPDr231r; break;
21377 case X86::VFNMSUBPSr213r: NewFMAOpc = X86::VFNMSUBPSr231r; break;
21378 case X86::VFNMSUBSDr213r: NewFMAOpc = X86::VFNMSUBSDr231r; break;
21379 case X86::VFNMSUBSSr213r: NewFMAOpc = X86::VFNMSUBSSr231r; break;
21380 case X86::VFMADDSUBPDr213r: NewFMAOpc = X86::VFMADDSUBPDr231r; break;
21381 case X86::VFMADDSUBPSr213r: NewFMAOpc = X86::VFMADDSUBPSr231r; break;
21382 case X86::VFMSUBADDPDr213r: NewFMAOpc = X86::VFMSUBADDPDr231r; break;
21383 case X86::VFMSUBADDPSr213r: NewFMAOpc = X86::VFMSUBADDPSr231r; break;
21385 case X86::VFMADDPDr213rY: NewFMAOpc = X86::VFMADDPDr231rY; break;
21386 case X86::VFMADDPSr213rY: NewFMAOpc = X86::VFMADDPSr231rY; break;
21387 case X86::VFMSUBPDr213rY: NewFMAOpc = X86::VFMSUBPDr231rY; break;
21388 case X86::VFMSUBPSr213rY: NewFMAOpc = X86::VFMSUBPSr231rY; break;
21389 case X86::VFNMADDPDr213rY: NewFMAOpc = X86::VFNMADDPDr231rY; break;
21390 case X86::VFNMADDPSr213rY: NewFMAOpc = X86::VFNMADDPSr231rY; break;
21391 case X86::VFNMSUBPDr213rY: NewFMAOpc = X86::VFNMSUBPDr231rY; break;
21392 case X86::VFNMSUBPSr213rY: NewFMAOpc = X86::VFNMSUBPSr231rY; break;
21393 case X86::VFMADDSUBPDr213rY: NewFMAOpc = X86::VFMADDSUBPDr231rY; break;
21394 case X86::VFMADDSUBPSr213rY: NewFMAOpc = X86::VFMADDSUBPSr231rY; break;
21395 case X86::VFMSUBADDPDr213rY: NewFMAOpc = X86::VFMSUBADDPDr231rY; break;
21396 case X86::VFMSUBADDPSr213rY: NewFMAOpc = X86::VFMSUBADDPSr231rY; break;
21397 default: llvm_unreachable("Unrecognized FMA variant.");
21400 const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
21401 MachineInstrBuilder MIB =
21402 BuildMI(MF, MI->getDebugLoc(), TII.get(NewFMAOpc))
21403 .addOperand(MI->getOperand(0))
21404 .addOperand(MI->getOperand(3))
21405 .addOperand(MI->getOperand(2))
21406 .addOperand(MI->getOperand(1));
21407 MBB->insert(MachineBasicBlock::iterator(MI), MIB);
21408 MI->eraseFromParent();
21415 MachineBasicBlock *
21416 X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
21417 MachineBasicBlock *BB) const {
21418 switch (MI->getOpcode()) {
21419 default: llvm_unreachable("Unexpected instr type to insert");
21420 case X86::TAILJMPd64:
21421 case X86::TAILJMPr64:
21422 case X86::TAILJMPm64:
21423 case X86::TAILJMPd64_REX:
21424 case X86::TAILJMPr64_REX:
21425 case X86::TAILJMPm64_REX:
21426 llvm_unreachable("TAILJMP64 would not be touched here.");
21427 case X86::TCRETURNdi64:
21428 case X86::TCRETURNri64:
21429 case X86::TCRETURNmi64:
21431 case X86::WIN_ALLOCA:
21432 return EmitLoweredWinAlloca(MI, BB);
21433 case X86::SEG_ALLOCA_32:
21434 case X86::SEG_ALLOCA_64:
21435 return EmitLoweredSegAlloca(MI, BB);
21436 case X86::TLSCall_32:
21437 case X86::TLSCall_64:
21438 return EmitLoweredTLSCall(MI, BB);
21439 case X86::CMOV_GR8:
21440 case X86::CMOV_FR32:
21441 case X86::CMOV_FR64:
21442 case X86::CMOV_V4F32:
21443 case X86::CMOV_V2F64:
21444 case X86::CMOV_V2I64:
21445 case X86::CMOV_V8F32:
21446 case X86::CMOV_V4F64:
21447 case X86::CMOV_V4I64:
21448 case X86::CMOV_V16F32:
21449 case X86::CMOV_V8F64:
21450 case X86::CMOV_V8I64:
21451 case X86::CMOV_GR16:
21452 case X86::CMOV_GR32:
21453 case X86::CMOV_RFP32:
21454 case X86::CMOV_RFP64:
21455 case X86::CMOV_RFP80:
21456 return EmitLoweredSelect(MI, BB);
21458 case X86::FP32_TO_INT16_IN_MEM:
21459 case X86::FP32_TO_INT32_IN_MEM:
21460 case X86::FP32_TO_INT64_IN_MEM:
21461 case X86::FP64_TO_INT16_IN_MEM:
21462 case X86::FP64_TO_INT32_IN_MEM:
21463 case X86::FP64_TO_INT64_IN_MEM:
21464 case X86::FP80_TO_INT16_IN_MEM:
21465 case X86::FP80_TO_INT32_IN_MEM:
21466 case X86::FP80_TO_INT64_IN_MEM: {
21467 MachineFunction *F = BB->getParent();
21468 const TargetInstrInfo *TII = F->getSubtarget().getInstrInfo();
21469 DebugLoc DL = MI->getDebugLoc();
21471 // Change the floating point control register to use "round towards zero"
21472 // mode when truncating to an integer value.
21473 int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false);
21474 addFrameReference(BuildMI(*BB, MI, DL,
21475 TII->get(X86::FNSTCW16m)), CWFrameIdx);
21477 // Load the old value of the high byte of the control word...
21479 F->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
21480 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
21483 // Set the high part to be round to zero...
21484 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
21487 // Reload the modified control word now...
21488 addFrameReference(BuildMI(*BB, MI, DL,
21489 TII->get(X86::FLDCW16m)), CWFrameIdx);
21491 // Restore the memory image of the control word to its original value.
21492 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
21495 // Get the X86 opcode to use.
21497 switch (MI->getOpcode()) {
21498 default: llvm_unreachable("illegal opcode!");
21499 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
21500 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
21501 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
21502 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
21503 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
21504 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
21505 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
21506 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
21507 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
21511 MachineOperand &Op = MI->getOperand(0);
21513 AM.BaseType = X86AddressMode::RegBase;
21514 AM.Base.Reg = Op.getReg();
21516 AM.BaseType = X86AddressMode::FrameIndexBase;
21517 AM.Base.FrameIndex = Op.getIndex();
21519 Op = MI->getOperand(1);
21521 AM.Scale = Op.getImm();
21522 Op = MI->getOperand(2);
21524 AM.IndexReg = Op.getImm();
21525 Op = MI->getOperand(3);
21526 if (Op.isGlobal()) {
21527 AM.GV = Op.getGlobal();
21529 AM.Disp = Op.getImm();
21531 addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
21532 .addReg(MI->getOperand(X86::AddrNumOperands).getReg());
21534 // Reload the original control word now.
21535 addFrameReference(BuildMI(*BB, MI, DL,
21536 TII->get(X86::FLDCW16m)), CWFrameIdx);
21538 MI->eraseFromParent(); // The pseudo instruction is gone now.
21541 // String/text processing lowering.
21542 case X86::PCMPISTRM128REG:
21543 case X86::VPCMPISTRM128REG:
21544 case X86::PCMPISTRM128MEM:
21545 case X86::VPCMPISTRM128MEM:
21546 case X86::PCMPESTRM128REG:
21547 case X86::VPCMPESTRM128REG:
21548 case X86::PCMPESTRM128MEM:
21549 case X86::VPCMPESTRM128MEM:
21550 assert(Subtarget->hasSSE42() &&
21551 "Target must have SSE4.2 or AVX features enabled");
21552 return EmitPCMPSTRM(MI, BB, BB->getParent()->getSubtarget().getInstrInfo());
21554 // String/text processing lowering.
21555 case X86::PCMPISTRIREG:
21556 case X86::VPCMPISTRIREG:
21557 case X86::PCMPISTRIMEM:
21558 case X86::VPCMPISTRIMEM:
21559 case X86::PCMPESTRIREG:
21560 case X86::VPCMPESTRIREG:
21561 case X86::PCMPESTRIMEM:
21562 case X86::VPCMPESTRIMEM:
21563 assert(Subtarget->hasSSE42() &&
21564 "Target must have SSE4.2 or AVX features enabled");
21565 return EmitPCMPSTRI(MI, BB, BB->getParent()->getSubtarget().getInstrInfo());
21567 // Thread synchronization.
21569 return EmitMonitor(MI, BB, BB->getParent()->getSubtarget().getInstrInfo(),
21574 return EmitXBegin(MI, BB, BB->getParent()->getSubtarget().getInstrInfo());
21576 case X86::VASTART_SAVE_XMM_REGS:
21577 return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
21579 case X86::VAARG_64:
21580 return EmitVAARG64WithCustomInserter(MI, BB);
21582 case X86::EH_SjLj_SetJmp32:
21583 case X86::EH_SjLj_SetJmp64:
21584 return emitEHSjLjSetJmp(MI, BB);
21586 case X86::EH_SjLj_LongJmp32:
21587 case X86::EH_SjLj_LongJmp64:
21588 return emitEHSjLjLongJmp(MI, BB);
21590 case TargetOpcode::STATEPOINT:
21591 // As an implementation detail, STATEPOINT shares the STACKMAP format at
21592 // this point in the process. We diverge later.
21593 return emitPatchPoint(MI, BB);
21595 case TargetOpcode::STACKMAP:
21596 case TargetOpcode::PATCHPOINT:
21597 return emitPatchPoint(MI, BB);
21599 case X86::VFMADDPDr213r:
21600 case X86::VFMADDPSr213r:
21601 case X86::VFMADDSDr213r:
21602 case X86::VFMADDSSr213r:
21603 case X86::VFMSUBPDr213r:
21604 case X86::VFMSUBPSr213r:
21605 case X86::VFMSUBSDr213r:
21606 case X86::VFMSUBSSr213r:
21607 case X86::VFNMADDPDr213r:
21608 case X86::VFNMADDPSr213r:
21609 case X86::VFNMADDSDr213r:
21610 case X86::VFNMADDSSr213r:
21611 case X86::VFNMSUBPDr213r:
21612 case X86::VFNMSUBPSr213r:
21613 case X86::VFNMSUBSDr213r:
21614 case X86::VFNMSUBSSr213r:
21615 case X86::VFMADDSUBPDr213r:
21616 case X86::VFMADDSUBPSr213r:
21617 case X86::VFMSUBADDPDr213r:
21618 case X86::VFMSUBADDPSr213r:
21619 case X86::VFMADDPDr213rY:
21620 case X86::VFMADDPSr213rY:
21621 case X86::VFMSUBPDr213rY:
21622 case X86::VFMSUBPSr213rY:
21623 case X86::VFNMADDPDr213rY:
21624 case X86::VFNMADDPSr213rY:
21625 case X86::VFNMSUBPDr213rY:
21626 case X86::VFNMSUBPSr213rY:
21627 case X86::VFMADDSUBPDr213rY:
21628 case X86::VFMADDSUBPSr213rY:
21629 case X86::VFMSUBADDPDr213rY:
21630 case X86::VFMSUBADDPSr213rY:
21631 return emitFMA3Instr(MI, BB);
21635 //===----------------------------------------------------------------------===//
21636 // X86 Optimization Hooks
21637 //===----------------------------------------------------------------------===//
21639 void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
21642 const SelectionDAG &DAG,
21643 unsigned Depth) const {
21644 unsigned BitWidth = KnownZero.getBitWidth();
21645 unsigned Opc = Op.getOpcode();
21646 assert((Opc >= ISD::BUILTIN_OP_END ||
21647 Opc == ISD::INTRINSIC_WO_CHAIN ||
21648 Opc == ISD::INTRINSIC_W_CHAIN ||
21649 Opc == ISD::INTRINSIC_VOID) &&
21650 "Should use MaskedValueIsZero if you don't know whether Op"
21651 " is a target node!");
21653 KnownZero = KnownOne = APInt(BitWidth, 0); // Don't know anything.
21667 // These nodes' second result is a boolean.
21668 if (Op.getResNo() == 0)
21671 case X86ISD::SETCC:
21672 KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
21674 case ISD::INTRINSIC_WO_CHAIN: {
21675 unsigned IntId = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
21676 unsigned NumLoBits = 0;
21679 case Intrinsic::x86_sse_movmsk_ps:
21680 case Intrinsic::x86_avx_movmsk_ps_256:
21681 case Intrinsic::x86_sse2_movmsk_pd:
21682 case Intrinsic::x86_avx_movmsk_pd_256:
21683 case Intrinsic::x86_mmx_pmovmskb:
21684 case Intrinsic::x86_sse2_pmovmskb_128:
21685 case Intrinsic::x86_avx2_pmovmskb: {
21686 // High bits of movmskp{s|d}, pmovmskb are known zero.
21688 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
21689 case Intrinsic::x86_sse_movmsk_ps: NumLoBits = 4; break;
21690 case Intrinsic::x86_avx_movmsk_ps_256: NumLoBits = 8; break;
21691 case Intrinsic::x86_sse2_movmsk_pd: NumLoBits = 2; break;
21692 case Intrinsic::x86_avx_movmsk_pd_256: NumLoBits = 4; break;
21693 case Intrinsic::x86_mmx_pmovmskb: NumLoBits = 8; break;
21694 case Intrinsic::x86_sse2_pmovmskb_128: NumLoBits = 16; break;
21695 case Intrinsic::x86_avx2_pmovmskb: NumLoBits = 32; break;
21697 KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits);
21706 unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
21708 const SelectionDAG &,
21709 unsigned Depth) const {
21710 // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
21711 if (Op.getOpcode() == X86ISD::SETCC_CARRY)
21712 return Op.getValueType().getScalarType().getSizeInBits();
21718 /// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the
21719 /// node is a GlobalAddress + offset.
21720 bool X86TargetLowering::isGAPlusOffset(SDNode *N,
21721 const GlobalValue* &GA,
21722 int64_t &Offset) const {
21723 if (N->getOpcode() == X86ISD::Wrapper) {
21724 if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
21725 GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
21726 Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
21730 return TargetLowering::isGAPlusOffset(N, GA, Offset);
21733 /// isShuffleHigh128VectorInsertLow - Checks whether the shuffle node is the
21734 /// same as extracting the high 128-bit part of a 256-bit vector and then
21735 /// inserting the result into the low part of a new 256-bit vector.
21736 static bool isShuffleHigh128VectorInsertLow(ShuffleVectorSDNode *SVOp) {
21737 EVT VT = SVOp->getValueType(0);
21738 unsigned NumElems = VT.getVectorNumElements();
21740 // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
21741 for (unsigned i = 0, j = NumElems/2; i != NumElems/2; ++i, ++j)
21742 if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
21743 SVOp->getMaskElt(j) >= 0)
21749 /// isShuffleLow128VectorInsertHigh - Checks whether the shuffle node is the
21750 /// same as extracting the low 128-bit part of a 256-bit vector and then
21751 /// inserting the result into the high part of a new 256-bit vector.
21752 static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) {
21753 EVT VT = SVOp->getValueType(0);
21754 unsigned NumElems = VT.getVectorNumElements();
21756 // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
21757 for (unsigned i = NumElems/2, j = 0; i != NumElems; ++i, ++j)
21758 if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
21759 SVOp->getMaskElt(j) >= 0)
21765 /// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors.
21766 static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
21767 TargetLowering::DAGCombinerInfo &DCI,
21768 const X86Subtarget* Subtarget) {
21770 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
21771 SDValue V1 = SVOp->getOperand(0);
21772 SDValue V2 = SVOp->getOperand(1);
21773 EVT VT = SVOp->getValueType(0);
21774 unsigned NumElems = VT.getVectorNumElements();
21776 if (V1.getOpcode() == ISD::CONCAT_VECTORS &&
21777 V2.getOpcode() == ISD::CONCAT_VECTORS) {
21781 //    V      UNDEF    BUILD_VECTOR(0,0,...)    UNDEF
//     \      /                \              /
21783 //   CONCAT_VECTOR            CONCAT_VECTOR
//            \                    /
//              vector_shuffle
21786 //  RESULT: V + zero extended
21788 if (V2.getOperand(0).getOpcode() != ISD::BUILD_VECTOR ||
21789 V2.getOperand(1).getOpcode() != ISD::UNDEF ||
21790 V1.getOperand(1).getOpcode() != ISD::UNDEF)
21793 if (!ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode()))
21796 // To match the shuffle mask, the first half of the mask should
21797 // be exactly the first vector, and all the rest a splat with the
21798 // first element of the second one.
21799 for (unsigned i = 0; i != NumElems/2; ++i)
21800 if (!isUndefOrEqual(SVOp->getMaskElt(i), i) ||
21801 !isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems))
21804 // If V1 is coming from a vector load then just fold to a VZEXT_LOAD.
21805 if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(V1.getOperand(0))) {
21806 if (Ld->hasNUsesOfValue(1, 0)) {
21807 SDVTList Tys = DAG.getVTList(MVT::v4i64, MVT::Other);
21808 SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() };
21810 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
21812 Ld->getPointerInfo(),
21813 Ld->getAlignment(),
21814 false/*isVolatile*/, true/*ReadMem*/,
21815 false/*WriteMem*/);
21817 // Make sure the newly-created LOAD is in the same position as Ld in
21818 // terms of dependency. We create a TokenFactor for Ld and ResNode,
21819 // and update uses of Ld's output chain to use the TokenFactor.
21820 if (Ld->hasAnyUseOfValue(1)) {
21821 SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
21822 SDValue(Ld, 1), SDValue(ResNode.getNode(), 1));
21823 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
21824 DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(Ld, 1),
21825 SDValue(ResNode.getNode(), 1));
21828 return DAG.getNode(ISD::BITCAST, dl, VT, ResNode);
21832 // Emit a zeroed vector and insert the desired subvector in its original
// position.
21834 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
21835 SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0), 0, DAG, dl);
21836 return DCI.CombineTo(N, InsV);
21839 //===--------------------------------------------------------------------===//
21840 // Combine some shuffles into subvector extracts and inserts:
21843 // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
21844 if (isShuffleHigh128VectorInsertLow(SVOp)) {
21845 SDValue V = Extract128BitVector(V1, NumElems/2, DAG, dl);
21846 SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, 0, DAG, dl);
21847 return DCI.CombineTo(N, InsV);
21850 // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
21851 if (isShuffleLow128VectorInsertHigh(SVOp)) {
21852 SDValue V = Extract128BitVector(V1, 0, DAG, dl);
21853 SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, NumElems/2, DAG, dl);
21854 return DCI.CombineTo(N, InsV);
21860 /// \brief Combine an arbitrary chain of shuffles into a single instruction if
/// This is the leaf of the recursive combine below. When we have found some
21864 /// chain of single-use x86 shuffle instructions and accumulated the combined
21865 /// shuffle mask represented by them, this will try to pattern match that mask
21866 /// into either a single instruction if there is a special purpose instruction
21867 /// for this operation, or into a PSHUFB instruction which is a fully general
21868 /// instruction but should only be used to replace chains over a certain depth.
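/// As an illustrative example, a chain of three single-use shuffles whose
/// accumulated mask is just a byte permutation can be replaced below by a
/// single PSHUFB (given SSSE3) using that mask.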
21869 static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
21870 int Depth, bool HasPSHUFB, SelectionDAG &DAG,
21871 TargetLowering::DAGCombinerInfo &DCI,
21872 const X86Subtarget *Subtarget) {
21873 assert(!Mask.empty() && "Cannot combine an empty shuffle mask!");
21875 // Find the operand that enters the chain. Note that multiple uses are OK
21876 // here, we're not going to remove the operand we find.
21877 SDValue Input = Op.getOperand(0);
21878 while (Input.getOpcode() == ISD::BITCAST)
21879 Input = Input.getOperand(0);
21881 MVT VT = Input.getSimpleValueType();
21882 MVT RootVT = Root.getSimpleValueType();
21885 // Just remove no-op shuffle masks.
21886 if (Mask.size() == 1) {
21887 DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Input),
21892 // Use the float domain if the operand type is a floating point type.
21893 bool FloatDomain = VT.isFloatingPoint();
21895 // For floating point shuffles, we don't have free copies in the shuffle
21896 // instructions or the ability to load as part of the instruction, so
21897 // canonicalize their shuffles to UNPCK or MOV variants.
21899 // Note that even with AVX we prefer the PSHUFD form of shuffle for integer
21900 // vectors because it can have a load folded into it that UNPCK cannot. This
21901 // doesn't preclude something switching to the shorter encoding post-RA.
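  // For example (illustrative): an accumulated v2f64 mask of <0, 0> becomes
  // MOVDDUP (with SSE3) or MOVLHPS, and <1, 1> becomes MOVHLPS.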
21903 if (Mask.equals(0, 0) || Mask.equals(1, 1)) {
21904 bool Lo = Mask.equals(0, 0);
21907 // Check if we have SSE3 which will let us use MOVDDUP. That instruction
21908 // is no slower than UNPCKLPD but has the option to fold the input operand
21909 // into even an unaligned memory load.
21910 if (Lo && Subtarget->hasSSE3()) {
21911 Shuffle = X86ISD::MOVDDUP;
21912 ShuffleVT = MVT::v2f64;
21914 // We have MOVLHPS and MOVHLPS throughout SSE and they encode smaller
21915 // than the UNPCK variants.
21916 Shuffle = Lo ? X86ISD::MOVLHPS : X86ISD::MOVHLPS;
21917 ShuffleVT = MVT::v4f32;
21919 if (Depth == 1 && Root->getOpcode() == Shuffle)
21920 return false; // Nothing to do!
21921 Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
21922 DCI.AddToWorklist(Op.getNode());
21923 if (Shuffle == X86ISD::MOVDDUP)
21924 Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op);
21926 Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
21927 DCI.AddToWorklist(Op.getNode());
21928 DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
21932 if (Subtarget->hasSSE3() &&
21933 (Mask.equals(0, 0, 2, 2) || Mask.equals(1, 1, 3, 3))) {
21934 bool Lo = Mask.equals(0, 0, 2, 2);
21935 unsigned Shuffle = Lo ? X86ISD::MOVSLDUP : X86ISD::MOVSHDUP;
21936 MVT ShuffleVT = MVT::v4f32;
21937 if (Depth == 1 && Root->getOpcode() == Shuffle)
21938 return false; // Nothing to do!
21939 Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
21940 DCI.AddToWorklist(Op.getNode());
21941 Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op);
21942 DCI.AddToWorklist(Op.getNode());
21943 DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
21947 if (Mask.equals(0, 0, 1, 1) || Mask.equals(2, 2, 3, 3)) {
21948 bool Lo = Mask.equals(0, 0, 1, 1);
21949 unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
21950 MVT ShuffleVT = MVT::v4f32;
21951 if (Depth == 1 && Root->getOpcode() == Shuffle)
21952 return false; // Nothing to do!
21953 Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
21954 DCI.AddToWorklist(Op.getNode());
21955 Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
21956 DCI.AddToWorklist(Op.getNode());
21957 DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
21963 // We always canonicalize the 8 x i16 and 16 x i8 shuffles into their UNPCK
21964 // variants as none of these have single-instruction variants that are
21965 // superior to the UNPCK formulation.
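  // For example (illustrative), a v8i16 mask of <0, 0, 1, 1, 2, 2, 3, 3>
  // becomes UNPCKL(V, V) and <4, 4, 5, 5, 6, 6, 7, 7> becomes UNPCKH(V, V).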
21966 if (!FloatDomain &&
21967 (Mask.equals(0, 0, 1, 1, 2, 2, 3, 3) ||
21968 Mask.equals(4, 4, 5, 5, 6, 6, 7, 7) ||
21969 Mask.equals(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7) ||
21970 Mask.equals(8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15,
21972 bool Lo = Mask[0] == 0;
21973 unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
21974 if (Depth == 1 && Root->getOpcode() == Shuffle)
21975 return false; // Nothing to do!
21977 switch (Mask.size()) {
21979 ShuffleVT = MVT::v8i16;
21982 ShuffleVT = MVT::v16i8;
21985 llvm_unreachable("Impossible mask size!");
21987 Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
21988 DCI.AddToWorklist(Op.getNode());
21989 Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
21990 DCI.AddToWorklist(Op.getNode());
21991 DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
21996 // Don't try to re-form single instruction chains under any circumstances now
21997 // that we've done encoding canonicalization for them.
22001 // If we have 3 or more shuffle instructions or a chain involving PSHUFB, we
22002 // can replace them with a single PSHUFB instruction profitably. Intel's
  // manuals suggest only using PSHUFB if doing so replaces 5 instructions, but
22004 // in practice PSHUFB tends to be *very* fast so we're more aggressive.
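  // As an illustrative example, a v4i32 mask of <1, 0, 3, 2> (Ratio == 4)
  // expands to the byte mask <4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11> that
  // is fed to PSHUFB below.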
22005 if ((Depth >= 3 || HasPSHUFB) && Subtarget->hasSSSE3()) {
22006 SmallVector<SDValue, 16> PSHUFBMask;
22007 assert(Mask.size() <= 16 && "Can't shuffle elements smaller than bytes!");
22008 int Ratio = 16 / Mask.size();
22009 for (unsigned i = 0; i < 16; ++i) {
22010 if (Mask[i / Ratio] == SM_SentinelUndef) {
22011 PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
22014 int M = Mask[i / Ratio] != SM_SentinelZero
22015 ? Ratio * Mask[i / Ratio] + i % Ratio
22017 PSHUFBMask.push_back(DAG.getConstant(M, MVT::i8));
22019 Op = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Input);
22020 DCI.AddToWorklist(Op.getNode());
22021 SDValue PSHUFBMaskOp =
22022 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, PSHUFBMask);
22023 DCI.AddToWorklist(PSHUFBMaskOp.getNode());
22024 Op = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, Op, PSHUFBMaskOp);
22025 DCI.AddToWorklist(Op.getNode());
22026 DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
22031 // Failed to find any combines.
22035 /// \brief Fully generic combining of x86 shuffle instructions.
22037 /// This should be the last combine run over the x86 shuffle instructions. Once
22038 /// they have been fully optimized, this will recursively consider all chains
22039 /// of single-use shuffle instructions, build a generic model of the cumulative
22040 /// shuffle operation, and check for simpler instructions which implement this
22041 /// operation. We use this primarily for two purposes:
22043 /// 1) Collapse generic shuffles to specialized single instructions when
22044 /// equivalent. In most cases, this is just an encoding size win, but
22045 /// sometimes we will collapse multiple generic shuffles into a single
22046 /// special-purpose shuffle.
22047 /// 2) Look for sequences of shuffle instructions with 3 or more total
22048 /// instructions, and replace them with the slightly more expensive SSSE3
22049 /// PSHUFB instruction if available. We do this as the last combining step
22050 /// to ensure we avoid using PSHUFB if we can implement the shuffle with
/// a suitable short sequence of other instructions. The PSHUFB will either
22052 /// use a register or have to read from memory and so is slightly (but only
22053 /// slightly) more expensive than the other shuffle instructions.
22055 /// Because this is inherently a quadratic operation (for each shuffle in
22056 /// a chain, we recurse up the chain), the depth is limited to 8 instructions.
22057 /// This should never be an issue in practice as the shuffle lowering doesn't
22058 /// produce sequences of more than 8 instructions.
22060 /// FIXME: We will currently miss some cases where the redundant shuffling
22061 /// would simplify under the threshold for PSHUFB formation because of
22062 /// combine-ordering. To fix this, we should do the redundant instruction
22063 /// combining in this recursive walk.
22064 static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,
22065 ArrayRef<int> RootMask,
22066 int Depth, bool HasPSHUFB,
22068 TargetLowering::DAGCombinerInfo &DCI,
22069 const X86Subtarget *Subtarget) {
22070 // Bound the depth of our recursive combine because this is ultimately
  // quadratic in nature.
  if (Depth > 8)
    return false;
22075 // Directly rip through bitcasts to find the underlying operand.
22076 while (Op.getOpcode() == ISD::BITCAST && Op.getOperand(0).hasOneUse())
22077 Op = Op.getOperand(0);
22079 MVT VT = Op.getSimpleValueType();
22080 if (!VT.isVector())
22081 return false; // Bail if we hit a non-vector.
22082 // FIXME: This routine should be taught about 256-bit shuffles, or a 256-bit
22083 // version should be added.
22084 if (VT.getSizeInBits() != 128)
22087 assert(Root.getSimpleValueType().isVector() &&
22088 "Shuffles operate on vector types!");
22089 assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() &&
22090 "Can only combine shuffles of the same vector register size.");
22092 if (!isTargetShuffle(Op.getOpcode()))
22094 SmallVector<int, 16> OpMask;
  bool IsUnary;
  bool HaveMask = getTargetShuffleMask(Op.getNode(), VT, OpMask, IsUnary);
  // We can only combine unary shuffles for which we can decode the mask.
22098 if (!HaveMask || !IsUnary)
22101 assert(VT.getVectorNumElements() == OpMask.size() &&
22102 "Different mask size from vector size!");
22103 assert(((RootMask.size() > OpMask.size() &&
22104 RootMask.size() % OpMask.size() == 0) ||
22105 (OpMask.size() > RootMask.size() &&
22106 OpMask.size() % RootMask.size() == 0) ||
22107 OpMask.size() == RootMask.size()) &&
22108 "The smaller number of elements must divide the larger.");
22109 int RootRatio = std::max<int>(1, OpMask.size() / RootMask.size());
22110 int OpRatio = std::max<int>(1, RootMask.size() / OpMask.size());
22111 assert(((RootRatio == 1 && OpRatio == 1) ||
22112 (RootRatio == 1) != (OpRatio == 1)) &&
22113 "Must not have a ratio for both incoming and op masks!");
22115 SmallVector<int, 16> Mask;
22116 Mask.reserve(std::max(OpMask.size(), RootMask.size()));
22118 // Merge this shuffle operation's mask into our accumulated mask. Note that
22119 // this shuffle's mask will be the first applied to the input, followed by the
22120 // root mask to get us all the way to the root value arrangement. The reason
22121 // for this order is that we are recursing up the operation chain.
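  // For example (illustrative), with equal mask sizes a RootMask of
  // <2, 3, 0, 1> applied on top of an OpMask of <1, 0, 3, 2> composes to
  // <3, 2, 1, 0>, i.e. Mask[i] = OpMask[RootMask[i]].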
22122 for (int i = 0, e = std::max(OpMask.size(), RootMask.size()); i < e; ++i) {
22123 int RootIdx = i / RootRatio;
22124 if (RootMask[RootIdx] < 0) {
22125 // This is a zero or undef lane, we're done.
22126 Mask.push_back(RootMask[RootIdx]);
22130 int RootMaskedIdx = RootMask[RootIdx] * RootRatio + i % RootRatio;
22131 int OpIdx = RootMaskedIdx / OpRatio;
22132 if (OpMask[OpIdx] < 0) {
      // The incoming lanes are zero or undef, it doesn't matter which ones we
      // use.
22135 Mask.push_back(OpMask[OpIdx]);
22139 // Ok, we have non-zero lanes, map them through.
22140 Mask.push_back(OpMask[OpIdx] * OpRatio +
22141 RootMaskedIdx % OpRatio);
22144 // See if we can recurse into the operand to combine more things.
22145 switch (Op.getOpcode()) {
22146 case X86ISD::PSHUFB:
22148 case X86ISD::PSHUFD:
22149 case X86ISD::PSHUFHW:
22150 case X86ISD::PSHUFLW:
22151 if (Op.getOperand(0).hasOneUse() &&
22152 combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1,
22153 HasPSHUFB, DAG, DCI, Subtarget))
22157 case X86ISD::UNPCKL:
22158 case X86ISD::UNPCKH:
    assert(Op.getOperand(0) == Op.getOperand(1) &&
           "We only combine unary shuffles!");
    // We can't check for single use, we have to check that this shuffle is the
    // only user.
22161 if (Op->isOnlyUserOf(Op.getOperand(0).getNode()) &&
22162 combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1,
22163 HasPSHUFB, DAG, DCI, Subtarget))
22168 // Minor canonicalization of the accumulated shuffle mask to make it easier
  // to match below. All this does is detect masks with sequential pairs of
22170 // elements, and shrink them to the half-width mask. It does this in a loop
22171 // so it will reduce the size of the mask to the minimal width mask which
22172 // performs an equivalent shuffle.
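  // For example (illustrative), a v8i16 mask of <2, 3, 0, 1, 6, 7, 4, 5>
  // widens to the v4i32-style mask <1, 0, 3, 2>.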
22173 SmallVector<int, 16> WidenedMask;
22174 while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
22175 Mask = std::move(WidenedMask);
22176 WidenedMask.clear();
22179 return combineX86ShuffleChain(Op, Root, Mask, Depth, HasPSHUFB, DAG, DCI,
22183 /// \brief Get the PSHUF-style mask from PSHUF node.
/// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
22186 /// PSHUF-style masks that can be reused with such instructions.
22187 static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
22188 SmallVector<int, 4> Mask;
  bool IsUnary;
  bool HaveMask =
      getTargetShuffleMask(N.getNode(), N.getSimpleValueType(), Mask, IsUnary);
22194 switch (N.getOpcode()) {
22195 case X86ISD::PSHUFD:
22197 case X86ISD::PSHUFLW:
22200 case X86ISD::PSHUFHW:
22201 Mask.erase(Mask.begin(), Mask.begin() + 4);
    for (int &M : Mask)
      M -= 4;
22206 llvm_unreachable("No valid shuffle instruction found!");
22210 /// \brief Search for a combinable shuffle across a chain ending in pshufd.
22212 /// We walk up the chain and look for a combinable shuffle, skipping over
22213 /// shuffles that we could hoist this shuffle's transformation past without
22214 /// altering anything.
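/// For example (illustrative), a pshufd whose mask leaves the low two dwords
/// in place can be hoisted past a pshuflw (which only touches those low
/// words) and merged with an earlier pshufd in the chain.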
22216 combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
22218 TargetLowering::DAGCombinerInfo &DCI) {
22219 assert(N.getOpcode() == X86ISD::PSHUFD &&
22220 "Called with something other than an x86 128-bit half shuffle!");
22223 // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
  // of the shuffles in the chain so that we can form a fresh chain to replace
  // this one.
22226 SmallVector<SDValue, 8> Chain;
22227 SDValue V = N.getOperand(0);
22228 for (; V.hasOneUse(); V = V.getOperand(0)) {
22229 switch (V.getOpcode()) {
22231 return SDValue(); // Nothing combined!
      // Skip bitcasts as we always know the type for the target specific
      // shuffles.
22238 case X86ISD::PSHUFD:
22239 // Found another dword shuffle.
22242 case X86ISD::PSHUFLW:
22243 // Check that the low words (being shuffled) are the identity in the
22244 // dword shuffle, and the high words are self-contained.
22245 if (Mask[0] != 0 || Mask[1] != 1 ||
22246 !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
22249 Chain.push_back(V);
22252 case X86ISD::PSHUFHW:
22253 // Check that the high words (being shuffled) are the identity in the
22254 // dword shuffle, and the low words are self-contained.
22255 if (Mask[2] != 2 || Mask[3] != 3 ||
22256 !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
22259 Chain.push_back(V);
22262 case X86ISD::UNPCKL:
22263 case X86ISD::UNPCKH:
22264 // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
22265 // shuffle into a preceding word shuffle.
22266 if (V.getValueType() != MVT::v16i8 && V.getValueType() != MVT::v8i16)
22269 // Search for a half-shuffle which we can combine with.
22270 unsigned CombineOp =
22271 V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
22272 if (V.getOperand(0) != V.getOperand(1) ||
22273 !V->isOnlyUserOf(V.getOperand(0).getNode()))
22275 Chain.push_back(V);
22276 V = V.getOperand(0);
22278 switch (V.getOpcode()) {
22280 return SDValue(); // Nothing to combine.
22282 case X86ISD::PSHUFLW:
22283 case X86ISD::PSHUFHW:
22284 if (V.getOpcode() == CombineOp)
22287 Chain.push_back(V);
22291 V = V.getOperand(0);
22295 } while (V.hasOneUse());
22298 // Break out of the loop if we break out of the switch.
22302 if (!V.hasOneUse())
22303 // We fell out of the loop without finding a viable combining instruction.
22306 // Merge this node's mask and our incoming mask.
22307 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
  for (int &M : Mask)
    M = VMask[M];
22310 V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
22311 getV4X86ShuffleImm8ForMask(Mask, DAG));
22313 // Rebuild the chain around this new shuffle.
22314 while (!Chain.empty()) {
22315 SDValue W = Chain.pop_back_val();
22317 if (V.getValueType() != W.getOperand(0).getValueType())
22318 V = DAG.getNode(ISD::BITCAST, DL, W.getOperand(0).getValueType(), V);
22320 switch (W.getOpcode()) {
22322 llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
22324 case X86ISD::UNPCKL:
22325 case X86ISD::UNPCKH:
22326 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
22329 case X86ISD::PSHUFD:
22330 case X86ISD::PSHUFLW:
22331 case X86ISD::PSHUFHW:
22332 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
22336 if (V.getValueType() != N.getValueType())
22337 V = DAG.getNode(ISD::BITCAST, DL, N.getValueType(), V);
22339 // Return the new chain to replace N.
/// \brief Search for a combinable shuffle across a chain ending in pshuflw or
/// pshufhw.
22345 /// We walk up the chain, skipping shuffles of the other half and looking
22346 /// through shuffles which switch halves trying to find a shuffle of the same
22347 /// pair of dwords.
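/// For example (illustrative), in pshuflw(pshufhw(pshuflw(x))) the middle
/// pshufhw only touches the other half, so the two pshuflw masks can be
/// merged into one.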
22348 static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask,
22350 TargetLowering::DAGCombinerInfo &DCI) {
22352 (N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) &&
22353 "Called with something other than an x86 128-bit half shuffle!");
22355 unsigned CombineOpcode = N.getOpcode();
22357 // Walk up a single-use chain looking for a combinable shuffle.
22358 SDValue V = N.getOperand(0);
22359 for (; V.hasOneUse(); V = V.getOperand(0)) {
22360 switch (V.getOpcode()) {
22362 return false; // Nothing combined!
      // Skip bitcasts as we always know the type for the target specific
      // shuffles.
22369 case X86ISD::PSHUFLW:
22370 case X86ISD::PSHUFHW:
22371 if (V.getOpcode() == CombineOpcode)
22374 // Other-half shuffles are no-ops.
22377 // Break out of the loop if we break out of the switch.
22381 if (!V.hasOneUse())
22382 // We fell out of the loop without finding a viable combining instruction.
22385 // Combine away the bottom node as its shuffle will be accumulated into
22386 // a preceding shuffle.
22387 DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
  // Record the old value.
  SDValue Old = V;
22392 // Merge this node's mask and our incoming mask (adjusted to account for all
22393 // the pshufd instructions encountered).
22394 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
  for (int &M : Mask)
    M = VMask[M];
22397 V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0),
22398 getV4X86ShuffleImm8ForMask(Mask, DAG));
22400 // Check that the shuffles didn't cancel each other out. If not, we need to
  // combine to the new one.
  if (Old != V)
22403 // Replace the combinable shuffle with the combined one, updating all users
22404 // so that we re-evaluate the chain here.
22405 DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true);
22410 /// \brief Try to combine x86 target specific shuffles.
22411 static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,
22412 TargetLowering::DAGCombinerInfo &DCI,
22413 const X86Subtarget *Subtarget) {
22415 MVT VT = N.getSimpleValueType();
22416 SmallVector<int, 4> Mask;
22418 switch (N.getOpcode()) {
22419 case X86ISD::PSHUFD:
22420 case X86ISD::PSHUFLW:
22421 case X86ISD::PSHUFHW:
22422 Mask = getPSHUFShuffleMask(N);
22423 assert(Mask.size() == 4);
22429 // Nuke no-op shuffles that show up after combining.
22430 if (isNoopShuffleMask(Mask))
22431 return DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
22433 // Look for simplifications involving one or two shuffle instructions.
22434 SDValue V = N.getOperand(0);
22435 switch (N.getOpcode()) {
22438 case X86ISD::PSHUFLW:
22439 case X86ISD::PSHUFHW:
22440 assert(VT == MVT::v8i16);
22443 if (combineRedundantHalfShuffle(N, Mask, DAG, DCI))
22444 return SDValue(); // We combined away this shuffle, so we're done.
22446 // See if this reduces to a PSHUFD which is no more expensive and can
22447 // combine with more operations. Note that it has to at least flip the
22448 // dwords as otherwise it would have been removed as a no-op.
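    // For example (illustrative), a pshuflw with mask <2, 3, 0, 1> swaps the
    // two low dwords, which the equivalent pshufd with dword mask <1, 0, 2, 3>
    // also does.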
22449 if (Mask[0] == 2 && Mask[1] == 3 && Mask[2] == 0 && Mask[3] == 1) {
22450 int DMask[] = {0, 1, 2, 3};
22451 int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
22452 DMask[DOffset + 0] = DOffset + 1;
22453 DMask[DOffset + 1] = DOffset + 0;
22454 V = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V);
22455 DCI.AddToWorklist(V.getNode());
22456 V = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V,
22457 getV4X86ShuffleImm8ForMask(DMask, DAG));
22458 DCI.AddToWorklist(V.getNode());
22459 return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V);
22462 // Look for shuffle patterns which can be implemented as a single unpack.
22463 // FIXME: This doesn't handle the location of the PSHUFD generically, and
22464 // only works when we have a PSHUFD followed by two half-shuffles.
22465 if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
22466 (V.getOpcode() == X86ISD::PSHUFLW ||
22467 V.getOpcode() == X86ISD::PSHUFHW) &&
        V.getOpcode() != N.getOpcode() &&
        V.hasOneUse()) {
22470 SDValue D = V.getOperand(0);
22471 while (D.getOpcode() == ISD::BITCAST && D.hasOneUse())
22472 D = D.getOperand(0);
22473 if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {
22474 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
22475 SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
22476 int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
22477 int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
        int WordMask[8];
        for (int i = 0; i < 4; ++i) {
22480 WordMask[i + NOffset] = Mask[i] + NOffset;
22481 WordMask[i + VOffset] = VMask[i] + VOffset;
22483 // Map the word mask through the DWord mask.
        int MappedMask[8];
        for (int i = 0; i < 8; ++i)
22486 MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
22487 const int UnpackLoMask[] = {0, 0, 1, 1, 2, 2, 3, 3};
22488 const int UnpackHiMask[] = {4, 4, 5, 5, 6, 6, 7, 7};
22489 if (std::equal(std::begin(MappedMask), std::end(MappedMask),
22490 std::begin(UnpackLoMask)) ||
22491 std::equal(std::begin(MappedMask), std::end(MappedMask),
22492 std::begin(UnpackHiMask))) {
22493 // We can replace all three shuffles with an unpack.
22494 V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, D.getOperand(0));
22495 DCI.AddToWorklist(V.getNode());
22496 return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
22498 DL, MVT::v8i16, V, V);
22505 case X86ISD::PSHUFD:
22506 if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG, DCI))
22515 /// \brief Try to combine a shuffle into a target-specific add-sub node.
22517 /// We combine this directly on the abstract vector shuffle nodes so it is
22518 /// easier to generically match. We also insert dummy vector shuffle nodes for
22519 /// the operands which explicitly discard the lanes which are unused by this
22520 /// operation to try to flow through the rest of the combiner the fact that
22521 /// they're unused.
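/// For example (illustrative), a v4f32 shuffle <0, 5, 2, 7> of (fsub A, B)
/// and (fadd A, B) takes the subtraction in the even lanes and the addition
/// in the odd lanes, which is exactly ADDSUBPS A, B.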
22522 static SDValue combineShuffleToAddSub(SDNode *N, SelectionDAG &DAG) {
22524 EVT VT = N->getValueType(0);
22526 // We only handle target-independent shuffles.
22527 // FIXME: It would be easy and harmless to use the target shuffle mask
22528 // extraction tool to support more.
22529 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
22532 auto *SVN = cast<ShuffleVectorSDNode>(N);
22533 ArrayRef<int> Mask = SVN->getMask();
22534 SDValue V1 = N->getOperand(0);
22535 SDValue V2 = N->getOperand(1);
22537 // We require the first shuffle operand to be the SUB node, and the second to
22538 // be the ADD node.
22539 // FIXME: We should support the commuted patterns.
22540 if (V1->getOpcode() != ISD::FSUB || V2->getOpcode() != ISD::FADD)
22543 // If there are other uses of these operations we can't fold them.
22544 if (!V1->hasOneUse() || !V2->hasOneUse())
22547 // Ensure that both operations have the same operands. Note that we can
22548 // commute the FADD operands.
22549 SDValue LHS = V1->getOperand(0), RHS = V1->getOperand(1);
22550 if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
22551 (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
22554 // We're looking for blends between FADD and FSUB nodes. We insist on these
22555 // nodes being lined up in a specific expected pattern.
22556 if (!(isShuffleEquivalent(Mask, 0, 3) ||
22557 isShuffleEquivalent(Mask, 0, 5, 2, 7) ||
22558 isShuffleEquivalent(Mask, 0, 9, 2, 11, 4, 13, 6, 15)))
22561 // Only specific types are legal at this point, assert so we notice if and
22562 // when these change.
22563 assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v8f32 ||
22564 VT == MVT::v4f64) &&
22565 "Unknown vector type encountered!");
22567 return DAG.getNode(X86ISD::ADDSUB, DL, VT, LHS, RHS);
22570 /// PerformShuffleCombine - Performs several different shuffle combines.
22571 static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
22572 TargetLowering::DAGCombinerInfo &DCI,
22573 const X86Subtarget *Subtarget) {
22575 SDValue N0 = N->getOperand(0);
22576 SDValue N1 = N->getOperand(1);
22577 EVT VT = N->getValueType(0);
22579 // Don't create instructions with illegal types after legalize types has run.
22580 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
22581 if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType()))
22584 // If we have legalized the vector types, look for blends of FADD and FSUB
22585 // nodes that we can fuse into an ADDSUB node.
22586 if (TLI.isTypeLegal(VT) && Subtarget->hasSSE3())
22587 if (SDValue AddSub = combineShuffleToAddSub(N, DAG))
22590 // Combine 256-bit vector shuffles. This is only profitable when in AVX mode
22591 if (Subtarget->hasFp256() && VT.is256BitVector() &&
22592 N->getOpcode() == ISD::VECTOR_SHUFFLE)
22593 return PerformShuffleCombine256(N, DAG, DCI, Subtarget);
22595 // During Type Legalization, when promoting illegal vector types,
22596 // the backend might introduce new shuffle dag nodes and bitcasts.
22598 // This code performs the following transformation:
22599 // fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) ->
22600 // (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>)
22602 // We do this only if both the bitcast and the BINOP dag nodes have
22603 // one use. Also, perform this transformation only if the new binary
22604 // operation is legal. This is to avoid introducing dag nodes that
22605 // potentially need to be further expanded (or custom lowered) into a
22606 // less optimal sequence of dag nodes.
22607 if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() &&
22608 N1.getOpcode() == ISD::UNDEF && N0.hasOneUse() &&
22609 N0.getOpcode() == ISD::BITCAST) {
22610 SDValue BC0 = N0.getOperand(0);
22611 EVT SVT = BC0.getValueType();
22612 unsigned Opcode = BC0.getOpcode();
22613 unsigned NumElts = VT.getVectorNumElements();
22615 if (BC0.hasOneUse() && SVT.isVector() &&
22616 SVT.getVectorNumElements() * 2 == NumElts &&
22617 TLI.isOperationLegal(Opcode, VT)) {
22618 bool CanFold = false;
22630 unsigned SVTNumElts = SVT.getVectorNumElements();
22631 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
22632 for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i)
22633 CanFold = SVOp->getMaskElt(i) == (int)(i * 2);
22634 for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i)
22635 CanFold = SVOp->getMaskElt(i) < 0;
22638 SDValue BC00 = DAG.getNode(ISD::BITCAST, dl, VT, BC0.getOperand(0));
22639 SDValue BC01 = DAG.getNode(ISD::BITCAST, dl, VT, BC0.getOperand(1));
22640 SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01);
22641 return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, &SVOp->getMask()[0]);
  // Only handle 128-bit wide vectors from here on.
22647 if (!VT.is128BitVector())
22650 // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
22651 // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
22652 // consecutive, non-overlapping, and in the right order.
22653 SmallVector<SDValue, 16> Elts;
22654 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
22655 Elts.push_back(getShuffleScalarElt(N, i, DAG, 0));
  SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true);
  if (LD.getNode())
    return LD;
22661 if (isTargetShuffle(N->getOpcode())) {
    SDValue Shuffle =
        PerformTargetShuffleCombine(SDValue(N, 0), DAG, DCI, Subtarget);
22664 if (Shuffle.getNode())
22667 // Try recursively combining arbitrary sequences of x86 shuffle
22668 // instructions into higher-order shuffles. We do this after combining
22669 // specific PSHUF instruction sequences into their minimal form so that we
22670 // can evaluate how many specialized shuffle instructions are involved in
22671 // a particular chain.
22672 SmallVector<int, 1> NonceMask; // Just a placeholder.
22673 NonceMask.push_back(0);
22674 if (combineX86ShufflesRecursively(SDValue(N, 0), SDValue(N, 0), NonceMask,
22675 /*Depth*/ 1, /*HasPSHUFB*/ false, DAG,
22677 return SDValue(); // This routine will use CombineTo to replace N.
22683 /// PerformTruncateCombine - Converts truncate operation to
22684 /// a sequence of vector shuffle operations.
/// This is possible when we truncate a 256-bit vector to a 128-bit vector.
22686 static SDValue PerformTruncateCombine(SDNode *N, SelectionDAG &DAG,
22687 TargetLowering::DAGCombinerInfo &DCI,
22688 const X86Subtarget *Subtarget) {
22692 /// XFormVExtractWithShuffleIntoLoad - Check if a vector extract from a target
22693 /// specific shuffle of a load can be folded into a single element load.
22694 /// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
22695 /// shuffles have been custom lowered so we need to handle those here.
22696 static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
22697 TargetLowering::DAGCombinerInfo &DCI) {
22698 if (DCI.isBeforeLegalizeOps())
22701 SDValue InVec = N->getOperand(0);
22702 SDValue EltNo = N->getOperand(1);
22704 if (!isa<ConstantSDNode>(EltNo))
22707 EVT OriginalVT = InVec.getValueType();
22709 if (InVec.getOpcode() == ISD::BITCAST) {
22710 // Don't duplicate a load with other uses.
22711 if (!InVec.hasOneUse())
22713 EVT BCVT = InVec.getOperand(0).getValueType();
22714 if (BCVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
22716 InVec = InVec.getOperand(0);
22719 EVT CurrentVT = InVec.getValueType();
22721 if (!isTargetShuffle(InVec.getOpcode()))
22724 // Don't duplicate a load with other uses.
22725 if (!InVec.hasOneUse())
22728 SmallVector<int, 16> ShuffleMask;
  bool UnaryShuffle;
  if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(),
22731 ShuffleMask, UnaryShuffle))
  // Select the input vector, guarding against an out-of-range extract index.
22735 unsigned NumElems = CurrentVT.getVectorNumElements();
22736 int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
22737 int Idx = (Elt > (int)NumElems) ? -1 : ShuffleMask[Elt];
22738 SDValue LdNode = (Idx < (int)NumElems) ? InVec.getOperand(0)
22739 : InVec.getOperand(1);
22741 // If inputs to shuffle are the same for both ops, then allow 2 uses
22742 unsigned AllowedUses = InVec.getNumOperands() > 1 &&
22743 InVec.getOperand(0) == InVec.getOperand(1) ? 2 : 1;
22745 if (LdNode.getOpcode() == ISD::BITCAST) {
22746 // Don't duplicate a load with other uses.
22747 if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
22750 AllowedUses = 1; // only allow 1 load use if we have a bitcast
22751 LdNode = LdNode.getOperand(0);
22754 if (!ISD::isNormalLoad(LdNode.getNode()))
22757 LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);
22759 if (!LN0 ||!LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
22762 EVT EltVT = N->getValueType(0);
22763 // If there's a bitcast before the shuffle, check if the load type and
22764 // alignment is valid.
22765 unsigned Align = LN0->getAlignment();
22766 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
22767 unsigned NewAlign = TLI.getDataLayout()->getABITypeAlignment(
22768 EltVT.getTypeForEVT(*DAG.getContext()));
22770 if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT))
22773 // All checks match so transform back to vector_shuffle so that DAG combiner
22774 // can finish the job
  // Create a shuffle node, taking into account the case that it's a unary
  // shuffle.
22778 SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT)
22779 : InVec.getOperand(1);
22780 Shuffle = DAG.getVectorShuffle(CurrentVT, dl,
22781 InVec.getOperand(0), Shuffle,
22783 Shuffle = DAG.getNode(ISD::BITCAST, dl, OriginalVT, Shuffle);
22784 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
22788 /// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index
22789 /// generation and convert it from being a bunch of shuffles and extracts
22790 /// into a somewhat faster sequence. For i686, the best sequence is apparently
22791 /// storing the value and loading scalars back, while for x64 we should
22792 /// use 64-bit extracts and shifts.
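/// For example (illustrative), four sign-extended i32 extracts of a v4i32 can
/// become two i64 extracts plus shifts and truncates on x86-64, or a store to
/// a stack slot followed by four scalar loads on i686.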
22793 static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
22794 TargetLowering::DAGCombinerInfo &DCI) {
22795 SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI);
22796 if (NewOp.getNode())
22799 SDValue InputVector = N->getOperand(0);
22801 // Detect whether we are trying to convert from mmx to i32 and the bitcast
22802 // from mmx to v2i32 has a single usage.
22803 if (InputVector.getNode()->getOpcode() == llvm::ISD::BITCAST &&
22804 InputVector.getNode()->getOperand(0).getValueType() == MVT::x86mmx &&
22805 InputVector.hasOneUse() && N->getValueType(0) == MVT::i32)
22806 return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector),
22807 N->getValueType(0),
22808 InputVector.getNode()->getOperand(0));
22810 // Only operate on vectors of 4 elements, where the alternative shuffling
22811 // gets to be more expensive.
22812 if (InputVector.getValueType() != MVT::v4i32)
22815 // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
  // single use which is a sign-extend or zero-extend, and all elements are
  // used.
22818 SmallVector<SDNode *, 4> Uses;
22819 unsigned ExtractedElements = 0;
22820 for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
22821 UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
22822 if (UI.getUse().getResNo() != InputVector.getResNo())
22825 SDNode *Extract = *UI;
22826 if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
22829 if (Extract->getValueType(0) != MVT::i32)
22831 if (!Extract->hasOneUse())
22833 if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
22834 Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
22836 if (!isa<ConstantSDNode>(Extract->getOperand(1)))
22839 // Record which element was extracted.
22840 ExtractedElements |=
22841 1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue();
22843 Uses.push_back(Extract);
22846 // If not all the elements were used, this may not be worthwhile.
22847 if (ExtractedElements != 15)
22850 // Ok, we've now decided to do the transformation.
22851 // If 64-bit shifts are legal, use the extract-shift sequence,
22852 // otherwise bounce the vector off the cache.
22853 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  SDValue Vals[4];
  SDLoc dl(InputVector);
22857 if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) {
22858 SDValue Cst = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, InputVector);
22859 EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy();
22860 SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
22861 DAG.getConstant(0, VecIdxTy));
22862 SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
22863 DAG.getConstant(1, VecIdxTy));
22865 SDValue ShAmt = DAG.getConstant(32,
22866 DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64));
22867 Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf);
22868 Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
22869 DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt));
22870 Vals[2] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, TopHalf);
22871 Vals[3] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
22872 DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt));
22874 // Store the value to a temporary stack slot.
22875 SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
22876 SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
22877 MachinePointerInfo(), false, false, 0);
22879 EVT ElementType = InputVector.getValueType().getVectorElementType();
22880 unsigned EltSize = ElementType.getSizeInBits() / 8;
22882 // Replace each use (extract) with a load of the appropriate element.
22883 for (unsigned i = 0; i < 4; ++i) {
22884 uint64_t Offset = EltSize * i;
22885 SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy());
22887 SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(),
22888 StackPtr, OffsetVal);
22890 // Load the scalar.
22891 Vals[i] = DAG.getLoad(ElementType, dl, Ch,
22892 ScalarAddr, MachinePointerInfo(),
22893 false, false, false, 0);
22898 // Replace the extracts
22899 for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
22900 UE = Uses.end(); UI != UE; ++UI) {
22901 SDNode *Extract = *UI;
22903 SDValue Idx = Extract->getOperand(1);
22904 uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
22905 DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]);
22908 // The replacement was made in place; don't return anything.
/// \brief Matches a VSELECT onto min/max or returns 0 if the node doesn't
/// match.
22913 static std::pair<unsigned, bool>
22914 matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, SDValue RHS,
22915 SelectionDAG &DAG, const X86Subtarget *Subtarget) {
22916 if (!VT.isVector())
22917 return std::make_pair(0, false);
22919 bool NeedSplit = false;
22920 switch (VT.getSimpleVT().SimpleTy) {
22921 default: return std::make_pair(0, false);
22924 if (!Subtarget->hasVLX())
22925 return std::make_pair(0, false);
22929 if (!Subtarget->hasBWI())
22930 return std::make_pair(0, false);
22934 if (!Subtarget->hasAVX512())
22935 return std::make_pair(0, false);
22940 if (!Subtarget->hasAVX2())
22942 if (!Subtarget->hasAVX())
22943 return std::make_pair(0, false);
22948 if (!Subtarget->hasSSE2())
22949 return std::make_pair(0, false);
22952 // SSE2 has only a small subset of the operations.
22953 bool hasUnsigned = Subtarget->hasSSE41() ||
22954 (Subtarget->hasSSE2() && VT == MVT::v16i8);
22955 bool hasSigned = Subtarget->hasSSE41() ||
22956 (Subtarget->hasSSE2() && VT == MVT::v8i16);
22958 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
22961 // Check for x CC y ? x : y.
22962 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
22963 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
22968 Opc = hasUnsigned ? X86ISD::UMIN : 0; break;
22971 Opc = hasUnsigned ? X86ISD::UMAX : 0; break;
22974 Opc = hasSigned ? X86ISD::SMIN : 0; break;
22977 Opc = hasSigned ? X86ISD::SMAX : 0; break;
22979 // Check for x CC y ? y : x -- a min/max with reversed arms.
22980 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
22981 DAG.isEqualTo(RHS, Cond.getOperand(0))) {
22986 Opc = hasUnsigned ? X86ISD::UMAX : 0; break;
22989 Opc = hasUnsigned ? X86ISD::UMIN : 0; break;
22992 Opc = hasSigned ? X86ISD::SMAX : 0; break;
22995 Opc = hasSigned ? X86ISD::SMIN : 0; break;
22999 return std::make_pair(Opc, NeedSplit);
23003 transformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG,
23004 const X86Subtarget *Subtarget) {
23006 SDValue Cond = N->getOperand(0);
23007 SDValue LHS = N->getOperand(1);
23008 SDValue RHS = N->getOperand(2);
23010 if (Cond.getOpcode() == ISD::SIGN_EXTEND) {
23011 SDValue CondSrc = Cond->getOperand(0);
23012 if (CondSrc->getOpcode() == ISD::SIGN_EXTEND_INREG)
23013 Cond = CondSrc->getOperand(0);
23016 if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
23019 // A vselect where all conditions and data are constants can be optimized into
23020 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
23021 if (ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
23022 ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
23025 unsigned MaskValue = 0;
23026 if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue))
23029 MVT VT = N->getSimpleValueType(0);
23030 unsigned NumElems = VT.getVectorNumElements();
23031 SmallVector<int, 8> ShuffleMask(NumElems, -1);
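  // Build the shuffle mask from the blend mask: bit i set selects the element
  // from RHS, i.e. shuffle index i + NumElems. For example (illustrative), a
  // blend mask of 0b0101 on a 4-element vector gives the mask <4, 1, 6, 3>.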
23032 for (unsigned i = 0; i < NumElems; ++i) {
23033 // Be sure we emit undef where we can.
23034 if (Cond.getOperand(i)->getOpcode() == ISD::UNDEF)
23035 ShuffleMask[i] = -1;
23037 ShuffleMask[i] = i + NumElems * ((MaskValue >> i) & 1);
23040 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23041 if (!TLI.isShuffleMaskLegal(ShuffleMask, VT))
23043 return DAG.getVectorShuffle(VT, dl, LHS, RHS, &ShuffleMask[0]);
/// PerformSELECTCombine - Do target-specific dag combines on SELECT and
/// VSELECT nodes.
23048 static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
23049 TargetLowering::DAGCombinerInfo &DCI,
23050 const X86Subtarget *Subtarget) {
23052 SDValue Cond = N->getOperand(0);
23053 // Get the LHS/RHS of the select.
23054 SDValue LHS = N->getOperand(1);
23055 SDValue RHS = N->getOperand(2);
23056 EVT VT = LHS.getValueType();
23057 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23059 // If we have SSE[12] support, try to form min/max nodes. SSE min/max
23060 // instructions match the semantics of the common C idiom x<y?x:y but not
23061 // x<=y?x:y, because of how they handle negative zero (which can be
23062 // ignored in unsafe-math mode).
23063 // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
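  // For example (illustrative), (select (setcc x, y, setolt), x, y) has
  // exactly the MINSS/MINPS semantics (the second operand is returned for
  // NaNs and for +0.0/-0.0 ties), so it can become X86ISD::FMIN x, y.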
23064 if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
23065 VT != MVT::f80 && (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
23066 (Subtarget->hasSSE2() ||
23067 (Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) {
23068 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
23070 unsigned Opcode = 0;
23071 // Check for x CC y ? x : y.
23072 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
23073 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
23077 // Converting this to a min would handle NaNs incorrectly, and swapping
23078 // the operands would cause it to handle comparisons between positive
23079 // and negative zero incorrectly.
23080 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
23081 if (!DAG.getTarget().Options.UnsafeFPMath &&
23082 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
23084 std::swap(LHS, RHS);
23086 Opcode = X86ISD::FMIN;
23089 // Converting this to a min would handle comparisons between positive
23090 // and negative zero incorrectly.
23091 if (!DAG.getTarget().Options.UnsafeFPMath &&
23092 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
23094 Opcode = X86ISD::FMIN;
23097 // Converting this to a min would handle both negative zeros and NaNs
23098 // incorrectly, but we can swap the operands to fix both.
23099 std::swap(LHS, RHS);
23103 Opcode = X86ISD::FMIN;
23107 // Converting this to a max would handle comparisons between positive
23108 // and negative zero incorrectly.
23109 if (!DAG.getTarget().Options.UnsafeFPMath &&
23110 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
23112 Opcode = X86ISD::FMAX;
23115 // Converting this to a max would handle NaNs incorrectly, and swapping
23116 // the operands would cause it to handle comparisons between positive
23117 // and negative zero incorrectly.
23118 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
23119 if (!DAG.getTarget().Options.UnsafeFPMath &&
23120 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
23122 std::swap(LHS, RHS);
23124 Opcode = X86ISD::FMAX;
23127 // Converting this to a max would handle both negative zeros and NaNs
23128 // incorrectly, but we can swap the operands to fix both.
23129 std::swap(LHS, RHS);
23133 Opcode = X86ISD::FMAX;
23136 // Check for x CC y ? y : x -- a min/max with reversed arms.
23137 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
23138 DAG.isEqualTo(RHS, Cond.getOperand(0))) {
23142 // Converting this to a min would handle comparisons between positive
23143 // and negative zero incorrectly, and swapping the operands would
23144 // cause it to handle NaNs incorrectly.
23145 if (!DAG.getTarget().Options.UnsafeFPMath &&
23146 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
23147 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
23149 std::swap(LHS, RHS);
23151 Opcode = X86ISD::FMIN;
23154 // Converting this to a min would handle NaNs incorrectly.
23155 if (!DAG.getTarget().Options.UnsafeFPMath &&
23156 (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
23158 Opcode = X86ISD::FMIN;
23161 // Converting this to a min would handle both negative zeros and NaNs
23162 // incorrectly, but we can swap the operands to fix both.
23163 std::swap(LHS, RHS);
23167 Opcode = X86ISD::FMIN;
23171 // Converting this to a max would handle NaNs incorrectly.
23172 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
23174 Opcode = X86ISD::FMAX;
23177 // Converting this to a max would handle comparisons between positive
23178 // and negative zero incorrectly, and swapping the operands would
23179 // cause it to handle NaNs incorrectly.
23180 if (!DAG.getTarget().Options.UnsafeFPMath &&
23181 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
23182 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
23184 std::swap(LHS, RHS);
23186 Opcode = X86ISD::FMAX;
23189 // Converting this to a max would handle both negative zeros and NaNs
23190 // incorrectly, but we can swap the operands to fix both.
23191 std::swap(LHS, RHS);
23195 Opcode = X86ISD::FMAX;
23201 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
23204 EVT CondVT = Cond.getValueType();
23205 if (Subtarget->hasAVX512() && VT.isVector() && CondVT.isVector() &&
23206 CondVT.getVectorElementType() == MVT::i1) {
23207 // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
23208 // lowering on KNL. In this case we convert it to
23209 // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
23210 // The same situation for all 128 and 256-bit vectors of i8 and i16.
23211 // Since SKX these selects have a proper lowering.
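    // For example (illustrative), on KNL a (v16i8 select v16i1, v16i8, v16i8)
    // gets its condition sign-extended to v16i8 so the select can be matched
    // as a byte blend (e.g. VPBLENDVB) rather than an unsupported mask select.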
23212 EVT OpVT = LHS.getValueType();
23213 if ((OpVT.is128BitVector() || OpVT.is256BitVector()) &&
23214 (OpVT.getVectorElementType() == MVT::i8 ||
23215 OpVT.getVectorElementType() == MVT::i16) &&
23216 !(Subtarget->hasBWI() && Subtarget->hasVLX())) {
23217 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, OpVT, Cond);
23218 DCI.AddToWorklist(Cond.getNode());
23219 return DAG.getNode(N->getOpcode(), DL, OpVT, Cond, LHS, RHS);
  // If this is a select between two integer constants, try to do some
  // optimizations.
23224 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) {
23225 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS))
23226 // Don't do this for crazy integer types.
23227 if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) {
23228 // If this is efficiently invertible, canonicalize the LHSC/RHSC values
23229 // so that TrueC (the true value) is larger than FalseC.
23230 bool NeedsCondInvert = false;
23232 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
23233 // Efficiently invertible.
23234 (Cond.getOpcode() == ISD::SETCC || // setcc -> invertible.
23235 (Cond.getOpcode() == ISD::XOR && // xor(X, C) -> invertible.
23236 isa<ConstantSDNode>(Cond.getOperand(1))))) {
23237 NeedsCondInvert = true;
23238 std::swap(TrueC, FalseC);
23241 // Optimize C ? 8 : 0 -> zext(C) << 3. Likewise for any pow2/0.
23242 if (FalseC->getAPIntValue() == 0 &&
23243 TrueC->getAPIntValue().isPowerOf2()) {
23244 if (NeedsCondInvert) // Invert the condition if needed.
23245 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
23246 DAG.getConstant(1, Cond.getValueType()));
23248 // Zero extend the condition if needed.
23249 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);
23251 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
23252 return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
23253 DAG.getConstant(ShAmt, MVT::i8));
      // Optimize Cond ? cst+1 : cst -> zext(setcc(C)) + cst.
23257 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
23258 if (NeedsCondInvert) // Invert the condition if needed.
23259 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
23260 DAG.getConstant(1, Cond.getValueType()));
23262 // Zero extend the condition if needed.
23263 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
23264 FalseC->getValueType(0), Cond);
23265 return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
23266 SDValue(FalseC, 0));
23269 // Optimize cases that will turn into an LEA instruction. This requires
23270 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
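      // For example (illustrative), (select Cond, 14, 5) has Diff == 9 and so
      // becomes 5 + zext(Cond) * 9, which fits in a single LEA.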
23271 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
23272 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
23273 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
23275 bool isFastMultiplier = false;
23277 switch ((unsigned char)Diff) {
23279 case 1: // result = add base, cond
23280 case 2: // result = lea base( , cond*2)
23281 case 3: // result = lea base(cond, cond*2)
23282 case 4: // result = lea base( , cond*4)
23283 case 5: // result = lea base(cond, cond*4)
23284 case 8: // result = lea base( , cond*8)
23285 case 9: // result = lea base(cond, cond*8)
23286 isFastMultiplier = true;
23291 if (isFastMultiplier) {
23292 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
23293 if (NeedsCondInvert) // Invert the condition if needed.
23294 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
23295 DAG.getConstant(1, Cond.getValueType()));
23297 // Zero extend the condition if needed.
23298 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
23300 // Scale the condition by the difference.
23302 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
23303 DAG.getConstant(Diff, Cond.getValueType()));
23305 // Add the base if non-zero.
23306 if (FalseC->getAPIntValue() != 0)
23307 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
23308 SDValue(FalseC, 0));
23315 // Canonicalize max and min:
23316 // (x > y) ? x : y -> (x >= y) ? x : y
23317 // (x < y) ? x : y -> (x <= y) ? x : y
23318 // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
23319 // the need for an extra compare
23320 // against zero. e.g.
  // (x - y) > 0 ? (x - y) : 0 -> (x - y) >= 0 ? (x - y) : 0
  // so that instead of a sequence using
  //   testl  %edi, %edi
  //   cmovgl %edi, %eax
  // we can generate
  //   cmovsl %eax, %edi
23330 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
23331 DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
23332 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
23333 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
23338 ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
23339 Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
23340 Cond.getOperand(0), Cond.getOperand(1), NewCC);
23341 return DAG.getNode(ISD::SELECT, DL, VT, Cond, LHS, RHS);
23346 // Early exit check
23347 if (!TLI.isTypeLegal(VT))
23350 // Match VSELECTs into subs with unsigned saturation.
23351 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
23352 // psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
23353 ((Subtarget->hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
23354 (Subtarget->hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
23355 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
23357 // Check if one of the arms of the VSELECT is a zero vector. If it's on the
23358 // left side invert the predicate to simplify logic below.
23360 if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
23362 CC = ISD::getSetCCInverse(CC, true);
23363 } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
23367 if (Other.getNode() && Other->getNumOperands() == 2 &&
23368 DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) {
23369 SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
23370 SDValue CondRHS = Cond->getOperand(1);
23372 // Look for a general sub with unsigned saturation first.
23373 // x >= y ? x-y : 0 --> subus x, y
23374 // x > y ? x-y : 0 --> subus x, y
23375 if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
23376 Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))
23377 return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
23379 if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS))
23380 if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) {
23381 if (auto *CondRHSBV = dyn_cast<BuildVectorSDNode>(CondRHS))
23382 if (auto *CondRHSConst = CondRHSBV->getConstantSplatNode())
23383 // If the RHS is a constant we have to reverse the const
23384 // canonicalization.
              // x > C-1 ? x + (-C) : 0 --> subus x, C
23386 if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
23387 CondRHSConst->getAPIntValue() ==
23388 (-OpRHSConst->getAPIntValue() - 1))
23389 return DAG.getNode(
23390 X86ISD::SUBUS, DL, VT, OpLHS,
23391 DAG.getConstant(-OpRHSConst->getAPIntValue(), VT));
23393 // Another special case: If C was a sign bit, the sub has been
23394 // canonicalized into a xor.
23395 // FIXME: Would it be better to use computeKnownBits to determine
23396 // whether it's safe to decanonicalize the xor?
23397 // x s< 0 ? x^C : 0 --> subus x, C
23398 if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
23399 ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
23400 OpRHSConst->getAPIntValue().isSignBit())
23401 // Note that we have to rebuild the RHS constant here to ensure we
23402 // don't rely on particular values of undef lanes.
23403 return DAG.getNode(
23404 X86ISD::SUBUS, DL, VT, OpLHS,
23405 DAG.getConstant(OpRHSConst->getAPIntValue(), VT));
23410 // Try to match a min/max vector operation.
23411 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC) {
    std::pair<unsigned, bool> ret =
        matchIntegerMINMAX(Cond, VT, LHS, RHS, DAG, Subtarget);
23413 unsigned Opc = ret.first;
23414 bool NeedSplit = ret.second;
23416 if (Opc && NeedSplit) {
23417 unsigned NumElems = VT.getVectorNumElements();
23418 // Extract the LHS vectors
23419 SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, DL);
23420 SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, DL);
23422 // Extract the RHS vectors
23423 SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, DL);
23424 SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, DL);
23426 // Create min/max for each subvector
23427 LHS = DAG.getNode(Opc, DL, LHS1.getValueType(), LHS1, RHS1);
23428 RHS = DAG.getNode(Opc, DL, LHS2.getValueType(), LHS2, RHS2);
23430 // Merge the result
23431 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LHS, RHS);
23433 return DAG.getNode(Opc, DL, VT, LHS, RHS);
  // Simplify vector selection if condition value type matches vselect
  // operand type.
23438 if (N->getOpcode() == ISD::VSELECT && CondVT == VT) {
23439 assert(Cond.getValueType().isVector() &&
23440 "vector select expects a vector selector!");
23442 bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
23443 bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
    // Try to invert the condition if the true value is not all 1s and the
    // false value is not all 0s.
23447 if (!TValIsAllOnes && !FValIsAllZeros &&
23448 // Check if the selector will be produced by CMPP*/PCMP*
23449 Cond.getOpcode() == ISD::SETCC &&
23450 // Check if SETCC has already been promoted
23451 TLI.getSetCCResultType(*DAG.getContext(), VT) == CondVT) {
23452 bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
23453 bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
23455 if (TValIsAllZeros || FValIsAllOnes) {
23456 SDValue CC = Cond.getOperand(2);
23457 ISD::CondCode NewCC =
23458 ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
23459 Cond.getOperand(0).getValueType().isInteger());
23460 Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1), NewCC);
23461 std::swap(LHS, RHS);
23462 TValIsAllOnes = FValIsAllOnes;
23463 FValIsAllZeros = TValIsAllZeros;
23467 if (TValIsAllOnes || FValIsAllZeros) {
23470 if (TValIsAllOnes && FValIsAllZeros)
23472 else if (TValIsAllOnes)
23473 Ret = DAG.getNode(ISD::OR, DL, CondVT, Cond,
23474 DAG.getNode(ISD::BITCAST, DL, CondVT, RHS));
23475 else if (FValIsAllZeros)
23476 Ret = DAG.getNode(ISD::AND, DL, CondVT, Cond,
23477 DAG.getNode(ISD::BITCAST, DL, CondVT, LHS));
23479 return DAG.getNode(ISD::BITCAST, DL, VT, Ret);
23483 // If we know that this node is legal then we know that it is going to be
23484 // matched by one of the SSE/AVX BLEND instructions. These instructions only
23485 // depend on the highest bit in each word. Try to use SimplifyDemandedBits
23486 // to simplify previous instructions.
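// Editor's note: for example, a v16i8 VSELECT that becomes PBLENDVB reads
// only bit 7 of each condition byte, so the DemandedMask built below keeps
// just the sign bit of every lane and lets SimplifyDemandedBits strip the
// instructions that computed the remaining bits.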
23487 if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
23488 !DCI.isBeforeLegalize() &&
23489 // We explicitly check against v8i16 and v16i16 because, although
23490 // they're marked as Custom, they might only be legal when Cond is a
23491 // build_vector of constants. This will be taken care of in a later phase.
23493 (TLI.isOperationLegalOrCustom(ISD::VSELECT, VT) && VT != MVT::v16i16 &&
23494 VT != MVT::v8i16) &&
23495 // Don't optimize a vector of constants. Those are handled by
23496 // the generic code and all the bits must be properly set for
23497 // the generic optimizer.
23498 !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
23499 unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits();
23501 // Don't optimize vector selects that map to mask-registers.
if (BitWidth == 1)
return SDValue();
23505 assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
23506 APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1);
23508 APInt KnownZero, KnownOne;
23509 TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
23510 DCI.isBeforeLegalizeOps());
23511 if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) ||
23512 TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne,
23514 // If we changed the computation somewhere in the DAG, this change
23515 // will affect all users of Cond.
23516 // Make sure it is fine and update all the nodes so that we do not
23517 // use the generic VSELECT anymore. Otherwise, we may perform
23518 // wrong optimizations as we messed up with the actual expectation
23519 // for the vector boolean values.
23520 if (Cond != TLO.Old) {
23521 // Check all uses of that condition operand to check whether it will be
23522 // consumed by non-BLEND instructions, which may depend on all bits being set properly.
23524 for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
23526 if (I->getOpcode() != ISD::VSELECT)
23527 // TODO: Add other opcodes eventually lowered into BLEND.
23530 // Update all the users of the condition, before committing the change,
23531 // so that the VSELECT optimizations that expect the correct vector
23532 // boolean value will not be triggered.
23533 for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
23535 DAG.ReplaceAllUsesOfValueWith(
23537 DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(*I), I->getValueType(0),
23538 Cond, I->getOperand(1), I->getOperand(2)));
23539 DCI.CommitTargetLoweringOpt(TLO);
23542 // At this point, only Cond is changed. Change the condition
23543 // just for N to keep the opportunity to optimize all other
23544 // users their own way.
23545 DAG.ReplaceAllUsesOfValueWith(
23547 DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(N), N->getValueType(0),
23548 TLO.New, N->getOperand(1), N->getOperand(2)));
23553 // We should generate an X86ISD::BLENDI from a vselect if its argument
23554 // is a sign_extend_inreg of an any_extend of a BUILD_VECTOR of
23555 // constants. This specific pattern gets generated when we split a
23556 // selector for a 512 bit vector in a machine without AVX512 (but with
23557 // 256-bit vectors), during legalization:
23559 // (vselect (sign_extend (any_extend (BUILD_VECTOR)) i1) LHS RHS)
23561 // Iff we find this pattern and the build_vectors are built from
23562 // constants, we translate the vselect into a shuffle_vector that we
23563 // know will be matched by LowerVECTOR_SHUFFLEtoBlend.
23564 if ((N->getOpcode() == ISD::VSELECT ||
23565 N->getOpcode() == X86ISD::SHRUNKBLEND) &&
23566 !DCI.isBeforeLegalize()) {
23567 SDValue Shuffle = transformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget);
23568 if (Shuffle.getNode())
23575 // Check whether a boolean test is testing a boolean value generated by
23576 // X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
23579 // Simplify the following patterns:
23580 // (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
23581 // (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
23582 // to (Op EFLAGS Cond)
23584 // (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
23585 // (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
23586 // to (Op EFLAGS !Cond)
23588 // where Op could be BRCOND or CMOV.
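// Editor's note: as an illustration, (brcond (cmp (setcc cond, EFLAGS), 0), ne)
// re-tests a value that is already a 0/1 materialization of EFLAGS; the helper
// below peels off the CMP/SETCC pair and hands back EFLAGS plus the (possibly
// inverted) condition so the BRCOND or CMOV can consume the flags directly.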
23590 static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
23591 // Quit unless this is a CMP, or a SUB whose value result is unused.
23592 if (Cmp.getOpcode() != X86ISD::CMP &&
23593 (Cmp.getOpcode() != X86ISD::SUB || Cmp.getNode()->hasAnyUseOfValue(0)))
23596 // Quit if not used as a boolean value.
23597 if (CC != X86::COND_E && CC != X86::COND_NE)
23600 // Check CMP operands. One of them should be 0 or 1 and the other should be
23601 // a SetCC or extended from it.
23602 SDValue Op1 = Cmp.getOperand(0);
23603 SDValue Op2 = Cmp.getOperand(1);
23606 const ConstantSDNode* C = nullptr;
23607 bool needOppositeCond = (CC == X86::COND_E);
23608 bool checkAgainstTrue = false; // Is it a comparison against 1?
23610 if ((C = dyn_cast<ConstantSDNode>(Op1)))
23612 else if ((C = dyn_cast<ConstantSDNode>(Op2)))
23614 else // Quit if neither operand is a constant.
23617 if (C->getZExtValue() == 1) {
23618 needOppositeCond = !needOppositeCond;
23619 checkAgainstTrue = true;
23620 } else if (C->getZExtValue() != 0)
23621 // Quit if the constant is neither 0 nor 1.
23624 bool truncatedToBoolWithAnd = false;
23625 // Skip (zext $x), (trunc $x), or (and $x, 1) node.
23626 while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
23627 SetCC.getOpcode() == ISD::TRUNCATE ||
23628 SetCC.getOpcode() == ISD::AND) {
23629 if (SetCC.getOpcode() == ISD::AND) {
23631 ConstantSDNode *CS;
23632 if ((CS = dyn_cast<ConstantSDNode>(SetCC.getOperand(0))) &&
23633 CS->getZExtValue() == 1)
23635 if ((CS = dyn_cast<ConstantSDNode>(SetCC.getOperand(1))) &&
23636 CS->getZExtValue() == 1)
23640 SetCC = SetCC.getOperand(OpIdx);
23641 truncatedToBoolWithAnd = true;
23643 SetCC = SetCC.getOperand(0);
23646 switch (SetCC.getOpcode()) {
23647 case X86ISD::SETCC_CARRY:
23648 // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
23649 // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
23650 // i.e. it's a comparison against true but the result of SETCC_CARRY is not
23651 // truncated to i1 using 'and'.
23652 if (checkAgainstTrue && !truncatedToBoolWithAnd)
23654 assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
23655 "Invalid use of SETCC_CARRY!");
23657 case X86ISD::SETCC:
23658 // Set the condition code or opposite one if necessary.
23659 CC = X86::CondCode(SetCC.getConstantOperandVal(0));
23660 if (needOppositeCond)
23661 CC = X86::GetOppositeBranchCondition(CC);
23662 return SetCC.getOperand(1);
23663 case X86ISD::CMOV: {
23664 // Check whether the false/true values are canonical, i.e. 0 or 1.
23665 ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
23666 ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
23667 // Quit if true value is not a constant.
23670 // Quit if false value is not a constant.
23672 SDValue Op = SetCC.getOperand(0);
23673 // Skip 'zext' or 'trunc' node.
23674 if (Op.getOpcode() == ISD::ZERO_EXTEND ||
23675 Op.getOpcode() == ISD::TRUNCATE)
23676 Op = Op.getOperand(0);
23677 // A special case for rdrand/rdseed, where 0 is set if the false cond is found.
23679 if ((Op.getOpcode() != X86ISD::RDRAND &&
23680 Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
23683 // Quit if false value is not the constant 0 or 1.
23684 bool FValIsFalse = true;
23685 if (FVal && FVal->getZExtValue() != 0) {
23686 if (FVal->getZExtValue() != 1)
23688 // If FVal is 1, opposite cond is needed.
23689 needOppositeCond = !needOppositeCond;
23690 FValIsFalse = false;
23692 // Quit if TVal is not the constant opposite of FVal.
23693 if (FValIsFalse && TVal->getZExtValue() != 1)
23695 if (!FValIsFalse && TVal->getZExtValue() != 0)
23697 CC = X86::CondCode(SetCC.getConstantOperandVal(2));
23698 if (needOppositeCond)
23699 CC = X86::GetOppositeBranchCondition(CC);
23700 return SetCC.getOperand(3);
23707 /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
23708 static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
23709 TargetLowering::DAGCombinerInfo &DCI,
23710 const X86Subtarget *Subtarget) {
23713 // If the flag operand isn't dead, don't touch this CMOV.
23714 if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
23717 SDValue FalseOp = N->getOperand(0);
23718 SDValue TrueOp = N->getOperand(1);
23719 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
23720 SDValue Cond = N->getOperand(3);
23722 if (CC == X86::COND_E || CC == X86::COND_NE) {
23723 switch (Cond.getOpcode()) {
23727 // If the operand of BSR/BSF is proven never to be zero, then ZF cannot be set.
23728 if (DAG.isKnownNeverZero(Cond.getOperand(0)))
23729 return (CC == X86::COND_E) ? FalseOp : TrueOp;
23735 Flags = checkBoolTestSetCCCombine(Cond, CC);
23736 if (Flags.getNode() &&
23737 // Extra check as FCMOV only supports a subset of X86 cond.
23738 (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC))) {
23739 SDValue Ops[] = { FalseOp, TrueOp,
23740 DAG.getConstant(CC, MVT::i8), Flags };
23741 return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
23744 // If this is a select between two integer constants, try to do some
23745 // optimizations. Note that the operands are ordered the opposite of SELECT operands.
23747 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
23748 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
23749 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
23750 // larger than FalseC (the false value).
23751 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
23752 CC = X86::GetOppositeBranchCondition(CC);
23753 std::swap(TrueC, FalseC);
23754 std::swap(TrueOp, FalseOp);
23757 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
23758 // This is efficient for any integer data type (including i8/i16) and shift amount.
23760 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
23761 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
23762 DAG.getConstant(CC, MVT::i8), Cond);
23764 // Zero extend the condition if needed.
23765 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
23767 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
23768 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
23769 DAG.getConstant(ShAmt, MVT::i8));
23770 if (N->getNumValues() == 2) // Dead flag value?
23771 return DCI.CombineTo(N, Cond, SDValue());
23775 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)) + cst. This is efficient
23776 // for any integer data type, including i8/i16.
23777 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
23778 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
23779 DAG.getConstant(CC, MVT::i8), Cond);
23781 // Zero extend the condition if needed.
23782 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
23783 FalseC->getValueType(0), Cond);
23784 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
23785 SDValue(FalseC, 0));
23787 if (N->getNumValues() == 2) // Dead flag value?
23788 return DCI.CombineTo(N, Cond, SDValue());
23792 // Optimize cases that will turn into an LEA instruction. This requires
23793 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
23794 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
23795 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
23796 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
23798 bool isFastMultiplier = false;
23800 switch ((unsigned char)Diff) {
23802 case 1: // result = add base, cond
23803 case 2: // result = lea base( , cond*2)
23804 case 3: // result = lea base(cond, cond*2)
23805 case 4: // result = lea base( , cond*4)
23806 case 5: // result = lea base(cond, cond*4)
23807 case 8: // result = lea base( , cond*8)
23808 case 9: // result = lea base(cond, cond*8)
23809 isFastMultiplier = true;
23814 if (isFastMultiplier) {
23815 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
23816 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
23817 DAG.getConstant(CC, MVT::i8), Cond);
23818 // Zero extend the condition if needed.
23819 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
23821 // Scale the condition by the difference.
23823 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
23824 DAG.getConstant(Diff, Cond.getValueType()));
23826 // Add the base if non-zero.
23827 if (FalseC->getAPIntValue() != 0)
23828 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
23829 SDValue(FalseC, 0));
23830 if (N->getNumValues() == 2) // Dead flag value?
23831 return DCI.CombineTo(N, Cond, SDValue());
23838 // Handle these cases:
23839 // (select (x != c), e, c) -> (select (x != c), e, x),
23840 // (select (x == c), c, e) -> (select (x == c), x, e)
23841 // where the c is an integer constant, and the "select" is the combination
23842 // of CMOV and CMP.
23844 // The rationale for this change is that the conditional-move from a constant
23845 // needs two instructions, whereas a conditional move from a register needs
23846 // only one instruction.
23848 // CAVEAT: By replacing a constant with a symbolic value, it may obscure
23849 // some instruction-combining opportunities. This opt needs to be
23850 // postponed as late as possible.
23852 if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
23853 // the DCI.xxxx conditions are provided to postpone the optimization as
23854 // late as possible.
23856 ConstantSDNode *CmpAgainst = nullptr;
23857 if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
23858 (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
23859 !isa<ConstantSDNode>(Cond.getOperand(0))) {
23861 if (CC == X86::COND_NE &&
23862 CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
23863 CC = X86::GetOppositeBranchCondition(CC);
23864 std::swap(TrueOp, FalseOp);
23867 if (CC == X86::COND_E &&
23868 CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
23869 SDValue Ops[] = { FalseOp, Cond.getOperand(0),
23870 DAG.getConstant(CC, MVT::i8), Cond };
23871 return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
23879 static SDValue PerformINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG,
23880 const X86Subtarget *Subtarget) {
23881 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
switch (IntNo) {
23883 default: return SDValue();
23884 // SSE/AVX/AVX2 blend intrinsics.
23885 case Intrinsic::x86_avx2_pblendvb:
23886 case Intrinsic::x86_avx2_pblendw:
23887 case Intrinsic::x86_avx2_pblendd_128:
23888 case Intrinsic::x86_avx2_pblendd_256:
23889 // Don't try to simplify this intrinsic if we don't have AVX2.
23890 if (!Subtarget->hasAVX2())
23893 case Intrinsic::x86_avx_blend_pd_256:
23894 case Intrinsic::x86_avx_blend_ps_256:
23895 case Intrinsic::x86_avx_blendv_pd_256:
23896 case Intrinsic::x86_avx_blendv_ps_256:
23897 // Don't try to simplify this intrinsic if we don't have AVX.
23898 if (!Subtarget->hasAVX())
23901 case Intrinsic::x86_sse41_pblendw:
23902 case Intrinsic::x86_sse41_blendpd:
23903 case Intrinsic::x86_sse41_blendps:
23904 case Intrinsic::x86_sse41_blendvps:
23905 case Intrinsic::x86_sse41_blendvpd:
23906 case Intrinsic::x86_sse41_pblendvb: {
23907 SDValue Op0 = N->getOperand(1);
23908 SDValue Op1 = N->getOperand(2);
23909 SDValue Mask = N->getOperand(3);
23911 // Don't try to simplify this intrinsic if we don't have SSE4.1.
23912 if (!Subtarget->hasSSE41())
23915 // fold (blend A, A, Mask) -> A
23918 // fold (blend A, B, allZeros) -> A
23919 if (ISD::isBuildVectorAllZeros(Mask.getNode()))
23921 // fold (blend A, B, allOnes) -> B
23922 if (ISD::isBuildVectorAllOnes(Mask.getNode()))
23925 // Simplify the case where the mask is a constant i32 value.
23926 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Mask)) {
23927 if (C->isNullValue())
23929 if (C->isAllOnesValue())
23936 // Packed SSE2/AVX2 arithmetic shift immediate intrinsics.
23937 case Intrinsic::x86_sse2_psrai_w:
23938 case Intrinsic::x86_sse2_psrai_d:
23939 case Intrinsic::x86_avx2_psrai_w:
23940 case Intrinsic::x86_avx2_psrai_d:
23941 case Intrinsic::x86_sse2_psra_w:
23942 case Intrinsic::x86_sse2_psra_d:
23943 case Intrinsic::x86_avx2_psra_w:
23944 case Intrinsic::x86_avx2_psra_d: {
23945 SDValue Op0 = N->getOperand(1);
23946 SDValue Op1 = N->getOperand(2);
23947 EVT VT = Op0.getValueType();
23948 assert(VT.isVector() && "Expected a vector type!");
23950 if (isa<BuildVectorSDNode>(Op1))
23951 Op1 = Op1.getOperand(0);
23953 if (!isa<ConstantSDNode>(Op1))
23956 EVT SVT = VT.getVectorElementType();
23957 unsigned SVTBits = SVT.getSizeInBits();
23959 ConstantSDNode *CND = cast<ConstantSDNode>(Op1);
23960 const APInt &C = APInt(SVTBits, CND->getAPIntValue().getZExtValue());
23961 uint64_t ShAmt = C.getZExtValue();
23963 // Don't try to convert this shift into a ISD::SRA if the shift
23964 // count is bigger than or equal to the element size.
23965 if (ShAmt >= SVTBits)
23968 // Trivial case: if the shift count is zero, then fold this
23969 // into the first operand.
23973 // Replace this packed shift intrinsic with a target independent shift node.
23975 SDValue Splat = DAG.getConstant(C, VT);
23976 return DAG.getNode(ISD::SRA, SDLoc(N), VT, Op0, Splat);
23981 /// PerformMulCombine - Optimize a single multiply with a constant into two
23982 /// multiplies in order to implement it with two cheaper instructions, e.g.
23983 /// LEA + SHL, LEA + LEA.
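/// For illustration (editor's note): MulAmt == 45 factors as 9 * 5 and lowers
/// to two LEAs, while MulAmt == 40 factors as 5 * 8 and lowers to a left shift
/// by 3 plus an LEA (the power-of-two factor is issued first unless the lone
/// use of the multiply is an add).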
23984 static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
23985 TargetLowering::DAGCombinerInfo &DCI) {
23986 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
23989 EVT VT = N->getValueType(0);
23990 if (VT != MVT::i64 && VT != MVT::i32)
23993 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
23996 uint64_t MulAmt = C->getZExtValue();
23997 if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
24000 uint64_t MulAmt1 = 0;
24001 uint64_t MulAmt2 = 0;
24002 if ((MulAmt % 9) == 0) {
24004 MulAmt2 = MulAmt / 9;
24005 } else if ((MulAmt % 5) == 0) {
24007 MulAmt2 = MulAmt / 5;
24008 } else if ((MulAmt % 3) == 0) {
24010 MulAmt2 = MulAmt / 3;
24013 (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
24016 if (isPowerOf2_64(MulAmt2) &&
24017 !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
24018 // If the second multiplier is a power of two, issue it first. We want the
24019 // multiply by 3, 5, or 9 to be folded into the addressing mode unless the
// lone use of the multiply is an add.
24021 std::swap(MulAmt1, MulAmt2);
24024 if (isPowerOf2_64(MulAmt1))
24025 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
24026 DAG.getConstant(Log2_64(MulAmt1), MVT::i8));
24028 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
24029 DAG.getConstant(MulAmt1, VT));
24031 if (isPowerOf2_64(MulAmt2))
24032 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
24033 DAG.getConstant(Log2_64(MulAmt2), MVT::i8));
24035 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
24036 DAG.getConstant(MulAmt2, VT));
24038 // Do not add new nodes to DAG combiner worklist.
24039 DCI.CombineTo(N, NewMul, false);
24044 static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
24045 SDValue N0 = N->getOperand(0);
24046 SDValue N1 = N->getOperand(1);
24047 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
24048 EVT VT = N0.getValueType();
24050 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
24051 // since the result of setcc_c is all zero's or all ones.
24052 if (VT.isInteger() && !VT.isVector() &&
24053 N1C && N0.getOpcode() == ISD::AND &&
24054 N0.getOperand(1).getOpcode() == ISD::Constant) {
24055 SDValue N00 = N0.getOperand(0);
24056 if (N00.getOpcode() == X86ISD::SETCC_CARRY ||
24057 ((N00.getOpcode() == ISD::ANY_EXTEND ||
24058 N00.getOpcode() == ISD::ZERO_EXTEND) &&
24059 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) {
24060 APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
24061 APInt ShAmt = N1C->getAPIntValue();
24062 Mask = Mask.shl(ShAmt);
24064 return DAG.getNode(ISD::AND, SDLoc(N), VT,
24065 N00, DAG.getConstant(Mask, VT));
24069 // Hardware support for vector shifts is sparse which makes us scalarize the
24070 // vector operations in many cases. Also, on Sandy Bridge ADD is faster than SHL:
24072 // (shl V, 1) -> add V,V
24073 if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
24074 if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
24075 assert(N0.getValueType().isVector() && "Invalid vector shift type");
24076 // We shift all of the values by one. In many cases we do not have
24077 // hardware support for this operation. This is better expressed as an ADD of two values.
24079 if (N1SplatC->getZExtValue() == 1)
24080 return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
24086 /// \brief Returns a vector of 0s if the input node is a vector logical
24087 /// shift by a constant amount which is known to be bigger than or equal
24088 /// to the vector element size in bits.
24089 static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
24090 const X86Subtarget *Subtarget) {
24091 EVT VT = N->getValueType(0);
24093 if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 &&
24094 (!Subtarget->hasInt256() ||
24095 (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16)))
24098 SDValue Amt = N->getOperand(1);
24100 if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Amt))
24101 if (auto *AmtSplat = AmtBV->getConstantSplatNode()) {
24102 APInt ShiftAmt = AmtSplat->getAPIntValue();
24103 unsigned MaxAmount = VT.getVectorElementType().getSizeInBits();
24105 // SSE2/AVX2 logical shifts always return a vector of 0s
24106 // if the shift amount is bigger than or equal to
24107 // the element size. The constant shift amount will be
24108 // encoded as an 8-bit immediate.
24109 if (ShiftAmt.trunc(8).uge(MaxAmount))
24110 return getZeroVector(VT, Subtarget, DAG, DL);
24116 /// PerformShiftCombine - Combine shifts.
24117 static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
24118 TargetLowering::DAGCombinerInfo &DCI,
24119 const X86Subtarget *Subtarget) {
24120 if (N->getOpcode() == ISD::SHL) {
24121 SDValue V = PerformSHLCombine(N, DAG);
24122 if (V.getNode()) return V;
24125 if (N->getOpcode() != ISD::SRA) {
24126 // Try to fold this logical shift into a zero vector.
24127 SDValue V = performShiftToAllZeros(N, DAG, Subtarget);
24128 if (V.getNode()) return V;
24134 // CMPEQCombine - Recognize the distinctive (AND (setcc ...) (setcc ..))
24135 // where both setccs reference the same FP CMP, and rewrite for CMPEQSS
24136 // and friends. Likewise for OR -> CMPNEQSS.
24137 static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG,
24138 TargetLowering::DAGCombinerInfo &DCI,
24139 const X86Subtarget *Subtarget) {
24142 // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
24143 // we're requiring SSE2 for both.
24144 if (Subtarget->hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
24145 SDValue N0 = N->getOperand(0);
24146 SDValue N1 = N->getOperand(1);
24147 SDValue CMP0 = N0->getOperand(1);
24148 SDValue CMP1 = N1->getOperand(1);
24151 // The SETCCs should both refer to the same CMP.
24152 if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
24155 SDValue CMP00 = CMP0->getOperand(0);
24156 SDValue CMP01 = CMP0->getOperand(1);
24157 EVT VT = CMP00.getValueType();
24159 if (VT == MVT::f32 || VT == MVT::f64) {
24160 bool ExpectingFlags = false;
24161 // Check for any users that want flags:
24162 for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
24163 !ExpectingFlags && UI != UE; ++UI)
24164 switch (UI->getOpcode()) {
24169 ExpectingFlags = true;
24171 case ISD::CopyToReg:
24172 case ISD::SIGN_EXTEND:
24173 case ISD::ZERO_EXTEND:
24174 case ISD::ANY_EXTEND:
24178 if (!ExpectingFlags) {
24179 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
24180 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
24182 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
24183 X86::CondCode tmp = cc0;
24188 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
24189 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
24190 // FIXME: need symbolic constants for these magic numbers.
24191 // See X86ATTInstPrinter.cpp:printSSECC().
24192 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
24193 if (Subtarget->hasAVX512()) {
24194 SDValue FSetCC = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CMP00,
24195 CMP01, DAG.getConstant(x86cc, MVT::i8));
24196 if (N->getValueType(0) != MVT::i1)
24197 return DAG.getNode(ISD::ZERO_EXTEND, DL, N->getValueType(0),
24201 SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
24202 CMP00.getValueType(), CMP00, CMP01,
24203 DAG.getConstant(x86cc, MVT::i8));
24205 bool is64BitFP = (CMP00.getValueType() == MVT::f64);
24206 MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
24208 if (is64BitFP && !Subtarget->is64Bit()) {
24209 // On a 32-bit target, we cannot bitcast the 64-bit float to a
24210 // 64-bit integer, since that's not a legal type. Since
24211 // OnesOrZeroesF is all ones or all zeroes, we don't need all the
24212 // bits, but can do this little dance to extract the lowest 32 bits
24213 // and work with those going forward.
24214 SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
24216 SDValue Vector32 = DAG.getNode(ISD::BITCAST, DL, MVT::v4f32,
24218 OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
24219 Vector32, DAG.getIntPtrConstant(0));
24223 SDValue OnesOrZeroesI = DAG.getNode(ISD::BITCAST, DL, IntVT, OnesOrZeroesF);
24224 SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
24225 DAG.getConstant(1, IntVT));
24226 SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ANDed);
24227 return OneBitOfTruth;
24235 /// CanFoldXORWithAllOnes - Test whether the XOR operand is an AllOnes vector
24236 /// so it can be folded inside ANDNP.
24237 static bool CanFoldXORWithAllOnes(const SDNode *N) {
24238 EVT VT = N->getValueType(0);
24240 // Match direct AllOnes for 128 and 256-bit vectors
24241 if (ISD::isBuildVectorAllOnes(N))
24244 // Look through a bit convert.
24245 if (N->getOpcode() == ISD::BITCAST)
24246 N = N->getOperand(0).getNode();
24248 // Sometimes the operand may come from an insert_subvector building a 256-bit AllOnes vector.
24250 if (VT.is256BitVector() &&
24251 N->getOpcode() == ISD::INSERT_SUBVECTOR) {
24252 SDValue V1 = N->getOperand(0);
24253 SDValue V2 = N->getOperand(1);
24255 if (V1.getOpcode() == ISD::INSERT_SUBVECTOR &&
24256 V1.getOperand(0).getOpcode() == ISD::UNDEF &&
24257 ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) &&
24258 ISD::isBuildVectorAllOnes(V2.getNode()))
24265 // On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
24266 // register. In most cases we actually compare or select YMM-sized registers
24267 // and mixing the two types creates horrible code. This method optimizes
24268 // some of the transition sequences.
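// Editor's sketch (assuming the common select-mask pattern): a sign_extend of
// (and (trunc Wide1), (trunc Wide2)), where the AND is performed on an
// XMM-sized type, is rewritten below so the AND happens on the original wide
// type, with a SIGN_EXTEND_INREG (or an AND mask for zero-extends) standing in
// for the trunc + re-extend round trip.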
24269 static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
24270 TargetLowering::DAGCombinerInfo &DCI,
24271 const X86Subtarget *Subtarget) {
24272 EVT VT = N->getValueType(0);
24273 if (!VT.is256BitVector())
24276 assert((N->getOpcode() == ISD::ANY_EXTEND ||
24277 N->getOpcode() == ISD::ZERO_EXTEND ||
24278 N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
24280 SDValue Narrow = N->getOperand(0);
24281 EVT NarrowVT = Narrow->getValueType(0);
24282 if (!NarrowVT.is128BitVector())
24285 if (Narrow->getOpcode() != ISD::XOR &&
24286 Narrow->getOpcode() != ISD::AND &&
24287 Narrow->getOpcode() != ISD::OR)
24290 SDValue N0 = Narrow->getOperand(0);
24291 SDValue N1 = Narrow->getOperand(1);
24294 // The Left side has to be a trunc.
24295 if (N0.getOpcode() != ISD::TRUNCATE)
24298 // The type of the truncated inputs.
24299 EVT WideVT = N0->getOperand(0)->getValueType(0);
24303 // The right side has to be a 'trunc' or a constant vector.
24304 bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE;
24305 ConstantSDNode *RHSConstSplat = nullptr;
24306 if (auto *RHSBV = dyn_cast<BuildVectorSDNode>(N1))
24307 RHSConstSplat = RHSBV->getConstantSplatNode();
24308 if (!RHSTrunc && !RHSConstSplat)
24311 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24313 if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), WideVT))
24316 // Set N0 and N1 to hold the inputs to the new wide operation.
24317 N0 = N0->getOperand(0);
24318 if (RHSConstSplat) {
24319 N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getScalarType(),
24320 SDValue(RHSConstSplat, 0));
24321 SmallVector<SDValue, 8> C(WideVT.getVectorNumElements(), N1);
24322 N1 = DAG.getNode(ISD::BUILD_VECTOR, DL, WideVT, C);
24323 } else if (RHSTrunc) {
24324 N1 = N1->getOperand(0);
24327 // Generate the wide operation.
24328 SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, WideVT, N0, N1);
24329 unsigned Opcode = N->getOpcode();
24331 case ISD::ANY_EXTEND:
24333 case ISD::ZERO_EXTEND: {
24334 unsigned InBits = NarrowVT.getScalarType().getSizeInBits();
24335 APInt Mask = APInt::getAllOnesValue(InBits);
24336 Mask = Mask.zext(VT.getScalarType().getSizeInBits());
24337 return DAG.getNode(ISD::AND, DL, VT,
24338 Op, DAG.getConstant(Mask, VT));
24340 case ISD::SIGN_EXTEND:
24341 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
24342 Op, DAG.getValueType(NarrowVT));
24344 llvm_unreachable("Unexpected opcode");
24348 static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
24349 TargetLowering::DAGCombinerInfo &DCI,
24350 const X86Subtarget *Subtarget) {
24351 EVT VT = N->getValueType(0);
24352 if (DCI.isBeforeLegalizeOps())
24355 SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget);
24359 // Create BEXTR instructions
24360 // BEXTR is ((X >> imm) & (2**size-1))
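// Editor's note: as an example, (and (srl x, 8), 0xff) has an 8-bit mask, so
// the control immediate built below is 8 | (8 << 8) == 0x0808, i.e. "start at
// bit 8, extract 8 bits".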
24361 if (VT == MVT::i32 || VT == MVT::i64) {
24362 SDValue N0 = N->getOperand(0);
24363 SDValue N1 = N->getOperand(1);
24366 // Check for BEXTR.
24367 if ((Subtarget->hasBMI() || Subtarget->hasTBM()) &&
24368 (N0.getOpcode() == ISD::SRA || N0.getOpcode() == ISD::SRL)) {
24369 ConstantSDNode *MaskNode = dyn_cast<ConstantSDNode>(N1);
24370 ConstantSDNode *ShiftNode = dyn_cast<ConstantSDNode>(N0.getOperand(1));
24371 if (MaskNode && ShiftNode) {
24372 uint64_t Mask = MaskNode->getZExtValue();
24373 uint64_t Shift = ShiftNode->getZExtValue();
24374 if (isMask_64(Mask)) {
24375 uint64_t MaskSize = CountPopulation_64(Mask);
24376 if (Shift + MaskSize <= VT.getSizeInBits())
24377 return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0),
24378 DAG.getConstant(Shift | (MaskSize << 8), VT));
24386 // Want to form ANDNP nodes:
24387 // 1) In the hopes of then easily combining them with OR and AND nodes
24388 // to form PBLEND/PSIGN.
24389 // 2) To match ANDN packed intrinsics
24390 if (VT != MVT::v2i64 && VT != MVT::v4i64)
24393 SDValue N0 = N->getOperand(0);
24394 SDValue N1 = N->getOperand(1);
24397 // Check LHS for vnot
24398 if (N0.getOpcode() == ISD::XOR &&
24399 //ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
24400 CanFoldXORWithAllOnes(N0.getOperand(1).getNode()))
24401 return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1);
24403 // Check RHS for vnot
24404 if (N1.getOpcode() == ISD::XOR &&
24405 //ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
24406 CanFoldXORWithAllOnes(N1.getOperand(1).getNode()))
24407 return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0);
24412 static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
24413 TargetLowering::DAGCombinerInfo &DCI,
24414 const X86Subtarget *Subtarget) {
24415 if (DCI.isBeforeLegalizeOps())
24418 SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget);
24422 SDValue N0 = N->getOperand(0);
24423 SDValue N1 = N->getOperand(1);
24424 EVT VT = N->getValueType(0);
24426 // look for psign/blend
24427 if (VT == MVT::v2i64 || VT == MVT::v4i64) {
24428 if (!Subtarget->hasSSSE3() ||
24429 (VT == MVT::v4i64 && !Subtarget->hasInt256()))
24432 // Canonicalize pandn to RHS
24433 if (N0.getOpcode() == X86ISD::ANDNP)
24435 // or (and (m, y), (pandn m, x))
24436 if (N0.getOpcode() == ISD::AND && N1.getOpcode() == X86ISD::ANDNP) {
24437 SDValue Mask = N1.getOperand(0);
24438 SDValue X = N1.getOperand(1);
24440 if (N0.getOperand(0) == Mask)
24441 Y = N0.getOperand(1);
24442 if (N0.getOperand(1) == Mask)
24443 Y = N0.getOperand(0);
24445 // Check to see if the mask appeared in both the AND and the ANDNP; give up otherwise.
24449 // Validate that X, Y, and Mask are BIT_CONVERTS, and see through them.
24450 // Look through mask bitcast.
24451 if (Mask.getOpcode() == ISD::BITCAST)
24452 Mask = Mask.getOperand(0);
24453 if (X.getOpcode() == ISD::BITCAST)
24454 X = X.getOperand(0);
24455 if (Y.getOpcode() == ISD::BITCAST)
24456 Y = Y.getOperand(0);
24458 EVT MaskVT = Mask.getValueType();
24460 // Validate that the Mask operand is a vector sra node.
24461 // FIXME: what to do for bytes, since there is a psignb/pblendvb, but
24462 // there is no psrai.b
24463 unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits();
24464 unsigned SraAmt = ~0;
24465 if (Mask.getOpcode() == ISD::SRA) {
24466 if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Mask.getOperand(1)))
24467 if (auto *AmtConst = AmtBV->getConstantSplatNode())
24468 SraAmt = AmtConst->getZExtValue();
24469 } else if (Mask.getOpcode() == X86ISD::VSRAI) {
24470 SDValue SraC = Mask.getOperand(1);
24471 SraAmt = cast<ConstantSDNode>(SraC)->getZExtValue();
24473 if ((SraAmt + 1) != EltBits)
24478 // Now we know we at least have a pblendvb with the mask val. See if
24479 // we can form a psignb/w/d.
24480 // psign = x.type == y.type == mask.type && y = sub(0, x);
24481 if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X &&
24482 ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) &&
24483 X.getValueType() == MaskVT && Y.getValueType() == MaskVT) {
24484 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
24485 "Unsupported VT for PSIGN");
24486 Mask = DAG.getNode(X86ISD::PSIGN, DL, MaskVT, X, Mask.getOperand(0));
24487 return DAG.getNode(ISD::BITCAST, DL, VT, Mask);
24489 // PBLENDVB is only available on SSE 4.1.
24490 if (!Subtarget->hasSSE41())
24493 EVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;
24495 X = DAG.getNode(ISD::BITCAST, DL, BlendVT, X);
24496 Y = DAG.getNode(ISD::BITCAST, DL, BlendVT, Y);
24497 Mask = DAG.getNode(ISD::BITCAST, DL, BlendVT, Mask);
24498 Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X);
24499 return DAG.getNode(ISD::BITCAST, DL, VT, Mask);
24503 if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
24506 // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
24507 MachineFunction &MF = DAG.getMachineFunction();
24508 bool OptForSize = MF.getFunction()->getAttributes().
24509 hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
24511 // SHLD/SHRD instructions have lower register pressure, but on some
24512 // platforms they have higher latency than the equivalent
24513 // series of shifts/or that would otherwise be generated.
24514 // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
24515 // have higher latencies and we are not optimizing for size.
24516 if (!OptForSize && Subtarget->isSHLDSlow())
24519 if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
24521 if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
24523 if (!N0.hasOneUse() || !N1.hasOneUse())
24526 SDValue ShAmt0 = N0.getOperand(1);
24527 if (ShAmt0.getValueType() != MVT::i8)
24529 SDValue ShAmt1 = N1.getOperand(1);
24530 if (ShAmt1.getValueType() != MVT::i8)
24532 if (ShAmt0.getOpcode() == ISD::TRUNCATE)
24533 ShAmt0 = ShAmt0.getOperand(0);
24534 if (ShAmt1.getOpcode() == ISD::TRUNCATE)
24535 ShAmt1 = ShAmt1.getOperand(0);
24538 unsigned Opc = X86ISD::SHLD;
24539 SDValue Op0 = N0.getOperand(0);
24540 SDValue Op1 = N1.getOperand(0);
24541 if (ShAmt0.getOpcode() == ISD::SUB) {
24542 Opc = X86ISD::SHRD;
24543 std::swap(Op0, Op1);
24544 std::swap(ShAmt0, ShAmt1);
24547 unsigned Bits = VT.getSizeInBits();
24548 if (ShAmt1.getOpcode() == ISD::SUB) {
24549 SDValue Sum = ShAmt1.getOperand(0);
24550 if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
24551 SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
24552 if (ShAmt1Op1.getNode()->getOpcode() == ISD::TRUNCATE)
24553 ShAmt1Op1 = ShAmt1Op1.getOperand(0);
24554 if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
24555 return DAG.getNode(Opc, DL, VT,
24557 DAG.getNode(ISD::TRUNCATE, DL,
24560 } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
24561 ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
24563 ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == Bits)
24564 return DAG.getNode(Opc, DL, VT,
24565 N0.getOperand(0), N1.getOperand(0),
24566 DAG.getNode(ISD::TRUNCATE, DL,
24573 // Generate NEG and CMOV for integer abs.
24574 static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
24575 EVT VT = N->getValueType(0);
24577 // Since X86 does not have CMOV for 8-bit integer, we don't convert
24578 // 8-bit integer abs to NEG and CMOV.
24579 if (VT.isInteger() && VT.getSizeInBits() == 8)
24582 SDValue N0 = N->getOperand(0);
24583 SDValue N1 = N->getOperand(1);
24586 // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
24587 // and change it to SUB and CMOV.
24588 if (VT.isInteger() && N->getOpcode() == ISD::XOR &&
24589 N0.getOpcode() == ISD::ADD &&
24590 N0.getOperand(1) == N1 &&
24591 N1.getOpcode() == ISD::SRA &&
24592 N1.getOperand(0) == N0.getOperand(0))
24593 if (ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1)))
24594 if (Y1C->getAPIntValue() == VT.getSizeInBits()-1) {
24595 // Generate SUB & CMOV.
24596 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
24597 DAG.getConstant(0, VT), N0.getOperand(0));
24599 SDValue Ops[] = { N0.getOperand(0), Neg,
24600 DAG.getConstant(X86::COND_GE, MVT::i8),
24601 SDValue(Neg.getNode(), 1) };
24602 return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), Ops);
24607 // PerformXorCombine - Attempts to turn XOR nodes into BLSMSK nodes
24608 static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG,
24609 TargetLowering::DAGCombinerInfo &DCI,
24610 const X86Subtarget *Subtarget) {
24611 if (DCI.isBeforeLegalizeOps())
24614 if (Subtarget->hasCMov()) {
24615 SDValue RV = performIntegerAbsCombine(N, DAG);
24623 /// PerformLOADCombine - Do target-specific dag combines on LOAD nodes.
24624 static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
24625 TargetLowering::DAGCombinerInfo &DCI,
24626 const X86Subtarget *Subtarget) {
24627 LoadSDNode *Ld = cast<LoadSDNode>(N);
24628 EVT RegVT = Ld->getValueType(0);
24629 EVT MemVT = Ld->getMemoryVT();
24631 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24633 // For chips with slow 32-byte unaligned loads, break the 32-byte operation
24634 // into two 16-byte operations.
24635 ISD::LoadExtType Ext = Ld->getExtensionType();
24636 unsigned Alignment = Ld->getAlignment();
24637 bool IsAligned = Alignment == 0 || Alignment >= MemVT.getSizeInBits()/8;
24638 if (RegVT.is256BitVector() && Subtarget->isUnalignedMem32Slow() &&
24639 !DCI.isBeforeLegalizeOps() && !IsAligned && Ext == ISD::NON_EXTLOAD) {
24640 unsigned NumElems = RegVT.getVectorNumElements();
24644 SDValue Ptr = Ld->getBasePtr();
24645 SDValue Increment = DAG.getConstant(16, TLI.getPointerTy());
24647 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
24649 SDValue Load1 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
24650 Ld->getPointerInfo(), Ld->isVolatile(),
24651 Ld->isNonTemporal(), Ld->isInvariant(),
24653 Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
24654 SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
24655 Ld->getPointerInfo(), Ld->isVolatile(),
24656 Ld->isNonTemporal(), Ld->isInvariant(),
24657 std::min(16U, Alignment));
24658 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
24660 Load2.getValue(1));
24662 SDValue NewVec = DAG.getUNDEF(RegVT);
24663 NewVec = Insert128BitVector(NewVec, Load1, 0, DAG, dl);
24664 NewVec = Insert128BitVector(NewVec, Load2, NumElems/2, DAG, dl);
24665 return DCI.CombineTo(N, NewVec, TF, true);
24671 /// PerformMLOADCombine - Resolve extending loads
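/// Editor's note: e.g. a sign-extending masked load of v8i16 into v8i32 is
/// rewritten below as a masked load producing a v16i16 value (with the mask
/// and pass-through widened to match) followed by an X86ISD::VSEXT that
/// sign-extends the low eight elements to v8i32.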
24672 static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG,
24673 TargetLowering::DAGCombinerInfo &DCI,
24674 const X86Subtarget *Subtarget) {
24675 MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N);
24676 if (Mld->getExtensionType() != ISD::SEXTLOAD)
24679 EVT VT = Mld->getValueType(0);
24680 unsigned NumElems = VT.getVectorNumElements();
24681 EVT LdVT = Mld->getMemoryVT();
24684 assert(LdVT != VT && "Cannot extend to the same type");
24685 unsigned ToSz = VT.getVectorElementType().getSizeInBits();
24686 unsigned FromSz = LdVT.getVectorElementType().getSizeInBits();
24687 // From/To sizes and ElemCount must be powers of two.
24688 assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
24689 "Unexpected size for extending masked load");
24691 unsigned SizeRatio = ToSz / FromSz;
24692 assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits());
24694 // Create a type on which we perform the shuffle
24695 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
24696 LdVT.getScalarType(), NumElems*SizeRatio);
24697 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
24699 // Convert Src0 value
24700 SDValue WideSrc0 = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mld->getSrc0());
24701 if (Mld->getSrc0().getOpcode() != ISD::UNDEF) {
24702 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
24703 for (unsigned i = 0; i != NumElems; ++i)
24704 ShuffleVec[i] = i * SizeRatio;
24706 // Can't shuffle using an illegal type.
24707 assert (DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT)
24708 && "WideVecVT should be legal");
24709 WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0,
24710 DAG.getUNDEF(WideVecVT), &ShuffleVec[0]);
24712 // Prepare the new mask
24714 SDValue Mask = Mld->getMask();
24715 if (Mask.getValueType() == VT) {
24716 // Mask and original value have the same type
24717 NewMask = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mask);
24718 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
24719 for (unsigned i = 0; i != NumElems; ++i)
24720 ShuffleVec[i] = i * SizeRatio;
24721 for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
24722 ShuffleVec[i] = NumElems*SizeRatio;
24723 NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
24724 DAG.getConstant(0, WideVecVT),
24728 assert(Mask.getValueType().getVectorElementType() == MVT::i1);
24729 unsigned WidenNumElts = NumElems*SizeRatio;
24730 unsigned MaskNumElts = VT.getVectorNumElements();
24731 EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
24734 unsigned NumConcat = WidenNumElts / MaskNumElts;
24735 SmallVector<SDValue, 16> Ops(NumConcat);
24736 SDValue ZeroVal = DAG.getConstant(0, Mask.getValueType());
24738 for (unsigned i = 1; i != NumConcat; ++i)
24741 NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
24744 SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(),
24745 Mld->getBasePtr(), NewMask, WideSrc0,
24746 Mld->getMemoryVT(), Mld->getMemOperand(),
24748 SDValue NewVec = DAG.getNode(X86ISD::VSEXT, dl, VT, WideLd);
24749 return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
24752 /// PerformMSTORECombine - Resolve truncating stores
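/// Editor's note: this mirrors the masked-load case above; e.g. a v8i32 value
/// truncated to v8i16 in memory is shuffled so the narrowed elements occupy
/// the low lanes of a v16i16 vector, which is then written through the widened
/// mask as an ordinary (non-truncating) masked store.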
24753 static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG,
24754 const X86Subtarget *Subtarget) {
24755 MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
24756 if (!Mst->isTruncatingStore())
24759 EVT VT = Mst->getValue().getValueType();
24760 unsigned NumElems = VT.getVectorNumElements();
24761 EVT StVT = Mst->getMemoryVT();
24764 assert(StVT != VT && "Cannot truncate to the same type");
24765 unsigned FromSz = VT.getVectorElementType().getSizeInBits();
24766 unsigned ToSz = StVT.getVectorElementType().getSizeInBits();
24768 // From/To sizes and ElemCount must be powers of two.
24769 assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
24770 "Unexpected size for truncating masked store");
24771 // We are going to use the original vector elt for storing.
24772 // Accumulated smaller vector elements must be a multiple of the store size.
24773 assert (((NumElems * FromSz) % ToSz) == 0 &&
24774 "Unexpected ratio for truncating masked store");
24776 unsigned SizeRatio = FromSz / ToSz;
24777 assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
24779 // Create a type on which we perform the shuffle
24780 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
24781 StVT.getScalarType(), NumElems*SizeRatio);
24783 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
24785 SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mst->getValue());
24786 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
24787 for (unsigned i = 0; i != NumElems; ++i)
24788 ShuffleVec[i] = i * SizeRatio;
24790 // Can't shuffle using an illegal type.
24791 assert (DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT)
24792 && "WideVecVT should be legal");
24794 SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
24795 DAG.getUNDEF(WideVecVT),
24799 SDValue Mask = Mst->getMask();
24800 if (Mask.getValueType() == VT) {
24801 // Mask and original value have the same type
24802 NewMask = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mask);
24803 for (unsigned i = 0; i != NumElems; ++i)
24804 ShuffleVec[i] = i * SizeRatio;
24805 for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
24806 ShuffleVec[i] = NumElems*SizeRatio;
24807 NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
24808 DAG.getConstant(0, WideVecVT),
24812 assert(Mask.getValueType().getVectorElementType() == MVT::i1);
24813 unsigned WidenNumElts = NumElems*SizeRatio;
24814 unsigned MaskNumElts = VT.getVectorNumElements();
24815 EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
24818 unsigned NumConcat = WidenNumElts / MaskNumElts;
24819 SmallVector<SDValue, 16> Ops(NumConcat);
24820 SDValue ZeroVal = DAG.getConstant(0, Mask.getValueType());
24822 for (unsigned i = 1; i != NumConcat; ++i)
24825 NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
24828 return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal, Mst->getBasePtr(),
24829 NewMask, StVT, Mst->getMemOperand(), false);
24831 /// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
24832 static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
24833 const X86Subtarget *Subtarget) {
24834 StoreSDNode *St = cast<StoreSDNode>(N);
24835 EVT VT = St->getValue().getValueType();
24836 EVT StVT = St->getMemoryVT();
24838 SDValue StoredVal = St->getOperand(1);
24839 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24841 // If we are saving a concatenation of two XMM registers and 32-byte stores
24842 // are slow, such as on Sandy Bridge, perform two 16-byte stores.
24843 unsigned Alignment = St->getAlignment();
24844 bool IsAligned = Alignment == 0 || Alignment >= VT.getSizeInBits()/8;
24845 if (VT.is256BitVector() && Subtarget->isUnalignedMem32Slow() &&
24846 StVT == VT && !IsAligned) {
24847 unsigned NumElems = VT.getVectorNumElements();
24851 SDValue Value0 = Extract128BitVector(StoredVal, 0, DAG, dl);
24852 SDValue Value1 = Extract128BitVector(StoredVal, NumElems/2, DAG, dl);
24854 SDValue Stride = DAG.getConstant(16, TLI.getPointerTy());
24855 SDValue Ptr0 = St->getBasePtr();
24856 SDValue Ptr1 = DAG.getNode(ISD::ADD, dl, Ptr0.getValueType(), Ptr0, Stride);
24858 SDValue Ch0 = DAG.getStore(St->getChain(), dl, Value0, Ptr0,
24859 St->getPointerInfo(), St->isVolatile(),
24860 St->isNonTemporal(), Alignment);
24861 SDValue Ch1 = DAG.getStore(St->getChain(), dl, Value1, Ptr1,
24862 St->getPointerInfo(), St->isVolatile(),
24863 St->isNonTemporal(),
24864 std::min(16U, Alignment));
24865 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
24868 // Optimize trunc store (of multiple scalars) to shuffle and store.
24869 // First, pack all of the elements in one place. Next, store to memory
24870 // in fewer chunks.
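// Editor's note: e.g. a v8i32 value truncated to v8i16 in memory is shuffled
// below so that the eight 16-bit results sit in the low half of a v16i16
// register, and those 128 useful bits are then written with the widest legal
// scalar stores (two i64 stores on a 64-bit target, for instance).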
24871 if (St->isTruncatingStore() && VT.isVector()) {
24872 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24873 unsigned NumElems = VT.getVectorNumElements();
24874 assert(StVT != VT && "Cannot truncate to the same type");
24875 unsigned FromSz = VT.getVectorElementType().getSizeInBits();
24876 unsigned ToSz = StVT.getVectorElementType().getSizeInBits();
24878 // From/To sizes and ElemCount must be powers of two.
24879 if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
24880 // We are going to use the original vector elt for storing.
24881 // Accumulated smaller vector elements must be a multiple of the store size.
24882 if (0 != (NumElems * FromSz) % ToSz) return SDValue();
24884 unsigned SizeRatio = FromSz / ToSz;
24886 assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
24888 // Create a type on which we perform the shuffle
24889 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
24890 StVT.getScalarType(), NumElems*SizeRatio);
24892 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
24894 SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, St->getValue());
24895 SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
24896 for (unsigned i = 0; i != NumElems; ++i)
24897 ShuffleVec[i] = i * SizeRatio;
24899 // Can't shuffle using an illegal type.
24900 if (!TLI.isTypeLegal(WideVecVT))
24903 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
24904 DAG.getUNDEF(WideVecVT),
24906 // At this point all of the data is stored at the bottom of the
24907 // register. We now need to save it to memory.
24909 // Find the largest store unit
24910 MVT StoreType = MVT::i8;
24911 for (MVT Tp : MVT::integer_valuetypes()) {
24912 if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
24916 // On 32-bit systems, we can't store 64-bit integers directly. Try bitcasting to f64.
24917 if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
24918 (64 <= NumElems * ToSz))
24919 StoreType = MVT::f64;
24921 // Bitcast the original vector into a vector of store-size units
24922 EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
24923 StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
24924 assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
24925 SDValue ShuffWide = DAG.getNode(ISD::BITCAST, dl, StoreVecVT, Shuff);
24926 SmallVector<SDValue, 8> Chains;
24927 SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8,
24928 TLI.getPointerTy());
24929 SDValue Ptr = St->getBasePtr();
24931 // Perform one or more big stores into memory.
24932 for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) {
24933 SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
24934 StoreType, ShuffWide,
24935 DAG.getIntPtrConstant(i));
24936 SDValue Ch = DAG.getStore(St->getChain(), dl, SubVec, Ptr,
24937 St->getPointerInfo(), St->isVolatile(),
24938 St->isNonTemporal(), St->getAlignment());
24939 Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
24940 Chains.push_back(Ch);
24943 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
24946 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
24947 // the FP state in cases where an emms may be missing.
24948 // A preferable solution to the general problem is to figure out the right
24949 // places to insert EMMS. This qualifies as a quick hack.
24951 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
24952 if (VT.getSizeInBits() != 64)
24955 const Function *F = DAG.getMachineFunction().getFunction();
24956 bool NoImplicitFloatOps = F->getAttributes().
24957 hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat);
24958 bool F64IsLegal = !DAG.getTarget().Options.UseSoftFloat && !NoImplicitFloatOps
24959 && Subtarget->hasSSE2();
24960 if ((VT.isVector() ||
24961 (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) &&
24962 isa<LoadSDNode>(St->getValue()) &&
24963 !cast<LoadSDNode>(St->getValue())->isVolatile() &&
24964 St->getChain().hasOneUse() && !St->isVolatile()) {
24965 SDNode* LdVal = St->getValue().getNode();
24966 LoadSDNode *Ld = nullptr;
24967 int TokenFactorIndex = -1;
24968 SmallVector<SDValue, 8> Ops;
24969 SDNode* ChainVal = St->getChain().getNode();
24970 // Must be a store of a load. We currently handle two cases: the load
24971 // is a direct child, or it's under an intervening TokenFactor. It is
24972 // possible to dig deeper under nested TokenFactors.
24973 if (ChainVal == LdVal)
24974 Ld = cast<LoadSDNode>(St->getChain());
24975 else if (St->getValue().hasOneUse() &&
24976 ChainVal->getOpcode() == ISD::TokenFactor) {
24977 for (unsigned i = 0, e = ChainVal->getNumOperands(); i != e; ++i) {
24978 if (ChainVal->getOperand(i).getNode() == LdVal) {
24979 TokenFactorIndex = i;
24980 Ld = cast<LoadSDNode>(St->getValue());
24982 Ops.push_back(ChainVal->getOperand(i));
24986 if (!Ld || !ISD::isNormalLoad(Ld))
24989 // If this is not the MMX case, i.e. we are just turning i64 load/store
24990 // into f64 load/store, avoid the transformation if there are multiple
24991 // uses of the loaded value.
24992 if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
24997 // If we are a 64-bit capable x86, lower to a single movq load/store pair.
24998 // Otherwise, if it's legal to use f64 SSE instructions, use an f64 load/store pair instead.
25000 if (Subtarget->is64Bit() || F64IsLegal) {
25001 EVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64;
25002 SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
25003 Ld->getPointerInfo(), Ld->isVolatile(),
25004 Ld->isNonTemporal(), Ld->isInvariant(),
25005 Ld->getAlignment());
25006 SDValue NewChain = NewLd.getValue(1);
25007 if (TokenFactorIndex != -1) {
25008 Ops.push_back(NewChain);
25009 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
25011 return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
25012 St->getPointerInfo(),
25013 St->isVolatile(), St->isNonTemporal(),
25014 St->getAlignment());
25017 // Otherwise, lower to two pairs of 32-bit loads / stores.
25018 SDValue LoAddr = Ld->getBasePtr();
25019 SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr,
25020 DAG.getConstant(4, MVT::i32));
25022 SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
25023 Ld->getPointerInfo(),
25024 Ld->isVolatile(), Ld->isNonTemporal(),
25025 Ld->isInvariant(), Ld->getAlignment());
25026 SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
25027 Ld->getPointerInfo().getWithOffset(4),
25028 Ld->isVolatile(), Ld->isNonTemporal(),
25030 MinAlign(Ld->getAlignment(), 4));
25032 SDValue NewChain = LoLd.getValue(1);
25033 if (TokenFactorIndex != -1) {
25034 Ops.push_back(LoLd);
25035 Ops.push_back(HiLd);
25036 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
25039 LoAddr = St->getBasePtr();
25040 HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr,
25041 DAG.getConstant(4, MVT::i32));
25043 SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr,
25044 St->getPointerInfo(),
25045 St->isVolatile(), St->isNonTemporal(),
25046 St->getAlignment());
25047 SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr,
25048 St->getPointerInfo().getWithOffset(4),
25050 St->isNonTemporal(),
25051 MinAlign(St->getAlignment(), 4));
25052 return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
25057 /// Return 'true' if this vector operation is "horizontal"
25058 /// and return the operands for the horizontal operation in LHS and RHS. A
25059 /// horizontal operation performs the binary operation on successive elements
25060 /// of its first operand, then on successive elements of its second operand,
25061 /// returning the resulting values in a vector. For example, if
25062 /// A = < float a0, float a1, float a2, float a3 >
25064 /// B = < float b0, float b1, float b2, float b3 >
25065 /// then the result of doing a horizontal operation on A and B is
25066 /// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
25067 /// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
25068 /// A horizontal-op B, for some already available A and B, and if so then LHS is
25069 /// set to A, RHS to B, and the routine returns 'true'.
25070 /// Note that the binary operation should have the property that if one of the
25071 /// operands is UNDEF then the result is UNDEF.
25072 static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
25073 // Look for the following pattern: if
25074 // A = < float a0, float a1, float a2, float a3 >
25075 // B = < float b0, float b1, float b2, float b3 >
25077 // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
25078 // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
25079 // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
25080 // which is A horizontal-op B.
25082 // At least one of the operands should be a vector shuffle.
25083 if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
25084 RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
25087 MVT VT = LHS.getSimpleValueType();
25089 assert((VT.is128BitVector() || VT.is256BitVector()) &&
25090 "Unsupported vector type for horizontal add/sub");
25092 // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to
25093 // operate independently on 128-bit lanes.
25094 unsigned NumElts = VT.getVectorNumElements();
25095 unsigned NumLanes = VT.getSizeInBits()/128;
25096 unsigned NumLaneElts = NumElts / NumLanes;
25097 assert((NumLaneElts % 2 == 0) &&
25098 "Vector type should have an even number of elements in each lane");
25099 unsigned HalfLaneElts = NumLaneElts/2;
25101 // View LHS in the form
25102 // LHS = VECTOR_SHUFFLE A, B, LMask
25103 // If LHS is not a shuffle then pretend it is the shuffle
25104 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
25105 // NOTE: in what follows a default initialized SDValue represents an UNDEF of type VT.
25108 SmallVector<int, 16> LMask(NumElts);
25109 if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
25110 if (LHS.getOperand(0).getOpcode() != ISD::UNDEF)
25111 A = LHS.getOperand(0);
25112 if (LHS.getOperand(1).getOpcode() != ISD::UNDEF)
25113 B = LHS.getOperand(1);
25114 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
25115 std::copy(Mask.begin(), Mask.end(), LMask.begin());
25117 if (LHS.getOpcode() != ISD::UNDEF)
25119 for (unsigned i = 0; i != NumElts; ++i)
25123 // Likewise, view RHS in the form
25124 // RHS = VECTOR_SHUFFLE C, D, RMask
25126 SmallVector<int, 16> RMask(NumElts);
25127 if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
25128 if (RHS.getOperand(0).getOpcode() != ISD::UNDEF)
25129 C = RHS.getOperand(0);
25130 if (RHS.getOperand(1).getOpcode() != ISD::UNDEF)
25131 D = RHS.getOperand(1);
25132 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
25133 std::copy(Mask.begin(), Mask.end(), RMask.begin());
25135 if (RHS.getOpcode() != ISD::UNDEF)
25137 for (unsigned i = 0; i != NumElts; ++i)
25141 // Check that the shuffles are both shuffling the same vectors.
25142 if (!(A == C && B == D) && !(A == D && B == C))
25145 // If everything is UNDEF then bail out: it would be better to fold to UNDEF.
25146 if (!A.getNode() && !B.getNode())
25149 // If A and B occur in reverse order in RHS, then "swap" them (which means
25150 // rewriting the mask).
25152 CommuteVectorShuffleMask(RMask, NumElts);
25154 // At this point LHS and RHS are equivalent to
25155 // LHS = VECTOR_SHUFFLE A, B, LMask
25156 // RHS = VECTOR_SHUFFLE A, B, RMask
25157 // Check that the masks correspond to performing a horizontal operation.
25158 for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
25159 for (unsigned i = 0; i != NumLaneElts; ++i) {
25160 int LIdx = LMask[i+l], RIdx = RMask[i+l];
25162 // Ignore any UNDEF components.
25163 if (LIdx < 0 || RIdx < 0 ||
25164 (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
25165 (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
25168 // Check that successive elements are being operated on. If not, this is
25169 // not a horizontal operation.
25170 unsigned Src = (i/HalfLaneElts); // each lane is split between srcs
25171 int Index = 2*(i%HalfLaneElts) + NumElts*Src + l;
25172 if (!(LIdx == Index && RIdx == Index + 1) &&
25173 !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
25178 LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
25179 RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
25183 /// Do target-specific dag combines on floating point adds.
25184 static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG,
25185 const X86Subtarget *Subtarget) {
25186 EVT VT = N->getValueType(0);
25187 SDValue LHS = N->getOperand(0);
25188 SDValue RHS = N->getOperand(1);
25190 // Try to synthesize horizontal adds from adds of shuffles.
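// e.g. (fadd (shuffle A, B, <0,2,4,6>), (shuffle A, B, <1,3,5,7>)) -> (fhadd A, B)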
25191 if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
25192 (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
25193 isHorizontalBinOp(LHS, RHS, true))
25194 return DAG.getNode(X86ISD::FHADD, SDLoc(N), VT, LHS, RHS);
25198 /// Do target-specific dag combines on floating point subs.
25199 static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG,
25200 const X86Subtarget *Subtarget) {
25201 EVT VT = N->getValueType(0);
25202 SDValue LHS = N->getOperand(0);
25203 SDValue RHS = N->getOperand(1);
25205 // Try to synthesize horizontal subs from subs of shuffles.
25206 if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
25207 (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
25208 isHorizontalBinOp(LHS, RHS, false))
25209 return DAG.getNode(X86ISD::FHSUB, SDLoc(N), VT, LHS, RHS);
25213 /// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
25214 static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) {
25215 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
25216 // F[X]OR(0.0, x) -> x
25217 // F[X]OR(x, 0.0) -> x
25218 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
25219 if (C->getValueAPF().isPosZero())
25220 return N->getOperand(1);
25221 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
25222 if (C->getValueAPF().isPosZero())
25223 return N->getOperand(0);
25227 /// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
25228 static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) {
25229 assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
25231 // Only perform optimizations if UnsafeMath is used.
25232 if (!DAG.getTarget().Options.UnsafeFPMath)
25235 // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
25236 // into FMINC and FMAXC, which are Commutative operations.
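// (The plain FMIN/FMAX nodes keep x86's asymmetric semantics: when both operands
// are zero or either is NaN, the second operand is returned, so swapping the
// operands is only safe once unsafe-math relaxes those guarantees.)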
25237 unsigned NewOp = 0;
25238 switch (N->getOpcode()) {
25239 default: llvm_unreachable("unknown opcode");
25240 case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
25241 case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
25244 return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
25245 N->getOperand(0), N->getOperand(1));
25248 /// Do target-specific dag combines on X86ISD::FAND nodes.
25249 static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
25250 // FAND(0.0, x) -> 0.0
25251 // FAND(x, 0.0) -> 0.0
25252 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
25253 if (C->getValueAPF().isPosZero())
25254 return N->getOperand(0);
25255 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
25256 if (C->getValueAPF().isPosZero())
25257 return N->getOperand(1);
25261 /// Do target-specific dag combines on X86ISD::FANDN nodes
25262 static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG) {
25263 // FANDN(x, 0.0) -> 0.0
25264 // FANDN(0.0, x) -> x
25265 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
25266 if (C->getValueAPF().isPosZero())
25267 return N->getOperand(1);
25268 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
25269 if (C->getValueAPF().isPosZero())
25270 return N->getOperand(1);
25274 static SDValue PerformBTCombine(SDNode *N,
25276 TargetLowering::DAGCombinerInfo &DCI) {
25277 // BT ignores high bits in the bit index operand.
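// Only the low log2(BitWidth) bits of the index are demanded, so try to shrink
// a constant index or otherwise simplify the operand based on those bits.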
25278 SDValue Op1 = N->getOperand(1);
25279 if (Op1.hasOneUse()) {
25280 unsigned BitWidth = Op1.getValueSizeInBits();
25281 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
25282 APInt KnownZero, KnownOne;
25283 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
25284 !DCI.isBeforeLegalizeOps());
25285 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25286 if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) ||
25287 TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO))
25288 DCI.CommitTargetLoweringOpt(TLO);
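// Fold away a VZEXT_MOVL whose (possibly bitcast) input is a VZEXT_LOAD: the
// load already zero-fills the upper elements, so when the element widths match
// a plain bitcast of the load suffices.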
25293 static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) {
25294 SDValue Op = N->getOperand(0);
25295 if (Op.getOpcode() == ISD::BITCAST)
25296 Op = Op.getOperand(0);
25297 EVT VT = N->getValueType(0), OpVT = Op.getValueType();
25298 if (Op.getOpcode() == X86ISD::VZEXT_LOAD &&
25299 VT.getVectorElementType().getSizeInBits() ==
25300 OpVT.getVectorElementType().getSizeInBits()) {
25301 return DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
25306 static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG,
25307 const X86Subtarget *Subtarget) {
25308 EVT VT = N->getValueType(0);
25309 if (!VT.isVector())
25312 SDValue N0 = N->getOperand(0);
25313 SDValue N1 = N->getOperand(1);
25314 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
25317 // The SIGN_EXTEND_INREG to v4i64 is an expensive operation on both
25318 // SSE and AVX2, since there is no sign-extending shift right
25319 // operation on a vector with 64-bit elements.
25320 // (sext_in_reg (v4i64 anyext (v4i32 x)), ExtraVT) ->
25321 //   (v4i64 sext (v4i32 sext_in_reg (v4i32 x, ExtraVT)))
25322 if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
25323 N0.getOpcode() == ISD::SIGN_EXTEND)) {
25324 SDValue N00 = N0.getOperand(0);
25326 // EXTLOAD has a better solution on AVX2,
25327 // it may be replaced with X86ISD::VSEXT node.
25328 if (N00.getOpcode() == ISD::LOAD && Subtarget->hasInt256())
25329 if (!ISD::isNormalLoad(N00.getNode()))
25332 if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
25333 SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
25335 return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
25341 static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
25342 TargetLowering::DAGCombinerInfo &DCI,
25343 const X86Subtarget *Subtarget) {
25344 SDValue N0 = N->getOperand(0);
25345 EVT VT = N->getValueType(0);
25347 // (i8,i32 sext (sdivrem (i8 x, i8 y))) ->
25348 // (i8,i32 (sdivrem_sext_hreg (i8 x, i8 y)))
25349 // This exposes the sext to the sdivrem lowering, so that it directly extends
25350 // from AH (which we otherwise need to do contortions to access).
25351 if (N0.getOpcode() == ISD::SDIVREM && N0.getResNo() == 1 &&
25352 N0.getValueType() == MVT::i8 && VT == MVT::i32) {
25354 SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
25355 SDValue R = DAG.getNode(X86ISD::SDIVREM8_SEXT_HREG, dl, NodeTys,
25356 N0.getOperand(0), N0.getOperand(1));
25357 DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
25358 return R.getValue(1);
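// Beyond the sdivrem case, the only combine attempted before op legalization on
// AVX targets is widening the mask arithmetic that feeds a 256-bit sign-extend
// (see WidenMaskArithmetic).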
25361 if (!DCI.isBeforeLegalizeOps())
25364 if (!Subtarget->hasFp256())
25367 if (VT.isVector() && VT.getSizeInBits() == 256) {
25368 SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget);
25376 static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG,
25377 const X86Subtarget* Subtarget) {
25379 EVT VT = N->getValueType(0);
25381 // Let legalize expand this if it isn't a legal type yet.
25382 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
25385 EVT ScalarVT = VT.getScalarType();
25386 if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
25387 (!Subtarget->hasFMA() && !Subtarget->hasFMA4()))
25390 SDValue A = N->getOperand(0);
25391 SDValue B = N->getOperand(1);
25392 SDValue C = N->getOperand(2);
25394 bool NegA = (A.getOpcode() == ISD::FNEG);
25395 bool NegB = (B.getOpcode() == ISD::FNEG);
25396 bool NegC = (C.getOpcode() == ISD::FNEG);
25398 // Negative multiplication when NegA xor NegB
25399 bool NegMul = (NegA != NegB);
25401 A = A.getOperand(0);
25403 B = B.getOperand(0);
25405 C = C.getOperand(0);
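// Pick the fused opcode: a negated product (NegA xor NegB) selects the
// FNMADD/FNMSUB forms, and a negated addend C turns ADD into SUB.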
25409 Opcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB;
25411 Opcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;
25413 return DAG.getNode(Opcode, dl, VT, A, B, C);
25416 static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG,
25417 TargetLowering::DAGCombinerInfo &DCI,
25418 const X86Subtarget *Subtarget) {
25419 // (i32 zext (and (i8 x86isd::setcc_carry), 1)) ->
25420 // (and (i32 x86isd::setcc_carry), 1)
25421 // This eliminates the zext. This transformation is necessary because
25422 // ISD::SETCC is always legalized to i8.
25424 SDValue N0 = N->getOperand(0);
25425 EVT VT = N->getValueType(0);
25427 if (N0.getOpcode() == ISD::AND &&
25429 N0.getOperand(0).hasOneUse()) {
25430 SDValue N00 = N0.getOperand(0);
25431 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
25432 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
25433 if (!C || C->getZExtValue() != 1)
25435 return DAG.getNode(ISD::AND, dl, VT,
25436 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
25437 N00.getOperand(0), N00.getOperand(1)),
25438 DAG.getConstant(1, VT));
25442 if (N0.getOpcode() == ISD::TRUNCATE &&
25444 N0.getOperand(0).hasOneUse()) {
25445 SDValue N00 = N0.getOperand(0);
25446 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
25447 return DAG.getNode(ISD::AND, dl, VT,
25448 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
25449 N00.getOperand(0), N00.getOperand(1)),
25450 DAG.getConstant(1, VT));
25453 if (VT.is256BitVector()) {
25454 SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget);
25459 // (i8,i32 zext (udivrem (i8 x, i8 y))) ->
25460 // (i8,i32 (udivrem_zext_hreg (i8 x, i8 y)))
25461 // This exposes the zext to the udivrem lowering, so that it directly extends
25462 // from AH (which we otherwise need to do contortions to access).
25463 if (N0.getOpcode() == ISD::UDIVREM &&
25464 N0.getResNo() == 1 && N0.getValueType() == MVT::i8 &&
25465 (VT == MVT::i32 || VT == MVT::i64)) {
25466 SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
25467 SDValue R = DAG.getNode(X86ISD::UDIVREM8_ZEXT_HREG, dl, NodeTys,
25468 N0.getOperand(0), N0.getOperand(1));
25469 DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
25470 return R.getValue(1);
25476 // Optimize x == -y --> x+y == 0
25477 // x != -y --> x+y != 0
25478 static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG,
25479 const X86Subtarget* Subtarget) {
25480 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
25481 SDValue LHS = N->getOperand(0);
25482 SDValue RHS = N->getOperand(1);
25483 EVT VT = N->getValueType(0);
25486 if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB)
25487 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(LHS.getOperand(0)))
25488 if (C->getAPIntValue() == 0 && LHS.hasOneUse()) {
25489 SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N),
25490 LHS.getValueType(), RHS, LHS.getOperand(1));
25491 return DAG.getSetCC(SDLoc(N), N->getValueType(0),
25492 addV, DAG.getConstant(0, addV.getValueType()), CC);
25494 if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB)
25495 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS.getOperand(0)))
25496 if (C->getAPIntValue() == 0 && RHS.hasOneUse()) {
25497 SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N),
25498 RHS.getValueType(), LHS, RHS.getOperand(1));
25499 return DAG.getSetCC(SDLoc(N), N->getValueType(0),
25500 addV, DAG.getConstant(0, addV.getValueType()), CC);
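// For i1 (mask) results, a compare of a sign-extended i1 value against
// all-zeros folds away: (seteq sext(x), 0) -> not(x), (setne sext(x), 0) -> x.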
25503 if (VT.getScalarType() == MVT::i1) {
25504 bool IsSEXT0 = (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
25505 (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
25506 bool IsVZero0 = ISD::isBuildVectorAllZeros(LHS.getNode());
25507 if (!IsSEXT0 && !IsVZero0)
25509 bool IsSEXT1 = (RHS.getOpcode() == ISD::SIGN_EXTEND) &&
25510 (RHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
25511 bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
25513 if (!IsSEXT1 && !IsVZero1)
25516 if (IsSEXT0 && IsVZero1) {
25517 assert(VT == LHS.getOperand(0).getValueType() && "Unexpected operand type");
25518 if (CC == ISD::SETEQ)
25519 return DAG.getNOT(DL, LHS.getOperand(0), VT);
25520 return LHS.getOperand(0);
25522 if (IsSEXT1 && IsVZero0) {
25523 assert(VT == RHS.getOperand(0).getValueType() && "Unexpected operand type");
25524 if (CC == ISD::SETEQ)
25525 return DAG.getNOT(DL, RHS.getOperand(0), VT);
25526 return RHS.getOperand(0);
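// If the second operand of an X86ISD::INSERTPS is a foldable load, narrow the
// load to the single f32 element insertps actually reads and rebuild the node
// around a scalar_to_vector of that element.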
25533 static SDValue PerformINSERTPSCombine(SDNode *N, SelectionDAG &DAG,
25534 const X86Subtarget *Subtarget) {
25536 MVT VT = N->getOperand(1)->getSimpleValueType(0);
25537 assert((VT == MVT::v4f32 || VT == MVT::v4i32) &&
25538 "X86insertps is only defined for v4x32");
25540 SDValue Ld = N->getOperand(1);
25541 if (MayFoldLoad(Ld)) {
25542 // Extract the countS bits from the immediate so we can get the proper
25543 // address when narrowing the vector load to a specific element.
25544 // When the second source op is a memory address, insertps doesn't use
25545 // countS and just gets an f32 from that address.
25546 unsigned DestIndex =
25547 cast<ConstantSDNode>(N->getOperand(2))->getZExtValue() >> 6;
25548 Ld = NarrowVectorLoadToElement(cast<LoadSDNode>(Ld), DestIndex, DAG);
25552 // Create this as a scalar to vector to match the instruction pattern.
25553 SDValue LoadScalarToVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Ld);
25554 // countS bits are ignored when loading from memory on insertps, which
25555 // means we don't need to explicitly set them to 0.
25556 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N->getOperand(0),
25557 LoadScalarToVector, N->getOperand(2));
25560 // Helper function for PerformSETCCCombine. It materializes "setb reg"
25561 // as "sbb reg,reg", since sbb can be extended without a zext and produces
25562 // an all-ones value, which is more useful than 0/1 in some cases.
25563 static SDValue MaterializeSETB(SDLoc DL, SDValue EFLAGS, SelectionDAG &DAG,
25566 return DAG.getNode(ISD::AND, DL, VT,
25567 DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
25568 DAG.getConstant(X86::COND_B, MVT::i8), EFLAGS),
25569 DAG.getConstant(1, VT));
25570 assert (VT == MVT::i1 && "Unexpected type for SETCC node");
25571 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1,
25572 DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
25573 DAG.getConstant(X86::COND_B, MVT::i8), EFLAGS));
25576 // Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
25577 static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG,
25578 TargetLowering::DAGCombinerInfo &DCI,
25579 const X86Subtarget *Subtarget) {
25581 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
25582 SDValue EFLAGS = N->getOperand(1);
25584 if (CC == X86::COND_A) {
25585 // Try to convert COND_A into COND_B in an attempt to facilitate
25586 // materializing "setb reg".
25588 // Do not flip "e > c", where "c" is a constant, because Cmp instruction
25589 // cannot take an immediate as its first operand.
25591 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
25592 EFLAGS.getValueType().isInteger() &&
25593 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
25594 SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
25595 EFLAGS.getNode()->getVTList(),
25596 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
25597 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
25598 return MaterializeSETB(DL, NewEFLAGS, DAG, N->getSimpleValueType(0));
25602 // Materialize "setb reg" as "sbb reg,reg", since it can be extended without
25603 // a zext and produces an all-ones bit which is more useful than 0/1 in some
25605 if (CC == X86::COND_B)
25606 return MaterializeSETB(DL, EFLAGS, DAG, N->getSimpleValueType(0));
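// Otherwise, try to simplify the EFLAGS-producing node; if that succeeds,
// rebuild the SETCC on the simplified flags.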
25610 Flags = checkBoolTestSetCCCombine(EFLAGS, CC);
25611 if (Flags.getNode()) {
25612 SDValue Cond = DAG.getConstant(CC, MVT::i8);
25613 return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags);
25619 // Optimize branch condition evaluation.
25621 static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG,
25622 TargetLowering::DAGCombinerInfo &DCI,
25623 const X86Subtarget *Subtarget) {
25625 SDValue Chain = N->getOperand(0);
25626 SDValue Dest = N->getOperand(1);
25627 SDValue EFLAGS = N->getOperand(3);
25628 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
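// Try to simplify the EFLAGS-producing node; if that succeeds, rebuild the
// branch on the simplified flags.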
25632 Flags = checkBoolTestSetCCCombine(EFLAGS, CC);
25633 if (Flags.getNode()) {
25634 SDValue Cond = DAG.getConstant(CC, MVT::i8);
25635 return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), Chain, Dest, Cond,
25642 static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
25643 SelectionDAG &DAG) {
25644 // Take advantage of vector comparisons producing 0 or -1 in each lane to
25645 // optimize away the unary operation when its input is a compare ANDed with a constant.
25647 // The general transformation is:
25648 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
25649 // AND(VECTOR_CMP(x,y), constant2)
25650 // constant2 = UNARYOP(constant)
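// This is valid because each lane of the compare result is all-ones or
// all-zeros, so the AND either passes a constant lane through or yields zero,
// and the unary ops handled here map zero to an all-zero bit pattern
// (e.g. sint_to_fp: 0 -> +0.0).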
25652 // Early exit if this isn't a vector operation, the operand of the
25653 // unary operation isn't a bitwise AND, or if the sizes of the operations
25654 // aren't the same.
25655 EVT VT = N->getValueType(0);
25656 if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
25657 N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
25658 VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
25661 // Now check that the other operand of the AND is a constant. We could
25662 // make the transformation for non-constant splats as well, but it's unclear
25663 // that would be a benefit as it would not eliminate any operations, just
25664 // perform one more step in scalar code before moving to the vector unit.
25665 if (BuildVectorSDNode *BV =
25666 dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
25667 // Bail out if the vector isn't a constant.
25668 if (!BV->isConstant())
25671 // Everything checks out. Build up the new and improved node.
25673 EVT IntVT = BV->getValueType(0);
25674 // Create a new constant of the appropriate type for the transformed op.
25676 SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
25677 // The AND node needs bitcasts to/from an integer vector type around it.
25678 SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
25679 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
25680 N->getOperand(0)->getOperand(0), MaskConst);
25681 SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
25688 static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
25689 const X86TargetLowering *XTLI) {
25690 // First try to optimize away the conversion entirely when it's
25691 // conditionally from a constant. Vectors only.
25692 SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG);
25693 if (Res != SDValue())
25696 // Now move on to more general possibilities.
25697 SDValue Op0 = N->getOperand(0);
25698 EVT InVT = Op0->getValueType(0);
25700 // SINT_TO_FP(v4i8) -> SINT_TO_FP(SEXT(v4i8 to v4i32))
25701 if (InVT == MVT::v8i8 || InVT == MVT::v4i8) {
25703 MVT DstVT = InVT == MVT::v4i8 ? MVT::v4i32 : MVT::v8i32;
25704 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
25705 return DAG.getNode(ISD::SINT_TO_FP, dl, N->getValueType(0), P);
25708 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
25709 // a 32-bit target where SSE doesn't support i64->FP operations.
25710 if (Op0.getOpcode() == ISD::LOAD) {
25711 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
25712 EVT VT = Ld->getValueType(0);
25713 if (!Ld->isVolatile() && !N->getValueType(0).isVector() &&
25714 ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
25715 !XTLI->getSubtarget()->is64Bit() &&
25717 SDValue FILDChain = XTLI->BuildFILD(SDValue(N, 0), Ld->getValueType(0),
25718 Ld->getChain(), Op0, DAG);
25719 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
25726 // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
25727 static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG,
25728 X86TargetLowering::DAGCombinerInfo &DCI) {
25729 // If the LHS and RHS of the ADC node are zero, then it can't overflow and
25730 // the result is either zero or one (depending on the input carry bit).
25731 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
25732 if (X86::isZeroNode(N->getOperand(0)) &&
25733 X86::isZeroNode(N->getOperand(1)) &&
25734 // We don't have a good way to replace an EFLAGS use, so only do this when
25736 SDValue(N, 1).use_empty()) {
25738 EVT VT = N->getValueType(0);
25739 SDValue CarryOut = DAG.getConstant(0, N->getValueType(1));
25740 SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
25741 DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
25742 DAG.getConstant(X86::COND_B,MVT::i8),
25744 DAG.getConstant(1, VT));
25745 return DCI.CombineTo(N, Res1, CarryOut);
25751 // fold (add Y, (sete X, 0)) -> adc 0, Y
25752 // (add Y, (setne X, 0)) -> sbb -1, Y
25753 // (sub (sete X, 0), Y) -> sbb 0, Y
25754 // (sub (setne X, 0), Y) -> adc -1, Y
25755 static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) {
25758 // Look through ZExts.
25759 SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 1 : 0);
25760 if (Ext.getOpcode() != ISD::ZERO_EXTEND || !Ext.hasOneUse())
25763 SDValue SetCC = Ext.getOperand(0);
25764 if (SetCC.getOpcode() != X86ISD::SETCC || !SetCC.hasOneUse())
25767 X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0);
25768 if (CC != X86::COND_E && CC != X86::COND_NE)
25771 SDValue Cmp = SetCC.getOperand(1);
25772 if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
25773 !X86::isZeroNode(Cmp.getOperand(1)) ||
25774 !Cmp.getOperand(0).getValueType().isInteger())
25777 SDValue CmpOp0 = Cmp.getOperand(0);
25778 SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0,
25779 DAG.getConstant(1, CmpOp0.getValueType()));
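// Comparing X against 1 sets the carry flag exactly when X == 0 (unsigned
// X < 1), so the sete/setne result can be consumed directly as a carry by the
// ADC/SBB built below.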
25781 SDValue OtherVal = N->getOperand(N->getOpcode() == ISD::SUB ? 0 : 1);
25782 if (CC == X86::COND_NE)
25783 return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::ADC : X86ISD::SBB,
25784 DL, OtherVal.getValueType(), OtherVal,
25785 DAG.getConstant(-1ULL, OtherVal.getValueType()), NewCmp);
25786 return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::SBB : X86ISD::ADC,
25787 DL, OtherVal.getValueType(), OtherVal,
25788 DAG.getConstant(0, OtherVal.getValueType()), NewCmp);
25791 /// PerformAddCombine - Do target-specific dag combines on integer adds.
25792 static SDValue PerformAddCombine(SDNode *N, SelectionDAG &DAG,
25793 const X86Subtarget *Subtarget) {
25794 EVT VT = N->getValueType(0);
25795 SDValue Op0 = N->getOperand(0);
25796 SDValue Op1 = N->getOperand(1);
25798 // Try to synthesize horizontal adds from adds of shuffles.
25799 if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
25800 (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
25801 isHorizontalBinOp(Op0, Op1, true))
25802 return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);
25804 return OptimizeConditionalInDecrement(N, DAG);
25807 static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG,
25808 const X86Subtarget *Subtarget) {
25809 SDValue Op0 = N->getOperand(0);
25810 SDValue Op1 = N->getOperand(1);
25812 // X86 can't encode an immediate LHS of a sub. See if we can push the
25813 // negation into a preceding instruction.
25814 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
25815 // If the RHS of the sub is a XOR with one use and a constant, invert the
25816 // immediate. Then add one to the LHS of the sub so we can turn
25817 // X-Y -> X+~Y+1, saving one register.
25818 if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
25819 isa<ConstantSDNode>(Op1.getOperand(1))) {
25820 APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
25821 EVT VT = Op0.getValueType();
25822 SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
25824 DAG.getConstant(~XorC, VT));
25825 return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
25826 DAG.getConstant(C->getAPIntValue()+1, VT));
25830 // Try to synthesize horizontal subs from subs of shuffles.
25831 EVT VT = N->getValueType(0);
25832 if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
25833 (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
25834 isHorizontalBinOp(Op0, Op1, true))
25835 return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);
25837 return OptimizeConditionalInDecrement(N, DAG);
25840 /// performVZEXTCombine - Do target-specific dag combines on X86ISD::VZEXT nodes.
25841 static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG,
25842 TargetLowering::DAGCombinerInfo &DCI,
25843 const X86Subtarget *Subtarget) {
25845 MVT VT = N->getSimpleValueType(0);
25846 SDValue Op = N->getOperand(0);
25847 MVT OpVT = Op.getSimpleValueType();
25848 MVT OpEltVT = OpVT.getVectorElementType();
25849 unsigned InputBits = OpEltVT.getSizeInBits() * VT.getVectorNumElements();
25851 // (vzext (bitcast (vzext x))) -> (vzext x)
25853 while (V.getOpcode() == ISD::BITCAST)
25854 V = V.getOperand(0);
25856 if (V != Op && V.getOpcode() == X86ISD::VZEXT) {
25857 MVT InnerVT = V.getSimpleValueType();
25858 MVT InnerEltVT = InnerVT.getVectorElementType();
25860 // If the element sizes match exactly, we can just do one larger vzext. This
25861 // is always an exact type match as vzext operates on integer types.
25862 if (OpEltVT == InnerEltVT) {
25863 assert(OpVT == InnerVT && "Types must match for vzext!");
25864 return DAG.getNode(X86ISD::VZEXT, DL, VT, V.getOperand(0));
25867 // The only other way we can combine them is if only a single element of the
25868 // inner vzext is used in the input to the outer vzext.
25869 if (InnerEltVT.getSizeInBits() < InputBits)
25872 // In this case, the inner vzext is completely dead because we're going to
25873 // only look at bits inside of the low element. Just do the outer vzext on
25874 // a bitcast of the input to the inner.
25875 return DAG.getNode(X86ISD::VZEXT, DL, VT,
25876 DAG.getNode(ISD::BITCAST, DL, OpVT, V));
25879 // Check if we can bypass extracting and re-inserting an element of an input
25880 // vector. Essentially:
25881 // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
25882 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
25883 V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
25884 V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) {
25885 SDValue ExtractedV = V.getOperand(0);
25886 SDValue OrigV = ExtractedV.getOperand(0);
25887 if (auto *ExtractIdx = dyn_cast<ConstantSDNode>(ExtractedV.getOperand(1)))
25888 if (ExtractIdx->getZExtValue() == 0) {
25889 MVT OrigVT = OrigV.getSimpleValueType();
25890 // Extract a subvector if necessary...
25891 if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) {
25892 int Ratio = OrigVT.getSizeInBits() / OpVT.getSizeInBits();
25893 OrigVT = MVT::getVectorVT(OrigVT.getVectorElementType(),
25894 OrigVT.getVectorNumElements() / Ratio);
25895 OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV,
25896 DAG.getIntPtrConstant(0));
25898 Op = DAG.getNode(ISD::BITCAST, DL, OpVT, OrigV);
25899 return DAG.getNode(X86ISD::VZEXT, DL, VT, Op);
25906 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
25907 DAGCombinerInfo &DCI) const {
25908 SelectionDAG &DAG = DCI.DAG;
25909 switch (N->getOpcode()) {
25911 case ISD::EXTRACT_VECTOR_ELT:
25912 return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, DCI);
25915 case X86ISD::SHRUNKBLEND:
25916 return PerformSELECTCombine(N, DAG, DCI, Subtarget);
25917 case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI, Subtarget);
25918 case ISD::ADD: return PerformAddCombine(N, DAG, Subtarget);
25919 case ISD::SUB: return PerformSubCombine(N, DAG, Subtarget);
25920 case X86ISD::ADC: return PerformADCCombine(N, DAG, DCI);
25921 case ISD::MUL: return PerformMulCombine(N, DAG, DCI);
25924 case ISD::SRL: return PerformShiftCombine(N, DAG, DCI, Subtarget);
25925 case ISD::AND: return PerformAndCombine(N, DAG, DCI, Subtarget);
25926 case ISD::OR: return PerformOrCombine(N, DAG, DCI, Subtarget);
25927 case ISD::XOR: return PerformXorCombine(N, DAG, DCI, Subtarget);
25928 case ISD::LOAD: return PerformLOADCombine(N, DAG, DCI, Subtarget);
25929 case ISD::MLOAD: return PerformMLOADCombine(N, DAG, DCI, Subtarget);
25930 case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget);
25931 case ISD::MSTORE: return PerformMSTORECombine(N, DAG, Subtarget);
25932 case ISD::SINT_TO_FP: return PerformSINT_TO_FPCombine(N, DAG, this);
25933 case ISD::FADD: return PerformFADDCombine(N, DAG, Subtarget);
25934 case ISD::FSUB: return PerformFSUBCombine(N, DAG, Subtarget);
25936 case X86ISD::FOR: return PerformFORCombine(N, DAG);
25938 case X86ISD::FMAX: return PerformFMinFMaxCombine(N, DAG);
25939 case X86ISD::FAND: return PerformFANDCombine(N, DAG);
25940 case X86ISD::FANDN: return PerformFANDNCombine(N, DAG);
25941 case X86ISD::BT: return PerformBTCombine(N, DAG, DCI);
25942 case X86ISD::VZEXT_MOVL: return PerformVZEXT_MOVLCombine(N, DAG);
25943 case ISD::ANY_EXTEND:
25944 case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG, DCI, Subtarget);
25945 case ISD::SIGN_EXTEND: return PerformSExtCombine(N, DAG, DCI, Subtarget);
25946 case ISD::SIGN_EXTEND_INREG:
25947 return PerformSIGN_EXTEND_INREGCombine(N, DAG, Subtarget);
25948 case ISD::TRUNCATE: return PerformTruncateCombine(N, DAG,DCI,Subtarget);
25949 case ISD::SETCC: return PerformISDSETCCCombine(N, DAG, Subtarget);
25950 case X86ISD::SETCC: return PerformSETCCCombine(N, DAG, DCI, Subtarget);
25951 case X86ISD::BRCOND: return PerformBrCondCombine(N, DAG, DCI, Subtarget);
25952 case X86ISD::VZEXT: return performVZEXTCombine(N, DAG, DCI, Subtarget);
25953 case X86ISD::SHUFP: // Handle all target specific shuffles
25954 case X86ISD::PALIGNR:
25955 case X86ISD::UNPCKH:
25956 case X86ISD::UNPCKL:
25957 case X86ISD::MOVHLPS:
25958 case X86ISD::MOVLHPS:
25959 case X86ISD::PSHUFB:
25960 case X86ISD::PSHUFD:
25961 case X86ISD::PSHUFHW:
25962 case X86ISD::PSHUFLW:
25963 case X86ISD::MOVSS:
25964 case X86ISD::MOVSD:
25965 case X86ISD::VPERMILPI:
25966 case X86ISD::VPERM2X128:
25967 case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget);
25968 case ISD::FMA: return PerformFMACombine(N, DAG, Subtarget);
25969 case ISD::INTRINSIC_WO_CHAIN:
25970 return PerformINTRINSIC_WO_CHAINCombine(N, DAG, Subtarget);
25971 case X86ISD::INSERTPS: {
25972 if (getTargetMachine().getOptLevel() > CodeGenOpt::None)
25973 return PerformINSERTPSCombine(N, DAG, Subtarget);
25976 case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DAG, Subtarget);
25982 /// isTypeDesirableForOp - Return true if the target has native support for
25983 /// the specified value type and it is 'desirable' to use the type for the
25984 /// given node type. e.g. On x86 i16 is legal, but undesirable since i16
25985 /// instruction encodings are longer and some i16 instructions are slow.
25986 bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
25987 if (!isTypeLegal(VT))
25989 if (VT != MVT::i16)
25996 case ISD::SIGN_EXTEND:
25997 case ISD::ZERO_EXTEND:
25998 case ISD::ANY_EXTEND:
26011 /// IsDesirableToPromoteOp - This method queries the target whether it is
26012 /// beneficial for the dag combiner to promote the specified node. If true, it
26013 /// should return the desired promotion type by reference.
26014 bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
26015 EVT VT = Op.getValueType();
26016 if (VT != MVT::i16)
26019 bool Promote = false;
26020 bool Commute = false;
26021 switch (Op.getOpcode()) {
26024 LoadSDNode *LD = cast<LoadSDNode>(Op);
26025 // If the non-extending load has a single use and it's not live out, then it
26026 // might be folded.
26027 if (LD->getExtensionType() == ISD::NON_EXTLOAD /*&&
26028 Op.hasOneUse()*/) {
26029 for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
26030 UE = Op.getNode()->use_end(); UI != UE; ++UI) {
26031 // The only case where we'd want to promote LOAD (rather than having it
26032 // promoted as an operand) is when its only use is a liveout.
26033 if (UI->getOpcode() != ISD::CopyToReg)
26040 case ISD::SIGN_EXTEND:
26041 case ISD::ZERO_EXTEND:
26042 case ISD::ANY_EXTEND:
26047 SDValue N0 = Op.getOperand(0);
26048 // Look out for (store (shl (load), x)).
26049 if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
26062 SDValue N0 = Op.getOperand(0);
26063 SDValue N1 = Op.getOperand(1);
26064 if (!Commute && MayFoldLoad(N1))
26066 // Avoid disabling potential load folding opportunities.
26067 if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
26069 if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
26079 //===----------------------------------------------------------------------===//
26080 // X86 Inline Assembly Support
26081 //===----------------------------------------------------------------------===//
26084 // Helper to match a string separated by whitespace.
26085 bool matchAsmImpl(StringRef s, ArrayRef<const StringRef *> args) {
26086 s = s.substr(s.find_first_not_of(" \t")); // Skip leading whitespace.
26088 for (unsigned i = 0, e = args.size(); i != e; ++i) {
26089 StringRef piece(*args[i]);
26090 if (!s.startswith(piece)) // Check if the piece matches.
26093 s = s.substr(piece.size());
26094 StringRef::size_type pos = s.find_first_not_of(" \t");
26095 if (pos == 0) // We matched a prefix.
26103 const VariadicFunction1<bool, StringRef, StringRef, matchAsmImpl> matchAsm={};
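// Check whether the inline asm's clobber list names the flag registers
// (~{cc}, ~{flags} and ~{fpsr}, plus ~{dirflag} in the four-entry form); the
// bswap expansion below only fires when these clobbers are declared.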
26106 static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
26108 if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
26109 if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
26110 std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
26111 std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {
26113 if (AsmPieces.size() == 3)
26115 else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
26122 bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
26123 InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
26125 std::string AsmStr = IA->getAsmString();
26127 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
26128 if (!Ty || Ty->getBitWidth() % 16 != 0)
26131 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
26132 SmallVector<StringRef, 4> AsmPieces;
26133 SplitString(AsmStr, AsmPieces, ";\n");
26135 switch (AsmPieces.size()) {
26136 default: return false;
26138 // FIXME: this should verify that we are targeting a 486 or better. If not,
26139 // we will turn this bswap into something that will be lowered to logical
26140 // ops instead of emitting the bswap asm. For now, we don't support 486 or
26141 // lower so don't worry about this.
26143 if (matchAsm(AsmPieces[0], "bswap", "$0") ||
26144 matchAsm(AsmPieces[0], "bswapl", "$0") ||
26145 matchAsm(AsmPieces[0], "bswapq", "$0") ||
26146 matchAsm(AsmPieces[0], "bswap", "${0:q}") ||
26147 matchAsm(AsmPieces[0], "bswapl", "${0:q}") ||
26148 matchAsm(AsmPieces[0], "bswapq", "${0:q}")) {
26149 // No need to check constraints, nothing other than the equivalent of
26150 // "=r,0" would be valid here.
26151 return IntrinsicLowering::LowerToByteSwap(CI);
26154 // rorw $$8, ${0:w} --> llvm.bswap.i16
26155 if (CI->getType()->isIntegerTy(16) &&
26156 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
26157 (matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") ||
26158 matchAsm(AsmPieces[0], "rolw", "$$8,", "${0:w}"))) {
26160 const std::string &ConstraintsStr = IA->getConstraintString();
26161 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
26162 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
26163 if (clobbersFlagRegisters(AsmPieces))
26164 return IntrinsicLowering::LowerToByteSwap(CI);
26168 if (CI->getType()->isIntegerTy(32) &&
26169 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
26170 matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") &&
26171 matchAsm(AsmPieces[1], "rorl", "$$16,", "$0") &&
26172 matchAsm(AsmPieces[2], "rorw", "$$8,", "${0:w}")) {
26174 const std::string &ConstraintsStr = IA->getConstraintString();
26175 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
26176 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
26177 if (clobbersFlagRegisters(AsmPieces))
26178 return IntrinsicLowering::LowerToByteSwap(CI);
26181 if (CI->getType()->isIntegerTy(64)) {
26182 InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
26183 if (Constraints.size() >= 2 &&
26184 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
26185 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
26186 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
26187 if (matchAsm(AsmPieces[0], "bswap", "%eax") &&
26188 matchAsm(AsmPieces[1], "bswap", "%edx") &&
26189 matchAsm(AsmPieces[2], "xchgl", "%eax,", "%edx"))
26190 return IntrinsicLowering::LowerToByteSwap(CI);
26198 /// getConstraintType - Given a constraint letter, return the type of
26199 /// constraint it is for this target.
26200 X86TargetLowering::ConstraintType
26201 X86TargetLowering::getConstraintType(const std::string &Constraint) const {
26202 if (Constraint.size() == 1) {
26203 switch (Constraint[0]) {
26214 return C_RegisterClass;
26238 return TargetLowering::getConstraintType(Constraint);
26241 /// Examine constraint type and operand type and determine a weight value.
26242 /// This object must already have been set up with the operand type
26243 /// and the current alternative constraint selected.
26244 TargetLowering::ConstraintWeight
26245 X86TargetLowering::getSingleConstraintMatchWeight(
26246 AsmOperandInfo &info, const char *constraint) const {
26247 ConstraintWeight weight = CW_Invalid;
26248 Value *CallOperandVal = info.CallOperandVal;
26249 // If we don't have a value, we can't do a match,
26250 // but allow it at the lowest weight.
26251 if (!CallOperandVal)
26253 Type *type = CallOperandVal->getType();
26254 // Look at the constraint type.
26255 switch (*constraint) {
26257 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
26268 if (CallOperandVal->getType()->isIntegerTy())
26269 weight = CW_SpecificReg;
26274 if (type->isFloatingPointTy())
26275 weight = CW_SpecificReg;
26278 if (type->isX86_MMXTy() && Subtarget->hasMMX())
26279 weight = CW_SpecificReg;
26283 if (((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasSSE1()) ||
26284 ((type->getPrimitiveSizeInBits() == 256) && Subtarget->hasFp256()))
26285 weight = CW_Register;
26288 if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
26289 if (C->getZExtValue() <= 31)
26290 weight = CW_Constant;
26294 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26295 if (C->getZExtValue() <= 63)
26296 weight = CW_Constant;
26300 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26301 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
26302 weight = CW_Constant;
26306 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26307 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
26308 weight = CW_Constant;
26312 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26313 if (C->getZExtValue() <= 3)
26314 weight = CW_Constant;
26318 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26319 if (C->getZExtValue() <= 0xff)
26320 weight = CW_Constant;
26325 if (dyn_cast<ConstantFP>(CallOperandVal)) {
26326 weight = CW_Constant;
26330 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26331 if ((C->getSExtValue() >= -0x80000000LL) &&
26332 (C->getSExtValue() <= 0x7fffffffLL))
26333 weight = CW_Constant;
26337 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26338 if (C->getZExtValue() <= 0xffffffff)
26339 weight = CW_Constant;
26346 /// LowerXConstraint - try to replace an X constraint, which matches anything,
26347 /// with another that has more specific requirements based on the type of the
26348 /// corresponding operand.
26349 const char *X86TargetLowering::
26350 LowerXConstraint(EVT ConstraintVT) const {
26351 // FP X constraints get lowered to SSE1/2 registers if available, otherwise
26352 // 'f' like normal targets.
26353 if (ConstraintVT.isFloatingPoint()) {
26354 if (Subtarget->hasSSE2())
26356 if (Subtarget->hasSSE1())
26360 return TargetLowering::LowerXConstraint(ConstraintVT);
26363 /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
26364 /// vector. If it is invalid, don't add anything to Ops.
26365 void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
26366 std::string &Constraint,
26367 std::vector<SDValue>&Ops,
26368 SelectionDAG &DAG) const {
26371 // Only support length 1 constraints for now.
26372 if (Constraint.length() > 1) return;
26374 char ConstraintLetter = Constraint[0];
26375 switch (ConstraintLetter) {
26378 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26379 if (C->getZExtValue() <= 31) {
26380 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26386 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26387 if (C->getZExtValue() <= 63) {
26388 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26394 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26395 if (isInt<8>(C->getSExtValue())) {
26396 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26402 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26403 if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
26404 (Subtarget->is64Bit() && C->getZExtValue() == 0xffffffff)) {
26405 Result = DAG.getTargetConstant(C->getSExtValue(), Op.getValueType());
26411 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26412 if (C->getZExtValue() <= 3) {
26413 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26419 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26420 if (C->getZExtValue() <= 255) {
26421 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26427 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26428 if (C->getZExtValue() <= 127) {
26429 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26435 // 32-bit signed value
26436 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26437 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
26438 C->getSExtValue())) {
26439 // Widen to 64 bits here to get it sign extended.
26440 Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64);
26443 // FIXME gcc accepts some relocatable values here too, but only in certain
26444 // memory models; it's complicated.
26449 // 32-bit unsigned value
26450 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26451 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
26452 C->getZExtValue())) {
26453 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26457 // FIXME gcc accepts some relocatable values here too, but only in certain
26458 // memory models; it's complicated.
26462 // Literal immediates are always ok.
26463 if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
26464 // Widen to 64 bits here to get it sign extended.
26465 Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64);
26469 // In any sort of PIC mode addresses need to be computed at runtime by
26470 // adding in a register or some sort of table lookup. These can't
26471 // be used as immediates.
26472 if (Subtarget->isPICStyleGOT() || Subtarget->isPICStyleStubPIC())
26475 // If we are in non-pic codegen mode, we allow the address of a global (with
26476 // an optional displacement) to be used with 'i'.
26477 GlobalAddressSDNode *GA = nullptr;
26478 int64_t Offset = 0;
26480 // Match either (GA), (GA+C), (GA+C1+C2), etc.
26482 if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
26483 Offset += GA->getOffset();
26485 } else if (Op.getOpcode() == ISD::ADD) {
26486 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
26487 Offset += C->getZExtValue();
26488 Op = Op.getOperand(0);
26491 } else if (Op.getOpcode() == ISD::SUB) {
26492 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
26493 Offset += -C->getZExtValue();
26494 Op = Op.getOperand(0);
26499 // Otherwise, this isn't something we can handle, reject it.
26503 const GlobalValue *GV = GA->getGlobal();
26504 // If we require an extra load to get this address, as in PIC mode, we
26505 // can't accept it.
26506 if (isGlobalStubReference(
26507 Subtarget->ClassifyGlobalReference(GV, DAG.getTarget())))
26510 Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
26511 GA->getValueType(0), Offset);
26516 if (Result.getNode()) {
26517 Ops.push_back(Result);
26520 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
26523 std::pair<unsigned, const TargetRegisterClass*>
26524 X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
26526 // First, see if this is a constraint that directly corresponds to an LLVM register class.
26528 if (Constraint.size() == 1) {
26529 // GCC Constraint Letters
26530 switch (Constraint[0]) {
26532 // TODO: Slight differences here in allocation order and leaving
26533 // RIP in the class. Do they matter any more here than they do
26534 // in the normal allocation?
26535 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
26536 if (Subtarget->is64Bit()) {
26537 if (VT == MVT::i32 || VT == MVT::f32)
26538 return std::make_pair(0U, &X86::GR32RegClass);
26539 if (VT == MVT::i16)
26540 return std::make_pair(0U, &X86::GR16RegClass);
26541 if (VT == MVT::i8 || VT == MVT::i1)
26542 return std::make_pair(0U, &X86::GR8RegClass);
26543 if (VT == MVT::i64 || VT == MVT::f64)
26544 return std::make_pair(0U, &X86::GR64RegClass);
26547 // 32-bit fallthrough
26548 case 'Q': // Q_REGS
26549 if (VT == MVT::i32 || VT == MVT::f32)
26550 return std::make_pair(0U, &X86::GR32_ABCDRegClass);
26551 if (VT == MVT::i16)
26552 return std::make_pair(0U, &X86::GR16_ABCDRegClass);
26553 if (VT == MVT::i8 || VT == MVT::i1)
26554 return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
26555 if (VT == MVT::i64)
26556 return std::make_pair(0U, &X86::GR64_ABCDRegClass);
26558 case 'r': // GENERAL_REGS
26559 case 'l': // INDEX_REGS
26560 if (VT == MVT::i8 || VT == MVT::i1)
26561 return std::make_pair(0U, &X86::GR8RegClass);
26562 if (VT == MVT::i16)
26563 return std::make_pair(0U, &X86::GR16RegClass);
26564 if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget->is64Bit())
26565 return std::make_pair(0U, &X86::GR32RegClass);
26566 return std::make_pair(0U, &X86::GR64RegClass);
26567 case 'R': // LEGACY_REGS
26568 if (VT == MVT::i8 || VT == MVT::i1)
26569 return std::make_pair(0U, &X86::GR8_NOREXRegClass);
26570 if (VT == MVT::i16)
26571 return std::make_pair(0U, &X86::GR16_NOREXRegClass);
26572 if (VT == MVT::i32 || !Subtarget->is64Bit())
26573 return std::make_pair(0U, &X86::GR32_NOREXRegClass);
26574 return std::make_pair(0U, &X86::GR64_NOREXRegClass);
26575 case 'f': // FP Stack registers.
26576 // If SSE is enabled for this VT, use f80 to ensure the isel moves the
26577 // value to the correct fpstack register class.
26578 if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
26579 return std::make_pair(0U, &X86::RFP32RegClass);
26580 if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
26581 return std::make_pair(0U, &X86::RFP64RegClass);
26582 return std::make_pair(0U, &X86::RFP80RegClass);
26583 case 'y': // MMX_REGS if MMX allowed.
26584 if (!Subtarget->hasMMX()) break;
26585 return std::make_pair(0U, &X86::VR64RegClass);
26586 case 'Y': // SSE_REGS if SSE2 allowed
26587 if (!Subtarget->hasSSE2()) break;
26589 case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
26590 if (!Subtarget->hasSSE1()) break;
26592 switch (VT.SimpleTy) {
26594 // Scalar SSE types.
26597 return std::make_pair(0U, &X86::FR32RegClass);
26600 return std::make_pair(0U, &X86::FR64RegClass);
26608 return std::make_pair(0U, &X86::VR128RegClass);
26616 return std::make_pair(0U, &X86::VR256RegClass);
26621 return std::make_pair(0U, &X86::VR512RegClass);
26627 // Use the default implementation in TargetLowering to convert the register
26628 // constraint into a member of a register class.
26629 std::pair<unsigned, const TargetRegisterClass*> Res;
26630 Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
26632 // Not found as a standard register?
26634 // Map {st(0)} .. {st(7)} constraint strings to the corresponding RFP80 register (FP0 .. FP7).
26635 if (Constraint.size() == 7 && Constraint[0] == '{' &&
26636 tolower(Constraint[1]) == 's' &&
26637 tolower(Constraint[2]) == 't' &&
26638 Constraint[3] == '(' &&
26639 (Constraint[4] >= '0' && Constraint[4] <= '7') &&
26640 Constraint[5] == ')' &&
26641 Constraint[6] == '}') {
26643 Res.first = X86::FP0+Constraint[4]-'0';
26644 Res.second = &X86::RFP80RegClass;
26648 // GCC allows "st(0)" to be called just plain "st".
26649 if (StringRef("{st}").equals_lower(Constraint)) {
26650 Res.first = X86::FP0;
26651 Res.second = &X86::RFP80RegClass;
26656 if (StringRef("{flags}").equals_lower(Constraint)) {
26657 Res.first = X86::EFLAGS;
26658 Res.second = &X86::CCRRegClass;
26662 // 'A' means EAX + EDX.
26663 if (Constraint == "A") {
26664 Res.first = X86::EAX;
26665 Res.second = &X86::GR32_ADRegClass;
26671 // Otherwise, check to see if this is a register class of the wrong value
26672 // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
26673 // turn into {ax},{dx}.
26674 if (Res.second->hasType(VT))
26675 return Res; // Correct type already, nothing to do.
26677 // All of the single-register GCC register classes map their values onto
26678 // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp". If we
26679 // really want an 8-bit or 32-bit register, map to the appropriate register
26680 // class and return the appropriate register.
26681 if (Res.second == &X86::GR16RegClass) {
26682 if (VT == MVT::i8 || VT == MVT::i1) {
26683 unsigned DestReg = 0;
26684 switch (Res.first) {
26686 case X86::AX: DestReg = X86::AL; break;
26687 case X86::DX: DestReg = X86::DL; break;
26688 case X86::CX: DestReg = X86::CL; break;
26689 case X86::BX: DestReg = X86::BL; break;
26692 Res.first = DestReg;
26693 Res.second = &X86::GR8RegClass;
26695 } else if (VT == MVT::i32 || VT == MVT::f32) {
26696 unsigned DestReg = 0;
26697 switch (Res.first) {
26699 case X86::AX: DestReg = X86::EAX; break;
26700 case X86::DX: DestReg = X86::EDX; break;
26701 case X86::CX: DestReg = X86::ECX; break;
26702 case X86::BX: DestReg = X86::EBX; break;
26703 case X86::SI: DestReg = X86::ESI; break;
26704 case X86::DI: DestReg = X86::EDI; break;
26705 case X86::BP: DestReg = X86::EBP; break;
26706 case X86::SP: DestReg = X86::ESP; break;
26709 Res.first = DestReg;
26710 Res.second = &X86::GR32RegClass;
26712 } else if (VT == MVT::i64 || VT == MVT::f64) {
26713 unsigned DestReg = 0;
26714 switch (Res.first) {
26716 case X86::AX: DestReg = X86::RAX; break;
26717 case X86::DX: DestReg = X86::RDX; break;
26718 case X86::CX: DestReg = X86::RCX; break;
26719 case X86::BX: DestReg = X86::RBX; break;
26720 case X86::SI: DestReg = X86::RSI; break;
26721 case X86::DI: DestReg = X86::RDI; break;
26722 case X86::BP: DestReg = X86::RBP; break;
26723 case X86::SP: DestReg = X86::RSP; break;
26726 Res.first = DestReg;
26727 Res.second = &X86::GR64RegClass;
26730 } else if (Res.second == &X86::FR32RegClass ||
26731 Res.second == &X86::FR64RegClass ||
26732 Res.second == &X86::VR128RegClass ||
26733 Res.second == &X86::VR256RegClass ||
26734 Res.second == &X86::FR32XRegClass ||
26735 Res.second == &X86::FR64XRegClass ||
26736 Res.second == &X86::VR128XRegClass ||
26737 Res.second == &X86::VR256XRegClass ||
26738 Res.second == &X86::VR512RegClass) {
26739 // Handle references to XMM physical registers that got mapped into the
26740 // wrong class. This can happen with constraints like {xmm0} where the
26741 // target independent register mapper will just pick the first match it can
26742 // find, ignoring the required type.
26744 if (VT == MVT::f32 || VT == MVT::i32)
26745 Res.second = &X86::FR32RegClass;
26746 else if (VT == MVT::f64 || VT == MVT::i64)
26747 Res.second = &X86::FR64RegClass;
26748 else if (X86::VR128RegClass.hasType(VT))
26749 Res.second = &X86::VR128RegClass;
26750 else if (X86::VR256RegClass.hasType(VT))
26751 Res.second = &X86::VR256RegClass;
26752 else if (X86::VR512RegClass.hasType(VT))
26753 Res.second = &X86::VR512RegClass;
26759 int X86TargetLowering::getScalingFactorCost(const AddrMode &AM,
26761 // Scaling factors are not free at all.
26762 // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
26763 // will take 2 allocations in the out of order engine instead of 1
26764 // for plain addressing mode, i.e. inst (reg1).
26766 // vaddps (%rsi,%rdx), %ymm0, %ymm1
26767 // Requires two allocations (one for the load, one for the computation)
26769 // vaddps (%rsi), %ymm0, %ymm1
26770 // Requires just 1 allocation, i.e., freeing allocations for other operations
26771 // and having fewer micro-operations to execute.
26773 // For some X86 architectures, this is even worse because for instance for
26774 // stores, the complex addressing mode forces the instruction to use the
26775 // "load" ports instead of the dedicated "store" port.
26776 // E.g., on Haswell:
26777 // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
26778 // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
26779 if (isLegalAddressingMode(AM, Ty))
26780 // Scale represents reg2 * scale, thus account for 1
26781 // as soon as we use a second register.
26782 return AM.Scale != 0;
26786 bool X86TargetLowering::isTargetFTOL() const {
26787 return Subtarget->isTargetKnownWindowsMSVC() && !Subtarget->is64Bit();