//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that X86 uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//
15 #include "X86ISelLowering.h"
16 #include "Utils/X86ShuffleDecode.h"
17 #include "X86CallingConv.h"
18 #include "X86FrameLowering.h"
19 #include "X86InstrBuilder.h"
20 #include "X86MachineFunctionInfo.h"
21 #include "X86TargetMachine.h"
22 #include "X86TargetObjectFile.h"
23 #include "llvm/ADT/SmallBitVector.h"
24 #include "llvm/ADT/SmallSet.h"
25 #include "llvm/ADT/Statistic.h"
26 #include "llvm/ADT/StringExtras.h"
27 #include "llvm/ADT/StringSwitch.h"
28 #include "llvm/ADT/VariadicFunction.h"
29 #include "llvm/CodeGen/IntrinsicLowering.h"
30 #include "llvm/CodeGen/MachineFrameInfo.h"
31 #include "llvm/CodeGen/MachineFunction.h"
32 #include "llvm/CodeGen/MachineInstrBuilder.h"
33 #include "llvm/CodeGen/MachineJumpTableInfo.h"
34 #include "llvm/CodeGen/MachineModuleInfo.h"
35 #include "llvm/CodeGen/MachineRegisterInfo.h"
36 #include "llvm/IR/CallSite.h"
37 #include "llvm/IR/CallingConv.h"
38 #include "llvm/IR/Constants.h"
39 #include "llvm/IR/DerivedTypes.h"
40 #include "llvm/IR/Function.h"
41 #include "llvm/IR/GlobalAlias.h"
42 #include "llvm/IR/GlobalVariable.h"
43 #include "llvm/IR/Instructions.h"
44 #include "llvm/IR/Intrinsics.h"
45 #include "llvm/MC/MCAsmInfo.h"
46 #include "llvm/MC/MCContext.h"
47 #include "llvm/MC/MCExpr.h"
48 #include "llvm/MC/MCSymbol.h"
49 #include "llvm/Support/CommandLine.h"
50 #include "llvm/Support/Debug.h"
51 #include "llvm/Support/ErrorHandling.h"
52 #include "llvm/Support/MathExtras.h"
53 #include "llvm/Target/TargetOptions.h"
54 #include "X86IntrinsicsInfo.h"
#define DEBUG_TYPE "x86-isel"

STATISTIC(NumTailCalls, "Number of tail calls");
static cl::opt<bool> ExperimentalVectorWideningLegalization(
    "x86-experimental-vector-widening-legalization", cl::init(false),
    cl::desc("Enable an experimental vector type legalization through widening "
             "rather than promotion."),
    cl::Hidden);

static cl::opt<bool> ExperimentalVectorShuffleLowering(
    "x86-experimental-vector-shuffle-lowering", cl::init(true),
    cl::desc("Enable an experimental vector shuffle lowering code path."),
    cl::Hidden);
static cl::opt<bool> ExperimentalVectorShuffleLegality(
    "x86-experimental-vector-shuffle-legality", cl::init(false),
    cl::desc("Enable experimental shuffle legality based on the experimental "
             "shuffle lowering. Should only be used with the experimental "
             "shuffle lowering."),
    cl::Hidden);
static cl::opt<int> ReciprocalEstimateRefinementSteps(
    "x86-recip-refinement-steps", cl::init(1),
    cl::desc("Specify the number of Newton-Raphson iterations applied to the "
             "result of the hardware reciprocal estimate instruction."),
    cl::NotHidden);
// Forward declarations.
static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
                       SDValue V2);
static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal,
                                SelectionDAG &DAG, SDLoc dl,
                                unsigned vectorWidth) {
  assert((vectorWidth == 128 || vectorWidth == 256) &&
         "Unsupported vector width");
  EVT VT = Vec.getValueType();
  EVT ElVT = VT.getVectorElementType();
  unsigned Factor = VT.getSizeInBits() / vectorWidth;
  EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
                                  VT.getVectorNumElements() / Factor);

  // Extract from UNDEF is UNDEF.
  if (Vec.getOpcode() == ISD::UNDEF)
    return DAG.getUNDEF(ResultVT);

  // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR.
  unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();

  // This is the index of the first element of the vectorWidth-bit chunk
  // we want.
  unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / vectorWidth)
                               * ElemsPerChunk);
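  // For example, extracting a 128-bit chunk from a v8f32 with IdxVal == 5:
  // ElemsPerChunk = 128 / 32 = 4, and (5 * 32) / 128 = 1, so
  // NormalizedIdxVal = 1 * 4 = 4, i.e. the chunk holding elements 4..7.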
  // If the input is a buildvector just emit a smaller one.
  if (Vec.getOpcode() == ISD::BUILD_VECTOR)
    return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
                       makeArrayRef(Vec->op_begin() + NormalizedIdxVal,
                                    ElemsPerChunk));

  SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
}
/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
/// instructions or a simple subregister reference. Idx is an index in the
/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
/// lowering EXTRACT_VECTOR_ELT operations easier.
static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
                                   SelectionDAG &DAG, SDLoc dl) {
  assert((Vec.getValueType().is256BitVector() ||
          Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
  return ExtractSubVector(Vec, IdxVal, DAG, dl, 128);
}

/// Generate a DAG to grab 256-bits from a 512-bit vector.
static SDValue Extract256BitVector(SDValue Vec, unsigned IdxVal,
                                   SelectionDAG &DAG, SDLoc dl) {
  assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
  return ExtractSubVector(Vec, IdxVal, DAG, dl, 256);
}
static SDValue InsertSubVector(SDValue Result, SDValue Vec,
                               unsigned IdxVal, SelectionDAG &DAG,
                               SDLoc dl, unsigned vectorWidth) {
  assert((vectorWidth == 128 || vectorWidth == 256) &&
         "Unsupported vector width");
  // Inserting UNDEF is Result.
  if (Vec.getOpcode() == ISD::UNDEF)
    return Result;
  EVT VT = Vec.getValueType();
  EVT ElVT = VT.getVectorElementType();
  EVT ResultVT = Result.getValueType();

  // Insert the relevant vectorWidth bits.
  unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();

  // This is the index of the first element of the vectorWidth-bit chunk
  // we want.
  unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / vectorWidth)
                               * ElemsPerChunk);

  SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
}
/// Generate a DAG to put 128-bits into a vector > 128 bits. This
/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
/// simple superregister reference. Idx is an index in the 128 bits
/// we want. It need not be aligned to a 128-bit boundary. That makes
/// lowering INSERT_VECTOR_ELT operations easier.
static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
                                  SelectionDAG &DAG, SDLoc dl) {
  assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
  return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
}

static SDValue Insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
                                  SelectionDAG &DAG, SDLoc dl) {
  assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
  return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
}
/// Concat two 128-bit vectors into a 256-bit vector using VINSERTF128
/// instructions. This is used because creating CONCAT_VECTORS nodes of
/// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
/// large BUILD_VECTORS.
static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
                                   unsigned NumElems, SelectionDAG &DAG,
                                   SDLoc dl) {
  SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
  return Insert128BitVector(V, V2, NumElems/2, DAG, dl);
}
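// For example, concatenating two v4f32 values into a v8f32 (NumElems == 8)
// inserts V1 at element 0 and V2 at element 4, which matches a VINSERTF128
// with an immediate of 1 for the upper 128-bit lane.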
static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
                                   unsigned NumElems, SelectionDAG &DAG,
                                   SDLoc dl) {
  SDValue V = Insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
  return Insert256BitVector(V, V2, NumElems/2, DAG, dl);
}
X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
                                     const X86Subtarget &STI)
    : TargetLowering(TM), Subtarget(&STI) {
  X86ScalarSSEf64 = Subtarget->hasSSE2();
  X86ScalarSSEf32 = Subtarget->hasSSE1();
  TD = getDataLayout();

  // Set up the TargetLowering object.
  static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };

  // X86 is weird. It always uses i8 for shift amounts and setcc results.
  setBooleanContents(ZeroOrOneBooleanContent);
  // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
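  // The all-ones convention is what makes vector selects cheap: a SETCC such
  // as PCMPGTD already produces a 0/-1 mask per lane, which feeds directly
  // into PAND/PANDN/POR sequences (or BLENDV) with no mask conversion.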
  // For 64-bit, since we have so many registers, use the ILP scheduler.
  // For 32-bit, use the register pressure specific scheduling.
  // For Atom, always use ILP scheduling.
  if (Subtarget->isAtom())
    setSchedulingPreference(Sched::ILP);
  else if (Subtarget->is64Bit())
    setSchedulingPreference(Sched::ILP);
  else
    setSchedulingPreference(Sched::RegPressure);
  const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
  setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
  // Bypass expensive divides on Atom when compiling with O2.
  if (TM.getOptLevel() >= CodeGenOpt::Default) {
    if (Subtarget->hasSlowDivide32())
      addBypassSlowDiv(32, 8);
    if (Subtarget->hasSlowDivide64() && Subtarget->is64Bit())
      addBypassSlowDiv(64, 16);
  }
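  // addBypassSlowDiv(64, 16) emits a run-time test: when both operands of a
  // 64-bit divide actually fit in 16 bits, control branches to the much
  // cheaper narrow divide instead of the microcoded full-width DIV.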
  if (Subtarget->isTargetKnownWindowsMSVC()) {
    // Setup Windows compiler runtime calls.
    setLibcallName(RTLIB::SDIV_I64, "_alldiv");
    setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
    setLibcallName(RTLIB::SREM_I64, "_allrem");
    setLibcallName(RTLIB::UREM_I64, "_aullrem");
    setLibcallName(RTLIB::MUL_I64, "_allmul");
    setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);

    // The _ftol2 runtime function has an unusual calling conv, which
    // is modeled by a special pseudo-instruction.
    setLibcallName(RTLIB::FPTOUINT_F64_I64, nullptr);
    setLibcallName(RTLIB::FPTOUINT_F32_I64, nullptr);
    setLibcallName(RTLIB::FPTOUINT_F64_I32, nullptr);
    setLibcallName(RTLIB::FPTOUINT_F32_I32, nullptr);
  }
  if (Subtarget->isTargetDarwin()) {
    // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
    setUseUnderscoreSetJmp(false);
    setUseUnderscoreLongJmp(false);
  } else if (Subtarget->isTargetWindowsGNU()) {
    // MS runtime is weird: it exports _setjmp, but longjmp!
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(false);
  } else {
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(true);
  }
  // Set up the register classes.
  addRegisterClass(MVT::i8, &X86::GR8RegClass);
  addRegisterClass(MVT::i16, &X86::GR16RegClass);
  addRegisterClass(MVT::i32, &X86::GR32RegClass);
  if (Subtarget->is64Bit())
    addRegisterClass(MVT::i64, &X86::GR64RegClass);

  for (MVT VT : MVT::integer_valuetypes())
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);

  // We don't accept any truncstore of integer registers.
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
  setTruncStoreAction(MVT::i64, MVT::i8, Expand);
  setTruncStoreAction(MVT::i32, MVT::i16, Expand);
  setTruncStoreAction(MVT::i32, MVT::i8, Expand);
  setTruncStoreAction(MVT::i16, MVT::i8, Expand);

  setTruncStoreAction(MVT::f64, MVT::f32, Expand);

  // SETOEQ and SETUNE require checking two conditions.
  setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
  // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
  // operation.
  setOperationAction(ISD::UINT_TO_FP, MVT::i1, Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);

  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Promote);
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
  } else if (!TM.Options.UseSoftFloat) {
    // We have an algorithm for SSE2->double, and we turn this into a
    // 64-bit FILD followed by conditional FADD for other targets.
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
    // We have an algorithm for SSE2, and we turn this into a 64-bit
    // FILD for other targets.
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
  }
  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::SINT_TO_FP, MVT::i1, Promote);
  setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);

  if (!TM.Options.UseSoftFloat) {
    // SSE has no i16 to fp conversion, only i32.
    if (X86ScalarSSEf32) {
      setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
      // f32 and f64 cases are Legal, f80 case is not.
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    } else {
      setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    }
  } else {
    setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Promote);
  }
  // In 32-bit mode these are custom lowered. In 64-bit mode f32 and f64
  // are Legal, f80 is custom lowered.
  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);

  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTs, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::FP_TO_SINT, MVT::i1, Promote);
  setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);

  if (X86ScalarSSEf32) {
    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
    // f32 and f64 cases are Legal, f80 case is not.
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
  } else {
    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
  }
  // Handle FP_TO_UINT by promoting the destination to a larger signed
  // conversion.
  setOperationAction(ISD::FP_TO_UINT, MVT::i1, Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);

  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Promote);
  } else if (!TM.Options.UseSoftFloat) {
    // Since AVX is a superset of SSE3, only check for SSE here.
    if (Subtarget->hasSSE1() && !Subtarget->hasSSE3())
      // Expand FP_TO_UINT into a select.
      // FIXME: We would like to use a Custom expander here eventually to do
      // the optimal thing for SSE vs. the default expansion in the legalizer.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
    else
      // With SSE3 we can use fisttpll to convert to a signed i64; without
      // SSE, we're stuck with a fistpll.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
  }
  if (isTargetFTOL()) {
    // Use the _ftol2 runtime function, which has a pseudo-instruction
    // to handle its weird calling convention.
    setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
  }
  // TODO: when we have SSE, these could be more efficient, by using movd/movq.
  if (!X86ScalarSSEf64) {
    setOperationAction(ISD::BITCAST, MVT::f32, Expand);
    setOperationAction(ISD::BITCAST, MVT::i32, Expand);
    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::BITCAST, MVT::f64, Expand);
      // Without SSE, i64->f64 goes through memory.
      setOperationAction(ISD::BITCAST, MVT::i64, Expand);
    }
  }
  // Scalar integer divide and remainder are lowered to use operations that
  // produce two results, to match the available instructions. This exposes
  // the two-result form to trivial CSE, which is able to combine x/y and x%y
  // into a single instruction.
  //
  // Scalar integer multiply-high is also lowered to use two-result
  // operations, to match the available instructions. However, plain multiply
  // (low) operations are left as Legal, as there are single-result
  // instructions for this in x86. Using the two-result multiply instructions
  // when both high and low results are needed must be arranged by dagcombine.
  for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
    MVT VT = IntVTs[i];
    setOperationAction(ISD::MULHS, VT, Expand);
    setOperationAction(ISD::MULHU, VT, Expand);
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);

    // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences.
    setOperationAction(ISD::ADDC, VT, Custom);
    setOperationAction(ISD::ADDE, VT, Custom);
    setOperationAction(ISD::SUBC, VT, Custom);
    setOperationAction(ISD::SUBE, VT, Custom);
  }
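  // This is how a 32-bit target legalizes an i64 add: the value is split and
  // becomes an ADDC producing the low word plus a carry glue, feeding an ADDE
  // for the high word, which select naturally to the ADD/ADC instruction pair.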
  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  setOperationAction(ISD::BR_CC, MVT::f32, Expand);
  setOperationAction(ISD::BR_CC, MVT::f64, Expand);
  setOperationAction(ISD::BR_CC, MVT::f80, Expand);
  setOperationAction(ISD::BR_CC, MVT::i8, Expand);
  setOperationAction(ISD::BR_CC, MVT::i16, Expand);
  setOperationAction(ISD::BR_CC, MVT::i32, Expand);
  setOperationAction(ISD::BR_CC, MVT::i64, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::f80, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::i8, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::i16, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
  setOperationAction(ISD::FP_ROUND_INREG, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f64, Expand);
  setOperationAction(ISD::FREM, MVT::f80, Expand);
  setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);
  // Promote the i8 variants and force them on up to i32 which has a shorter
  // encoding.
  setOperationAction(ISD::CTTZ, MVT::i8, Promote);
  AddPromotedToType(ISD::CTTZ, MVT::i8, MVT::i32);
  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i8, Promote);
  AddPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8, MVT::i32);
  if (Subtarget->hasBMI()) {
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Expand);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand);
    if (Subtarget->is64Bit())
      setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
  } else {
    setOperationAction(ISD::CTTZ, MVT::i16, Custom);
    setOperationAction(ISD::CTTZ, MVT::i32, Custom);
    if (Subtarget->is64Bit())
      setOperationAction(ISD::CTTZ, MVT::i64, Custom);
  }
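  // Without BMI there is no TZCNT, only BSF, and BSF leaves its destination
  // undefined when the input is zero; the Custom lowering adds the test/CMOV
  // needed to give CTTZ a defined result for a zero input.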
  if (Subtarget->hasLZCNT()) {
    // When promoting the i8 variants, force them to i32 for a shorter
    // encoding.
    setOperationAction(ISD::CTLZ, MVT::i8, Promote);
    AddPromotedToType(ISD::CTLZ, MVT::i8, MVT::i32);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8, Promote);
    AddPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8, MVT::i32);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Expand);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand);
    if (Subtarget->is64Bit())
      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
  } else {
    setOperationAction(ISD::CTLZ, MVT::i8, Custom);
    setOperationAction(ISD::CTLZ, MVT::i16, Custom);
    setOperationAction(ISD::CTLZ, MVT::i32, Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8, Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);
    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::CTLZ, MVT::i64, Custom);
      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
    }
  }
  // Special handling for half-precision floating point conversions.
  // If we don't have F16C support, then lower half float conversions
  // into library calls.
  if (TM.Options.UseSoftFloat || !Subtarget->hasF16C()) {
    setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
    setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
  }

  // There's never any support for operations beyond MVT::f32.
  setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
  setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
  setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
  setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);

  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
  setTruncStoreAction(MVT::f80, MVT::f16, Expand);
  if (Subtarget->hasPOPCNT()) {
    setOperationAction(ISD::CTPOP, MVT::i8, Promote);
  } else {
    setOperationAction(ISD::CTPOP, MVT::i8, Expand);
    setOperationAction(ISD::CTPOP, MVT::i16, Expand);
    setOperationAction(ISD::CTPOP, MVT::i32, Expand);
    if (Subtarget->is64Bit())
      setOperationAction(ISD::CTPOP, MVT::i64, Expand);
  }
  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);

  if (!Subtarget->hasMOVBE())
    setOperationAction(ISD::BSWAP, MVT::i16, Expand);

  // These should be promoted to a larger select which is supported.
  setOperationAction(ISD::SELECT, MVT::i1, Promote);
  // X86 wants to expand cmov itself.
  setOperationAction(ISD::SELECT, MVT::i8, Custom);
  setOperationAction(ISD::SELECT, MVT::i16, Custom);
  setOperationAction(ISD::SELECT, MVT::i32, Custom);
  setOperationAction(ISD::SELECT, MVT::f32, Custom);
  setOperationAction(ISD::SELECT, MVT::f64, Custom);
  setOperationAction(ISD::SELECT, MVT::f80, Custom);
  setOperationAction(ISD::SETCC, MVT::i8, Custom);
  setOperationAction(ISD::SETCC, MVT::i16, Custom);
  setOperationAction(ISD::SETCC, MVT::i32, Custom);
  setOperationAction(ISD::SETCC, MVT::f32, Custom);
  setOperationAction(ISD::SETCC, MVT::f64, Custom);
  setOperationAction(ISD::SETCC, MVT::f80, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SELECT, MVT::i64, Custom);
    setOperationAction(ISD::SETCC, MVT::i64, Custom);
  }
  setOperationAction(ISD::EH_RETURN, MVT::Other, Custom);
  // NOTE: EH_SJLJ_SETJMP/_LONGJMP as supported here is NOT intended to support
  // SjLj exception handling, but rather a light-weight setjmp/longjmp
  // replacement to support continuation, user-level threading, etc. As a
  // result, no other SjLj exception interfaces are implemented; please don't
  // build your own exception handling on top of them.
  // LLVM/Clang supports zero-cost DWARF exception handling.
  setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
  setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
  setOperationAction(ISD::ConstantPool, MVT::i32, Custom);
  setOperationAction(ISD::JumpTable, MVT::i32, Custom);
  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
  setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
  setOperationAction(ISD::ExternalSymbol, MVT::i32, Custom);
  setOperationAction(ISD::BlockAddress, MVT::i32, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
    setOperationAction(ISD::JumpTable, MVT::i64, Custom);
    setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
    setOperationAction(ISD::ExternalSymbol, MVT::i64, Custom);
    setOperationAction(ISD::BlockAddress, MVT::i64, Custom);
  }
  // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
  setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
    setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
    setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);
  }
  if (Subtarget->hasSSE1())
    setOperationAction(ISD::PREFETCH, MVT::Other, Legal);

  setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);

  // Expand certain atomics.
  for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
    MVT VT = IntVTs[i];
    setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
    setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
  }
  if (Subtarget->hasCmpxchg16b()) {
    setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
  }

  // FIXME - use subtarget debug flags
  if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetELF() &&
      !Subtarget->isTargetCygMing() && !Subtarget->isTargetWin64()) {
    setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
  }

  if (Subtarget->is64Bit()) {
    setExceptionPointerRegister(X86::RAX);
    setExceptionSelectorRegister(X86::RDX);
  } else {
    setExceptionPointerRegister(X86::EAX);
    setExceptionSelectorRegister(X86::EDX);
  }
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);

  setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
  setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);

  setOperationAction(ISD::TRAP, MVT::Other, Legal);
  setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);

  // VASTART needs to be custom lowered to use the VarArgsFrameIndex.
  setOperationAction(ISD::VASTART, MVT::Other, Custom);
  setOperationAction(ISD::VAEND, MVT::Other, Expand);
  if (Subtarget->is64Bit() && !Subtarget->isTargetWin64()) {
    // TargetInfo::X86_64ABIBuiltinVaList
    setOperationAction(ISD::VAARG, MVT::Other, Custom);
    setOperationAction(ISD::VACOPY, MVT::Other, Custom);
  } else {
    // TargetInfo::CharPtrBuiltinVaList
    setOperationAction(ISD::VAARG, MVT::Other, Expand);
    setOperationAction(ISD::VACOPY, MVT::Other, Expand);
  }

  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);

  setOperationAction(ISD::DYNAMIC_STACKALLOC, getPointerTy(), Custom);
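  // Custom dynamic-alloca lowering matters on targets such as Windows, where
  // growing the stack by more than a page must touch each page in turn (the
  // __chkstk probe), and when segmented stacks are in use.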
  if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) {
    // f32 and f64 use SSE.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, &X86::FR32RegClass);
    addRegisterClass(MVT::f64, &X86::FR64RegClass);

    // Use ANDPD to simulate FABS.
    setOperationAction(ISD::FABS, MVT::f64, Custom);
    setOperationAction(ISD::FABS, MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG, MVT::f64, Custom);
    setOperationAction(ISD::FNEG, MVT::f32, Custom);

    // Use ANDPD and ORPD to simulate FCOPYSIGN.
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
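    // These are all plain bit tricks on the IEEE representation:
    //   fabs(x)        = x & ~signbit        (ANDPD with a sign-cleared mask)
    //   fneg(x)        = x ^ signbit         (XORPD)
    //   copysign(x, y) = (x & ~signbit) | (y & signbit)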
    // Lower this to FGETSIGNx86 plus an AND.
    setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
    setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);

    // We don't support sin/cos/fmod.
    setOperationAction(ISD::FSIN, MVT::f64, Expand);
    setOperationAction(ISD::FCOS, MVT::f64, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
    setOperationAction(ISD::FSIN, MVT::f32, Expand);
    setOperationAction(ISD::FCOS, MVT::f32, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f32, Expand);

    // Expand FP immediates into loads from the stack, except for the special
    // cases we handle.
    addLegalFPImmediate(APFloat(+0.0)); // xorpd
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
  } else if (!TM.Options.UseSoftFloat && X86ScalarSSEf32) {
    // Use SSE for f32, x87 for f64.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, &X86::FR32RegClass);
    addRegisterClass(MVT::f64, &X86::RFP64RegClass);

    // Use ANDPS to simulate FABS.
    setOperationAction(ISD::FABS, MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG, MVT::f32, Custom);

    setOperationAction(ISD::UNDEF, MVT::f64, Expand);

    // Use ANDPS and ORPS to simulate FCOPYSIGN.
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // We don't support sin/cos/fmod.
    setOperationAction(ISD::FSIN, MVT::f32, Expand);
    setOperationAction(ISD::FCOS, MVT::f32, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f32, Expand);

    // Special cases we handle for FP constants.
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
    addLegalFPImmediate(APFloat(+0.0)); // FLD0
    addLegalFPImmediate(APFloat(+1.0)); // FLD1
    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS

    if (!TM.Options.UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f64, Expand);
      setOperationAction(ISD::FCOS, MVT::f64, Expand);
      setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
    }
  } else if (!TM.Options.UseSoftFloat) {
    // f32 and f64 in x87.
    // Set up the FP register classes.
    addRegisterClass(MVT::f64, &X86::RFP64RegClass);
    addRegisterClass(MVT::f32, &X86::RFP32RegClass);

    setOperationAction(ISD::UNDEF, MVT::f64, Expand);
    setOperationAction(ISD::UNDEF, MVT::f32, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);

    if (!TM.Options.UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f64, Expand);
      setOperationAction(ISD::FSIN, MVT::f32, Expand);
      setOperationAction(ISD::FCOS, MVT::f64, Expand);
      setOperationAction(ISD::FCOS, MVT::f32, Expand);
      setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
      setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
    }
    addLegalFPImmediate(APFloat(+0.0)); // FLD0
    addLegalFPImmediate(APFloat(+1.0)); // FLD1
    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
    addLegalFPImmediate(APFloat(+0.0f)); // FLD0
    addLegalFPImmediate(APFloat(+1.0f)); // FLD1
    addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
  }
  // We don't support FMA.
  setOperationAction(ISD::FMA, MVT::f64, Expand);
  setOperationAction(ISD::FMA, MVT::f32, Expand);

  // Long double always uses X87.
  if (!TM.Options.UseSoftFloat) {
    addRegisterClass(MVT::f80, &X86::RFP80RegClass);
    setOperationAction(ISD::UNDEF, MVT::f80, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);

    APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended);
    addLegalFPImmediate(TmpFlt); // FLD0
    TmpFlt.changeSign();
    addLegalFPImmediate(TmpFlt); // FLD0/FCHS

    bool ignored;
    APFloat TmpFlt2(+1.0);
    TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
                    &ignored);
    addLegalFPImmediate(TmpFlt2); // FLD1
    TmpFlt2.changeSign();
    addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
    if (!TM.Options.UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f80, Expand);
      setOperationAction(ISD::FCOS, MVT::f80, Expand);
      setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
    }

    setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
    setOperationAction(ISD::FCEIL, MVT::f80, Expand);
    setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
    setOperationAction(ISD::FRINT, MVT::f80, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
    setOperationAction(ISD::FMA, MVT::f80, Expand);
  }
  // Always use a library call for pow.
  setOperationAction(ISD::FPOW, MVT::f32, Expand);
  setOperationAction(ISD::FPOW, MVT::f64, Expand);
  setOperationAction(ISD::FPOW, MVT::f80, Expand);

  setOperationAction(ISD::FLOG, MVT::f80, Expand);
  setOperationAction(ISD::FLOG2, MVT::f80, Expand);
  setOperationAction(ISD::FLOG10, MVT::f80, Expand);
  setOperationAction(ISD::FEXP, MVT::f80, Expand);
  setOperationAction(ISD::FEXP2, MVT::f80, Expand);
  setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
  setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
  // First set operation action for all vector types to either promote
  // (for widening) or expand (for scalarization). Then we will selectively
  // turn on ones that can be effectively codegen'd.
  for (MVT VT : MVT::vector_valuetypes()) {
    setOperationAction(ISD::ADD, VT, Expand);
    setOperationAction(ISD::SUB, VT, Expand);
    setOperationAction(ISD::FADD, VT, Expand);
    setOperationAction(ISD::FNEG, VT, Expand);
    setOperationAction(ISD::FSUB, VT, Expand);
    setOperationAction(ISD::MUL, VT, Expand);
    setOperationAction(ISD::FMUL, VT, Expand);
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::FDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);
    setOperationAction(ISD::LOAD, VT, Expand);
    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Expand);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
    setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Expand);
    setOperationAction(ISD::INSERT_SUBVECTOR, VT, Expand);
    setOperationAction(ISD::FABS, VT, Expand);
    setOperationAction(ISD::FSIN, VT, Expand);
    setOperationAction(ISD::FSINCOS, VT, Expand);
    setOperationAction(ISD::FCOS, VT, Expand);
    setOperationAction(ISD::FREM, VT, Expand);
    setOperationAction(ISD::FMA, VT, Expand);
    setOperationAction(ISD::FPOWI, VT, Expand);
    setOperationAction(ISD::FSQRT, VT, Expand);
    setOperationAction(ISD::FCOPYSIGN, VT, Expand);
    setOperationAction(ISD::FFLOOR, VT, Expand);
    setOperationAction(ISD::FCEIL, VT, Expand);
    setOperationAction(ISD::FTRUNC, VT, Expand);
    setOperationAction(ISD::FRINT, VT, Expand);
    setOperationAction(ISD::FNEARBYINT, VT, Expand);
    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
    setOperationAction(ISD::MULHS, VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, VT, Expand);
    setOperationAction(ISD::MULHU, VT, Expand);
    setOperationAction(ISD::SDIVREM, VT, Expand);
    setOperationAction(ISD::UDIVREM, VT, Expand);
    setOperationAction(ISD::FPOW, VT, Expand);
    setOperationAction(ISD::CTPOP, VT, Expand);
    setOperationAction(ISD::CTTZ, VT, Expand);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
    setOperationAction(ISD::CTLZ, VT, Expand);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
    setOperationAction(ISD::SHL, VT, Expand);
    setOperationAction(ISD::SRA, VT, Expand);
    setOperationAction(ISD::SRL, VT, Expand);
    setOperationAction(ISD::ROTL, VT, Expand);
    setOperationAction(ISD::ROTR, VT, Expand);
    setOperationAction(ISD::BSWAP, VT, Expand);
    setOperationAction(ISD::SETCC, VT, Expand);
    setOperationAction(ISD::FLOG, VT, Expand);
    setOperationAction(ISD::FLOG2, VT, Expand);
    setOperationAction(ISD::FLOG10, VT, Expand);
    setOperationAction(ISD::FEXP, VT, Expand);
    setOperationAction(ISD::FEXP2, VT, Expand);
    setOperationAction(ISD::FP_TO_UINT, VT, Expand);
    setOperationAction(ISD::FP_TO_SINT, VT, Expand);
    setOperationAction(ISD::UINT_TO_FP, VT, Expand);
    setOperationAction(ISD::SINT_TO_FP, VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
    setOperationAction(ISD::TRUNCATE, VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
    setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
    setOperationAction(ISD::ANY_EXTEND, VT, Expand);
    setOperationAction(ISD::VSELECT, VT, Expand);
    setOperationAction(ISD::SELECT_CC, VT, Expand);
    for (MVT InnerVT : MVT::vector_valuetypes()) {
      setTruncStoreAction(InnerVT, VT, Expand);

      setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
      setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);

      // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
      // types, we have to deal with them whether we ask for Expansion or not.
      // Setting Expand causes its own optimisation problems though, so leave
      // them legal.
      if (VT.getVectorElementType() == MVT::i1)
        setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
    }
  }
  // FIXME: In order to prevent SSE instructions being expanded to MMX ones
  // with -msoft-float, disable use of MMX as well.
  if (!TM.Options.UseSoftFloat && Subtarget->hasMMX()) {
    addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
    // No operations on x86mmx supported, everything uses intrinsics.
  }

  // MMX-sized vectors (other than x86mmx) are expected to be expanded
  // into smaller operations.
  setOperationAction(ISD::MULHS, MVT::v8i8, Expand);
  setOperationAction(ISD::MULHS, MVT::v4i16, Expand);
  setOperationAction(ISD::MULHS, MVT::v2i32, Expand);
  setOperationAction(ISD::MULHS, MVT::v1i64, Expand);
  setOperationAction(ISD::AND, MVT::v8i8, Expand);
  setOperationAction(ISD::AND, MVT::v4i16, Expand);
  setOperationAction(ISD::AND, MVT::v2i32, Expand);
  setOperationAction(ISD::AND, MVT::v1i64, Expand);
  setOperationAction(ISD::OR, MVT::v8i8, Expand);
  setOperationAction(ISD::OR, MVT::v4i16, Expand);
  setOperationAction(ISD::OR, MVT::v2i32, Expand);
  setOperationAction(ISD::OR, MVT::v1i64, Expand);
  setOperationAction(ISD::XOR, MVT::v8i8, Expand);
  setOperationAction(ISD::XOR, MVT::v4i16, Expand);
  setOperationAction(ISD::XOR, MVT::v2i32, Expand);
  setOperationAction(ISD::XOR, MVT::v1i64, Expand);
  setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i8, Expand);
  setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i16, Expand);
  setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2i32, Expand);
  setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v1i64, Expand);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v1i64, Expand);
  setOperationAction(ISD::SELECT, MVT::v8i8, Expand);
  setOperationAction(ISD::SELECT, MVT::v4i16, Expand);
  setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
  setOperationAction(ISD::SELECT, MVT::v1i64, Expand);
  setOperationAction(ISD::BITCAST, MVT::v8i8, Expand);
  setOperationAction(ISD::BITCAST, MVT::v4i16, Expand);
  setOperationAction(ISD::BITCAST, MVT::v2i32, Expand);
  setOperationAction(ISD::BITCAST, MVT::v1i64, Expand);
  if (!TM.Options.UseSoftFloat && Subtarget->hasSSE1()) {
    addRegisterClass(MVT::v4f32, &X86::VR128RegClass);

    setOperationAction(ISD::FADD, MVT::v4f32, Legal);
    setOperationAction(ISD::FSUB, MVT::v4f32, Legal);
    setOperationAction(ISD::FMUL, MVT::v4f32, Legal);
    setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
    setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);
    setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
    setOperationAction(ISD::FABS, MVT::v4f32, Custom);
    setOperationAction(ISD::LOAD, MVT::v4f32, Legal);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
    setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
  }
  if (!TM.Options.UseSoftFloat && Subtarget->hasSSE2()) {
    addRegisterClass(MVT::v2f64, &X86::VR128RegClass);

    // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
    // registers cannot be used even for integer operations.
    addRegisterClass(MVT::v16i8, &X86::VR128RegClass);
    addRegisterClass(MVT::v8i16, &X86::VR128RegClass);
    addRegisterClass(MVT::v4i32, &X86::VR128RegClass);
    addRegisterClass(MVT::v2i64, &X86::VR128RegClass);

    setOperationAction(ISD::ADD, MVT::v16i8, Legal);
    setOperationAction(ISD::ADD, MVT::v8i16, Legal);
    setOperationAction(ISD::ADD, MVT::v4i32, Legal);
    setOperationAction(ISD::ADD, MVT::v2i64, Legal);
    setOperationAction(ISD::MUL, MVT::v4i32, Custom);
    setOperationAction(ISD::MUL, MVT::v2i64, Custom);
    setOperationAction(ISD::UMUL_LOHI, MVT::v4i32, Custom);
    setOperationAction(ISD::SMUL_LOHI, MVT::v4i32, Custom);
    setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
    setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
    setOperationAction(ISD::SUB, MVT::v16i8, Legal);
    setOperationAction(ISD::SUB, MVT::v8i16, Legal);
    setOperationAction(ISD::SUB, MVT::v4i32, Legal);
    setOperationAction(ISD::SUB, MVT::v2i64, Legal);
    setOperationAction(ISD::MUL, MVT::v8i16, Legal);
    setOperationAction(ISD::FADD, MVT::v2f64, Legal);
    setOperationAction(ISD::FSUB, MVT::v2f64, Legal);
    setOperationAction(ISD::FMUL, MVT::v2f64, Legal);
    setOperationAction(ISD::FDIV, MVT::v2f64, Legal);
    setOperationAction(ISD::FSQRT, MVT::v2f64, Legal);
    setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
    setOperationAction(ISD::FABS, MVT::v2f64, Custom);

    setOperationAction(ISD::SETCC, MVT::v2i64, Custom);
    setOperationAction(ISD::SETCC, MVT::v16i8, Custom);
    setOperationAction(ISD::SETCC, MVT::v8i16, Custom);
    setOperationAction(ISD::SETCC, MVT::v4i32, Custom);

    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

    // Only provide customized ctpop vector bit twiddling for vector types we
    // know to perform better than using the popcnt instructions on each vector
    // element. If popcnt isn't supported, always provide the custom version.
    if (!Subtarget->hasPOPCNT()) {
      setOperationAction(ISD::CTPOP, MVT::v4i32, Custom);
      setOperationAction(ISD::CTPOP, MVT::v2i64, Custom);
    }
    // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
    for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
      MVT VT = (MVT::SimpleValueType)i;
      // Do not attempt to custom lower non-power-of-2 vectors.
      if (!isPowerOf2_32(VT.getVectorNumElements()))
        continue;
      // Do not attempt to custom lower non-128-bit vectors.
      if (!VT.is128BitVector())
        continue;
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }
    // We support custom legalizing of sext and anyext loads for specific
    // memory vector types which we can load as a scalar (or sequence of
    // scalars) and extend in-register to a legal 128-bit vector type. For sext
    // loads these must work with a single scalar load.
    for (MVT VT : MVT::integer_vector_valuetypes()) {
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom);
    }
    setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);

    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
    }
    // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
    for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
      MVT VT = (MVT::SimpleValueType)i;

      // Do not attempt to promote non-128-bit vectors.
      if (!VT.is128BitVector())
        continue;

      setOperationAction(ISD::AND, VT, Promote);
      AddPromotedToType(ISD::AND, VT, MVT::v2i64);
      setOperationAction(ISD::OR, VT, Promote);
      AddPromotedToType(ISD::OR, VT, MVT::v2i64);
      setOperationAction(ISD::XOR, VT, Promote);
      AddPromotedToType(ISD::XOR, VT, MVT::v2i64);
      setOperationAction(ISD::LOAD, VT, Promote);
      AddPromotedToType(ISD::LOAD, VT, MVT::v2i64);
      setOperationAction(ISD::SELECT, VT, Promote);
      AddPromotedToType(ISD::SELECT, VT, MVT::v2i64);
    }
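    // Promotion here is just a pair of bitcasts: a v16i8 AND, for instance,
    // is rewritten as bitcast-to-v2i64, AND, bitcast-back, which is free
    // because PAND and full-width loads are type-agnostic over the 128 bits.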
    // Custom lower v2i64 and v2f64 selects.
    setOperationAction(ISD::LOAD, MVT::v2f64, Legal);
    setOperationAction(ISD::LOAD, MVT::v2i64, Legal);
    setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v2i64, Custom);

    setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);

    setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
    // As there is no 64-bit GPR available, we need to build a special custom
    // sequence to convert from v2i32 to v2f32.
    if (!Subtarget->is64Bit())
      setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);

    setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
    setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);

    for (MVT VT : MVT::fp_vector_valuetypes())
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);

    setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
    setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
    setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);
  }
  if (!TM.Options.UseSoftFloat && Subtarget->hasSSE41()) {
    setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
    setOperationAction(ISD::FCEIL, MVT::f32, Legal);
    setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
    setOperationAction(ISD::FRINT, MVT::f32, Legal);
    setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
    setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
    setOperationAction(ISD::FCEIL, MVT::f64, Legal);
    setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
    setOperationAction(ISD::FRINT, MVT::f64, Legal);
    setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);

    setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal);
    setOperationAction(ISD::FCEIL, MVT::v4f32, Legal);
    setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal);
    setOperationAction(ISD::FRINT, MVT::v4f32, Legal);
    setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal);
    setOperationAction(ISD::FFLOOR, MVT::v2f64, Legal);
    setOperationAction(ISD::FCEIL, MVT::v2f64, Legal);
    setOperationAction(ISD::FTRUNC, MVT::v2f64, Legal);
    setOperationAction(ISD::FRINT, MVT::v2f64, Legal);
    setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Legal);

    // FIXME: Do we need to handle scalar-to-vector here?
    setOperationAction(ISD::MUL, MVT::v4i32, Legal);

    setOperationAction(ISD::VSELECT, MVT::v2f64, Custom);
    setOperationAction(ISD::VSELECT, MVT::v2i64, Custom);
    setOperationAction(ISD::VSELECT, MVT::v4i32, Custom);
    setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
    setOperationAction(ISD::VSELECT, MVT::v8i16, Custom);
    // There is no BLENDI for byte vectors. We don't need to custom lower
    // some vselects for now.
    setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);

    // SSE41 brings specific instructions for doing vector sign extend even in
    // cases where we don't have SRA.
    for (MVT VT : MVT::integer_vector_valuetypes()) {
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
    }
    // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X.
    setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, MVT::v8i8, Legal);
    setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Legal);
    setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i8, Legal);
    setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
    setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
    setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);

    setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i16, MVT::v8i8, Legal);
    setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Legal);
    setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i8, Legal);
    setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
    setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
    setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
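    // Marking these Legal lets isel fold the load and the extend into one
    // instruction, e.g. a zextload from v4i8 to v4i32 becomes a single
    // PMOVZXBD with a 32-bit memory operand instead of a load plus shuffles.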
    // i8 and i16 vectors are custom because the source register and memory
    // operand types are not the same width. f32 vectors are custom since the
    // immediate controlling the insert encodes additional information.
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);

    // FIXME: these should be Legal, but that's only for the case where
    // the index is constant. For now custom expand to deal with that.
    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
    }
  }
  if (Subtarget->hasSSE2()) {
    setOperationAction(ISD::SRL, MVT::v8i16, Custom);
    setOperationAction(ISD::SRL, MVT::v16i8, Custom);

    setOperationAction(ISD::SHL, MVT::v8i16, Custom);
    setOperationAction(ISD::SHL, MVT::v16i8, Custom);

    setOperationAction(ISD::SRA, MVT::v8i16, Custom);
    setOperationAction(ISD::SRA, MVT::v16i8, Custom);

    // In the customized shift lowering, the legal cases in AVX2 will be
    // recognized.
    setOperationAction(ISD::SRL, MVT::v2i64, Custom);
    setOperationAction(ISD::SRL, MVT::v4i32, Custom);

    setOperationAction(ISD::SHL, MVT::v2i64, Custom);
    setOperationAction(ISD::SHL, MVT::v4i32, Custom);

    setOperationAction(ISD::SRA, MVT::v4i32, Custom);
  }
  if (!TM.Options.UseSoftFloat && Subtarget->hasFp256()) {
    addRegisterClass(MVT::v32i8, &X86::VR256RegClass);
    addRegisterClass(MVT::v16i16, &X86::VR256RegClass);
    addRegisterClass(MVT::v8i32, &X86::VR256RegClass);
    addRegisterClass(MVT::v8f32, &X86::VR256RegClass);
    addRegisterClass(MVT::v4i64, &X86::VR256RegClass);
    addRegisterClass(MVT::v4f64, &X86::VR256RegClass);

    setOperationAction(ISD::LOAD, MVT::v8f32, Legal);
    setOperationAction(ISD::LOAD, MVT::v4f64, Legal);
    setOperationAction(ISD::LOAD, MVT::v4i64, Legal);

    setOperationAction(ISD::FADD, MVT::v8f32, Legal);
    setOperationAction(ISD::FSUB, MVT::v8f32, Legal);
    setOperationAction(ISD::FMUL, MVT::v8f32, Legal);
    setOperationAction(ISD::FDIV, MVT::v8f32, Legal);
    setOperationAction(ISD::FSQRT, MVT::v8f32, Legal);
    setOperationAction(ISD::FFLOOR, MVT::v8f32, Legal);
    setOperationAction(ISD::FCEIL, MVT::v8f32, Legal);
    setOperationAction(ISD::FTRUNC, MVT::v8f32, Legal);
    setOperationAction(ISD::FRINT, MVT::v8f32, Legal);
    setOperationAction(ISD::FNEARBYINT, MVT::v8f32, Legal);
    setOperationAction(ISD::FNEG, MVT::v8f32, Custom);
    setOperationAction(ISD::FABS, MVT::v8f32, Custom);

    setOperationAction(ISD::FADD, MVT::v4f64, Legal);
    setOperationAction(ISD::FSUB, MVT::v4f64, Legal);
    setOperationAction(ISD::FMUL, MVT::v4f64, Legal);
    setOperationAction(ISD::FDIV, MVT::v4f64, Legal);
    setOperationAction(ISD::FSQRT, MVT::v4f64, Legal);
    setOperationAction(ISD::FFLOOR, MVT::v4f64, Legal);
    setOperationAction(ISD::FCEIL, MVT::v4f64, Legal);
    setOperationAction(ISD::FTRUNC, MVT::v4f64, Legal);
    setOperationAction(ISD::FRINT, MVT::v4f64, Legal);
    setOperationAction(ISD::FNEARBYINT, MVT::v4f64, Legal);
    setOperationAction(ISD::FNEG, MVT::v4f64, Custom);
    setOperationAction(ISD::FABS, MVT::v4f64, Custom);

    // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
    // even though v8i16 is a legal type.
    setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Promote);
    setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Promote);
    setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);

    setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
    setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal);

    setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);

    for (MVT VT : MVT::fp_vector_valuetypes())
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);

    setOperationAction(ISD::SRL, MVT::v16i16, Custom);
    setOperationAction(ISD::SRL, MVT::v32i8, Custom);

    setOperationAction(ISD::SHL, MVT::v16i16, Custom);
    setOperationAction(ISD::SHL, MVT::v32i8, Custom);

    setOperationAction(ISD::SRA, MVT::v16i16, Custom);
    setOperationAction(ISD::SRA, MVT::v32i8, Custom);

    setOperationAction(ISD::SETCC, MVT::v32i8, Custom);
    setOperationAction(ISD::SETCC, MVT::v16i16, Custom);
    setOperationAction(ISD::SETCC, MVT::v8i32, Custom);
    setOperationAction(ISD::SETCC, MVT::v4i64, Custom);

    setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
    setOperationAction(ISD::SELECT, MVT::v8f32, Custom);

    setOperationAction(ISD::VSELECT, MVT::v4f64, Custom);
    setOperationAction(ISD::VSELECT, MVT::v4i64, Custom);
    setOperationAction(ISD::VSELECT, MVT::v8i32, Custom);
    setOperationAction(ISD::VSELECT, MVT::v8f32, Custom);

    setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v4i64, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v16i16, Custom);
    setOperationAction(ISD::ANY_EXTEND, MVT::v4i64, Custom);
    setOperationAction(ISD::ANY_EXTEND, MVT::v8i32, Custom);
    setOperationAction(ISD::ANY_EXTEND, MVT::v16i16, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);

    if (Subtarget->hasFMA() || Subtarget->hasFMA4()) {
      setOperationAction(ISD::FMA, MVT::v8f32, Legal);
      setOperationAction(ISD::FMA, MVT::v4f64, Legal);
      setOperationAction(ISD::FMA, MVT::v4f32, Legal);
      setOperationAction(ISD::FMA, MVT::v2f64, Legal);
      setOperationAction(ISD::FMA, MVT::f32, Legal);
      setOperationAction(ISD::FMA, MVT::f64, Legal);
    }
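    // With FMA3 or FMA4 available, ISD::FMA nodes select directly to the
    // VFMADD* instruction families instead of a separate multiply and add.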
1275 if (Subtarget->hasInt256()) {
1276 setOperationAction(ISD::ADD, MVT::v4i64, Legal);
1277 setOperationAction(ISD::ADD, MVT::v8i32, Legal);
1278 setOperationAction(ISD::ADD, MVT::v16i16, Legal);
1279 setOperationAction(ISD::ADD, MVT::v32i8, Legal);
1281 setOperationAction(ISD::SUB, MVT::v4i64, Legal);
1282 setOperationAction(ISD::SUB, MVT::v8i32, Legal);
1283 setOperationAction(ISD::SUB, MVT::v16i16, Legal);
1284 setOperationAction(ISD::SUB, MVT::v32i8, Legal);
1286 setOperationAction(ISD::MUL, MVT::v4i64, Custom);
1287 setOperationAction(ISD::MUL, MVT::v8i32, Legal);
1288 setOperationAction(ISD::MUL, MVT::v16i16, Legal);
1289 // Don't lower v32i8 because there is no 128-bit byte mul
1291 setOperationAction(ISD::UMUL_LOHI, MVT::v8i32, Custom);
1292 setOperationAction(ISD::SMUL_LOHI, MVT::v8i32, Custom);
1293 setOperationAction(ISD::MULHU, MVT::v16i16, Legal);
1294 setOperationAction(ISD::MULHS, MVT::v16i16, Legal);
1296 setOperationAction(ISD::VSELECT, MVT::v16i16, Custom);
1297 setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
1299 // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1300 // when we have a 256-bit-wide blend with immediate.
1301 setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
1303 // Only provide customized ctpop vector bit twiddling for vector types we
1304 // know to perform better than using the popcnt instructions on each
1305 // vector element. If popcnt isn't supported, always provide the custom
1306 // version.
1307 if (!Subtarget->hasPOPCNT())
1308 setOperationAction(ISD::CTPOP, MVT::v4i64, Custom);
1310 // Custom CTPOP always performs better on natively supported v8i32
1311 setOperationAction(ISD::CTPOP, MVT::v8i32, Custom);
1313 // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1314 setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
1315 setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32, MVT::v8i8, Legal);
1316 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i8, Legal);
1317 setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32, MVT::v8i16, Legal);
1318 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i16, Legal);
1319 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i32, Legal);
1321 setLoadExtAction(ISD::ZEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
1322 setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32, MVT::v8i8, Legal);
1323 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i8, Legal);
1324 setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32, MVT::v8i16, Legal);
1325 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i16, Legal);
1326 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i32, Legal);
1327 } else {
1328 setOperationAction(ISD::ADD, MVT::v4i64, Custom);
1329 setOperationAction(ISD::ADD, MVT::v8i32, Custom);
1330 setOperationAction(ISD::ADD, MVT::v16i16, Custom);
1331 setOperationAction(ISD::ADD, MVT::v32i8, Custom);
1333 setOperationAction(ISD::SUB, MVT::v4i64, Custom);
1334 setOperationAction(ISD::SUB, MVT::v8i32, Custom);
1335 setOperationAction(ISD::SUB, MVT::v16i16, Custom);
1336 setOperationAction(ISD::SUB, MVT::v32i8, Custom);
1338 setOperationAction(ISD::MUL, MVT::v4i64, Custom);
1339 setOperationAction(ISD::MUL, MVT::v8i32, Custom);
1340 setOperationAction(ISD::MUL, MVT::v16i16, Custom);
1341 // Don't lower v32i8 because there is no 128-bit byte mul
1342 }
1344 // In the customized shift lowering, the legal cases in AVX2 will be
1345 // recognized.
1346 setOperationAction(ISD::SRL, MVT::v4i64, Custom);
1347 setOperationAction(ISD::SRL, MVT::v8i32, Custom);
1349 setOperationAction(ISD::SHL, MVT::v4i64, Custom);
1350 setOperationAction(ISD::SHL, MVT::v8i32, Custom);
1352 setOperationAction(ISD::SRA, MVT::v8i32, Custom);
1354 // Custom lower several nodes for 256-bit types.
1355 for (MVT VT : MVT::vector_valuetypes()) {
1356 if (VT.getScalarSizeInBits() >= 32) {
1357 setOperationAction(ISD::MLOAD, VT, Legal);
1358 setOperationAction(ISD::MSTORE, VT, Legal);
1359 }
1360 // Extract subvector is special because the value type
1361 // (result) is 128-bit but the source is 256-bit wide.
1362 if (VT.is128BitVector()) {
1363 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1364 }
1365 // Do not attempt to custom lower other non-256-bit vectors
1366 if (!VT.is256BitVector())
1367 continue;
1369 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1370 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1371 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1372 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1373 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1374 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1375 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1376 }
1378 // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
1379 for (int i = MVT::v32i8; i != MVT::v4i64; ++i) {
1380 MVT VT = (MVT::SimpleValueType)i;
1382 // Do not attempt to promote non-256-bit vectors
1383 if (!VT.is256BitVector())
1384 continue;
1386 setOperationAction(ISD::AND, VT, Promote);
1387 AddPromotedToType (ISD::AND, VT, MVT::v4i64);
1388 setOperationAction(ISD::OR, VT, Promote);
1389 AddPromotedToType (ISD::OR, VT, MVT::v4i64);
1390 setOperationAction(ISD::XOR, VT, Promote);
1391 AddPromotedToType (ISD::XOR, VT, MVT::v4i64);
1392 setOperationAction(ISD::LOAD, VT, Promote);
1393 AddPromotedToType (ISD::LOAD, VT, MVT::v4i64);
1394 setOperationAction(ISD::SELECT, VT, Promote);
1395 AddPromotedToType (ISD::SELECT, VT, MVT::v4i64);
1396 }
1397 }
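// The promotion above is sound because AND/OR/XOR are bitwise and
// LOAD/SELECT merely move bits: operands are bitcast to v4i64, the
// operation runs there, and the result is bitcast back unchanged.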
1399 if (!TM.Options.UseSoftFloat && Subtarget->hasAVX512()) {
1400 addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1401 addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1402 addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
1403 addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
1405 addRegisterClass(MVT::i1, &X86::VK1RegClass);
1406 addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
1407 addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
1409 for (MVT VT : MVT::fp_vector_valuetypes())
1410 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);
1412 setOperationAction(ISD::BR_CC, MVT::i1, Expand);
1413 setOperationAction(ISD::SETCC, MVT::i1, Custom);
1414 setOperationAction(ISD::XOR, MVT::i1, Legal);
1415 setOperationAction(ISD::OR, MVT::i1, Legal);
1416 setOperationAction(ISD::AND, MVT::i1, Legal);
1417 setOperationAction(ISD::LOAD, MVT::v16f32, Legal);
1418 setOperationAction(ISD::LOAD, MVT::v8f64, Legal);
1419 setOperationAction(ISD::LOAD, MVT::v8i64, Legal);
1420 setOperationAction(ISD::LOAD, MVT::v16i32, Legal);
1421 setOperationAction(ISD::LOAD, MVT::v16i1, Legal);
1423 setOperationAction(ISD::FADD, MVT::v16f32, Legal);
1424 setOperationAction(ISD::FSUB, MVT::v16f32, Legal);
1425 setOperationAction(ISD::FMUL, MVT::v16f32, Legal);
1426 setOperationAction(ISD::FDIV, MVT::v16f32, Legal);
1427 setOperationAction(ISD::FSQRT, MVT::v16f32, Legal);
1428 setOperationAction(ISD::FNEG, MVT::v16f32, Custom);
1430 setOperationAction(ISD::FADD, MVT::v8f64, Legal);
1431 setOperationAction(ISD::FSUB, MVT::v8f64, Legal);
1432 setOperationAction(ISD::FMUL, MVT::v8f64, Legal);
1433 setOperationAction(ISD::FDIV, MVT::v8f64, Legal);
1434 setOperationAction(ISD::FSQRT, MVT::v8f64, Legal);
1435 setOperationAction(ISD::FNEG, MVT::v8f64, Custom);
1436 setOperationAction(ISD::FMA, MVT::v8f64, Legal);
1437 setOperationAction(ISD::FMA, MVT::v16f32, Legal);
1439 setOperationAction(ISD::FP_TO_SINT, MVT::i32, Legal);
1440 setOperationAction(ISD::FP_TO_UINT, MVT::i32, Legal);
1441 setOperationAction(ISD::SINT_TO_FP, MVT::i32, Legal);
1442 setOperationAction(ISD::UINT_TO_FP, MVT::i32, Legal);
1443 if (Subtarget->is64Bit()) {
1444 setOperationAction(ISD::FP_TO_UINT, MVT::i64, Legal);
1445 setOperationAction(ISD::FP_TO_SINT, MVT::i64, Legal);
1446 setOperationAction(ISD::SINT_TO_FP, MVT::i64, Legal);
1447 setOperationAction(ISD::UINT_TO_FP, MVT::i64, Legal);
1448 }
1449 setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
1450 setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
1451 setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
1452 setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
1453 setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
1454 setOperationAction(ISD::SINT_TO_FP, MVT::v8i1, Custom);
1455 setOperationAction(ISD::SINT_TO_FP, MVT::v16i1, Custom);
1456 setOperationAction(ISD::SINT_TO_FP, MVT::v16i8, Promote);
1457 setOperationAction(ISD::SINT_TO_FP, MVT::v16i16, Promote);
1458 setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);
1459 setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
1460 setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
1461 setOperationAction(ISD::FP_ROUND, MVT::v8f32, Legal);
1462 setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal);
1464 setOperationAction(ISD::TRUNCATE, MVT::i1, Custom);
1465 setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
1466 setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
1467 setOperationAction(ISD::TRUNCATE, MVT::v8i1, Custom);
1468 setOperationAction(ISD::TRUNCATE, MVT::v16i1, Custom);
1469 setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom);
1470 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
1471 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
1472 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
1473 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
1474 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i8, Custom);
1475 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i16, Custom);
1476 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom);
1478 setOperationAction(ISD::FFLOOR, MVT::v16f32, Legal);
1479 setOperationAction(ISD::FFLOOR, MVT::v8f64, Legal);
1480 setOperationAction(ISD::FCEIL, MVT::v16f32, Legal);
1481 setOperationAction(ISD::FCEIL, MVT::v8f64, Legal);
1482 setOperationAction(ISD::FTRUNC, MVT::v16f32, Legal);
1483 setOperationAction(ISD::FTRUNC, MVT::v8f64, Legal);
1484 setOperationAction(ISD::FRINT, MVT::v16f32, Legal);
1485 setOperationAction(ISD::FRINT, MVT::v8f64, Legal);
1486 setOperationAction(ISD::FNEARBYINT, MVT::v16f32, Legal);
1487 setOperationAction(ISD::FNEARBYINT, MVT::v8f64, Legal);
1489 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom);
1490 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom);
1491 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom);
1492 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom);
1493 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom);
1494 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Legal);
1496 setOperationAction(ISD::SETCC, MVT::v16i1, Custom);
1497 setOperationAction(ISD::SETCC, MVT::v8i1, Custom);
1499 setOperationAction(ISD::MUL, MVT::v8i64, Custom);
1501 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1, Custom);
1502 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom);
1503 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i1, Custom);
1504 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i1, Custom);
1505 setOperationAction(ISD::BUILD_VECTOR, MVT::v8i1, Custom);
1506 setOperationAction(ISD::BUILD_VECTOR, MVT::v16i1, Custom);
1507 setOperationAction(ISD::SELECT, MVT::v8f64, Custom);
1508 setOperationAction(ISD::SELECT, MVT::v8i64, Custom);
1509 setOperationAction(ISD::SELECT, MVT::v16f32, Custom);
1511 setOperationAction(ISD::ADD, MVT::v8i64, Legal);
1512 setOperationAction(ISD::ADD, MVT::v16i32, Legal);
1514 setOperationAction(ISD::SUB, MVT::v8i64, Legal);
1515 setOperationAction(ISD::SUB, MVT::v16i32, Legal);
1517 setOperationAction(ISD::MUL, MVT::v16i32, Legal);
1519 setOperationAction(ISD::SRL, MVT::v8i64, Custom);
1520 setOperationAction(ISD::SRL, MVT::v16i32, Custom);
1522 setOperationAction(ISD::SHL, MVT::v8i64, Custom);
1523 setOperationAction(ISD::SHL, MVT::v16i32, Custom);
1525 setOperationAction(ISD::SRA, MVT::v8i64, Custom);
1526 setOperationAction(ISD::SRA, MVT::v16i32, Custom);
1528 setOperationAction(ISD::AND, MVT::v8i64, Legal);
1529 setOperationAction(ISD::OR, MVT::v8i64, Legal);
1530 setOperationAction(ISD::XOR, MVT::v8i64, Legal);
1531 setOperationAction(ISD::AND, MVT::v16i32, Legal);
1532 setOperationAction(ISD::OR, MVT::v16i32, Legal);
1533 setOperationAction(ISD::XOR, MVT::v16i32, Legal);
1535 if (Subtarget->hasCDI()) {
1536 setOperationAction(ISD::CTLZ, MVT::v8i64, Legal);
1537 setOperationAction(ISD::CTLZ, MVT::v16i32, Legal);
1538 }
1540 // Custom lower several nodes.
1541 for (MVT VT : MVT::vector_valuetypes()) {
1542 unsigned EltSize = VT.getVectorElementType().getSizeInBits();
1543 // Extract subvector is special because the value type
1544 // (result) is 256/128-bit but the source is 512-bit wide.
1545 if (VT.is128BitVector() || VT.is256BitVector()) {
1546 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1547 }
1548 if (VT.getVectorElementType() == MVT::i1)
1549 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1551 // Do not attempt to custom lower other non-512-bit vectors
1552 if (!VT.is512BitVector())
1553 continue;
1555 if (EltSize >= 32) {
1556 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1557 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1558 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1559 setOperationAction(ISD::VSELECT, VT, Legal);
1560 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1561 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1562 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1563 setOperationAction(ISD::MLOAD, VT, Legal);
1564 setOperationAction(ISD::MSTORE, VT, Legal);
1565 }
1566 }
1567 for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
1568 MVT VT = (MVT::SimpleValueType)i;
1570 // Do not attempt to promote non-512-bit vectors.
1571 if (!VT.is512BitVector())
1572 continue;
1574 setOperationAction(ISD::SELECT, VT, Promote);
1575 AddPromotedToType (ISD::SELECT, VT, MVT::v8i64);
1576 }
1577 }
1579 if (!TM.Options.UseSoftFloat && Subtarget->hasBWI()) {
1580 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1581 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1583 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
1584 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
1586 setOperationAction(ISD::LOAD, MVT::v32i16, Legal);
1587 setOperationAction(ISD::LOAD, MVT::v64i8, Legal);
1588 setOperationAction(ISD::SETCC, MVT::v32i1, Custom);
1589 setOperationAction(ISD::SETCC, MVT::v64i1, Custom);
1590 setOperationAction(ISD::ADD, MVT::v32i16, Legal);
1591 setOperationAction(ISD::ADD, MVT::v64i8, Legal);
1592 setOperationAction(ISD::SUB, MVT::v32i16, Legal);
1593 setOperationAction(ISD::SUB, MVT::v64i8, Legal);
1594 setOperationAction(ISD::MUL, MVT::v32i16, Legal);
1596 for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
1597 const MVT VT = (MVT::SimpleValueType)i;
1599 const unsigned EltSize = VT.getVectorElementType().getSizeInBits();
1601 // Do not attempt to promote non-512-bit vectors.
1602 if (!VT.is512BitVector())
1603 continue;
1605 if (EltSize < 32) {
1606 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1607 setOperationAction(ISD::VSELECT, VT, Legal);
1608 }
1609 }
1610 }
1612 if (!TM.Options.UseSoftFloat && Subtarget->hasVLX()) {
1613 addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1614 addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1616 setOperationAction(ISD::SETCC, MVT::v4i1, Custom);
1617 setOperationAction(ISD::SETCC, MVT::v2i1, Custom);
1618 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Legal);
1620 setOperationAction(ISD::AND, MVT::v8i32, Legal);
1621 setOperationAction(ISD::OR, MVT::v8i32, Legal);
1622 setOperationAction(ISD::XOR, MVT::v8i32, Legal);
1623 setOperationAction(ISD::AND, MVT::v4i32, Legal);
1624 setOperationAction(ISD::OR, MVT::v4i32, Legal);
1625 setOperationAction(ISD::XOR, MVT::v4i32, Legal);
1626 }
1628 // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion
1629 // of this type with custom code.
1630 for (MVT VT : MVT::vector_valuetypes())
1631 setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Custom);
1633 // We want to custom lower some of our intrinsics.
1634 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
1635 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
1636 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
1637 if (!Subtarget->is64Bit())
1638 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
1640 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
1641 // handle type legalization for these operations here.
1643 // FIXME: We really should do custom legalization for addition and
1644 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
1645 // than generic legalization for 64-bit multiplication-with-overflow, though.
1646 for (unsigned i = 0, e = 3+Subtarget->is64Bit(); i != e; ++i) {
1647 // Add/Sub/Mul with overflow operations are custom lowered.
1648 MVT VT = IntVTs[i];
1649 setOperationAction(ISD::SADDO, VT, Custom);
1650 setOperationAction(ISD::UADDO, VT, Custom);
1651 setOperationAction(ISD::SSUBO, VT, Custom);
1652 setOperationAction(ISD::USUBO, VT, Custom);
1653 setOperationAction(ISD::SMULO, VT, Custom);
1654 setOperationAction(ISD::UMULO, VT, Custom);
1655 }
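// For example, @llvm.sadd.with.overflow.i32 becomes an ADD that sets
// EFLAGS plus a SETO to materialize the overflow bit, instead of the
// generic expanded compare sequence.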
1658 if (!Subtarget->is64Bit()) {
1659 // These libcalls are not available in 32-bit.
1660 setLibcallName(RTLIB::SHL_I128, nullptr);
1661 setLibcallName(RTLIB::SRL_I128, nullptr);
1662 setLibcallName(RTLIB::SRA_I128, nullptr);
1663 }
1665 // Combine sin / cos into one node or libcall if possible.
1666 if (Subtarget->hasSinCos()) {
1667 setLibcallName(RTLIB::SINCOS_F32, "sincosf");
1668 setLibcallName(RTLIB::SINCOS_F64, "sincos");
1669 if (Subtarget->isTargetDarwin()) {
1670 // For MacOSX, we don't want the normal expansion of a libcall to sincos.
1671 // We want to issue a libcall to __sincos_stret to avoid memory traffic.
1672 setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
1673 setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
1674 }
1675 }
1677 if (Subtarget->isTargetWin64()) {
1678 setOperationAction(ISD::SDIV, MVT::i128, Custom);
1679 setOperationAction(ISD::UDIV, MVT::i128, Custom);
1680 setOperationAction(ISD::SREM, MVT::i128, Custom);
1681 setOperationAction(ISD::UREM, MVT::i128, Custom);
1682 setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
1683 setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
1684 }
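// There is no hardware i128 divide, and Win64 passes i128 indirectly, so
// these are custom lowered to the corresponding runtime calls (e.g.
// __divti3 for SDIV) with the ABI-mandated by-pointer arguments.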
1686 // We have target-specific dag combine patterns for the following nodes:
1687 setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
1688 setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
1689 setTargetDAGCombine(ISD::BITCAST);
1690 setTargetDAGCombine(ISD::VSELECT);
1691 setTargetDAGCombine(ISD::SELECT);
1692 setTargetDAGCombine(ISD::SHL);
1693 setTargetDAGCombine(ISD::SRA);
1694 setTargetDAGCombine(ISD::SRL);
1695 setTargetDAGCombine(ISD::OR);
1696 setTargetDAGCombine(ISD::AND);
1697 setTargetDAGCombine(ISD::ADD);
1698 setTargetDAGCombine(ISD::FADD);
1699 setTargetDAGCombine(ISD::FSUB);
1700 setTargetDAGCombine(ISD::FMA);
1701 setTargetDAGCombine(ISD::SUB);
1702 setTargetDAGCombine(ISD::LOAD);
1703 setTargetDAGCombine(ISD::MLOAD);
1704 setTargetDAGCombine(ISD::STORE);
1705 setTargetDAGCombine(ISD::MSTORE);
1706 setTargetDAGCombine(ISD::ZERO_EXTEND);
1707 setTargetDAGCombine(ISD::ANY_EXTEND);
1708 setTargetDAGCombine(ISD::SIGN_EXTEND);
1709 setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
1710 setTargetDAGCombine(ISD::TRUNCATE);
1711 setTargetDAGCombine(ISD::SINT_TO_FP);
1712 setTargetDAGCombine(ISD::SETCC);
1713 setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
1714 setTargetDAGCombine(ISD::BUILD_VECTOR);
1715 setTargetDAGCombine(ISD::MUL);
1716 setTargetDAGCombine(ISD::XOR);
1718 computeRegisterProperties();
1720 // On Darwin, -Os means optimize for size without hurting performance,
1721 // so do not reduce the limit.
1722 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
1723 MaxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8;
1724 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
1725 MaxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
1726 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
1727 MaxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
1728 setPrefLoopAlignment(4); // 2^4 bytes.
1730 // Predictable cmovs don't hurt on Atom because it's in-order.
1731 PredictableSelectIsExpensive = !Subtarget->isAtom();
1732 EnableExtLdPromotion = true;
1733 setPrefFunctionAlignment(4); // 2^4 bytes.
1735 verifyIntrinsicTables();
1736 }
1738 // This has so far only been implemented for 64-bit MachO.
1739 bool X86TargetLowering::useLoadStackGuardNode() const {
1740 return Subtarget->isTargetMachO() && Subtarget->is64Bit();
1741 }
1743 TargetLoweringBase::LegalizeTypeAction
1744 X86TargetLowering::getPreferredVectorAction(EVT VT) const {
1745 if (ExperimentalVectorWideningLegalization &&
1746 VT.getVectorNumElements() != 1 &&
1747 VT.getVectorElementType().getSimpleVT() != MVT::i1)
1748 return TypeWidenVector;
1750 return TargetLoweringBase::getPreferredVectorAction(VT);
1751 }
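// Illustrative example (under the flag's stated policy, not an exhaustive
// rule): an illegal v4i8 would be widened into the low lanes of a legal
// v16i8 instead of having each element promoted to a wider integer.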
1753 EVT X86TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
1754 if (!VT.isVector())
1755 return Subtarget->hasAVX512() ? MVT::i1 : MVT::i8;
1757 const unsigned NumElts = VT.getVectorNumElements();
1758 const EVT EltVT = VT.getVectorElementType();
1759 if (VT.is512BitVector()) {
1760 if (Subtarget->hasAVX512())
1761 if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
1762 EltVT == MVT::f32 || EltVT == MVT::f64)
1763 switch (NumElts) {
1764 case 8: return MVT::v8i1;
1765 case 16: return MVT::v16i1;
1766 }
1767 if (Subtarget->hasBWI())
1768 if (EltVT == MVT::i8 || EltVT == MVT::i16)
1769 switch (NumElts) {
1770 case 32: return MVT::v32i1;
1771 case 64: return MVT::v64i1;
1772 }
1773 }
1775 if (VT.is256BitVector() || VT.is128BitVector()) {
1776 if (Subtarget->hasVLX())
1777 if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
1778 EltVT == MVT::f32 || EltVT == MVT::f64)
1779 switch (NumElts) {
1780 case 2: return MVT::v2i1;
1781 case 4: return MVT::v4i1;
1782 case 8: return MVT::v8i1;
1783 }
1784 if (Subtarget->hasBWI() && Subtarget->hasVLX())
1785 if (EltVT == MVT::i8 || EltVT == MVT::i16)
1786 switch (NumElts) {
1787 case 8: return MVT::v8i1;
1788 case 16: return MVT::v16i1;
1789 case 32: return MVT::v32i1;
1790 }
1791 }
1793 return VT.changeVectorElementTypeToInteger();
1794 }
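// For example, a SETCC on v16i32 with AVX512 yields a v16i1 mask in a
// k-register, while without mask registers a v4i32 compare yields a v4i32
// of all-ones/all-zeros lanes via changeVectorElementTypeToInteger().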
1796 /// Helper for getByValTypeAlignment to determine
1797 /// the desired ByVal argument alignment.
1798 static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
1799 if (MaxAlign == 16)
1800 return;
1801 if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
1802 if (VTy->getBitWidth() == 128)
1803 MaxAlign = 16;
1804 } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
1805 unsigned EltAlign = 0;
1806 getMaxByValAlign(ATy->getElementType(), EltAlign);
1807 if (EltAlign > MaxAlign)
1808 MaxAlign = EltAlign;
1809 } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
1810 for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
1811 unsigned EltAlign = 0;
1812 getMaxByValAlign(STy->getElementType(i), EltAlign);
1813 if (EltAlign > MaxAlign)
1814 MaxAlign = EltAlign;
1815 if (MaxAlign == 16)
1816 break;
1817 }
1818 }
1819 }
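// For example, struct { <4 x float> v; int i; } yields 16 because it
// contains a 128-bit vector, while struct { int a; int b; } keeps the
// 4-byte default.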
1821 /// Return the desired alignment for ByVal aggregate
1822 /// function arguments in the caller parameter area. For X86, aggregates
1823 /// that contain SSE vectors are placed at 16-byte boundaries while the rest
1824 /// are at 4-byte boundaries.
1825 unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const {
1826 if (Subtarget->is64Bit()) {
1827 // Max of 8 and alignment of type.
1828 unsigned TyAlign = TD->getABITypeAlignment(Ty);
1829 if (TyAlign > 8)
1830 return TyAlign;
1831 return 8;
1832 }
1834 unsigned Align = 4;
1835 if (Subtarget->hasSSE1())
1836 getMaxByValAlign(Ty, Align);
1837 return Align;
1838 }
1840 /// Returns the target specific optimal type for load
1841 /// and store operations as a result of memset, memcpy, and memmove
1842 /// lowering. If DstAlign is zero, the destination
1843 /// alignment can satisfy any constraint. Similarly, if SrcAlign is zero
1844 /// there is no need to check it against the alignment requirement,
1845 /// probably because the source does not need to be loaded. If 'IsMemset' is
1846 /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
1847 /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
1848 /// source is constant so it does not need to be loaded.
1849 /// It returns EVT::Other if the type should be determined using generic
1850 /// target-independent logic.
1851 EVT
1852 X86TargetLowering::getOptimalMemOpType(uint64_t Size,
1853 unsigned DstAlign, unsigned SrcAlign,
1854 bool IsMemset, bool ZeroMemset,
1855 bool MemcpyStrSrc,
1856 MachineFunction &MF) const {
1857 const Function *F = MF.getFunction();
1858 if ((!IsMemset || ZeroMemset) &&
1859 !F->hasFnAttribute(Attribute::NoImplicitFloat)) {
1860 if (Size >= 16 &&
1861 (Subtarget->isUnalignedMemAccessFast() ||
1862 ((DstAlign == 0 || DstAlign >= 16) &&
1863 (SrcAlign == 0 || SrcAlign >= 16)))) {
1864 if (Size >= 32) {
1865 if (Subtarget->hasInt256())
1866 return MVT::v8i32;
1867 if (Subtarget->hasFp256())
1868 return MVT::v8f32;
1869 }
1870 if (Subtarget->hasSSE2())
1871 return MVT::v4i32;
1872 if (Subtarget->hasSSE1())
1873 return MVT::v4f32;
1874 } else if (!MemcpyStrSrc && Size >= 8 &&
1875 !Subtarget->is64Bit() &&
1876 Subtarget->hasSSE2()) {
1877 // Do not use f64 to lower memcpy if source is string constant. It's
1878 // better to use i32 to avoid the loads.
1879 return MVT::f64;
1880 }
1881 }
1882 if (Subtarget->is64Bit() && Size >= 8)
1883 return MVT::i64;
1884 return MVT::i32;
1885 }
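// For example, a 32-byte zeroing memset on an AVX2 target is done with
// 256-bit v8i32 stores, while a small copy on 32-bit x86 without SSE2
// falls through to the scalar i32 case at the bottom.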
1887 bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
1888 if (VT == MVT::f32)
1889 return X86ScalarSSEf32;
1890 else if (VT == MVT::f64)
1891 return X86ScalarSSEf64;
1892 return true;
1893 }
1895 bool
1896 X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
1897 unsigned,
1898 unsigned,
1899 bool *Fast) const {
1900 if (Fast)
1901 *Fast = Subtarget->isUnalignedMemAccessFast();
1902 return true;
1903 }
1905 /// Return the entry encoding for a jump table in the
1906 /// current function. The returned value is a member of the
1907 /// MachineJumpTableInfo::JTEntryKind enum.
1908 unsigned X86TargetLowering::getJumpTableEncoding() const {
1909 // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
1910 // symbol.
1911 if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
1912 Subtarget->isPICStyleGOT())
1913 return MachineJumpTableInfo::EK_Custom32;
1915 // Otherwise, use the normal jump table encoding heuristics.
1916 return TargetLowering::getJumpTableEncoding();
1917 }
1919 const MCExpr *
1920 X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
1921 const MachineBasicBlock *MBB,
1922 unsigned uid, MCContext &Ctx) const {
1923 assert(MBB->getParent()->getTarget().getRelocationModel() == Reloc::PIC_ &&
1924 Subtarget->isPICStyleGOT());
1925 // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
1926 // entries.
1927 return MCSymbolRefExpr::Create(MBB->getSymbol(),
1928 MCSymbolRefExpr::VK_GOTOFF, Ctx);
1929 }
1931 /// Returns relocation base for the given PIC jumptable.
1932 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
1933 SelectionDAG &DAG) const {
1934 if (!Subtarget->is64Bit())
1935 // This doesn't have SDLoc associated with it, but is not really the
1936 // same as a Register.
1937 return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy());
1938 return Table;
1939 }
1941 /// This returns the relocation base for the given PIC jumptable,
1942 /// the same as getPICJumpTableRelocBase, but as an MCExpr.
1943 const MCExpr *X86TargetLowering::
1944 getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
1945 MCContext &Ctx) const {
1946 // X86-64 uses RIP relative addressing based on the jump table label.
1947 if (Subtarget->isPICStyleRIPRel())
1948 return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
1950 // Otherwise, the reference is relative to the PIC base.
1951 return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx);
1952 }
1954 // FIXME: Why is this routine here? Move it to RegInfo!
1955 std::pair<const TargetRegisterClass*, uint8_t>
1956 X86TargetLowering::findRepresentativeClass(MVT VT) const {
1957 const TargetRegisterClass *RRC = nullptr;
1958 uint8_t Cost = 1;
1959 switch (VT.SimpleTy) {
1960 default:
1961 return TargetLowering::findRepresentativeClass(VT);
1962 case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
1963 RRC = Subtarget->is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
1964 break;
1965 case MVT::x86mmx:
1966 RRC = &X86::VR64RegClass;
1967 break;
1968 case MVT::f32: case MVT::f64:
1969 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
1970 case MVT::v4f32: case MVT::v2f64:
1971 case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32:
1972 case MVT::v4f64:
1973 RRC = &X86::VR128RegClass;
1974 break;
1975 }
1976 return std::make_pair(RRC, Cost);
1977 }
1979 bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
1980 unsigned &Offset) const {
1981 if (!Subtarget->isTargetLinux())
1982 return false;
1984 if (Subtarget->is64Bit()) {
1985 // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs:
1986 Offset = 0x28;
1987 if (getTargetMachine().getCodeModel() == CodeModel::Kernel)
1988 AddressSpace = 256;
1989 else
1990 AddressSpace = 257;
1991 } else {
1992 // %gs:0x14 on i386
1993 Offset = 0x14;
1994 AddressSpace = 256;
1995 }
1996 return true;
1997 }
1999 bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
2000 unsigned DestAS) const {
2001 assert(SrcAS != DestAS && "Expected different address spaces!");
2003 return SrcAS < 256 && DestAS < 256;
2004 }
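// Address spaces 256 and up are special on x86 (256 is GS-relative and
// 257 is FS-relative), so a cast is a no-op only when neither side is a
// segment-relative address space.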
2006 //===----------------------------------------------------------------------===//
2007 // Return Value Calling Convention Implementation
2008 //===----------------------------------------------------------------------===//
2010 #include "X86GenCallingConv.inc"
2012 bool
2013 X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv,
2014 MachineFunction &MF, bool isVarArg,
2015 const SmallVectorImpl<ISD::OutputArg> &Outs,
2016 LLVMContext &Context) const {
2017 SmallVector<CCValAssign, 16> RVLocs;
2018 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
2019 return CCInfo.CheckReturn(Outs, RetCC_X86);
2020 }
2022 const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
2023 static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
2024 return ScratchRegs;
2025 }
2027 SDValue
2028 X86TargetLowering::LowerReturn(SDValue Chain,
2029 CallingConv::ID CallConv, bool isVarArg,
2030 const SmallVectorImpl<ISD::OutputArg> &Outs,
2031 const SmallVectorImpl<SDValue> &OutVals,
2032 SDLoc dl, SelectionDAG &DAG) const {
2033 MachineFunction &MF = DAG.getMachineFunction();
2034 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2036 SmallVector<CCValAssign, 16> RVLocs;
2037 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
2038 CCInfo.AnalyzeReturn(Outs, RetCC_X86);
2040 SDValue Flag;
2041 SmallVector<SDValue, 6> RetOps;
2042 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2043 // Operand #1 = Bytes To Pop
2044 RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(),
2045 MVT::i16));
2047 // Copy the result values into the output registers.
2048 for (unsigned i = 0; i != RVLocs.size(); ++i) {
2049 CCValAssign &VA = RVLocs[i];
2050 assert(VA.isRegLoc() && "Can only return in registers!");
2051 SDValue ValToCopy = OutVals[i];
2052 EVT ValVT = ValToCopy.getValueType();
2054 // Promote values to the appropriate types.
2055 if (VA.getLocInfo() == CCValAssign::SExt)
2056 ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
2057 else if (VA.getLocInfo() == CCValAssign::ZExt)
2058 ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
2059 else if (VA.getLocInfo() == CCValAssign::AExt)
2060 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
2061 else if (VA.getLocInfo() == CCValAssign::BCvt)
2062 ValToCopy = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), ValToCopy);
2064 assert(VA.getLocInfo() != CCValAssign::FPExt &&
2065 "Unexpected FP-extend for return value.");
2067 // If this is x86-64, and we disabled SSE, we can't return FP values,
2068 // or SSE or MMX vectors.
2069 if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
2070 VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
2071 (Subtarget->is64Bit() && !Subtarget->hasSSE1())) {
2072 report_fatal_error("SSE register return with SSE disabled");
2073 }
2074 // Likewise we can't return F64 values with SSE1 only. gcc does so, but
2075 // llvm-gcc has never done it right and no one has noticed, so this
2076 // should be OK for now.
2077 if (ValVT == MVT::f64 &&
2078 (Subtarget->is64Bit() && !Subtarget->hasSSE2()))
2079 report_fatal_error("SSE2 register return with SSE2 disabled");
2081 // Returns in ST0/ST1 are handled specially: these are pushed as operands to
2082 // the RET instruction and handled by the FP Stackifier.
2083 if (VA.getLocReg() == X86::FP0 ||
2084 VA.getLocReg() == X86::FP1) {
2085 // If this is a copy from an xmm register to ST(0), use an FPExtend to
2086 // change the value to the FP stack register class.
2087 if (isScalarFPTypeInSSEReg(VA.getValVT()))
2088 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
2089 RetOps.push_back(ValToCopy);
2090 // Don't emit a copytoreg.
2091 continue;
2092 }
2094 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
2095 // which is returned in RAX / RDX.
2096 if (Subtarget->is64Bit()) {
2097 if (ValVT == MVT::x86mmx) {
2098 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
2099 ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ValToCopy);
2100 ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
2101 ValToCopy);
2102 // If we don't have SSE2 available, convert to v4f32 so the generated
2103 // register is legal.
2104 if (!Subtarget->hasSSE2())
2105 ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, ValToCopy);
2106 }
2107 }
2108 }
2110 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
2111 Flag = Chain.getValue(1);
2112 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
2113 }
2115 // The x86-64 ABIs require that for returning structs by value we copy
2116 // the sret argument into %rax/%eax (depending on ABI) for the return.
2117 // Win32 requires us to put the sret argument to %eax as well.
2118 // We saved the argument into a virtual register in the entry block,
2119 // so now we copy the value out and into %rax/%eax.
2121 // Checking Function.hasStructRetAttr() here is insufficient because the IR
2122 // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
2123 // false, then an sret argument may be implicitly inserted in the SelDAG. In
2124 // either case FuncInfo->setSRetReturnReg() will have been called.
2125 if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
2126 assert((Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC()) &&
2127 "No need for an sret register");
2128 SDValue Val = DAG.getCopyFromReg(Chain, dl, SRetReg, getPointerTy());
2130 unsigned RetValReg
2131 = (Subtarget->is64Bit() && !Subtarget->isTarget64BitILP32()) ?
2132 X86::RAX : X86::EAX;
2133 Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
2134 Flag = Chain.getValue(1);
2136 // RAX/EAX now acts like a return value.
2137 RetOps.push_back(DAG.getRegister(RetValReg, getPointerTy()));
2138 }
2140 RetOps[0] = Chain; // Update chain.
2142 // Add the flag if we have it.
2143 if (Flag.getNode())
2144 RetOps.push_back(Flag);
2146 return DAG.getNode(X86ISD::RET_FLAG, dl, MVT::Other, RetOps);
2147 }
2149 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
2150 if (N->getNumValues() != 1)
2151 return false;
2152 if (!N->hasNUsesOfValue(1, 0))
2153 return false;
2155 SDValue TCChain = Chain;
2156 SDNode *Copy = *N->use_begin();
2157 if (Copy->getOpcode() == ISD::CopyToReg) {
2158 // If the copy has a glue operand, we conservatively assume it isn't safe to
2159 // perform a tail call.
2160 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
2161 return false;
2162 TCChain = Copy->getOperand(0);
2163 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
2164 return false;
2166 bool HasRet = false;
2167 for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
2168 UI != UE; ++UI) {
2169 if (UI->getOpcode() != X86ISD::RET_FLAG)
2170 return false;
2171 // If we are returning more than one value, we can definitely
2172 // not make a tail call; see PR19530.
2173 if (UI->getNumOperands() > 4)
2174 return false;
2175 if (UI->getNumOperands() == 4 &&
2176 UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
2177 return false;
2178 HasRet = true;
2179 }
2181 if (!HasRet)
2182 return false;
2184 Chain = TCChain;
2185 return true;
2186 }
2188 EVT
2189 X86TargetLowering::getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT,
2190 ISD::NodeType ExtendKind) const {
2191 MVT ReturnMVT;
2192 // TODO: Is this also valid on 32-bit?
2193 if (Subtarget->is64Bit() && VT == MVT::i1 && ExtendKind == ISD::ZERO_EXTEND)
2194 ReturnMVT = MVT::i8;
2195 else
2196 ReturnMVT = MVT::i32;
2198 EVT MinVT = getRegisterType(Context, ReturnMVT);
2199 return VT.bitsLT(MinVT) ? MinVT : VT;
2200 }
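// Concretely: a zeroext i1 return on x86-64 only needs widening to i8
// (the bit lives in AL); everywhere else the result is widened to i32,
// which is what callers expect to read.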
2202 /// Lower the result values of a call into the
2203 /// appropriate copies out of appropriate physical registers.
2205 SDValue
2206 X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
2207 CallingConv::ID CallConv, bool isVarArg,
2208 const SmallVectorImpl<ISD::InputArg> &Ins,
2209 SDLoc dl, SelectionDAG &DAG,
2210 SmallVectorImpl<SDValue> &InVals) const {
2212 // Assign locations to each value returned by this call.
2213 SmallVector<CCValAssign, 16> RVLocs;
2214 bool Is64Bit = Subtarget->is64Bit();
2215 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2216 *DAG.getContext());
2217 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
2219 // Copy all of the result registers out of their specified physreg.
2220 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
2221 CCValAssign &VA = RVLocs[i];
2222 EVT CopyVT = VA.getValVT();
2224 // If this is x86-64, and we disabled SSE, we can't return FP values
2225 if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
2226 ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
2227 report_fatal_error("SSE register return with SSE disabled");
2228 }
2230 // If we prefer to use the value in xmm registers, copy it out as f80 and
2231 // use a truncate to move it from fp stack reg to xmm reg.
2232 if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
2233 isScalarFPTypeInSSEReg(VA.getValVT()))
2234 CopyVT = MVT::f80;
2236 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
2237 CopyVT, InFlag).getValue(1);
2238 SDValue Val = Chain.getValue(0);
2240 if (CopyVT != VA.getValVT())
2241 Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
2242 // This truncation won't change the value.
2243 DAG.getIntPtrConstant(1));
2245 InFlag = Chain.getValue(2);
2246 InVals.push_back(Val);
2247 }
2249 return Chain;
2250 }
2252 //===----------------------------------------------------------------------===//
2253 // C & StdCall & Fast Calling Convention implementation
2254 //===----------------------------------------------------------------------===//
2255 // The StdCall calling convention is standard for many Windows API
2256 // routines. It differs from the C calling convention in that the callee,
2257 // not the caller, cleans up the stack, and symbols are decorated
2258 // differently. It doesn't support any vector arguments.
2259 // For info on fast calling convention see Fast Calling Convention (tail call)
2260 // implementation LowerX86_32FastCCCallTo.
2262 /// CallIsStructReturn - Determines whether a call uses struct return
2263 /// semantics.
2264 enum StructReturnType {
2265 NotStructReturn,
2266 RegStructReturn,
2267 StackStructReturn
2268 };
2269 static StructReturnType
2270 callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
2271 if (Outs.empty())
2272 return NotStructReturn;
2274 const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
2275 if (!Flags.isSRet())
2276 return NotStructReturn;
2277 if (Flags.isInReg())
2278 return RegStructReturn;
2279 return StackStructReturn;
2280 }
2282 /// Determines whether a function uses struct return semantics.
2283 static StructReturnType
2284 argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
2285 if (Ins.empty())
2286 return NotStructReturn;
2288 const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
2289 if (!Flags.isSRet())
2290 return NotStructReturn;
2291 if (Flags.isInReg())
2292 return RegStructReturn;
2293 return StackStructReturn;
2294 }
2296 /// Make a copy of an aggregate at address specified by "Src" to address
2297 /// "Dst" with size and alignment information specified by the specific
2298 /// parameter attribute. The copy will be passed as a byval function parameter.
2299 static SDValue
2300 CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
2301 ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
2302 SDLoc dl) {
2303 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);
2305 return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
2306 /*isVolatile*/false, /*AlwaysInline=*/true,
2307 MachinePointerInfo(), MachinePointerInfo());
2308 }
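// AlwaysInline matters here: the byval copy is expanded inline, so
// lowering one call never has to emit a nested memcpy libcall in the
// middle of the call sequence being built.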
2310 /// Return true if the calling convention is one that
2311 /// supports tail call optimization.
2312 static bool IsTailCallConvention(CallingConv::ID CC) {
2313 return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
2314 CC == CallingConv::HiPE);
2315 }
2317 /// \brief Return true if the calling convention is a C calling convention.
2318 static bool IsCCallConvention(CallingConv::ID CC) {
2319 return (CC == CallingConv::C || CC == CallingConv::X86_64_Win64 ||
2320 CC == CallingConv::X86_64_SysV);
2321 }
2323 bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
2324 if (!CI->isTailCall() || getTargetMachine().Options.DisableTailCalls)
2325 return false;
2327 CallSite CS(CI);
2328 CallingConv::ID CalleeCC = CS.getCallingConv();
2329 if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
2330 return false;
2332 return true;
2333 }
2335 /// Return true if the function is being made into
2336 /// a tailcall target by changing its ABI.
2337 static bool FuncIsMadeTailCallSafe(CallingConv::ID CC,
2338 bool GuaranteedTailCallOpt) {
2339 return GuaranteedTailCallOpt && IsTailCallConvention(CC);
2340 }
2342 SDValue
2343 X86TargetLowering::LowerMemArgument(SDValue Chain,
2344 CallingConv::ID CallConv,
2345 const SmallVectorImpl<ISD::InputArg> &Ins,
2346 SDLoc dl, SelectionDAG &DAG,
2347 const CCValAssign &VA,
2348 MachineFrameInfo *MFI,
2349 unsigned i) const {
2350 // Create the nodes corresponding to a load from this parameter slot.
2351 ISD::ArgFlagsTy Flags = Ins[i].Flags;
2352 bool AlwaysUseMutable = FuncIsMadeTailCallSafe(
2353 CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
2354 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
2356 EVT ValVT;
2357 // If value is passed by pointer, we have the address passed instead of
2358 // the value itself.
2359 if (VA.getLocInfo() == CCValAssign::Indirect)
2360 ValVT = VA.getLocVT();
2362 ValVT = VA.getValVT();
2364 // FIXME: For now, all byval parameter objects are marked mutable. This can be
2365 // changed with more analysis.
2366 // In case of tail call optimization, mark all arguments mutable, since
2367 // they could be overwritten by lowering of arguments in case of a tail call.
2368 if (Flags.isByVal()) {
2369 unsigned Bytes = Flags.getByValSize();
2370 if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
2371 int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
2372 return DAG.getFrameIndex(FI, getPointerTy());
2373 } else {
2374 int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
2375 VA.getLocMemOffset(), isImmutable);
2376 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
2377 return DAG.getLoad(ValVT, dl, Chain, FIN,
2378 MachinePointerInfo::getFixedStack(FI),
2379 false, false, false, 0);
2380 }
2381 }
2383 // FIXME: Get this from tablegen.
2384 static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
2385 const X86Subtarget *Subtarget) {
2386 assert(Subtarget->is64Bit());
2388 if (Subtarget->isCallingConvWin64(CallConv)) {
2389 static const MCPhysReg GPR64ArgRegsWin64[] = {
2390 X86::RCX, X86::RDX, X86::R8, X86::R9
2392 return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
2393 }
2395 static const MCPhysReg GPR64ArgRegs64Bit[] = {
2396 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
2398 return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
2399 }
2401 // FIXME: Get this from tablegen.
2402 static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
2403 CallingConv::ID CallConv,
2404 const X86Subtarget *Subtarget) {
2405 assert(Subtarget->is64Bit());
2406 if (Subtarget->isCallingConvWin64(CallConv)) {
2407 // The XMM registers which might contain var arg parameters are shadowed
2408 // in their paired GPR. So we only need to save the GPR to their home
2409 // slots.
2410 // TODO: __vectorcall will change this.
2411 return None;
2412 }
2414 const Function *Fn = MF.getFunction();
2415 bool NoImplicitFloatOps = Fn->hasFnAttribute(Attribute::NoImplicitFloat);
2416 assert(!(MF.getTarget().Options.UseSoftFloat && NoImplicitFloatOps) &&
2417 "SSE register cannot be used when SSE is disabled!");
2418 if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps ||
2419 !Subtarget->hasSSE1())
2420 // Kernel mode asks for SSE to be disabled, so there are no XMM argument
2421 // registers.
2422 return None;
2424 static const MCPhysReg XMMArgRegs64Bit[] = {
2425 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2426 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2428 return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
2429 }
2431 SDValue
2432 X86TargetLowering::LowerFormalArguments(SDValue Chain,
2433 CallingConv::ID CallConv,
2434 bool isVarArg,
2435 const SmallVectorImpl<ISD::InputArg> &Ins,
2436 SDLoc dl,
2437 SelectionDAG &DAG,
2438 SmallVectorImpl<SDValue> &InVals)
2439 const {
2440 MachineFunction &MF = DAG.getMachineFunction();
2441 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2443 const Function* Fn = MF.getFunction();
2444 if (Fn->hasExternalLinkage() &&
2445 Subtarget->isTargetCygMing() &&
2446 Fn->getName() == "main")
2447 FuncInfo->setForceFramePointer(true);
2449 MachineFrameInfo *MFI = MF.getFrameInfo();
2450 bool Is64Bit = Subtarget->is64Bit();
2451 bool IsWin64 = Subtarget->isCallingConvWin64(CallConv);
2453 assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
2454 "Var args not supported with calling convention fastcc, ghc or hipe");
2456 // Assign locations to all of the incoming arguments.
2457 SmallVector<CCValAssign, 16> ArgLocs;
2458 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
2460 // Allocate shadow area for Win64
2461 if (IsWin64)
2462 CCInfo.AllocateStack(32, 8);
2464 CCInfo.AnalyzeFormalArguments(Ins, CC_X86);
2466 unsigned LastVal = ~0U;
2467 SDValue ArgValue;
2468 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2469 CCValAssign &VA = ArgLocs[i];
2470 // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
2471 // occurrences.
2472 assert(VA.getValNo() != LastVal &&
2473 "Don't support value assigned to multiple locs yet");
2475 LastVal = VA.getValNo();
2477 if (VA.isRegLoc()) {
2478 EVT RegVT = VA.getLocVT();
2479 const TargetRegisterClass *RC;
2480 if (RegVT == MVT::i32)
2481 RC = &X86::GR32RegClass;
2482 else if (Is64Bit && RegVT == MVT::i64)
2483 RC = &X86::GR64RegClass;
2484 else if (RegVT == MVT::f32)
2485 RC = &X86::FR32RegClass;
2486 else if (RegVT == MVT::f64)
2487 RC = &X86::FR64RegClass;
2488 else if (RegVT.is512BitVector())
2489 RC = &X86::VR512RegClass;
2490 else if (RegVT.is256BitVector())
2491 RC = &X86::VR256RegClass;
2492 else if (RegVT.is128BitVector())
2493 RC = &X86::VR128RegClass;
2494 else if (RegVT == MVT::x86mmx)
2495 RC = &X86::VR64RegClass;
2496 else if (RegVT == MVT::i1)
2497 RC = &X86::VK1RegClass;
2498 else if (RegVT == MVT::v8i1)
2499 RC = &X86::VK8RegClass;
2500 else if (RegVT == MVT::v16i1)
2501 RC = &X86::VK16RegClass;
2502 else if (RegVT == MVT::v32i1)
2503 RC = &X86::VK32RegClass;
2504 else if (RegVT == MVT::v64i1)
2505 RC = &X86::VK64RegClass;
2507 llvm_unreachable("Unknown argument type!");
2509 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
2510 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
2512 // If this is an 8 or 16-bit value, it is really passed promoted to 32
2513 // bits. Insert an assert[sz]ext to capture this, then truncate to the
2514 // right size.
2515 if (VA.getLocInfo() == CCValAssign::SExt)
2516 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
2517 DAG.getValueType(VA.getValVT()));
2518 else if (VA.getLocInfo() == CCValAssign::ZExt)
2519 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
2520 DAG.getValueType(VA.getValVT()));
2521 else if (VA.getLocInfo() == CCValAssign::BCvt)
2522 ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
2524 if (VA.isExtInLoc()) {
2525 // Handle MMX values passed in XMM regs.
2526 if (RegVT.isVector())
2527 ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
2528 else
2529 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
2530 }
2531 } else {
2532 assert(VA.isMemLoc());
2533 ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i);
2534 }
2536 // If value is passed via pointer - do a load.
2537 if (VA.getLocInfo() == CCValAssign::Indirect)
2538 ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue,
2539 MachinePointerInfo(), false, false, false, 0);
2541 InVals.push_back(ArgValue);
2542 }
2544 if (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC()) {
2545 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2546 // The x86-64 ABIs require that for returning structs by value we copy
2547 // the sret argument into %rax/%eax (depending on ABI) for the return.
2548 // Win32 requires us to put the sret argument to %eax as well.
2549 // Save the argument into a virtual register so that we can access it
2550 // from the return points.
2551 if (Ins[i].Flags.isSRet()) {
2552 unsigned Reg = FuncInfo->getSRetReturnReg();
2553 if (Reg == 0) {
2554 MVT PtrTy = getPointerTy();
2555 Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
2556 FuncInfo->setSRetReturnReg(Reg);
2557 }
2558 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[i]);
2559 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
2560 }
2561 }
2562 }
2565 unsigned StackSize = CCInfo.getNextStackOffset();
2566 // Align stack specially for tail calls.
2567 if (FuncIsMadeTailCallSafe(CallConv,
2568 MF.getTarget().Options.GuaranteedTailCallOpt))
2569 StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
2571 // If the function takes variable number of arguments, make a frame index for
2572 // the start of the first vararg value... for expansion of llvm.va_start. We
2573 // can skip this if there are no va_start calls.
2574 if (MFI->hasVAStart() &&
2575 (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
2576 CallConv != CallingConv::X86_ThisCall))) {
2577 FuncInfo->setVarArgsFrameIndex(
2578 MFI->CreateFixedObject(1, StackSize, true));
2579 }
2581 // Figure out if XMM registers are in use.
2582 assert(!(MF.getTarget().Options.UseSoftFloat &&
2583 Fn->hasFnAttribute(Attribute::NoImplicitFloat)) &&
2584 "SSE register cannot be used when SSE is disabled!");
2586 // 64-bit calling conventions support varargs and register parameters, so we
2587 // have to do extra work to spill them in the prologue.
2588 if (Is64Bit && isVarArg && MFI->hasVAStart()) {
2589 // Find the first unallocated argument registers.
2590 ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
2591 ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
2592 unsigned NumIntRegs =
2593 CCInfo.getFirstUnallocated(ArgGPRs.data(), ArgGPRs.size());
2594 unsigned NumXMMRegs =
2595 CCInfo.getFirstUnallocated(ArgXMMs.data(), ArgXMMs.size());
2596 assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
2597 "SSE register cannot be used when SSE is disabled!");
2599 // Gather all the live in physical registers.
2600 SmallVector<SDValue, 6> LiveGPRs;
2601 SmallVector<SDValue, 8> LiveXMMRegs;
2602 SDValue ALVal;
2603 for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
2604 unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
2605 LiveGPRs.push_back(
2606 DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
2607 }
2608 if (!ArgXMMs.empty()) {
2609 unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
2610 ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
2611 for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
2612 unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
2613 LiveXMMRegs.push_back(
2614 DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
2615 }
2616 }
2618 if (IsWin64) {
2619 const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
2620 // Get to the caller-allocated home save location. Add 8 to account
2621 // for the return address.
2622 int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
2623 FuncInfo->setRegSaveFrameIndex(
2624 MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
2625 // Fixup to set vararg frame on shadow area (4 x i64).
2626 if (NumIntRegs < 4)
2627 FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
2628 } else {
2629 // For X86-64, if there are vararg parameters that are passed via
2630 // registers, then we must store them to their spots on the stack so
2631 // they may be loaded by deferencing the result of va_next.
2632 FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
2633 FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
2634 FuncInfo->setRegSaveFrameIndex(MFI->CreateStackObject(
2635 ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
2636 }
2638 // Store the integer parameter registers.
2639 SmallVector<SDValue, 8> MemOps;
2640 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
2641 getPointerTy());
2642 unsigned Offset = FuncInfo->getVarArgsGPOffset();
2643 for (SDValue Val : LiveGPRs) {
2644 SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
2645 DAG.getIntPtrConstant(Offset));
2646 SDValue Store =
2647 DAG.getStore(Val.getValue(1), dl, Val, FIN,
2648 MachinePointerInfo::getFixedStack(
2649 FuncInfo->getRegSaveFrameIndex(), Offset),
2650 false, false, 0);
2651 MemOps.push_back(Store);
2652 Offset += 8;
2653 }
2655 if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
2656 // Now store the XMM (fp + vector) parameter registers.
2657 SmallVector<SDValue, 12> SaveXMMOps;
2658 SaveXMMOps.push_back(Chain);
2659 SaveXMMOps.push_back(ALVal);
2660 SaveXMMOps.push_back(DAG.getIntPtrConstant(
2661 FuncInfo->getRegSaveFrameIndex()));
2662 SaveXMMOps.push_back(DAG.getIntPtrConstant(
2663 FuncInfo->getVarArgsFPOffset()));
2664 SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
2665 LiveXMMRegs.end());
2666 MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
2667 MVT::Other, SaveXMMOps));
2668 }
2670 if (!MemOps.empty())
2671 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
2672 }
2674 if (isVarArg && MFI->hasMustTailInVarArgFunc()) {
2675 // Find the largest legal vector type.
2676 MVT VecVT = MVT::Other;
2677 // FIXME: Only some x86_32 calling conventions support AVX512.
2678 if (Subtarget->hasAVX512() &&
2679 (Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
2680 CallConv == CallingConv::Intel_OCL_BI)))
2681 VecVT = MVT::v16f32;
2682 else if (Subtarget->hasAVX())
2683 VecVT = MVT::v8f32;
2684 else if (Subtarget->hasSSE2())
2685 VecVT = MVT::v4f32;
2687 // We forward some GPRs and some vector types.
2688 SmallVector<MVT, 2> RegParmTypes;
2689 MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
2690 RegParmTypes.push_back(IntVT);
2691 if (VecVT != MVT::Other)
2692 RegParmTypes.push_back(VecVT);
2694 // Compute the set of forwarded registers. The rest are scratch.
2695 SmallVectorImpl<ForwardedRegister> &Forwards =
2696 FuncInfo->getForwardedMustTailRegParms();
2697 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
2699 // Conservatively forward AL on x86_64, since it might be used for varargs.
2700 if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
2701 unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
2702 Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
2703 }
2705 // Copy all forwards from physical to virtual registers.
2706 for (ForwardedRegister &F : Forwards) {
2707 // FIXME: Can we use a less constrained schedule?
2708 SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
2709 F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT));
2710 Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal);
2711 }
2712 }
2714 // Some CCs need callee pop.
2715 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
2716 MF.getTarget().Options.GuaranteedTailCallOpt)) {
2717 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
2718 } else {
2719 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
2720 // If this is an sret function, the return should pop the hidden pointer.
2721 if (!Is64Bit && !IsTailCallConvention(CallConv) &&
2722 !Subtarget->getTargetTriple().isOSMSVCRT() &&
2723 argsAreStructReturn(Ins) == StackStructReturn)
2724 FuncInfo->setBytesToPopOnReturn(4);
2725 }
2727 if (!Is64Bit) {
2728 // RegSaveFrameIndex is X86-64 only.
2729 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
2730 if (CallConv == CallingConv::X86_FastCall ||
2731 CallConv == CallingConv::X86_ThisCall)
2732 // fastcc functions can't have varargs.
2733 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
2734 }
2736 FuncInfo->setArgumentStackSize(StackSize);
2738 return Chain;
2739 }
2741 SDValue
2742 X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
2743 SDValue StackPtr, SDValue Arg,
2744 SDLoc dl, SelectionDAG &DAG,
2745 const CCValAssign &VA,
2746 ISD::ArgFlagsTy Flags) const {
2747 unsigned LocMemOffset = VA.getLocMemOffset();
2748 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
2749 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
2750 if (Flags.isByVal())
2751 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
2753 return DAG.getStore(Chain, dl, Arg, PtrOff,
2754 MachinePointerInfo::getStack(LocMemOffset),
2755 false, false, 0);
2756 }
2758 /// Emit a load of return address if tail call
2759 /// optimization is performed and it is required.
2760 SDValue
2761 X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
2762 SDValue &OutRetAddr, SDValue Chain,
2763 bool IsTailCall, bool Is64Bit,
2764 int FPDiff, SDLoc dl) const {
2765 // Adjust the Return address stack slot.
2766 EVT VT = getPointerTy();
2767 OutRetAddr = getReturnAddressFrameIndex(DAG);
2769 // Load the "old" Return address.
2770 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(),
2771 false, false, false, 0);
2772 return SDValue(OutRetAddr.getNode(), 1);
2773 }
2775 /// Emit a store of the return address if tail call
2776 /// optimization is performed and it is required (FPDiff!=0).
2777 static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
2778 SDValue Chain, SDValue RetAddrFrIdx,
2779 EVT PtrVT, unsigned SlotSize,
2780 int FPDiff, SDLoc dl) {
2781 // Store the return address to the appropriate stack slot.
2782 if (!FPDiff) return Chain;
2783 // Calculate the new stack slot for the return address.
2784 int NewReturnAddrFI =
2785 MF.getFrameInfo()->CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
2786 false);
2787 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
2788 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
2789 MachinePointerInfo::getFixedStack(NewReturnAddrFI),
2790 false, false, 0);
2791 return Chain;
2792 }
2794 SDValue
2795 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
2796 SmallVectorImpl<SDValue> &InVals) const {
2797 SelectionDAG &DAG = CLI.DAG;
2798 SDLoc &dl = CLI.DL;
2799 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
2800 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
2801 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
2802 SDValue Chain = CLI.Chain;
2803 SDValue Callee = CLI.Callee;
2804 CallingConv::ID CallConv = CLI.CallConv;
2805 bool &isTailCall = CLI.IsTailCall;
2806 bool isVarArg = CLI.IsVarArg;
2808 MachineFunction &MF = DAG.getMachineFunction();
2809 bool Is64Bit = Subtarget->is64Bit();
2810 bool IsWin64 = Subtarget->isCallingConvWin64(CallConv);
2811 StructReturnType SR = callIsStructReturn(Outs);
2812 bool IsSibcall = false;
2813 X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
2815 if (MF.getTarget().Options.DisableTailCalls)
2816 isTailCall = false;
2818 bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall();
2819 if (IsMustTail) {
2820 // Force this to be a tail call. The verifier rules are enough to ensure
2821 // that we can lower this successfully without moving the return address
2822 // around.
2823 isTailCall = true;
2824 } else if (isTailCall) {
2825 // Check if it's really possible to do a tail call.
2826 isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
2827 isVarArg, SR != NotStructReturn,
2828 MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
2829 Outs, OutVals, Ins, DAG);
2831 // Sibcalls are automatically detected tailcalls which do not require ABI changes.
2833 if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
2840 assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
2841 "Var args not supported with calling convention fastcc, ghc or hipe");
2843 // Analyze operands of the call, assigning locations to each operand.
2844 SmallVector<CCValAssign, 16> ArgLocs;
2845 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
2847 // Allocate shadow area for Win64
2849 CCInfo.AllocateStack(32, 8);
2851 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
2853 // Get a count of how many bytes are to be pushed on the stack.
2854 unsigned NumBytes = CCInfo.getNextStackOffset();
2856 // This is a sibcall. The memory operands are already available in the
2857 // caller's incoming argument stack area.
2859 else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
2860 IsTailCallConvention(CallConv))
2861 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
2864 if (isTailCall && !IsSibcall && !IsMustTail) {
2865 // Lower arguments at fp - stackoffset + fpdiff.
2866 unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
2868 FPDiff = NumBytesCallerPushed - NumBytes;
2870 // Set the delta of movement of the return address stack slot, but only
2871 // if the new delta is smaller (i.e. the required movement is larger) than the one recorded so far.
2872 if (FPDiff < X86Info->getTCReturnAddrDelta())
2873 X86Info->setTCReturnAddrDelta(FPDiff);
2876 unsigned NumBytesToPush = NumBytes;
2877 unsigned NumBytesToPop = NumBytes;
2879 // If we have an inalloca argument, all stack space has already been allocated
2880 // for us and is right at the top of the stack. We don't support multiple
2881 // arguments passed in memory when using inalloca.
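// Illustrative sketch (hypothetical IR, not taken from this file): a frontend
// lowering a non-trivially-copyable by-value argument may emit
//   %mem = alloca inalloca <{ %struct.S }>
//   call void @f(<{ %struct.S }>* inalloca %mem)
// so the alloca has already performed the outgoing-argument stack adjustment
// by the time we get here.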
2882 if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
2884 if (!ArgLocs.back().isMemLoc())
2885 report_fatal_error("cannot use inalloca attribute on a register "
2887 if (ArgLocs.back().getLocMemOffset() != 0)
2888 report_fatal_error("any parameter with the inalloca attribute must be "
2889 "the only memory argument");
2893 Chain = DAG.getCALLSEQ_START(
2894 Chain, DAG.getIntPtrConstant(NumBytesToPush, true), dl);
2896 SDValue RetAddrFrIdx;
2897 // Load return address for tail calls.
2898 if (isTailCall && FPDiff)
2899 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
2900 Is64Bit, FPDiff, dl);
2902 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
2903 SmallVector<SDValue, 8> MemOpChains;
2906 // Walk the register/memloc assignments, inserting copies/loads. In the case
2907 // of tail call optimization, arguments are handled later.
2908 const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
2909 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2910 // Skip inalloca arguments, they have already been written.
2911 ISD::ArgFlagsTy Flags = Outs[i].Flags;
2912 if (Flags.isInAlloca())
2915 CCValAssign &VA = ArgLocs[i];
2916 EVT RegVT = VA.getLocVT();
2917 SDValue Arg = OutVals[i];
2918 bool isByVal = Flags.isByVal();
2920 // Promote the value if needed.
2921 switch (VA.getLocInfo()) {
2922 default: llvm_unreachable("Unknown loc info!");
2923 case CCValAssign::Full: break;
2924 case CCValAssign::SExt:
2925 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
2927 case CCValAssign::ZExt:
2928 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
2930 case CCValAssign::AExt:
2931 if (RegVT.is128BitVector()) {
2932 // Special case: passing MMX values in XMM registers.
2933 Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
2934 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
2935 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
2937 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
2939 case CCValAssign::BCvt:
2940 Arg = DAG.getNode(ISD::BITCAST, dl, RegVT, Arg);
2942 case CCValAssign::Indirect: {
2943 // Store the argument.
2944 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
2945 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
2946 Chain = DAG.getStore(Chain, dl, Arg, SpillSlot,
2947 MachinePointerInfo::getFixedStack(FI),
2954 if (VA.isRegLoc()) {
2955 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
2956 if (isVarArg && IsWin64) {
2957 // The Win64 ABI requires an argument XMM register to be copied to the
2958 // corresponding shadow register if the callee is a varargs function.
2959 unsigned ShadowReg = 0;
2960 switch (VA.getLocReg()) {
2961 case X86::XMM0: ShadowReg = X86::RCX; break;
2962 case X86::XMM1: ShadowReg = X86::RDX; break;
2963 case X86::XMM2: ShadowReg = X86::R8; break;
2964 case X86::XMM3: ShadowReg = X86::R9; break;
2967 RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
2969 } else if (!IsSibcall && (!isTailCall || isByVal)) {
2970 assert(VA.isMemLoc());
2971 if (!StackPtr.getNode())
2972 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
2974 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
2975 dl, DAG, VA, Flags));
2979 if (!MemOpChains.empty())
2980 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
2982 if (Subtarget->isPICStyleGOT()) {
2983 // ELF / PIC requires the GOT to be in the EBX register before function calls via the PLT.
2986 RegsToPass.push_back(std::make_pair(unsigned(X86::EBX),
2987 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy())));
2989 // If we are tail calling and generating PIC/GOT style code load the
2990 // address of the callee into ECX. The value in ecx is used as target of
2991 // the tail jump. This is done to circumvent the ebx/callee-saved problem
2992 // for tail calls on PIC/GOT architectures. Normally we would just put the
2993 // address of GOT into ebx and then call target@PLT. But for tail calls
2994 // ebx would be restored (since ebx is callee saved) before jumping to the callee.
2997 // Note: The actual moving to ECX is done further down.
2998 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
2999 if (G && !G->getGlobal()->hasHiddenVisibility() &&
3000 !G->getGlobal()->hasProtectedVisibility())
3001 Callee = LowerGlobalAddress(Callee, DAG);
3002 else if (isa<ExternalSymbolSDNode>(Callee))
3003 Callee = LowerExternalSymbol(Callee, DAG);
3007 if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
3008 // From AMD64 ABI document:
3009 // For calls that may call functions that use varargs or stdargs
3010 // (prototype-less calls or calls to functions containing ellipsis (...) in
3011 // the declaration) %al is used as hidden argument to specify the number
3012 // of SSE registers used. The contents of %al do not need to match exactly
3013 // the number of registers, but must be an upper bound on the number of SSE
3014 // registers used and is in the range 0 - 8 inclusive.
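// Worked example (illustrative): for a variadic call passing two doubles in
// XMM0 and XMM1, any %al value in [2, 8] would satisfy this rule; the code
// below passes the exact number of XMM registers CCInfo allocated.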
3016 // Count the number of XMM registers allocated.
3017 static const MCPhysReg XMMArgRegs[] = {
3018 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3019 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3021 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
3022 assert((Subtarget->hasSSE1() || !NumXMMRegs)
3023 && "SSE registers cannot be used when SSE is disabled");
3025 RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
3026 DAG.getConstant(NumXMMRegs, MVT::i8)));
3029 if (isVarArg && IsMustTail) {
3030 const auto &Forwards = X86Info->getForwardedMustTailRegParms();
3031 for (const auto &F : Forwards) {
3032 SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
3033 RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
3037 // For tail calls lower the arguments to the 'real' stack slots. Sibcalls
3038 // don't need this because the eligibility check rejects calls that require
3039 // shuffling arguments passed in memory.
3040 if (!IsSibcall && isTailCall) {
3041 // Force all the incoming stack arguments to be loaded from the stack
3042 // before any new outgoing arguments are stored to the stack, because the
3043 // outgoing stack slots may alias the incoming argument stack slots, and
3044 // the alias isn't otherwise explicit. This is slightly more conservative
3045 // than necessary, because it means that each store effectively depends
3046 // on every argument instead of just those arguments it would clobber.
3047 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
3049 SmallVector<SDValue, 8> MemOpChains2;
3052 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3053 CCValAssign &VA = ArgLocs[i];
3056 assert(VA.isMemLoc());
3057 SDValue Arg = OutVals[i];
3058 ISD::ArgFlagsTy Flags = Outs[i].Flags;
3059 // Skip inalloca arguments. They don't require any work.
3060 if (Flags.isInAlloca())
3062 // Create frame index.
3063 int32_t Offset = VA.getLocMemOffset()+FPDiff;
3064 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
3065 FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
3066 FIN = DAG.getFrameIndex(FI, getPointerTy());
3068 if (Flags.isByVal()) {
3069 // Copy relative to framepointer.
3070 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
3071 if (!StackPtr.getNode())
3072 StackPtr = DAG.getCopyFromReg(Chain, dl,
3073 RegInfo->getStackRegister(),
3075 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
3077 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
3081 // Store relative to framepointer.
3082 MemOpChains2.push_back(
3083 DAG.getStore(ArgChain, dl, Arg, FIN,
3084 MachinePointerInfo::getFixedStack(FI),
3089 if (!MemOpChains2.empty())
3090 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
3092 // Store the return address to the appropriate stack slot.
3093 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
3094 getPointerTy(), RegInfo->getSlotSize(),
3098 // Build a sequence of copy-to-reg nodes chained together with token chain
3099 // and flag operands which copy the outgoing args into registers.
3101 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
3102 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
3103 RegsToPass[i].second, InFlag);
3104 InFlag = Chain.getValue(1);
3107 if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
3108 assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
3109 // In the 64-bit large code model, we have to make all calls
3110 // through a register, since the call instruction's 32-bit
3111 // pc-relative offset may not be large enough to hold the whole address.
3113 } else if (Callee->getOpcode() == ISD::GlobalAddress) {
3114 // If the callee is a GlobalAddress node (quite common, every direct call
3115 // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
3117 GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee);
3119 // We should use extra load for direct calls to dllimported functions in
3121 const GlobalValue *GV = G->getGlobal();
3122 if (!GV->hasDLLImportStorageClass()) {
3123 unsigned char OpFlags = 0;
3124 bool ExtraLoad = false;
3125 unsigned WrapperKind = ISD::DELETED_NODE;
3127 // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
3128 // external symbols must go through the PLT in PIC mode. If the symbol
3129 // has hidden or protected visibility, or if it is static or local, then
3130 // we don't need to use the PLT - we can directly call it.
3131 if (Subtarget->isTargetELF() &&
3132 DAG.getTarget().getRelocationModel() == Reloc::PIC_ &&
3133 GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
3134 OpFlags = X86II::MO_PLT;
3135 } else if (Subtarget->isPICStyleStubAny() &&
3136 (GV->isDeclaration() || GV->isWeakForLinker()) &&
3137 (!Subtarget->getTargetTriple().isMacOSX() ||
3138 Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
3139 // PC-relative references to external symbols should go through $stub,
3140 // unless we're building with the leopard linker or later, which
3141 // automatically synthesizes these stubs.
3142 OpFlags = X86II::MO_DARWIN_STUB;
3143 } else if (Subtarget->isPICStyleRIPRel() && isa<Function>(GV) &&
3144 cast<Function>(GV)->hasFnAttribute(Attribute::NonLazyBind)) {
3145 // If the function is marked as non-lazy, generate an indirect call
3146 // which loads from the GOT directly. This avoids runtime overhead
3147 // at the cost of eager binding (and one extra byte of encoding).
3148 OpFlags = X86II::MO_GOTPCREL;
3149 WrapperKind = X86ISD::WrapperRIP;
3153 Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(),
3154 G->getOffset(), OpFlags);
3156 // Add a wrapper if needed.
3157 if (WrapperKind != ISD::DELETED_NODE)
3158 Callee = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Callee);
3159 // Add extra indirection if needed.
3161 Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee,
3162 MachinePointerInfo::getGOT(),
3163 false, false, false, 0);
3165 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
3166 unsigned char OpFlags = 0;
3168 // On ELF targets, in either X86-64 or X86-32 mode, direct calls to
3169 // external symbols should go through the PLT.
3170 if (Subtarget->isTargetELF() &&
3171 DAG.getTarget().getRelocationModel() == Reloc::PIC_) {
3172 OpFlags = X86II::MO_PLT;
3173 } else if (Subtarget->isPICStyleStubAny() &&
3174 (!Subtarget->getTargetTriple().isMacOSX() ||
3175 Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
3176 // PC-relative references to external symbols should go through $stub,
3177 // unless we're building with the leopard linker or later, which
3178 // automatically synthesizes these stubs.
3179 OpFlags = X86II::MO_DARWIN_STUB;
3182 Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
3184 } else if (Subtarget->isTarget64BitILP32() &&
3185 Callee->getValueType(0) == MVT::i32) {
3186 // Zero-extend the 32-bit Callee address into a 64-bit one, per the x32 ABI.
3187 Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
3190 // Returns a chain & a flag for retval copy to use.
3191 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3192 SmallVector<SDValue, 8> Ops;
3194 if (!IsSibcall && isTailCall) {
3195 Chain = DAG.getCALLSEQ_END(Chain,
3196 DAG.getIntPtrConstant(NumBytesToPop, true),
3197 DAG.getIntPtrConstant(0, true), InFlag, dl);
3198 InFlag = Chain.getValue(1);
3201 Ops.push_back(Chain);
3202 Ops.push_back(Callee);
3205 Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));
3207 // Add argument registers to the end of the list so that they are known live
3209 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
3210 Ops.push_back(DAG.getRegister(RegsToPass[i].first,
3211 RegsToPass[i].second.getValueType()));
3213 // Add a register mask operand representing the call-preserved registers.
3214 const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
3215 const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
3216 assert(Mask && "Missing call preserved mask for calling convention");
3217 Ops.push_back(DAG.getRegisterMask(Mask));
3219 if (InFlag.getNode())
3220 Ops.push_back(InFlag);
3224 //// If this is the first return lowered for this function, add the regs
3225 //// to the liveout set for the function.
3226 // This isn't right, although it's probably harmless on x86; liveouts
3227 // should be computed from returns not tail calls. Consider a void
3228 // function making a tail call to a function returning int.
3229 return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
3232 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
3233 InFlag = Chain.getValue(1);
3235 // Create the CALLSEQ_END node.
3236 unsigned NumBytesForCalleeToPop;
3237 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
3238 DAG.getTarget().Options.GuaranteedTailCallOpt))
3239 NumBytesForCalleeToPop = NumBytes; // Callee pops everything
3240 else if (!Is64Bit && !IsTailCallConvention(CallConv) &&
3241 !Subtarget->getTargetTriple().isOSMSVCRT() &&
3242 SR == StackStructReturn)
3243 // If this is a call to a struct-return function, the callee
3244 // pops the hidden struct pointer, so we have to push it back.
3245 // This is common for Darwin/X86, Linux & Mingw32 targets.
3246 // For MSVC Win32 targets, the caller pops the hidden struct pointer.
3247 NumBytesForCalleeToPop = 4;
3249 NumBytesForCalleeToPop = 0; // Callee pops nothing.
3251 // Returns a flag for retval copy to use.
3253 Chain = DAG.getCALLSEQ_END(Chain,
3254 DAG.getIntPtrConstant(NumBytesToPop, true),
3255 DAG.getIntPtrConstant(NumBytesForCalleeToPop,
3258 InFlag = Chain.getValue(1);
3261 // Handle result values, copying them out of physregs into vregs that we
3263 return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
3264 Ins, dl, DAG, InVals);
3267 //===----------------------------------------------------------------------===//
3268 // Fast Calling Convention (tail call) implementation
3269 //===----------------------------------------------------------------------===//
3271 // Like stdcall, the callee cleans up the arguments, except that ECX is
3272 // reserved for storing the address of the tail-called function. Only 2
3273 // registers are free for argument passing (inreg). Tail call optimization is performed provided:
3275 // * tailcallopt is enabled
3276 // * caller/callee are fastcc
3277 // On X86_64 architecture with GOT-style position independent code only local
3278 // (within module) calls are supported at the moment.
3279 // To keep the stack aligned according to the platform ABI, the function
3280 // GetAlignedArgumentStackSize ensures that the argument delta is always a
3281 // multiple of the stack alignment. (Dynamic linkers need this - darwin's dyld for example.)
3282 // If a tail-called function has more arguments than the caller, the
3283 // caller needs to make sure that there is room to move the RETADDR to. This is
3284 // achieved by reserving an area the size of the argument delta right after the
3285 // original RETADDR, but before the saved framepointer or the spilled registers
3286 // e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
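// Worked example (illustrative numbers): if the caller pops 8 bytes of its
// own arguments on return but the callee needs 24 bytes of argument stack,
// FPDiff = 8 - 24 = -16, so the return address moves 16 bytes further down
// and a fixed stack object is created for it at offset FPDiff - SlotSize.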
3298 /// GetAlignedArgumentStackSize - Round the stack size up so that it has the
3299 /// form 16n + 12 for a 16-byte alignment requirement, leaving room for the return address slot.
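/// For example, with StackAlignment = 16 and SlotSize = 4 (illustrative
/// values), StackSize = 20 is rounded up to 28 = 16*1 + 12, so that pushing
/// the 4-byte return address restores 16-byte alignment; StackSize = 30
/// becomes 44 = 16*2 + 12.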
3301 X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
3302 SelectionDAG& DAG) const {
3303 const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
3304 const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
3305 unsigned StackAlignment = TFI.getStackAlignment();
3306 uint64_t AlignMask = StackAlignment - 1;
3307 int64_t Offset = StackSize;
3308 unsigned SlotSize = RegInfo->getSlotSize();
3309 if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
3310 // The misaligned amount is no larger than StackAlignment - SlotSize (12 here), so just add the difference.
3311 Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
3313 // Mask out the lower bits, then add the stack alignment once plus StackAlignment - SlotSize (the 12 bytes).
3314 Offset = ((~AlignMask) & Offset) + StackAlignment +
3315 (StackAlignment-SlotSize);
3320 /// MatchingStackOffset - Return true if the given stack call argument is
3321 /// already available at the same (relative) position in the caller's
3322 /// incoming argument stack.
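/// For example (illustrative), if the caller received an i32 at fixed stack
/// offset 8 and the tail call passes the same value at outgoing offset 8
/// with matching size, the argument can simply be left in place.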
3324 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
3325 MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
3326 const X86InstrInfo *TII) {
3327 unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
3329 if (Arg.getOpcode() == ISD::CopyFromReg) {
3330 unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
3331 if (!TargetRegisterInfo::isVirtualRegister(VR))
3333 MachineInstr *Def = MRI->getVRegDef(VR);
3336 if (!Flags.isByVal()) {
3337 if (!TII->isLoadFromStackSlot(Def, FI))
3340 unsigned Opcode = Def->getOpcode();
3341 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
3342 Opcode == X86::LEA64_32r) &&
3343 Def->getOperand(1).isFI()) {
3344 FI = Def->getOperand(1).getIndex();
3345 Bytes = Flags.getByValSize();
3349 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
3350 if (Flags.isByVal())
3351 // ByVal argument is passed in as a pointer but it's now being
3352 // dereferenced. e.g.
3353 // define @foo(%struct.X* %A) {
3354 // tail call @bar(%struct.X* byval %A)
3357 SDValue Ptr = Ld->getBasePtr();
3358 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
3361 FI = FINode->getIndex();
3362 } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
3363 FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
3364 FI = FINode->getIndex();
3365 Bytes = Flags.getByValSize();
3369 assert(FI != INT_MAX);
3370 if (!MFI->isFixedObjectIndex(FI))
3372 return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI);
3375 /// IsEligibleForTailCallOptimization - Check whether the call is eligible
3376 /// for tail call optimization. Targets which want to do tail call
3377 /// optimization should implement this function.
3379 X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
3380 CallingConv::ID CalleeCC,
3382 bool isCalleeStructRet,
3383 bool isCallerStructRet,
3385 const SmallVectorImpl<ISD::OutputArg> &Outs,
3386 const SmallVectorImpl<SDValue> &OutVals,
3387 const SmallVectorImpl<ISD::InputArg> &Ins,
3388 SelectionDAG &DAG) const {
3389 if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
3392 // If -tailcallopt is specified, make fastcc functions tail-callable.
3393 const MachineFunction &MF = DAG.getMachineFunction();
3394 const Function *CallerF = MF.getFunction();
3396 // If the function return type is x86_fp80 and the callee return type is not,
3397 // then the FP_EXTEND of the call result is not a nop. It's not safe to
3398 // perform a tailcall optimization here.
3399 if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
3402 CallingConv::ID CallerCC = CallerF->getCallingConv();
3403 bool CCMatch = CallerCC == CalleeCC;
3404 bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC);
3405 bool IsCallerWin64 = Subtarget->isCallingConvWin64(CallerCC);
3407 if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
3408 if (IsTailCallConvention(CalleeCC) && CCMatch)
3413 // Look for obvious safe cases to perform tail call optimization that do not
3414 // require ABI changes. This is what gcc calls sibcall.
3416 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
3417 // emit a special epilogue.
3418 const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
3419 if (RegInfo->needsStackRealignment(MF))
3422 // Also avoid sibcall optimization if either caller or callee uses struct
3423 // return semantics.
3424 if (isCalleeStructRet || isCallerStructRet)
3427 // A stdcall/thiscall caller is expected to clean up its arguments; the
3428 // callee isn't going to do that.
3429 // FIXME: this is more restrictive than needed. We could produce a tailcall
3430 // when the stack adjustment matches. For example, with a thiscall that takes
3431 // only one argument.
3432 if (!CCMatch && (CallerCC == CallingConv::X86_StdCall ||
3433 CallerCC == CallingConv::X86_ThisCall))
3436 // Do not sibcall optimize vararg calls unless all arguments are passed via registers.
3438 if (isVarArg && !Outs.empty()) {
3440 // Optimizing for varargs on Win64 is unlikely to be safe without
3441 // additional testing.
3442 if (IsCalleeWin64 || IsCallerWin64)
3445 SmallVector<CCValAssign, 16> ArgLocs;
3446 CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
3449 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
3450 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
3451 if (!ArgLocs[i].isRegLoc())
3455 // If the call result is in ST0 / ST1, it needs to be popped off the x87
3456 // stack. Therefore, if it's not used by the call it is not safe to optimize
3457 // this into a sibcall.
3458 bool Unused = false;
3459 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
3466 SmallVector<CCValAssign, 16> RVLocs;
3467 CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(), RVLocs,
3469 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
3470 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
3471 CCValAssign &VA = RVLocs[i];
3472 if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
3477 // If the calling conventions do not match, then we'd better make sure the
3478 // results are returned in the same way as what the caller expects.
3480 SmallVector<CCValAssign, 16> RVLocs1;
3481 CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), RVLocs1,
3483 CCInfo1.AnalyzeCallResult(Ins, RetCC_X86);
3485 SmallVector<CCValAssign, 16> RVLocs2;
3486 CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), RVLocs2,
3488 CCInfo2.AnalyzeCallResult(Ins, RetCC_X86);
3490 if (RVLocs1.size() != RVLocs2.size())
3492 for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
3493 if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
3495 if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
3497 if (RVLocs1[i].isRegLoc()) {
3498 if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
3501 if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
3507 // If the callee takes no arguments then go on to check the results of the call.
3509 if (!Outs.empty()) {
3510 // Check if stack adjustment is needed. For now, do not do this if any
3511 // argument is passed on the stack.
3512 SmallVector<CCValAssign, 16> ArgLocs;
3513 CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
3516 // Allocate shadow area for Win64
3518 CCInfo.AllocateStack(32, 8);
3520 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
3521 if (CCInfo.getNextStackOffset()) {
3522 MachineFunction &MF = DAG.getMachineFunction();
3523 if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn())
3526 // Check if the arguments are already laid out in the same way as
3527 // the caller's fixed stack objects.
3528 MachineFrameInfo *MFI = MF.getFrameInfo();
3529 const MachineRegisterInfo *MRI = &MF.getRegInfo();
3530 const X86InstrInfo *TII = Subtarget->getInstrInfo();
3531 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3532 CCValAssign &VA = ArgLocs[i];
3533 SDValue Arg = OutVals[i];
3534 ISD::ArgFlagsTy Flags = Outs[i].Flags;
3535 if (VA.getLocInfo() == CCValAssign::Indirect)
3537 if (!VA.isRegLoc()) {
3538 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
3545 // If the tailcall address may be in a register, then make sure it's
3546 // possible to register allocate for it. In 32-bit, the call address can
3547 // only target EAX, EDX, or ECX since the tail call must be scheduled after
3548 // callee-saved registers are restored. These happen to be the same
3549 // registers used to pass 'inreg' arguments so watch out for those.
3550 if (!Subtarget->is64Bit() &&
3551 ((!isa<GlobalAddressSDNode>(Callee) &&
3552 !isa<ExternalSymbolSDNode>(Callee)) ||
3553 DAG.getTarget().getRelocationModel() == Reloc::PIC_)) {
3554 unsigned NumInRegs = 0;
3555 // In PIC we need an extra register to formulate the address computation for the callee.
3557 unsigned MaxInRegs =
3558 (DAG.getTarget().getRelocationModel() == Reloc::PIC_) ? 2 : 3;
3560 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3561 CCValAssign &VA = ArgLocs[i];
3564 unsigned Reg = VA.getLocReg();
3567 case X86::EAX: case X86::EDX: case X86::ECX:
3568 if (++NumInRegs == MaxInRegs)
3580 X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
3581 const TargetLibraryInfo *libInfo) const {
3582 return X86::createFastISel(funcInfo, libInfo);
3585 //===----------------------------------------------------------------------===//
3586 // Other Lowering Hooks
3587 //===----------------------------------------------------------------------===//
3589 static bool MayFoldLoad(SDValue Op) {
3590 return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
3593 static bool MayFoldIntoStore(SDValue Op) {
3594 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
3597 static bool isTargetShuffle(unsigned Opcode) {
3599 default: return false;
3600 case X86ISD::BLENDI:
3601 case X86ISD::PSHUFB:
3602 case X86ISD::PSHUFD:
3603 case X86ISD::PSHUFHW:
3604 case X86ISD::PSHUFLW:
3606 case X86ISD::PALIGNR:
3607 case X86ISD::MOVLHPS:
3608 case X86ISD::MOVLHPD:
3609 case X86ISD::MOVHLPS:
3610 case X86ISD::MOVLPS:
3611 case X86ISD::MOVLPD:
3612 case X86ISD::MOVSHDUP:
3613 case X86ISD::MOVSLDUP:
3614 case X86ISD::MOVDDUP:
3617 case X86ISD::UNPCKL:
3618 case X86ISD::UNPCKH:
3619 case X86ISD::VPERMILPI:
3620 case X86ISD::VPERM2X128:
3621 case X86ISD::VPERMI:
3626 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
3627 SDValue V1, SelectionDAG &DAG) {
3629 default: llvm_unreachable("Unknown x86 shuffle node");
3630 case X86ISD::MOVSHDUP:
3631 case X86ISD::MOVSLDUP:
3632 case X86ISD::MOVDDUP:
3633 return DAG.getNode(Opc, dl, VT, V1);
3637 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
3638 SDValue V1, unsigned TargetMask,
3639 SelectionDAG &DAG) {
3641 default: llvm_unreachable("Unknown x86 shuffle node");
3642 case X86ISD::PSHUFD:
3643 case X86ISD::PSHUFHW:
3644 case X86ISD::PSHUFLW:
3645 case X86ISD::VPERMILPI:
3646 case X86ISD::VPERMI:
3647 return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8));
3651 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
3652 SDValue V1, SDValue V2, unsigned TargetMask,
3653 SelectionDAG &DAG) {
3655 default: llvm_unreachable("Unknown x86 shuffle node");
3656 case X86ISD::PALIGNR:
3657 case X86ISD::VALIGN:
3659 case X86ISD::VPERM2X128:
3660 return DAG.getNode(Opc, dl, VT, V1, V2,
3661 DAG.getConstant(TargetMask, MVT::i8));
3665 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
3666 SDValue V1, SDValue V2, SelectionDAG &DAG) {
3668 default: llvm_unreachable("Unknown x86 shuffle node");
3669 case X86ISD::MOVLHPS:
3670 case X86ISD::MOVLHPD:
3671 case X86ISD::MOVHLPS:
3672 case X86ISD::MOVLPS:
3673 case X86ISD::MOVLPD:
3676 case X86ISD::UNPCKL:
3677 case X86ISD::UNPCKH:
3678 return DAG.getNode(Opc, dl, VT, V1, V2);
3682 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
3683 MachineFunction &MF = DAG.getMachineFunction();
3684 const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
3685 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
3686 int ReturnAddrIndex = FuncInfo->getRAIndex();
3688 if (ReturnAddrIndex == 0) {
3689 // Set up a frame object for the return address.
3690 unsigned SlotSize = RegInfo->getSlotSize();
3691 ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize,
3694 FuncInfo->setRAIndex(ReturnAddrIndex);
3697 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
3700 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
3701 bool hasSymbolicDisplacement) {
3702 // Offset should fit into a 32-bit immediate field.
3703 if (!isInt<32>(Offset))
3706 // If we don't have a symbolic displacement - we don't have any extra restrictions.
3708 if (!hasSymbolicDisplacement)
3711 // FIXME: Some tweaks might be needed for medium code model.
3712 if (M != CodeModel::Small && M != CodeModel::Kernel)
3715 // For the small code model we assume that the last object ends at least 16MB
3716 // before the 31-bit boundary. We may also accept pretty large negative
3717 // constants, knowing that all objects are in the positive half of the address space.
3718 if (M == CodeModel::Small && Offset < 16*1024*1024)
3721 // For the kernel code model we know that all objects reside in the negative
3722 // half of the 32-bit address space. We must not accept negative offsets, since
3723 // they may be just off, but we may accept pretty large positive ones.
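// Worked example (illustrative offsets): with a symbolic displacement in the
// small code model, Offset = -0x10000000 is accepted while Offset = 16MB is
// not; in the kernel code model the opposite holds, e.g. Offset = 0x100000
// is accepted while any negative offset falls through and is rejected.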
3724 if (M == CodeModel::Kernel && Offset >= 0)
3730 /// isCalleePop - Determines whether the callee is required to pop its
3731 /// own arguments. Callee pop is necessary to support tail calls.
3732 bool X86::isCalleePop(CallingConv::ID CallingConv,
3733 bool is64Bit, bool IsVarArg, bool TailCallOpt) {
3734 switch (CallingConv) {
3737 case CallingConv::X86_StdCall:
3738 case CallingConv::X86_FastCall:
3739 case CallingConv::X86_ThisCall:
3741 case CallingConv::Fast:
3742 case CallingConv::GHC:
3743 case CallingConv::HiPE:
3750 /// \brief Return true if the condition is an unsigned comparison operation.
3751 static bool isX86CCUnsigned(unsigned X86CC) {
3753 default: llvm_unreachable("Invalid integer condition!");
3754 case X86::COND_E: return true;
3755 case X86::COND_G: return false;
3756 case X86::COND_GE: return false;
3757 case X86::COND_L: return false;
3758 case X86::COND_LE: return false;
3759 case X86::COND_NE: return true;
3760 case X86::COND_B: return true;
3761 case X86::COND_A: return true;
3762 case X86::COND_BE: return true;
3763 case X86::COND_AE: return true;
3765 llvm_unreachable("covered switch fell through?!");
3768 /// TranslateX86CC - Do a one-to-one translation of an ISD::CondCode to the X86
3769 /// specific condition code, returning the condition code and the LHS/RHS of the
3770 /// comparison to make.
3771 static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
3772 SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
3774 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
3775 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
3776 // X > -1 -> X == 0, jump !sign.
3777 RHS = DAG.getConstant(0, RHS.getValueType());
3778 return X86::COND_NS;
3780 if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
3781 // X < 0 -> X == 0, jump on sign.
3784 if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
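3785 // X < 1   -> X <= 0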
3786 RHS = DAG.getConstant(0, RHS.getValueType());
3787 return X86::COND_LE;
3791 switch (SetCCOpcode) {
3792 default: llvm_unreachable("Invalid integer condition!");
3793 case ISD::SETEQ: return X86::COND_E;
3794 case ISD::SETGT: return X86::COND_G;
3795 case ISD::SETGE: return X86::COND_GE;
3796 case ISD::SETLT: return X86::COND_L;
3797 case ISD::SETLE: return X86::COND_LE;
3798 case ISD::SETNE: return X86::COND_NE;
3799 case ISD::SETULT: return X86::COND_B;
3800 case ISD::SETUGT: return X86::COND_A;
3801 case ISD::SETULE: return X86::COND_BE;
3802 case ISD::SETUGE: return X86::COND_AE;
3806 // First determine if it is required or is profitable to flip the operands.
3808 // If LHS is a foldable load, but RHS is not, flip the condition.
3809 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
3810 !ISD::isNON_EXTLoad(RHS.getNode())) {
3811 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
3812 std::swap(LHS, RHS);
3815 switch (SetCCOpcode) {
3821 std::swap(LHS, RHS);
3825 // On a floating point condition, the flags are set as follows:
3826 //  ZF  PF  CF   op
3827 //   0 | 0 | 0 | X > Y
3828 //   0 | 0 | 1 | X < Y
3829 //   1 | 0 | 0 | X == Y
3830 //   1 | 1 | 1 | unordered
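// For example, SETOLT(X, Y) first has its operands swapped above and is then
// matched as the flipped case of SETGT, yielding COND_A (CF == 0 and ZF == 0).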
3831 switch (SetCCOpcode) {
3832 default: llvm_unreachable("Condcode should be pre-legalized away");
3834 case ISD::SETEQ: return X86::COND_E;
3835 case ISD::SETOLT: // flipped
3837 case ISD::SETGT: return X86::COND_A;
3838 case ISD::SETOLE: // flipped
3840 case ISD::SETGE: return X86::COND_AE;
3841 case ISD::SETUGT: // flipped
3843 case ISD::SETLT: return X86::COND_B;
3844 case ISD::SETUGE: // flipped
3846 case ISD::SETLE: return X86::COND_BE;
3848 case ISD::SETNE: return X86::COND_NE;
3849 case ISD::SETUO: return X86::COND_P;
3850 case ISD::SETO: return X86::COND_NP;
3852 case ISD::SETUNE: return X86::COND_INVALID;
3856 /// hasFPCMov - Is there a floating point cmov for the specific X86 condition
3857 /// code? The current x86 ISA includes the following FP cmov instructions:
3858 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
3859 static bool hasFPCMov(unsigned X86CC) {
3875 /// isFPImmLegal - Returns true if the target can instruction select the
3876 /// specified FP immediate natively. If false, the legalizer will
3877 /// materialize the FP immediate as a load from a constant pool.
3878 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
3879 for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
3880 if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
3886 bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
3887 ISD::LoadExtType ExtTy,
3889 // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
3890 // relocations must target a movq or addq instruction: don't let the load shrink.
3891 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
3892 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
3893 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
3894 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
3898 /// \brief Returns true if it is beneficial to convert a load of a constant
3899 /// to just the constant itself.
3900 bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
3902 assert(Ty->isIntegerTy());
3904 unsigned BitSize = Ty->getPrimitiveSizeInBits();
3905 if (BitSize == 0 || BitSize > 64)
3910 bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT,
3911 unsigned Index) const {
3912 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
3915 return (Index == 0 || Index == ResVT.getVectorNumElements());
3918 bool X86TargetLowering::isCheapToSpeculateCttz() const {
3919 // Speculate cttz only if we can directly use TZCNT.
3920 return Subtarget->hasBMI();
3923 bool X86TargetLowering::isCheapToSpeculateCtlz() const {
3924 // Speculate ctlz only if we can directly use LZCNT.
3925 return Subtarget->hasLZCNT();
3928 /// isUndefOrInRange - Return true if Val is undef or if its value falls within
3929 /// the specified half-open range [Low, Hi).
3930 static bool isUndefOrInRange(int Val, int Low, int Hi) {
3931 return (Val < 0) || (Val >= Low && Val < Hi);
3934 /// isUndefOrEqual - Val is either less than zero (undef) or equal to the
3935 /// specified value.
3936 static bool isUndefOrEqual(int Val, int CmpVal) {
3937 return (Val < 0 || Val == CmpVal);
3940 /// isSequentialOrUndefInRange - Return true if every element in Mask, beginning
3941 /// at position Pos and ending at Pos+Size, falls within the specified
3942 /// sequential range [Low, Low+Size), or is undef.
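/// For example, Mask = <4, -1, 6, 7> with Pos = 0, Size = 4, Low = 4 returns
/// true (the -1 element is undef), while Mask = <4, 5, 7, 7> returns false.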
3943 static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
3944 unsigned Pos, unsigned Size, int Low) {
3945 for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
3946 if (!isUndefOrEqual(Mask[i], Low))
3951 /// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
3952 /// is suitable for input to PSHUFD. That is, it doesn't reference the other
3953 /// operand; by default it matches against the first operand.
3954 static bool isPSHUFDMask(ArrayRef<int> Mask, MVT VT,
3955 bool TestSecondOperand = false) {
3956 if (VT != MVT::v4f32 && VT != MVT::v4i32 &&
3957 VT != MVT::v2f64 && VT != MVT::v2i64)
3960 unsigned NumElems = VT.getVectorNumElements();
3961 unsigned Lo = TestSecondOperand ? NumElems : 0;
3962 unsigned Hi = Lo + NumElems;
3964 for (unsigned i = 0; i < NumElems; ++i)
3965 if (!isUndefOrInRange(Mask[i], (int)Lo, (int)Hi))
3971 /// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that
3972 /// is suitable for input to PSHUFHW.
3973 static bool isPSHUFHWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
3974 if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
3977 // Lower quadword copied in order or undef.
3978 if (!isSequentialOrUndefInRange(Mask, 0, 4, 0))
3981 // Upper quadword shuffled.
3982 for (unsigned i = 4; i != 8; ++i)
3983 if (!isUndefOrInRange(Mask[i], 4, 8))
3986 if (VT == MVT::v16i16) {
3987 // Lower quadword copied in order or undef.
3988 if (!isSequentialOrUndefInRange(Mask, 8, 4, 8))
3991 // Upper quadword shuffled.
3992 for (unsigned i = 12; i != 16; ++i)
3993 if (!isUndefOrInRange(Mask[i], 12, 16))
4000 /// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that
4001 /// is suitable for input to PSHUFLW.
4002 static bool isPSHUFLWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
4003 if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
4006 // Upper quadword copied in order.
4007 if (!isSequentialOrUndefInRange(Mask, 4, 4, 4))
4010 // Lower quadword shuffled.
4011 for (unsigned i = 0; i != 4; ++i)
4012 if (!isUndefOrInRange(Mask[i], 0, 4))
4015 if (VT == MVT::v16i16) {
4016 // Upper quadword copied in order.
4017 if (!isSequentialOrUndefInRange(Mask, 12, 4, 12))
4020 // Lower quadword shuffled.
4021 for (unsigned i = 8; i != 12; ++i)
4022 if (!isUndefOrInRange(Mask[i], 8, 12))
4029 /// \brief Return true if the mask specifies a shuffle of elements that is
4030 /// suitable for input to intralane (palignr) or interlane (valign) vector right-shift.
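/// For example, on v8i16 the intralane mask <1, 2, 3, 4, 5, 6, 7, 8> is
/// accepted: it takes elements 1..7 of the first source followed by element
/// 0 of the second, i.e. a right shift of the concatenated sources by one element.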
4032 static bool isAlignrMask(ArrayRef<int> Mask, MVT VT, bool InterLane) {
4033 unsigned NumElts = VT.getVectorNumElements();
4034 unsigned NumLanes = InterLane ? 1: VT.getSizeInBits()/128;
4035 unsigned NumLaneElts = NumElts/NumLanes;
4037 // Do not handle 64-bit element shuffles with palignr.
4038 if (NumLaneElts == 2)
4041 for (unsigned l = 0; l != NumElts; l+=NumLaneElts) {
4043 for (i = 0; i != NumLaneElts; ++i) {
4048 // Lane is all undef, go to next lane
4049 if (i == NumLaneElts)
4052 int Start = Mask[i+l];
4054 // Make sure it's in this lane in one of the sources
4055 if (!isUndefOrInRange(Start, l, l+NumLaneElts) &&
4056 !isUndefOrInRange(Start, l+NumElts, l+NumElts+NumLaneElts))
4059 // If not lane 0, then we must match lane 0
4060 if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Start, Mask[i]+l))
4063 // Correct second source to be contiguous with first source
4064 if (Start >= (int)NumElts)
4065 Start -= NumElts - NumLaneElts;
4067 // Make sure we're shifting in the right direction.
4068 if (Start <= (int)(i+l))
4073 // Check the rest of the elements to see if they are consecutive.
4074 for (++i; i != NumLaneElts; ++i) {
4075 int Idx = Mask[i+l];
4077 // Make sure it's in this lane
4078 if (!isUndefOrInRange(Idx, l, l+NumLaneElts) &&
4079 !isUndefOrInRange(Idx, l+NumElts, l+NumElts+NumLaneElts))
4082 // If not lane 0, then we must match lane 0
4083 if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Idx, Mask[i]+l))
4086 if (Idx >= (int)NumElts)
4087 Idx -= NumElts - NumLaneElts;
4089 if (!isUndefOrEqual(Idx, Start+i))
4098 /// \brief Return true if the node specifies a shuffle of elements that is
4099 /// suitable for input to PALIGNR.
4100 static bool isPALIGNRMask(ArrayRef<int> Mask, MVT VT,
4101 const X86Subtarget *Subtarget) {
4102 if ((VT.is128BitVector() && !Subtarget->hasSSSE3()) ||
4103 (VT.is256BitVector() && !Subtarget->hasInt256()) ||
4104 VT.is512BitVector())
4105 // FIXME: Add AVX512BW.
4108 return isAlignrMask(Mask, VT, false);
4111 /// \brief Return true if the node specifies a shuffle of elements that is
4112 /// suitable for input to VALIGN.
4113 static bool isVALIGNMask(ArrayRef<int> Mask, MVT VT,
4114 const X86Subtarget *Subtarget) {
4115 // FIXME: Add AVX512VL.
4116 if (!VT.is512BitVector() || !Subtarget->hasAVX512())
4118 return isAlignrMask(Mask, VT, true);
4121 /// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
4122 /// the two vector operands have swapped position.
4123 static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask,
4124 unsigned NumElems) {
4125 for (unsigned i = 0; i != NumElems; ++i) {
4129 else if (idx < (int)NumElems)
4130 Mask[i] = idx + NumElems;
4132 Mask[i] = idx - NumElems;
4136 /// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
4137 /// specifies a shuffle of elements that is suitable for input to 128/256-bit
4138 /// SHUFPS and SHUFPD. If Commuted is true, then it checks for sources to be
4139 /// in the reverse order of what x86 shuffles want.
4140 static bool isSHUFPMask(ArrayRef<int> Mask, MVT VT, bool Commuted = false) {
4142 unsigned NumElems = VT.getVectorNumElements();
4143 unsigned NumLanes = VT.getSizeInBits()/128;
4144 unsigned NumLaneElems = NumElems/NumLanes;
4146 if (NumLaneElems != 2 && NumLaneElems != 4)
4149 unsigned EltSize = VT.getVectorElementType().getSizeInBits();
4150 bool symmetricMaskRequired =
4151 (VT.getSizeInBits() >= 256) && (EltSize == 32);
4153 // VSHUFPSY divides the resulting vector into 4 chunks.
4154 // The sources are also split into 4 chunks, and each destination
4155 // chunk must come from a different source chunk.
4157 // SRC1 => X7 X6 X5 X4 X3 X2 X1 X0
4158 // SRC2 => Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0
4160 // DST => Y7..Y4, Y7..Y4, X7..X4, X7..X4,
4161 // Y3..Y0, Y3..Y0, X3..X0, X3..X0
4163 // VSHUFPDY divides the resulting vector into 4 chunks.
4164 // The sources are also split into 4 chunks, and each destination
4165 // chunk must come from a different source chunk.
4167 // SRC1 => X3 X2 X1 X0
4168 // SRC2 => Y3 Y2 Y1 Y0
4170 // DST => Y3..Y2, X3..X2, Y1..Y0, X1..X0
4172 SmallVector<int, 4> MaskVal(NumLaneElems, -1);
4173 unsigned HalfLaneElems = NumLaneElems/2;
4174 for (unsigned l = 0; l != NumElems; l += NumLaneElems) {
4175 for (unsigned i = 0; i != NumLaneElems; ++i) {
4176 int Idx = Mask[i+l];
4177 unsigned RngStart = l + ((Commuted == (i<HalfLaneElems)) ? NumElems : 0);
4178 if (!isUndefOrInRange(Idx, RngStart, RngStart+NumLaneElems))
4180 // For VSHUFPSY, the mask of the second half must be the same as the
4181 // first but with the appropriate offsets. This works in the same way as
4182 // VPERMILPS works with masks.
4183 if (!symmetricMaskRequired || Idx < 0)
4185 if (MaskVal[i] < 0) {
4186 MaskVal[i] = Idx - l;
4189 if ((signed)(Idx - l) != MaskVal[i])
4197 /// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
4198 /// specifies a shuffle of elements that is suitable for input to MOVHLPS.
4199 static bool isMOVHLPSMask(ArrayRef<int> Mask, MVT VT) {
4200 if (!VT.is128BitVector())
4203 unsigned NumElems = VT.getVectorNumElements();
4208 // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3
4209 return isUndefOrEqual(Mask[0], 6) &&
4210 isUndefOrEqual(Mask[1], 7) &&
4211 isUndefOrEqual(Mask[2], 2) &&
4212 isUndefOrEqual(Mask[3], 3);
4215 /// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
4216 /// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef, <2, 3, 2, 3>.
4218 static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, MVT VT) {
4219 if (!VT.is128BitVector())
4222 unsigned NumElems = VT.getVectorNumElements();
4227 return isUndefOrEqual(Mask[0], 2) &&
4228 isUndefOrEqual(Mask[1], 3) &&
4229 isUndefOrEqual(Mask[2], 2) &&
4230 isUndefOrEqual(Mask[3], 3);
4233 /// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
4234 /// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
4235 static bool isMOVLPMask(ArrayRef<int> Mask, MVT VT) {
4236 if (!VT.is128BitVector())
4239 unsigned NumElems = VT.getVectorNumElements();
4241 if (NumElems != 2 && NumElems != 4)
4244 for (unsigned i = 0, e = NumElems/2; i != e; ++i)
4245 if (!isUndefOrEqual(Mask[i], i + NumElems))
4248 for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
4249 if (!isUndefOrEqual(Mask[i], i))
4255 /// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand
4256 /// specifies a shuffle of elements that is suitable for input to MOVLHPS.
4257 static bool isMOVLHPSMask(ArrayRef<int> Mask, MVT VT) {
4258 if (!VT.is128BitVector())
4261 unsigned NumElems = VT.getVectorNumElements();
4263 if (NumElems != 2 && NumElems != 4)
4266 for (unsigned i = 0, e = NumElems/2; i != e; ++i)
4267 if (!isUndefOrEqual(Mask[i], i))
4270 for (unsigned i = 0, e = NumElems/2; i != e; ++i)
4271 if (!isUndefOrEqual(Mask[i + e], i + NumElems))
4277 /// isINSERTPSMask - Return true if the specified VECTOR_SHUFFLE operand
4278 /// specifies a shuffle of elements that is suitable for input to INSERTPS.
4279 /// i.e. all but one element come from the same vector.
4280 static bool isINSERTPSMask(ArrayRef<int> Mask, MVT VT) {
4281 // TODO: Deal with AVX's VINSERTPS
4282 if (!VT.is128BitVector() || (VT != MVT::v4f32 && VT != MVT::v4i32))
4285 unsigned CorrectPosV1 = 0;
4286 unsigned CorrectPosV2 = 0;
4287 for (int i = 0, e = (int)VT.getVectorNumElements(); i != e; ++i) {
4288 if (Mask[i] == -1) {
4296 else if (Mask[i] == i + 4)
4300 if (CorrectPosV1 == 3 || CorrectPosV2 == 3)
4301 // We have 3 elements (undefs count as elements from any vector) from one
4302 // vector, and one from another.
4309 // Some special combinations that can be optimized.
4312 SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp,
4313 SelectionDAG &DAG) {
4314 MVT VT = SVOp->getSimpleValueType(0);
4317 if (VT != MVT::v8i32 && VT != MVT::v8f32)
4320 ArrayRef<int> Mask = SVOp->getMask();
4322 // These are the special masks that may be optimized.
4323 static const int MaskToOptimizeEven[] = {0, 8, 2, 10, 4, 12, 6, 14};
4324 static const int MaskToOptimizeOdd[] = {1, 9, 3, 11, 5, 13, 7, 15};
4325 bool MatchEvenMask = true;
4326 bool MatchOddMask = true;
4327 for (int i=0; i<8; ++i) {
4328 if (!isUndefOrEqual(Mask[i], MaskToOptimizeEven[i]))
4329 MatchEvenMask = false;
4330 if (!isUndefOrEqual(Mask[i], MaskToOptimizeOdd[i]))
4331 MatchOddMask = false;
4334 if (!MatchEvenMask && !MatchOddMask)
4337 SDValue UndefNode = DAG.getNode(ISD::UNDEF, dl, VT);
4339 SDValue Op0 = SVOp->getOperand(0);
4340 SDValue Op1 = SVOp->getOperand(1);
4342 if (MatchEvenMask) {
4343 // Shift the second operand right by 32 bits.
4344 static const int ShiftRightMask[] = {-1, 0, -1, 2, -1, 4, -1, 6 };
4345 Op1 = DAG.getVectorShuffle(VT, dl, Op1, UndefNode, ShiftRightMask);
4347 // Shift the first operand left by 32 bits.
4348 static const int ShiftLeftMask[] = {1, -1, 3, -1, 5, -1, 7, -1 };
4349 Op0 = DAG.getVectorShuffle(VT, dl, Op0, UndefNode, ShiftLeftMask);
4351 static const int BlendMask[] = {0, 9, 2, 11, 4, 13, 6, 15};
4352 return DAG.getVectorShuffle(VT, dl, Op0, Op1, BlendMask);
4355 /// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
4356 /// specifies a shuffle of elements that is suitable for input to UNPCKL.
4357 static bool isUNPCKLMask(ArrayRef<int> Mask, MVT VT,
4358 bool HasInt256, bool V2IsSplat = false) {
4360 assert(VT.getSizeInBits() >= 128 &&
4361 "Unsupported vector type for unpckl");
4363 unsigned NumElts = VT.getVectorNumElements();
4364 if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
4365 (!HasInt256 || (NumElts != 16 && NumElts != 32)))
4368 assert((!VT.is512BitVector() || VT.getScalarType().getSizeInBits() >= 32) &&
4369 "Unsupported vector type for unpckh");
4371 // AVX defines UNPCK* to operate independently on 128-bit lanes.
4372 unsigned NumLanes = VT.getSizeInBits()/128;
4373 unsigned NumLaneElts = NumElts/NumLanes;
4375 for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
4376 for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
4377 int BitI = Mask[l+i];
4378 int BitI1 = Mask[l+i+1];
4379 if (!isUndefOrEqual(BitI, j))
4382 if (!isUndefOrEqual(BitI1, NumElts))
4385 if (!isUndefOrEqual(BitI1, j + NumElts))
4394 /// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
4395 /// specifies a shuffle of elements that is suitable for input to UNPCKH.
4396 static bool isUNPCKHMask(ArrayRef<int> Mask, MVT VT,
4397 bool HasInt256, bool V2IsSplat = false) {
4398 assert(VT.getSizeInBits() >= 128 &&
4399 "Unsupported vector type for unpckh");
4401 unsigned NumElts = VT.getVectorNumElements();
4402 if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
4403 (!HasInt256 || (NumElts != 16 && NumElts != 32)))
4406 assert((!VT.is512BitVector() || VT.getScalarType().getSizeInBits() >= 32) &&
4407 "Unsupported vector type for unpckh");
4409 // AVX defines UNPCK* to operate independently on 128-bit lanes.
4410 unsigned NumLanes = VT.getSizeInBits()/128;
4411 unsigned NumLaneElts = NumElts/NumLanes;
4413 for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
4414 for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
4415 int BitI = Mask[l+i];
4416 int BitI1 = Mask[l+i+1];
4417 if (!isUndefOrEqual(BitI, j))
4420 if (isUndefOrEqual(BitI1, NumElts))
4423 if (!isUndefOrEqual(BitI1, j+NumElts))
4431 /// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
4432 /// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef, <0, 0, 1, 1>.
4434 static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
4435 unsigned NumElts = VT.getVectorNumElements();
4436 bool Is256BitVec = VT.is256BitVector();
4438 if (VT.is512BitVector())
4440 assert((VT.is128BitVector() || VT.is256BitVector()) &&
4441 "Unsupported vector type for unpckh");
4443 if (Is256BitVec && NumElts != 4 && NumElts != 8 &&
4444 (!HasInt256 || (NumElts != 16 && NumElts != 32)))
4447 // For 256-bit i64/f64, use MOVDDUPY instead, so reject the matching pattern
4448 // FIXME: Need a better way to get rid of this, there's no latency difference
4449 // between UNPCKLPD and MOVDDUP, the latter should always be checked first and
4450 // the former later. We should also remove the "_undef" special mask.
4451 if (NumElts == 4 && Is256BitVec)
4454 // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
4455 // independently on 128-bit lanes.
4456 unsigned NumLanes = VT.getSizeInBits()/128;
4457 unsigned NumLaneElts = NumElts/NumLanes;
4459 for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
4460 for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
4461 int BitI = Mask[l+i];
4462 int BitI1 = Mask[l+i+1];
4464 if (!isUndefOrEqual(BitI, j))
4466 if (!isUndefOrEqual(BitI1, j))
4474 /// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
4475 /// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef, <2, 2, 3, 3>.
4477 static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
4478 unsigned NumElts = VT.getVectorNumElements();
4480 if (VT.is512BitVector())
4483 assert((VT.is128BitVector() || VT.is256BitVector()) &&
4484 "Unsupported vector type for unpckh");
4486 if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
4487 (!HasInt256 || (NumElts != 16 && NumElts != 32)))
4490 // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
4491 // independently on 128-bit lanes.
4492 unsigned NumLanes = VT.getSizeInBits()/128;
4493 unsigned NumLaneElts = NumElts/NumLanes;
4495 for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
4496 for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
4497 int BitI = Mask[l+i];
4498 int BitI1 = Mask[l+i+1];
4499 if (!isUndefOrEqual(BitI, j))
4501 if (!isUndefOrEqual(BitI1, j))
4508 // Match for INSERTI64x4/INSERTF64x4 instructions producing (src0[0], src1[0])
4509 // or (src1[0], src0[1]), i.e. manipulations of 256-bit sub-vectors.
4510 static bool isINSERT64x4Mask(ArrayRef<int> Mask, MVT VT, unsigned int *Imm) {
4511 if (!VT.is512BitVector())
4514 unsigned NumElts = VT.getVectorNumElements();
4515 unsigned HalfSize = NumElts/2;
4516 if (isSequentialOrUndefInRange(Mask, 0, HalfSize, 0)) {
4517 if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, NumElts)) {
4522 if (isSequentialOrUndefInRange(Mask, 0, HalfSize, NumElts)) {
4523 if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, HalfSize)) {
4531 /// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
4532 /// specifies a shuffle of elements that is suitable for input to MOVSS,
4533 /// MOVSD, and MOVD, i.e. setting the lowest element.
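/// For example, on v4f32 the mask <4, 1, 2, 3> matches MOVSS: the lowest
/// element comes from the second operand and the remaining elements are
/// preserved from the first.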
4534 static bool isMOVLMask(ArrayRef<int> Mask, EVT VT) {
4535 if (VT.getVectorElementType().getSizeInBits() < 32)
4537 if (!VT.is128BitVector())
4540 unsigned NumElts = VT.getVectorNumElements();
4542 if (!isUndefOrEqual(Mask[0], NumElts))
4545 for (unsigned i = 1; i != NumElts; ++i)
4546 if (!isUndefOrEqual(Mask[i], i))
4552 /// isVPERM2X128Mask - Match 256-bit shuffles where the elements are considered
4553 /// as permutations between 128-bit chunks or halves. As an example, consider the shuffle:
4555 /// vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15>
4556 /// The first half comes from the second half of V1 and the second half from
4557 /// the second half of V2.
4558 static bool isVPERM2X128Mask(ArrayRef<int> Mask, MVT VT, bool HasFp256) {
4559 if (!HasFp256 || !VT.is256BitVector())
4562 // The shuffle result is divided into half A and half B. In total the two
4563 // sources have 4 halves, namely: C, D, E, F. The final values of A and
4564 // B must come from C, D, E or F.
4565 unsigned HalfSize = VT.getVectorNumElements()/2;
4566 bool MatchA = false, MatchB = false;
4568 // Check if A comes from one of C, D, E, F.
4569 for (unsigned Half = 0; Half != 4; ++Half) {
4570 if (isSequentialOrUndefInRange(Mask, 0, HalfSize, Half*HalfSize)) {
4576 // Check if B comes from one of C, D, E, F.
4577 for (unsigned Half = 0; Half != 4; ++Half) {
4578 if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, Half*HalfSize)) {
4584 return MatchA && MatchB;
4587 /// getShuffleVPERM2X128Immediate - Return the appropriate immediate to shuffle
4588 /// the specified VECTOR_SHUFFLE mask with VPERM2F128/VPERM2I128 instructions.
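/// For example, for the v8i32 mask <4, 5, 6, 7, 12, 13, 14, 15> the low half
/// of the result is half 1 (the high half of V1) and the high half is half 3
/// (the high half of V2), giving the immediate 1 | (3 << 4) = 0x31.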
4589 static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) {
4590 MVT VT = SVOp->getSimpleValueType(0);
4592 unsigned HalfSize = VT.getVectorNumElements()/2;
4594 unsigned FstHalf = 0, SndHalf = 0;
4595 for (unsigned i = 0; i < HalfSize; ++i) {
4596 if (SVOp->getMaskElt(i) > 0) {
4597 FstHalf = SVOp->getMaskElt(i)/HalfSize;
4601 for (unsigned i = HalfSize; i < HalfSize*2; ++i) {
4602 if (SVOp->getMaskElt(i) > 0) {
4603 SndHalf = SVOp->getMaskElt(i)/HalfSize;
4608 return (FstHalf | (SndHalf << 4));
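// Editor's note: an illustrative standalone example (not part of the
// original code) of the immediate computed above. Each defined mask element
// divided by HalfSize names one of the four source halves C, D, E, F; the
// low nibble of the immediate encodes the result's low half and bits 4-7
// encode the high half.
static unsigned sketchVPerm2X128Imm(const int *Mask, unsigned HalfSize) {
  unsigned FstHalf = 0, SndHalf = 0;
  for (unsigned i = 0; i != HalfSize; ++i)
    if (Mask[i] >= 0) { FstHalf = (unsigned)Mask[i] / HalfSize; break; }
  for (unsigned i = HalfSize; i != HalfSize * 2; ++i)
    if (Mask[i] >= 0) { SndHalf = (unsigned)Mask[i] / HalfSize; break; }
  // For the v8i32 mask <4,5,6,7, 12,13,14,15> this is 1 | (3 << 4) == 0x31.
  return FstHalf | (SndHalf << 4);
}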
4611 // Symmetric in-lane mask. Each lane has 4 elements (for imm8)
4612 static bool isPermImmMask(ArrayRef<int> Mask, MVT VT, unsigned& Imm8) {
4613 unsigned EltSize = VT.getVectorElementType().getSizeInBits();
4617 unsigned NumElts = VT.getVectorNumElements();
4619 if (VT.is128BitVector() || (VT.is256BitVector() && EltSize == 64)) {
4620 for (unsigned i = 0; i != NumElts; ++i) {
4623 Imm8 |= Mask[i] << (i*2);
4628 unsigned LaneSize = 4;
4629 SmallVector<int, 4> MaskVal(LaneSize, -1);
4631 for (unsigned l = 0; l != NumElts; l += LaneSize) {
4632 for (unsigned i = 0; i != LaneSize; ++i) {
4633 if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
4637 if (MaskVal[i] < 0) {
4638 MaskVal[i] = Mask[i+l] - l;
4639 Imm8 |= MaskVal[i] << (i*2);
4642 if (Mask[i+l] != (signed)(MaskVal[i]+l))
4649 /// isVPERMILPMask - Return true if the specified VECTOR_SHUFFLE operand
4650 /// specifies a shuffle of elements that is suitable for input to VPERMILPD*.
4651 /// Note that VPERMIL mask matching differs depending on whether the underlying
4652 /// type is 32 or 64 bits wide. For VPERMILPS the high half of the mask must
4653 /// repeat the pattern of the low half but refer to the high half of the source.
4654 /// For VPERMILPD the two lanes can be shuffled independently of each other,
4655 /// with the same restriction that lanes can't be crossed. Also handles PSHUFDY.
4656 static bool isVPERMILPMask(ArrayRef<int> Mask, MVT VT) {
4657 unsigned EltSize = VT.getVectorElementType().getSizeInBits();
4658 if (VT.getSizeInBits() < 256 || EltSize < 32)
4660 bool symmetricMaskRequired = (EltSize == 32);
4661 unsigned NumElts = VT.getVectorNumElements();
4663 unsigned NumLanes = VT.getSizeInBits()/128;
4664 unsigned LaneSize = NumElts/NumLanes;
4665 // 2 or 4 elements in one lane
4667 SmallVector<int, 4> ExpectedMaskVal(LaneSize, -1);
4668 for (unsigned l = 0; l != NumElts; l += LaneSize) {
4669 for (unsigned i = 0; i != LaneSize; ++i) {
4670 if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
4672 if (symmetricMaskRequired) {
4673 if (ExpectedMaskVal[i] < 0 && Mask[i+l] >= 0) {
4674 ExpectedMaskVal[i] = Mask[i+l] - l;
4677 if (!isUndefOrEqual(Mask[i+l], ExpectedMaskVal[i]+l))
4685 /// isCommutedMOVLMask - Returns true if the shuffle mask is the reverse of what
4686 /// x86 movss wants: the lowest element of the result must be the lowest element
4687 /// of vector 2, and the other elements must come from vector 1 in order.
4688 static bool isCommutedMOVLMask(ArrayRef<int> Mask, MVT VT,
4689 bool V2IsSplat = false, bool V2IsUndef = false) {
4690 if (!VT.is128BitVector())
4693 unsigned NumOps = VT.getVectorNumElements();
4694 if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
4697 if (!isUndefOrEqual(Mask[0], 0))
4700 for (unsigned i = 1; i != NumOps; ++i)
4701 if (!(isUndefOrEqual(Mask[i], i+NumOps) ||
4702 (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) ||
4703 (V2IsSplat && isUndefOrEqual(Mask[i], NumOps))))
4709 /// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
4710 /// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
4711 /// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7>
4712 static bool isMOVSHDUPMask(ArrayRef<int> Mask, MVT VT,
4713 const X86Subtarget *Subtarget) {
4714 if (!Subtarget->hasSSE3())
4717 unsigned NumElems = VT.getVectorNumElements();
4719 if ((VT.is128BitVector() && NumElems != 4) ||
4720 (VT.is256BitVector() && NumElems != 8) ||
4721 (VT.is512BitVector() && NumElems != 16))
4724 // "i+1" is the value the indexed mask element must have
4725 for (unsigned i = 0; i != NumElems; i += 2)
4726 if (!isUndefOrEqual(Mask[i], i+1) ||
4727 !isUndefOrEqual(Mask[i+1], i+1))
4733 /// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
4734 /// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
4735 /// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6>
4736 static bool isMOVSLDUPMask(ArrayRef<int> Mask, MVT VT,
4737 const X86Subtarget *Subtarget) {
4738 if (!Subtarget->hasSSE3())
4741 unsigned NumElems = VT.getVectorNumElements();
4743 if ((VT.is128BitVector() && NumElems != 4) ||
4744 (VT.is256BitVector() && NumElems != 8) ||
4745 (VT.is512BitVector() && NumElems != 16))
4748 // "i" is the value the indexed mask element must have
4749 for (unsigned i = 0; i != NumElems; i += 2)
4750 if (!isUndefOrEqual(Mask[i], i) ||
4751 !isUndefOrEqual(Mask[i+1], i))
4757 /// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand
4758 /// specifies a shuffle of elements that is suitable for input to 256-bit
4759 /// version of MOVDDUP.
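/// For example, for v4f64 the accepted mask is <0, 0, 2, 2>: the low 128-bit
/// half splats element 0 and the high half splats element 2.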
4760 static bool isMOVDDUPYMask(ArrayRef<int> Mask, MVT VT, bool HasFp256) {
4761 if (!HasFp256 || !VT.is256BitVector())
4764 unsigned NumElts = VT.getVectorNumElements();
4768 for (unsigned i = 0; i != NumElts/2; ++i)
4769 if (!isUndefOrEqual(Mask[i], 0))
4771 for (unsigned i = NumElts/2; i != NumElts; ++i)
4772 if (!isUndefOrEqual(Mask[i], NumElts/2))
4777 /// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand
4778 /// specifies a shuffle of elements that is suitable for input to 128-bit
4779 /// version of MOVDDUP.
4780 static bool isMOVDDUPMask(ArrayRef<int> Mask, MVT VT) {
4781 if (!VT.is128BitVector())
4784 unsigned e = VT.getVectorNumElements() / 2;
4785 for (unsigned i = 0; i != e; ++i)
4786 if (!isUndefOrEqual(Mask[i], i))
4788 for (unsigned i = 0; i != e; ++i)
4789 if (!isUndefOrEqual(Mask[e+i], i))
4794 /// isVEXTRACTIndex - Return true if the specified
4795 /// EXTRACT_SUBVECTOR operand specifies a vector extract that is
4796 /// suitable for instructions that extract 128- or 256-bit subvectors.
4797 static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
4798 assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
4799 if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
4802 // The index should be aligned on a vecWidth-bit boundary.
4804 cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
4806 MVT VT = N->getSimpleValueType(0);
4807 unsigned ElSize = VT.getVectorElementType().getSizeInBits();
4808 bool Result = (Index * ElSize) % vecWidth == 0;
4813 /// isVINSERTIndex - Return true if the specified INSERT_SUBVECTOR
4814 /// operand specifies a subvector insert that is suitable for input to
4815 /// insertion of 128- or 256-bit subvectors.
4816 static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) {
4817 assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
4818 if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
4820 // The index should be aligned on a vecWidth-bit boundary.
4822 cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
4824 MVT VT = N->getSimpleValueType(0);
4825 unsigned ElSize = VT.getVectorElementType().getSizeInBits();
4826 bool Result = (Index * ElSize) % vecWidth == 0;
4831 bool X86::isVINSERT128Index(SDNode *N) {
4832 return isVINSERTIndex(N, 128);
4835 bool X86::isVINSERT256Index(SDNode *N) {
4836 return isVINSERTIndex(N, 256);
4839 bool X86::isVEXTRACT128Index(SDNode *N) {
4840 return isVEXTRACTIndex(N, 128);
4843 bool X86::isVEXTRACT256Index(SDNode *N) {
4844 return isVEXTRACTIndex(N, 256);
4847 /// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
4848 /// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions.
4849 /// Handles 128-bit and 256-bit.
4850 static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) {
4851 MVT VT = N->getSimpleValueType(0);
4853 assert((VT.getSizeInBits() >= 128) &&
4854 "Unsupported vector type for PSHUF/SHUFP");
4856 // Handle 128 and 256-bit vector lengths. AVX defines PSHUF/SHUFP to operate
4857 // independently on 128-bit lanes.
4858 unsigned NumElts = VT.getVectorNumElements();
4859 unsigned NumLanes = VT.getSizeInBits()/128;
4860 unsigned NumLaneElts = NumElts/NumLanes;
4862 assert((NumLaneElts == 2 || NumLaneElts == 4 || NumLaneElts == 8) &&
4863 "Only supports 2, 4 or 8 elements per lane");
4865 unsigned Shift = (NumLaneElts >= 4) ? 1 : 0;
4867 for (unsigned i = 0; i != NumElts; ++i) {
4868 int Elt = N->getMaskElt(i);
4869 if (Elt < 0) continue;
4870 Elt &= NumLaneElts - 1;
4871 unsigned ShAmt = (i << Shift) % 8;
4872 Mask |= Elt << ShAmt;
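// Editor's note: an illustrative standalone example (not part of the
// original code) of the common NumLaneElts == 4 case above: two bits of the
// immediate per element, so the element-reversal mask <3,2,1,0> encodes as
// 0b00011011 == 0x1B.
static unsigned sketchPShufDImm(const int Mask[4]) {
  unsigned Imm = 0;
  for (unsigned i = 0; i != 4; ++i)
    if (Mask[i] >= 0)
      Imm |= ((unsigned)Mask[i] & 0x3) << (i * 2);
  return Imm;
}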
4878 /// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
4879 /// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction.
4880 static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) {
4881 MVT VT = N->getSimpleValueType(0);
4883 assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
4884 "Unsupported vector type for PSHUFHW");
4886 unsigned NumElts = VT.getVectorNumElements();
4889 for (unsigned l = 0; l != NumElts; l += 8) {
4890 // 8 elements per lane, but we only care about the last 4.
4891 for (unsigned i = 0; i < 4; ++i) {
4892 int Elt = N->getMaskElt(l+i+4);
4893 if (Elt < 0) continue;
4894 Elt &= 0x3; // only 2-bits.
4895 Mask |= Elt << (i * 2);
4902 /// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
4903 /// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction.
4904 static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) {
4905 MVT VT = N->getSimpleValueType(0);
4907 assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
4908 "Unsupported vector type for PSHUFHW");
4910 unsigned NumElts = VT.getVectorNumElements();
4913 for (unsigned l = 0; l != NumElts; l += 8) {
4914 // 8 elements per lane, but we only care about the first 4.
4915 for (unsigned i = 0; i < 4; ++i) {
4916 int Elt = N->getMaskElt(l+i);
4917 if (Elt < 0) continue;
4918 Elt &= 0x3; // only 2-bits
4919 Mask |= Elt << (i * 2);
4926 /// \brief Return the appropriate immediate to shuffle the specified
4927 /// VECTOR_SHUFFLE mask with the PALIGNR (if InterLane is false) or with
4928 /// VALIGN (if InterLane is true) instructions.
4929 static unsigned getShuffleAlignrImmediate(ShuffleVectorSDNode *SVOp,
4931 MVT VT = SVOp->getSimpleValueType(0);
4932 unsigned EltSize = InterLane ? 1 :
4933 VT.getVectorElementType().getSizeInBits() >> 3;
4935 unsigned NumElts = VT.getVectorNumElements();
4936 unsigned NumLanes = VT.is512BitVector() ? 1 : VT.getSizeInBits()/128;
4937 unsigned NumLaneElts = NumElts/NumLanes;
4941 for (i = 0; i != NumElts; ++i) {
4942 Val = SVOp->getMaskElt(i);
4946 if (Val >= (int)NumElts)
4947 Val -= NumElts - NumLaneElts;
4949 assert(Val - i > 0 && "PALIGNR imm should be positive");
4950 return (Val - i) * EltSize;
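// Editor's note: a simplified standalone sketch (not part of the original
// code) of the immediate above, ignoring the cross-source adjustment: every
// defined mask element sits at a constant distance Val - i from its result
// position, and that distance scaled by the element size is the byte-shift
// immediate. For the v16i8 mask <4, 5, ..., 19> this is 4.
static unsigned sketchAlignrImm(const int *Mask, unsigned NumElts,
                                unsigned EltSizeInBytes) {
  for (unsigned i = 0; i != NumElts; ++i)
    if (Mask[i] >= 0)
      return (unsigned)(Mask[i] - (int)i) * EltSizeInBytes;
  return 0; // All-undef mask: any shift amount works.
}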
4953 /// \brief Return the appropriate immediate to shuffle the specified
4954 /// VECTOR_SHUFFLE mask with the PALIGNR instruction.
4955 static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) {
4956 return getShuffleAlignrImmediate(SVOp, false);
4959 /// \brief Return the appropriate immediate to shuffle the specified
4960 /// VECTOR_SHUFFLE mask with the VALIGN instruction.
4961 static unsigned getShuffleVALIGNImmediate(ShuffleVectorSDNode *SVOp) {
4962 return getShuffleAlignrImmediate(SVOp, true);
4966 static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
4967 assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
4968 if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
4969 llvm_unreachable("Illegal extract subvector for VEXTRACT");
4972 cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
4974 MVT VecVT = N->getOperand(0).getSimpleValueType();
4975 MVT ElVT = VecVT.getVectorElementType();
4977 unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
4978 return Index / NumElemsPerChunk;
4981 static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) {
4982 assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
4983 if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
4984 llvm_unreachable("Illegal insert subvector for VINSERT");
4987 cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
4989 MVT VecVT = N->getSimpleValueType(0);
4990 MVT ElVT = VecVT.getVectorElementType();
4992 unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
4993 return Index / NumElemsPerChunk;
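// Editor's note: a standalone worked example (not part of the original
// code) of the index-to-immediate mapping shared by the extract and insert
// helpers above: the element index is converted to a 128- or 256-bit chunk
// number. Inserting a v4f32 at element index 4 of a v8f32 gives
// 4 / (128 / 32) == 1, i.e. the upper 128-bit chunk.
static unsigned sketchSubvectorChunkImm(unsigned EltIndex, unsigned EltBits,
                                        unsigned ChunkBits) {
  return EltIndex / (ChunkBits / EltBits);
}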
4996 /// getExtractVEXTRACT128Immediate - Return the appropriate immediate
4997 /// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128
4998 /// and VEXTRACTI128 instructions.
4999 unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) {
5000 return getExtractVEXTRACTImmediate(N, 128);
5003 /// getExtractVEXTRACT256Immediate - Return the appropriate immediate
5004 /// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF64x4
5005 /// and VEXTRACTI64x4 instructions.
5006 unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) {
5007 return getExtractVEXTRACTImmediate(N, 256);
5010 /// getInsertVINSERT128Immediate - Return the appropriate immediate
5011 /// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128
5012 /// and VINSERTI128 instructions.
5013 unsigned X86::getInsertVINSERT128Immediate(SDNode *N) {
5014 return getInsertVINSERTImmediate(N, 128);
5017 /// getInsertVINSERT256Immediate - Return the appropriate immediate
5018 /// to insert at the specified INSERT_SUBVECTOR index with VINSERTF64x4
5019 /// and VINSERTI64x4 instructions.
5020 unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
5021 return getInsertVINSERTImmediate(N, 256);
5024 /// isZero - Returns true if V is a constant integer zero.
5025 static bool isZero(SDValue V) {
5026 ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
5027 return C && C->isNullValue();
5030 /// isZeroNode - Returns true if Elt is a constant integer zero or a floating-point constant +0.0.
5032 bool X86::isZeroNode(SDValue Elt) {
5035 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Elt))
5036 return CFP->getValueAPF().isPosZero();
5040 /// ShouldXformToMOVHLPS - Return true if the node should be transformed to
5041 /// match movhlps. The lower half elements should come from upper half of
5042 /// V1 (and in order), and the upper half elements should come from the upper
5043 /// half of V2 (and in order).
5044 static bool ShouldXformToMOVHLPS(ArrayRef<int> Mask, MVT VT) {
5045 if (!VT.is128BitVector())
5047 if (VT.getVectorNumElements() != 4)
5049 for (unsigned i = 0, e = 2; i != e; ++i)
5050 if (!isUndefOrEqual(Mask[i], i+2))
5052 for (unsigned i = 2; i != 4; ++i)
5053 if (!isUndefOrEqual(Mask[i], i+4))
5058 /// isScalarLoadToVector - Returns true if the node is a scalar load that
5059 /// is promoted to a vector. It also returns the LoadSDNode by reference if one is requested.
5061 static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = nullptr) {
5062 if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
5064 N = N->getOperand(0).getNode();
5065 if (!ISD::isNON_EXTLoad(N))
5068 *LD = cast<LoadSDNode>(N);
5072 // Test whether the given value is a vector value which will be legalized into a load.
5074 static bool WillBeConstantPoolLoad(SDNode *N) {
5075 if (N->getOpcode() != ISD::BUILD_VECTOR)
5078 // Check for any non-constant elements.
5079 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
5080 switch (N->getOperand(i).getNode()->getOpcode()) {
5082 case ISD::ConstantFP:
5089 // Vectors of all-zeros and all-ones are materialized with special
5090 // instructions rather than being loaded.
5091 return !ISD::isBuildVectorAllZeros(N) &&
5092 !ISD::isBuildVectorAllOnes(N);
5095 /// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
5096 /// match movlp{s|d}. The lower half elements should come from lower half of
5097 /// V1 (and in order), and the upper half elements should come from the upper
5098 /// half of V2 (and in order). And since V1 will become the source of the
5099 /// MOVLP, it must be either a vector load or a scalar load to vector.
5100 static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
5101 ArrayRef<int> Mask, MVT VT) {
5102 if (!VT.is128BitVector())
5105 if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
5107 // If V2 is a vector load, don't do this transformation. We will try to use
5108 // load folding with the shufps op instead.
5109 if (ISD::isNON_EXTLoad(V2) || WillBeConstantPoolLoad(V2))
5112 unsigned NumElems = VT.getVectorNumElements();
5114 if (NumElems != 2 && NumElems != 4)
5116 for (unsigned i = 0, e = NumElems/2; i != e; ++i)
5117 if (!isUndefOrEqual(Mask[i], i))
5119 for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
5120 if (!isUndefOrEqual(Mask[i], i+NumElems))
5125 /// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
5126 /// to a zero vector.
5127 /// FIXME: move to dag combiner / method on ShuffleVectorSDNode
5128 static bool isZeroShuffle(ShuffleVectorSDNode *N) {
5129 SDValue V1 = N->getOperand(0);
5130 SDValue V2 = N->getOperand(1);
5131 unsigned NumElems = N->getValueType(0).getVectorNumElements();
5132 for (unsigned i = 0; i != NumElems; ++i) {
5133 int Idx = N->getMaskElt(i);
5134 if (Idx >= (int)NumElems) {
5135 unsigned Opc = V2.getOpcode();
5136 if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode()))
5138 if (Opc != ISD::BUILD_VECTOR ||
5139 !X86::isZeroNode(V2.getOperand(Idx-NumElems)))
5141 } else if (Idx >= 0) {
5142 unsigned Opc = V1.getOpcode();
5143 if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode()))
5145 if (Opc != ISD::BUILD_VECTOR ||
5146 !X86::isZeroNode(V1.getOperand(Idx)))
5153 /// getZeroVector - Returns a vector of specified type with all zero elements.
5155 static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
5156 SelectionDAG &DAG, SDLoc dl) {
5157 assert(VT.isVector() && "Expected a vector type");
5159 // Always build SSE zero vectors as <4 x i32> bitcasted
5160 // to their dest type. This ensures they get CSE'd.
5162 if (VT.is128BitVector()) { // SSE
5163 if (Subtarget->hasSSE2()) { // SSE2
5164 SDValue Cst = DAG.getConstant(0, MVT::i32);
5165 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
5167 SDValue Cst = DAG.getConstantFP(+0.0, MVT::f32);
5168 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
5170 } else if (VT.is256BitVector()) { // AVX
5171 if (Subtarget->hasInt256()) { // AVX2
5172 SDValue Cst = DAG.getConstant(0, MVT::i32);
5173 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
5174 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
5176 // 256-bit logic and arithmetic instructions in AVX are all
5177 // floating-point, no support for integer ops. Emit fp zeroed vectors.
5178 SDValue Cst = DAG.getConstantFP(+0.0, MVT::f32);
5179 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
5180 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops);
5182 } else if (VT.is512BitVector()) { // AVX-512
5183 SDValue Cst = DAG.getConstant(0, MVT::i32);
5184 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
5185 Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
5186 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops);
5187 } else if (VT.getScalarType() == MVT::i1) {
5188 assert(VT.getVectorNumElements() <= 16 && "Unexpected vector type");
5189 SDValue Cst = DAG.getConstant(0, MVT::i1);
5190 SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
5191 return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
5193 llvm_unreachable("Unexpected vector type");
5195 return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
5198 /// getOnesVector - Returns a vector of specified type with all bits set.
5199 /// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
5200 /// no AVX2 support, insert two <4 x i32> halves into an <8 x i32> appropriately.
5201 /// Then bitcast to their original type, ensuring they get CSE'd.
5202 static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG,
5204 assert(VT.isVector() && "Expected a vector type");
5206 SDValue Cst = DAG.getConstant(~0U, MVT::i32);
5208 if (VT.is256BitVector()) {
5209 if (HasInt256) { // AVX2
5210 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
5211 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
5213 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
5214 Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
5216 } else if (VT.is128BitVector()) {
5217 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
5219 llvm_unreachable("Unexpected vector type");
5221 return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
5224 /// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements
5225 /// that point to V2 point to its first element.
5226 static void NormalizeMask(SmallVectorImpl<int> &Mask, unsigned NumElems) {
5227 for (unsigned i = 0; i != NumElems; ++i) {
5228 if (Mask[i] > (int)NumElems) {
5234 /// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd
5235 /// operation of specified width.
5236 static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
5238 unsigned NumElems = VT.getVectorNumElements();
5239 SmallVector<int, 8> Mask;
5240 Mask.push_back(NumElems);
5241 for (unsigned i = 1; i != NumElems; ++i)
5243 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
5246 /// getUnpackl - Returns a vector_shuffle node for an unpackl operation.
5247 static SDValue getUnpackl(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
5249 unsigned NumElems = VT.getVectorNumElements();
5250 SmallVector<int, 8> Mask;
5251 for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
5253 Mask.push_back(i + NumElems);
5255 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
5258 /// getUnpackh - Returns a vector_shuffle node for an unpackh operation.
5259 static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
5261 unsigned NumElems = VT.getVectorNumElements();
5262 SmallVector<int, 8> Mask;
5263 for (unsigned i = 0, Half = NumElems/2; i != Half; ++i) {
5264 Mask.push_back(i + Half);
5265 Mask.push_back(i + NumElems + Half);
5267 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
5270 // PromoteSplati8i16 - i16 and i8 vector types can't be used directly by
5271 // a generic shuffle instruction because the target has no such instructions.
5272 // Generate shuffles which repeat i16 and i8 several times until they can be
5273 // represented by v4f32 and then be manipulated by target supported shuffles.
5274 static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) {
5275 MVT VT = V.getSimpleValueType();
5276 int NumElems = VT.getVectorNumElements();
5279 while (NumElems > 4) {
5280 if (EltNo < NumElems/2) {
5281 V = getUnpackl(DAG, dl, VT, V, V);
5283 V = getUnpackh(DAG, dl, VT, V, V);
5284 EltNo -= NumElems/2;
5291 /// getLegalSplat - Generate a legal splat with supported x86 shuffles
5292 static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
5293 MVT VT = V.getSimpleValueType();
5296 if (VT.is128BitVector()) {
5297 V = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V);
5298 int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
5299 V = DAG.getVectorShuffle(MVT::v4f32, dl, V, DAG.getUNDEF(MVT::v4f32),
5301 } else if (VT.is256BitVector()) {
5302 // To use VPERMILPS to splat scalars, the second half of the indices must
5303 // refer to the higher part, which is a duplication of the lower one,
5304 // because VPERMILPS can only handle in-lane permutations.
5305 int SplatMask[8] = { EltNo, EltNo, EltNo, EltNo,
5306 EltNo+4, EltNo+4, EltNo+4, EltNo+4 };
5308 V = DAG.getNode(ISD::BITCAST, dl, MVT::v8f32, V);
5309 V = DAG.getVectorShuffle(MVT::v8f32, dl, V, DAG.getUNDEF(MVT::v8f32),
5312 llvm_unreachable("Vector size not supported");
5314 return DAG.getNode(ISD::BITCAST, dl, VT, V);
5317 /// PromoteSplat - Splat is promoted to target supported vector shuffles.
5318 static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
5319 MVT SrcVT = SV->getSimpleValueType(0);
5320 SDValue V1 = SV->getOperand(0);
5323 int EltNo = SV->getSplatIndex();
5324 int NumElems = SrcVT.getVectorNumElements();
5325 bool Is256BitVec = SrcVT.is256BitVector();
5327 assert(((SrcVT.is128BitVector() && NumElems > 4) || Is256BitVec) &&
5328 "Unknown how to promote splat for type");
5330 // Extract the 128-bit part containing the splat element and update
5331 // the splat element index when it refers to the higher register.
5333 V1 = Extract128BitVector(V1, EltNo, DAG, dl);
5334 if (EltNo >= NumElems/2)
5335 EltNo -= NumElems/2;
5338 // i16 and i8 vector types can't be used directly by a generic shuffle
5339 // instruction because the target has no such instruction. Generate shuffles
5340 // which repeat i16 and i8 several times until they fit in i32, and then can
5341 // be manipulated by target supported shuffles.
5342 MVT EltVT = SrcVT.getVectorElementType();
5343 if (EltVT == MVT::i8 || EltVT == MVT::i16)
5344 V1 = PromoteSplati8i16(V1, DAG, EltNo);
5346 // Recreate the 256-bit vector and place the same 128-bit vector
5347 // into the low and high part. This is necessary because we want
5348 // to use VPERM* to shuffle the vectors
5350 V1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT, V1, V1);
5353 return getLegalSplat(DAG, V1, EltNo);
5356 /// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
5357 /// vector and a zero or undef vector. This produces a shuffle where the low
5358 /// element of V2 is swizzled into the zero/undef vector, landing at element
5359 /// Idx. This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
5360 static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
5362 const X86Subtarget *Subtarget,
5363 SelectionDAG &DAG) {
5364 MVT VT = V2.getSimpleValueType();
5366 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
5367 unsigned NumElems = VT.getVectorNumElements();
5368 SmallVector<int, 16> MaskVec;
5369 for (unsigned i = 0; i != NumElems; ++i)
5370 // If this is the insertion idx, put the low elt of V2 here.
5371 MaskVec.push_back(i == Idx ? NumElems : i);
5372 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, &MaskVec[0]);
5375 /// getTargetShuffleMask - Calculates the shuffle mask corresponding to the
5376 /// target specific opcode. Returns true if the Mask could be calculated. Sets
5377 /// IsUnary to true if it only uses one source. Note that this will set IsUnary for
5378 /// shuffles which use a single input multiple times, and in those cases it will
5379 /// adjust the mask to only have indices within that single input.
5380 static bool getTargetShuffleMask(SDNode *N, MVT VT,
5381 SmallVectorImpl<int> &Mask, bool &IsUnary) {
5382 unsigned NumElems = VT.getVectorNumElements();
5386 bool IsFakeUnary = false;
5387 switch(N->getOpcode()) {
5388 case X86ISD::BLENDI:
5389 ImmN = N->getOperand(N->getNumOperands()-1);
5390 DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5393 ImmN = N->getOperand(N->getNumOperands()-1);
5394 DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5395 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5397 case X86ISD::UNPCKH:
5398 DecodeUNPCKHMask(VT, Mask);
5399 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5401 case X86ISD::UNPCKL:
5402 DecodeUNPCKLMask(VT, Mask);
5403 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5405 case X86ISD::MOVHLPS:
5406 DecodeMOVHLPSMask(NumElems, Mask);
5407 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5409 case X86ISD::MOVLHPS:
5410 DecodeMOVLHPSMask(NumElems, Mask);
5411 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5413 case X86ISD::PALIGNR:
5414 ImmN = N->getOperand(N->getNumOperands()-1);
5415 DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5417 case X86ISD::PSHUFD:
5418 case X86ISD::VPERMILPI:
5419 ImmN = N->getOperand(N->getNumOperands()-1);
5420 DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5423 case X86ISD::PSHUFHW:
5424 ImmN = N->getOperand(N->getNumOperands()-1);
5425 DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5428 case X86ISD::PSHUFLW:
5429 ImmN = N->getOperand(N->getNumOperands()-1);
5430 DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5433 case X86ISD::PSHUFB: {
5435 SDValue MaskNode = N->getOperand(1);
5436 while (MaskNode->getOpcode() == ISD::BITCAST)
5437 MaskNode = MaskNode->getOperand(0);
5439 if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) {
5440 // If we have a build-vector, then things are easy.
5441 EVT VT = MaskNode.getValueType();
5442 assert(VT.isVector() &&
5443 "Can't produce a non-vector with a build_vector!");
5444 if (!VT.isInteger())
5447 int NumBytesPerElement = VT.getVectorElementType().getSizeInBits() / 8;
5449 SmallVector<uint64_t, 32> RawMask;
5450 for (int i = 0, e = MaskNode->getNumOperands(); i < e; ++i) {
5451 SDValue Op = MaskNode->getOperand(i);
5452 if (Op->getOpcode() == ISD::UNDEF) {
5453 RawMask.push_back((uint64_t)SM_SentinelUndef);
5456 auto *CN = dyn_cast<ConstantSDNode>(Op.getNode());
5459 APInt MaskElement = CN->getAPIntValue();
5461 // We now have to decode the element which could be any integer size and
5462 // extract each byte of it.
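// For example, a v4i32 mask element with value 0x0D0C0F0E contributes
// the byte sequence {0x0E, 0x0F, 0x0C, 0x0D} to RawMask.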
5463 for (int j = 0; j < NumBytesPerElement; ++j) {
5464 // Note that this is x86 and so always little endian: the low byte is
5465 // the first byte of the mask.
5466 RawMask.push_back(MaskElement.getLoBits(8).getZExtValue());
5467 MaskElement = MaskElement.lshr(8);
5470 DecodePSHUFBMask(RawMask, Mask);
5474 auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode);
5478 SDValue Ptr = MaskLoad->getBasePtr();
5479 if (Ptr->getOpcode() == X86ISD::Wrapper)
5480 Ptr = Ptr->getOperand(0);
5482 auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr);
5483 if (!MaskCP || MaskCP->isMachineConstantPoolEntry())
5486 if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) {
5487 DecodePSHUFBMask(C, Mask);
5495 case X86ISD::VPERMI:
5496 ImmN = N->getOperand(N->getNumOperands()-1);
5497 DecodeVPERMMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5502 DecodeScalarMoveMask(VT, /* IsLoad */ false, Mask);
5504 case X86ISD::VPERM2X128:
5505 ImmN = N->getOperand(N->getNumOperands()-1);
5506 DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5507 if (Mask.empty()) return false;
5509 case X86ISD::MOVSLDUP:
5510 DecodeMOVSLDUPMask(VT, Mask);
5513 case X86ISD::MOVSHDUP:
5514 DecodeMOVSHDUPMask(VT, Mask);
5517 case X86ISD::MOVDDUP:
5518 DecodeMOVDDUPMask(VT, Mask);
5521 case X86ISD::MOVLHPD:
5522 case X86ISD::MOVLPD:
5523 case X86ISD::MOVLPS:
5524 // Not yet implemented
5526 default: llvm_unreachable("unknown target shuffle node");
5529 // If we have a fake unary shuffle, the shuffle mask is spread across two
5530 // inputs that are actually the same node. Re-map the mask to always point
5531 // into the first input.
5534 if (M >= (int)Mask.size())
5540 /// getShuffleScalarElt - Returns the scalar element that will make up the ith
5541 /// element of the result of the vector shuffle.
5542 static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
5545 return SDValue(); // Limit search depth.
5547 SDValue V = SDValue(N, 0);
5548 EVT VT = V.getValueType();
5549 unsigned Opcode = V.getOpcode();
5551 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
5552 if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
5553 int Elt = SV->getMaskElt(Index);
5556 return DAG.getUNDEF(VT.getVectorElementType());
5558 unsigned NumElems = VT.getVectorNumElements();
5559 SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
5560 : SV->getOperand(1);
5561 return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
5564 // Recurse into target specific vector shuffles to find scalars.
5565 if (isTargetShuffle(Opcode)) {
5566 MVT ShufVT = V.getSimpleValueType();
5567 unsigned NumElems = ShufVT.getVectorNumElements();
5568 SmallVector<int, 16> ShuffleMask;
5571 if (!getTargetShuffleMask(N, ShufVT, ShuffleMask, IsUnary))
5574 int Elt = ShuffleMask[Index];
5576 return DAG.getUNDEF(ShufVT.getVectorElementType());
5578 SDValue NewV = (Elt < (int)NumElems) ? N->getOperand(0)
5580 return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
5584 // Actual nodes that may contain scalar elements
5585 if (Opcode == ISD::BITCAST) {
5586 V = V.getOperand(0);
5587 EVT SrcVT = V.getValueType();
5588 unsigned NumElems = VT.getVectorNumElements();
5590 if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
5594 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
5595 return (Index == 0) ? V.getOperand(0)
5596 : DAG.getUNDEF(VT.getVectorElementType());
5598 if (V.getOpcode() == ISD::BUILD_VECTOR)
5599 return V.getOperand(Index);
5604 /// getNumOfConsecutiveZeros - Return the number of elements of a vector
5605 /// shuffle operation which consecutively come from a zero vector. The
5606 /// search can start in two different directions, from left or right.
5607 /// We count undefs as zeros until PreferredNum is reached.
5608 static unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp,
5609 unsigned NumElems, bool ZerosFromLeft,
5611 unsigned PreferredNum = -1U) {
5612 unsigned NumZeros = 0;
5613 for (unsigned i = 0; i != NumElems; ++i) {
5614 unsigned Index = ZerosFromLeft ? i : NumElems - i - 1;
5615 SDValue Elt = getShuffleScalarElt(SVOp, Index, DAG, 0);
5619 if (X86::isZeroNode(Elt))
5621 else if (Elt.getOpcode() == ISD::UNDEF) // Undef as zero up to PreferredNum.
5622 NumZeros = std::min(NumZeros + 1, PreferredNum);
5630 /// isShuffleMaskConsecutive - Check if the shuffle mask indices [MaskI, MaskE)
5631 /// correspond consecutively to elements from one of the vector operands,
5632 /// starting from its index OpIdx. Also sets OpNum to the matching source operand.
5634 bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp,
5635 unsigned MaskI, unsigned MaskE, unsigned OpIdx,
5636 unsigned NumElems, unsigned &OpNum) {
5637 bool SeenV1 = false;
5638 bool SeenV2 = false;
5640 for (unsigned i = MaskI; i != MaskE; ++i, ++OpIdx) {
5641 int Idx = SVOp->getMaskElt(i);
5642 // Ignore undef indices
5646 if (Idx < (int)NumElems)
5651 // Only accept consecutive elements from the same vector
5652 if ((Idx % NumElems != OpIdx) || (SeenV1 && SeenV2))
5656 OpNum = SeenV1 ? 0 : 1;
5660 /// isVectorShiftRight - Returns true if the shuffle can be implemented as a
5661 /// logical right shift of a vector.
5662 static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
5663 bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
5665 SVOp->getSimpleValueType(0).getVectorNumElements();
5666 unsigned NumZeros = getNumOfConsecutiveZeros(
5667 SVOp, NumElems, false /* check zeros from right */, DAG,
5668 SVOp->getMaskElt(0));
5674 // Considering the elements in the mask that are not consecutive zeros,
5675 // check if they consecutively come from only one of the source vectors.
5677 // For example, V1 = {X, A, B, C} with a zero shifted in on the right:
5679 //   vector_shuffle V1, V2 <1, 2, 3, X>
5681 if (!isShuffleMaskConsecutive(SVOp,
5682 0, // Mask Start Index
5683 NumElems-NumZeros, // Mask End Index(exclusive)
5684 NumZeros, // Where to start looking in the src vector
5685 NumElems, // Number of elements in vector
5686 OpSrc)) // Which source operand ?
5691 ShVal = SVOp->getOperand(OpSrc);
5695 /// isVectorShiftLeft - Returns true if the shuffle can be implemented as a
5696 /// logical left shift of a vector.
5697 static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
5698 bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
5700 SVOp->getSimpleValueType(0).getVectorNumElements();
5701 unsigned NumZeros = getNumOfConsecutiveZeros(
5702 SVOp, NumElems, true /* check zeros from left */, DAG,
5703 NumElems - SVOp->getMaskElt(NumElems - 1) - 1);
5709 // Considering the elements in the mask that are not consecutive zeros,
5710 // check if they consecutively come from only one of the source vectors.
5712 // For example, V2 = {A, B, X, X} with zeros shifted in on the left:
5714 //   vector_shuffle V1, V2 <X, X, 4, 5>
5716 if (!isShuffleMaskConsecutive(SVOp,
5717 NumZeros, // Mask Start Index
5718 NumElems, // Mask End Index(exclusive)
5719 0, // Where to start looking in the src vector
5720 NumElems, // Number of elements in vector
5721 OpSrc)) // Which source operand ?
5726 ShVal = SVOp->getOperand(OpSrc);
5730 /// isVectorShift - Returns true if the shuffle can be implemented as a
5731 /// logical left or right shift of a vector.
5732 static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
5733 bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
5734 // Although the logic below supports any bitwidth size, there are no
5735 // shift instructions which handle more than 128-bit vectors.
5736 if (!SVOp->getSimpleValueType(0).is128BitVector())
5739 if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) ||
5740 isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt))
5746 /// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
5748 static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
5749 unsigned NumNonZero, unsigned NumZero,
5751 const X86Subtarget* Subtarget,
5752 const TargetLowering &TLI) {
5759 for (unsigned i = 0; i < 16; ++i) {
5760 bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
5761 if (ThisIsNonZero && First) {
5763 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
5765 V = DAG.getUNDEF(MVT::v8i16);
5770 SDValue ThisElt, LastElt;
5771 bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
5772 if (LastIsNonZero) {
5773 LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
5774 MVT::i16, Op.getOperand(i-1));
5776 if (ThisIsNonZero) {
5777 ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
5778 ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
5779 ThisElt, DAG.getConstant(8, MVT::i8));
5781 ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
5785 if (ThisElt.getNode())
5786 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
5787 DAG.getIntPtrConstant(i/2));
5791 return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V);
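// Editor's note: a standalone scalar sketch (not part of the original code)
// of the packing performed above: each even/odd byte pair of the v16i8
// build_vector becomes one 16-bit lane, with the odd-indexed byte
// zero-extended and shifted into the high half (x86 is little-endian).
static unsigned short sketchPackBytePair(unsigned char EvenByte,
                                         unsigned char OddByte) {
  return (unsigned short)(EvenByte | ((unsigned)OddByte << 8));
}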
5794 /// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16.
5796 static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
5797 unsigned NumNonZero, unsigned NumZero,
5799 const X86Subtarget* Subtarget,
5800 const TargetLowering &TLI) {
5807 for (unsigned i = 0; i < 8; ++i) {
5808 bool isNonZero = (NonZeros & (1 << i)) != 0;
5812 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
5814 V = DAG.getUNDEF(MVT::v8i16);
5817 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
5818 MVT::v8i16, V, Op.getOperand(i),
5819 DAG.getIntPtrConstant(i));
5826 /// LowerBuildVectorv4x32 - Custom lower build_vector of v4i32 or v4f32.
5827 static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
5828 const X86Subtarget *Subtarget,
5829 const TargetLowering &TLI) {
5830 // Find all zeroable elements.
5832 for (int i=0; i < 4; ++i) {
5833 SDValue Elt = Op->getOperand(i);
5834 Zeroable[i] = (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt));
5836 assert(std::count_if(&Zeroable[0], &Zeroable[4],
5837 [](bool M) { return !M; }) > 1 &&
5838 "We expect at least two non-zero elements!");
5840 // We only know how to deal with build_vector nodes where elements are either
5841 // zeroable or extract_vector_elt with constant index.
5842 SDValue FirstNonZero;
5843 unsigned FirstNonZeroIdx;
5844 for (unsigned i=0; i < 4; ++i) {
5847 SDValue Elt = Op->getOperand(i);
5848 if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
5849 !isa<ConstantSDNode>(Elt.getOperand(1)))
5851 // Make sure that this node is extracting from a 128-bit vector.
5852 MVT VT = Elt.getOperand(0).getSimpleValueType();
5853 if (!VT.is128BitVector())
5855 if (!FirstNonZero.getNode()) {
5857 FirstNonZeroIdx = i;
5861 assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
5862 SDValue V1 = FirstNonZero.getOperand(0);
5863 MVT VT = V1.getSimpleValueType();
5865 // See if this build_vector can be lowered as a blend with zero.
5867 unsigned EltMaskIdx, EltIdx;
5869 for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
5870 if (Zeroable[EltIdx]) {
5871 // The zero vector will be on the right hand side.
5872 Mask[EltIdx] = EltIdx+4;
5876 Elt = Op->getOperand(EltIdx);
5877 // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
5878 EltMaskIdx = cast<ConstantSDNode>(Elt.getOperand(1))->getZExtValue();
5879 if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
5881 Mask[EltIdx] = EltIdx;
5885 // Let the shuffle legalizer deal with blend operations.
5886 SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
5887 if (V1.getSimpleValueType() != VT)
5888 V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), VT, V1);
5889 return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, &Mask[0]);
5892 // See if we can lower this build_vector to an INSERTPS.
5893 if (!Subtarget->hasSSE41())
5896 SDValue V2 = Elt.getOperand(0);
5897 if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
5900 bool CanFold = true;
5901 for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
5905 SDValue Current = Op->getOperand(i);
5906 SDValue SrcVector = Current->getOperand(0);
5909 CanFold = SrcVector == V1 &&
5910 cast<ConstantSDNode>(Current.getOperand(1))->getZExtValue() == i;
5916 assert(V1.getNode() && "Expected at least two non-zero elements!");
5917 if (V1.getSimpleValueType() != MVT::v4f32)
5918 V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), MVT::v4f32, V1);
5919 if (V2.getSimpleValueType() != MVT::v4f32)
5920 V2 = DAG.getNode(ISD::BITCAST, SDLoc(V2), MVT::v4f32, V2);
5922 // Ok, we can emit an INSERTPS instruction.
5924 for (int i = 0; i < 4; ++i)
5928 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
5929 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
5930 SDValue Result = DAG.getNode(X86ISD::INSERTPS, SDLoc(Op), MVT::v4f32, V1, V2,
5931 DAG.getIntPtrConstant(InsertPSMask));
5932 return DAG.getNode(ISD::BITCAST, SDLoc(Op), VT, Result);
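// Editor's note: an illustrative standalone helper (not part of the
// original code) showing the INSERTPS immediate layout assembled above:
// bits [7:6] select the source element of V2, bits [5:4] select the
// destination slot in V1, and bits [3:0] are a mask of destination
// elements to zero.
static unsigned sketchInsertPSImm(unsigned SrcElt, unsigned DstElt,
                                  unsigned ZeroMask) {
  return (SrcElt << 6) | (DstElt << 4) | (ZeroMask & 0xF);
}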
5935 /// Return a vector logical shift node.
5936 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
5937 unsigned NumBits, SelectionDAG &DAG,
5938 const TargetLowering &TLI, SDLoc dl) {
5939 assert(VT.is128BitVector() && "Unknown type for VShift");
5940 MVT ShVT = MVT::v2i64;
5941 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
5942 SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp);
5943 MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(SrcOp.getValueType());
5944 assert(NumBits % 8 == 0 && "Only support byte sized shifts");
5945 SDValue ShiftVal = DAG.getConstant(NumBits/8, ScalarShiftTy);
5946 return DAG.getNode(ISD::BITCAST, dl, VT,
5947 DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
5951 LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) {
5953 // Check if the scalar load can be widened into a vector load, and if
5954 // the address is "base + cst", see if the cst can be "absorbed" into
5955 // the shuffle mask.
5956 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
5957 SDValue Ptr = LD->getBasePtr();
5958 if (!ISD::isNormalLoad(LD) || LD->isVolatile())
5960 EVT PVT = LD->getValueType(0);
5961 if (PVT != MVT::i32 && PVT != MVT::f32)
5966 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
5967 FI = FINode->getIndex();
5969 } else if (DAG.isBaseWithConstantOffset(Ptr) &&
5970 isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
5971 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
5972 Offset = Ptr.getConstantOperandVal(1);
5973 Ptr = Ptr.getOperand(0);
5978 // FIXME: 256-bit vector instructions don't require a strict alignment,
5979 // improve this code to support it better.
5980 unsigned RequiredAlign = VT.getSizeInBits()/8;
5981 SDValue Chain = LD->getChain();
5982 // Make sure the stack object alignment is at least 16 or 32.
5983 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
5984 if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
5985 if (MFI->isFixedObjectIndex(FI)) {
5986 // Can't change the alignment. FIXME: It's possible to compute
5987 // the exact stack offset and reference FI + adjust offset instead.
5988 // If someone *really* cares about this. That's the way to implement it.
5991 MFI->setObjectAlignment(FI, RequiredAlign);
5995 // (Offset % 16 or 32) must be a multiple of 4. The address is then
5996 // Ptr + (Offset & ~15).
5999 if ((Offset % RequiredAlign) & 3)
6001 int64_t StartOffset = Offset & ~(RequiredAlign-1);
6003 Ptr = DAG.getNode(ISD::ADD, SDLoc(Ptr), Ptr.getValueType(),
6004 Ptr,DAG.getConstant(StartOffset, Ptr.getValueType()));
6006 int EltNo = (Offset - StartOffset) >> 2;
6007 unsigned NumElems = VT.getVectorNumElements();
6009 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
6010 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
6011 LD->getPointerInfo().getWithOffset(StartOffset),
6012 false, false, false, 0);
6014 SmallVector<int, 8> Mask(NumElems, EltNo);
6016 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]);
6022 /// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
6023 /// elements can be replaced by a single large load which has the same value as
6024 /// a build_vector or insert_subvector whose loaded operands are 'Elts'.
6026 /// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a
6028 /// FIXME: we'd also like to handle the case where the last elements are zero
6029 /// rather than undef via VZEXT_LOAD, but we do not detect that case today.
6030 /// There's even a handy isZeroNode for that purpose.
6031 static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
6032 SDLoc &DL, SelectionDAG &DAG,
6033 bool isAfterLegalize) {
6034 unsigned NumElems = Elts.size();
6036 LoadSDNode *LDBase = nullptr;
6037 unsigned LastLoadedElt = -1U;
6039 // For each element in the initializer, see if we've found a load or an undef.
6040 // If we don't find an initial load element, or later load elements are
6041 // non-consecutive, bail out.
6042 for (unsigned i = 0; i < NumElems; ++i) {
6043 SDValue Elt = Elts[i];
6044 // Look through a bitcast.
6045 if (Elt.getNode() && Elt.getOpcode() == ISD::BITCAST)
6046 Elt = Elt.getOperand(0);
6047 if (!Elt.getNode() ||
6048 (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
6051 if (Elt.getNode()->getOpcode() == ISD::UNDEF)
6053 LDBase = cast<LoadSDNode>(Elt.getNode());
6057 if (Elt.getOpcode() == ISD::UNDEF)
6060 LoadSDNode *LD = cast<LoadSDNode>(Elt);
6061 EVT LdVT = Elt.getValueType();
6062 // Each loaded element must be the correct fractional portion of the
6063 // requested vector load.
6064 if (LdVT.getSizeInBits() != VT.getSizeInBits() / NumElems)
6066 if (!DAG.isConsecutiveLoad(LD, LDBase, LdVT.getSizeInBits() / 8, i))
6071 // If we have found an entire vector of loads and undefs, then return a large
6072 // load of the entire vector width starting at the base pointer. If we found
6073 // consecutive loads for the low half, generate a vzext_load node.
6074 if (LastLoadedElt == NumElems - 1) {
6075 assert(LDBase && "Did not find base load for merging consecutive loads");
6076 EVT EltVT = LDBase->getValueType(0);
6077 // Ensure that the input vector size for the merged loads matches the
6078 // cumulative size of the input elements.
6079 if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems)
6082 if (isAfterLegalize &&
6083 !DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, VT))
6086 SDValue NewLd = SDValue();
6088 NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
6089 LDBase->getPointerInfo(), LDBase->isVolatile(),
6090 LDBase->isNonTemporal(), LDBase->isInvariant(),
6091 LDBase->getAlignment());
6093 if (LDBase->hasAnyUseOfValue(1)) {
6094 SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
6096 SDValue(NewLd.getNode(), 1));
6097 DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
6098 DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
6099 SDValue(NewLd.getNode(), 1));
6105 // TODO: The code below fires only for loading the low v2i32 / v2f32
6106 // of a v4i32 / v4f32. It's probably worth generalizing.
6107 EVT EltVT = VT.getVectorElementType();
6108 if (NumElems == 4 && LastLoadedElt == 1 && (EltVT.getSizeInBits() == 32) &&
6109 DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
6110 SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
6111 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
6113 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, MVT::i64,
6114 LDBase->getPointerInfo(),
6115 LDBase->getAlignment(),
6116 false/*isVolatile*/, true/*ReadMem*/,
6119 // Make sure the newly-created LOAD is in the same position as LDBase in
6120 // terms of dependency. We create a TokenFactor for LDBase and ResNode, and
6121 // update uses of LDBase's output chain to use the TokenFactor.
6122 if (LDBase->hasAnyUseOfValue(1)) {
6123 SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
6124 SDValue(LDBase, 1), SDValue(ResNode.getNode(), 1));
6125 DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
6126 DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
6127 SDValue(ResNode.getNode(), 1));
6130 return DAG.getNode(ISD::BITCAST, DL, VT, ResNode);
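// Editor's note: a standalone sketch (not part of the original code) of
// the consecutiveness requirement enforced above, phrased over plain
// addresses: element i must load from Base + i * EltBytes for the whole
// build_vector to fold into one wide load.
static bool sketchAreConsecutive(const long long *Addrs, unsigned NumElems,
                                 unsigned EltBytes) {
  for (unsigned i = 1; i != NumElems; ++i)
    if (Addrs[i] != Addrs[0] + (long long)i * EltBytes)
      return false;
  return true;
}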
6135 /// LowerVectorBroadcast - Attempt to use the vbroadcast instruction
6136 /// to generate a splat value for the following cases:
6137 /// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant.
6138 /// 2. A splat shuffle which uses a scalar_to_vector node which comes from
6139 /// a scalar load, or a constant.
6140 /// The VBROADCAST node is returned when a pattern is found,
6141 /// or SDValue() otherwise.
6142 static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
6143 SelectionDAG &DAG) {
6144 // VBROADCAST requires AVX.
6145 // TODO: Splats could be generated for non-AVX CPUs using SSE
6146 // instructions, but there's less potential gain for only 128-bit vectors.
6147 if (!Subtarget->hasAVX())
6150 MVT VT = Op.getSimpleValueType();
6153 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
6154 "Unsupported vector type for broadcast.");
6159 switch (Op.getOpcode()) {
6161 // Unknown pattern found.
6164 case ISD::BUILD_VECTOR: {
6165 auto *BVOp = cast<BuildVectorSDNode>(Op.getNode());
6166 BitVector UndefElements;
6167 SDValue Splat = BVOp->getSplatValue(&UndefElements);
6169 // We need a splat of a single value to use broadcast, and it doesn't
6170 // make any sense if the value is only in one element of the vector.
6171 if (!Splat || (VT.getVectorNumElements() - UndefElements.count()) <= 1)
6175 ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
6176 Ld.getOpcode() == ISD::ConstantFP);
6178 // Make sure that all of the users of a non-constant load are from the
6179 // BUILD_VECTOR node.
6180 if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
6185 case ISD::VECTOR_SHUFFLE: {
6186 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
6188 // Shuffles must have a splat mask where the first element is splatted.
6190 if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0)
6193 SDValue Sc = Op.getOperand(0);
6194 if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR &&
6195 Sc.getOpcode() != ISD::BUILD_VECTOR) {
6197 if (!Subtarget->hasInt256())
6200 // Use the register form of the broadcast instruction available on AVX2.
6201 if (VT.getSizeInBits() >= 256)
6202 Sc = Extract128BitVector(Sc, 0, DAG, dl);
6203 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc);
6206 Ld = Sc.getOperand(0);
6207 ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
6208 Ld.getOpcode() == ISD::ConstantFP);
6210 // The scalar_to_vector node and the suspected
6211 // load node must have exactly one user.
6212 // Constants may have multiple users.
6214 // AVX-512 has a register version of the broadcast.
6215 bool hasRegVer = Subtarget->hasAVX512() && VT.is512BitVector() &&
6216 Ld.getValueType().getSizeInBits() >= 32;
6217 if (!ConstSplatVal && ((!Sc.hasOneUse() || !Ld.hasOneUse()) &&
6224 unsigned ScalarSize = Ld.getValueType().getSizeInBits();
6225 bool IsGE256 = (VT.getSizeInBits() >= 256);
6227 // When optimizing for size, generate up to 5 extra bytes for a broadcast
6228 // instruction to save 8 or more bytes of constant pool data.
6229 // TODO: If multiple splats are generated to load the same constant,
6230 // it may be detrimental to overall size. There needs to be a way to detect
6231 // that condition to know if this is truly a size win.
6232 const Function *F = DAG.getMachineFunction().getFunction();
6233 bool OptForSize = F->hasFnAttribute(Attribute::OptimizeForSize);
6235 // Handle broadcasting a single constant scalar from the constant pool
6237 // On Sandybridge (no AVX2), it is still better to load a constant vector
6238 // from the constant pool and not to broadcast it from a scalar.
6239 // But override that restriction when optimizing for size.
6240 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
6241 if (ConstSplatVal && (Subtarget->hasAVX2() || OptForSize)) {
6242 EVT CVT = Ld.getValueType();
6243 assert(!CVT.isVector() && "Must not broadcast a vector type");
6245 // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
6246 // For size optimization, also splat v2f64 and v2i64, and for size opt
6247 // with AVX2, also splat i8 and i16.
6248 // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
6249 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
6250 (OptForSize && (ScalarSize == 64 || Subtarget->hasAVX2()))) {
6251 const Constant *C = nullptr;
6252 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
6253 C = CI->getConstantIntValue();
6254 else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
6255 C = CF->getConstantFPValue();
6257 assert(C && "Invalid constant type");
6259 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6260 SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
6261 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
6262 Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP,
6263 MachinePointerInfo::getConstantPool(),
6264 false, false, false, Alignment);
6266 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6270 bool IsLoad = ISD::isNormalLoad(Ld.getNode());
6272 // Handle AVX2 in-register broadcasts.
6273 if (!IsLoad && Subtarget->hasInt256() &&
6274 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
6275 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6277 // The scalar source must be a normal load.
6281 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
6282 (Subtarget->hasVLX() && ScalarSize == 64))
6283 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6285 // The integer check is needed for the 64-bit into 128-bit case, so it doesn't
6286 // match double, since there is no vbroadcastsd xmm.
6287 if (Subtarget->hasInt256() && Ld.getValueType().isInteger()) {
6288 if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
6289 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6292 // Unsupported broadcast.
6296 /// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real
6297 /// underlying vector and index.
6299 /// Modifies \p ExtractedFromVec to the real vector and returns the real
6301 static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
6303 int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
6304 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
6307 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
6309 // (extract_vector_elt (v8f32 %vreg1), Constant<6>)
6311 // (extract_vector_elt (vector_shuffle<2,u,u,u>
6312 // (extract_subvector (v8f32 %vreg0), Constant<4>),
6315 // In this case the vector is the extract_subvector expression and the index
6316 // is 2, as specified by the shuffle.
6317 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
6318 SDValue ShuffleVec = SVOp->getOperand(0);
6319 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
6320 assert(ShuffleVecVT.getVectorElementType() ==
6321 ExtractedFromVec.getSimpleValueType().getVectorElementType());
6323 int ShuffleIdx = SVOp->getMaskElt(Idx);
6324 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
6325 ExtractedFromVec = ShuffleVec;
6331 static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
6332 MVT VT = Op.getSimpleValueType();
6334 // Skip if insert_vec_elt is not supported.
6335 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6336 if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
6340 unsigned NumElems = Op.getNumOperands();
6344 SmallVector<unsigned, 4> InsertIndices;
6345 SmallVector<int, 8> Mask(NumElems, -1);
6347 for (unsigned i = 0; i != NumElems; ++i) {
6348 unsigned Opc = Op.getOperand(i).getOpcode();
6350 if (Opc == ISD::UNDEF)
6353 if (Opc != ISD::EXTRACT_VECTOR_ELT) {
6354 // Quit if more than one element needs inserting.
6355 if (InsertIndices.size() > 1)
6358 InsertIndices.push_back(i);
6362 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
6363 SDValue ExtIdx = Op.getOperand(i).getOperand(1);
6364 // Quit if non-constant index.
6365 if (!isa<ConstantSDNode>(ExtIdx))
6367 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
6369 // Quit if extracted from a vector of a different type.
    if (ExtractedFromVec.getValueType() != VT)
      return SDValue();
6373 if (!VecIn1.getNode())
6374 VecIn1 = ExtractedFromVec;
6375 else if (VecIn1 != ExtractedFromVec) {
6376 if (!VecIn2.getNode())
6377 VecIn2 = ExtractedFromVec;
      else if (VecIn2 != ExtractedFromVec)
        // Quit if more than 2 vectors to shuffle
        return SDValue();
    }

    if (ExtractedFromVec == VecIn1)
      Mask[i] = Idx;
    else if (ExtractedFromVec == VecIn2)
6386 Mask[i] = Idx + NumElems;
  if (!VecIn1.getNode())
    return SDValue();
6392 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
6393 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, &Mask[0]);
6394 for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) {
6395 unsigned Idx = InsertIndices[i];
6396 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
                     DAG.getIntPtrConstant(Idx));
  }

  return NV;
}
6403 // Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
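// For example (illustrative): the all-constant v8i1
// <1, 0, 1, 1, 0, 0, 0, 0> collects into the bit-immediate 0b00001101 (bit
// idx is set iff operand idx is a non-zero constant), which can then be
// materialized as a scalar integer and bitcast back to a mask vector.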
SDValue
X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
  SDLoc dl(Op);
6407 MVT VT = Op.getSimpleValueType();
6408 assert((VT.getVectorElementType() == MVT::i1) && (VT.getSizeInBits() <= 16) &&
6409 "Unexpected type in LowerBUILD_VECTORvXi1!");
6412 if (ISD::isBuildVectorAllZeros(Op.getNode())) {
6413 SDValue Cst = DAG.getTargetConstant(0, MVT::i1);
6414 SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
6415 return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
6418 if (ISD::isBuildVectorAllOnes(Op.getNode())) {
6419 SDValue Cst = DAG.getTargetConstant(1, MVT::i1);
6420 SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
6421 return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
  bool AllConstants = true;
6425 uint64_t Immediate = 0;
6426 int NonConstIdx = -1;
6427 bool IsSplat = true;
6428 unsigned NumNonConsts = 0;
6429 unsigned NumConsts = 0;
6430 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
6431 SDValue In = Op.getOperand(idx);
6432 if (In.getOpcode() == ISD::UNDEF)
    if (!isa<ConstantSDNode>(In)) {
      AllConstants = false;
      NumNonConsts++;
      NonConstIdx = idx;
    } else {
      NumConsts++;
      if (cast<ConstantSDNode>(In)->getZExtValue())
        Immediate |= (1ULL << idx);
    }
    if (In != Op.getOperand(0))
      IsSplat = false;
  }

  if (AllConstants) {
6448 SDValue FullMask = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1,
6449 DAG.getConstant(Immediate, MVT::i16));
6450 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, FullMask,
6451 DAG.getIntPtrConstant(0));
  }

  if (NumNonConsts == 1 && NonConstIdx != 0) {
    SDValue DstVec;
    if (NumConsts) {
6457 SDValue VecAsImm = DAG.getConstant(Immediate,
6458 MVT::getIntegerVT(VT.getSizeInBits()));
      DstVec = DAG.getNode(ISD::BITCAST, dl, VT, VecAsImm);
    } else
      DstVec = DAG.getUNDEF(VT);
6463 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
6464 Op.getOperand(NonConstIdx),
6465 DAG.getIntPtrConstant(NonConstIdx));
6467 if (!IsSplat && (NonConstIdx != 0))
6468 llvm_unreachable("Unsupported BUILD_VECTOR operation");
  MVT SelectVT = (VT == MVT::v16i1) ? MVT::i16 : MVT::i8;
  SDValue Select;
  if (IsSplat)
6472 Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0),
6473 DAG.getConstant(-1, SelectVT),
6474 DAG.getConstant(0, SelectVT));
  else
    Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0),
6477 DAG.getConstant((Immediate | 1), SelectVT),
6478 DAG.getConstant(Immediate, SelectVT));
  return DAG.getNode(ISD::BITCAST, dl, VT, Select);
}
/// \brief Return true if \p N implements a horizontal binop, placing the
/// operands of that binop in V0 and V1.
6485 /// This is a helper function of PerformBUILD_VECTORCombine.
6486 /// This function checks that the build_vector \p N in input implements a
6487 /// horizontal operation. Parameter \p Opcode defines the kind of horizontal
6488 /// operation to match.
6489 /// For example, if \p Opcode is equal to ISD::ADD, then this function
6490 /// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
/// is equal to ISD::SUB, then this function checks if this is a horizontal
/// arithmetic subtraction.
6494 /// This function only analyzes elements of \p N whose indices are
6495 /// in range [BaseIdx, LastIdx).
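///
/// For example (illustrative), with \p Opcode equal to ISD::FADD, the v4f32
/// build_vector
///   (fadd (extractelt A, 0), (extractelt A, 1)),
///   (fadd (extractelt A, 2), (extractelt A, 3)),
///   (fadd (extractelt B, 0), (extractelt B, 1)),
///   (fadd (extractelt B, 2), (extractelt B, 3))
/// matches over [0, 4) with V0 = A and V1 = B, which is exactly the
/// dataflow of HADDPS.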
6496 static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
6498 unsigned BaseIdx, unsigned LastIdx,
6499 SDValue &V0, SDValue &V1) {
6500 EVT VT = N->getValueType(0);
6502 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
6503 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
6504 "Invalid Vector in input!");
6506 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
6507 bool CanFold = true;
6508 unsigned ExpectedVExtractIdx = BaseIdx;
6509 unsigned NumElts = LastIdx - BaseIdx;
6510 V0 = DAG.getUNDEF(VT);
6511 V1 = DAG.getUNDEF(VT);
6513 // Check if N implements a horizontal binop.
6514 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
6515 SDValue Op = N->getOperand(i + BaseIdx);
6518 if (Op->getOpcode() == ISD::UNDEF) {
6519 // Update the expected vector extract index.
6520 if (i * 2 == NumElts)
6521 ExpectedVExtractIdx = BaseIdx;
      ExpectedVExtractIdx += 2;
      continue;
    }

    CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();

    if (!CanFold)
      break;
6531 SDValue Op0 = Op.getOperand(0);
6532 SDValue Op1 = Op.getOperand(1);
6534 // Try to match the following pattern:
6535 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
6536 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
6537 Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
6538 Op0.getOperand(0) == Op1.getOperand(0) &&
6539 isa<ConstantSDNode>(Op0.getOperand(1)) &&
               isa<ConstantSDNode>(Op1.getOperand(1)));
    if (!CanFold)
      break;
6544 unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
6545 unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
6547 if (i * 2 < NumElts) {
6548 if (V0.getOpcode() == ISD::UNDEF)
        V0 = Op0.getOperand(0);
    } else {
      if (V1.getOpcode() == ISD::UNDEF)
        V1 = Op0.getOperand(0);
      if (i * 2 == NumElts)
        ExpectedVExtractIdx = BaseIdx;
    }
6557 SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
6558 if (I0 == ExpectedVExtractIdx)
6559 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
6560 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
6561 // Try to match the following dag sequence:
6562 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
      CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
    } else
      CanFold = false;

    ExpectedVExtractIdx += 2;
  }

  return CanFold;
}
6573 /// \brief Emit a sequence of two 128-bit horizontal add/sub followed by
6574 /// a concat_vector.
6576 /// This is a helper function of PerformBUILD_VECTORCombine.
6577 /// This function expects two 256-bit vectors called V0 and V1.
6578 /// At first, each vector is split into two separate 128-bit vectors.
6579 /// Then, the resulting 128-bit vectors are used to implement two
6580 /// horizontal binary operations.
6582 /// The kind of horizontal binary operation is defined by \p X86Opcode.
6584 /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
/// the two new horizontal binops.
/// When Mode is set, the first horizontal binop dag node takes as input the
/// lower 128-bit half of V0 and the upper 128-bit half of V0. The second
/// horizontal binop dag node takes as input the lower 128-bit half of V1
/// and the upper 128-bit half of V1:
6591 /// HADD V0_LO, V0_HI
6592 /// HADD V1_LO, V1_HI
6594 /// Otherwise, the first horizontal binop dag node takes as input the lower
6595 /// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
/// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1:
6598 /// HADD V0_LO, V1_LO
6599 /// HADD V0_HI, V1_HI
6601 /// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
6602 /// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
6603 /// the upper 128-bits of the result.
6604 static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
6605 SDLoc DL, SelectionDAG &DAG,
6606 unsigned X86Opcode, bool Mode,
6607 bool isUndefLO, bool isUndefHI) {
6608 EVT VT = V0.getValueType();
6609 assert(VT.is256BitVector() && VT == V1.getValueType() &&
6610 "Invalid nodes in input!");
6612 unsigned NumElts = VT.getVectorNumElements();
6613 SDValue V0_LO = Extract128BitVector(V0, 0, DAG, DL);
6614 SDValue V0_HI = Extract128BitVector(V0, NumElts/2, DAG, DL);
6615 SDValue V1_LO = Extract128BitVector(V1, 0, DAG, DL);
6616 SDValue V1_HI = Extract128BitVector(V1, NumElts/2, DAG, DL);
6617 EVT NewVT = V0_LO.getValueType();
6619 SDValue LO = DAG.getUNDEF(NewVT);
6620 SDValue HI = DAG.getUNDEF(NewVT);
  if (Mode) {
    // Don't emit a horizontal binop if the result is expected to be UNDEF.
    if (!isUndefLO && V0->getOpcode() != ISD::UNDEF)
      LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
    if (!isUndefHI && V1->getOpcode() != ISD::UNDEF)
      HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
  } else {
    // Don't emit a horizontal binop if the result is expected to be UNDEF.
    if (!isUndefLO && (V0_LO->getOpcode() != ISD::UNDEF ||
                       V1_LO->getOpcode() != ISD::UNDEF))
      LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);

    if (!isUndefHI && (V0_HI->getOpcode() != ISD::UNDEF ||
                       V1_HI->getOpcode() != ISD::UNDEF))
      HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
  }
  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
}
6642 /// \brief Try to fold a build_vector that performs an 'addsub' into the
6643 /// sequence of 'vadd + vsub + blendi'.
6644 static SDValue matchAddSub(const BuildVectorSDNode *BV, SelectionDAG &DAG,
                           const X86Subtarget *Subtarget) {
  SDLoc DL(BV);
6647 EVT VT = BV->getValueType(0);
6648 unsigned NumElts = VT.getVectorNumElements();
6649 SDValue InVec0 = DAG.getUNDEF(VT);
6650 SDValue InVec1 = DAG.getUNDEF(VT);
6652 assert((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
6653 VT == MVT::v2f64) && "build_vector with an invalid type found!");
6655 // Odd-numbered elements in the input build vector are obtained from
6656 // adding two integer/float elements.
6657 // Even-numbered elements in the input build vector are obtained from
6658 // subtracting two integer/float elements.
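  // For example (illustrative), for v4f32 the sequence
  //   (fsub (extractelt A, 0), (extractelt B, 0)),
  //   (fadd (extractelt A, 1), (extractelt B, 1)),
  //   (fsub (extractelt A, 2), (extractelt B, 2)),
  //   (fadd (extractelt A, 3), (extractelt B, 3))
  // is the dataflow of a single ADDSUBPS on inputs A and B.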
6659 unsigned ExpectedOpcode = ISD::FSUB;
6660 unsigned NextExpectedOpcode = ISD::FADD;
6661 bool AddFound = false;
6662 bool SubFound = false;
6664 for (unsigned i = 0, e = NumElts; i != e; ++i) {
6665 SDValue Op = BV->getOperand(i);
6667 // Skip 'undef' values.
6668 unsigned Opcode = Op.getOpcode();
6669 if (Opcode == ISD::UNDEF) {
6670 std::swap(ExpectedOpcode, NextExpectedOpcode);
6674 // Early exit if we found an unexpected opcode.
    if (Opcode != ExpectedOpcode)
      return SDValue();
6678 SDValue Op0 = Op.getOperand(0);
6679 SDValue Op1 = Op.getOperand(1);
6681 // Try to match the following pattern:
6682 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
6683 // Early exit if we cannot match that sequence.
6684 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6685 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6686 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
6687 !isa<ConstantSDNode>(Op1.getOperand(1)) ||
        Op0.getOperand(1) != Op1.getOperand(1))
      return SDValue();

    unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
    if (I0 != i)
      return SDValue();

    // We found a valid add/sub node. Update the information accordingly.
    if (i & 1)
      AddFound = true;
    else
      SubFound = true;
6701 // Update InVec0 and InVec1.
6702 if (InVec0.getOpcode() == ISD::UNDEF)
6703 InVec0 = Op0.getOperand(0);
6704 if (InVec1.getOpcode() == ISD::UNDEF)
6705 InVec1 = Op1.getOperand(0);
    // Make sure that the operands of each add/sub node always come from the
    // same pair of vectors.
6709 if (InVec0 != Op0.getOperand(0)) {
      if (ExpectedOpcode == ISD::FSUB)
        return SDValue();
6713 // FADD is commutable. Try to commute the operands
6714 // and then test again.
6715 std::swap(Op0, Op1);
      if (InVec0 != Op0.getOperand(0))
        return SDValue();
    }

    if (InVec1 != Op1.getOperand(0))
      return SDValue();
6723 // Update the pair of expected opcodes.
6724 std::swap(ExpectedOpcode, NextExpectedOpcode);
6727 // Don't try to fold this build_vector into an ADDSUB if the inputs are undef.
6728 if (AddFound && SubFound && InVec0.getOpcode() != ISD::UNDEF &&
6729 InVec1.getOpcode() != ISD::UNDEF)
    return DAG.getNode(X86ISD::ADDSUB, DL, VT, InVec0, InVec1);

  return SDValue();
}
6735 static SDValue PerformBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG,
                                          const X86Subtarget *Subtarget) {
  SDLoc DL(N);
6738 EVT VT = N->getValueType(0);
6739 unsigned NumElts = VT.getVectorNumElements();
6740 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(N);
6741 SDValue InVec0, InVec1;
6743 // Try to match an ADDSUB.
6744 if ((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
6745 (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
6746 SDValue Value = matchAddSub(BV, DAG, Subtarget);
    if (Value.getNode())
      return Value;
  }
6751 // Try to match horizontal ADD/SUB.
6752 unsigned NumUndefsLO = 0;
6753 unsigned NumUndefsHI = 0;
6754 unsigned Half = NumElts/2;
6756 // Count the number of UNDEF operands in the build_vector in input.
6757 for (unsigned i = 0, e = Half; i != e; ++i)
    if (BV->getOperand(i)->getOpcode() == ISD::UNDEF)
      NumUndefsLO++;
6761 for (unsigned i = Half, e = NumElts; i != e; ++i)
    if (BV->getOperand(i)->getOpcode() == ISD::UNDEF)
      NumUndefsHI++;
6765 // Early exit if this is either a build_vector of all UNDEFs or all the
6766 // operands but one are UNDEF.
  if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
    return SDValue();
6770 if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget->hasSSE3()) {
6771 // Try to match an SSE3 float HADD/HSUB.
6772 if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
6773 return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
6775 if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
6776 return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
6777 } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget->hasSSSE3()) {
6778 // Try to match an SSSE3 integer HADD/HSUB.
6779 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
6780 return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);
6782 if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
      return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
  }

  if (!Subtarget->hasAVX())
    return SDValue();
6789 if ((VT == MVT::v8f32 || VT == MVT::v4f64)) {
6790 // Try to match an AVX horizontal add/sub of packed single/double
6791 // precision floating point values from 256-bit vectors.
6792 SDValue InVec2, InVec3;
6793 if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
6794 isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
6795 ((InVec0.getOpcode() == ISD::UNDEF ||
6796 InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
6797 ((InVec1.getOpcode() == ISD::UNDEF ||
6798 InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
6799 return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
6801 if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
6802 isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
6803 ((InVec0.getOpcode() == ISD::UNDEF ||
6804 InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
6805 ((InVec1.getOpcode() == ISD::UNDEF ||
6806 InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
6807 return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
6808 } else if (VT == MVT::v8i32 || VT == MVT::v16i16) {
6809 // Try to match an AVX2 horizontal add/sub of signed integers.
    SDValue InVec2, InVec3;
    unsigned X86Opcode;
6812 bool CanFold = true;
6814 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
6815 isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
6816 ((InVec0.getOpcode() == ISD::UNDEF ||
6817 InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
6818 ((InVec1.getOpcode() == ISD::UNDEF ||
6819 InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
6820 X86Opcode = X86ISD::HADD;
6821 else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
6822 isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) &&
6823 ((InVec0.getOpcode() == ISD::UNDEF ||
6824 InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
6825 ((InVec1.getOpcode() == ISD::UNDEF ||
6826 InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
      X86Opcode = X86ISD::HSUB;
    else
      CanFold = false;

    if (CanFold) {
6832 // Fold this build_vector into a single horizontal add/sub.
6833 // Do this only if the target has AVX2.
6834 if (Subtarget->hasAVX2())
6835 return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);
6837 // Do not try to expand this build_vector into a pair of horizontal
6838 // add/sub if we can emit a pair of scalar add/sub.
6839 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
      // Convert this build_vector into a pair of horizontal binops followed
      // by a concat vector.
6844 bool isUndefLO = NumUndefsLO == Half;
6845 bool isUndefHI = NumUndefsHI == Half;
6846 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
                                     isUndefLO, isUndefHI);
    }
  }
6851 if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
       VT == MVT::v16i16) && Subtarget->hasAVX()) {
    unsigned X86Opcode;
6854 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
6855 X86Opcode = X86ISD::HADD;
6856 else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
6857 X86Opcode = X86ISD::HSUB;
6858 else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
6859 X86Opcode = X86ISD::FHADD;
6860 else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
      X86Opcode = X86ISD::FHSUB;
    else
      return SDValue();
6865 // Don't try to expand this build_vector into a pair of horizontal add/sub
6866 // if we can simply emit a pair of scalar add/sub.
6867 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
    // Convert this build_vector into two horizontal add/sub followed by
    // a concat vector.
6872 bool isUndefLO = NumUndefsLO == Half;
6873 bool isUndefHI = NumUndefsHI == Half;
6874 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
                                   isUndefLO, isUndefHI);
  }

  return SDValue();
}
SDValue
X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
  SDLoc dl(Op);
6885 MVT VT = Op.getSimpleValueType();
6886 MVT ExtVT = VT.getVectorElementType();
6887 unsigned NumElems = Op.getNumOperands();
6889 // Generate vectors for predicate vectors.
6890 if (VT.getScalarType() == MVT::i1 && Subtarget->hasAVX512())
6891 return LowerBUILD_VECTORvXi1(Op, DAG);
6893 // Vectors containing all zeros can be matched by pxor and xorps later
6894 if (ISD::isBuildVectorAllZeros(Op.getNode())) {
6895 // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
6896 // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
    if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
      return Op;
6900 return getZeroVector(VT, Subtarget, DAG, dl);
6903 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
6904 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
6905 // vpcmpeqd on 256-bit vectors.
6906 if (Subtarget->hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
    if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasInt256()))
      return Op;
6910 if (!VT.is512BitVector())
6911 return getOnesVector(VT, Subtarget->hasInt256(), DAG, dl);
6914 SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG);
  if (Broadcast.getNode())
    return Broadcast;
6918 unsigned EVTBits = ExtVT.getSizeInBits();
6920 unsigned NumZero = 0;
6921 unsigned NumNonZero = 0;
6922 unsigned NonZeros = 0;
6923 bool IsAllConstants = true;
6924 SmallSet<SDValue, 8> Values;
6925 for (unsigned i = 0; i < NumElems; ++i) {
6926 SDValue Elt = Op.getOperand(i);
    if (Elt.getOpcode() == ISD::UNDEF)
      continue;
    Values.insert(Elt);
6930 if (Elt.getOpcode() != ISD::Constant &&
6931 Elt.getOpcode() != ISD::ConstantFP)
6932 IsAllConstants = false;
    if (X86::isZeroNode(Elt))
      NumZero++;
    else {
      NonZeros |= (1 << i);
      NumNonZero++;
    }
  }
6941 // All undef vector. Return an UNDEF. All zero vectors were handled above.
6942 if (NumNonZero == 0)
6943 return DAG.getUNDEF(VT);
6945 // Special case for single non-zero, non-undef, element.
6946 if (NumNonZero == 1) {
6947 unsigned Idx = countTrailingZeros(NonZeros);
6948 SDValue Item = Op.getOperand(Idx);
6950 // If this is an insertion of an i64 value on x86-32, and if the top bits of
6951 // the value are obviously zero, truncate the value to i32 and do the
6952 // insertion that way. Only do this if the value is non-constant or if the
6953 // value is a constant being inserted into element 0. It is cheaper to do
6954 // a constant pool load than it is to do a movd + shuffle.
6955 if (ExtVT == MVT::i64 && !Subtarget->is64Bit() &&
6956 (!IsAllConstants || Idx == 0)) {
6957 if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
6959 assert(VT == MVT::v2i64 && "Expected an SSE value type!");
6960 EVT VecVT = MVT::v4i32;
6961 unsigned VecElts = 4;
6963 // Truncate the value (which may itself be a constant) to i32, and
6964 // convert it to a vector with movd (S2V+shuffle to zero extend).
6965 Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
6966 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
6968 // If using the new shuffle lowering, just directly insert this.
6969 if (ExperimentalVectorShuffleLowering)
          return DAG.getNode(
              ISD::BITCAST, dl, VT,
6972 getShuffleVectorZeroOrUndef(Item, Idx * 2, true, Subtarget, DAG));
6974 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
6976 // Now we have our 32-bit value zero extended in the low element of
6977 // a vector. If Idx != 0, swizzle it into place.
        if (Idx != 0) {
          SmallVector<int, 4> Mask;
          Mask.push_back(Idx);
          for (unsigned i = 1; i != VecElts; ++i)
            Mask.push_back(i);
          Item = DAG.getVectorShuffle(VecVT, dl, Item, DAG.getUNDEF(VecVT),
                                      &Mask[0]);
        }
6986 return DAG.getNode(ISD::BITCAST, dl, VT, Item);
6990 // If we have a constant or non-constant insertion into the low element of
6991 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
6992 // the rest of the elements. This will be matched as movd/movq/movss/movsd
6993 // depending on what the source datatype is.
    if (Idx == 0) {
      if (NumZero == 0)
        return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
6998 if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
6999 (ExtVT == MVT::i64 && Subtarget->is64Bit())) {
7000 if (VT.is256BitVector() || VT.is512BitVector()) {
7001 SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl);
7002 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec,
                             Item, DAG.getIntPtrConstant(0));
        }
7005 assert(VT.is128BitVector() && "Expected an SSE value type!");
7006 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
7007 // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
7008 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
7011 if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
7012 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
7013 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
7014 if (VT.is256BitVector()) {
7015 SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl);
          Item = Insert128BitVector(ZeroVec, Item, 0, DAG, dl);
        } else {
7018 assert(VT.is128BitVector() && "Expected an SSE value type!");
7019 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
        return DAG.getNode(ISD::BITCAST, dl, VT, Item);
      }
    }
7025 // Is it a vector logical left shift?
7026 if (NumElems == 2 && Idx == 1 &&
7027 X86::isZeroNode(Op.getOperand(0)) &&
7028 !X86::isZeroNode(Op.getOperand(1))) {
7029 unsigned NumBits = VT.getSizeInBits();
7030 return getVShift(true, VT,
7031 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
7032 VT, Op.getOperand(1)),
7033 NumBits/2, DAG, *this, dl);
    if (IsAllConstants) // Otherwise, it's better to do a constpool load.
      return SDValue();
7039 // Otherwise, if this is a vector with i32 or f32 elements, and the element
7040 // is a non-constant being inserted into an element other than the low one,
7041 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
    // movd/movss) to move this into the low element, then shuffle it into
    // place.
7044 if (EVTBits == 32) {
7045 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
7047 // If using the new shuffle lowering, just directly insert this.
7048 if (ExperimentalVectorShuffleLowering)
7049 return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
7051 // Turn it into a shuffle of zero and zero-extended scalar to vector.
7052 Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, Subtarget, DAG);
7053 SmallVector<int, 8> MaskVec;
7054 for (unsigned i = 0; i != NumElems; ++i)
7055 MaskVec.push_back(i == Idx ? 0 : 1);
      return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]);
    }
  }
7060 // Splat is obviously ok. Let legalizer expand it to a shuffle.
7061 if (Values.size() == 1) {
7062 if (EVTBits == 32) {
7063 // Instead of a shuffle like this:
7064 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
7065 // Check if it's possible to issue this instead.
7066 // shuffle (vload ptr)), undef, <1, 1, 1, 1>
7067 unsigned Idx = countTrailingZeros(NonZeros);
7068 SDValue Item = Op.getOperand(Idx);
7069 if (Op.getNode()->isOnlyUserOf(Item.getNode()))
        return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
    }
    return SDValue();
  }
7075 // A vector full of immediates; various special cases are already
  // handled, so this is best done with a single constant-pool load.
  if (IsAllConstants)
    return SDValue();
7080 // For AVX-length vectors, see if we can use a vector load to get all of the
7081 // elements, otherwise build the individual 128-bit pieces and use
7082 // shuffles to put them in place.
7083 if (VT.is256BitVector() || VT.is512BitVector()) {
7084 SmallVector<SDValue, 64> V(Op->op_begin(), Op->op_begin() + NumElems);
7086 // Check for a build vector of consecutive loads.
    if (SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false))
      return LD;
7090 EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
7092 // Build both the lower and upper subvector.
7093 SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
7094 makeArrayRef(&V[0], NumElems/2));
7095 SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
7096 makeArrayRef(&V[NumElems / 2], NumElems/2));
7098 // Recreate the wider vector with the lower and upper part.
7099 if (VT.is256BitVector())
7100 return Concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
7101 return Concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
7104 // Let legalizer expand 2-wide build_vectors.
7105 if (EVTBits == 64) {
7106 if (NumNonZero == 1) {
7107 // One half is zero or undef.
7108 unsigned Idx = countTrailingZeros(NonZeros);
7109 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
7110 Op.getOperand(Idx));
      return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
    }
    return SDValue();
  }
7116 // If element VT is < 32 bits, convert it to inserts into a zero vector.
7117 if (EVTBits == 8 && NumElems == 16) {
    SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero, DAG,
                                      Subtarget, *this);
7120 if (V.getNode()) return V;
7123 if (EVTBits == 16 && NumElems == 8) {
    SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero, DAG,
                                      Subtarget, *this);
7126 if (V.getNode()) return V;
7129 // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
7130 if (EVTBits == 32 && NumElems == 4) {
    SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget, *this);
    if (V.getNode())
      return V;
  }
7136 // If element VT is == 32 bits, turn it into a number of shuffles.
7137 SmallVector<SDValue, 8> V(NumElems);
7138 if (NumElems == 4 && NumZero > 0) {
7139 for (unsigned i = 0; i < 4; ++i) {
      bool isZero = !(NonZeros & (1 << i));
      if (isZero)
        V[i] = getZeroVector(VT, Subtarget, DAG, dl);
      else
        V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
    }
7147 for (unsigned i = 0; i < 2; ++i) {
      switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
        default: break;
        case 0:
          V[i] = V[i*2];  // Must be a zero vector.
          break;
        case 1:
          V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]);
          break;
        case 2:
          V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]);
          break;
        case 3:
          V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]);
          break;
      }
    }
7165 bool Reverse1 = (NonZeros & 0x3) == 2;
7166 bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
    int MaskVec[] = {
      Reverse1 ? 1 : 0,
      Reverse1 ? 0 : 1,
      static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
      static_cast<int>(Reverse2 ? NumElems : NumElems+1)
    };
    return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]);
  }
7176 if (Values.size() > 1 && VT.is128BitVector()) {
7177 // Check for a build vector of consecutive loads.
7178 for (unsigned i = 0; i < NumElems; ++i)
7179 V[i] = Op.getOperand(i);
7181 // Check for elements which are consecutive loads.
    SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false);
    if (LD.getNode())
      return LD;
7186 // Check for a build vector from mostly shuffle plus few inserting.
  SDValue Sh = buildFromShuffleMostly(Op, DAG);
  if (Sh.getNode())
    return Sh;
7191 // For SSE 4.1, use insertps to put the high elements into the low element.
  if (Subtarget->hasSSE41()) {
    SDValue Result;
7194 if (Op.getOperand(0).getOpcode() != ISD::UNDEF)
7195 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
    else
      Result = DAG.getUNDEF(VT);
7199 for (unsigned i = 1; i < NumElems; ++i) {
7200 if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue;
7201 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
                           Op.getOperand(i), DAG.getIntPtrConstant(i));
    }
    return Result;
  }
7207 // Otherwise, expand into a number of unpckl*, start by extending each of
7208 // our (non-undef) elements to the full vector width with the element in the
7209 // bottom slot of the vector (which generates no code for SSE).
7210 for (unsigned i = 0; i < NumElems; ++i) {
7211 if (Op.getOperand(i).getOpcode() != ISD::UNDEF)
7212 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
    else
      V[i] = DAG.getUNDEF(VT);
  }
7217 // Next, we iteratively mix elements, e.g. for v4f32:
7218 // Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
7219 // : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
7220 // Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
7221 unsigned EltStride = NumElems >> 1;
7222 while (EltStride != 0) {
7223 for (unsigned i = 0; i < EltStride; ++i) {
7224 // If V[i+EltStride] is undef and this is the first round of mixing,
7225 // then it is safe to just drop this shuffle: V[i] is already in the
7226 // right place, the one element (since it's the first round) being
7227 // inserted as undef can be dropped. This isn't safe for successive
7228 // rounds because they will permute elements within both vectors.
7229 if (V[i+EltStride].getOpcode() == ISD::UNDEF &&
          EltStride == NumElems/2)
        continue;
      V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]);
    }
    EltStride >>= 1;
  }

  return V[0];
}
7242 // LowerAVXCONCAT_VECTORS - 256-bit AVX can use the vinsertf128 instruction
7243 // to create 256-bit vectors from two other 128-bit ones.
static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
  SDLoc dl(Op);
7246 MVT ResVT = Op.getSimpleValueType();
7248 assert((ResVT.is256BitVector() ||
7249 ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
7251 SDValue V1 = Op.getOperand(0);
7252 SDValue V2 = Op.getOperand(1);
7253 unsigned NumElems = ResVT.getVectorNumElements();
  if (ResVT.is256BitVector())
7255 return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
7257 if (Op.getNumOperands() == 4) {
7258 MVT HalfVT = MVT::getVectorVT(ResVT.getScalarType(),
7259 ResVT.getVectorNumElements()/2);
7260 SDValue V3 = Op.getOperand(2);
7261 SDValue V4 = Op.getOperand(3);
    return Concat256BitVectors(
        Concat128BitVectors(V1, V2, HalfVT, NumElems/2, DAG, dl),
        Concat128BitVectors(V3, V4, HalfVT, NumElems/2, DAG, dl),
        ResVT, NumElems, DAG, dl);
  }
7265 return Concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
7268 static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
7269 MVT LLVM_ATTRIBUTE_UNUSED VT = Op.getSimpleValueType();
7270 assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
7271 (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
7272 Op.getNumOperands() == 4)));
7274 // AVX can use the vinsertf128 instruction to create 256-bit vectors
7275 // from two other 128-bit ones.
7277 // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
7278 return LowerAVXCONCAT_VECTORS(Op, DAG);
7282 //===----------------------------------------------------------------------===//
7283 // Vector shuffle lowering
7285 // This is an experimental code path for lowering vector shuffles on x86. It is
7286 // designed to handle arbitrary vector shuffles and blends, gracefully
7287 // degrading performance as necessary. It works hard to recognize idiomatic
7288 // shuffles and lower them to optimal instruction patterns without leaving
// a framework that allows reasonably efficient handling of all vector shuffle
// patterns.
7291 //===----------------------------------------------------------------------===//
7293 /// \brief Tiny helper function to identify a no-op mask.
7295 /// This is a somewhat boring predicate function. It checks whether the mask
7296 /// array input, which is assumed to be a single-input shuffle mask of the kind
7297 /// used by the X86 shuffle instructions (not a fully general
7298 /// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
7299 /// in-place shuffle are 'no-op's.
7300 static bool isNoopShuffleMask(ArrayRef<int> Mask) {
7301 for (int i = 0, Size = Mask.size(); i < Size; ++i)
    if (Mask[i] != -1 && Mask[i] != i)
      return false;
  return true;
}
7307 /// \brief Helper function to classify a mask as a single-input mask.
7309 /// This isn't a generic single-input test because in the vector shuffle
7310 /// lowering we canonicalize single inputs to be the first input operand. This
7311 /// means we can more quickly test for a single input by only checking whether
7312 /// an input from the second operand exists. We also assume that the size of
7313 /// mask corresponds to the size of the input vectors which isn't true in the
7314 /// fully general case.
static bool isSingleInputShuffleMask(ArrayRef<int> Mask) {
  for (int M : Mask)
    if (M >= (int)Mask.size())
      return false;
  return true;
}
/// \brief Test whether there are elements crossing 128-bit lanes in this
/// shuffle.
7325 /// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
7326 /// and we routinely test for these.
7327 static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
7328 int LaneSize = 128 / VT.getScalarSizeInBits();
7329 int Size = Mask.size();
7330 for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
      return true;
  return false;
}
7336 /// \brief Test whether a shuffle mask is equivalent within each 128-bit lane.
7338 /// This checks a shuffle mask to see if it is performing the same
7339 /// 128-bit lane-relative shuffle in each 128-bit lane. This trivially implies
7340 /// that it is also not lane-crossing. It may however involve a blend from the
7341 /// same lane of a second vector.
7343 /// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
7344 /// non-trivial to compute in the face of undef lanes. The representation is
7345 /// *not* suitable for use with existing 128-bit shuffles as it will contain
7346 /// entries from both V1 and V2 inputs to the wider mask.
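///
/// For example (illustrative), the v8f32 mask <0, 9, 2, 11, 4, 13, 6, 15>
/// repeats per 128-bit lane and produces RepeatedMask = <0, 9, 2, 11>:
/// entries below 8 select from V1's lane while entries 8 and above select
/// the same slot of V2's corresponding lane.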
static bool
is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
7349 SmallVectorImpl<int> &RepeatedMask) {
7350 int LaneSize = 128 / VT.getScalarSizeInBits();
7351 RepeatedMask.resize(LaneSize, -1);
7352 int Size = Mask.size();
7353 for (int i = 0; i < Size; ++i) {
    if (Mask[i] < 0)
      continue;
    if ((Mask[i] % Size) / LaneSize != i / LaneSize)
      // This entry crosses lanes, so there is no way to model this shuffle.
      return false;
7360 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
7361 if (RepeatedMask[i % LaneSize] == -1)
7362 // This is the first non-undef entry in this slot of a 128-bit lane.
7363 RepeatedMask[i % LaneSize] =
7364 Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + Size;
    else if (RepeatedMask[i % LaneSize] + (i / LaneSize) * LaneSize != Mask[i])
      // Found a mismatch with the repeated mask.
      return false;
  }
  return true;
}
7372 /// \brief Base case helper for testing a single mask element.
7373 static bool isShuffleEquivalentImpl(SDValue V1, SDValue V2,
7374 BuildVectorSDNode *BV1,
                                    BuildVectorSDNode *BV2, ArrayRef<int> Mask,
                                    int i, int Arg) {
  int Size = Mask.size();
7378 if (Mask[i] != -1 && Mask[i] != Arg) {
7379 auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
7380 auto *ArgsBV = Arg < Size ? BV1 : BV2;
7381 if (!MaskBV || !ArgsBV ||
        MaskBV->getOperand(Mask[i] % Size) != ArgsBV->getOperand(Arg % Size))
      return false;
  }
  return true;
}
7388 /// \brief Recursive helper to peel off and test each mask element.
7389 template <typename... Ts>
7390 static bool isShuffleEquivalentImpl(SDValue V1, SDValue V2,
7391 BuildVectorSDNode *BV1,
7392 BuildVectorSDNode *BV2, ArrayRef<int> Mask,
7393 int i, int Arg, Ts... Args) {
  if (!isShuffleEquivalentImpl(V1, V2, BV1, BV2, Mask, i, Arg))
    return false;
7397 return isShuffleEquivalentImpl(V1, V2, BV1, BV2, Mask, i + 1, Args...);
7400 /// \brief Checks whether a shuffle mask is equivalent to an explicit list of
7403 /// This is a fast way to test a shuffle mask against a fixed pattern:
///   if (isShuffleEquivalent(V1, V2, Mask, 3, 2, 1, 0)) { ... }
7407 /// It returns true if the mask is exactly as wide as the argument list, and
7408 /// each element of the mask is either -1 (signifying undef) or the value given
7409 /// in the argument.
7410 template <typename... Ts>
static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
                                Ts... Args) {
  if (Mask.size() != sizeof...(Args))
    return false;
7416 // If the values are build vectors, we can look through them to find
7417 // equivalent inputs that make the shuffles equivalent.
7418 auto *BV1 = dyn_cast<BuildVectorSDNode>(V1);
7419 auto *BV2 = dyn_cast<BuildVectorSDNode>(V2);
7421 // Recursively peel off arguments and test them against the mask.
7422 return isShuffleEquivalentImpl(V1, V2, BV1, BV2, Mask, 0, Args...);
7425 /// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
7427 /// This helper function produces an 8-bit shuffle immediate corresponding to
7428 /// the ubiquitous shuffle encoding scheme used in x86 instructions for
/// shuffling 4 lanes. It can be used with most of the PSHUF instructions, for
/// example.
7432 /// NB: We rely heavily on "undef" masks preserving the input lane.
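///
/// For example (illustrative), the 4-lane reversal mask <3, 2, 1, 0>
/// encodes as
///   Imm = 3 | (2 << 2) | (1 << 4) | (0 << 6) = 0x1B,
/// the familiar immediate of `pshufd $0x1b`.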
7433 static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask,
7434 SelectionDAG &DAG) {
7435 assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
7436 assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
7437 assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
7438 assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
7439 assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
  unsigned Imm = 0;
  Imm |= (Mask[0] == -1 ? 0 : Mask[0]) << 0;
7443 Imm |= (Mask[1] == -1 ? 1 : Mask[1]) << 2;
7444 Imm |= (Mask[2] == -1 ? 2 : Mask[2]) << 4;
7445 Imm |= (Mask[3] == -1 ? 3 : Mask[3]) << 6;
7446 return DAG.getConstant(Imm, MVT::i8);
7449 /// \brief Try to emit a blend instruction for a shuffle.
7451 /// This doesn't do any checks for the availability of instructions for blending
7452 /// these values. It relies on the availability of the X86ISD::BLENDI pattern to
7453 /// be matched in the backend with the type given. What it does check for is
7454 /// that the shuffle mask is in fact a blend.
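///
/// For example (illustrative), the v8i16 mask <0, 9, 2, 11, 4, 13, 6, 15>
/// takes every odd element from V2, so the computed blend immediate is
/// 0b10101010 (0xAA).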
7455 static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
7456 SDValue V2, ArrayRef<int> Mask,
7457 const X86Subtarget *Subtarget,
7458 SelectionDAG &DAG) {
7460 unsigned BlendMask = 0;
7461 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7462 if (Mask[i] >= Size) {
7463 if (Mask[i] != i + Size)
7464 return SDValue(); // Shuffled V2 input!
7465 BlendMask |= 1u << i;
7468 if (Mask[i] >= 0 && Mask[i] != i)
7469 return SDValue(); // Shuffled V1 input!
  switch (VT.SimpleTy) {
  case MVT::v2f64:
  case MVT::v4f32:
  case MVT::v4f64:
  case MVT::v8f32:
    return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
7477 DAG.getConstant(BlendMask, MVT::i8));
  case MVT::v4i64:
  case MVT::v8i32:
    assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
    // FALLTHROUGH
  case MVT::v2i64:
  case MVT::v4i32:
7485 // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
7486 // that instruction.
7487 if (Subtarget->hasAVX2()) {
7488 // Scale the blend by the number of 32-bit dwords per element.
      int Scale = VT.getScalarSizeInBits() / 32;
      BlendMask = 0;
      for (int i = 0, Size = Mask.size(); i < Size; ++i)
7492 if (Mask[i] >= Size)
7493 for (int j = 0; j < Scale; ++j)
7494 BlendMask |= 1u << (i * Scale + j);
7496 MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
7497 V1 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V1);
7498 V2 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V2);
7499 return DAG.getNode(ISD::BITCAST, DL, VT,
7500 DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
7501 DAG.getConstant(BlendMask, MVT::i8)));
    }
    // FALLTHROUGH
  case MVT::v8i16: {
    // For integer shuffles we need to expand the mask and cast the inputs to
7506 // v8i16s prior to blending.
    int Scale = 8 / VT.getVectorNumElements();
    BlendMask = 0;
    for (int i = 0, Size = Mask.size(); i < Size; ++i)
7510 if (Mask[i] >= Size)
7511 for (int j = 0; j < Scale; ++j)
7512 BlendMask |= 1u << (i * Scale + j);
7514 V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1);
7515 V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V2);
7516 return DAG.getNode(ISD::BITCAST, DL, VT,
7517 DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
7518 DAG.getConstant(BlendMask, MVT::i8)));
  }
  case MVT::v16i16: {
    assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
7523 SmallVector<int, 8> RepeatedMask;
7524 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
7525 // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
7526 assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
      BlendMask = 0;
      for (int i = 0; i < 8; ++i)
7529 if (RepeatedMask[i] >= 16)
7530 BlendMask |= 1u << i;
7531 return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
                         DAG.getConstant(BlendMask, MVT::i8));
    }
  }
    // FALLTHROUGH
  case MVT::v16i8:
  case MVT::v32i8: {
    // Scale the blend by the number of bytes per element.
7539 int Scale = VT.getScalarSizeInBits() / 8;
    // This form of blend is always done on bytes. Compute the byte vector
    // type.
7543 MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
7545 // Compute the VSELECT mask. Note that VSELECT is really confusing in the
7546 // mix of LLVM's code generator and the x86 backend. We tell the code
7547 // generator that boolean values in the elements of an x86 vector register
7548 // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
7549 // mapping a select to operand #1, and 'false' mapping to operand #2. The
7550 // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
7551 // of the element (the remaining are ignored) and 0 in that high bit would
7552 // mean operand #1 while 1 in the high bit would mean operand #2. So while
7553 // the LLVM model for boolean values in vector elements gets the relevant
    // bit set, it is set backwards and overconstrained relative to x86's
    // actual model.
7556 SmallVector<SDValue, 32> VSELECTMask;
7557 for (int i = 0, Size = Mask.size(); i < Size; ++i)
7558 for (int j = 0; j < Scale; ++j)
7559 VSELECTMask.push_back(
7560 Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
7561 : DAG.getConstant(Mask[i] < Size ? -1 : 0, MVT::i8));
7563 V1 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V1);
7564 V2 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V2);
    return DAG.getNode(
        ISD::BITCAST, DL, VT,
7567 DAG.getNode(ISD::VSELECT, DL, BlendVT,
7568 DAG.getNode(ISD::BUILD_VECTOR, DL, BlendVT, VSELECTMask),
                       V1, V2));
  }

  default:
    llvm_unreachable("Not a supported integer vector type!");
  }
}
7577 /// \brief Try to lower as a blend of elements from two inputs followed by
7578 /// a single-input permutation.
7580 /// This matches the pattern where we can blend elements from two inputs and
7581 /// then reduce the shuffle to a single-input permutation.
static SDValue lowerVectorShuffleAsBlendAndPermute(SDLoc DL, MVT VT,
                                                   SDValue V1, SDValue V2,
                                                   ArrayRef<int> Mask,
                                                   SelectionDAG &DAG) {
7586 // We build up the blend mask while checking whether a blend is a viable way
7587 // to reduce the shuffle.
7588 SmallVector<int, 32> BlendMask(Mask.size(), -1);
7589 SmallVector<int, 32> PermuteMask(Mask.size(), -1);
  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
    if (Mask[i] < 0)
      continue;
7595 assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
7597 if (BlendMask[Mask[i] % Size] == -1)
7598 BlendMask[Mask[i] % Size] = Mask[i];
7599 else if (BlendMask[Mask[i] % Size] != Mask[i])
7600 return SDValue(); // Can't blend in the needed input!
    PermuteMask[i] = Mask[i] % Size;
  }
7605 SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
7606 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
/// \brief Generic routine to decompose a shuffle and blend into independent
/// blends and permutes.
7612 /// This matches the extremely common pattern for handling combined
7613 /// shuffle+blend operations on newer X86 ISAs where we have very fast blend
/// operations. It will try to pick the best arrangement of shuffles and
/// blends.
static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(SDLoc DL, MVT VT,
                                                          SDValue V1,
                                                          SDValue V2,
                                                          ArrayRef<int> Mask,
                                                          SelectionDAG &DAG) {
7621 // Shuffle the input elements into the desired positions in V1 and V2 and
7622 // blend them together.
7623 SmallVector<int, 32> V1Mask(Mask.size(), -1);
7624 SmallVector<int, 32> V2Mask(Mask.size(), -1);
7625 SmallVector<int, 32> BlendMask(Mask.size(), -1);
7626 for (int i = 0, Size = Mask.size(); i < Size; ++i)
7627 if (Mask[i] >= 0 && Mask[i] < Size) {
      V1Mask[i] = Mask[i];
      BlendMask[i] = i;
7630 } else if (Mask[i] >= Size) {
7631 V2Mask[i] = Mask[i] - Size;
      BlendMask[i] = i + Size;
    }
7635 // Try to lower with the simpler initial blend strategy unless one of the
7636 // input shuffles would be a no-op. We prefer to shuffle inputs as the
7637 // shuffle may be able to fold with a load or other benefit. However, when
7638 // we'll have to do 2x as many shuffles in order to achieve this, blending
7639 // first is a better strategy.
7640 if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask))
7641 if (SDValue BlendPerm =
            lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG))
      return BlendPerm;
7645 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
7646 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
7647 return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
7650 /// \brief Try to lower a vector shuffle as a byte rotation.
7652 /// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
7653 /// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
7654 /// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
/// try to generically lower a vector shuffle through such a pattern. It
7656 /// does not check for the profitability of lowering either as PALIGNR or
7657 /// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
7658 /// This matches shuffle vectors that look like:
7660 /// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
7662 /// Essentially it concatenates V1 and V2, shifts right by some number of
7663 /// elements, and takes the low elements as the result. Note that while this is
7664 /// specified as a *right shift* because x86 is little-endian, it is a *left
7665 /// rotate* of the vector lanes.
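///
/// For example (illustrative), the v8i16 mask above has a rotation of 3
/// elements; with 2 bytes per element this becomes a 6-byte rotate, i.e. a
/// palignr with immediate 6, or PSRLDQ $6 / PSLLDQ $10 + POR before SSSE3.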
static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1,
                                              SDValue V2, ArrayRef<int> Mask,
                                              const X86Subtarget *Subtarget,
7670 SelectionDAG &DAG) {
7671 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
7673 int NumElts = Mask.size();
7674 int NumLanes = VT.getSizeInBits() / 128;
7675 int NumLaneElts = NumElts / NumLanes;
7677 // We need to detect various ways of spelling a rotation:
7678 // [11, 12, 13, 14, 15, 0, 1, 2]
7679 // [-1, 12, 13, 14, -1, -1, 1, -1]
7680 // [-1, -1, -1, -1, -1, -1, 1, 2]
7681 // [ 3, 4, 5, 6, 7, 8, 9, 10]
7682 // [-1, 4, 5, 6, -1, -1, 9, -1]
7683 // [-1, 4, 5, 6, -1, -1, -1, -1]
  int Rotation = 0;
  SDValue Lo, Hi;
  for (int l = 0; l < NumElts; l += NumLaneElts) {
7687 for (int i = 0; i < NumLaneElts; ++i) {
7688 if (Mask[l + i] == -1)
7690 assert(Mask[l + i] >= 0 && "Only -1 is a valid negative mask element!");
7692 // Get the mod-Size index and lane correct it.
7693 int LaneIdx = (Mask[l + i] % NumElts) - l;
7694 // Make sure it was in this lane.
7695 if (LaneIdx < 0 || LaneIdx >= NumLaneElts)
7698 // Determine where a rotated vector would have started.
7699 int StartIdx = i - LaneIdx;
      // The identity rotation isn't interesting, stop.
      if (StartIdx == 0)
        return SDValue();
7704 // If we found the tail of a vector the rotation must be the missing
      // front. If we found the head of a vector, it must be how much of the
      // head is missing.
7707 int CandidateRotation = StartIdx < 0 ? -StartIdx : NumLaneElts - StartIdx;
      if (Rotation == 0)
        Rotation = CandidateRotation;
7711 else if (Rotation != CandidateRotation)
        // The rotations don't match, so we can't match this mask.
        return SDValue();
7715 // Compute which value this mask is pointing at.
7716 SDValue MaskV = Mask[l + i] < NumElts ? V1 : V2;
7718 // Compute which of the two target values this index should be assigned
7719 // to. This reflects whether the high elements are remaining or the low
7720 // elements are remaining.
7721 SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
7723 // Either set up this value if we've not encountered it before, or check
7724 // that it remains consistent.
      if (!TargetV)
        TargetV = MaskV;
      else if (TargetV != MaskV)
7728 // This may be a rotation, but it pulls from the inputs in some
        // unsupported interleaving.
        return SDValue();
    }
  }
7734 // Check that we successfully analyzed the mask, and normalize the results.
7735 assert(Rotation != 0 && "Failed to locate a viable rotation!");
  assert((Lo || Hi) && "Failed to find a rotated input vector!");
  if (!Lo)
    Lo = Hi;
  else if (!Hi)
    Hi = Lo;
7742 // The actual rotate instruction rotates bytes, so we need to scale the
7743 // rotation based on how many bytes are in the vector lane.
7744 int Scale = 16 / NumLaneElts;
7746 // SSSE3 targets can use the palignr instruction.
7747 if (Subtarget->hasSSSE3()) {
7748 // Cast the inputs to i8 vector of correct length to match PALIGNR.
7749 MVT AlignVT = MVT::getVectorVT(MVT::i8, 16 * NumLanes);
7750 Lo = DAG.getNode(ISD::BITCAST, DL, AlignVT, Lo);
7751 Hi = DAG.getNode(ISD::BITCAST, DL, AlignVT, Hi);
7753 return DAG.getNode(ISD::BITCAST, DL, VT,
7754 DAG.getNode(X86ISD::PALIGNR, DL, AlignVT, Hi, Lo,
7755 DAG.getConstant(Rotation * Scale, MVT::i8)));
7758 assert(VT.getSizeInBits() == 128 &&
7759 "Rotate-based lowering only supports 128-bit lowering!");
7760 assert(Mask.size() <= 16 &&
7761 "Can shuffle at most 16 bytes in a 128-bit vector!");
7763 // Default SSE2 implementation
7764 int LoByteShift = 16 - Rotation * Scale;
7765 int HiByteShift = Rotation * Scale;
7767 // Cast the inputs to v2i64 to match PSLLDQ/PSRLDQ.
7768 Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Lo);
7769 Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Hi);
7771 SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, Lo,
7772 DAG.getConstant(LoByteShift, MVT::i8));
7773 SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, Hi,
7774 DAG.getConstant(HiByteShift, MVT::i8));
7775 return DAG.getNode(ISD::BITCAST, DL, VT,
7776 DAG.getNode(ISD::OR, DL, MVT::v2i64, LoShift, HiShift));
7779 /// \brief Compute whether each element of a shuffle is zeroable.
7781 /// A "zeroable" vector shuffle element is one which can be lowered to zero.
7782 /// Either it is an undef element in the shuffle mask, the element of the input
7783 /// referenced is undef, or the element of the input referenced is known to be
7784 /// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
/// as many lanes with this technique as possible to simplify the remaining
/// shuffle.
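///
/// For example (illustrative), shuffling with mask <0, 5, 2, 7> where V2 is
/// a zero build_vector marks elements 1 and 3 as zeroable, allowing the
/// shuffle to be lowered as a cheap bitmask or shift rather than a real
/// blend.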
7787 static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
7788 SDValue V1, SDValue V2) {
7789 SmallBitVector Zeroable(Mask.size(), false);
7791 while (V1.getOpcode() == ISD::BITCAST)
7792 V1 = V1->getOperand(0);
7793 while (V2.getOpcode() == ISD::BITCAST)
7794 V2 = V2->getOperand(0);
7796 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
7797 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
    int M = Mask[i];
    // Handle the easy cases.
    if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
      Zeroable[i] = true;
      continue;
    }
7807 // If this is an index into a build_vector node (which has the same number
7808 // of elements), dig out the input value and use it.
7809 SDValue V = M < Size ? V1 : V2;
    if (V.getOpcode() != ISD::BUILD_VECTOR || Size != (int)V.getNumOperands())
      continue;
7813 SDValue Input = V.getOperand(M % Size);
7814 // The UNDEF opcode check really should be dead code here, but not quite
7815 // worth asserting on (it isn't invalid, just unexpected).
    if (Input.getOpcode() == ISD::UNDEF || X86::isZeroNode(Input))
      Zeroable[i] = true;
  }

  return Zeroable;
}
7823 /// \brief Try to emit a bitmask instruction for a shuffle.
7825 /// This handles cases where we can model a blend exactly as a bitmask due to
7826 /// one of the inputs being zeroable.
7827 static SDValue lowerVectorShuffleAsBitMask(SDLoc DL, MVT VT, SDValue V1,
7828 SDValue V2, ArrayRef<int> Mask,
7829 SelectionDAG &DAG) {
7830 MVT EltVT = VT.getScalarType();
7831 int NumEltBits = EltVT.getSizeInBits();
7832 MVT IntEltVT = MVT::getIntegerVT(NumEltBits);
7833 SDValue Zero = DAG.getConstant(0, IntEltVT);
  SDValue AllOnes =
      DAG.getConstant(APInt::getAllOnesValue(NumEltBits), IntEltVT);
7835 if (EltVT.isFloatingPoint()) {
7836 Zero = DAG.getNode(ISD::BITCAST, DL, EltVT, Zero);
7837 AllOnes = DAG.getNode(ISD::BITCAST, DL, EltVT, AllOnes);
7839 SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
7840 SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
  SDValue V;
  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
    if (Zeroable[i])
      continue;
    if (Mask[i] % Size != i)
      return SDValue(); // Not a blend.
    if (!V)
      V = Mask[i] < Size ? V1 : V2;
    else if (V != (Mask[i] < Size ? V1 : V2))
      return SDValue(); // Can only let one input through the mask.
7852 VMaskOps[i] = AllOnes;
  }

  if (!V)
    return SDValue(); // No non-zeroable elements!
7857 SDValue VMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, VMaskOps);
7858 V = DAG.getNode(VT.isFloatingPoint()
                      ? (unsigned) X86ISD::FAND : (unsigned) ISD::AND,
                  DL, VT, V, VMask);
  return V;
}
7864 /// \brief Try to lower a vector shuffle as a byte shift (shifts in zeros).
7866 /// Attempts to match a shuffle mask against the PSRLDQ and PSLLDQ
7867 /// byte-shift instructions. The mask must consist of a shifted sequential
7868 /// shuffle from one of the input vectors and zeroable elements for the
7869 /// remaining 'shifted in' elements.
7870 static SDValue lowerVectorShuffleAsByteShift(SDLoc DL, MVT VT, SDValue V1,
7871 SDValue V2, ArrayRef<int> Mask,
7872 SelectionDAG &DAG) {
7873 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
7875 SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
7877 int NumElts = VT.getVectorNumElements();
7878 int NumLanes = VT.getSizeInBits() / 128;
7879 int NumLaneElts = NumElts / NumLanes;
7880 int Scale = 16 / NumLaneElts;
7881 MVT ShiftVT = MVT::getVectorVT(MVT::i64, 2 * NumLanes);
7883 // PSLLDQ : (little-endian) left byte shift
7884 // [ zz, 0, 1, 2, 3, 4, 5, 6]
7885 // [ zz, zz, -1, -1, 2, 3, 4, -1]
7886 // [ zz, zz, zz, zz, zz, zz, -1, 1]
7887 // PSRLDQ : (little-endian) right byte shift
7888 // [ 5, 6, 7, zz, zz, zz, zz, zz]
7889 // [ -1, 5, 6, 7, zz, zz, zz, zz]
7890 // [ 1, 2, -1, -1, -1, -1, zz, zz]
7892 auto CheckZeros = [&](int Shift, bool LeftShift) {
7893 for (int l = 0; l < NumElts; l += NumLaneElts)
7894 for (int i = 0; i < Shift; ++i)
        if (!Zeroable[l + i + (LeftShift ? 0 : (NumLaneElts - Shift))])
          return false;

    return true;
  };
7901 auto MatchByteShift = [&](int Shift, bool LeftShift, SDValue V) {
7902 for (int l = 0; l < NumElts; l += NumLaneElts) {
7903 unsigned Pos = LeftShift ? Shift + l : l;
7904 unsigned Low = LeftShift ? l : Shift + l;
7905 unsigned Len = NumLaneElts - Shift;
7906 if (!isSequentialOrUndefInRange(Mask, Pos, Len,
                                      Low + (V == V1 ? 0 : NumElts)))
        return SDValue();
    }
7911 int ByteShift = Shift * Scale;
7912 unsigned Op = LeftShift ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
7913 V = DAG.getNode(ISD::BITCAST, DL, ShiftVT, V);
7914 V = DAG.getNode(Op, DL, ShiftVT, V, DAG.getConstant(ByteShift, MVT::i8));
7915 return DAG.getNode(ISD::BITCAST, DL, VT, V);
7918 for (int Shift = 1; Shift < NumLaneElts; ++Shift)
7919 for (bool LeftShift : {true, false})
7920 if (CheckZeros(Shift, LeftShift))
7921 for (SDValue V : {V1, V2})
          if (SDValue S = MatchByteShift(Shift, LeftShift, V))
            return S;

  return SDValue();
}
7929 /// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros).
7931 /// Attempts to match a shuffle mask against the PSRL(W/D/Q) and PSLL(W/D/Q)
7932 /// SSE2 and AVX2 logical bit-shift instructions. The function matches
7933 /// elements from one of the input vectors shuffled to the left or right
7934 /// with zeroable elements 'shifted in'.
7935 static SDValue lowerVectorShuffleAsBitShift(SDLoc DL, MVT VT, SDValue V1,
7936 SDValue V2, ArrayRef<int> Mask,
7937 SelectionDAG &DAG) {
7938 SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
7940 int Size = Mask.size();
7941 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
  // PSRL : (little-endian) right bit shift.
  // [  1, zz,  3, zz]
  // [ -1, -1,  7, zz]
  // PSHL : (little-endian) left bit shift.
  // [ zz,  0, zz,  2 ]
  // [ -1,  4, zz, -1 ]
7949 auto MatchBitShift = [&](int Shift, int Scale) -> SDValue {
7950 MVT ShiftSVT = MVT::getIntegerVT(VT.getScalarSizeInBits() * Scale);
7951 MVT ShiftVT = MVT::getVectorVT(ShiftSVT, Size / Scale);
7952 assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
7953 "Illegal integer vector type");
7955 bool MatchLeft = true, MatchRight = true;
7956 for (int i = 0; i != Size; i += Scale) {
7957 for (int j = 0; j != Shift; ++j) {
        MatchLeft &= Zeroable[i + j];
      }
      for (int j = Scale - Shift; j != Scale; ++j) {
        MatchRight &= Zeroable[i + j];
      }
    }
    if (!(MatchLeft || MatchRight))
      return SDValue();
7967 bool MatchV1 = true, MatchV2 = true;
7968 for (int i = 0; i != Size; i += Scale) {
7969 unsigned Pos = MatchLeft ? i + Shift : i;
7970 unsigned Low = MatchLeft ? i : i + Shift;
7971 unsigned Len = Scale - Shift;
7972 MatchV1 &= isSequentialOrUndefInRange(Mask, Pos, Len, Low);
7973 MatchV2 &= isSequentialOrUndefInRange(Mask, Pos, Len, Low + Size);
7975 if (!(MatchV1 || MatchV2))
7978 // Cast the inputs to ShiftVT to match VSRLI/VSHLI and back again.
7979 unsigned OpCode = MatchLeft ? X86ISD::VSHLI : X86ISD::VSRLI;
7980 int ShiftAmt = Shift * VT.getScalarSizeInBits();
7981 SDValue V = MatchV1 ? V1 : V2;
7982 V = DAG.getNode(ISD::BITCAST, DL, ShiftVT, V);
7983 V = DAG.getNode(OpCode, DL, ShiftVT, V, DAG.getConstant(ShiftAmt, MVT::i8));
7984 return DAG.getNode(ISD::BITCAST, DL, VT, V);
7987 // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
7988 // keep doubling the size of the integer elements up to that. We can
7989 // then shift the elements of the integer vector by whole multiples of
7990 // their width within the elements of the larger integer vector. Test each
7991 // multiple to see if we can find a match with the moved element indices
7992 // and that the shifted in elements are all zeroable.
7993 for (int Scale = 2; Scale * VT.getScalarSizeInBits() <= 64; Scale *= 2)
7994 for (int Shift = 1; Shift != Scale; ++Shift)
7995 if (SDValue BitShift = MatchBitShift(Shift, Scale))
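// Illustrative worked example (added commentary): a v4i32 shuffle with
// Mask = <5, zz, 7, zz> keeps the high i32 of each 64-bit half of V2 and
// shifts in zeros. MatchBitShift(1, 2) matches it via MatchRight/MatchV2,
// bitcasts V2 to v2i64, and emits VSRLI (PSRLQ) with
// ShiftAmt == 1 * 32 == 32 bits.
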
/// \brief Lower a vector shuffle as a zero or any extension.
///
/// Given a specific number of elements, element bit width, and extension
/// stride, produce either a zero or any extension based on the available
/// features of the subtarget.
static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
    SDLoc DL, MVT VT, int Scale, bool AnyExt, SDValue InputV,
    const X86Subtarget *Subtarget, SelectionDAG &DAG) {
  assert(Scale > 1 && "Need a scale to extend.");
  int NumElements = VT.getVectorNumElements();
  int EltBits = VT.getScalarSizeInBits();
  assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
         "Only 8, 16, and 32 bit elements can be extended.");
  assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");

  // Found a valid zext mask! Try various lowering strategies based on the
  // input type and available ISA extensions.
  if (Subtarget->hasSSE41()) {
    MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
                                 NumElements / Scale);
    return DAG.getNode(ISD::BITCAST, DL, VT,
                       DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV));
  }

  // For any extends we can cheat for larger element sizes and use shuffle
  // instructions that can fold with a load and/or copy.
  if (AnyExt && EltBits == 32) {
    int PSHUFDMask[4] = {0, -1, 1, -1};
    return DAG.getNode(
        ISD::BITCAST, DL, VT,
        DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
                    DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, InputV),
                    getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
  }
  if (AnyExt && EltBits == 16 && Scale > 2) {
    int PSHUFDMask[4] = {0, -1, 0, -1};
    InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
                         DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, InputV),
                         getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG));
    int PSHUFHWMask[4] = {1, -1, -1, -1};
    return DAG.getNode(
        ISD::BITCAST, DL, VT,
        DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16,
                    DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, InputV),
                    getV4X86ShuffleImm8ForMask(PSHUFHWMask, DAG)));
  }

  // If this would require more than 2 unpack instructions to expand, use
  // pshufb when available. We can only use more than 2 unpack instructions
  // when zero extending i8 elements which also makes it easier to use pshufb.
  if (Scale > 4 && EltBits == 8 && Subtarget->hasSSSE3()) {
    assert(NumElements == 16 && "Unexpected byte vector width!");
    SDValue PSHUFBMask[16];
    for (int i = 0; i < 16; ++i)
      PSHUFBMask[i] =
          DAG.getConstant((i % Scale == 0) ? i / Scale : 0x80, MVT::i8);
    InputV = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, InputV);
    return DAG.getNode(ISD::BITCAST, DL, VT,
                       DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
                                   DAG.getNode(ISD::BUILD_VECTOR, DL,
                                               MVT::v16i8, PSHUFBMask)));
  }

  // Otherwise emit a sequence of unpacks.
  do {
    MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
    SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
                         : getZeroVector(InputVT, Subtarget, DAG, DL);
    InputV = DAG.getNode(ISD::BITCAST, DL, InputVT, InputV);
    InputV = DAG.getNode(X86ISD::UNPCKL, DL, InputVT, InputV, Ext);
    Scale /= 2;
    EltBits *= 2;
    NumElements /= 2;
  } while (Scale > 1);
  return DAG.getNode(ISD::BITCAST, DL, VT, InputV);
}

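// Illustrative trace (added commentary; assumes a pre-SSE4.1 target and a
// zero extend): extending the low four i8 elements of a v16i8 to i32 has
// Scale == 4, so the do/while loop above runs twice. The first UNPCKL
// interleaves InputV with a zero vector to widen i8 -> i16, and the second
// interleaves again to widen i16 -> i32, halving Scale on each iteration
// until it reaches 1.
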
/// \brief Try to lower a vector shuffle as a zero extension on any microarch.
///
/// This routine will try to do everything in its power to cleverly lower
/// a shuffle which happens to match the pattern of a zero extend. It doesn't
/// check for the profitability of this lowering; it tries to aggressively
/// match this pattern. It will use all of the micro-architectural details it
/// can to emit an efficient lowering. It handles both blends with all-zero
/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
/// masking out later).
///
/// The reason we have dedicated lowering for zext-style shuffles is that they
/// are both incredibly common and often quite performance sensitive.
static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
    SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    const X86Subtarget *Subtarget, SelectionDAG &DAG) {
  SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);

  int Bits = VT.getSizeInBits();
  int NumElements = VT.getVectorNumElements();
  assert(VT.getScalarSizeInBits() <= 32 &&
         "Exceeds 32-bit integer zero extension limit");
  assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");

  // Define a helper function to check a particular ext-scale and lower to it if
  // valid.
  auto Lower = [&](int Scale) -> SDValue {
    SDValue InputV;
    bool AnyExt = true;
    for (int i = 0; i < NumElements; ++i) {
      if (Mask[i] == -1)
        continue; // Valid anywhere but doesn't tell us anything.
      if (i % Scale != 0) {
        // Each of the extended elements need to be zeroable.
        if (!Zeroable[i])
          return SDValue();

        // We no longer are in the anyext case.
        AnyExt = false;
        continue;
      }

      // Each of the base elements needs to be consecutive indices into the
      // same input vector.
      SDValue V = Mask[i] < NumElements ? V1 : V2;
      if (!InputV)
        InputV = V;
      else if (InputV != V)
        return SDValue(); // Flip-flopping inputs.

      if (Mask[i] % NumElements != i / Scale)
        return SDValue(); // Non-consecutive strided elements.
    }

    // If we fail to find an input, we have a zero-shuffle which should always
    // have already been handled.
    // FIXME: Maybe handle this here in case during blending we end up with one?
    if (!InputV)
      return SDValue();

    return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
        DL, VT, Scale, AnyExt, InputV, Subtarget, DAG);
  };

  // The widest scale possible for extending is to a 64-bit integer.
  assert(Bits % 64 == 0 &&
         "The number of bits in a vector must be divisible by 64 on x86!");
  int NumExtElements = Bits / 64;

  // Each iteration, try extending the elements half as much, but into twice as
  // many elements.
  for (; NumExtElements < NumElements; NumExtElements *= 2) {
    assert(NumElements % NumExtElements == 0 &&
           "The input vector size must be divisible by the extended size.");
    if (SDValue V = Lower(NumElements / NumExtElements))
      return V;
  }

  // General extends failed, but 128-bit vectors may be able to use MOVQ.
  if (Bits != 128)
    return SDValue();

  // Returns one of the source operands if the shuffle can be reduced to a
  // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
  auto CanZExtLowHalf = [&]() -> SDValue {
    for (int i = NumElements / 2; i != NumElements; ++i)
      if (!Zeroable[i])
        return SDValue();
    if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
      return V1;
    if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
      return V2;
    return SDValue();
  };

  if (SDValue V = CanZExtLowHalf()) {
    V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, V);
    V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
    return DAG.getNode(ISD::BITCAST, DL, VT, V);
  }

  // No viable ext lowering found.
  return SDValue();
}

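// Illustrative example (added commentary): for a v4i32 mask <0, 1, zz, zz>,
// Lower(2) fails (element 1 is a consecutive input, not a zeroable pad), but
// CanZExtLowHalf succeeds: the upper two elements are zeroable and the lower
// two are sequential from V1, so the shuffle becomes a single MOVQ-style
// VZEXT_MOVL of the v2i64 bitcast of V1.
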
/// \brief Try to get a scalar value for a specific element of a vector.
///
/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
                                              SelectionDAG &DAG) {
  MVT VT = V.getSimpleValueType();
  MVT EltVT = VT.getVectorElementType();
  while (V.getOpcode() == ISD::BITCAST)
    V = V.getOperand(0);
  // If the bitcasts shift the element size, we can't extract an equivalent
  // element from it.
  MVT NewVT = V.getSimpleValueType();
  if (!NewVT.isVector() ||
      NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
    return SDValue();

  if (V.getOpcode() == ISD::BUILD_VECTOR ||
      (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR))
    return DAG.getNode(ISD::BITCAST, SDLoc(V), EltVT, V.getOperand(Idx));

  return SDValue();
}

/// \brief Helper to test for a load that can be folded with x86 shuffles.
///
/// This is particularly important because the set of instructions varies
/// significantly based on whether the operand is a load or not.
static bool isShuffleFoldableLoad(SDValue V) {
  while (V.getOpcode() == ISD::BITCAST)
    V = V.getOperand(0);

  return ISD::isNON_EXTLoad(V.getNode());
}

/// \brief Try to lower insertion of a single element into a zero vector.
///
/// This is a common pattern for which we have especially efficient patterns to
/// lower across all subtarget feature sets.
static SDValue lowerVectorShuffleAsElementInsertion(
    MVT VT, SDLoc DL, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    const X86Subtarget *Subtarget, SelectionDAG &DAG) {
  SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
  MVT ExtVT = VT;
  MVT EltVT = VT.getVectorElementType();

  int V2Index = std::find_if(Mask.begin(), Mask.end(),
                             [&Mask](int M) { return M >= (int)Mask.size(); }) -
                Mask.begin();
  bool IsV1Zeroable = true;
  for (int i = 0, Size = Mask.size(); i < Size; ++i)
    if (i != V2Index && !Zeroable[i]) {
      IsV1Zeroable = false;
      break;
    }

  // Check for a single input from a SCALAR_TO_VECTOR node.
  // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
  // all the smarts here sunk into that routine. However, the current
  // lowering of BUILD_VECTOR makes that nearly impossible until the old
  // vector shuffle lowering is dead.
  if (SDValue V2S = getScalarValueForVectorElement(
          V2, Mask[V2Index] - Mask.size(), DAG)) {
    // We need to zext the scalar if it is smaller than an i32.
    V2S = DAG.getNode(ISD::BITCAST, DL, EltVT, V2S);
    if (EltVT == MVT::i8 || EltVT == MVT::i16) {
      // Using zext to expand a narrow element won't work for non-zero
      // insertions.
      if (!IsV1Zeroable)
        return SDValue();

      // Zero-extend directly to i32.
      ExtVT = MVT::v4i32;
      V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
    }
    V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
  } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
             EltVT == MVT::i16) {
    // Either not inserting from the low element of the input or the input
    // element size is too small to use VZEXT_MOVL to clear the high bits.
    return SDValue();
  }

  if (!IsV1Zeroable) {
    // If V1 can't be treated as a zero vector we have fewer options to lower
    // this. We can't support integer vectors or non-zero targets cheaply, and
    // the V1 elements can't be permuted in any way.
    assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
    if (!VT.isFloatingPoint() || V2Index != 0)
      return SDValue();
    SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
    V1Mask[V2Index] = -1;
    if (!isNoopShuffleMask(V1Mask))
      return SDValue();
    // This is essentially a special case blend operation, but if we have
    // general purpose blend operations, they are always faster. Bail and let
    // the rest of the lowering handle these as blends.
    if (Subtarget->hasSSE41())
      return SDValue();

    // Otherwise, use MOVSD or MOVSS.
    assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
           "Only two types of floating point element types to handle!");
    return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
                       ExtVT, V1, V2);
  }

  // This lowering only works for the low element with floating point vectors.
  if (VT.isFloatingPoint() && V2Index != 0)
    return SDValue();

  V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
  if (ExtVT != VT)
    V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2);

  if (V2Index != 0) {
    // If we have 4 or fewer lanes we can cheaply shuffle the element into
    // the desired position. Otherwise it is more efficient to do a vector
    // shift left. We know that we can do a vector shift left because all
    // the inputs are zero.
    if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
      SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
      V2Shuffle[V2Index] = 0;
      V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
    } else {
      V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, V2);
      V2 = DAG.getNode(
          X86ISD::VSHLDQ, DL, MVT::v2i64, V2,
          DAG.getConstant(
              V2Index * EltVT.getSizeInBits() / 8,
              DAG.getTargetLoweringInfo().getScalarShiftAmountTy(MVT::v2i64)));
      V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2);
    }
  }
  return V2;
}

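// Illustrative example (added commentary): a v4i32 mask <zz, zz, 4, zz>
// inserts V2's low element at index 2 of a zero vector. VZEXT_MOVL first
// clears everything above V2's low element, and since the vector has only 4
// lanes the element is then placed with the shuffle mask <1, 1, 0, 1>. The
// same insertion into a v8i16 vector would instead take the VSHLDQ path,
// shifting left by V2Index * 2 bytes.
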
/// \brief Try to lower broadcast of a single element.
///
/// For convenience, this code also bundles all of the subtarget feature set
/// filtering. While a little annoying to re-dispatch on type here, there isn't
/// a convenient way to factor it out.
static SDValue lowerVectorShuffleAsBroadcast(MVT VT, SDLoc DL, SDValue V,
                                             ArrayRef<int> Mask,
                                             const X86Subtarget *Subtarget,
                                             SelectionDAG &DAG) {
  if (!Subtarget->hasAVX())
    return SDValue();
  if (VT.isInteger() && !Subtarget->hasAVX2())
    return SDValue();

  // Check that the mask is a broadcast.
  int BroadcastIdx = -1;
  for (int M : Mask)
    if (M >= 0 && BroadcastIdx == -1)
      BroadcastIdx = M;
    else if (M >= 0 && M != BroadcastIdx)
      return SDValue();

  assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
                                            "a sorted mask where the broadcast "
                                            "comes from V1.");

  // Go up the chain of (vector) values to try and find a scalar load that
  // we can combine with the broadcast.
  for (;;) {
    switch (V.getOpcode()) {
    case ISD::CONCAT_VECTORS: {
      int OperandSize = Mask.size() / V.getNumOperands();
      V = V.getOperand(BroadcastIdx / OperandSize);
      BroadcastIdx %= OperandSize;
      continue;
    }

    case ISD::INSERT_SUBVECTOR: {
      SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
      auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
      if (!ConstantIdx)
        break;

      int BeginIdx = (int)ConstantIdx->getZExtValue();
      int EndIdx =
          BeginIdx + (int)VInner.getValueType().getVectorNumElements();
      if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) {
        BroadcastIdx -= BeginIdx;
        V = VInner;
      } else {
        V = VOuter;
      }
      continue;
    }
    }
    break;
  }

  // Check if this is a broadcast of a scalar. We special case lowering
  // for scalars so that we can more effectively fold with loads.
  if (V.getOpcode() == ISD::BUILD_VECTOR ||
      (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
    V = V.getOperand(BroadcastIdx);

    // If the scalar isn't a load we can't broadcast from it in AVX1, only with
    // AVX2.
    if (!Subtarget->hasAVX2() && !isShuffleFoldableLoad(V))
      return SDValue();
  } else if (BroadcastIdx != 0 || !Subtarget->hasAVX2()) {
    // We can't broadcast from a vector register without AVX2, and we can only
    // broadcast from the zero-element of a vector register.
    return SDValue();
  }

  return DAG.getNode(X86ISD::VBROADCAST, DL, VT, V);
}

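// Illustrative example (added commentary): broadcasting element 4 of a v8f32
// formed by concat_vectors(A, B) walks the CONCAT_VECTORS case above:
// OperandSize == 4, V becomes B, and BroadcastIdx becomes 0. If B is a
// scalar_to_vector of a load, the scalar load itself feeds the VBROADCAST
// and can later be folded into a memory operand.
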
// Check for whether we can use INSERTPS to perform the shuffle. We only use
// INSERTPS when the V1 elements are already in the correct locations
// because otherwise we can just always use two SHUFPS instructions which
// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
// perform INSERTPS if a single V1 element is out of place and all V2
// elements are zeroable.
static SDValue lowerVectorShuffleAsInsertPS(SDValue Op, SDValue V1, SDValue V2,
                                            ArrayRef<int> Mask,
                                            SelectionDAG &DAG) {
  assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!");
  assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");

  SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);

  unsigned ZMask = 0;
  int V1DstIndex = -1;
  int V2DstIndex = -1;
  bool V1UsedInPlace = false;

  for (int i = 0; i < 4; ++i) {
    // Synthesize a zero mask from the zeroable elements (includes undefs).
    if (Zeroable[i]) {
      ZMask |= 1 << i;
      continue;
    }

    // Flag if we use any V1 inputs in place.
    if (i == Mask[i]) {
      V1UsedInPlace = true;
      continue;
    }

    // We can only insert a single non-zeroable element.
    if (V1DstIndex != -1 || V2DstIndex != -1)
      return SDValue();

    if (Mask[i] < 4) {
      // V1 input out of place for insertion.
      V1DstIndex = i;
    } else {
      // V2 input for insertion.
      V2DstIndex = i;
    }
  }

  // Don't bother if we have no (non-zeroable) element for insertion.
  if (V1DstIndex == -1 && V2DstIndex == -1)
    return SDValue();

  // Determine element insertion src/dst indices. The src index is from the
  // start of the inserted vector, not the start of the concatenated vector.
  unsigned V2SrcIndex = 0;
  if (V1DstIndex != -1) {
    // If we have a V1 input out of place, we use V1 as the V2 element insertion
    // and don't use the original V2 at all.
    V2SrcIndex = Mask[V1DstIndex];
    V2DstIndex = V1DstIndex;
    V2 = V1;
  } else {
    V2SrcIndex = Mask[V2DstIndex] - 4;
  }

  // If no V1 inputs are used in place, then the result is created only from
  // the zero mask and the V2 insertion - so remove V1 dependency.
  if (!V1UsedInPlace)
    V1 = DAG.getUNDEF(MVT::v4f32);

  unsigned InsertPSMask = V2SrcIndex << 6 | V2DstIndex << 4 | ZMask;
  assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");

  // Insert the V2 element into the desired position.
  SDLoc DL(Op);
  return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
                     DAG.getConstant(InsertPSMask, MVT::i8));
}

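// Illustrative example (added commentary): for Mask = <4, 1, 2, zz>, V2's
// element 0 is inserted into destination slot 0 and element 3 is zeroed,
// giving InsertPSMask == (V2SrcIndex = 0) << 6 | (V2DstIndex = 0) << 4 |
// (ZMask = 8), i.e. an immediate of 0x08, with V1 kept for its in-place
// elements 1 and 2.
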
/// \brief Try to lower a shuffle as a permute of the inputs followed by an
/// UNPCK instruction.
///
/// This specifically targets cases where we end up with alternating between
/// the two inputs, and so can permute them into something that feeds a single
/// UNPCK instruction. Note that this routine only targets integer vectors
/// because for floating point vectors we have a generalized SHUFPS lowering
/// strategy that handles everything that doesn't *exactly* match an unpack,
/// making this clever lowering unnecessary.
static SDValue lowerVectorShuffleAsUnpack(MVT VT, SDLoc DL, SDValue V1,
                                          SDValue V2, ArrayRef<int> Mask,
                                          SelectionDAG &DAG) {
  assert(!VT.isFloatingPoint() &&
         "This routine only supports integer vectors.");
  assert(!isSingleInputShuffleMask(Mask) &&
         "This routine should only be used when blending two inputs.");
  assert(Mask.size() >= 2 && "Single element masks are invalid.");

  int Size = Mask.size();

  int NumLoInputs = std::count_if(Mask.begin(), Mask.end(), [Size](int M) {
    return M >= 0 && M % Size < Size / 2;
  });
  int NumHiInputs = std::count_if(
      Mask.begin(), Mask.end(), [Size](int M) { return M % Size >= Size / 2; });

  bool UnpackLo = NumLoInputs >= NumHiInputs;

  auto TryUnpack = [&](MVT UnpackVT, int Scale) -> SDValue {
    SmallVector<int, 32> V1Mask(Mask.size(), -1);
    SmallVector<int, 32> V2Mask(Mask.size(), -1);

    for (int i = 0; i < Size; ++i) {
      if (Mask[i] < 0)
        continue;

      // Each element of the unpack contains Scale elements from this mask.
      int UnpackIdx = i / Scale;

      // We only handle the case where V1 feeds the first slots of the unpack.
      // We rely on canonicalization to ensure this is the case.
      if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
        return SDValue();

      // Setup the mask for this input. The indexing is tricky as we have to
      // handle the unpack stride.
      SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
      VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
          Mask[i] % Size;
    }

    // Shuffle the inputs into place.
    V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
    V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);

    // Cast the inputs to the type we will use to unpack them.
    V1 = DAG.getNode(ISD::BITCAST, DL, UnpackVT, V1);
    V2 = DAG.getNode(ISD::BITCAST, DL, UnpackVT, V2);

    // Unpack the inputs and cast the result back to the desired type.
    return DAG.getNode(ISD::BITCAST, DL, VT,
                       DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH,
                                   DL, UnpackVT, V1, V2));
  };

  // We try each unpack from the largest to the smallest to try and find one
  // that fits this mask.
  int OrigNumElements = VT.getVectorNumElements();
  int OrigScalarSize = VT.getScalarSizeInBits();
  for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2) {
    int Scale = ScalarSize / OrigScalarSize;
    int NumElements = OrigNumElements / Scale;
    MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), NumElements);
    if (SDValue Unpack = TryUnpack(UnpackVT, Scale))
      return Unpack;
  }

  return SDValue();
}

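// Illustrative example (added commentary): the v4i32 mask <1, 5, 0, 4> is
// not an UNPCKL pattern as-is, but TryUnpack(MVT::v4i32, 1) permutes both V1
// and V2 by <1, 0, -1, -1> and then a single UNPCKL interleaves the two
// permuted vectors into the requested result.
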
/// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
///
/// This is the basis function for the 2-lane 64-bit shuffles as we have full
/// support for floating point shuffles but not integer shuffles. These
/// instructions will incur a domain crossing penalty on some chips though so
/// it is better to avoid lowering through this for integer vectors where
/// possible.
static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                       const X86Subtarget *Subtarget,
                                       SelectionDAG &DAG) {
  SDLoc DL(Op);
  assert(Op.getSimpleValueType() == MVT::v2f64 && "Bad shuffle type!");
  assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
  ArrayRef<int> Mask = SVOp->getMask();
  assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");

  if (isSingleInputShuffleMask(Mask)) {
    // Use low duplicate instructions for masks that match their pattern.
    if (Subtarget->hasSSE3())
      if (isShuffleEquivalent(V1, V2, Mask, 0, 0))
        return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, V1);

    // Straight shuffle of a single input vector. Simulate this by using the
    // single input as both of the "inputs" to this instruction.
    unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);

    if (Subtarget->hasAVX()) {
      // If we have AVX, we can use VPERMILPS which will allow folding a load
      // into the shuffle.
      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
                         DAG.getConstant(SHUFPDMask, MVT::i8));
    }

    return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V1,
                       DAG.getConstant(SHUFPDMask, MVT::i8));
  }
  assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!");
  assert(Mask[1] >= 2 && "Non-canonicalized blend!");

  // If we have a single input, insert that into V1 if we can do so cheaply.
  if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
    if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
            MVT::v2f64, DL, V1, V2, Mask, Subtarget, DAG))
      return Insertion;
    // Try inverting the insertion since for v2 masks it is easy to do and we
    // can't reliably sort the mask one way or the other.
    int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
                          Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
    if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
            MVT::v2f64, DL, V2, V1, InverseMask, Subtarget, DAG))
      return Insertion;
  }

  // Try to use one of the special instruction patterns to handle two common
  // blend patterns if a zero-blend above didn't work.
  if (isShuffleEquivalent(V1, V2, Mask, 0, 3) ||
      isShuffleEquivalent(V1, V2, Mask, 1, 3))
    if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
      // We can either use a special instruction to load over the low double or
      // to move just the low double.
      return DAG.getNode(
          isShuffleFoldableLoad(V1S) ? X86ISD::MOVLPD : X86ISD::MOVSD,
          DL, MVT::v2f64, V2,
          DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));

  if (Subtarget->hasSSE41())
    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
                                                  Subtarget, DAG))
      return Blend;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (isShuffleEquivalent(V1, V2, Mask, 0, 2))
    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2f64, V1, V2);
  if (isShuffleEquivalent(V1, V2, Mask, 1, 3))
    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2f64, V1, V2);

  unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
  return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V2,
                     DAG.getConstant(SHUFPDMask, MVT::i8));
}

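// Illustrative example (added commentary): assuming none of the insertion,
// MOVSD/MOVLPD, blend, or unpack patterns above matched, the blend mask
// <0, 3> reaches the final SHUFPD with
// SHUFPDMask == (0 == 1) | (((3 - 2) == 1) << 1) == 2, selecting V1[0] into
// the low lane and V2[1] into the high lane.
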
/// \brief Handle lowering of 2-lane 64-bit integer shuffles.
///
/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
/// the integer unit to minimize domain crossing penalties. However, for blends
/// it falls back to the floating point shuffle operation with appropriate bit
/// casting.
static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                       const X86Subtarget *Subtarget,
                                       SelectionDAG &DAG) {
  SDLoc DL(Op);
  assert(Op.getSimpleValueType() == MVT::v2i64 && "Bad shuffle type!");
  assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
  ArrayRef<int> Mask = SVOp->getMask();
  assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");

  if (isSingleInputShuffleMask(Mask)) {
    // Check for being able to broadcast a single element.
    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v2i64, DL, V1,
                                                          Mask, Subtarget, DAG))
      return Broadcast;

    // Straight shuffle of a single input vector. For everything from SSE2
    // onward this has a single fast instruction with no scary immediates.
    // We have to map the mask as it is actually a v4i32 shuffle instruction.
    V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V1);
    int WidenedMask[4] = {
        std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
        std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
    return DAG.getNode(
        ISD::BITCAST, DL, MVT::v2i64,
        DAG.getNode(X86ISD::PSHUFD, SDLoc(Op), MVT::v4i32, V1,
                    getV4X86ShuffleImm8ForMask(WidenedMask, DAG)));
  }

  // Try to use byte shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsByteShift(
          DL, MVT::v2i64, V1, V2, Mask, DAG))
    return Shift;

  // If we have a single input from V2 insert that into V1 if we can do so
  // cheaply.
  if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
    if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
            MVT::v2i64, DL, V1, V2, Mask, Subtarget, DAG))
      return Insertion;
    // Try inverting the insertion since for v2 masks it is easy to do and we
    // can't reliably sort the mask one way or the other.
    int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
                          Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
    if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
            MVT::v2i64, DL, V2, V1, InverseMask, Subtarget, DAG))
      return Insertion;
  }

  // We have different paths for blend lowering, but they all must use the
  // *exact* same predicate.
  bool IsBlendSupported = Subtarget->hasSSE41();
  if (IsBlendSupported)
    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
                                                  Subtarget, DAG))
      return Blend;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (isShuffleEquivalent(V1, V2, Mask, 0, 2))
    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, V1, V2);
  if (isShuffleEquivalent(V1, V2, Mask, 1, 3))
    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2i64, V1, V2);

  // Try to use byte rotation instructions.
  // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
  if (Subtarget->hasSSSE3())
    if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
            DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
      return Rotate;

  // If we have direct support for blends, we should lower by decomposing into
  // a permute. That will be faster than the domain cross.
  if (IsBlendSupported)
    return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2,
                                                      Mask, DAG);

  // We implement this with SHUFPD which is pretty lame because it will likely
  // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
  // However, all the alternatives are still more cycles and newer chips don't
  // have this problem. It would be really nice if x86 had better shuffles here.
  V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, V1);
  V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, V2);
  return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64,
                     DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
}

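// Illustrative example (added commentary): a single-input v2i64 mask <1, 1>
// is widened by the code above to the v4i32 PSHUFD mask <2, 3, 2, 3>,
// duplicating the high 64-bit element with a single integer-domain
// instruction.
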
/// \brief Test whether this can be lowered with a single SHUFPS instruction.
///
/// This is used to disable more specialized lowerings when the shufps lowering
/// will happen to be efficient.
static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
  // This routine only handles 128-bit shufps.
  assert(Mask.size() == 4 && "Unsupported mask size!");

  // To lower with a single SHUFPS we need to have the low half and high half
  // each requiring a single input.
  if (Mask[0] != -1 && Mask[1] != -1 && (Mask[0] < 4) != (Mask[1] < 4))
    return false;
  if (Mask[2] != -1 && Mask[3] != -1 && (Mask[2] < 4) != (Mask[3] < 4))
    return false;

  return true;
}

/// \brief Lower a vector shuffle using the SHUFPS instruction.
///
/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
/// It makes no assumptions about whether this is the *best* lowering; it
/// simply uses it.
static SDValue lowerVectorShuffleWithSHUFPS(SDLoc DL, MVT VT,
                                            ArrayRef<int> Mask, SDValue V1,
                                            SDValue V2, SelectionDAG &DAG) {
  SDValue LowV = V1, HighV = V2;
  int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};

  int NumV2Elements =
      std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });

  if (NumV2Elements == 1) {
    int V2Index =
        std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) -
        Mask.begin();

    // Compute the index adjacent to V2Index and in the same half by toggling
    // the low bit.
    int V2AdjIndex = V2Index ^ 1;

    if (Mask[V2AdjIndex] == -1) {
      // Handles all the cases where we have a single V2 element and an undef.
      // This will only ever happen in the high lanes because we commute the
      // vector otherwise.
      if (V2Index < 2)
        std::swap(LowV, HighV);
      NewMask[V2Index] -= 4;
    } else {
      // Handle the case where the V2 element ends up adjacent to a V1 element.
      // To make this work, blend them together as the first step.
      int V1Index = V2AdjIndex;
      int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
      V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
                       getV4X86ShuffleImm8ForMask(BlendMask, DAG));

      // Now proceed to reconstruct the final blend as we have the necessary
      // high or low half formed.
      if (V2Index < 2) {
        LowV = V2;
        HighV = V1;
      } else {
        HighV = V2;
      }
      NewMask[V1Index] = 2; // We put the V1 element in V2[2].
      NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
    }
  } else if (NumV2Elements == 2) {
    if (Mask[0] < 4 && Mask[1] < 4) {
      // Handle the easy case where we have V1 in the low lanes and V2 in the
      // high lanes.
      NewMask[2] -= 4;
      NewMask[3] -= 4;
    } else if (Mask[2] < 4 && Mask[3] < 4) {
      // We also handle the reversed case because this utility may get called
      // when we detect a SHUFPS pattern but can't easily commute the shuffle to
      // arrange things in the right direction.
      NewMask[0] -= 4;
      NewMask[1] -= 4;
      HighV = V1;
      LowV = V2;
    } else {
      // We have a mixture of V1 and V2 in both low and high lanes. Rather than
      // trying to place elements directly, just blend them and set up the final
      // shuffle to place them.

      // The first two blend mask elements are for V1, the second two are for
      // V2.
      int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
                          Mask[2] < 4 ? Mask[2] : Mask[3],
                          (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
                          (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
      V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
                       getV4X86ShuffleImm8ForMask(BlendMask, DAG));

      // Now we do a normal shuffle of V1 by giving V1 as both operands to
      // the shuffle.
      LowV = HighV = V1;
      NewMask[0] = Mask[0] < 4 ? 0 : 2;
      NewMask[1] = Mask[0] < 4 ? 2 : 0;
      NewMask[2] = Mask[2] < 4 ? 1 : 3;
      NewMask[3] = Mask[2] < 4 ? 3 : 1;
    }
  }
  return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
                     getV4X86ShuffleImm8ForMask(NewMask, DAG));
}

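// Illustrative example (added commentary): for Mask = <0, 4, 2, 6> (two V2
// elements mixed into both halves), the first SHUFPS blends the inputs with
// BlendMask = <0, 2, 0, 2> and the second shuffles that blend with
// NewMask = <0, 2, 1, 3>, reproducing <V1[0], V2[0], V1[2], V2[2]>.
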
/// \brief Lower 4-lane 32-bit floating point shuffles.
///
/// Uses instructions exclusively from the floating point unit to minimize
/// domain crossing penalties, as these are sufficient to implement all v4f32
/// shuffles.
static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                       const X86Subtarget *Subtarget,
                                       SelectionDAG &DAG) {
  SDLoc DL(Op);
  assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!");
  assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
  ArrayRef<int> Mask = SVOp->getMask();
  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");

  int NumV2Elements =
      std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });

  if (NumV2Elements == 0) {
    // Check for being able to broadcast a single element.
    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4f32, DL, V1,
                                                          Mask, Subtarget, DAG))
      return Broadcast;

    // Use even/odd duplicate instructions for masks that match their pattern.
    if (Subtarget->hasSSE3()) {
      if (isShuffleEquivalent(V1, V2, Mask, 0, 0, 2, 2))
        return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
      if (isShuffleEquivalent(V1, V2, Mask, 1, 1, 3, 3))
        return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
    }

    if (Subtarget->hasAVX()) {
      // If we have AVX, we can use VPERMILPS which will allow folding a load
      // into the shuffle.
      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
                         getV4X86ShuffleImm8ForMask(Mask, DAG));
    }

    // Otherwise, use a straight shuffle of a single input vector. We pass the
    // input vector to both operands to simulate this with a SHUFPS.
    return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
                       getV4X86ShuffleImm8ForMask(Mask, DAG));
  }

  // There are special ways we can lower some single-element blends. However, we
  // have custom ways we can lower more complex single-element blends below that
  // we defer to if both this and BLENDPS fail to match, so restrict this to
  // when the V2 input is targeting element 0 of the mask -- that is the fast
  // case here.
  if (NumV2Elements == 1 && Mask[0] >= 4)
    if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v4f32, DL, V1, V2,
                                                         Mask, Subtarget, DAG))
      return V;

  if (Subtarget->hasSSE41()) {
    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
                                                  Subtarget, DAG))
      return Blend;

    // Use INSERTPS if we can complete the shuffle efficiently.
    if (SDValue V = lowerVectorShuffleAsInsertPS(Op, V1, V2, Mask, DAG))
      return V;

    if (!isSingleSHUFPSMask(Mask))
      if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute(
              DL, MVT::v4f32, V1, V2, Mask, DAG))
        return BlendPerm;
  }

  // Use dedicated unpack instructions for masks that match their pattern.
  if (isShuffleEquivalent(V1, V2, Mask, 0, 4, 1, 5))
    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f32, V1, V2);
  if (isShuffleEquivalent(V1, V2, Mask, 2, 6, 3, 7))
    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V1, V2);
  if (isShuffleEquivalent(V1, V2, Mask, 4, 0, 5, 1))
    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f32, V2, V1);
  if (isShuffleEquivalent(V1, V2, Mask, 6, 2, 7, 3))
    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V2, V1);

  // Otherwise fall back to a SHUFPS lowering strategy.
  return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
}

/// \brief Lower 4-lane i32 vector shuffles.
///
/// We try to handle these with integer-domain shuffles where we can, but for
/// blends we use the floating point domain blend instructions.
static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                       const X86Subtarget *Subtarget,
                                       SelectionDAG &DAG) {
  SDLoc DL(Op);
  assert(Op.getSimpleValueType() == MVT::v4i32 && "Bad shuffle type!");
  assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
  ArrayRef<int> Mask = SVOp->getMask();
  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");

  // Whenever we can lower this as a zext, that instruction is strictly faster
  // than any alternative. It also allows us to fold memory operands into the
  // shuffle in many cases.
  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2,
                                                         Mask, Subtarget, DAG))
    return ZExt;

  int NumV2Elements =
      std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });

  if (NumV2Elements == 0) {
    // Check for being able to broadcast a single element.
    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4i32, DL, V1,
                                                          Mask, Subtarget, DAG))
      return Broadcast;

    // Straight shuffle of a single input vector. For everything from SSE2
    // onward this has a single fast instruction with no scary immediates.
    // We coerce the shuffle pattern to be compatible with UNPCK instructions
    // but we aren't actually going to use the UNPCK instruction because doing
    // so prevents folding a load into this instruction or making a copy.
    const int UnpackLoMask[] = {0, 0, 1, 1};
    const int UnpackHiMask[] = {2, 2, 3, 3};
    if (isShuffleEquivalent(V1, V2, Mask, 0, 0, 1, 1))
      Mask = UnpackLoMask;
    else if (isShuffleEquivalent(V1, V2, Mask, 2, 2, 3, 3))
      Mask = UnpackHiMask;

    return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
                       getV4X86ShuffleImm8ForMask(Mask, DAG));
  }

  // Try to use bit shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsBitShift(
          DL, MVT::v4i32, V1, V2, Mask, DAG))
    return Shift;

  // Try to use byte shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsByteShift(
          DL, MVT::v4i32, V1, V2, Mask, DAG))
    return Shift;

  // There are special ways we can lower some single-element blends.
  if (NumV2Elements == 1)
    if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v4i32, DL, V1, V2,
                                                         Mask, Subtarget, DAG))
      return V;

  // We have different paths for blend lowering, but they all must use the
  // *exact* same predicate.
  bool IsBlendSupported = Subtarget->hasSSE41();
  if (IsBlendSupported)
    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
                                                  Subtarget, DAG))
      return Blend;

  if (SDValue Masked =
          lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask, DAG))
    return Masked;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (isShuffleEquivalent(V1, V2, Mask, 0, 4, 1, 5))
    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i32, V1, V2);
  if (isShuffleEquivalent(V1, V2, Mask, 2, 6, 3, 7))
    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V1, V2);
  if (isShuffleEquivalent(V1, V2, Mask, 4, 0, 5, 1))
    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i32, V2, V1);
  if (isShuffleEquivalent(V1, V2, Mask, 6, 2, 7, 3))
    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V2, V1);

  // Try to use byte rotation instructions.
  // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
  if (Subtarget->hasSSSE3())
    if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
            DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
      return Rotate;

  // If we have direct support for blends, we should lower by decomposing into
  // a permute. That will be faster than the domain cross.
  if (IsBlendSupported)
    return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2,
                                                      Mask, DAG);

  // Try to lower by permuting the inputs into an unpack instruction.
  if (SDValue Unpack =
          lowerVectorShuffleAsUnpack(MVT::v4i32, DL, V1, V2, Mask, DAG))
    return Unpack;

  // We implement this with SHUFPS because it can blend from two vectors.
  // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
  // up the inputs, bypassing domain shift penalties that we would incur if we
  // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
  // relevant.
  return DAG.getNode(ISD::BITCAST, DL, MVT::v4i32,
                     DAG.getVectorShuffle(
                         MVT::v4f32, DL,
                         DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, V1),
                         DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, V2), Mask));
}

/// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
/// shuffle lowering, and the most complex part.
///
/// The lowering strategy is to try to form pairs of input lanes which are
/// targeted at the same half of the final vector, and then use a dword shuffle
/// to place them onto the right half, and finally unpack the paired lanes into
/// their final position.
///
/// The exact breakdown of how to form these dword pairs and align them on the
/// correct sides is really tricky. See the comments within the function for
/// more of the details.
static SDValue lowerV8I16SingleInputVectorShuffle(
    SDLoc DL, SDValue V, MutableArrayRef<int> Mask,
    const X86Subtarget *Subtarget, SelectionDAG &DAG) {
  assert(V.getSimpleValueType() == MVT::v8i16 && "Bad input type!");
  MutableArrayRef<int> LoMask = Mask.slice(0, 4);
  MutableArrayRef<int> HiMask = Mask.slice(4, 4);

  SmallVector<int, 4> LoInputs;
  std::copy_if(LoMask.begin(), LoMask.end(), std::back_inserter(LoInputs),
               [](int M) { return M >= 0; });
  std::sort(LoInputs.begin(), LoInputs.end());
  LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
  SmallVector<int, 4> HiInputs;
  std::copy_if(HiMask.begin(), HiMask.end(), std::back_inserter(HiInputs),
               [](int M) { return M >= 0; });
  std::sort(HiInputs.begin(), HiInputs.end());
  HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
  int NumLToL =
      std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin();
  int NumHToL = LoInputs.size() - NumLToL;
  int NumLToH =
      std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin();
  int NumHToH = HiInputs.size() - NumLToH;
  MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
  MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
  MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
  MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);

  // Check for being able to broadcast a single element.
  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8i16, DL, V,
                                                        Mask, Subtarget, DAG))
    return Broadcast;

  // Try to use bit shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsBitShift(
          DL, MVT::v8i16, V, V, Mask, DAG))
    return Shift;

  // Try to use byte shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsByteShift(
          DL, MVT::v8i16, V, V, Mask, DAG))
    return Shift;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (isShuffleEquivalent(V, V, Mask, 0, 0, 1, 1, 2, 2, 3, 3))
    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V, V);
  if (isShuffleEquivalent(V, V, Mask, 4, 4, 5, 5, 6, 6, 7, 7))
    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V, V);

  // Try to use byte rotation instructions.
  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
          DL, MVT::v8i16, V, V, Mask, Subtarget, DAG))
    return Rotate;

  // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
  // such inputs we can swap two of the dwords across the half mark and end up
  // with <=2 inputs to each half in each half. Once there, we can fall through
  // to the generic code below. For example:
  //
  // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
  // Mask:  [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
  //
  // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
  // and an existing 2-into-2 on the other half. In this case we may have to
  // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
  // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
  // Fortunately, we don't have to handle anything but a 2-into-2 pattern
  // because any other situation (including a 3-into-1 or 1-into-3 in the other
  // half than the one we target for fixing) will be fixed when we re-enter this
  // path. We will also combine away any sequence of PSHUFD instructions that
  // result into a single instruction. Here is an example of the tricky case:
  //
  // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
  // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
  //
  // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
  //
  // Input: [a, b, c, d, e, f, g, h] -PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
  // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] ------------------> [3, 7, 1, 0, 2, 7, 3, 6]
  //
  // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
  // Mask:  [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
  //
  // The result is fine to be handled by the generic logic.
  auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
                          ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
                          int AOffset, int BOffset) {
    assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
           "Must call this with A having 3 or 1 inputs from the A half.");
    assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
           "Must call this with B having 1 or 3 inputs from the B half.");
    assert(AToAInputs.size() + BToAInputs.size() == 4 &&
           "Must call this with either 3:1 or 1:3 inputs (summing to 4).");

    // Compute the index of dword with only one word among the three inputs in
    // a half by taking the sum of the half with three inputs and subtracting
    // the sum of the actual three inputs. The difference is the remaining
    // slot.
    int ADWord, BDWord;
    int &TripleDWord = AToAInputs.size() == 3 ? ADWord : BDWord;
    int &OneInputDWord = AToAInputs.size() == 3 ? BDWord : ADWord;
    int TripleInputOffset = AToAInputs.size() == 3 ? AOffset : BOffset;
    ArrayRef<int> TripleInputs =
        AToAInputs.size() == 3 ? AToAInputs : BToAInputs;
    int OneInput = AToAInputs.size() == 3 ? BToAInputs[0] : AToAInputs[0];
    int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
    int TripleNonInputIdx =
        TripleInputSum -
        std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
    TripleDWord = TripleNonInputIdx / 2;

    // We use xor with one to compute the adjacent DWord to whichever one the
    // OneInput is in.
    OneInputDWord = (OneInput / 2) ^ 1;

    // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
    // and BToA inputs. If there is also such a problem with the BToB and AToB
    // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
    // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
    // is essential that we don't *create* a 3<-1 as then we might oscillate.
    if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
      // Compute how many inputs will be flipped by swapping these DWords. We
      // need to balance this to ensure we don't form a 3-1 shuffle in the
      // other half.
      int NumFlippedAToBInputs =
          std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
          std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
      int NumFlippedBToBInputs =
          std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
          std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
      if ((NumFlippedAToBInputs == 1 &&
           (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
          (NumFlippedBToBInputs == 1 &&
           (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
        // We choose whether to fix the A half or B half based on whether that
        // half has zero flipped inputs. At zero, we may not be able to fix it
        // with that half. We also bias towards fixing the B half because that
        // will more commonly be the high half, and we have to bias one way.
        auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
                                                       ArrayRef<int> Inputs) {
          int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
          bool IsFixIdxInput = std::find(Inputs.begin(), Inputs.end(),
                                         PinnedIdx ^ 1) != Inputs.end();
          // Determine whether the free index is in the flipped dword or the
          // unflipped dword based on where the pinned index is. We use this bit
          // in an xor to conditionally select the adjacent dword.
          int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
          bool IsFixFreeIdxInput = std::find(Inputs.begin(), Inputs.end(),
                                             FixFreeIdx) != Inputs.end();
          if (IsFixIdxInput == IsFixFreeIdxInput)
            FixFreeIdx += 1;
          IsFixFreeIdxInput = std::find(Inputs.begin(), Inputs.end(),
                                        FixFreeIdx) != Inputs.end();
          assert(IsFixIdxInput != IsFixFreeIdxInput &&
                 "We need to be changing the number of flipped inputs!");
          int PSHUFHalfMask[] = {0, 1, 2, 3};
          std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
          V = DAG.getNode(FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
                          MVT::v8i16, V,
                          getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DAG));

          for (int &M : Mask)
            if (M != -1 && M == FixIdx)
              M = FixFreeIdx;
            else if (M != -1 && M == FixFreeIdx)
              M = FixIdx;
        };
        if (NumFlippedBToBInputs != 0) {
          int BPinnedIdx =
              BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
          FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
        } else {
          assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
          int APinnedIdx =
              AToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
          FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
        }
      }
    }

    int PSHUFDMask[] = {0, 1, 2, 3};
    PSHUFDMask[ADWord] = BDWord;
    PSHUFDMask[BDWord] = ADWord;
    V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
                    DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
                                DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V),
                                getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));

    // Adjust the mask to match the new locations of A and B.
    for (int &M : Mask)
      if (M != -1 && M / 2 == ADWord)
        M = 2 * BDWord + M % 2;
      else if (M != -1 && M / 2 == BDWord)
        M = 2 * ADWord + M % 2;

    // Recurse back into this routine to re-compute state now that this isn't
    // a 3 and 1 problem.
    return DAG.getVectorShuffle(MVT::v8i16, DL, V, DAG.getUNDEF(MVT::v8i16),
                                Mask);
  };
  if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
    return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
  else if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
    return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);

  // At this point there are at most two inputs to the low and high halves from
  // each half. That means the inputs can always be grouped into dwords and
  // those dwords can then be moved to the correct half with a dword shuffle.
  // We use at most one low and one high word shuffle to collect these paired
  // inputs into dwords, and finally a dword shuffle to place them.
  int PSHUFLMask[4] = {-1, -1, -1, -1};
  int PSHUFHMask[4] = {-1, -1, -1, -1};
  int PSHUFDMask[4] = {-1, -1, -1, -1};

  // First fix the masks for all the inputs that are staying in their
  // original halves. This will then dictate the targets of the cross-half
  // shuffles.
  auto fixInPlaceInputs =
      [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
                    MutableArrayRef<int> SourceHalfMask,
                    MutableArrayRef<int> HalfMask, int HalfOffset) {
    if (InPlaceInputs.empty())
      return;
    if (InPlaceInputs.size() == 1) {
      SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
          InPlaceInputs[0] - HalfOffset;
      PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
      return;
    }
    if (IncomingInputs.empty()) {
      // Just fix all of the in place inputs.
      for (int Input : InPlaceInputs) {
        SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
        PSHUFDMask[Input / 2] = Input / 2;
      }
      return;
    }

    assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
    SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
        InPlaceInputs[0] - HalfOffset;
    // Put the second input next to the first so that they are packed into
    // a dword. We find the adjacent index by toggling the low bit.
    int AdjIndex = InPlaceInputs[0] ^ 1;
    SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
    std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
    PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
  };
  fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
  fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);

9297 // Now gather the cross-half inputs and place them into a free dword of
9298 // their target half.
9299 // FIXME: This operation could almost certainly be simplified dramatically to
9300 // look more like the 3-1 fixing operation.
9301 auto moveInputsToRightHalf = [&PSHUFDMask](
9302 MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
9303 MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
9304 MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
9306 auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
9307 return SourceHalfMask[Word] != -1 && SourceHalfMask[Word] != Word;
9309 auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
9311 int LowWord = Word & ~1;
9312 int HighWord = Word | 1;
9313 return isWordClobbered(SourceHalfMask, LowWord) ||
9314 isWordClobbered(SourceHalfMask, HighWord);
9317 if (IncomingInputs.empty())
9320 if (ExistingInputs.empty()) {
9321 // Map any dwords with inputs from them into the right half.
9322 for (int Input : IncomingInputs) {
9323 // If the source half mask maps over the inputs, turn those into
9324 // swaps and use the swapped lane.
9325 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
9326 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] == -1) {
9327 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
9328 Input - SourceOffset;
9329 // We have to swap the uses in our half mask in one sweep.
9330 for (int &M : HalfMask)
9331 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
9333 else if (M == Input)
9334 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
9336 assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
9337 Input - SourceOffset &&
9338 "Previous placement doesn't match!");
9340 // Note that this correctly re-maps both when we do a swap and when
9341 // we observe the other side of the swap above. We rely on that to
9342 // avoid swapping the members of the input list directly.
9343 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
9346 // Map the input's dword into the correct half.
9347 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] == -1)
9348 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
9350 assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
9352 "Previous placement doesn't match!");
9355 // And just directly shift any other-half mask elements to be same-half
9356 // as we will have mirrored the dword containing the element into the
9357 // same position within that half.
9358 for (int &M : HalfMask)
9359 if (M >= SourceOffset && M < SourceOffset + 4) {
9360 M = M - SourceOffset + DestOffset;
9361 assert(M >= 0 && "This should never wrap below zero!");
    // Ensure we have the input in a viable dword of its current half. This
    // is particularly tricky because the original position may be clobbered
    // by inputs being moved and *staying* in that half.
    if (IncomingInputs.size() == 1) {
      if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
        int InputFixed = std::find(std::begin(SourceHalfMask),
                                   std::end(SourceHalfMask), -1) -
                         std::begin(SourceHalfMask) + SourceOffset;
        SourceHalfMask[InputFixed - SourceOffset] =
            IncomingInputs[0] - SourceOffset;
        std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
                     InputFixed);
        IncomingInputs[0] = InputFixed;
      }
    } else if (IncomingInputs.size() == 2) {
      if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
          isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
        // We have two non-adjacent or clobbered inputs we need to extract from
        // the source half. To do this, we need to map them into some adjacent
        // dword slot in the source mask.
        int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
                              IncomingInputs[1] - SourceOffset};

        // If there is a free slot in the source half mask adjacent to one of
        // the inputs, place the other input in it. We use (Index XOR 1) to
        // compute an adjacent index.
        if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
            SourceHalfMask[InputsFixed[0] ^ 1] == -1) {
          SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
          SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
          InputsFixed[1] = InputsFixed[0] ^ 1;
        } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
                   SourceHalfMask[InputsFixed[1] ^ 1] == -1) {
          SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
          SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
          InputsFixed[0] = InputsFixed[1] ^ 1;
        } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] == -1 &&
                   SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] == -1) {
          // The two inputs are in the same DWord but it is clobbered and the
          // adjacent DWord isn't used at all. Move both inputs to the free
          // slot.
          SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
          SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
          InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
          InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
        } else {
          // The only way we hit this point is if there is no clobbering
          // (because there are no off-half inputs to this half) and there is no
          // free slot adjacent to one of the inputs. In this case, we have to
          // swap an input with a non-input.
          for (int i = 0; i < 4; ++i)
            assert((SourceHalfMask[i] == -1 || SourceHalfMask[i] == i) &&
                   "We can't handle any clobbers here!");
          assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
                 "Cannot have adjacent inputs here!");

          SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
          SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;

          // We also have to update the final source mask in this case because
          // it may need to undo the above swap.
          for (int &M : FinalSourceHalfMask)
            if (M == (InputsFixed[0] ^ 1) + SourceOffset)
              M = InputsFixed[1] + SourceOffset;
            else if (M == InputsFixed[1] + SourceOffset)
              M = (InputsFixed[0] ^ 1) + SourceOffset;

          InputsFixed[1] = InputsFixed[0] ^ 1;
        }

        // Point everything at the fixed inputs.
        for (int &M : HalfMask)
          if (M == IncomingInputs[0])
            M = InputsFixed[0] + SourceOffset;
          else if (M == IncomingInputs[1])
            M = InputsFixed[1] + SourceOffset;

        IncomingInputs[0] = InputsFixed[0] + SourceOffset;
        IncomingInputs[1] = InputsFixed[1] + SourceOffset;
      }
    } else {
      llvm_unreachable("Unhandled input size!");
    }
    // Now hoist the DWord down to the right half.
    int FreeDWord = (PSHUFDMask[DestOffset / 2] == -1 ? 0 : 1) + DestOffset / 2;
    assert(PSHUFDMask[FreeDWord] == -1 && "DWord not free");
    PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
    for (int &M : HalfMask)
      for (int Input : IncomingInputs)
        if (M == Input)
          M = FreeDWord * 2 + Input % 2;
  };
  moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
                        /*SourceOffset*/ 4, /*DestOffset*/ 0);
  moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
                        /*SourceOffset*/ 0, /*DestOffset*/ 4);
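  // Illustrative note (not from the original source): for a low-half mask
  // such as LoMask = <4, 1, 2, 3>, the single high-half input (element 4)
  // lives in a high dword, so moveInputsToRightHalf records that dword in
  // the PSHUFD mask and rewrites LoMask to reference the copy that the
  // PSHUFD below mirrors into the low half.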
  // Now enact all the shuffles we've computed to move the inputs into their
  // target half.
  if (!isNoopShuffleMask(PSHUFLMask))
    V = DAG.getNode(X86ISD::PSHUFLW, DL, MVT::v8i16, V,
                    getV4X86ShuffleImm8ForMask(PSHUFLMask, DAG));
  if (!isNoopShuffleMask(PSHUFHMask))
    V = DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16, V,
                    getV4X86ShuffleImm8ForMask(PSHUFHMask, DAG));
  if (!isNoopShuffleMask(PSHUFDMask))
    V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
                    DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
                                DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V),
                                getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));

  // At this point, each half should contain all its inputs, and we can then
  // just shuffle them into their final position.
  assert(std::count_if(LoMask.begin(), LoMask.end(),
                       [](int M) { return M >= 4; }) == 0 &&
         "Failed to lift all the high half inputs to the low mask!");
  assert(std::count_if(HiMask.begin(), HiMask.end(),
                       [](int M) { return M >= 0 && M < 4; }) == 0 &&
         "Failed to lift all the low half inputs to the high mask!");

  // Do a half shuffle for the low mask.
  if (!isNoopShuffleMask(LoMask))
    V = DAG.getNode(X86ISD::PSHUFLW, DL, MVT::v8i16, V,
                    getV4X86ShuffleImm8ForMask(LoMask, DAG));

  // Do a half shuffle with the high mask after shifting its values down.
  for (int &M : HiMask)
    if (M >= 0)
      M -= 4;
  if (!isNoopShuffleMask(HiMask))
    V = DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16, V,
                    getV4X86ShuffleImm8ForMask(HiMask, DAG));

  return V;
}

/// \brief Detect whether the mask pattern should be lowered through
/// interleaving.
///
/// This essentially tests whether viewing the mask as an interleaving of two
/// sub-sequences reduces the cross-input traffic of a blend operation. If so,
/// lowering it through interleaving is a significantly better strategy.
static bool shouldLowerAsInterleaving(ArrayRef<int> Mask) {
  int NumEvenInputs[2] = {0, 0};
  int NumOddInputs[2] = {0, 0};
  int NumLoInputs[2] = {0, 0};
  int NumHiInputs[2] = {0, 0};
  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
    if (Mask[i] < 0)
      continue;

    int InputIdx = Mask[i] >= Size;

    if (i % Size < Size / 2)
      ++NumLoInputs[InputIdx];
    else
      ++NumHiInputs[InputIdx];

    if (i % 2 == 0)
      ++NumEvenInputs[InputIdx];
    else
      ++NumOddInputs[InputIdx];
  }

  // The minimum number of cross-input results for both the interleaved and
  // split cases. If interleaving results in fewer cross-input results, return
  // true.
  int InterleavedCrosses = std::min(NumEvenInputs[1] + NumOddInputs[0],
                                    NumEvenInputs[0] + NumOddInputs[1]);
  int SplitCrosses = std::min(NumLoInputs[1] + NumHiInputs[0],
                              NumLoInputs[0] + NumHiInputs[1]);
  return InterleavedCrosses < SplitCrosses;
}
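
// Worked example (illustrative, not from the original source): for the v8i16
// mask <0, 8, 1, 9, 2, 10, 3, 11>, every even output comes from V1 and every
// odd output from V2, so InterleavedCrosses is 0 while SplitCrosses is 4;
// interleaving wins and the mask ends up lowered through an unpack.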
/// \brief Blend two v8i16 vectors using a naive unpack strategy.
///
/// This strategy only works when the inputs from each vector fit into a single
/// half of that vector, and generally there are not so many inputs as to leave
/// the in-place shuffles required highly constrained (and thus expensive). It
/// shifts all the inputs into a single side of both input vectors and then
/// uses an unpack to interleave these inputs in a single vector. At that
/// point, we will fall back on the generic single input shuffle lowering.
static SDValue lowerV8I16BasicBlendVectorShuffle(SDLoc DL, SDValue V1,
                                                 SDValue V2,
                                                 MutableArrayRef<int> Mask,
                                                 const X86Subtarget *Subtarget,
                                                 SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad input type!");
  assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad input type!");
  SmallVector<int, 3> LoV1Inputs, HiV1Inputs, LoV2Inputs, HiV2Inputs;
  for (int i = 0; i < 8; ++i)
    if (Mask[i] >= 0 && Mask[i] < 4)
      LoV1Inputs.push_back(i);
    else if (Mask[i] >= 4 && Mask[i] < 8)
      HiV1Inputs.push_back(i);
    else if (Mask[i] >= 8 && Mask[i] < 12)
      LoV2Inputs.push_back(i);
    else if (Mask[i] >= 12)
      HiV2Inputs.push_back(i);

  int NumV1Inputs = LoV1Inputs.size() + HiV1Inputs.size();
  int NumV2Inputs = LoV2Inputs.size() + HiV2Inputs.size();

  assert(NumV1Inputs > 0 && NumV1Inputs <= 3 && "At most 3 inputs supported");
  assert(NumV2Inputs > 0 && NumV2Inputs <= 3 && "At most 3 inputs supported");
  assert(NumV1Inputs + NumV2Inputs <= 4 && "At most 4 combined inputs");

  bool MergeFromLo = LoV1Inputs.size() + LoV2Inputs.size() >=
                     HiV1Inputs.size() + HiV2Inputs.size();

  auto moveInputsToHalf = [&](SDValue V, ArrayRef<int> LoInputs,
                              ArrayRef<int> HiInputs, bool MoveToLo,
                              int MaskOffset) {
    ArrayRef<int> GoodInputs = MoveToLo ? LoInputs : HiInputs;
    ArrayRef<int> BadInputs = MoveToLo ? HiInputs : LoInputs;
    if (BadInputs.empty())
      return V;

    int MoveMask[] = {-1, -1, -1, -1, -1, -1, -1, -1};
    int MoveOffset = MoveToLo ? 0 : 4;

    if (GoodInputs.empty()) {
      for (int BadInput : BadInputs) {
        MoveMask[Mask[BadInput] % 4 + MoveOffset] = Mask[BadInput] - MaskOffset;
        Mask[BadInput] = Mask[BadInput] % 4 + MoveOffset + MaskOffset;
      }
    } else {
      if (GoodInputs.size() == 2) {
        // If the low inputs are spread across two dwords, pack them into
        // a single dword.
        MoveMask[MoveOffset] = Mask[GoodInputs[0]] - MaskOffset;
        MoveMask[MoveOffset + 1] = Mask[GoodInputs[1]] - MaskOffset;
        Mask[GoodInputs[0]] = MoveOffset + MaskOffset;
        Mask[GoodInputs[1]] = MoveOffset + 1 + MaskOffset;
      } else {
        // Otherwise pin the good inputs.
        for (int GoodInput : GoodInputs)
          MoveMask[Mask[GoodInput] - MaskOffset] = Mask[GoodInput] - MaskOffset;
      }

      if (BadInputs.size() == 2) {
        // If we have two bad inputs then there may be either one or two good
        // inputs fixed in place. Find a fixed input, and then find the *other*
        // two adjacent indices by using modular arithmetic.
        int GoodMaskIdx =
            std::find_if(std::begin(MoveMask) + MoveOffset, std::end(MoveMask),
                         [](int M) { return M >= 0; }) -
            std::begin(MoveMask);
        int MoveMaskIdx =
            ((((GoodMaskIdx - MoveOffset) & ~1) + 2) % 4) + MoveOffset;
        assert(MoveMask[MoveMaskIdx] == -1 && "Expected empty slot");
        assert(MoveMask[MoveMaskIdx + 1] == -1 && "Expected empty slot");
        MoveMask[MoveMaskIdx] = Mask[BadInputs[0]] - MaskOffset;
        MoveMask[MoveMaskIdx + 1] = Mask[BadInputs[1]] - MaskOffset;
        Mask[BadInputs[0]] = MoveMaskIdx + MaskOffset;
        Mask[BadInputs[1]] = MoveMaskIdx + 1 + MaskOffset;
      } else {
        assert(BadInputs.size() == 1 && "All sizes handled");
        int MoveMaskIdx = std::find(std::begin(MoveMask) + MoveOffset,
                                    std::end(MoveMask), -1) -
                          std::begin(MoveMask);
        MoveMask[MoveMaskIdx] = Mask[BadInputs[0]] - MaskOffset;
        Mask[BadInputs[0]] = MoveMaskIdx + MaskOffset;
      }
    }

    return DAG.getVectorShuffle(MVT::v8i16, DL, V, DAG.getUNDEF(MVT::v8i16),
                                MoveMask);
  };
  V1 = moveInputsToHalf(V1, LoV1Inputs, HiV1Inputs, MergeFromLo,
                        /*MaskOffset*/ 0);
  V2 = moveInputsToHalf(V2, LoV2Inputs, HiV2Inputs, MergeFromLo,
                        /*MaskOffset*/ 8);

  // FIXME: Select an interleaving of the merge of V1 and V2 that minimizes
  // cross-half traffic in the final shuffle.

  // Munge the mask to be a single-input mask after the unpack merges the
  // halves.
  for (int &M : Mask)
    if (M != -1)
      M = 2 * (M % 4) + (M / 8);

  return DAG.getVectorShuffle(
      MVT::v8i16, DL, DAG.getNode(MergeFromLo ? X86ISD::UNPCKL : X86ISD::UNPCKH,
                                  DL, MVT::v8i16, V1, V2),
      DAG.getUNDEF(MVT::v8i16), Mask);
}
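
// Worked example (illustrative, not from the original source): after UNPCKL
// merges the low halves, the element that was word 1 of V2 (mask value 9)
// sits at word 2 * (9 % 4) + (9 / 8) = 3 of the merged vector, which is
// exactly what the munge formula above computes.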
/// \brief Generic lowering of 8-lane i16 shuffles.
///
/// This handles both single-input shuffles and combined shuffle/blends with
/// two inputs. The single input shuffles are immediately delegated to
/// a dedicated lowering routine.
///
/// The blends are lowered in one of three fundamental ways. If there are few
/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
/// of the input is significantly cheaper when lowered as an interleaving of
/// the two inputs, try to interleave them. Otherwise, blend the low and high
/// halves of the inputs separately (making them have relatively few inputs)
/// and then concatenate them.
static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                       const X86Subtarget *Subtarget,
                                       SelectionDAG &DAG) {
  SDLoc DL(Op);
  assert(Op.getSimpleValueType() == MVT::v8i16 && "Bad shuffle type!");
  assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
  ArrayRef<int> OrigMask = SVOp->getMask();
  int MaskStorage[8] = {OrigMask[0], OrigMask[1], OrigMask[2], OrigMask[3],
                        OrigMask[4], OrigMask[5], OrigMask[6], OrigMask[7]};
  MutableArrayRef<int> Mask(MaskStorage);

  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");

  // Whenever we can lower this as a zext, that instruction is strictly faster
  // than any alternative.
  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
          DL, MVT::v8i16, V1, V2, OrigMask, Subtarget, DAG))
    return ZExt;

  auto isV1 = [](int M) { return M >= 0 && M < 8; };
  auto isV2 = [](int M) { return M >= 8; };

  int NumV1Inputs = std::count_if(Mask.begin(), Mask.end(), isV1);
  int NumV2Inputs = std::count_if(Mask.begin(), Mask.end(), isV2);

  if (NumV2Inputs == 0)
    return lowerV8I16SingleInputVectorShuffle(DL, V1, Mask, Subtarget, DAG);

  assert(NumV1Inputs > 0 && "All single-input shuffles should be canonicalized "
                            "to be V1-input shuffles.");

  // Try to use bit shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsBitShift(
          DL, MVT::v8i16, V1, V2, Mask, DAG))
    return Shift;

  // Try to use byte shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsByteShift(
          DL, MVT::v8i16, V1, V2, Mask, DAG))
    return Shift;

  // There are special ways we can lower some single-element blends.
  if (NumV2Inputs == 1)
    if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v8i16, DL, V1, V2,
                                                         Mask, Subtarget, DAG))
      return V;

  // We have different paths for blend lowering, but they all must use the
  // *exact* same predicate.
  bool IsBlendSupported = Subtarget->hasSSE41();
  if (IsBlendSupported)
    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
                                                  Subtarget, DAG))
      return Blend;

  if (SDValue Masked =
          lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask, DAG))
    return Masked;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (isShuffleEquivalent(V1, V2, Mask, 0, 8, 1, 9, 2, 10, 3, 11))
    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V1, V2);
  if (isShuffleEquivalent(V1, V2, Mask, 4, 12, 5, 13, 6, 14, 7, 15))
    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V1, V2);

  // Try to use byte rotation instructions.
  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
          DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
    return Rotate;

  if (NumV1Inputs + NumV2Inputs <= 4)
    return lowerV8I16BasicBlendVectorShuffle(DL, V1, V2, Mask, Subtarget, DAG);

  // Check whether an interleaving lowering is likely to be more efficient.
  // This isn't perfect but it is a strong heuristic that tends to work well on
  // the kinds of shuffles that show up in practice.
  //
  // FIXME: Handle 1x, 2x, and 4x interleaving.
  if (shouldLowerAsInterleaving(Mask)) {
    // FIXME: Figure out whether we should pack these into the low or high
    // halves.

    int EMask[8], OMask[8];
    for (int i = 0; i < 4; ++i) {
      EMask[i] = Mask[2*i];
      OMask[i] = Mask[2*i + 1];
      EMask[i + 4] = -1;
      OMask[i + 4] = -1;
    }

    SDValue Evens = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, EMask);
    SDValue Odds = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, OMask);

    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, Evens, Odds);
  }

  // If we have direct support for blends, we should lower by decomposing into
  // a permute.
  if (IsBlendSupported)
    return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2,
                                                      Mask, DAG);

  // Try to lower by permuting the inputs into an unpack instruction.
  if (SDValue Unpack =
          lowerVectorShuffleAsUnpack(MVT::v8i16, DL, V1, V2, Mask, DAG))
    return Unpack;

  int LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
  int HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};

  for (int i = 0; i < 4; ++i) {
    LoBlendMask[i] = Mask[i];
    HiBlendMask[i] = Mask[i + 4];
  }

  SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, LoBlendMask);
  SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, HiBlendMask);
  LoV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, LoV);
  HiV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, HiV);

  return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
                     DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, LoV, HiV));
}
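
// Note (illustrative, not from the original source): the final fallback above
// computes the four low outputs and the four high outputs as two independent
// half-width shuffles, each leaving its results in words 0-3, and then glues
// them together with a single v2i64 UNPCKL, which places one 64-bit half in
// each half of the result.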
/// \brief Check whether a compaction lowering can be done by dropping even
/// elements and compute how many times even elements must be dropped.
///
/// This handles shuffles which take every Nth element where N is a power of
/// two. Example shuffle masks:
///
///  N = 1:  0,  2,  4,  6,  8, 10, 12, 14,  0,  2,  4,  6,  8, 10, 12, 14
///  N = 1:  0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
///  N = 2:  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12
///  N = 2:  0,  4,  8, 12, 16, 20, 24, 28,  0,  4,  8, 12, 16, 20, 24, 28
///  N = 3:  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8
///  N = 3:  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24
///
/// Any of these lanes can of course be undef.
///
/// This routine only supports N <= 3.
/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
/// for larger N.
///
/// \returns N above, or the number of times even elements must be dropped if
/// there is such a number. Otherwise returns zero.
static int canLowerByDroppingEvenElements(ArrayRef<int> Mask) {
  // Figure out whether we're looping over two inputs or just one.
  bool IsSingleInput = isSingleInputShuffleMask(Mask);

  // The modulus for the shuffle vector entries is based on whether this is
  // a single input or not.
  int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
  assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
         "We should only be called with masks with a power-of-2 size!");

  uint64_t ModMask = (uint64_t)ShuffleModulus - 1;

  // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
  // and 2^3 simultaneously. This is because we may have ambiguity with
  // partially undef inputs.
  bool ViableForN[3] = {true, true, true};

  for (int i = 0, e = Mask.size(); i < e; ++i) {
    // Ignore undef lanes, we'll optimistically collapse them to the pattern we
    // expect.
    if (Mask[i] == -1)
      continue;

    bool IsAnyViable = false;
    for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
      if (ViableForN[j]) {
        uint64_t N = j + 1;

        // The shuffle mask must be equal to (i * 2^N) % M.
        if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
          IsAnyViable = true;
        else
          ViableForN[j] = false;
      }
    // Early exit if we exhaust the possible powers of two.
    if (!IsAnyViable)
      break;
  }

  for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
    if (ViableForN[j])
      return j + 1;

  // Return 0 as there is no viable power of two.
  return 0;
}
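
// Worked example (illustrative, not from the original source): for the N = 1
// single-input mask above, Mask.size() = 16, so ModMask = 15 and position i
// must map to (i << 1) & 15; position 9 maps to 18 & 15 = 2, matching the
// second repetition of 0, 2, 4, ... in that mask.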
/// \brief Generic lowering of v16i8 shuffles.
///
/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
/// detect any complexity reducing interleaving. If that doesn't help, it uses
/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
/// back together.
static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                       const X86Subtarget *Subtarget,
                                       SelectionDAG &DAG) {
  SDLoc DL(Op);
  assert(Op.getSimpleValueType() == MVT::v16i8 && "Bad shuffle type!");
  assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
  ArrayRef<int> OrigMask = SVOp->getMask();
  assert(OrigMask.size() == 16 && "Unexpected mask size for v16 shuffle!");

  // Try to use bit shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsBitShift(
          DL, MVT::v16i8, V1, V2, OrigMask, DAG))
    return Shift;

  // Try to use byte shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsByteShift(
          DL, MVT::v16i8, V1, V2, OrigMask, DAG))
    return Shift;

  // Try to use byte rotation instructions.
  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
          DL, MVT::v16i8, V1, V2, OrigMask, Subtarget, DAG))
    return Rotate;

  // Try to use a zext lowering.
  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
          DL, MVT::v16i8, V1, V2, OrigMask, Subtarget, DAG))
    return ZExt;

  int MaskStorage[16] = {
      OrigMask[0],  OrigMask[1],  OrigMask[2],  OrigMask[3],
      OrigMask[4],  OrigMask[5],  OrigMask[6],  OrigMask[7],
      OrigMask[8],  OrigMask[9],  OrigMask[10], OrigMask[11],
      OrigMask[12], OrigMask[13], OrigMask[14], OrigMask[15]};
  MutableArrayRef<int> Mask(MaskStorage);
  MutableArrayRef<int> LoMask = Mask.slice(0, 8);
  MutableArrayRef<int> HiMask = Mask.slice(8, 8);

  int NumV2Elements =
      std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 16; });

  // For single-input shuffles, there are some nicer lowering tricks we can use.
  if (NumV2Elements == 0) {
    // Check for being able to broadcast a single element.
    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v16i8, DL, V1,
                                                          Mask, Subtarget, DAG))
      return Broadcast;

    // Check whether we can widen this to an i16 shuffle by duplicating bytes.
    // Notably, this handles splat and partial-splat shuffles more efficiently.
    // However, it only makes sense if the pre-duplication shuffle simplifies
    // things significantly. Currently, this means we need to be able to
    // express the pre-duplication shuffle as an i16 shuffle.
    //
    // FIXME: We should check for other patterns which can be widened into an
    // i16 shuffle as well.
    auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
      for (int i = 0; i < 16; i += 2)
        if (Mask[i] != -1 && Mask[i + 1] != -1 && Mask[i] != Mask[i + 1])
          return false;

      return true;
    };
    auto tryToWidenViaDuplication = [&]() -> SDValue {
      if (!canWidenViaDuplication(Mask))
        return SDValue();
      SmallVector<int, 4> LoInputs;
      std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(LoInputs),
                   [](int M) { return M >= 0 && M < 8; });
      std::sort(LoInputs.begin(), LoInputs.end());
      LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
                     LoInputs.end());
      SmallVector<int, 4> HiInputs;
      std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(HiInputs),
                   [](int M) { return M >= 8; });
      std::sort(HiInputs.begin(), HiInputs.end());
      HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
                     HiInputs.end());

      bool TargetLo = LoInputs.size() >= HiInputs.size();
      ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
      ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;

      int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
      SmallDenseMap<int, int, 8> LaneMap;
      for (int I : InPlaceInputs) {
        PreDupI16Shuffle[I/2] = I/2;
        LaneMap[I] = I;
      }
      int j = TargetLo ? 0 : 4, je = j + 4;
      for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
        // Check if j is already a shuffle of this input. This happens when
        // there are two adjacent bytes after we move the low one.
        if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
          // If we haven't yet mapped the input, search for a slot into which
          // we can map it.
          while (j < je && PreDupI16Shuffle[j] != -1)
            ++j;

          if (j == je)
            // We can't place the inputs into a single half with a simple
            // i16 shuffle, so bail.
            return SDValue();

          // Map this input with the i16 shuffle.
          PreDupI16Shuffle[j] = MovingInputs[i] / 2;
        }

        // Update the lane map based on the mapping we ended up with.
        LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
      }
      V1 = DAG.getNode(
          ISD::BITCAST, DL, MVT::v16i8,
          DAG.getVectorShuffle(MVT::v8i16, DL,
                               DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1),
                               DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));

      // Unpack the bytes to form the i16s that will be shuffled into place.
      V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
                       MVT::v16i8, V1, V1);

      int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
      for (int i = 0; i < 16; ++i)
        if (Mask[i] != -1) {
          int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
          assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
          if (PostDupI16Shuffle[i / 2] == -1)
            PostDupI16Shuffle[i / 2] = MappedMask;
          else
            assert(PostDupI16Shuffle[i / 2] == MappedMask &&
                   "Conflicting entries in the original shuffle!");
        }
      return DAG.getNode(
          ISD::BITCAST, DL, MVT::v16i8,
          DAG.getVectorShuffle(MVT::v8i16, DL,
                               DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1),
                               DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
    };
    if (SDValue V = tryToWidenViaDuplication())
      return V;
  }
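  // Worked example (illustrative, not from the original source): the
  // partial-splat mask <5, 5, 5, 5, 5, 5, 5, 5, 9, 9, 9, 9, 9, 9, 9, 9>
  // passes canWidenViaDuplication because each adjacent byte pair agrees; it
  // becomes a v8i16 shuffle that gathers word 2 (bytes 4-5) and word 4
  // (bytes 8-9) into the low half, followed by a byte UNPCKL that duplicates
  // each source byte into an adjacent pair.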
  // Check whether an interleaving lowering is likely to be more efficient.
  // This isn't perfect but it is a strong heuristic that tends to work well on
  // the kinds of shuffles that show up in practice.
  //
  // FIXME: We need to handle other interleaving widths (i16, i32, ...).
  if (shouldLowerAsInterleaving(Mask)) {
    int NumLoHalf = std::count_if(Mask.begin(), Mask.end(), [](int M) {
      return (M >= 0 && M < 8) || (M >= 16 && M < 24);
    });
    int NumHiHalf = std::count_if(Mask.begin(), Mask.end(), [](int M) {
      return (M >= 8 && M < 16) || M >= 24;
    });

    int EMask[16] = {-1, -1, -1, -1, -1, -1, -1, -1,
                     -1, -1, -1, -1, -1, -1, -1, -1};
    int OMask[16] = {-1, -1, -1, -1, -1, -1, -1, -1,
                     -1, -1, -1, -1, -1, -1, -1, -1};
    bool UnpackLo = NumLoHalf >= NumHiHalf;
    MutableArrayRef<int> TargetEMask(UnpackLo ? EMask : EMask + 8, 8);
    MutableArrayRef<int> TargetOMask(UnpackLo ? OMask : OMask + 8, 8);
    for (int i = 0; i < 8; ++i) {
      TargetEMask[i] = Mask[2 * i];
      TargetOMask[i] = Mask[2 * i + 1];
    }

    SDValue Evens = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, EMask);
    SDValue Odds = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, OMask);

    return DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
                       MVT::v16i8, Evens, Odds);
  }
  // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
  // with PSHUFB. It is important to do this before we attempt to generate any
  // blends but after all of the single-input lowerings. If the single input
  // lowerings can find an instruction sequence that is faster than a PSHUFB, we
  // want to preserve that and we can DAG combine any longer sequences into
  // a PSHUFB in the end. But once we start blending from multiple inputs,
  // the complexity of DAG combining bad patterns back into PSHUFB is too high,
  // and there are *very* few patterns that would actually be faster than the
  // PSHUFB approach because of its ability to zero lanes.
  //
  // FIXME: The only exceptions to the above are blends which are exact
  // interleavings with direct instructions supporting them. We currently don't
  // handle those well here.
  if (Subtarget->hasSSSE3()) {
    SDValue V1Mask[16];
    SDValue V2Mask[16];
    bool V1InUse = false;
    bool V2InUse = false;
    SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);

    for (int i = 0; i < 16; ++i) {
      if (Mask[i] == -1) {
        V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8);
      } else {
        const int ZeroMask = 0x80;
        int V1Idx = (Mask[i] < 16 ? Mask[i] : ZeroMask);
        int V2Idx = (Mask[i] < 16 ? ZeroMask : Mask[i] - 16);
        if (Zeroable[i])
          V1Idx = V2Idx = ZeroMask;
        V1Mask[i] = DAG.getConstant(V1Idx, MVT::i8);
        V2Mask[i] = DAG.getConstant(V2Idx, MVT::i8);
        V1InUse |= (ZeroMask != V1Idx);
        V2InUse |= (ZeroMask != V2Idx);
      }
    }

    // If both V1 and V2 are in use and we can use a direct blend, do so. This
    // avoids using blends to handle blends-with-zero which is important as
    // a single pshufb is significantly faster for that.
    if (V1InUse && V2InUse && Subtarget->hasSSE41())
      if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i8, V1, V2,
                                                    Mask, Subtarget, DAG))
        return Blend;

    if (V1InUse)
      V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, V1,
                       DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V1Mask));
    if (V2InUse)
      V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, V2,
                       DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V2Mask));

    // If we need shuffled inputs from both, blend the two.
    if (V1InUse && V2InUse)
      return DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
    if (V1InUse)
      return V1; // Single inputs are easy.
    if (V2InUse)
      return V2; // Single inputs are easy.
    // Shuffling to a zeroable vector.
    return getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
  }
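  // Note (illustrative, not from the original source): PSHUFB treats any
  // control byte with the high bit set as "write zero", which is why 0x80 is
  // used as the ZeroMask above. Each input gets a control vector that selects
  // its own bytes and zeroes the lanes owned by the other input, so a plain
  // OR is enough to merge the two results.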
  // There are special ways we can lower some single-element blends.
  if (NumV2Elements == 1)
    if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v16i8, DL, V1, V2,
                                                         Mask, Subtarget, DAG))
      return V;

  // Check whether a compaction lowering can be done. This handles shuffles
  // which take every Nth element for some even N. See the helper function for
  // details.
  //
  // We special case these as they can be particularly efficiently handled with
  // the PACKUSWB instruction on x86 and they show up in common patterns of
  // rearranging bytes to truncate wide elements.
  if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask)) {
    // NumEvenDrops is the power of two stride of the elements. Another way of
    // thinking about it is that we need to drop the even elements this many
    // times to get the original input.
    bool IsSingleInput = isSingleInputShuffleMask(Mask);

    // First we need to zero all the dropped bytes.
    assert(NumEvenDrops <= 3 &&
           "No support for dropping even elements more than 3 times.");
    // We use the mask type to pick which bytes are preserved based on how many
    // elements are dropped.
    MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 };
    SDValue ByteClearMask =
        DAG.getNode(ISD::BITCAST, DL, MVT::v16i8,
                    DAG.getConstant(0xFF, MaskVTs[NumEvenDrops - 1]));
    V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);
    if (!IsSingleInput)
      V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);

    // Now pack things back together.
    V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1);
    V2 = IsSingleInput ? V1 : DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V2);
    SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2);
    for (int i = 1; i < NumEvenDrops; ++i) {
      Result = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, Result);
      Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
    }

    return Result;
  }
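  // Worked example (illustrative, not from the original source): with
  // NumEvenDrops == 1 the clear mask is v8i16 0x00FF, i.e. every odd byte is
  // zeroed, and one PACKUSWB then keeps exactly the even bytes of both
  // inputs: <0, 2, 4, ..., 14> of V1 followed by the same bytes of V2.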
  int V1LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
  int V1HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
  int V2LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
  int V2HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};

  auto buildBlendMasks = [](MutableArrayRef<int> HalfMask,
                            MutableArrayRef<int> V1HalfBlendMask,
                            MutableArrayRef<int> V2HalfBlendMask) {
    for (int i = 0; i < 8; ++i)
      if (HalfMask[i] >= 0 && HalfMask[i] < 16) {
        V1HalfBlendMask[i] = HalfMask[i];
        HalfMask[i] = i;
      } else if (HalfMask[i] >= 16) {
        V2HalfBlendMask[i] = HalfMask[i] - 16;
        HalfMask[i] = i + 8;
      }
  };
  buildBlendMasks(LoMask, V1LoBlendMask, V2LoBlendMask);
  buildBlendMasks(HiMask, V1HiBlendMask, V2HiBlendMask);

  SDValue Zero = getZeroVector(MVT::v8i16, Subtarget, DAG, DL);

  auto buildLoAndHiV8s = [&](SDValue V, MutableArrayRef<int> LoBlendMask,
                             MutableArrayRef<int> HiBlendMask) {
    SDValue V1, V2;
    // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
    // them out and avoid using UNPCK{L,H} to extract the elements of V as
    // i16s.
    if (std::none_of(LoBlendMask.begin(), LoBlendMask.end(),
                     [](int M) { return M >= 0 && M % 2 == 1; }) &&
        std::none_of(HiBlendMask.begin(), HiBlendMask.end(),
                     [](int M) { return M >= 0 && M % 2 == 1; })) {
      // Use a mask to drop the high bytes.
      V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V);
      V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, V1,
                       DAG.getConstant(0x00FF, MVT::v8i16));

      // This will be a single vector shuffle instead of a blend so nuke V2.
      V2 = DAG.getUNDEF(MVT::v8i16);

      // Squash the masks to point directly into V1.
      for (int &M : LoBlendMask)
        if (M >= 0)
          M /= 2;
      for (int &M : HiBlendMask)
        if (M >= 0)
          M /= 2;
    } else {
      // Otherwise just unpack the low half of V into V1 and the high half into
      // V2 so that we can blend them as i16s.
      V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
                       DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
      V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
                       DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
    }

    SDValue BlendedLo = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, LoBlendMask);
    SDValue BlendedHi = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, HiBlendMask);
    return std::make_pair(BlendedLo, BlendedHi);
  };
  SDValue V1Lo, V1Hi, V2Lo, V2Hi;
  std::tie(V1Lo, V1Hi) = buildLoAndHiV8s(V1, V1LoBlendMask, V1HiBlendMask);
  std::tie(V2Lo, V2Hi) = buildLoAndHiV8s(V2, V2LoBlendMask, V2HiBlendMask);

  SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, V1Lo, V2Lo, LoMask);
  SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, V1Hi, V2Hi, HiMask);

  return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
}
/// \brief Dispatching routine to lower various 128-bit x86 vector shuffles.
///
/// This routine breaks down the specific type of 128-bit shuffle and
/// dispatches to the lowering routines accordingly.
static SDValue lower128BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                        MVT VT, const X86Subtarget *Subtarget,
                                        SelectionDAG &DAG) {
  switch (VT.SimpleTy) {
  case MVT::v2i64:
    return lowerV2I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
  case MVT::v2f64:
    return lowerV2F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
  case MVT::v4i32:
    return lowerV4I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
  case MVT::v4f32:
    return lowerV4F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
  case MVT::v8i16:
    return lowerV8I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
  case MVT::v16i8:
    return lowerV16I8VectorShuffle(Op, V1, V2, Subtarget, DAG);

  default:
    llvm_unreachable("Unimplemented!");
  }
}
/// \brief Helper function to test whether a shuffle mask could be
/// simplified by widening the elements being shuffled.
///
/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
/// leaves it in an unspecified state.
///
/// NOTE: This must handle normal vector shuffle masks and *target* vector
/// shuffle masks. The latter have the special property of a '-2' representing
/// a zero-ed lane of a vector.
static bool canWidenShuffleElements(ArrayRef<int> Mask,
                                    SmallVectorImpl<int> &WidenedMask) {
  for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
    // If both elements are undef, it's trivial.
    if (Mask[i] == SM_SentinelUndef && Mask[i + 1] == SM_SentinelUndef) {
      WidenedMask.push_back(SM_SentinelUndef);
      continue;
    }

    // Check for an undef mask and a mask value properly aligned to fit with
    // a pair of values. If we find such a case, use the non-undef mask's value.
    if (Mask[i] == SM_SentinelUndef && Mask[i + 1] >= 0 &&
        Mask[i + 1] % 2 == 1) {
      WidenedMask.push_back(Mask[i + 1] / 2);
      continue;
    }
    if (Mask[i + 1] == SM_SentinelUndef && Mask[i] >= 0 && Mask[i] % 2 == 0) {
      WidenedMask.push_back(Mask[i] / 2);
      continue;
    }

    // When zeroing, we need to spread the zeroing across both lanes to widen.
    if (Mask[i] == SM_SentinelZero || Mask[i + 1] == SM_SentinelZero) {
      if ((Mask[i] == SM_SentinelZero || Mask[i] == SM_SentinelUndef) &&
          (Mask[i + 1] == SM_SentinelZero || Mask[i + 1] == SM_SentinelUndef)) {
        WidenedMask.push_back(SM_SentinelZero);
        continue;
      }
      return false;
    }

    // Finally check if the two mask values are adjacent and aligned with
    // a pair.
    if (Mask[i] != SM_SentinelUndef && Mask[i] % 2 == 0 &&
        Mask[i] + 1 == Mask[i + 1]) {
      WidenedMask.push_back(Mask[i] / 2);
      continue;
    }

    // Otherwise we can't safely widen the elements used in this shuffle.
    return false;
  }

  assert(WidenedMask.size() == Mask.size() / 2 &&
         "Incorrect size of mask after widening the elements!");

  return true;
}
/// \brief Generic routine to split vector shuffle into half-sized shuffles.
///
/// This routine just extracts two subvectors, shuffles them independently, and
/// then concatenates them back together. This should work effectively with all
/// AVX vector shuffle types.
static SDValue splitAndLowerVectorShuffle(SDLoc DL, MVT VT, SDValue V1,
                                          SDValue V2, ArrayRef<int> Mask,
                                          SelectionDAG &DAG) {
  assert(VT.getSizeInBits() >= 256 &&
         "Only for 256-bit or wider vector shuffles!");
  assert(V1.getSimpleValueType() == VT && "Bad operand type!");
  assert(V2.getSimpleValueType() == VT && "Bad operand type!");

  ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
  ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);

  int NumElements = VT.getVectorNumElements();
  int SplitNumElements = NumElements / 2;
  MVT ScalarVT = VT.getScalarType();
  MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2);

  // Rather than splitting build-vectors, just build two narrower build
  // vectors. This helps shuffling with splats and zeros.
  auto SplitVector = [&](SDValue V) {
    while (V.getOpcode() == ISD::BITCAST)
      V = V->getOperand(0);

    MVT OrigVT = V.getSimpleValueType();
    int OrigNumElements = OrigVT.getVectorNumElements();
    int OrigSplitNumElements = OrigNumElements / 2;
    MVT OrigScalarVT = OrigVT.getScalarType();
    MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2);

    SDValue LoV, HiV;

    auto *BV = dyn_cast<BuildVectorSDNode>(V);
    if (!BV) {
      LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
                        DAG.getIntPtrConstant(0));
      HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
                        DAG.getIntPtrConstant(OrigSplitNumElements));
    } else {
      SmallVector<SDValue, 16> LoOps, HiOps;
      for (int i = 0; i < OrigSplitNumElements; ++i) {
        LoOps.push_back(BV->getOperand(i));
        HiOps.push_back(BV->getOperand(i + OrigSplitNumElements));
      }
      LoV = DAG.getNode(ISD::BUILD_VECTOR, DL, OrigSplitVT, LoOps);
      HiV = DAG.getNode(ISD::BUILD_VECTOR, DL, OrigSplitVT, HiOps);
    }
    return std::make_pair(DAG.getNode(ISD::BITCAST, DL, SplitVT, LoV),
                          DAG.getNode(ISD::BITCAST, DL, SplitVT, HiV));
  };

  SDValue LoV1, HiV1, LoV2, HiV2;
  std::tie(LoV1, HiV1) = SplitVector(V1);
  std::tie(LoV2, HiV2) = SplitVector(V2);

  // Now create two 4-way blends of these half-width vectors.
  auto HalfBlend = [&](ArrayRef<int> HalfMask) {
    bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
    SmallVector<int, 32> V1BlendMask, V2BlendMask, BlendMask;
    for (int i = 0; i < SplitNumElements; ++i) {
      int M = HalfMask[i];
      if (M >= NumElements) {
        if (M >= NumElements + SplitNumElements)
          UseHiV2 = true;
        else
          UseLoV2 = true;
        V2BlendMask.push_back(M - NumElements);
        V1BlendMask.push_back(-1);
        BlendMask.push_back(SplitNumElements + i);
      } else if (M >= 0) {
        if (M >= SplitNumElements)
          UseHiV1 = true;
        else
          UseLoV1 = true;
        V2BlendMask.push_back(-1);
        V1BlendMask.push_back(M);
        BlendMask.push_back(i);
      } else {
        V2BlendMask.push_back(-1);
        V1BlendMask.push_back(-1);
        BlendMask.push_back(-1);
      }
    }

    // Because the lowering happens after all combining takes place, we need to
    // manually combine these blend masks as much as possible so that we create
    // a minimal number of high-level vector shuffle nodes.

    // First try just blending the halves of V1 or V2.
    if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
      return DAG.getUNDEF(SplitVT);
    if (!UseLoV2 && !UseHiV2)
      return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
    if (!UseLoV1 && !UseHiV1)
      return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);

    SDValue V1Blend, V2Blend;
    if (UseLoV1 && UseHiV1) {
      V1Blend =
          DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
    } else {
      // We only use half of V1 so map the usage down into the final blend mask.
      V1Blend = UseLoV1 ? LoV1 : HiV1;
      for (int i = 0; i < SplitNumElements; ++i)
        if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
          BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
    }
    if (UseLoV2 && UseHiV2) {
      V2Blend =
          DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
    } else {
      // We only use half of V2 so map the usage down into the final blend mask.
      V2Blend = UseLoV2 ? LoV2 : HiV2;
      for (int i = 0; i < SplitNumElements; ++i)
        if (BlendMask[i] >= SplitNumElements)
          BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
    }
    return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
  };
  SDValue Lo = HalfBlend(LoMask);
  SDValue Hi = HalfBlend(HiMask);
  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
}
/// \brief Either split a vector in halves or decompose the shuffles and the
/// blend.
///
/// This is provided as a good fallback for many lowerings of non-single-input
/// shuffles with more than one 128-bit lane. In those cases, we want to select
/// between splitting the shuffle into 128-bit components and stitching those
/// back together vs. extracting the single-input shuffles and blending those
/// results.
static SDValue lowerVectorShuffleAsSplitOrBlend(SDLoc DL, MVT VT, SDValue V1,
                                                SDValue V2, ArrayRef<int> Mask,
                                                SelectionDAG &DAG) {
  assert(!isSingleInputShuffleMask(Mask) && "This routine must not be used to "
                                            "lower single-input shuffles as it "
                                            "could then recurse on itself.");
  int Size = Mask.size();

  // If this can be modeled as a broadcast of two elements followed by a blend,
  // prefer that lowering. This is especially important because broadcasts can
  // often fold with memory operands.
  auto DoBothBroadcast = [&] {
    int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
    for (int M : Mask)
      if (M >= Size) {
        if (V2BroadcastIdx == -1)
          V2BroadcastIdx = M - Size;
        else if (M - Size != V2BroadcastIdx)
          return false;
      } else if (M >= 0) {
        if (V1BroadcastIdx == -1)
          V1BroadcastIdx = M;
        else if (M != V1BroadcastIdx)
          return false;
      }
    return true;
  };
  if (DoBothBroadcast())
    return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
                                                      DAG);

  // If the inputs all stem from a single 128-bit lane of each input, then we
  // split them rather than blending because the split will decompose to
  // unusually few instructions.
  int LaneCount = VT.getSizeInBits() / 128;
  int LaneSize = Size / LaneCount;
  SmallBitVector LaneInputs[2];
  LaneInputs[0].resize(LaneCount, false);
  LaneInputs[1].resize(LaneCount, false);
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0)
      LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
  if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
    return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);

  // Otherwise, just fall back to decomposed shuffles and a blend. This requires
  // that the decomposed single-input shuffles don't end up here.
  return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
}
/// \brief Lower a vector shuffle crossing multiple 128-bit lanes as
/// a permutation and blend of those lanes.
///
/// This essentially blends the out-of-lane inputs to each lane into the lane
/// from a permuted copy of the vector. This lowering strategy results in four
/// instructions in the worst case for a single-input cross lane shuffle which
/// is lower than any other fully general cross-lane shuffle strategy I'm aware
/// of. Special cases for each particular shuffle pattern should be handled
/// prior to trying this lowering.
static SDValue lowerVectorShuffleAsLanePermuteAndBlend(SDLoc DL, MVT VT,
                                                       SDValue V1, SDValue V2,
                                                       ArrayRef<int> Mask,
                                                       SelectionDAG &DAG) {
  // FIXME: This should probably be generalized for 512-bit vectors as well.
  assert(VT.getSizeInBits() == 256 && "Only for 256-bit vector shuffles!");
  int LaneSize = Mask.size() / 2;

  // If there are only inputs from one 128-bit lane, splitting will in fact be
  // less expensive. The flags track whether the given lane contains an element
  // that crosses to another lane.
  bool LaneCrossing[2] = {false, false};
  for (int i = 0, Size = Mask.size(); i < Size; ++i)
    if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
      LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
  if (!LaneCrossing[0] || !LaneCrossing[1])
    return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);

  if (isSingleInputShuffleMask(Mask)) {
    SmallVector<int, 32> FlippedBlendMask;
    for (int i = 0, Size = Mask.size(); i < Size; ++i)
      FlippedBlendMask.push_back(
          Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize)
                                  ? Mask[i]
                                  : Mask[i] % LaneSize +
                                        (i / LaneSize) * LaneSize + Size));

    // Flip the vector, and blend the results which should now be in-lane. The
    // VPERM2X128 mask uses the low 2 bits for the low source and bits 4 and
    // 5 for the high source. The value 3 selects the high half of source 2 and
    // the value 2 selects the low half of source 2. We only use source 2 to
    // allow folding it into a memory operand.
    unsigned PERMMask = 3 | 2 << 4;
    SDValue Flipped = DAG.getNode(X86ISD::VPERM2X128, DL, VT, DAG.getUNDEF(VT),
                                  V1, DAG.getConstant(PERMMask, MVT::i8));
    return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask);
  }

  // This now reduces to two single-input shuffles of V1 and V2 which at worst
  // will be handled by the above logic and a blend of the results, much like
  // other patterns in AVX.
  return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
}
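
// Note (illustrative, not from the original source): with PERMMask = 3 | 2<<4
// and operands (undef, V1), the VPERM2X128 above writes V1's high lane
// (selector 3) into the low lane of the result and V1's low lane (selector 2)
// into the high lane, i.e. it swaps V1's two 128-bit halves.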
/// \brief Handle lowering 2-lane 128-bit shuffles.
static SDValue lowerV2X128VectorShuffle(SDLoc DL, MVT VT, SDValue V1,
                                        SDValue V2, ArrayRef<int> Mask,
                                        const X86Subtarget *Subtarget,
                                        SelectionDAG &DAG) {
  // Blends are faster and handle all the non-lane-crossing cases.
  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
                                                Subtarget, DAG))
    return Blend;

  MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
                               VT.getVectorNumElements() / 2);
  // Check for patterns which can be matched with a single insert of a 128-bit
  // subvector.
  if (isShuffleEquivalent(V1, V2, Mask, 0, 1, 0, 1) ||
      isShuffleEquivalent(V1, V2, Mask, 0, 1, 4, 5)) {
    SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
                              DAG.getIntPtrConstant(0));
    SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
                              Mask[2] < 4 ? V1 : V2, DAG.getIntPtrConstant(0));
    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
  }
  if (isShuffleEquivalent(V1, V2, Mask, 0, 1, 6, 7)) {
    SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
                              DAG.getIntPtrConstant(0));
    SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
                              DAG.getIntPtrConstant(2));
    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
  }

  // Otherwise form a 128-bit permutation.
  // FIXME: Detect zero-vector inputs and use the VPERM2X128 to zero that half.
  unsigned PermMask = Mask[0] / 2 | (Mask[2] / 2) << 4;
  return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
                     DAG.getConstant(PermMask, MVT::i8));
}
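
// Worked example (illustrative, not from the original source): for a v4f64
// mask <2, 3, 4, 5>, PermMask = 2/2 | (4/2) << 4 = 1 | 2 << 4 = 0x21, which
// tells VPERM2X128 to produce V1's high lane followed by V2's low lane.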
/// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then
/// shuffling each lane.
///
/// This will only succeed when the result of fixing the 128-bit lanes results
/// in a single-input non-lane-crossing shuffle with a repeating shuffle mask in
/// each 128-bit lanes. This handles many cases where we can quickly blend away
/// the lane crosses early and then use simpler shuffles within each lane.
///
/// FIXME: It might be worthwhile at some point to support this without
/// requiring the 128-bit lane-relative shuffles to be repeating, but currently
/// in x86 only floating point has interesting non-repeating shuffles, and even
/// those are still *marginally* more expensive.
static SDValue lowerVectorShuffleByMerging128BitLanes(
    SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    const X86Subtarget *Subtarget, SelectionDAG &DAG) {
  assert(!isSingleInputShuffleMask(Mask) &&
         "This is only useful with multiple inputs.");

  int Size = Mask.size();
  int LaneSize = 128 / VT.getScalarSizeInBits();
  int NumLanes = Size / LaneSize;
  assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles.");

  // See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also
  // check whether the in-128-bit lane shuffles share a repeating pattern.
  SmallVector<int, 4> Lanes;
  Lanes.resize(NumLanes, -1);
  SmallVector<int, 4> InLaneMask;
  InLaneMask.resize(LaneSize, -1);
  for (int i = 0; i < Size; ++i) {
    if (Mask[i] < 0)
      continue;

    int j = i / LaneSize;

    if (Lanes[j] < 0) {
      // First entry we've seen for this lane.
      Lanes[j] = Mask[i] / LaneSize;
    } else if (Lanes[j] != Mask[i] / LaneSize) {
      // This doesn't match the lane selected previously!
      return SDValue();
    }

    // Check that within each lane we have a consistent shuffle mask.
    int k = i % LaneSize;
    if (InLaneMask[k] < 0) {
      InLaneMask[k] = Mask[i] % LaneSize;
    } else if (InLaneMask[k] != Mask[i] % LaneSize) {
      // This doesn't fit a repeating in-lane mask.
      return SDValue();
    }
  }

  // First shuffle the lanes into place.
  MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64,
                                VT.getSizeInBits() / 64);
  SmallVector<int, 8> LaneMask;
  LaneMask.resize(NumLanes * 2, -1);
  for (int i = 0; i < NumLanes; ++i)
    if (Lanes[i] >= 0) {
      LaneMask[2 * i + 0] = 2*Lanes[i] + 0;
      LaneMask[2 * i + 1] = 2*Lanes[i] + 1;
    }

  V1 = DAG.getNode(ISD::BITCAST, DL, LaneVT, V1);
  V2 = DAG.getNode(ISD::BITCAST, DL, LaneVT, V2);
  SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask);

  // Cast it back to the type we actually want.
  LaneShuffle = DAG.getNode(ISD::BITCAST, DL, VT, LaneShuffle);

  // Now do a simple shuffle that isn't lane crossing.
  SmallVector<int, 8> NewMask;
  NewMask.resize(Size, -1);
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0)
      NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize;
  assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) &&
         "Must not introduce lane crosses at this point!");

  return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask);
}
/// \brief Test whether the specified input (0 or 1) is in-place blended by the
/// given mask.
///
/// This returns true if the elements from a particular input are already in the
/// slot required by the given mask and require no permutation.
static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
  assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
  int Size = Mask.size();
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
      return false;

  return true;
}
/// \brief Handle lowering of 4-lane 64-bit floating point shuffles.
///
/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
/// isn't available.
static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                       const X86Subtarget *Subtarget,
                                       SelectionDAG &DAG) {
  SDLoc DL(Op);
  assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
  ArrayRef<int> Mask = SVOp->getMask();
  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");

  SmallVector<int, 4> WidenedMask;
  if (canWidenShuffleElements(Mask, WidenedMask))
    return lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask, Subtarget,
                                    DAG);

  if (isSingleInputShuffleMask(Mask)) {
    // Check for being able to broadcast a single element.
    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4f64, DL, V1,
                                                          Mask, Subtarget, DAG))
      return Broadcast;

    // Use low duplicate instructions for masks that match their pattern.
    if (isShuffleEquivalent(V1, V2, Mask, 0, 0, 2, 2))
      return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);

    if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
      // Non-half-crossing single input shuffles can be lowered with an
      // interleaved permutation.
      unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
                              ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
                         DAG.getConstant(VPERMILPMask, MVT::i8));
    }

    // With AVX2 we have direct support for this permutation.
    if (Subtarget->hasAVX2())
      return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
                         getV4X86ShuffleImm8ForMask(Mask, DAG));

    // Otherwise, fall back.
    return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask,
                                                   DAG);
  }

  // X86 has dedicated unpack instructions that can handle specific blend
  // operations: UNPCKH and UNPCKL.
  if (isShuffleEquivalent(V1, V2, Mask, 0, 4, 2, 6))
    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f64, V1, V2);
  if (isShuffleEquivalent(V1, V2, Mask, 1, 5, 3, 7))
    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f64, V1, V2);
  if (isShuffleEquivalent(V1, V2, Mask, 4, 0, 6, 2))
    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f64, V2, V1);
  if (isShuffleEquivalent(V1, V2, Mask, 5, 1, 7, 3))
    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f64, V2, V1);

  // If we have a single input to the zero element, insert that into V1 if we
  // can do so cheaply.
  int NumV2Elements =
      std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
  if (NumV2Elements == 1 && Mask[0] >= 4)
    if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
            MVT::v4f64, DL, V1, V2, Mask, Subtarget, DAG))
      return Insertion;

  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
                                                Subtarget, DAG))
    return Blend;

  // Check if the blend happens to exactly fit that of SHUFPD.
  if ((Mask[0] == -1 || Mask[0] < 2) &&
      (Mask[1] == -1 || (Mask[1] >= 4 && Mask[1] < 6)) &&
      (Mask[2] == -1 || (Mask[2] >= 2 && Mask[2] < 4)) &&
      (Mask[3] == -1 || Mask[3] >= 6)) {
    unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 5) << 1) |
                          ((Mask[2] == 3) << 2) | ((Mask[3] == 7) << 3);
    return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f64, V1, V2,
                       DAG.getConstant(SHUFPDMask, MVT::i8));
  }
  if ((Mask[0] == -1 || (Mask[0] >= 4 && Mask[0] < 6)) &&
      (Mask[1] == -1 || Mask[1] < 2) &&
      (Mask[2] == -1 || Mask[2] >= 6) &&
      (Mask[3] == -1 || (Mask[3] >= 2 && Mask[3] < 4))) {
    unsigned SHUFPDMask = (Mask[0] == 5) | ((Mask[1] == 1) << 1) |
                          ((Mask[2] == 7) << 2) | ((Mask[3] == 3) << 3);
    return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f64, V2, V1,
                       DAG.getConstant(SHUFPDMask, MVT::i8));
  }

  // Try to simplify this by merging 128-bit lanes to enable a lane-based
  // shuffle. However, if we have AVX2 and either inputs are already in place,
  // we will be able to shuffle even across lanes the other input in a single
  // instruction so skip this pattern.
  if (!(Subtarget->hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
                                 isShuffleMaskInputInPlace(1, Mask))))
    if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
            DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
      return Result;

  // If we have AVX2 then we always want to lower with a blend because for v4
  // shuffles we can fully permute the elements.
  if (Subtarget->hasAVX2())
    return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,
                                                      Mask, DAG);

  // Otherwise fall back on generic lowering.
  return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG);
}
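
// Worked example (illustrative, not from the original source): the v4f64 mask
// <1, 4, 3, 6> fits the first SHUFPD pattern above and encodes as
// SHUFPDMask = 1 | 0 << 1 | 1 << 2 | 0 << 3 = 0x5: each bit picks the odd (1)
// or even (0) element of the corresponding source lane, giving
// V1[1], V2[0], V1[3], V2[2].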
/// \brief Handle lowering of 4-lane 64-bit integer shuffles.
///
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v4i64 shuffling.
static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                       const X86Subtarget *Subtarget,
                                       SelectionDAG &DAG) {
  SDLoc DL(Op);
  assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
  ArrayRef<int> Mask = SVOp->getMask();
  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
  assert(Subtarget->hasAVX2() && "We can only lower v4i64 with AVX2!");

  SmallVector<int, 4> WidenedMask;
  if (canWidenShuffleElements(Mask, WidenedMask))
    return lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask, Subtarget,
                                    DAG);

  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
                                                Subtarget, DAG))
    return Blend;

  // Check for being able to broadcast a single element.
  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4i64, DL, V1,
                                                        Mask, Subtarget, DAG))
    return Broadcast;

  // When the shuffle is mirrored between the 128-bit lanes of the unit, we can
  // use lower latency instructions that will operate on both 128-bit lanes.
  SmallVector<int, 2> RepeatedMask;
  if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
    if (isSingleInputShuffleMask(Mask)) {
      int PSHUFDMask[] = {-1, -1, -1, -1};
      for (int i = 0; i < 2; ++i)
        if (RepeatedMask[i] >= 0) {
          PSHUFDMask[2 * i] = 2 * RepeatedMask[i];
          PSHUFDMask[2 * i + 1] = 2 * RepeatedMask[i] + 1;
        }
      return DAG.getNode(
          ISD::BITCAST, DL, MVT::v4i64,
          DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
                      DAG.getNode(ISD::BITCAST, DL, MVT::v8i32, V1),
                      getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
    }
  }

  // AVX2 provides a direct instruction for permuting a single input across
  // lanes.
  if (isSingleInputShuffleMask(Mask))
    return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
                       getV4X86ShuffleImm8ForMask(Mask, DAG));

  // Try to use byte shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsByteShift(
          DL, MVT::v4i64, V1, V2, Mask, DAG))
    return Shift;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (isShuffleEquivalent(V1, V2, Mask, 0, 4, 2, 6))
    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V1, V2);
  if (isShuffleEquivalent(V1, V2, Mask, 1, 5, 3, 7))
    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V1, V2);
  if (isShuffleEquivalent(V1, V2, Mask, 4, 0, 6, 2))
    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V2, V1);
  if (isShuffleEquivalent(V1, V2, Mask, 5, 1, 7, 3))
    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V2, V1);

  // Try to simplify this by merging 128-bit lanes to enable a lane-based
  // shuffle. However, if we have AVX2 and either inputs are already in place,
  // we will be able to shuffle even across lanes the other input in a single
  // instruction so skip this pattern.
  if (!(Subtarget->hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
                                 isShuffleMaskInputInPlace(1, Mask))))
    if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
            DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
      return Result;

  // Otherwise fall back on generic blend lowering.
  return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2,
                                                    Mask, DAG);
}
/// \brief Handle lowering of 8-lane 32-bit floating point shuffles.
///
/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
/// isn't available.
static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                       const X86Subtarget *Subtarget,
                                       SelectionDAG &DAG) {
  SDLoc DL(Op);
  assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
  ArrayRef<int> Mask = SVOp->getMask();
  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");

  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
                                                Subtarget, DAG))
    return Blend;

  // Check for being able to broadcast a single element.
  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8f32, DL, V1,
                                                        Mask, Subtarget, DAG))
    return Broadcast;

  // If the shuffle mask is repeated in each 128-bit lane, we have many more
  // options to efficiently lower the shuffle.
  SmallVector<int, 4> RepeatedMask;
  if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
    assert(RepeatedMask.size() == 4 &&
           "Repeated masks must be half the mask width!");

    // Use even/odd duplicate instructions for masks that match their pattern.
    if (isShuffleEquivalent(V1, V2, Mask, 0, 0, 2, 2, 4, 4, 6, 6))
      return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
    if (isShuffleEquivalent(V1, V2, Mask, 1, 1, 3, 3, 5, 5, 7, 7))
      return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);

    if (isSingleInputShuffleMask(Mask))
      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
                         getV4X86ShuffleImm8ForMask(RepeatedMask, DAG));

    // Use dedicated unpack instructions for masks that match their pattern.
    if (isShuffleEquivalent(V1, V2, Mask, 0, 8, 1, 9, 4, 12, 5, 13))
      return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f32, V1, V2);
    if (isShuffleEquivalent(V1, V2, Mask, 2, 10, 3, 11, 6, 14, 7, 15))
      return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f32, V1, V2);
    if (isShuffleEquivalent(V1, V2, Mask, 8, 0, 9, 1, 12, 4, 13, 5))
      return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f32, V2, V1);
    if (isShuffleEquivalent(V1, V2, Mask, 10, 2, 11, 3, 14, 6, 15, 7))
      return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f32, V2, V1);

    // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
    // have already handled any direct blends. We also need to squash the
    // repeated mask into a simulated v4f32 mask.
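    // Example: the lane-repeated mask <1, 8, 3, 10, 5, 12, 7, 14> repeats as
    // <1, 8, 3, 10> (V2 entries offset by 8); squashing the V2 entries by 4
    // yields the v4f32-style two-input mask <1, 4, 3, 6>.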
    for (int i = 0; i < 4; ++i)
      if (RepeatedMask[i] >= 8)
        RepeatedMask[i] -= 4;
    return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2,
                                        DAG);
  }

  // If we have a single input shuffle with different shuffle patterns in the
  // two 128-bit lanes use the variable mask to VPERMILPS.
  if (isSingleInputShuffleMask(Mask)) {
    SDValue VPermMask[8];
    for (int i = 0; i < 8; ++i)
      VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32)
                                 : DAG.getConstant(Mask[i], MVT::i32);
    if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))
      return DAG.getNode(
          X86ISD::VPERMILPV, DL, MVT::v8f32, V1,
          DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask));

    if (Subtarget->hasAVX2())
      return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32,
                         DAG.getNode(ISD::BITCAST, DL, MVT::v8f32,
                                     DAG.getNode(ISD::BUILD_VECTOR, DL,
                                                 MVT::v8i32, VPermMask)),
                         V1);

    // Otherwise, fall back.
    return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
                                                   DAG);
  }

  // Try to simplify this by merging 128-bit lanes to enable a lane-based
  // shuffle.
  if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
          DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
    return Result;

  // If we have AVX2 then we always want to lower with a blend because at v8 we
  // can fully permute the elements.
  if (Subtarget->hasAVX2())
    return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
                                                      Mask, DAG);

  // Otherwise fall back on generic lowering.
  return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG);
}
/// \brief Handle lowering of 8-lane 32-bit integer shuffles.
///
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v8i32 shuffling.
static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                       const X86Subtarget *Subtarget,
                                       SelectionDAG &DAG) {
  SDLoc DL(Op);
  assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
  ArrayRef<int> Mask = SVOp->getMask();
  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
  assert(Subtarget->hasAVX2() && "We can only lower v8i32 with AVX2!");

  // Whenever we can lower this as a zext, that instruction is strictly faster
  // than any alternative. It also allows us to fold memory operands into the
  // shuffle in many cases.
  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2,
                                                         Mask, Subtarget, DAG))
    return ZExt;

  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
                                                Subtarget, DAG))
    return Blend;

  // Check for being able to broadcast a single element.
  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8i32, DL, V1,
                                                        Mask, Subtarget, DAG))
    return Broadcast;

  // If the shuffle mask is repeated in each 128-bit lane we can use more
  // efficient instructions that mirror the shuffles across the two 128-bit
  // lanes.
  SmallVector<int, 4> RepeatedMask;
  if (is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask)) {
    assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
    if (isSingleInputShuffleMask(Mask))
      return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
                         getV4X86ShuffleImm8ForMask(RepeatedMask, DAG));

    // Use dedicated unpack instructions for masks that match their pattern.
    if (isShuffleEquivalent(V1, V2, Mask, 0, 8, 1, 9, 4, 12, 5, 13))
      return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i32, V1, V2);
    if (isShuffleEquivalent(V1, V2, Mask, 2, 10, 3, 11, 6, 14, 7, 15))
      return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i32, V1, V2);
    if (isShuffleEquivalent(V1, V2, Mask, 8, 0, 9, 1, 12, 4, 13, 5))
      return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i32, V2, V1);
    if (isShuffleEquivalent(V1, V2, Mask, 10, 2, 11, 3, 14, 6, 15, 7))
      return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i32, V2, V1);
  }

  // Try to use bit shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsBitShift(
          DL, MVT::v8i32, V1, V2, Mask, DAG))
    return Shift;

  // Try to use byte shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsByteShift(
          DL, MVT::v8i32, V1, V2, Mask, DAG))
    return Shift;

  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
          DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
    return Rotate;

  // If the shuffle patterns aren't repeated but it is a single input, directly
  // generate a cross-lane VPERMD instruction.
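  // For example, the cross-lane mask <7, 6, 5, 4, 3, 2, 1, 0> cannot be done
  // with a per-lane PSHUFD, but a single VPERMD with that index vector
  // reverses all eight elements at once.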
  if (isSingleInputShuffleMask(Mask)) {
    SDValue VPermMask[8];
    for (int i = 0; i < 8; ++i)
      VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32)
                                 : DAG.getConstant(Mask[i], MVT::i32);
    return DAG.getNode(
        X86ISD::VPERMV, DL, MVT::v8i32,
        DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask), V1);
  }

  // Try to simplify this by merging 128-bit lanes to enable a lane-based
  // shuffle.
  if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
          DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
    return Result;

  // Otherwise fall back on generic blend lowering.
  return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2,
                                                    Mask, DAG);
}
/// \brief Handle lowering of 16-lane 16-bit integer shuffles.
///
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v16i16 shuffling.
static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                        const X86Subtarget *Subtarget,
                                        SelectionDAG &DAG) {
  SDLoc DL(Op);
  assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
  ArrayRef<int> Mask = SVOp->getMask();
  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
  assert(Subtarget->hasAVX2() && "We can only lower v16i16 with AVX2!");

  // Whenever we can lower this as a zext, that instruction is strictly faster
  // than any alternative. It also allows us to fold memory operands into the
  // shuffle in many cases.
  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v16i16, V1, V2,
                                                         Mask, Subtarget, DAG))
    return ZExt;

  // Check for being able to broadcast a single element.
  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v16i16, DL, V1,
                                                        Mask, Subtarget, DAG))
    return Broadcast;

  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
                                                Subtarget, DAG))
    return Blend;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (isShuffleEquivalent(V1, V2, Mask,
                          // First 128-bit lane:
                          0, 16, 1, 17, 2, 18, 3, 19,
                          // Second 128-bit lane:
                          8, 24, 9, 25, 10, 26, 11, 27))
    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i16, V1, V2);
  if (isShuffleEquivalent(V1, V2, Mask,
                          // First 128-bit lane:
                          4, 20, 5, 21, 6, 22, 7, 23,
                          // Second 128-bit lane:
                          12, 28, 13, 29, 14, 30, 15, 31))
    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i16, V1, V2);

  // Try to use bit shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsBitShift(
          DL, MVT::v16i16, V1, V2, Mask, DAG))
    return Shift;

  // Try to use byte shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsByteShift(
          DL, MVT::v16i16, V1, V2, Mask, DAG))
    return Shift;

  // Try to use byte rotation instructions.
  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
          DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
    return Rotate;

  if (isSingleInputShuffleMask(Mask)) {
    // There are no generalized cross-lane shuffle operations available on i16
    // element types.
    if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
      return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
                                                     Mask, DAG);

    SDValue PSHUFBMask[32];
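    // Each word index expands to two byte selectors below; e.g. a lane-local
    // word index of 5 becomes the byte selectors 10 and 11.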
    for (int i = 0; i < 16; ++i) {
      if (Mask[i] == -1) {
        PSHUFBMask[2 * i] = PSHUFBMask[2 * i + 1] = DAG.getUNDEF(MVT::i8);
        continue;
      }

      int M = i < 8 ? Mask[i] : Mask[i] - 8;
      assert(M >= 0 && M < 8 && "Invalid single-input mask!");
      PSHUFBMask[2 * i] = DAG.getConstant(2 * M, MVT::i8);
      PSHUFBMask[2 * i + 1] = DAG.getConstant(2 * M + 1, MVT::i8);
    }
    return DAG.getNode(
        ISD::BITCAST, DL, MVT::v16i16,
        DAG.getNode(
            X86ISD::PSHUFB, DL, MVT::v32i8,
            DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V1),
            DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask)));
  }

  // Try to simplify this by merging 128-bit lanes to enable a lane-based
  // shuffle.
  if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
          DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
    return Result;

  // Otherwise fall back on generic lowering.
  return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG);
}
/// \brief Handle lowering of 32-lane 8-bit integer shuffles.
///
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v32i8 shuffling.
static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                       const X86Subtarget *Subtarget,
                                       SelectionDAG &DAG) {
  SDLoc DL(Op);
  assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
  ArrayRef<int> Mask = SVOp->getMask();
  assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
  assert(Subtarget->hasAVX2() && "We can only lower v32i8 with AVX2!");

  // Whenever we can lower this as a zext, that instruction is strictly faster
  // than any alternative. It also allows us to fold memory operands into the
  // shuffle in many cases.
  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2,
                                                         Mask, Subtarget, DAG))
    return ZExt;

  // Check for being able to broadcast a single element.
  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v32i8, DL, V1,
                                                        Mask, Subtarget, DAG))
    return Broadcast;

  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
                                                Subtarget, DAG))
    return Blend;

  // Use dedicated unpack instructions for masks that match their pattern.
  // Note that these are repeated 128-bit lane unpacks, not unpacks across all
  // 256-bit lanes.
  if (isShuffleEquivalent(
          V1, V2, Mask,
          // First 128-bit lane:
          0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39,
          // Second 128-bit lane:
          16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55))
    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v32i8, V1, V2);
  if (isShuffleEquivalent(
          V1, V2, Mask,
          // First 128-bit lane:
          8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47,
          // Second 128-bit lane:
          24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63))
    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v32i8, V1, V2);

  // Try to use bit shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsBitShift(
          DL, MVT::v32i8, V1, V2, Mask, DAG))
    return Shift;

  // Try to use byte shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsByteShift(
          DL, MVT::v32i8, V1, V2, Mask, DAG))
    return Shift;

  // Try to use byte rotation instructions.
  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
          DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
    return Rotate;

  if (isSingleInputShuffleMask(Mask)) {
    // There are no generalized cross-lane shuffle operations available on i8
    // element types.
    if (is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
      return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2,
                                                     Mask, DAG);

    SDValue PSHUFBMask[32];
    for (int i = 0; i < 32; ++i)
      PSHUFBMask[i] =
          Mask[i] < 0
              ? DAG.getUNDEF(MVT::i8)
              : DAG.getConstant(Mask[i] < 16 ? Mask[i] : Mask[i] - 16, MVT::i8);

    return DAG.getNode(
        X86ISD::PSHUFB, DL, MVT::v32i8, V1,
        DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask));
  }

  // Try to simplify this by merging 128-bit lanes to enable a lane-based
  // shuffle.
  if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
          DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
    return Result;

  // Otherwise fall back on generic lowering.
  return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG);
}
/// \brief High-level routine to lower various 256-bit x86 vector shuffles.
///
/// This routine either breaks down the specific type of a 256-bit x86 vector
/// shuffle or splits it into two 128-bit shuffles and fuses the results back
/// together based on the available instructions.
static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                        MVT VT, const X86Subtarget *Subtarget,
                                        SelectionDAG &DAG) {
  SDLoc DL(Op);
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
  ArrayRef<int> Mask = SVOp->getMask();

  // There is a really nice hard cut-over between AVX1 and AVX2 that means we
  // can check for those subtargets here and avoid much of the subtarget
  // querying in the per-vector-type lowering routines. With AVX1 we have
  // essentially *zero* ability to manipulate a 256-bit vector with integer
  // types. Since we'll use floating point types there eventually, just
  // immediately cast everything to a float and operate entirely in that
  // domain.
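  // For example, on AVX1 a v4i64 shuffle reaching this point is bitcast to
  // v4f64 below and lowered entirely through the floating point v4f64 path.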
  if (VT.isInteger() && !Subtarget->hasAVX2()) {
    int ElementBits = VT.getScalarSizeInBits();
    if (ElementBits < 32)
      // No floating point type available, decompose into 128-bit vectors.
      return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);

    MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
                                VT.getVectorNumElements());
    V1 = DAG.getNode(ISD::BITCAST, DL, FpVT, V1);
    V2 = DAG.getNode(ISD::BITCAST, DL, FpVT, V2);
    return DAG.getNode(ISD::BITCAST, DL, VT,
                       DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
  }

  switch (VT.SimpleTy) {
  case MVT::v4f64:
    return lowerV4F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
  case MVT::v4i64:
    return lowerV4I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
  case MVT::v8f32:
    return lowerV8F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
  case MVT::v8i32:
    return lowerV8I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
  case MVT::v16i16:
    return lowerV16I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
  case MVT::v32i8:
    return lowerV32I8VectorShuffle(Op, V1, V2, Subtarget, DAG);

  default:
    llvm_unreachable("Not a valid 256-bit x86 vector type!");
  }
}
/// \brief Handle lowering of 8-lane 64-bit floating point shuffles.
static SDValue lowerV8F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                       const X86Subtarget *Subtarget,
                                       SelectionDAG &DAG) {
  SDLoc DL(Op);
  assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
  ArrayRef<int> Mask = SVOp->getMask();
  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");

  // X86 has dedicated unpack instructions that can handle specific blend
  // operations: UNPCKH and UNPCKL.
  if (isShuffleEquivalent(V1, V2, Mask, 0, 8, 2, 10, 4, 12, 6, 14))
    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f64, V1, V2);
  if (isShuffleEquivalent(V1, V2, Mask, 1, 9, 3, 11, 5, 13, 7, 15))
    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f64, V1, V2);

  // FIXME: Implement direct support for this type!
  return splitAndLowerVectorShuffle(DL, MVT::v8f64, V1, V2, Mask, DAG);
}

/// \brief Handle lowering of 16-lane 32-bit floating point shuffles.
static SDValue lowerV16F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                        const X86Subtarget *Subtarget,
                                        SelectionDAG &DAG) {
  SDLoc DL(Op);
  assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
  ArrayRef<int> Mask = SVOp->getMask();
  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");

  // Use dedicated unpack instructions for masks that match their pattern.
  if (isShuffleEquivalent(V1, V2, Mask,
                          0, 16, 1, 17, 4, 20, 5, 21,
                          8, 24, 9, 25, 12, 28, 13, 29))
    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16f32, V1, V2);
  if (isShuffleEquivalent(V1, V2, Mask,
                          2, 18, 3, 19, 6, 22, 7, 23,
                          10, 26, 11, 27, 14, 30, 15, 31))
    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16f32, V1, V2);

  // FIXME: Implement direct support for this type!
  return splitAndLowerVectorShuffle(DL, MVT::v16f32, V1, V2, Mask, DAG);
}

/// \brief Handle lowering of 8-lane 64-bit integer shuffles.
static SDValue lowerV8I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                       const X86Subtarget *Subtarget,
                                       SelectionDAG &DAG) {
  SDLoc DL(Op);
  assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
  ArrayRef<int> Mask = SVOp->getMask();
  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");

  // X86 has dedicated unpack instructions that can handle specific blend
  // operations: UNPCKH and UNPCKL.
  if (isShuffleEquivalent(V1, V2, Mask, 0, 8, 2, 10, 4, 12, 6, 14))
    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i64, V1, V2);
  if (isShuffleEquivalent(V1, V2, Mask, 1, 9, 3, 11, 5, 13, 7, 15))
    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i64, V1, V2);

  // FIXME: Implement direct support for this type!
  return splitAndLowerVectorShuffle(DL, MVT::v8i64, V1, V2, Mask, DAG);
}

/// \brief Handle lowering of 16-lane 32-bit integer shuffles.
static SDValue lowerV16I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                        const X86Subtarget *Subtarget,
                                        SelectionDAG &DAG) {
  SDLoc DL(Op);
  assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
  ArrayRef<int> Mask = SVOp->getMask();
  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");

  // Use dedicated unpack instructions for masks that match their pattern.
  if (isShuffleEquivalent(V1, V2, Mask,
                          0, 16, 1, 17, 4, 20, 5, 21,
                          8, 24, 9, 25, 12, 28, 13, 29))
    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i32, V1, V2);
  if (isShuffleEquivalent(V1, V2, Mask,
                          2, 18, 3, 19, 6, 22, 7, 23,
                          10, 26, 11, 27, 14, 30, 15, 31))
    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i32, V1, V2);

  // FIXME: Implement direct support for this type!
  return splitAndLowerVectorShuffle(DL, MVT::v16i32, V1, V2, Mask, DAG);
}

/// \brief Handle lowering of 32-lane 16-bit integer shuffles.
static SDValue lowerV32I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                        const X86Subtarget *Subtarget,
                                        SelectionDAG &DAG) {
  SDLoc DL(Op);
  assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
  ArrayRef<int> Mask = SVOp->getMask();
  assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
  assert(Subtarget->hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");

  // FIXME: Implement direct support for this type!
  return splitAndLowerVectorShuffle(DL, MVT::v32i16, V1, V2, Mask, DAG);
}

/// \brief Handle lowering of 64-lane 8-bit integer shuffles.
static SDValue lowerV64I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                       const X86Subtarget *Subtarget,
                                       SelectionDAG &DAG) {
  SDLoc DL(Op);
  assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
  ArrayRef<int> Mask = SVOp->getMask();
  assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
  assert(Subtarget->hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");

  // FIXME: Implement direct support for this type!
  return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
}
/// \brief High-level routine to lower various 512-bit x86 vector shuffles.
///
/// This routine either breaks down the specific type of a 512-bit x86 vector
/// shuffle or splits it into two 256-bit shuffles and fuses the results back
/// together based on the available instructions.
static SDValue lower512BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                        MVT VT, const X86Subtarget *Subtarget,
                                        SelectionDAG &DAG) {
  SDLoc DL(Op);
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
  ArrayRef<int> Mask = SVOp->getMask();
  assert(Subtarget->hasAVX512() &&
         "Cannot lower 512-bit vectors w/o basic ISA!");

  // Check for being able to broadcast a single element.
  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(VT.SimpleTy, DL, V1,
                                                        Mask, Subtarget, DAG))
    return Broadcast;

  // Dispatch to each element type for lowering. If we don't have support for
  // specific element type shuffles at 512 bits, immediately split them and
  // lower them. Each lowering routine of a given type is allowed to assume that
  // the requisite ISA extensions for that element type are available.
  switch (VT.SimpleTy) {
  case MVT::v8f64:
    return lowerV8F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
  case MVT::v16f32:
    return lowerV16F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
  case MVT::v8i64:
    return lowerV8I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
  case MVT::v16i32:
    return lowerV16I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
  case MVT::v32i16:
    if (Subtarget->hasBWI())
      return lowerV32I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
    break;
  case MVT::v64i8:
    if (Subtarget->hasBWI())
      return lowerV64I8VectorShuffle(Op, V1, V2, Subtarget, DAG);
    break;

  default:
    llvm_unreachable("Not a valid 512-bit x86 vector type!");
  }

  // Otherwise fall back on splitting.
  return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
}
/// \brief Top-level lowering for x86 vector shuffles.
///
/// This handles decomposition, canonicalization, and lowering of all x86
/// vector shuffles. Most of the specific lowering strategies are encapsulated
/// above in helper routines. The canonicalization attempts to widen shuffles
/// to involve fewer lanes of wider elements, consolidate symmetric patterns
/// s.t. only one of the two inputs needs to be tested, etc.
static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
                                  SelectionDAG &DAG) {
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
  ArrayRef<int> Mask = SVOp->getMask();
  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  MVT VT = Op.getSimpleValueType();
  int NumElements = VT.getVectorNumElements();
  SDLoc dl(Op);

  assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles");

  bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
  bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
  if (V1IsUndef && V2IsUndef)
    return DAG.getUNDEF(VT);

  // When we create a shuffle node we put the UNDEF node to second operand,
  // but in some cases the first operand may be transformed to UNDEF.
  // In this case we should just commute the node.
  if (V1IsUndef)
    return DAG.getCommutedVectorShuffle(*SVOp);

  // Check for non-undef masks pointing at an undef vector and make the masks
  // undef as well. This makes it easier to match the shuffle based solely on
  // the mask.
  if (V2IsUndef)
    for (int M : Mask)
      if (M >= NumElements) {
        SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
        for (int &M : NewMask)
          if (M >= NumElements)
            M = -1;
        return DAG.getVectorShuffle(VT, dl, V1, V2, NewMask);
      }

  // We actually see shuffles that are entirely re-arrangements of a set of
  // zero inputs. This mostly happens while decomposing complex shuffles into
  // simple ones. Directly lower these as a buildvector of zeros.
  SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
  if (Zeroable.all())
    return getZeroVector(VT, Subtarget, DAG, dl);

  // Try to collapse shuffles into using a vector type with fewer elements but
  // wider element types. We cap this to not form integers or floating point
  // elements wider than 64 bits, but it might be interesting to form i128
  // integers to handle flipping the low and high halves of AVX 256-bit vectors.
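  // For example, the v8i16 mask <0, 1, 6, 7, 4, 5, 2, 3> pairs up cleanly and
  // is lowered as the v4i32 shuffle <0, 3, 2, 1> instead.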
  SmallVector<int, 16> WidenedMask;
  if (VT.getScalarSizeInBits() < 64 &&
      canWidenShuffleElements(Mask, WidenedMask)) {
    MVT NewEltVT = VT.isFloatingPoint()
                       ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
                       : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
    MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
    // Make sure that the new vector type is legal. For example, v2f64 isn't
    // legal on SSE1.
    if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
      V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, V1);
      V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, V2);
      return DAG.getNode(ISD::BITCAST, dl, VT,
                         DAG.getVectorShuffle(NewVT, dl, V1, V2, WidenedMask));
    }
  }

  int NumV1Elements = 0, NumUndefElements = 0, NumV2Elements = 0;
  for (int M : SVOp->getMask())
    if (M < 0)
      ++NumUndefElements;
    else if (M < NumElements)
      ++NumV1Elements;
    else
      ++NumV2Elements;

  // Commute the shuffle as needed such that more elements come from V1 than
  // V2. This allows us to match the shuffle pattern strictly on how many
  // elements come from V1 without handling the symmetric cases.
  if (NumV2Elements > NumV1Elements)
    return DAG.getCommutedVectorShuffle(*SVOp);

  // When the number of V1 and V2 elements are the same, try to minimize the
  // number of uses of V2 in the low half of the vector. When that is tied,
  // ensure that the sum of indices for V1 is equal to or lower than the sum of
  // indices for V2. When those are equal, try to ensure that the number of odd
  // indices for V1 is lower than the number of odd indices for V2.
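  // For example, the v4i32 mask <4, 5, 6, 1> takes three elements from V2 and
  // one from V1, so the commute above already rewrote it as <0, 1, 2, 5>
  // before reaching the tie-breaking rules below.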
  if (NumV1Elements == NumV2Elements) {
    int LowV1Elements = 0, LowV2Elements = 0;
    for (int M : SVOp->getMask().slice(0, NumElements / 2))
      if (M >= NumElements)
        ++LowV2Elements;
      else if (M >= 0)
        ++LowV1Elements;
    if (LowV2Elements > LowV1Elements) {
      return DAG.getCommutedVectorShuffle(*SVOp);
    } else if (LowV2Elements == LowV1Elements) {
      int SumV1Indices = 0, SumV2Indices = 0;
      for (int i = 0, Size = SVOp->getMask().size(); i < Size; ++i)
        if (SVOp->getMask()[i] >= NumElements)
          SumV2Indices += i;
        else if (SVOp->getMask()[i] >= 0)
          SumV1Indices += i;
      if (SumV2Indices < SumV1Indices) {
        return DAG.getCommutedVectorShuffle(*SVOp);
      } else if (SumV2Indices == SumV1Indices) {
        int NumV1OddIndices = 0, NumV2OddIndices = 0;
        for (int i = 0, Size = SVOp->getMask().size(); i < Size; ++i)
          if (SVOp->getMask()[i] >= NumElements)
            NumV2OddIndices += i % 2;
          else if (SVOp->getMask()[i] >= 0)
            NumV1OddIndices += i % 2;
        if (NumV2OddIndices < NumV1OddIndices)
          return DAG.getCommutedVectorShuffle(*SVOp);
      }
    }
  }

  // For each vector width, delegate to a specialized lowering routine.
  if (VT.getSizeInBits() == 128)
    return lower128BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);

  if (VT.getSizeInBits() == 256)
    return lower256BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);

  // AVX-512 support is still incomplete; the 512-bit routine splits anything
  // it cannot lower directly.
  // FIXME: Implement full AVX-512 support!
  if (VT.getSizeInBits() == 512)
    return lower512BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);

  llvm_unreachable("Unimplemented!");
}
//===----------------------------------------------------------------------===//
// Legacy vector shuffle lowering
//
// This code is the legacy code handling vector shuffles until the above
// replaces it in both functionality and performance.
//===----------------------------------------------------------------------===//

static bool isBlendMask(ArrayRef<int> MaskVals, MVT VT, bool hasSSE41,
                        bool hasInt256, unsigned *MaskOut = nullptr) {
  MVT EltVT = VT.getVectorElementType();

  // There is no blend with immediate in AVX-512.
  if (VT.is512BitVector())
    return false;

  if (!hasSSE41 || EltVT == MVT::i8)
    return false;
  if (!hasInt256 && VT == MVT::v16i16)
    return false;

  unsigned MaskValue = 0;
  unsigned NumElems = VT.getVectorNumElements();
  // There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
  unsigned NumLanes = (NumElems - 1) / 8 + 1;
  unsigned NumElemsInLane = NumElems / NumLanes;

  // Blend for v16i16 should be symmetric for both lanes.
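  // Example: for v4i32 there is a single lane, and the mask <0, 5, 2, 7>
  // (elements 1 and 3 taken from V2) yields MaskValue = 0b1010.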
  for (unsigned i = 0; i < NumElemsInLane; ++i) {

    int SndLaneEltIdx = (NumLanes == 2) ? MaskVals[i + NumElemsInLane] : -1;
    int EltIdx = MaskVals[i];

    if ((EltIdx < 0 || EltIdx == (int)i) &&
        (SndLaneEltIdx < 0 || SndLaneEltIdx == (int)(i + NumElemsInLane)))
      continue;

    if (((unsigned)EltIdx == (i + NumElems)) &&
        (SndLaneEltIdx < 0 ||
         (unsigned)SndLaneEltIdx == i + NumElems + NumElemsInLane))
      MaskValue |= (1 << i);
    else
      return false;
  }

  if (MaskOut)
    *MaskOut = MaskValue;
  return true;
}

// Try to lower a shuffle node into a simple blend instruction.
// This function assumes isBlendMask returns true for this
// ShuffleVectorSDNode.
static SDValue LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp,
                                          unsigned MaskValue,
                                          const X86Subtarget *Subtarget,
                                          SelectionDAG &DAG) {
  MVT VT = SVOp->getSimpleValueType(0);
  MVT EltVT = VT.getVectorElementType();
  assert(isBlendMask(SVOp->getMask(), VT, Subtarget->hasSSE41(),
                     Subtarget->hasInt256()) &&
         "Trying to lower a VECTOR_SHUFFLE to a Blend but with the wrong mask");
  SDValue V1 = SVOp->getOperand(0);
  SDValue V2 = SVOp->getOperand(1);
  SDLoc dl(SVOp);
  unsigned NumElems = VT.getVectorNumElements();

  // Convert i32 vectors to floating point if it is not AVX2.
  // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors.
  MVT BlendVT = VT;
  if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) {
    BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()),
                               NumElems);
    V1 = DAG.getNode(ISD::BITCAST, dl, BlendVT, V1);
    V2 = DAG.getNode(ISD::BITCAST, dl, BlendVT, V2);
  }

  SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, V1, V2,
                            DAG.getConstant(MaskValue, MVT::i32));
  return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
}
/// In vector type \p VT, return true if the element at index \p InputIdx
/// falls on a different 128-bit lane than \p OutputIdx.
static bool ShuffleCrosses128bitLane(MVT VT, unsigned InputIdx,
                                     unsigned OutputIdx) {
  unsigned EltSize = VT.getVectorElementType().getSizeInBits();
  return InputIdx * EltSize / 128 != OutputIdx * EltSize / 128;
}

/// Generate a PSHUFB if possible. Selects elements from \p V1 according to
/// \p MaskVals. MaskVals[OutputIdx] = InputIdx specifies that we want to
/// shuffle the element at InputIdx in V1 to OutputIdx in the result. If \p
/// MaskVals refers to elements outside of \p V1 or is undef (-1), insert a
/// zero.
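/// For example, with v8i16 input, MaskVals[3] = 5 makes result word 3
/// (control bytes 6-7) select source bytes 10 and 11.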
static SDValue getPSHUFB(ArrayRef<int> MaskVals, SDValue V1, SDLoc &dl,
                         SelectionDAG &DAG) {
  MVT VT = V1.getSimpleValueType();
  assert(VT.is128BitVector() || VT.is256BitVector());

  MVT EltVT = VT.getVectorElementType();
  unsigned EltSizeInBytes = EltVT.getSizeInBits() / 8;
  unsigned NumElts = VT.getVectorNumElements();

  SmallVector<SDValue, 32> PshufbMask;
  for (unsigned OutputIdx = 0; OutputIdx < NumElts; ++OutputIdx) {
    int InputIdx = MaskVals[OutputIdx];
    unsigned InputByteIdx;

    if (InputIdx < 0 || NumElts <= (unsigned)InputIdx)
      InputByteIdx = 0x80;
    else {
      // Cross lane is not allowed.
      if (ShuffleCrosses128bitLane(VT, InputIdx, OutputIdx))
        return SDValue();
      InputByteIdx = InputIdx * EltSizeInBytes;
      // Index is a byte offset within the 128-bit lane.
      InputByteIdx &= 0xf;
    }

    for (unsigned j = 0; j < EltSizeInBytes; ++j) {
      PshufbMask.push_back(DAG.getConstant(InputByteIdx, MVT::i8));
      if (InputByteIdx != 0x80)
        ++InputByteIdx;
    }
  }

  MVT ShufVT = MVT::getVectorVT(MVT::i8, PshufbMask.size());
  if (ShufVT != VT)
    V1 = DAG.getNode(ISD::BITCAST, dl, ShufVT, V1);
  return DAG.getNode(X86ISD::PSHUFB, dl, ShufVT, V1,
                     DAG.getNode(ISD::BUILD_VECTOR, dl, ShufVT, PshufbMask));
}
// v8i16 shuffles - Prefer shuffles in the following order:
// 1. [all]   pshuflw, pshufhw, optional move
// 2. [ssse3] 1 x pshufb
// 3. [ssse3] 2 x pshufb + 1 x por
// 4. [all]   mov + pshuflw + pshufhw + N x (pextrw + pinsrw)
static SDValue
LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget,
                         SelectionDAG &DAG) {
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
  SDValue V1 = SVOp->getOperand(0);
  SDValue V2 = SVOp->getOperand(1);
  SDLoc dl(SVOp);
  SmallVector<int, 8> MaskVals;

  // Determine if more than 1 of the words in each of the low and high quadwords
  // of the result come from the same quadword of one of the two inputs. Undef
  // mask values count as coming from any quadword, for better codegen.
  //
  // Lo/HiQuad[i] = j indicates how many words from the ith quad of the input
  // feeds this quad. For i, 0 and 1 refer to V1, 2 and 3 refer to V2.
  unsigned LoQuad[] = { 0, 0, 0, 0 };
  unsigned HiQuad[] = { 0, 0, 0, 0 };
  // Indices of quads used.
  std::bitset<4> InputQuads;
  for (unsigned i = 0; i < 8; ++i) {
    unsigned *Quad = i < 4 ? LoQuad : HiQuad;
    int EltIdx = SVOp->getMaskElt(i);
    MaskVals.push_back(EltIdx);
    if (EltIdx < 0) {
      ++Quad[0];
      ++Quad[1];
      ++Quad[2];
      ++Quad[3];
      continue;
    }
    ++Quad[EltIdx / 4];
    InputQuads.set(EltIdx / 4);
  }

  int BestLoQuad = -1;
  unsigned MaxQuad = 1;
  for (unsigned i = 0; i < 4; ++i) {
    if (LoQuad[i] > MaxQuad) {
      BestLoQuad = i;
      MaxQuad = LoQuad[i];
    }
  }

  int BestHiQuad = -1;
  MaxQuad = 1;
  for (unsigned i = 0; i < 4; ++i) {
    if (HiQuad[i] > MaxQuad) {
      BestHiQuad = i;
      MaxQuad = HiQuad[i];
    }
  }

  // For SSSE3, if all 8 words of the result come from only 1 quadword of each
  // of the two input vectors, shuffle them into one input vector so only a
  // single pshufb instruction is necessary. If there are more than 2 input
  // quads, disable the next transformation since it does not help SSSE3.
  bool V1Used = InputQuads[0] || InputQuads[1];
  bool V2Used = InputQuads[2] || InputQuads[3];
  if (Subtarget->hasSSSE3()) {
    if (InputQuads.count() == 2 && V1Used && V2Used) {
      BestLoQuad = InputQuads[0] ? 0 : 1;
      BestHiQuad = InputQuads[2] ? 2 : 3;
    }
    if (InputQuads.count() > 2) {
      BestLoQuad = -1;
      BestHiQuad = -1;
    }
  }

  // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update
  // the shuffle mask. If a quad is scored as -1, that means that it contains
  // words from all 4 input quadwords.
  SDValue NewV;
  if (BestLoQuad >= 0 || BestHiQuad >= 0) {
    int MaskV[] = {
      BestLoQuad < 0 ? 0 : BestLoQuad,
      BestHiQuad < 0 ? 1 : BestHiQuad
    };
    NewV = DAG.getVectorShuffle(MVT::v2i64, dl,
                  DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1),
                  DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2), &MaskV[0]);
    NewV = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, NewV);

    // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the
    // source words for the shuffle, to aid later transformations.
    bool AllWordsInNewV = true;
    bool InOrder[2] = { true, true };
    for (unsigned i = 0; i != 8; ++i) {
      int idx = MaskVals[i];
      if (idx != (int)i)
        InOrder[i/4] = false;
      if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad)
        continue;
      AllWordsInNewV = false;
      break;
    }
    bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV;
    if (AllWordsInNewV) {
      for (int i = 0; i != 8; ++i) {
        int idx = MaskVals[i];
        if (idx < 0)
          continue;
        idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4;
        if ((idx != i) && idx < 4)
          pshufhw = false;
        if ((idx != i) && idx > 3)
          pshuflw = false;
      }
      V1 = NewV;
      V2Used = false;
      BestLoQuad = 0;
      BestHiQuad = 1;
    }

    // If we've eliminated the use of V2, and the new mask is a pshuflw or
    // pshufhw, that's as cheap as it gets. Return the new shuffle.
    if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) {
      unsigned Opc = pshufhw ? X86ISD::PSHUFHW : X86ISD::PSHUFLW;
      unsigned TargetMask = 0;
      NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV,
                                  DAG.getUNDEF(MVT::v8i16), &MaskVals[0]);
      ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
      TargetMask = pshufhw ? getShufflePSHUFHWImmediate(SVOp):
                             getShufflePSHUFLWImmediate(SVOp);
      V1 = NewV.getOperand(0);
      return getTargetShuffleNode(Opc, dl, MVT::v8i16, V1, TargetMask, DAG);
    }
  }

  // Promote splats to a larger type which usually leads to more efficient code.
  // FIXME: Is this true if pshufb is available?
  if (SVOp->isSplat())
    return PromoteSplat(SVOp, DAG);

  // If we have SSSE3, and all words of the result are from 1 input vector,
  // case 2 is generated, otherwise case 3 is generated. If no SSSE3
  // is present, fall back to case 4.
  if (Subtarget->hasSSSE3()) {
    SmallVector<SDValue,16> pshufbMask;

    // If we have elements from both input vectors, set the high bit of the
    // shuffle mask element to zero out elements that come from V2 in the V1
    // mask, and elements that come from V1 in the V2 mask, so that the two
    // results can be OR'd together.
    bool TwoInputs = V1Used && V2Used;
    V1 = getPSHUFB(MaskVals, V1, dl, DAG);
    if (!TwoInputs)
      return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);

    // Calculate the shuffle mask for the second input, shuffle it, and
    // OR it with the first shuffled input.
    CommuteVectorShuffleMask(MaskVals, 8);
    V2 = getPSHUFB(MaskVals, V2, dl, DAG);
    V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
    return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
  }

  // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order,
  // and update MaskVals with new element order.
  std::bitset<8> InOrder;
  if (BestLoQuad >= 0) {
    int MaskV[] = { -1, -1, -1, -1, 4, 5, 6, 7 };
    for (int i = 0; i != 4; ++i) {
      int idx = MaskVals[i];
      if (idx < 0) {
        InOrder.set(i);
      } else if ((idx / 4) == BestLoQuad) {
        MaskV[i] = idx & 3;
        InOrder.set(i);
      }
    }
    NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
                                &MaskV[0]);

    if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSE2()) {
      ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
      NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16,
                                  NewV.getOperand(0),
                                  getShufflePSHUFLWImmediate(SVOp), DAG);
    }
  }

  // If BestHiQuad >= 0, generate a pshufhw to put the high elements in order,
  // and update MaskVals with the new element order.
  if (BestHiQuad >= 0) {
    int MaskV[] = { 0, 1, 2, 3, -1, -1, -1, -1 };
    for (unsigned i = 4; i != 8; ++i) {
      int idx = MaskVals[i];
      if (idx < 0) {
        InOrder.set(i);
      } else if ((idx / 4) == BestHiQuad) {
        MaskV[i] = (idx & 3) + 4;
        InOrder.set(i);
      }
    }
    NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
                                &MaskV[0]);

    if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSE2()) {
      ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
      NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16,
                                  NewV.getOperand(0),
                                  getShufflePSHUFHWImmediate(SVOp), DAG);
    }
  }

  // In case BestHi & BestLo were both -1, which means each quadword has a word
  // from each of the four input quadwords, calculate the InOrder bitvector now
  // before falling through to the insert/extract cleanup.
  if (BestLoQuad == -1 && BestHiQuad == -1) {
    NewV = V1;
    for (int i = 0; i != 8; ++i)
      if (MaskVals[i] < 0 || MaskVals[i] == i)
        InOrder.set(i);
  }

  // The other elements are put in the right place using pextrw and pinsrw.
  for (unsigned i = 0; i != 8; ++i) {
    if (InOrder[i])
      continue;
    int EltIdx = MaskVals[i];
    if (EltIdx < 0)
      continue;
    SDValue ExtOp = (EltIdx < 8) ?
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1,
                  DAG.getIntPtrConstant(EltIdx)) :
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2,
                  DAG.getIntPtrConstant(EltIdx - 8));
    NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp,
                       DAG.getIntPtrConstant(i));
  }
  return NewV;
}
/// \brief v16i16 shuffles
///
/// FIXME: We only support generation of a single pshufb currently. We can
/// generalize the other applicable cases from LowerVECTOR_SHUFFLEv8i16 as
/// well (e.g. 2 x pshufb + 1 x por).
static SDValue
LowerVECTOR_SHUFFLEv16i16(SDValue Op, SelectionDAG &DAG) {
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
  SDValue V1 = SVOp->getOperand(0);
  SDValue V2 = SVOp->getOperand(1);
  SDLoc dl(SVOp);

  if (V2.getOpcode() != ISD::UNDEF)
    return SDValue();

  SmallVector<int, 16> MaskVals(SVOp->getMask().begin(), SVOp->getMask().end());
  return getPSHUFB(MaskVals, V1, dl, DAG);
}
// v16i8 shuffles - Prefer shuffles in the following order:
// 1. [ssse3] 1 x pshufb
// 2. [ssse3] 2 x pshufb + 1 x por
// 3. [all]   v8i16 shuffle + N x pextrw + rotate + pinsrw
static SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
                                        const X86Subtarget* Subtarget,
                                        SelectionDAG &DAG) {
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  SDValue V1 = SVOp->getOperand(0);
  SDValue V2 = SVOp->getOperand(1);
  SDLoc dl(SVOp);
  ArrayRef<int> MaskVals = SVOp->getMask();

  // Promote splats to a larger type which usually leads to more efficient code.
  // FIXME: Is this true if pshufb is available?
  if (SVOp->isSplat())
    return PromoteSplat(SVOp, DAG);

  // If we have SSSE3, case 1 is generated when all result bytes come from
  // one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is
  // present, fall back to case 3.

  // If SSSE3, use 1 pshufb instruction per vector with elements in the result.
  if (Subtarget->hasSSSE3()) {
    SmallVector<SDValue,16> pshufbMask;

    // If all result elements are from one input vector, then only translate
    // undef mask values to 0x80 (zero out result) in the pshufb mask.
    //
    // Otherwise, we have elements from both input vectors, and must zero out
    // elements that come from V2 in the first mask, and V1 in the second mask
    // so that we can OR them together.
    for (unsigned i = 0; i != 16; ++i) {
      int EltIdx = MaskVals[i];
      if (EltIdx < 0 || EltIdx >= 16)
        EltIdx = 0x80;
      pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
    }
    V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
                     DAG.getNode(ISD::BUILD_VECTOR, dl,
                                 MVT::v16i8, pshufbMask));

    // As PSHUFB will zero elements with negative indices, it's safe to ignore
    // the 2nd operand if it's undefined or zero.
    if (V2.getOpcode() == ISD::UNDEF ||
        ISD::isBuildVectorAllZeros(V2.getNode()))
      return V1;

    // Calculate the shuffle mask for the second input, shuffle it, and
    // OR it with the first shuffled input.
    pshufbMask.clear();
    for (unsigned i = 0; i != 16; ++i) {
      int EltIdx = MaskVals[i];
      EltIdx = (EltIdx < 16) ? 0x80 : EltIdx - 16;
      pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
    }
    V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
                     DAG.getNode(ISD::BUILD_VECTOR, dl,
                                 MVT::v16i8, pshufbMask));
    return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
  }
  // No SSSE3 - Calculate in place words and then fix all out of place words
  // with 0-16 extracts & inserts. Worst case is 16 bytes out of order from
  // the 16 different words that comprise the two doublequadword input vectors.
  V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
  V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2);
  SDValue NewV = V2;
  for (int i = 0; i != 8; ++i) {
    int Elt0 = MaskVals[i*2];
    int Elt1 = MaskVals[i*2+1];

    // This word of the result is all undef, skip it.
    if (Elt0 < 0 && Elt1 < 0)
      continue;

    // This word of the result is already in the correct place, skip it.
    if ((Elt0 == i*2) && (Elt1 == i*2+1))
      continue;

    SDValue Elt0Src = Elt0 < 16 ? V1 : V2;
    SDValue Elt1Src = Elt1 < 16 ? V1 : V2;
    SDValue InsElt;

    // If Elt0 and Elt1 are defined, are consecutive, and can be loaded
    // using a single extract together, load it and store it.
    if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) {
      InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
                           DAG.getIntPtrConstant(Elt1 / 2));
      NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
                         DAG.getIntPtrConstant(i));
      continue;
    }

    // If Elt1 is defined, extract it from the appropriate source. If the
    // source byte is not also odd, shift the extracted word left 8 bits;
    // otherwise clear the bottom 8 bits if we need to do an or.
    if (Elt1 >= 0) {
      InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
                           DAG.getIntPtrConstant(Elt1 / 2));
      if ((Elt1 & 1) == 0)
        InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt,
                             DAG.getConstant(8,
                                 TLI.getShiftAmountTy(InsElt.getValueType())));
      else if (Elt0 >= 0)
        InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt,
                             DAG.getConstant(0xFF00, MVT::i16));
    }
    // If Elt0 is defined, extract it from the appropriate source. If the
    // source byte is not also even, shift the extracted word right 8 bits. If
    // Elt1 was also defined, OR the extracted values together before
    // inserting them in the result.
    if (Elt0 >= 0) {
      SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
                                    Elt0Src, DAG.getIntPtrConstant(Elt0 / 2));
      if ((Elt0 & 1) != 0)
        InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0,
                              DAG.getConstant(8,
                                  TLI.getShiftAmountTy(InsElt0.getValueType())));
      else if (Elt1 >= 0)
        InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0,
                              DAG.getConstant(0x00FF, MVT::i16));
      InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0)
                         : InsElt0;
    }
    NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
                       DAG.getIntPtrConstant(i));
  }
  return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, NewV);
}
// v32i8 shuffles - Translate to VPSHUFB if possible.
static
SDValue LowerVECTOR_SHUFFLEv32i8(ShuffleVectorSDNode *SVOp,
                                 const X86Subtarget *Subtarget,
                                 SelectionDAG &DAG) {
  MVT VT = SVOp->getSimpleValueType(0);
  SDValue V1 = SVOp->getOperand(0);
  SDValue V2 = SVOp->getOperand(1);
  SDLoc dl(SVOp);
  SmallVector<int, 32> MaskVals(SVOp->getMask().begin(), SVOp->getMask().end());

  bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
  bool V1IsAllZero = ISD::isBuildVectorAllZeros(V1.getNode());
  bool V2IsAllZero = ISD::isBuildVectorAllZeros(V2.getNode());

  // VPSHUFB may be generated if
  // (1) one of the input vectors is undefined or a zeroinitializer.
  // The mask value 0x80 puts 0 in the corresponding slot of the vector.
  // And (2) the mask indexes don't cross the 128-bit lane.
  if (VT != MVT::v32i8 || !Subtarget->hasInt256() ||
      (!V2IsUndef && !V2IsAllZero && !V1IsAllZero))
    return SDValue();

  if (V1IsAllZero && !V2IsAllZero) {
    CommuteVectorShuffleMask(MaskVals, 32);
    V1 = V2;
  }
  return getPSHUFB(MaskVals, V1, dl, DAG);
}
/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide
/// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be
/// done when every pair / quad of shuffle mask elements points to elements in
/// the right sequence. e.g.
/// vector_shuffle X, Y, <2, 3, | 10, 11, | 0, 1, | 14, 15>
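/// is rewritten as the v4i32 shuffle vector_shuffle X, Y, <1, 5, 0, 7>.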
static
SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
                                 SelectionDAG &DAG) {
  MVT VT = SVOp->getSimpleValueType(0);
  SDLoc dl(SVOp);
  unsigned NumElems = VT.getVectorNumElements();
  MVT NewVT;
  unsigned Scale;
  switch (VT.SimpleTy) {
  default: llvm_unreachable("Unexpected!");
  case MVT::v2i64:
  case MVT::v2f64:
           return SDValue(SVOp, 0);
  case MVT::v4f32:  NewVT = MVT::v2f64; Scale = 2; break;
  case MVT::v4i32:  NewVT = MVT::v2i64; Scale = 2; break;
  case MVT::v8i16:  NewVT = MVT::v4i32; Scale = 2; break;
  case MVT::v16i8:  NewVT = MVT::v4i32; Scale = 4; break;
  case MVT::v16i16: NewVT = MVT::v8i32; Scale = 2; break;
  case MVT::v32i8:  NewVT = MVT::v8i32; Scale = 4; break;
  }

  SmallVector<int, 8> MaskVec;
  for (unsigned i = 0; i != NumElems; i += Scale) {
    int StartIdx = -1;
    for (unsigned j = 0; j != Scale; ++j) {
      int EltIdx = SVOp->getMaskElt(i+j);
      if (EltIdx < 0)
        continue;
      if (StartIdx < 0)
        StartIdx = (EltIdx / Scale);
      if (EltIdx != (int)(StartIdx*Scale + j))
        return SDValue();
    }
    MaskVec.push_back(StartIdx);
  }

  SDValue V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(0));
  SDValue V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(1));
  return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]);
}
/// getVZextMovL - Return a zero-extending vector move low node.
///
static SDValue getVZextMovL(MVT VT, MVT OpVT,
                            SDValue SrcOp, SelectionDAG &DAG,
                            const X86Subtarget *Subtarget, SDLoc dl) {
  if (VT == MVT::v2f64 || VT == MVT::v4f32) {
    LoadSDNode *LD = nullptr;
    if (!isScalarLoadToVector(SrcOp.getNode(), &LD))
      LD = dyn_cast<LoadSDNode>(SrcOp);

    if (!LD) {
      // movssrr and movsdrr do not clear top bits. Try to use movd, movq
      // instead.
      MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32;
      if ((ExtVT != MVT::i64 || Subtarget->is64Bit()) &&
          SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR &&
          SrcOp.getOperand(0).getOpcode() == ISD::BITCAST &&
          SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) {
        // PR2108
        OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32;
        return DAG.getNode(ISD::BITCAST, dl, VT,
                           DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
                                                   OpVT,
                                                   SrcOp.getOperand(0)
                                                        .getOperand(0))));
      }
    }
  }

  return DAG.getNode(ISD::BITCAST, dl, VT,
                     DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
                                 DAG.getNode(ISD::BITCAST, dl,
                                             OpVT, SrcOp)));
}
/// LowerVECTOR_SHUFFLE_256 - Handle all 256-bit wide vector shuffles
/// which could not be matched by any known target specific shuffle
static SDValue
LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
  SDValue NewOp = Compact8x32ShuffleNode(SVOp, DAG);
  if (NewOp.getNode())
    return NewOp;

  MVT VT = SVOp->getSimpleValueType(0);

  unsigned NumElems = VT.getVectorNumElements();
  unsigned NumLaneElems = NumElems / 2;

  SDLoc dl(SVOp);
  MVT EltVT = VT.getVectorElementType();
  MVT NVT = MVT::getVectorVT(EltVT, NumLaneElems);
  SDValue Output[2];

  SmallVector<int, 16> Mask;
  for (unsigned l = 0; l < 2; ++l) {
    // Build a shuffle mask for the output, discovering on the fly which
    // input vectors to use as shuffle operands (recorded in InputUsed).
    // If building a suitable shuffle vector proves too hard, then bail
    // out with UseBuildVector set.
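    // Example: for the low lane of a v8i32 shuffle with mask <0, 9, 2, 11>,
    // inputs 0 (V1's low half) and 2 (V2's low half) are recorded in
    // InputUsed, and the lane is built as the 128-bit shuffle <0, 5, 2, 7>
    // of those two halves.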
12274 bool UseBuildVector = false;
12275 int InputUsed[2] = { -1, -1 }; // Not yet discovered.
12276 unsigned LaneStart = l * NumLaneElems;
12277 for (unsigned i = 0; i != NumLaneElems; ++i) {
12278 // The mask element. This indexes into the input.
12279 int Idx = SVOp->getMaskElt(i+LaneStart);
12281 // the mask element does not index into any input vector.
12282 Mask.push_back(-1);
12286 // The input vector this mask element indexes into.
12287 int Input = Idx / NumLaneElems;
12289 // Turn the index into an offset from the start of the input vector.
12290 Idx -= Input * NumLaneElems;
12292 // Find or create a shuffle vector operand to hold this input.
12294 for (OpNo = 0; OpNo < array_lengthof(InputUsed); ++OpNo) {
12295 if (InputUsed[OpNo] == Input)
12296 // This input vector is already an operand.
12298 if (InputUsed[OpNo] < 0) {
12299 // Create a new operand for this input vector.
12300 InputUsed[OpNo] = Input;
12305 if (OpNo >= array_lengthof(InputUsed)) {
12306 // More than two input vectors used! Give up on trying to create a
12307 // shuffle vector. Insert all elements into a BUILD_VECTOR instead.
12308 UseBuildVector = true;
12312 // Add the mask index for the new shuffle vector.
12313 Mask.push_back(Idx + OpNo * NumLaneElems);
12316 if (UseBuildVector) {
12317 SmallVector<SDValue, 16> SVOps;
12318 for (unsigned i = 0; i != NumLaneElems; ++i) {
12319 // The mask element. This indexes into the input.
12320 int Idx = SVOp->getMaskElt(i+LaneStart);
12322 SVOps.push_back(DAG.getUNDEF(EltVT));
12326 // The input vector this mask element indexes into.
12327 int Input = Idx / NumElems;
12329 // Turn the index into an offset from the start of the input vector.
12330 Idx -= Input * NumElems;
12332 // Extract the vector element by hand.
12333 SVOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
12334 SVOp->getOperand(Input),
12335 DAG.getIntPtrConstant(Idx)));
12336 }
12338 // Construct the output using a BUILD_VECTOR.
12339 Output[l] = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, SVOps);
12340 } else if (InputUsed[0] < 0) {
12341 // No input vectors were used! The result is undefined.
12342 Output[l] = DAG.getUNDEF(NVT);
12343 } else {
12344 SDValue Op0 = Extract128BitVector(SVOp->getOperand(InputUsed[0] / 2),
12345 (InputUsed[0] % 2) * NumLaneElems,
12346 DAG, dl);
12347 // If only one input was used, use an undefined vector for the other.
12348 SDValue Op1 = (InputUsed[1] < 0) ? DAG.getUNDEF(NVT) :
12349 Extract128BitVector(SVOp->getOperand(InputUsed[1] / 2),
12350 (InputUsed[1] % 2) * NumLaneElems, DAG, dl);
12351 // At least one input vector was used. Create a new shuffle vector.
12352 Output[l] = DAG.getVectorShuffle(NVT, dl, Op0, Op1, &Mask[0]);
12353 }
12355 Mask.clear();
12356 }
12358 // Concatenate the result back
12359 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Output[0], Output[1]);
12360 }
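// Worked example (editor's illustration): for a v8i32 shuffle with mask
// <0,1,8,9,4,5,12,13>, each 128-bit output lane draws from at most two
// 128-bit input halves, so the loop above produces
//   Output[0] = shuffle(V1.lo, V2.lo, <0,1,4,5>)
//   Output[1] = shuffle(V1.hi, V2.hi, <0,1,4,5>)
// and the CONCAT_VECTORS above stitches the lanes back together.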
12362 /// LowerVECTOR_SHUFFLE_128v4 - Handle all 128-bit wide vectors with
12363 /// 4 elements, and match them with several different shuffle types.
12365 LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
12366 SDValue V1 = SVOp->getOperand(0);
12367 SDValue V2 = SVOp->getOperand(1);
12368 SDLoc dl(SVOp);
12369 MVT VT = SVOp->getSimpleValueType(0);
12371 assert(VT.is128BitVector() && "Unsupported vector size");
12373 std::pair<int, int> Locs[4];
12374 int Mask1[] = { -1, -1, -1, -1 };
12375 SmallVector<int, 8> PermMask(SVOp->getMask().begin(), SVOp->getMask().end());
12377 unsigned NumHi = 0;
12378 unsigned NumLo = 0;
12379 for (unsigned i = 0; i != 4; ++i) {
12380 int Idx = PermMask[i];
12381 if (Idx < 0) {
12382 Locs[i] = std::make_pair(-1, -1);
12383 } else {
12384 assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!");
12385 if (Idx < 4) {
12386 Locs[i] = std::make_pair(0, NumLo);
12387 Mask1[NumLo] = Idx;
12388 NumLo++;
12389 } else {
12390 Locs[i] = std::make_pair(1, NumHi);
12392 Mask1[2+NumHi] = Idx;
12393 NumHi++;
12394 }
12395 }
12396 }
12398 if (NumLo <= 2 && NumHi <= 2) {
12399 // If no more than two elements come from either vector, this can be
12400 // implemented with two shuffles. The first shuffle gathers the elements.
12401 // The second shuffle, which takes the first shuffle as both of its
12402 // vector operands, puts the elements into the right order.
12403 V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
12405 int Mask2[] = { -1, -1, -1, -1 };
12407 for (unsigned i = 0; i != 4; ++i)
12408 if (Locs[i].first != -1) {
12409 unsigned Idx = (i < 2) ? 0 : 4;
12410 Idx += Locs[i].first * 2 + Locs[i].second;
12411 Mask2[i] = Idx;
12412 }
12414 return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]);
12417 if (NumLo == 3 || NumHi == 3) {
12418 // Otherwise, we must have three elements from one vector, call it X, and
12419 // one element from the other, call it Y. First, use a shufps to build an
12420 // intermediate vector with the one element from Y and the element from X
12421 // that will be in the same half in the final destination (the indexes don't
12422 // matter). Then, use a shufps to build the final vector, taking the half
12423 // containing the element from Y from the intermediate, and the other half
12424 // from X.
12425 if (NumHi == 3) {
12426 // Normalize it so the 3 elements come from V1.
12427 CommuteVectorShuffleMask(PermMask, 4);
12428 std::swap(V1, V2);
12429 }
12431 // Find the element from V2.
12432 unsigned HiIndex;
12433 for (HiIndex = 0; HiIndex < 3; ++HiIndex) {
12434 int Val = PermMask[HiIndex];
12435 if (Val < 0)
12436 continue;
12437 if (Val >= 4)
12438 break;
12439 }
12441 Mask1[0] = PermMask[HiIndex];
12442 Mask1[1] = -1;
12443 Mask1[2] = PermMask[HiIndex^1];
12444 Mask1[3] = -1;
12445 V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
12447 if (HiIndex >= 2) {
12448 Mask1[0] = PermMask[0];
12449 Mask1[1] = PermMask[1];
12450 Mask1[2] = HiIndex & 1 ? 6 : 4;
12451 Mask1[3] = HiIndex & 1 ? 4 : 6;
12452 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
12455 Mask1[0] = HiIndex & 1 ? 2 : 0;
12456 Mask1[1] = HiIndex & 1 ? 0 : 2;
12457 Mask1[2] = PermMask[2];
12458 Mask1[3] = PermMask[3];
12459 if (Mask1[2] >= 0)
12460 Mask1[2] += 4;
12461 if (Mask1[3] >= 0)
12462 Mask1[3] += 4;
12463 return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]);
12466 // Break it into (shuffle shuffle_hi, shuffle_lo).
12467 int LoMask[] = { -1, -1, -1, -1 };
12468 int HiMask[] = { -1, -1, -1, -1 };
12470 int *MaskPtr = LoMask;
12471 unsigned MaskIdx = 0;
12472 unsigned LoIdx = 0;
12473 unsigned HiIdx = 2;
12474 for (unsigned i = 0; i != 4; ++i) {
12475 if (i == 2) {
12476 MaskPtr = HiMask;
12477 MaskIdx = 1;
12478 LoIdx = 0;
12479 HiIdx = 2;
12480 }
12481 int Idx = PermMask[i];
12482 if (Idx < 0) {
12483 Locs[i] = std::make_pair(-1, -1);
12484 } else if (Idx < 4) {
12485 Locs[i] = std::make_pair(MaskIdx, LoIdx);
12486 MaskPtr[LoIdx] = Idx;
12487 LoIdx++;
12488 } else {
12489 Locs[i] = std::make_pair(MaskIdx, HiIdx);
12490 MaskPtr[HiIdx] = Idx;
12491 HiIdx++;
12492 }
12493 }
12495 SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]);
12496 SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]);
12497 int MaskOps[] = { -1, -1, -1, -1 };
12498 for (unsigned i = 0; i != 4; ++i)
12499 if (Locs[i].first != -1)
12500 MaskOps[i] = Locs[i].first * 4 + Locs[i].second;
12501 return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]);
12502 }
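// Worked example (editor's illustration): for a v4f32 shuffle with mask
// <0,4,1,5>, NumLo and NumHi are both 2, so the first path above applies:
// shuffle V1/V2 with mask <0,1,4,5> to gather T = <V1[0],V1[1],V2[0],V2[1]>,
// then shuffle T against itself with mask <0,2,5,7> to produce
// <V1[0],V2[0],V1[1],V2[1]>, the requested element order.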
12504 static bool MayFoldVectorLoad(SDValue V) {
12505 while (V.hasOneUse() && V.getOpcode() == ISD::BITCAST)
12506 V = V.getOperand(0);
12508 if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR)
12509 V = V.getOperand(0);
12510 if (V.hasOneUse() && V.getOpcode() == ISD::BUILD_VECTOR &&
12511 V.getNumOperands() == 2 && V.getOperand(1).getOpcode() == ISD::UNDEF)
12512 // BUILD_VECTOR (load), undef
12513 V = V.getOperand(0);
12515 return MayFoldLoad(V);
12516 }
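// Editor's note: this accepts single-use chains such as
//   (bitcast (load ...))
//   (bitcast (scalar_to_vector (load ...)))
//   (build_vector (load ...), undef)
// since isel can fold the load into the shuffle's memory operand in each of
// these shapes.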
12519 SDValue getMOVDDup(SDValue &Op, SDLoc &dl, SDValue V1, SelectionDAG &DAG) {
12520 MVT VT = Op.getSimpleValueType();
12522 // Canonicalize to v2f64.
12523 V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1);
12524 return DAG.getNode(ISD::BITCAST, dl, VT,
12525 getTargetShuffleNode(X86ISD::MOVDDUP, dl, MVT::v2f64,
12526 V1, DAG));
12527 }
12530 SDValue getMOVLowToHigh(SDValue &Op, SDLoc &dl, SelectionDAG &DAG,
12531 bool HasSSE2) {
12532 SDValue V1 = Op.getOperand(0);
12533 SDValue V2 = Op.getOperand(1);
12534 MVT VT = Op.getSimpleValueType();
12536 assert(VT != MVT::v2i64 && "unsupported shuffle type");
12538 if (HasSSE2 && VT == MVT::v2f64)
12539 return getTargetShuffleNode(X86ISD::MOVLHPD, dl, VT, V1, V2, DAG);
12541 // v4f32 or v4i32: canonicalize to v4f32 (which is legal for SSE1)
12542 return DAG.getNode(ISD::BITCAST, dl, VT,
12543 getTargetShuffleNode(X86ISD::MOVLHPS, dl, MVT::v4f32,
12544 DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V1),
12545 DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V2), DAG));
12549 SDValue getMOVHighToLow(SDValue &Op, SDLoc &dl, SelectionDAG &DAG) {
12550 SDValue V1 = Op.getOperand(0);
12551 SDValue V2 = Op.getOperand(1);
12552 MVT VT = Op.getSimpleValueType();
12554 assert((VT == MVT::v4i32 || VT == MVT::v4f32) &&
12555 "unsupported shuffle type");
12557 if (V2.getOpcode() == ISD::UNDEF)
12558 V2 = V1;
12561 return getTargetShuffleNode(X86ISD::MOVHLPS, dl, VT, V1, V2, DAG);
12565 SDValue getMOVLP(SDValue &Op, SDLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
12566 SDValue V1 = Op.getOperand(0);
12567 SDValue V2 = Op.getOperand(1);
12568 MVT VT = Op.getSimpleValueType();
12569 unsigned NumElems = VT.getVectorNumElements();
12571 // Use MOVLPS and MOVLPD in case V1 or V2 are loads. During isel, the second
12572 // operand of these instructions is only memory, so check if there's a
12573 // potential load folding here, otherwise use SHUFPS or MOVSD to match the
12574 // movlp masks.
12575 bool CanFoldLoad = false;
12577 // Trivial case, when V2 comes from a load.
12578 if (MayFoldVectorLoad(V2))
12579 CanFoldLoad = true;
12581 // When V1 is a load, it can be folded later into a store in isel, example:
12582 // (store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), addr:$src1)
12584 // (MOVLPSmr addr:$src1, VR128:$src2)
12585 // So, recognize this potential and also use MOVLPS or MOVLPD
12586 else if (MayFoldVectorLoad(V1) && MayFoldIntoStore(Op))
12587 CanFoldLoad = true;
12589 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
12590 if (CanFoldLoad) {
12591 if (HasSSE2 && NumElems == 2)
12592 return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG);
12594 if (NumElems == 4)
12595 // If we don't care about the second element, proceed to use movss.
12596 if (SVOp->getMaskElt(1) != -1)
12597 return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG);
12598 }
12600 // movl and movlp will both match v2i64, but v2i64 is never matched by
12601 // movl earlier because we make it strict to avoid messing with the movlp load
12602 // folding logic (see the code above getMOVLP call). Match it here then,
12603 // this is horrible, but will stay like this until we move all shuffle
12604 // matching to x86 specific nodes. Note that for the 1st condition all
12605 // types are matched with movsd.
12606 if (HasSSE2) {
12607 // FIXME: isMOVLMask should be checked and matched before getMOVLP,
12608 // as to remove this logic from here, as much as possible
12609 if (NumElems == 2 || !isMOVLMask(SVOp->getMask(), VT))
12610 return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
12611 return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
12612 }
12614 assert(VT != MVT::v4i32 && "unsupported shuffle type");
12616 // Invert the operand order and use SHUFPS to match it.
12617 return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V2, V1,
12618 getShuffleSHUFImmediate(SVOp), DAG);
12621 static SDValue NarrowVectorLoadToElement(LoadSDNode *Load, unsigned Index,
12622 SelectionDAG &DAG) {
12623 SDLoc dl(Load);
12624 MVT VT = Load->getSimpleValueType(0);
12625 MVT EVT = VT.getVectorElementType();
12626 SDValue Addr = Load->getOperand(1);
12627 SDValue NewAddr = DAG.getNode(
12628 ISD::ADD, dl, Addr.getSimpleValueType(), Addr,
12629 DAG.getConstant(Index * EVT.getStoreSize(), Addr.getSimpleValueType()));
12631 SDValue NewLoad =
12632 DAG.getLoad(EVT, dl, Load->getChain(), NewAddr,
12633 DAG.getMachineFunction().getMachineMemOperand(
12634 Load->getMemOperand(), 0, EVT.getStoreSize()));
12635 return NewLoad;
12636 }
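// Worked example (editor's illustration): narrowing a v4f32 load at
// Index == 2 turns "load addr" into a scalar f32 "load addr+8"
// (Index * 4-byte store size), with the machine memory operand shrunk to
// the single element.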
12638 // It is only safe to call this function if isINSERTPSMask is true for
12639 // this shufflevector mask.
12640 static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl,
12641 SelectionDAG &DAG) {
12642 // Generate an insertps instruction when inserting an f32 from memory onto a
12643 // v4f32 or when copying a member from one v4f32 to another.
12644 // We also use it for transferring i32 from one register to another,
12645 // since it simply copies the same bits.
12646 // If we're transferring an i32 from memory to a specific element in a
12647 // register, we output a generic DAG that will match the PINSRD
12648 // instruction.
12649 MVT VT = SVOp->getSimpleValueType(0);
12650 MVT EVT = VT.getVectorElementType();
12651 SDValue V1 = SVOp->getOperand(0);
12652 SDValue V2 = SVOp->getOperand(1);
12653 auto Mask = SVOp->getMask();
12654 assert((VT == MVT::v4f32 || VT == MVT::v4i32) &&
12655 "unsupported vector type for insertps/pinsrd");
12657 auto FromV1Predicate = [](const int &i) { return i < 4 && i > -1; };
12658 auto FromV2Predicate = [](const int &i) { return i >= 4; };
12659 int FromV1 = std::count_if(Mask.begin(), Mask.end(), FromV1Predicate);
12661 SDValue From;
12662 SDValue To;
12663 unsigned DestIndex;
12664 if (FromV1 == 1) {
12665 From = V1;
12666 To = V2;
12667 DestIndex = std::find_if(Mask.begin(), Mask.end(), FromV1Predicate) -
12668 Mask.begin();
12670 // If we have 1 element from each vector, we have to check if we're
12671 // changing V1's element's place. If so, we're done. Otherwise, we
12672 // should assume we're changing V2's element's place and behave
12673 // accordingly.
12674 int FromV2 = std::count_if(Mask.begin(), Mask.end(), FromV2Predicate);
12675 assert(DestIndex <= INT32_MAX && "truncated destination index");
12676 if (FromV1 == FromV2 &&
12677 static_cast<int>(DestIndex) == Mask[DestIndex] % 4) {
12678 From = V2;
12679 To = V1;
12680 DestIndex =
12681 std::find_if(Mask.begin(), Mask.end(), FromV2Predicate) - Mask.begin();
12682 }
12683 } else {
12684 assert(std::count_if(Mask.begin(), Mask.end(), FromV2Predicate) == 1 &&
12685 "More than one element from V1 and from V2, or no elements from one "
12686 "of the vectors. This case should not have returned true from "
12687 "isINSERTPSMask");
12688 From = V2;
12689 To = V1;
12690 DestIndex =
12691 std::find_if(Mask.begin(), Mask.end(), FromV2Predicate) - Mask.begin();
12692 }
12694 // Get an index into the source vector in the range [0,4) (the mask is
12695 // in the range [0,8) because it can address V1 and V2)
12696 unsigned SrcIndex = Mask[DestIndex] % 4;
12697 if (MayFoldLoad(From)) {
12698 // Trivial case, when From comes from a load and is only used by the
12699 // shuffle. Make it use insertps from the vector that we need from that
12700 // load.
12701 SDValue NewLoad =
12702 NarrowVectorLoadToElement(cast<LoadSDNode>(From), SrcIndex, DAG);
12703 if (!NewLoad.getNode())
12704 return SDValue();
12706 if (EVT == MVT::f32) {
12707 // Create this as a scalar to vector to match the instruction pattern.
12708 SDValue LoadScalarToVector =
12709 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, NewLoad);
12710 SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4);
12711 return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, LoadScalarToVector,
12712 InsertpsMask);
12713 } else { // EVT == MVT::i32
12714 // If we're getting an i32 from memory, use an INSERT_VECTOR_ELT
12715 // instruction, to match the PINSRD instruction, which loads an i32 to a
12716 // certain vector element.
12717 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, To, NewLoad,
12718 DAG.getConstant(DestIndex, MVT::i32));
12719 }
12720 }
12722 // Vector-element-to-vector
12723 SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4 | SrcIndex << 6);
12724 return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, From, InsertpsMask);
12725 }
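// Editor's sketch of the immediate layout used above (per the Intel SDM
// description of INSERTPS): bits [7:6] select the source element (CountS),
// bits [5:4] the destination slot (CountD), bits [3:0] the zero mask.
// A hypothetical helper would look like:
//   static unsigned makeInsertPSImm(unsigned Src, unsigned Dst) {
//     return (Src << 6) | (Dst << 4); // ZMask left clear
//   }
// e.g. Src == 2, Dst == 1 yields 0x90: copy source element 2 into
// destination element 1, zeroing nothing.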
12727 // Reduce a vector shuffle to zext.
12728 static SDValue LowerVectorIntExtend(SDValue Op, const X86Subtarget *Subtarget,
12729 SelectionDAG &DAG) {
12730 // PMOVZX is only available from SSE41.
12731 if (!Subtarget->hasSSE41())
12732 return SDValue();
12734 MVT VT = Op.getSimpleValueType();
12735 SDLoc DL(Op);
12736 // Only AVX2 supports 256-bit vector integer extending.
12737 if (!Subtarget->hasInt256() && VT.is256BitVector())
12738 return SDValue();
12740 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
12742 SDValue V1 = Op.getOperand(0);
12743 SDValue V2 = Op.getOperand(1);
12744 unsigned NumElems = VT.getVectorNumElements();
12746 // Extending is a unary operation and the element type of the source vector
12747 // won't be equal to or larger than i64.
12748 if (V2.getOpcode() != ISD::UNDEF || !VT.isInteger() ||
12749 VT.getVectorElementType() == MVT::i64)
12750 return SDValue();
12752 // Find the expansion ratio, e.g. expanding from i8 to i32 has a ratio of 4.
12753 unsigned Shift = 1; // Start from 2, i.e. 1 << 1.
12754 while ((1U << Shift) < NumElems) {
12755 if (SVOp->getMaskElt(1U << Shift) == 1)
12756 break;
12757 Shift += 1;
12758 // The maximal ratio is 8, i.e. from i8 to i64.
12759 if (Shift > 3)
12760 return SDValue();
12761 }
12763 // Check the shuffle mask.
12764 unsigned Mask = (1U << Shift) - 1;
12765 for (unsigned i = 0; i != NumElems; ++i) {
12766 int EltIdx = SVOp->getMaskElt(i);
12767 if ((i & Mask) != 0 && EltIdx != -1)
12768 return SDValue();
12769 if ((i & Mask) == 0 && (unsigned)EltIdx != (i >> Shift))
12770 return SDValue();
12771 }
12773 unsigned NBits = VT.getVectorElementType().getSizeInBits() << Shift;
12774 MVT NeVT = MVT::getIntegerVT(NBits);
12775 MVT NVT = MVT::getVectorVT(NeVT, NumElems >> Shift);
12777 if (!DAG.getTargetLoweringInfo().isTypeLegal(NVT))
12778 return SDValue();
12780 return DAG.getNode(ISD::BITCAST, DL, VT,
12781 DAG.getNode(X86ISD::VZEXT, DL, NVT, V1));
12782 }
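// Worked example (editor's illustration): a v8i16 shuffle of V1 with mask
// <0,-1,1,-1,2,-1,3,-1> passes the checks above with Shift == 1 and is
// rewritten as (v8i16 (bitcast (v4i32 (X86ISD::VZEXT v8i16:V1)))), i.e. a
// single PMOVZXWD of the low four elements.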
12784 static SDValue NormalizeVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
12785 SelectionDAG &DAG) {
12786 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
12787 MVT VT = Op.getSimpleValueType();
12788 SDLoc dl(Op);
12789 SDValue V1 = Op.getOperand(0);
12790 SDValue V2 = Op.getOperand(1);
12792 if (isZeroShuffle(SVOp))
12793 return getZeroVector(VT, Subtarget, DAG, dl);
12795 // Handle splat operations
12796 if (SVOp->isSplat()) {
12797 // Use vbroadcast whenever the splat comes from a foldable load
12798 SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG);
12799 if (Broadcast.getNode())
12800 return Broadcast;
12801 }
12803 // Check integer expanding shuffles.
12804 SDValue NewOp = LowerVectorIntExtend(Op, Subtarget, DAG);
12805 if (NewOp.getNode())
12806 return NewOp;
12808 // If the shuffle can be profitably rewritten as a narrower shuffle, then
12809 // do it!
12810 if (VT == MVT::v8i16 || VT == MVT::v16i8 || VT == MVT::v16i16 ||
12811 VT == MVT::v32i8) {
12812 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
12813 if (NewOp.getNode())
12814 return DAG.getNode(ISD::BITCAST, dl, VT, NewOp);
12815 } else if (VT.is128BitVector() && Subtarget->hasSSE2()) {
12816 // FIXME: Figure out a cleaner way to do this.
12817 if (ISD::isBuildVectorAllZeros(V2.getNode())) {
12818 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
12819 if (NewOp.getNode()) {
12820 MVT NewVT = NewOp.getSimpleValueType();
12821 if (isCommutedMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(),
12822 NewVT, true, false))
12823 return getVZextMovL(VT, NewVT, NewOp.getOperand(0), DAG, Subtarget,
12824 dl);
12825 }
12826 } else if (ISD::isBuildVectorAllZeros(V1.getNode())) {
12827 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
12828 if (NewOp.getNode()) {
12829 MVT NewVT = NewOp.getSimpleValueType();
12830 if (isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), NewVT))
12831 return getVZextMovL(VT, NewVT, NewOp.getOperand(1), DAG, Subtarget,
12832 dl);
12833 }
12834 }
12835 }
12837 return SDValue();
12838 }
12840 X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
12841 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
12842 SDValue V1 = Op.getOperand(0);
12843 SDValue V2 = Op.getOperand(1);
12844 MVT VT = Op.getSimpleValueType();
12845 SDLoc dl(Op);
12846 unsigned NumElems = VT.getVectorNumElements();
12847 bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
12848 bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
12849 bool V1IsSplat = false;
12850 bool V2IsSplat = false;
12851 bool HasSSE2 = Subtarget->hasSSE2();
12852 bool HasFp256 = Subtarget->hasFp256();
12853 bool HasInt256 = Subtarget->hasInt256();
12854 MachineFunction &MF = DAG.getMachineFunction();
12855 bool OptForSize =
12856 MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize);
12858 // Check if we should use the experimental vector shuffle lowering. If so,
12859 // delegate completely to that code path.
12860 if (ExperimentalVectorShuffleLowering)
12861 return lowerVectorShuffle(Op, Subtarget, DAG);
12863 assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles");
12865 if (V1IsUndef && V2IsUndef)
12866 return DAG.getUNDEF(VT);
12868 // When we create a shuffle node we put the UNDEF node to second operand,
12869 // but in some cases the first operand may be transformed to UNDEF.
12870 // In this case we should just commute the node.
12871 if (V1IsUndef && !V2IsUndef)
12872 return DAG.getCommutedVectorShuffle(*SVOp);
12874 // Vector shuffle lowering takes 3 steps:
12876 // 1) Normalize the input vectors. Here splats, zeroed vectors, profitable
12877 // narrowing and commutation of operands should be handled.
12878 // 2) Matching of shuffles with known shuffle masks to x86 target specific
12879 //    nodes.
12880 // 3) Rewriting of unmatched masks into new generic shuffle operations,
12881 // so the shuffle can be broken into other shuffles and the legalizer can
12882 // try the lowering again.
12884 // The general idea is that no vector_shuffle operation should be left to
12885 // be matched during isel, all of them must be converted to a target specific
12886 // node here.
12888 // Normalize the input vectors. Here splats, zeroed vectors, profitable
12889 // narrowing and commutation of operands should be handled. The actual code
12890 // doesn't include all of those, work in progress...
12891 SDValue NewOp = NormalizeVectorShuffle(Op, Subtarget, DAG);
12892 if (NewOp.getNode())
12893 return NewOp;
12895 SmallVector<int, 8> M(SVOp->getMask().begin(), SVOp->getMask().end());
12897 // NOTE: isPSHUFDMask can also match both masks below (unpckl_undef and
12898 // unpckh_undef). Only use pshufd if speed is more important than size.
12899 if (OptForSize && isUNPCKL_v_undef_Mask(M, VT, HasInt256))
12900 return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
12901 if (OptForSize && isUNPCKH_v_undef_Mask(M, VT, HasInt256))
12902 return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
12904 if (isMOVDDUPMask(M, VT) && Subtarget->hasSSE3() &&
12905 V2IsUndef && MayFoldVectorLoad(V1))
12906 return getMOVDDup(Op, dl, V1, DAG);
12908 if (isMOVHLPS_v_undef_Mask(M, VT))
12909 return getMOVHighToLow(Op, dl, DAG);
12911 // Used to match splats
12912 if (HasSSE2 && isUNPCKHMask(M, VT, HasInt256) && V2IsUndef &&
12913 (VT == MVT::v2f64 || VT == MVT::v2i64))
12914 return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
12916 if (isPSHUFDMask(M, VT)) {
12917 // The actual implementation will match the mask in the if above and then
12918 // during isel it can match several different instructions, not only pshufd
12919 // as its name says, sad but true, emulate the behavior for now...
12920 if (isMOVDDUPMask(M, VT) && ((VT == MVT::v4f32 || VT == MVT::v2i64)))
12921 return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V1, DAG);
12923 unsigned TargetMask = getShuffleSHUFImmediate(SVOp);
12925 if (HasSSE2 && (VT == MVT::v4f32 || VT == MVT::v4i32))
12926 return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG);
12928 if (HasFp256 && (VT == MVT::v4f32 || VT == MVT::v2f64))
12929 return getTargetShuffleNode(X86ISD::VPERMILPI, dl, VT, V1, TargetMask,
12930 DAG);
12932 return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V1,
12933 TargetMask, DAG);
12934 }
12936 if (isPALIGNRMask(M, VT, Subtarget))
12937 return getTargetShuffleNode(X86ISD::PALIGNR, dl, VT, V1, V2,
12938 getShufflePALIGNRImmediate(SVOp),
12939 DAG);
12941 if (isVALIGNMask(M, VT, Subtarget))
12942 return getTargetShuffleNode(X86ISD::VALIGN, dl, VT, V1, V2,
12943 getShuffleVALIGNImmediate(SVOp),
12944 DAG);
12946 // Check if this can be converted into a logical shift.
12947 bool isLeft = false;
12948 unsigned ShAmt = 0;
12949 SDValue ShVal;
12950 bool isShift = HasSSE2 && isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt);
12951 if (isShift && ShVal.hasOneUse()) {
12952 // If the shifted value has multiple uses, it may be cheaper to use
12953 // v_set0 + movlhps or movhlps, etc.
12954 MVT EltVT = VT.getVectorElementType();
12955 ShAmt *= EltVT.getSizeInBits();
12956 return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
12959 if (isMOVLMask(M, VT)) {
12960 if (ISD::isBuildVectorAllZeros(V1.getNode()))
12961 return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl);
12962 if (!isMOVLPMask(M, VT)) {
12963 if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64))
12964 return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
12966 if (VT == MVT::v4i32 || VT == MVT::v4f32)
12967 return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
12971 // FIXME: fold these into legal mask.
12972 if (isMOVLHPSMask(M, VT) && !isUNPCKLMask(M, VT, HasInt256))
12973 return getMOVLowToHigh(Op, dl, DAG, HasSSE2);
12975 if (isMOVHLPSMask(M, VT))
12976 return getMOVHighToLow(Op, dl, DAG);
12978 if (V2IsUndef && isMOVSHDUPMask(M, VT, Subtarget))
12979 return getTargetShuffleNode(X86ISD::MOVSHDUP, dl, VT, V1, DAG);
12981 if (V2IsUndef && isMOVSLDUPMask(M, VT, Subtarget))
12982 return getTargetShuffleNode(X86ISD::MOVSLDUP, dl, VT, V1, DAG);
12984 if (isMOVLPMask(M, VT))
12985 return getMOVLP(Op, dl, DAG, HasSSE2);
12987 if (ShouldXformToMOVHLPS(M, VT) ||
12988 ShouldXformToMOVLP(V1.getNode(), V2.getNode(), M, VT))
12989 return DAG.getCommutedVectorShuffle(*SVOp);
12991 if (isShift) {
12992 // No better options. Use a vshldq / vsrldq.
12993 MVT EltVT = VT.getVectorElementType();
12994 ShAmt *= EltVT.getSizeInBits();
12995 return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
12996 }
12998 bool Commuted = false;
12999 // FIXME: This should also accept a bitcast of a splat? Be careful, not
13000 // 1,1,1,1 -> v8i16 though.
13001 BitVector UndefElements;
13002 if (auto *BVOp = dyn_cast<BuildVectorSDNode>(V1.getNode()))
13003 if (BVOp->getConstantSplatNode(&UndefElements) && UndefElements.none())
13004 V1IsSplat = true;
13005 if (auto *BVOp = dyn_cast<BuildVectorSDNode>(V2.getNode()))
13006 if (BVOp->getConstantSplatNode(&UndefElements) && UndefElements.none())
13007 V2IsSplat = true;
13009 // Canonicalize the splat or undef, if present, to be on the RHS.
13010 if (!V2IsUndef && V1IsSplat && !V2IsSplat) {
13011 CommuteVectorShuffleMask(M, NumElems);
13012 std::swap(V1, V2);
13013 std::swap(V1IsSplat, V2IsSplat);
13014 Commuted = true;
13015 }
13017 if (isCommutedMOVLMask(M, VT, V2IsSplat, V2IsUndef)) {
13018 // Shuffling low element of v1 into undef, just return v1.
13019 if (V2IsUndef)
13020 return V1;
13021 // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which
13022 // the instruction selector will not match, so get a canonical MOVL with
13023 // swapped operands to undo the commute.
13024 return getMOVL(DAG, dl, VT, V2, V1);
13027 if (isUNPCKLMask(M, VT, HasInt256))
13028 return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
13030 if (isUNPCKHMask(M, VT, HasInt256))
13031 return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
13033 if (V2IsSplat) {
13034 // Normalize mask so all entries that point to V2 point to its first
13035 // element, then try to match unpck{h|l} again. If they match, return a
13036 // new vector_shuffle with the corrected mask.
13037 SmallVector<int, 8> NewMask(M.begin(), M.end());
13038 NormalizeMask(NewMask, NumElems);
13039 if (isUNPCKLMask(NewMask, VT, HasInt256, true))
13040 return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
13041 if (isUNPCKHMask(NewMask, VT, HasInt256, true))
13042 return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
13043 }
13045 if (Commuted) {
13046 // Commute it back and try unpck* again.
13047 // FIXME: this seems wrong.
13048 CommuteVectorShuffleMask(M, NumElems);
13049 std::swap(V1, V2);
13050 std::swap(V1IsSplat, V2IsSplat);
13052 if (isUNPCKLMask(M, VT, HasInt256))
13053 return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
13055 if (isUNPCKHMask(M, VT, HasInt256))
13056 return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
13057 }
13059 // Normalize the node to match x86 shuffle ops if needed
13060 if (!V2IsUndef && (isSHUFPMask(M, VT, /* Commuted */ true)))
13061 return DAG.getCommutedVectorShuffle(*SVOp);
13063 // The checks below are all present in isShuffleMaskLegal, but they are
13064 // inlined here right now to enable us to directly emit target specific
13065 // nodes, and remove one by one until they don't return Op anymore.
13067 if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) &&
13068 SVOp->getSplatIndex() == 0 && V2IsUndef) {
13069 if (VT == MVT::v2f64 || VT == MVT::v2i64)
13070 return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
13073 if (isPSHUFHWMask(M, VT, HasInt256))
13074 return getTargetShuffleNode(X86ISD::PSHUFHW, dl, VT, V1,
13075 getShufflePSHUFHWImmediate(SVOp),
13076 DAG);
13078 if (isPSHUFLWMask(M, VT, HasInt256))
13079 return getTargetShuffleNode(X86ISD::PSHUFLW, dl, VT, V1,
13080 getShufflePSHUFLWImmediate(SVOp),
13081 DAG);
13083 unsigned MaskValue;
13084 if (isBlendMask(M, VT, Subtarget->hasSSE41(), HasInt256, &MaskValue))
13085 return LowerVECTOR_SHUFFLEtoBlend(SVOp, MaskValue, Subtarget, DAG);
13087 if (isSHUFPMask(M, VT))
13088 return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V2,
13089 getShuffleSHUFImmediate(SVOp), DAG);
13091 if (isUNPCKL_v_undef_Mask(M, VT, HasInt256))
13092 return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
13093 if (isUNPCKH_v_undef_Mask(M, VT, HasInt256))
13094 return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
13096 //===--------------------------------------------------------------------===//
13097 // Generate target specific nodes for 128 or 256-bit shuffles only
13098 // supported in the AVX instruction set.
13101 // Handle VMOVDDUPY permutations
13102 if (V2IsUndef && isMOVDDUPYMask(M, VT, HasFp256))
13103 return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG);
13105 // Handle VPERMILPS/D* permutations
13106 if (isVPERMILPMask(M, VT)) {
13107 if ((HasInt256 && VT == MVT::v8i32) || VT == MVT::v16i32)
13108 return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1,
13109 getShuffleSHUFImmediate(SVOp), DAG);
13110 return getTargetShuffleNode(X86ISD::VPERMILPI, dl, VT, V1,
13111 getShuffleSHUFImmediate(SVOp), DAG);
13112 }
13114 unsigned Idx;
13115 if (VT.is512BitVector() && isINSERT64x4Mask(M, VT, &Idx))
13116 return Insert256BitVector(V1, Extract256BitVector(V2, 0, DAG, dl),
13117 Idx*(NumElems/2), DAG, dl);
13119 // Handle VPERM2F128/VPERM2I128 permutations
13120 if (isVPERM2X128Mask(M, VT, HasFp256))
13121 return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1,
13122 V2, getShuffleVPERM2X128Immediate(SVOp), DAG);
13124 if (Subtarget->hasSSE41() && isINSERTPSMask(M, VT))
13125 return getINSERTPS(SVOp, dl, DAG);
13127 unsigned Imm8;
13128 if (V2IsUndef && HasInt256 && isPermImmMask(M, VT, Imm8))
13129 return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1, Imm8, DAG);
13131 if ((V2IsUndef && HasInt256 && VT.is256BitVector() && NumElems == 8) ||
13132 VT.is512BitVector()) {
13133 MVT MaskEltVT = MVT::getIntegerVT(VT.getVectorElementType().getSizeInBits());
13134 MVT MaskVectorVT = MVT::getVectorVT(MaskEltVT, NumElems);
13135 SmallVector<SDValue, 16> permclMask;
13136 for (unsigned i = 0; i != NumElems; ++i) {
13137 permclMask.push_back(DAG.getConstant((M[i]>=0) ? M[i] : 0, MaskEltVT));
13138 }
13140 SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVectorVT, permclMask);
13141 if (V2IsUndef)
13142 // Bitcast is for VPERMPS since mask is v8i32 but node takes v8f32
13143 return DAG.getNode(X86ISD::VPERMV, dl, VT,
13144 DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1);
13145 return DAG.getNode(X86ISD::VPERMV3, dl, VT, V1,
13146 DAG.getNode(ISD::BITCAST, dl, VT, Mask), V2);
13149 //===--------------------------------------------------------------------===//
13150 // Since no target specific shuffle was selected for this generic one,
13151 // lower it into other known shuffles. FIXME: this isn't true yet, but
13152 // this is the plan.
13155 // Handle v8i16 specifically since SSE can do byte extraction and insertion.
13156 if (VT == MVT::v8i16) {
13157 SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(Op, Subtarget, DAG);
13158 if (NewOp.getNode())
13159 return NewOp;
13160 }
13162 if (VT == MVT::v16i16 && HasInt256) {
13163 SDValue NewOp = LowerVECTOR_SHUFFLEv16i16(Op, DAG);
13164 if (NewOp.getNode())
13165 return NewOp;
13166 }
13168 if (VT == MVT::v16i8) {
13169 SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, Subtarget, DAG);
13170 if (NewOp.getNode())
13171 return NewOp;
13172 }
13174 if (VT == MVT::v32i8) {
13175 SDValue NewOp = LowerVECTOR_SHUFFLEv32i8(SVOp, Subtarget, DAG);
13176 if (NewOp.getNode())
13177 return NewOp;
13178 }
13180 // Handle all 128-bit wide vectors with 4 elements, and match them with
13181 // several different shuffle types.
13182 if (NumElems == 4 && VT.is128BitVector())
13183 return LowerVECTOR_SHUFFLE_128v4(SVOp, DAG);
13185 // Handle general 256-bit shuffles
13186 if (VT.is256BitVector())
13187 return LowerVECTOR_SHUFFLE_256(SVOp, DAG);
13189 return SDValue();
13190 }
13192 // This function assumes its argument is a BUILD_VECTOR of constants or
13193 // undef SDNodes, i.e. ISD::isBuildVectorOfConstantSDNodes(BuildVector) is
13194 // true.
13195 static bool BUILD_VECTORtoBlendMask(BuildVectorSDNode *BuildVector,
13196 unsigned &MaskValue) {
13197 MaskValue = 0;
13198 unsigned NumElems = BuildVector->getNumOperands();
13199 // There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
13200 unsigned NumLanes = (NumElems - 1) / 8 + 1;
13201 unsigned NumElemsInLane = NumElems / NumLanes;
13203 // Blend for v16i16 should be symmetric for both lanes.
13204 for (unsigned i = 0; i < NumElemsInLane; ++i) {
13205 SDValue EltCond = BuildVector->getOperand(i);
13206 SDValue SndLaneEltCond =
13207 (NumLanes == 2) ? BuildVector->getOperand(i + NumElemsInLane) : EltCond;
13209 int Lane1Cond = -1, Lane2Cond = -1;
13210 if (isa<ConstantSDNode>(EltCond))
13211 Lane1Cond = !isZero(EltCond);
13212 if (isa<ConstantSDNode>(SndLaneEltCond))
13213 Lane2Cond = !isZero(SndLaneEltCond);
13215 if (Lane1Cond == Lane2Cond || Lane2Cond < 0)
13216 // Lane1Cond != 0, means we want the first argument.
13217 // Lane1Cond == 0, means we want the second argument.
13218 // The encoding of this argument is 0 for the first argument, 1
13219 // for the second. Therefore, invert the condition.
13220 MaskValue |= !Lane1Cond << i;
13221 else if (Lane1Cond < 0)
13222 MaskValue |= !Lane2Cond << i;
13223 else
13224 return false;
13225 }
13227 return true;
13228 }
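// Worked example (editor's illustration): for a v4i32 condition
// <-1, 0, -1, 0> (true, false, true, false) the loop above computes
// MaskValue == 0b1010; a set bit selects the second blend operand, so lanes
// 1 and 3 come from it while lanes 0 and 2 come from the first operand.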
13229 /// \brief Try to lower a VSELECT instruction to an immediate-controlled blend
13230 /// instruction.
13231 static SDValue lowerVSELECTtoBLENDI(SDValue Op, const X86Subtarget *Subtarget,
13232 SelectionDAG &DAG) {
13233 SDValue Cond = Op.getOperand(0);
13234 SDValue LHS = Op.getOperand(1);
13235 SDValue RHS = Op.getOperand(2);
13236 SDLoc dl(Op);
13237 MVT VT = Op.getSimpleValueType();
13238 MVT EltVT = VT.getVectorElementType();
13239 unsigned NumElems = VT.getVectorNumElements();
13241 // There is no blend with immediate in AVX-512.
13242 if (VT.is512BitVector())
13243 return SDValue();
13245 if (!Subtarget->hasSSE41() || EltVT == MVT::i8)
13246 return SDValue();
13247 if (!Subtarget->hasInt256() && VT == MVT::v16i16)
13248 return SDValue();
13250 if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
13251 return SDValue();
13253 // Check the mask for BLEND and build the value.
13254 unsigned MaskValue = 0;
13255 if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue))
13256 return SDValue();
13258 // Convert i32 vectors to floating point if it is not AVX2.
13259 // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors.
13260 MVT BlendVT = VT;
13261 if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) {
13262 BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()),
13263 NumElems);
13264 LHS = DAG.getNode(ISD::BITCAST, dl, BlendVT, LHS);
13265 RHS = DAG.getNode(ISD::BITCAST, dl, BlendVT, RHS);
13266 }
13268 SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, LHS, RHS,
13269 DAG.getConstant(MaskValue, MVT::i32));
13270 return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
13271 }
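// Editor's note: the net effect is that, for example,
//   (vselect (v4i32 <-1,0,-1,0>), LHS, RHS)
// becomes (bitcast (X86ISD::BLENDI (v4f32 LHS'), (v4f32 RHS'), 0b1010)) on
// SSE4.1 targets without AVX2, where LHS' and RHS' are the operands bitcast
// to the floating-point blend type.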
13273 SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
13274 // A vselect where all conditions and data are constants can be optimized into
13275 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
13276 if (ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(0).getNode()) &&
13277 ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(1).getNode()) &&
13278 ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode()))
13279 return SDValue();
13281 SDValue BlendOp = lowerVSELECTtoBLENDI(Op, Subtarget, DAG);
13282 if (BlendOp.getNode())
13283 return BlendOp;
13285 // Some types for vselect were previously set to Expand, not Legal or
13286 // Custom. Return an empty SDValue so we fall-through to Expand, after
13287 // the Custom lowering phase.
13288 MVT VT = Op.getSimpleValueType();
13289 switch (VT.SimpleTy) {
13290 default:
13291 break;
13292 case MVT::v8i16:
13293 case MVT::v16i16:
13294 if (Subtarget->hasBWI() && Subtarget->hasVLX())
13295 break;
13296 return SDValue();
13297 }
13299 // We couldn't create a "Blend with immediate" node.
13300 // This node should still be legal, but we'll have to emit a blendv*
13301 // instruction.
13302 return Op;
13303 }
13305 static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
13306 MVT VT = Op.getSimpleValueType();
13307 SDLoc dl(Op);
13309 if (!Op.getOperand(0).getSimpleValueType().is128BitVector())
13310 return SDValue();
13312 if (VT.getSizeInBits() == 8) {
13313 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
13314 Op.getOperand(0), Op.getOperand(1));
13315 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
13316 DAG.getValueType(VT));
13317 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
13320 if (VT.getSizeInBits() == 16) {
13321 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
13322 // If Idx is 0, it's cheaper to do a move instead of a pextrw.
13323 if (Idx == 0)
13324 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
13325 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
13326 DAG.getNode(ISD::BITCAST, dl,
13327 MVT::v4i32,
13328 Op.getOperand(0)),
13329 Op.getOperand(1)));
13330 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
13331 Op.getOperand(0), Op.getOperand(1));
13332 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
13333 DAG.getValueType(VT));
13334 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
13337 if (VT == MVT::f32) {
13338 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
13339 // the result back to FR32 register. It's only worth matching if the
13340 // result has a single use which is a store or a bitcast to i32. And in
13341 // the case of a store, it's not worth it if the index is a constant 0,
13342 // because a MOVSSmr can be used instead, which is smaller and faster.
13343 if (!Op.hasOneUse())
13344 return SDValue();
13345 SDNode *User = *Op.getNode()->use_begin();
13346 if ((User->getOpcode() != ISD::STORE ||
13347 (isa<ConstantSDNode>(Op.getOperand(1)) &&
13348 cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) &&
13349 (User->getOpcode() != ISD::BITCAST ||
13350 User->getValueType(0) != MVT::i32))
13351 return SDValue();
13352 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
13353 DAG.getNode(ISD::BITCAST, dl, MVT::v4i32,
13354 Op.getOperand(0)),
13355 Op.getOperand(1));
13356 return DAG.getNode(ISD::BITCAST, dl, MVT::f32, Extract);
13359 if (VT == MVT::i32 || VT == MVT::i64) {
13360 // ExtractPS/pextrq works with constant index.
13361 if (isa<ConstantSDNode>(Op.getOperand(1)))
13362 return Op;
13363 }
13364 return SDValue();
13365 }
13367 /// Extract one bit from mask vector, like v16i1 or v8i1.
13368 /// AVX-512 feature.
13370 X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const {
13371 SDValue Vec = Op.getOperand(0);
13372 SDLoc dl(Op);
13373 MVT VecVT = Vec.getSimpleValueType();
13374 SDValue Idx = Op.getOperand(1);
13375 MVT EltVT = Op.getSimpleValueType();
13377 assert((EltVT == MVT::i1) && "Unexpected operands in ExtractBitFromMaskVector");
13378 assert((VecVT.getVectorNumElements() <= 16 || Subtarget->hasBWI()) &&
13379 "Unexpected vector type in ExtractBitFromMaskVector");
13381 // variable index can't be handled in mask registers,
13382 // extend vector to VR512
13383 if (!isa<ConstantSDNode>(Idx)) {
13384 MVT ExtVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32);
13385 SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Vec);
13386 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
13387 ExtVT.getVectorElementType(), Ext, Idx);
13388 return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
13391 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
13392 const TargetRegisterClass* rc = getRegClassFor(VecVT);
13393 if (!Subtarget->hasDQI() && (VecVT.getVectorNumElements() <= 8))
13394 rc = getRegClassFor(MVT::v16i1);
13395 unsigned MaxShift = rc->getSize()*8 - 1;
13396 Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
13397 DAG.getConstant(MaxShift - IdxVal, MVT::i8));
13398 Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec,
13399 DAG.getConstant(MaxShift, MVT::i8));
13400 return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i1, Vec,
13401 DAG.getIntPtrConstant(0));
13402 }
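// Worked example (editor's illustration): extracting bit 3 of a v16i1 mask
// held in a 16-bit register class (MaxShift == 15) shifts the mask left by
// 12 and then right by 15, leaving the requested bit in position 0 with
// everything above it cleared.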
13405 X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
13406 SelectionDAG &DAG) const {
13407 SDLoc dl(Op);
13408 SDValue Vec = Op.getOperand(0);
13409 MVT VecVT = Vec.getSimpleValueType();
13410 SDValue Idx = Op.getOperand(1);
13412 if (Op.getSimpleValueType() == MVT::i1)
13413 return ExtractBitFromMaskVector(Op, DAG);
13415 if (!isa<ConstantSDNode>(Idx)) {
13416 if (VecVT.is512BitVector() ||
13417 (VecVT.is256BitVector() && Subtarget->hasInt256() &&
13418 VecVT.getVectorElementType().getSizeInBits() == 32)) {
13420 MVT MaskEltVT =
13421 MVT::getIntegerVT(VecVT.getVectorElementType().getSizeInBits());
13422 MVT MaskVT = MVT::getVectorVT(MaskEltVT, VecVT.getSizeInBits() /
13423 MaskEltVT.getSizeInBits());
13425 Idx = DAG.getZExtOrTrunc(Idx, dl, MaskEltVT);
13426 SDValue Mask = DAG.getNode(X86ISD::VINSERT, dl, MaskVT,
13427 getZeroVector(MaskVT, Subtarget, DAG, dl),
13428 Idx, DAG.getConstant(0, getPointerTy()));
13429 SDValue Perm = DAG.getNode(X86ISD::VPERMV, dl, VecVT, Mask, Vec);
13430 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(),
13431 Perm, DAG.getConstant(0, getPointerTy()));
13432 }
13433 return SDValue();
13434 }
13436 // If this is a 256-bit vector result, first extract the 128-bit vector and
13437 // then extract the element from the 128-bit vector.
13438 if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
13440 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
13441 // Get the 128-bit vector.
13442 Vec = Extract128BitVector(Vec, IdxVal, DAG, dl);
13443 MVT EltVT = VecVT.getVectorElementType();
13445 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
13447 //if (IdxVal >= NumElems/2)
13448 // IdxVal -= NumElems/2;
13449 IdxVal -= (IdxVal/ElemsPerChunk)*ElemsPerChunk;
13450 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
13451 DAG.getConstant(IdxVal, MVT::i32));
13454 assert(VecVT.is128BitVector() && "Unexpected vector length");
13456 if (Subtarget->hasSSE41()) {
13457 SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG);
13458 if (Res.getNode())
13459 return Res;
13460 }
13462 MVT VT = Op.getSimpleValueType();
13463 // TODO: handle v16i8.
13464 if (VT.getSizeInBits() == 16) {
13465 SDValue Vec = Op.getOperand(0);
13466 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
13467 if (Idx == 0)
13468 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
13469 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
13470 DAG.getNode(ISD::BITCAST, dl,
13471 MVT::v4i32, Vec),
13472 Op.getOperand(1)));
13473 // Transform it so it matches pextrw which produces a 32-bit result.
13474 MVT EltVT = MVT::i32;
13475 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT,
13476 Op.getOperand(0), Op.getOperand(1));
13477 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract,
13478 DAG.getValueType(VT));
13479 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
13482 if (VT.getSizeInBits() == 32) {
13483 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
13484 if (Idx == 0)
13485 return Op;
13487 // SHUFPS the element to the lowest double word, then movss.
13488 int Mask[4] = { static_cast<int>(Idx), -1, -1, -1 };
13489 MVT VVT = Op.getOperand(0).getSimpleValueType();
13490 SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
13491 DAG.getUNDEF(VVT), Mask);
13492 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
13493 DAG.getIntPtrConstant(0));
13496 if (VT.getSizeInBits() == 64) {
13497 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
13498 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
13499 // to match extract_elt for f64.
13500 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
13501 if (Idx == 0)
13502 return Op;
13504 // UNPCKHPD the element to the lowest double word, then movsd.
13505 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
13506 // to a f64mem, the whole operation is folded into a single MOVHPDmr.
13507 int Mask[2] = { 1, -1 };
13508 MVT VVT = Op.getOperand(0).getSimpleValueType();
13509 SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
13510 DAG.getUNDEF(VVT), Mask);
13511 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
13512 DAG.getIntPtrConstant(0));
13513 }
13515 return SDValue();
13516 }
13518 /// Insert one bit to mask vector, like v16i1 or v8i1.
13519 /// AVX-512 feature.
13521 X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
13522 SDLoc dl(Op);
13523 SDValue Vec = Op.getOperand(0);
13524 SDValue Elt = Op.getOperand(1);
13525 SDValue Idx = Op.getOperand(2);
13526 MVT VecVT = Vec.getSimpleValueType();
13528 if (!isa<ConstantSDNode>(Idx)) {
13529 // Non constant index. Extend source and destination,
13530 // insert element and then truncate the result.
13531 MVT ExtVecVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32);
13532 MVT ExtEltVT = (VecVT == MVT::v8i1 ? MVT::i64 : MVT::i32);
13533 SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
13534 DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec),
13535 DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx);
13536 return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
13539 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
13540 SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt);
13541 if (Vec.getOpcode() == ISD::UNDEF)
13542 return DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
13543 DAG.getConstant(IdxVal, MVT::i8));
13544 const TargetRegisterClass* rc = getRegClassFor(VecVT);
13545 unsigned MaxShift = rc->getSize()*8 - 1;
13546 EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
13547 DAG.getConstant(MaxShift, MVT::i8));
13548 EltInVec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, EltInVec,
13549 DAG.getConstant(MaxShift - IdxVal, MVT::i8));
13550 return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
13551 }
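// Worked example (editor's illustration): inserting at IdxVal == 2 into a
// v8i1 vector places the bit at position 0, shifts it up to the top bit,
// shifts it back down to position 2 (clearing all other bits along the
// way), and ORs it into Vec. Note the OR merge assumes the destination bit
// is not already set when a zero is being inserted.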
13553 SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
13554 SelectionDAG &DAG) const {
13555 MVT VT = Op.getSimpleValueType();
13556 MVT EltVT = VT.getVectorElementType();
13557 SDLoc dl(Op);
13558 if (EltVT == MVT::i1)
13559 return InsertBitToMaskVector(Op, DAG);
13562 SDValue N0 = Op.getOperand(0);
13563 SDValue N1 = Op.getOperand(1);
13564 SDValue N2 = Op.getOperand(2);
13565 if (!isa<ConstantSDNode>(N2))
13566 return SDValue();
13567 auto *N2C = cast<ConstantSDNode>(N2);
13568 unsigned IdxVal = N2C->getZExtValue();
13570 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
13571 // into that, and then insert the subvector back into the result.
13572 if (VT.is256BitVector() || VT.is512BitVector()) {
13573 // Get the desired 128-bit vector half.
13574 SDValue V = Extract128BitVector(N0, IdxVal, DAG, dl);
13576 // Insert the element into the desired half.
13577 unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
13578 unsigned IdxIn128 = IdxVal - (IdxVal / NumEltsIn128) * NumEltsIn128;
13580 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
13581 DAG.getConstant(IdxIn128, MVT::i32));
13583 // Insert the changed part back to the 256-bit vector
13584 return Insert128BitVector(N0, V, IdxVal, DAG, dl);
13586 assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
13588 if (Subtarget->hasSSE41()) {
13589 if (EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) {
13590 unsigned Opc;
13591 if (VT == MVT::v8i16) {
13592 Opc = X86ISD::PINSRW;
13593 } else {
13594 assert(VT == MVT::v16i8);
13595 Opc = X86ISD::PINSRB;
13596 }
13598 // Transform it so it matches pinsr{b,w} which expects a GR32 as its second
13599 // argument.
13600 if (N1.getValueType() != MVT::i32)
13601 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
13602 if (N2.getValueType() != MVT::i32)
13603 N2 = DAG.getIntPtrConstant(IdxVal);
13604 return DAG.getNode(Opc, dl, VT, N0, N1, N2);
13607 if (EltVT == MVT::f32) {
13608 // Bits [7:6] of the constant are the source select. This will always be
13609 // zero here. The DAG Combiner may combine an extract_elt index into these
13610 // bits. For example (insert (extract, 3), 2) could be matched by moving
13611 // the '3' into bits [7:6] of X86ISD::INSERTPS.
13614 // Bits [5:4] of the constant are the destination select. This is the
13615 // value of the incoming immediate.
13616 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
13617 // combine either bitwise AND or insert of float 0.0 to set these bits.
13618 N2 = DAG.getIntPtrConstant(IdxVal << 4);
13619 // Create this as a scalar to vector.
13620 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
13621 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
13622 }
13624 if (EltVT == MVT::i32 || EltVT == MVT::i64) {
13625 // PINSR* works with constant index.
13626 return Op;
13627 }
13628 }
13630 if (EltVT == MVT::i8)
13631 return SDValue();
13633 if (EltVT.getSizeInBits() == 16) {
13634 // Transform it so it matches pinsrw which expects a 16-bit value in a GR32
13635 // as its second argument.
13636 if (N1.getValueType() != MVT::i32)
13637 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
13638 if (N2.getValueType() != MVT::i32)
13639 N2 = DAG.getIntPtrConstant(IdxVal);
13640 return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2);
13641 }
13643 return SDValue();
13644 }
13645 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
13646 SDLoc dl(Op);
13647 MVT OpVT = Op.getSimpleValueType();
13649 // If this is a 256-bit vector result, first insert into a 128-bit
13650 // vector and then insert into the 256-bit vector.
13651 if (!OpVT.is128BitVector()) {
13652 // Insert into a 128-bit vector.
13653 unsigned SizeFactor = OpVT.getSizeInBits()/128;
13654 MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
13655 OpVT.getVectorNumElements() / SizeFactor);
13657 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
13659 // Insert the 128-bit vector.
13660 return Insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
13663 if (OpVT == MVT::v1i64 &&
13664 Op.getOperand(0).getValueType() == MVT::i64)
13665 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0));
13667 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
13668 assert(OpVT.is128BitVector() && "Expected an SSE type!");
13669 return DAG.getNode(ISD::BITCAST, dl, OpVT,
13670 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
13671 }
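// Editor's illustration: SCALAR_TO_VECTOR of an i16 into v8i16 becomes an
// ANY_EXTEND to i32, a v4i32 SCALAR_TO_VECTOR (a plain MOVD), and a bitcast
// back to v8i16, since there is no narrower GPR-to-XMM move.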
13673 // Lower a node with an EXTRACT_SUBVECTOR opcode. This may result in
13674 // a simple subregister reference or explicit instructions to grab
13675 // upper bits of a vector.
13676 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
13677 SelectionDAG &DAG) {
13678 SDLoc dl(Op);
13679 SDValue In = Op.getOperand(0);
13680 SDValue Idx = Op.getOperand(1);
13681 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
13682 MVT ResVT = Op.getSimpleValueType();
13683 MVT InVT = In.getSimpleValueType();
13685 if (Subtarget->hasFp256()) {
13686 if (ResVT.is128BitVector() &&
13687 (InVT.is256BitVector() || InVT.is512BitVector()) &&
13688 isa<ConstantSDNode>(Idx)) {
13689 return Extract128BitVector(In, IdxVal, DAG, dl);
13691 if (ResVT.is256BitVector() && InVT.is512BitVector() &&
13692 isa<ConstantSDNode>(Idx)) {
13693 return Extract256BitVector(In, IdxVal, DAG, dl);
13694 }
13695 }
13696 return SDValue();
13697 }
13699 // Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
13700 // simple superregister reference or explicit instructions to insert
13701 // the upper bits of a vector.
13702 static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
13703 SelectionDAG &DAG) {
13704 if (!Subtarget->hasAVX())
13705 return SDValue();
13707 SDLoc dl(Op);
13708 SDValue Vec = Op.getOperand(0);
13709 SDValue SubVec = Op.getOperand(1);
13710 SDValue Idx = Op.getOperand(2);
13712 if (!isa<ConstantSDNode>(Idx))
13713 return SDValue();
13715 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
13716 MVT OpVT = Op.getSimpleValueType();
13717 MVT SubVecVT = SubVec.getSimpleValueType();
13719 // Fold two 16-byte subvector loads into one 32-byte load:
13720 // (insert_subvector (insert_subvector undef, (load addr), 0),
13721 // (load addr + 16), Elts/2)
13723 if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
13724 Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
13725 OpVT.is256BitVector() && SubVecVT.is128BitVector() &&
13726 !Subtarget->isUnalignedMem32Slow()) {
13727 SDValue SubVec2 = Vec.getOperand(1);
13728 if (auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2))) {
13729 if (Idx2->getZExtValue() == 0) {
13730 SDValue Ops[] = { SubVec2, SubVec };
13731 SDValue LD = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false);
13732 if (LD.getNode())
13733 return LD;
13734 }
13735 }
13736 }
13738 if ((OpVT.is256BitVector() || OpVT.is512BitVector()) &&
13739 SubVecVT.is128BitVector())
13740 return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
13742 if (OpVT.is512BitVector() && SubVecVT.is256BitVector())
13743 return Insert256BitVector(Vec, SubVec, IdxVal, DAG, dl);
13745 return SDValue();
13746 }
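// Editor's illustration of the fold above: for a v8f32 result built as
//   (insert_subvector (insert_subvector undef, (load addr), 0),
//                     (load addr + 16), 4)
// the two 16-byte loads are merged into one 32-byte load of addr whenever
// unaligned 32-byte accesses are not slow on the subtarget.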
13748 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
13749 // their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
13750 // one of the above mentioned nodes. It has to be wrapped because otherwise
13751 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
13752 // be used to form addressing mode. These wrapped nodes will be selected
13753 // into MOV32ri.
13755 X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
13756 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
13758 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
13759 // global base reg.
13760 unsigned char OpFlag = 0;
13761 unsigned WrapperKind = X86ISD::Wrapper;
13762 CodeModel::Model M = DAG.getTarget().getCodeModel();
13764 if (Subtarget->isPICStyleRIPRel() &&
13765 (M == CodeModel::Small || M == CodeModel::Kernel))
13766 WrapperKind = X86ISD::WrapperRIP;
13767 else if (Subtarget->isPICStyleGOT())
13768 OpFlag = X86II::MO_GOTOFF;
13769 else if (Subtarget->isPICStyleStubPIC())
13770 OpFlag = X86II::MO_PIC_BASE_OFFSET;
13772 SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(),
13773 CP->getAlignment(),
13774 CP->getOffset(), OpFlag);
13775 SDLoc DL(CP);
13776 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
13777 // With PIC, the address is actually $g + Offset.
13778 if (OpFlag) {
13779 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
13780 DAG.getNode(X86ISD::GlobalBaseReg,
13781 SDLoc(), getPointerTy()),
13782 Result);
13783 }
13785 return Result;
13786 }
13788 SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
13789 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
13791 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
13792 // global base reg.
13793 unsigned char OpFlag = 0;
13794 unsigned WrapperKind = X86ISD::Wrapper;
13795 CodeModel::Model M = DAG.getTarget().getCodeModel();
13797 if (Subtarget->isPICStyleRIPRel() &&
13798 (M == CodeModel::Small || M == CodeModel::Kernel))
13799 WrapperKind = X86ISD::WrapperRIP;
13800 else if (Subtarget->isPICStyleGOT())
13801 OpFlag = X86II::MO_GOTOFF;
13802 else if (Subtarget->isPICStyleStubPIC())
13803 OpFlag = X86II::MO_PIC_BASE_OFFSET;
13805 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(),
13806 OpFlag);
13807 SDLoc DL(JT);
13808 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
13810 // With PIC, the address is actually $g + Offset.
13811 if (OpFlag)
13812 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
13813 DAG.getNode(X86ISD::GlobalBaseReg,
13814 SDLoc(), getPointerTy()),
13815 Result);
13817 return Result;
13818 }
13821 X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
13822 const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
13824 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
13825 // global base reg.
13826 unsigned char OpFlag = 0;
13827 unsigned WrapperKind = X86ISD::Wrapper;
13828 CodeModel::Model M = DAG.getTarget().getCodeModel();
13830 if (Subtarget->isPICStyleRIPRel() &&
13831 (M == CodeModel::Small || M == CodeModel::Kernel)) {
13832 if (Subtarget->isTargetDarwin() || Subtarget->isTargetELF())
13833 OpFlag = X86II::MO_GOTPCREL;
13834 WrapperKind = X86ISD::WrapperRIP;
13835 } else if (Subtarget->isPICStyleGOT()) {
13836 OpFlag = X86II::MO_GOT;
13837 } else if (Subtarget->isPICStyleStubPIC()) {
13838 OpFlag = X86II::MO_DARWIN_NONLAZY_PIC_BASE;
13839 } else if (Subtarget->isPICStyleStubNoDynamic()) {
13840 OpFlag = X86II::MO_DARWIN_NONLAZY;
13843 SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag);
13845 SDLoc DL(Op);
13846 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
13848 // With PIC, the address is actually $g + Offset.
13849 if (DAG.getTarget().getRelocationModel() == Reloc::PIC_ &&
13850 !Subtarget->is64Bit()) {
13851 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
13852 DAG.getNode(X86ISD::GlobalBaseReg,
13853 SDLoc(), getPointerTy()),
13854 Result);
13855 }
13857 // For symbols that require a load from a stub to get the address, emit the
13858 // load.
13859 if (isGlobalStubReference(OpFlag))
13860 Result = DAG.getLoad(getPointerTy(), DL, DAG.getEntryNode(), Result,
13861 MachinePointerInfo::getGOT(), false, false, false, 0);
13863 return Result;
13864 }
13867 X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
13868 // Create the TargetBlockAddressAddress node.
13869 unsigned char OpFlags =
13870 Subtarget->ClassifyBlockAddressReference();
13871 CodeModel::Model M = DAG.getTarget().getCodeModel();
13872 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
13873 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
13875 SDValue Result = DAG.getTargetBlockAddress(BA, getPointerTy(), Offset,
13876 OpFlags);
13877 SDLoc dl(Op);
13878 if (Subtarget->isPICStyleRIPRel() &&
13879 (M == CodeModel::Small || M == CodeModel::Kernel))
13880 Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
13882 Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
13884 // With PIC, the address is actually $g + Offset.
13885 if (isGlobalRelativeToPICBase(OpFlags)) {
13886 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
13887 DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
13888 Result);
13889 }
13891 return Result;
13892 }
13895 X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, SDLoc dl,
13896 int64_t Offset, SelectionDAG &DAG) const {
13897 // Create the TargetGlobalAddress node, folding in the constant
13898 // offset if it is legal.
13899 unsigned char OpFlags =
13900 Subtarget->ClassifyGlobalReference(GV, DAG.getTarget());
13901 CodeModel::Model M = DAG.getTarget().getCodeModel();
13902 SDValue Result;
13903 if (OpFlags == X86II::MO_NO_FLAG &&
13904 X86::isOffsetSuitableForCodeModel(Offset, M)) {
13905 // A direct static reference to a global.
13906 Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset);
13907 Offset = 0;
13908 } else {
13909 Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags);
13910 }
13912 if (Subtarget->isPICStyleRIPRel() &&
13913 (M == CodeModel::Small || M == CodeModel::Kernel))
13914 Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
13916 Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
13918 // With PIC, the address is actually $g + Offset.
13919 if (isGlobalRelativeToPICBase(OpFlags)) {
13920 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
13921 DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
13922 Result);
13923 }
13925 // For globals that require a load from a stub to get the address, emit the
13926 // load.
13927 if (isGlobalStubReference(OpFlags))
13928 Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result,
13929 MachinePointerInfo::getGOT(), false, false, false, 0);
13931 // If there was a non-zero offset that we didn't fold, create an explicit
13932 // addition for it.
13933 if (Offset != 0)
13934 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result,
13935 DAG.getConstant(Offset, getPointerTy()));
13937 return Result;
13938 }
13941 X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
13942 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
13943 int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
13944 return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG);
13948 GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
13949 SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
13950 unsigned char OperandFlags, bool LocalDynamic = false) {
13951 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
13952 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
13954 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
13955 GA->getValueType(0),
13959 X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
13963 SDValue Ops[] = { Chain, TGA, *InFlag };
13964 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
13966 SDValue Ops[] = { Chain, TGA };
13967 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
13970 // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
13971 MFI->setAdjustsStack(true);
13972 MFI->setHasCalls(true);
13974 SDValue Flag = Chain.getValue(1);
13975 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
13978 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
13980 LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
13983 SDLoc dl(GA); // ? function entry point might be better
13984 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
13985 DAG.getNode(X86ISD::GlobalBaseReg,
13986 SDLoc(), PtrVT), InFlag);
13987 InFlag = Chain.getValue(1);
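
  // The TLSADDR node built by GetTLSADDR is eventually emitted as roughly:
  //   leal x@TLSGD(,%ebx,1), %eax
  //   call ___tls_get_addr@PLT   // returns the address of x in %eax
  // which is why EBX must hold the GOT base here.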
  return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
}
// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
static SDValue
LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
                                const EVT PtrVT) {
  return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
                    X86::RAX, X86II::MO_TLSGD);
}

static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
                                           SelectionDAG &DAG,
                                           const EVT PtrVT,
                                           bool is64Bit) {
  SDLoc dl(GA);

  // Get the start address of the TLS block for this module.
  X86MachineFunctionInfo* MFI = DAG.getMachineFunction()
                                   .getInfo<X86MachineFunctionInfo>();
  MFI->incNumLocalDynamicTLSAccesses();

  SDValue Base;
  if (is64Bit) {
    Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
                      X86II::MO_TLSLD, /*LocalDynamic=*/true);
  } else {
    SDValue InFlag;
    SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
        DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
    InFlag = Chain.getValue(1);
    Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
                      X86II::MO_TLSLDM, /*LocalDynamic=*/true);
  }

  // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
  // of Base.

  // Build x@dtpoff.
  unsigned char OperandFlags = X86II::MO_DTPOFF;
  unsigned WrapperKind = X86ISD::Wrapper;
  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
                                           GA->getValueType(0),
                                           GA->getOffset(), OperandFlags);
  SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);

  // Add x@dtpoff with the base.
  return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
}

// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
                                   const EVT PtrVT, TLSModel::Model model,
                                   bool is64Bit, bool isPIC) {
  SDLoc dl(GA);

  // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
  Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
                                                         is64Bit ? 257 : 256));
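  // (In this backend, loads from address space 256 are emitted with a %gs
  // segment override and address space 257 with %fs, so the "null pointer"
  // in the chosen address space is exactly %gs:0 / %fs:0.)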

  SDValue ThreadPointer =
      DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0),
                  MachinePointerInfo(Ptr), false, false, false, 0);

  unsigned char OperandFlags = 0;
  // Most TLS accesses are not RIP relative, even on x86-64. One exception is
  // initialexec.
  unsigned WrapperKind = X86ISD::Wrapper;
  if (model == TLSModel::LocalExec) {
    OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
  } else if (model == TLSModel::InitialExec) {
    if (is64Bit) {
      OperandFlags = X86II::MO_GOTTPOFF;
      WrapperKind = X86ISD::WrapperRIP;
    } else {
      OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
    }
  } else {
    llvm_unreachable("Unexpected model");
  }

  // emit "addl x@ntpoff,%eax" (local exec)
  // or "addl x@indntpoff,%eax" (initial exec)
  // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
  SDValue TGA =
      DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
                                 GA->getOffset(), OperandFlags);
  SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);

  if (model == TLSModel::InitialExec) {
    if (isPIC && !is64Bit) {
      Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
                           DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
                           Offset);
    }

    Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
                         MachinePointerInfo::getGOT(), false, false, false, 0);
  }

  // The address of the thread local variable is the add of the thread
  // pointer with the offset of the variable.
  return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
}

SDValue
X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {

  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
  const GlobalValue *GV = GA->getGlobal();

  if (Subtarget->isTargetELF()) {
    TLSModel::Model model = DAG.getTarget().getTLSModel(GV);

    switch (model) {
      case TLSModel::GeneralDynamic:
        if (Subtarget->is64Bit())
          return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy());
        return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy());
      case TLSModel::LocalDynamic:
        return LowerToTLSLocalDynamicModel(GA, DAG, getPointerTy(),
                                           Subtarget->is64Bit());
      case TLSModel::InitialExec:
      case TLSModel::LocalExec:
        return LowerToTLSExecModel(
            GA, DAG, getPointerTy(), model, Subtarget->is64Bit(),
            DAG.getTarget().getRelocationModel() == Reloc::PIC_);
    }
    llvm_unreachable("Unknown TLS model.");
  }

  if (Subtarget->isTargetDarwin()) {
    // Darwin only has one model of TLS. Lower to that.
    unsigned char OpFlag = 0;
    unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ?
                           X86ISD::WrapperRIP : X86ISD::Wrapper;

    // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
    // global base reg.
    bool PIC32 = (DAG.getTarget().getRelocationModel() == Reloc::PIC_) &&
                 !Subtarget->is64Bit();
    if (PIC32)
      OpFlag = X86II::MO_TLVP_PIC_BASE;
    else
      OpFlag = X86II::MO_TLVP;
    SDLoc DL(Op);
    SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
                                                GA->getValueType(0),
                                                GA->getOffset(), OpFlag);
    SDValue Offset = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);

    // With PIC32, the address is actually $g + Offset.
    if (PIC32)
      Offset = DAG.getNode(ISD::ADD, DL, getPointerTy(),
                           DAG.getNode(X86ISD::GlobalBaseReg,
                                       SDLoc(), getPointerTy()),
                           Offset);

    // Lowering the machine isd will make sure everything is in the right
    // location.
    SDValue Chain = DAG.getEntryNode();
    SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
    SDValue Args[] = { Chain, Offset };
    Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);

    // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
    MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
    MFI->setAdjustsStack(true);

    // And our return value (tls address) is in the standard call return value
    // location.
    unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
    return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy(),
                              Chain.getValue(1));
  }

  if (Subtarget->isTargetKnownWindowsMSVC() ||
      Subtarget->isTargetWindowsGNU()) {
    // Just use the implicit TLS architecture.
    // Need to generate something similar to:
    //   mov     rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
    //                                  ; from TEB
    //   mov     ecx, dword [rel _tls_index]; Load index (from C runtime)
    //   mov     rcx, qword [rdx+rcx*8]
    //   mov     eax, .tls$:tlsvar
    //   [rax+rcx] contains the address
    //   Windows 64bit: gs:0x58
    //   Windows 32bit: fs:__tls_array

    SDLoc dl(GA);
    SDValue Chain = DAG.getEntryNode();

    // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
    // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
    // use its literal value of 0x2C.
    Value *Ptr = Constant::getNullValue(Subtarget->is64Bit()
                                        ? Type::getInt8PtrTy(*DAG.getContext(),
                                                             256)
                                        : Type::getInt32PtrTy(*DAG.getContext(),
                                                              257));

    SDValue TlsArray =
        Subtarget->is64Bit()
            ? DAG.getIntPtrConstant(0x58)
            : (Subtarget->isTargetWindowsGNU()
                   ? DAG.getIntPtrConstant(0x2C)
                   : DAG.getExternalSymbol("_tls_array", getPointerTy()));

    SDValue ThreadPointer =
        DAG.getLoad(getPointerTy(), dl, Chain, TlsArray,
                    MachinePointerInfo(Ptr), false, false, false, 0);

    // Load the _tls_index variable
    SDValue IDX = DAG.getExternalSymbol("_tls_index", getPointerTy());
    if (Subtarget->is64Bit())
      IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, getPointerTy(), Chain,
                           IDX, MachinePointerInfo(), MVT::i32,
                           false, false, false, 0);
    else
      IDX = DAG.getLoad(getPointerTy(), dl, Chain, IDX, MachinePointerInfo(),
                        false, false, false, 0);

    SDValue Scale = DAG.getConstant(Log2_64_Ceil(TD->getPointerSize()),
                                    getPointerTy());
    IDX = DAG.getNode(ISD::SHL, dl, getPointerTy(), IDX, Scale);

    SDValue res = DAG.getNode(ISD::ADD, dl, getPointerTy(), ThreadPointer, IDX);
    res = DAG.getLoad(getPointerTy(), dl, Chain, res, MachinePointerInfo(),
                      false, false, false, 0);

    // Get the offset of start of .tls section
    SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
                                             GA->getValueType(0),
                                             GA->getOffset(), X86II::MO_SECREL);
    SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), TGA);

    // The address of the thread local variable is the add of the thread
    // pointer with the offset of the variable.
    return DAG.getNode(ISD::ADD, dl, getPointerTy(), res, Offset);
  }

  llvm_unreachable("TLS not implemented for this target.");
}

/// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values
/// and take a 2 x i32 value to shift plus a shift amount.
static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
  MVT VT = Op.getSimpleValueType();
  unsigned VTBits = VT.getSizeInBits();
  SDLoc dl(Op);
  bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
  SDValue ShOpLo = Op.getOperand(0);
  SDValue ShOpHi = Op.getOperand(1);
  SDValue ShAmt = Op.getOperand(2);
  // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the
  // generic ISD nodes haven't. Insert an AND to be safe, it's optimized away
  // during isel.
  SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
                                  DAG.getConstant(VTBits - 1, MVT::i8));
  SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
                                     DAG.getConstant(VTBits - 1, MVT::i8))
                       : DAG.getConstant(0, VT);

  SDValue Tmp2, Tmp3;
  if (Op.getOpcode() == ISD::SHL_PARTS) {
    Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
    Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);
  } else {
    Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
    Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
  }

  // If the shift amount is larger or equal than the width of a part we can't
  // rely on the results of shld/shrd. Insert a test and select the appropriate
  // values for large shift amounts.
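  // For example, an i64 SHL_PARTS by 40 on i386: bit 5 of the amount is
  // set, so the CMOVs below yield Lo = 0 and Hi = ShOpLo << (40 & 31)
  // instead of the shld result, matching a true 64-bit shift.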
  SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
                                DAG.getConstant(VTBits, MVT::i8));
  SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
                             AndNode, DAG.getConstant(0, MVT::i8));

  SDValue Hi, Lo;
  SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8);
  SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
  SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };

  if (Op.getOpcode() == ISD::SHL_PARTS) {
    Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
    Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
  } else {
    Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
    Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
  }

  SDValue Ops[2] = { Lo, Hi };
  return DAG.getMergeValues(Ops, dl);
}

SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
                                           SelectionDAG &DAG) const {
  MVT SrcVT = Op.getOperand(0).getSimpleValueType();
  SDLoc dl(Op);

  if (SrcVT.isVector()) {
    if (SrcVT.getVectorElementType() == MVT::i1) {
      MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
      return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
                         DAG.getNode(ISD::SIGN_EXTEND, dl, IntegerVT,
                                     Op.getOperand(0)));
    }
    return SDValue();
  }

  assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
         "Unknown SINT_TO_FP to lower!");

  // These are really Legal; return the operand so the caller accepts it as
  // Legal.
  if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
    return Op;
  if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
      Subtarget->is64Bit()) {
    return Op;
  }

  unsigned Size = SrcVT.getSizeInBits()/8;
  MachineFunction &MF = DAG.getMachineFunction();
  int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false);
  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
  SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
                               StackSlot,
                               MachinePointerInfo::getFixedStack(SSFI),
                               false, false, 0);
  return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
}

SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
                                     SDValue StackSlot,
                                     SelectionDAG &DAG) const {
  // Build the FILD
  SDLoc DL(Op);
  SDVTList Tys;
  bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
  if (useSSE)
    Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
  else
    Tys = DAG.getVTList(Op.getValueType(), MVT::Other);

  unsigned ByteSize = SrcVT.getSizeInBits()/8;

  FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
  MachineMemOperand *MMO;
  if (FI) {
    int SSFI = FI->getIndex();
    MMO = DAG.getMachineFunction()
              .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
                                    MachineMemOperand::MOLoad, ByteSize,
                                    ByteSize);
  } else {
    MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
    StackSlot = StackSlot.getOperand(1);
  }
  SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
  SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
                                                    X86ISD::FILD, DL,
                                           Tys, Ops, SrcVT, MMO);

  if (useSSE) {
    Chain = Result.getValue(1);
    SDValue InFlag = Result.getValue(2);

    // FIXME: Currently the FST is flagged to the FILD_FLAG. This
    // shouldn't be necessary except that RFP cannot be live across
    // multiple blocks. When stackifier is fixed, they can be uncoupled.
    MachineFunction &MF = DAG.getMachineFunction();
    unsigned SSFISize = Op.getValueType().getSizeInBits()/8;
    int SSFI = MF.getFrameInfo()->CreateStackObject(SSFISize, SSFISize, false);
    SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
    Tys = DAG.getVTList(MVT::Other);
    SDValue Ops[] = {
      Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
    };
    MachineMemOperand *MMO =
        DAG.getMachineFunction()
            .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
                                  MachineMemOperand::MOStore, SSFISize,
                                  SSFISize);

    Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
                                    Ops, Op.getValueType(), MMO);
    Result = DAG.getLoad(Op.getValueType(), DL, Chain, StackSlot,
                         MachinePointerInfo::getFixedStack(SSFI),
                         false, false, false, 0);
  }

  return Result;
}

// LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion.
SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
                                               SelectionDAG &DAG) const {
  // This algorithm is not obvious. Here is what we're trying to output:
  /*
     movq       %rax,  %xmm0
     punpckldq  (c0),  %xmm0  // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
     subpd      (c1),  %xmm0  // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
     #ifdef __SSE3__
       haddpd   %xmm0, %xmm0
     #else
       pshufd   $0x4e, %xmm0, %xmm1
       addpd    %xmm1, %xmm0
     #endif
  */
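  //
  // The trick: punpckldq interleaves the two 32-bit halves of the input
  // with the exponent words 0x43300000 and 0x45300000, producing the
  // doubles (2^52 + lo32) and (2^84 + hi32 * 2^32) bit-for-bit. The subpd
  // strips the 2^52 and 2^84 biases, and the horizontal add then computes
  // lo32 + hi32 * 2^32, i.e. the original unsigned 64-bit value.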

  SDLoc dl(Op);
  LLVMContext *Context = DAG.getContext();

  // Build some magic constants.
  static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
  Constant *C0 = ConstantDataVector::get(*Context, CV0);
  SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16);

  SmallVector<Constant*,2> CV1;
  CV1.push_back(
    ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
                                      APInt(64, 0x4330000000000000ULL))));
  CV1.push_back(
    ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
                                      APInt(64, 0x4530000000000000ULL))));
  Constant *C1 = ConstantVector::get(CV1);
  SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16);

  // Load the 64-bit value into an XMM register.
  SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
                            Op.getOperand(0));
  SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
                              MachinePointerInfo::getConstantPool(),
                              false, false, false, 16);
  SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32,
                              DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, XR1),
                              CLod0);

  SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
                              MachinePointerInfo::getConstantPool(),
                              false, false, false, 16);
  SDValue XR2F = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Unpck1);
  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
  SDValue Result;

  if (Subtarget->hasSSE3()) {
    // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
    Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
  } else {
    SDValue S2F = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Sub);
    SDValue Shuffle = getTargetShuffleNode(X86ISD::PSHUFD, dl, MVT::v4i32,
                                           S2F, 0x4E, DAG);
    Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
                         DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Shuffle),
                         Sub);
  }

  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
                     DAG.getIntPtrConstant(0));
}

// LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion.
SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
                                               SelectionDAG &DAG) const {
  SDLoc dl(Op);
  // FP constant to bias correct the final result.
  SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
                                   MVT::f64);
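  // 0x4330000000000000 is the bit pattern of 2^52. OR'ing the zero-extended
  // 32-bit input into the low mantissa bits below yields exactly 2^52 + x,
  // so the final FSUB of the bias recovers x without rounding error.
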
  // Load the 32-bit value into an XMM register.
  SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
                             Op.getOperand(0));

  // Zero out the upper parts of the register.
  Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);

  Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
                     DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Load),
                     DAG.getIntPtrConstant(0));

  // Or the load with the bias.
  SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64,
                           DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
                                                   MVT::v2f64, Load)),
                           DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
                                                   MVT::v2f64, Bias)));
  Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
                   DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or),
                   DAG.getIntPtrConstant(0));

  // Subtract the bias.
  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);

  // Handle final rounding.
  EVT DestVT = Op.getValueType();

  if (DestVT.bitsLT(MVT::f64))
    return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
                       DAG.getIntPtrConstant(0));
  if (DestVT.bitsGT(MVT::f64))
    return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);

  return Sub;
}

static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  // The algorithm is the following:
  // #ifdef __SSE4_1__
  //   uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
  //   uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
  //                                 (uint4) 0x53000000, 0xaa);
  // #else
  //   uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
  //   uint4 hi = (v >> 16) | (uint4) 0x53000000;
  // #endif
  //   float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
  //   return (float4) lo + fhi;
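  //
  // Why the constants work: as floats, 0x4b000000 is 0x1.0p23 (ulp 1) and
  // 0x53000000 is 0x1.0p39 (ulp 0x1.0p16), so 'lo' reads back as
  // 0x1.0p23 + (v & 0xffff) and 'hi' as 0x1.0p39 + (v >> 16) * 0x1.0p16,
  // both exact. The constant 0xD3000080 used below encodes exactly
  // -(0x1.0p39f + 0x1.0p23f); adding it to 'hi' and then adding 'lo'
  // cancels both biases, so the result rounds only once, in the final add.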

  SDLoc DL(Op);
  SDValue V = Op->getOperand(0);
  EVT VecIntVT = V.getValueType();
  bool Is128 = VecIntVT == MVT::v4i32;
  EVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
  // If we convert to something else than the supported type, e.g., to v4f64,
  // abort early.
  if (VecFloatVT != Op->getValueType(0))
    return SDValue();

  unsigned NumElts = VecIntVT.getVectorNumElements();
  assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
         "Unsupported custom type");
  assert(NumElts <= 8 && "The size of the constant array must be fixed");

  // In the #ifdef/#else code, we have in common:
  // - The vector of constants:
  // -- 0x4b000000
  // -- 0x53000000
  // - A shift:
  // -- v >> 16

  // Create the splat vector for 0x4b000000.
  SDValue CstLow = DAG.getConstant(0x4b000000, MVT::i32);
  SDValue CstLowArray[] = {CstLow, CstLow, CstLow, CstLow,
                           CstLow, CstLow, CstLow, CstLow};
  SDValue VecCstLow = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT,
                                  makeArrayRef(&CstLowArray[0], NumElts));
  // Create the splat vector for 0x53000000.
  SDValue CstHigh = DAG.getConstant(0x53000000, MVT::i32);
  SDValue CstHighArray[] = {CstHigh, CstHigh, CstHigh, CstHigh,
                            CstHigh, CstHigh, CstHigh, CstHigh};
  SDValue VecCstHigh = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT,
                                   makeArrayRef(&CstHighArray[0], NumElts));

  // Create the right shift.
  SDValue CstShift = DAG.getConstant(16, MVT::i32);
  SDValue CstShiftArray[] = {CstShift, CstShift, CstShift, CstShift,
                             CstShift, CstShift, CstShift, CstShift};
  SDValue VecCstShift = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT,
                                    makeArrayRef(&CstShiftArray[0], NumElts));
  SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);

  SDValue Low, High;
  if (Subtarget.hasSSE41()) {
    EVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
    // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
    SDValue VecCstLowBitcast =
        DAG.getNode(ISD::BITCAST, DL, VecI16VT, VecCstLow);
    SDValue VecBitcast = DAG.getNode(ISD::BITCAST, DL, VecI16VT, V);
    // Low will be bitcasted right away, so do not bother bitcasting back to
    // its original type.
    Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
                      VecCstLowBitcast, DAG.getConstant(0xaa, MVT::i32));
    // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
    //                             (uint4) 0x53000000, 0xaa);
    SDValue VecCstHighBitcast =
        DAG.getNode(ISD::BITCAST, DL, VecI16VT, VecCstHigh);
    SDValue VecShiftBitcast =
        DAG.getNode(ISD::BITCAST, DL, VecI16VT, HighShift);
    // High will be bitcasted right away, so do not bother bitcasting back to
    // its original type.
    High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
                       VecCstHighBitcast, DAG.getConstant(0xaa, MVT::i32));
  } else {
    SDValue CstMask = DAG.getConstant(0xffff, MVT::i32);
    SDValue VecCstMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT, CstMask,
                                     CstMask, CstMask, CstMask);
    // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
    SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
    Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);

    // uint4 hi = (v >> 16) | (uint4) 0x53000000;
    High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
  }

  // Create the vector constant for -(0x1.0p39f + 0x1.0p23f).
  SDValue CstFAdd = DAG.getConstantFP(
      APFloat(APFloat::IEEEsingle, APInt(32, 0xD3000080)), MVT::f32);
  SDValue CstFAddArray[] = {CstFAdd, CstFAdd, CstFAdd, CstFAdd,
                            CstFAdd, CstFAdd, CstFAdd, CstFAdd};
  SDValue VecCstFAdd = DAG.getNode(ISD::BUILD_VECTOR, DL, VecFloatVT,
                                   makeArrayRef(&CstFAddArray[0], NumElts));

  // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
  SDValue HighBitcast = DAG.getNode(ISD::BITCAST, DL, VecFloatVT, High);
  SDValue FHigh =
      DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd);
  // return (float4) lo + fhi;
  SDValue LowBitcast = DAG.getNode(ISD::BITCAST, DL, VecFloatVT, Low);
  return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
}

SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
                                               SelectionDAG &DAG) const {
  SDValue N0 = Op.getOperand(0);
  MVT SVT = N0.getSimpleValueType();
  SDLoc dl(Op);

  switch (SVT.SimpleTy) {
  default:
    llvm_unreachable("Custom UINT_TO_FP is not supported!");
  case MVT::v4i8:
  case MVT::v4i16:
  case MVT::v8i8:
  case MVT::v8i16: {
    MVT NVT = MVT::getVectorVT(MVT::i32, SVT.getVectorNumElements());
    return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
                       DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0));
  }
  case MVT::v4i32:
  case MVT::v8i32:
    return lowerUINT_TO_FP_vXi32(Op, DAG, *Subtarget);
  }
  llvm_unreachable(nullptr);
}

SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDValue N0 = Op.getOperand(0);
  SDLoc dl(Op);

  if (Op.getValueType().isVector())
    return lowerUINT_TO_FP_vec(Op, DAG);

  // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
  // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
  // the optimization here.
  if (DAG.SignBitIsZero(N0))
    return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);

  MVT SrcVT = N0.getSimpleValueType();
  MVT DstVT = Op.getSimpleValueType();
  if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
    return LowerUINT_TO_FP_i64(Op, DAG);
  if (SrcVT == MVT::i32 && X86ScalarSSEf64)
    return LowerUINT_TO_FP_i32(Op, DAG);
  if (Subtarget->is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
    return SDValue();

  // Make a 64-bit buffer, and use it to build an FILD.
  SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
  if (SrcVT == MVT::i32) {
    SDValue WordOff = DAG.getConstant(4, getPointerTy());
    SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl,
                                     getPointerTy(), StackSlot, WordOff);
    SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
                                  StackSlot, MachinePointerInfo(),
                                  false, false, 0);
    SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32),
                                  OffsetSlot, MachinePointerInfo(),
                                  false, false, 0);
    SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
    return Fild;
  }

  assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
                               StackSlot, MachinePointerInfo(),
                               false, false, 0);
  // For i64 source, we need to add the appropriate power of 2 if the input
  // was negative. This is the same as the optimization in
  // DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
  // we must be careful to do the computation in x87 extended precision, not
  // in SSE. (The generic code can't know it's OK to do this, or how to.)
  int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
  MachineMemOperand *MMO =
      DAG.getMachineFunction()
          .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
                                MachineMemOperand::MOLoad, 8, 8);

  SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
  SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
  SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
                                         MVT::i64, MMO);

  APInt FF(32, 0x5F800000ULL);
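
  // 0x5F800000 is 2^64 as an IEEE single. The FILD above read the stored
  // i64 as signed, so a negative reading means the original unsigned value
  // was off by exactly -2^64; the select below adds back either this fudge
  // factor or 0.0, in x87 extended precision.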

  // Check whether the sign bit is set.
  SDValue SignSet = DAG.getSetCC(dl,
                                 getSetCCResultType(*DAG.getContext(), MVT::i64),
                                 Op.getOperand(0), DAG.getConstant(0, MVT::i64),
                                 ISD::SETLT);

  // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
  SDValue FudgePtr = DAG.getConstantPool(
      ConstantInt::get(*DAG.getContext(), FF.zext(64)),
      getPointerTy());

  // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
  SDValue Zero = DAG.getIntPtrConstant(0);
  SDValue Four = DAG.getIntPtrConstant(4);
  SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet,
                               Zero, Four);
  FudgePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(), FudgePtr, Offset);

  // Load the value out, extending it from f32 to f80.
  // FIXME: Avoid the extend by constructing the right constant pool?
  SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(),
                                 FudgePtr, MachinePointerInfo::getConstantPool(),
                                 MVT::f32, false, false, false, 4);
  // Extend everything to 80 bits to force it to be done on x87.
  SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
  return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0));
}

std::pair<SDValue,SDValue>
X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
                                   bool IsSigned, bool IsReplace) const {
  SDLoc DL(Op);

  EVT DstTy = Op.getValueType();

  if (!IsSigned && !isIntegerTypeFTOL(DstTy)) {
    assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
    DstTy = MVT::i64;
  }

  assert(DstTy.getSimpleVT() <= MVT::i64 &&
         DstTy.getSimpleVT() >= MVT::i16 &&
         "Unknown FP_TO_INT to lower!");

  // These are really Legal.
  if (DstTy == MVT::i32 &&
      isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
    return std::make_pair(SDValue(), SDValue());
  if (Subtarget->is64Bit() &&
      DstTy == MVT::i64 &&
      isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
    return std::make_pair(SDValue(), SDValue());

  // We lower FP->int64 either into FISTP64 followed by a load from a temporary
  // stack slot, or into the FTOL runtime function.
  MachineFunction &MF = DAG.getMachineFunction();
  unsigned MemSize = DstTy.getSizeInBits()/8;
  int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());

  unsigned Opc;
  if (!IsSigned && isIntegerTypeFTOL(DstTy))
    Opc = X86ISD::WIN_FTOL;
  else
    switch (DstTy.getSimpleVT().SimpleTy) {
    default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
    case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
    case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
    case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
    }

  SDValue Chain = DAG.getEntryNode();
  SDValue Value = Op.getOperand(0);
  EVT TheVT = Op.getOperand(0).getValueType();
  // FIXME This causes a redundant load/store if the SSE-class value is already
  // in memory, such as if it is on the callstack.
  if (isScalarFPTypeInSSEReg(TheVT)) {
    assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
    Chain = DAG.getStore(Chain, DL, Value, StackSlot,
                         MachinePointerInfo::getFixedStack(SSFI),
                         false, false, 0);
    SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
    SDValue Ops[] = {
      Chain, StackSlot, DAG.getValueType(TheVT)
    };

    MachineMemOperand *MMO =
        MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
                                MachineMemOperand::MOLoad, MemSize, MemSize);
    Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO);
    Chain = Value.getValue(1);
    SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
    StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
  }

  MachineMemOperand *MMO =
      MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
                              MachineMemOperand::MOStore, MemSize, MemSize);

  if (Opc != X86ISD::WIN_FTOL) {
    // Build the FP_TO_INT*_IN_MEM
    SDValue Ops[] = { Chain, Value, StackSlot };
    SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
                                           Ops, DstTy, MMO);
    return std::make_pair(FIST, StackSlot);
  } else {
    SDValue ftol = DAG.getNode(X86ISD::WIN_FTOL, DL,
                               DAG.getVTList(MVT::Other, MVT::Glue),
                               Chain, Value);
    SDValue eax = DAG.getCopyFromReg(ftol, DL, X86::EAX,
                                     MVT::i32, ftol.getValue(1));
    SDValue edx = DAG.getCopyFromReg(eax.getValue(1), DL, X86::EDX,
                                     MVT::i32, eax.getValue(2));
    SDValue Ops[] = { eax, edx };
    SDValue pair = IsReplace
      ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops)
      : DAG.getMergeValues(Ops, DL);
    return std::make_pair(pair, SDValue());
  }
}

static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
                              const X86Subtarget *Subtarget) {
  MVT VT = Op->getSimpleValueType(0);
  SDValue In = Op->getOperand(0);
  MVT InVT = In.getSimpleValueType();
  SDLoc dl(Op);

  // Optimize vectors in AVX mode:
  //
  //   v8i16 -> v8i32
  //   Use vpunpcklwd for 4 lower elements  v8i16 -> v4i32.
  //   Use vpunpckhwd for 4 upper elements  v8i16 -> v4i32.
  //   Concat upper and lower parts.
  //
  //   v4i32 -> v4i64
  //   Use vpunpckldq for 4 lower elements  v4i32 -> v2i64.
  //   Use vpunpckhdq for 4 upper elements  v4i32 -> v2i64.
  //   Concat upper and lower parts.
  //
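  // Note: for ANY_EXTEND the new high bits are don't-care, so the unpacks
  // below pair the input with undef; ZERO_EXTEND pairs it with a zero
  // vector so the upper half of each widened element becomes 0.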

  if (((VT != MVT::v16i16) || (InVT != MVT::v16i8)) &&
      ((VT != MVT::v8i32) || (InVT != MVT::v8i16)) &&
      ((VT != MVT::v4i64) || (InVT != MVT::v4i32)))
    return SDValue();

  if (Subtarget->hasInt256())
    return DAG.getNode(X86ISD::VZEXT, dl, VT, In);

  SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl);
  SDValue Undef = DAG.getUNDEF(InVT);
  bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND;
  SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
  SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);

  MVT HVT = MVT::getVectorVT(VT.getVectorElementType(),
                             VT.getVectorNumElements()/2);

  OpLo = DAG.getNode(ISD::BITCAST, dl, HVT, OpLo);
  OpHi = DAG.getNode(ISD::BITCAST, dl, HVT, OpHi);

  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
}

static SDValue LowerZERO_EXTEND_AVX512(SDValue Op,
                                       SelectionDAG &DAG) {
  MVT VT = Op->getSimpleValueType(0);
  SDValue In = Op->getOperand(0);
  MVT InVT = In.getSimpleValueType();
  SDLoc DL(Op);
  unsigned int NumElts = VT.getVectorNumElements();
  if (NumElts != 8 && NumElts != 16)
    return SDValue();

  if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1)
    return DAG.getNode(X86ISD::VZEXT, DL, VT, In);

  EVT ExtVT = (NumElts == 8)? MVT::v8i64 : MVT::v16i32;
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  // Now we have only mask extension
  assert(InVT.getVectorElementType() == MVT::i1);
  SDValue Cst = DAG.getTargetConstant(1, ExtVT.getScalarType());
  const Constant *C = (dyn_cast<ConstantSDNode>(Cst))->getConstantIntValue();
  SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
  unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
  SDValue Ld = DAG.getLoad(Cst.getValueType(), DL, DAG.getEntryNode(), CP,
                           MachinePointerInfo::getConstantPool(),
                           false, false, false, Alignment);

  SDValue Brcst = DAG.getNode(X86ISD::VBROADCASTM, DL, ExtVT, In, Ld);
  if (VT.is512BitVector())
    return Brcst;
  return DAG.getNode(X86ISD::VTRUNC, DL, VT, Brcst);
}

static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
                               SelectionDAG &DAG) {
  if (Subtarget->hasFp256()) {
    SDValue Res = LowerAVXExtend(Op, DAG, Subtarget);
    if (Res.getNode())
      return Res;
  }

  return SDValue();
}

static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
                                SelectionDAG &DAG) {
  SDLoc DL(Op);
  MVT VT = Op.getSimpleValueType();
  SDValue In = Op.getOperand(0);
  MVT SVT = In.getSimpleValueType();

  if (VT.is512BitVector() || SVT.getVectorElementType() == MVT::i1)
    return LowerZERO_EXTEND_AVX512(Op, DAG);

  if (Subtarget->hasFp256()) {
    SDValue Res = LowerAVXExtend(Op, DAG, Subtarget);
    if (Res.getNode())
      return Res;
  }

  assert(!VT.is256BitVector() || !SVT.is128BitVector() ||
         VT.getVectorNumElements() != SVT.getVectorNumElements());
  return SDValue();
}

SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  MVT VT = Op.getSimpleValueType();
  SDValue In = Op.getOperand(0);
  MVT InVT = In.getSimpleValueType();

  if (VT == MVT::i1) {
    assert((InVT.isInteger() && (InVT.getSizeInBits() <= 64)) &&
           "Invalid scalar TRUNCATE operation");
    if (InVT.getSizeInBits() >= 32)
      return SDValue();
    In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In);
    return DAG.getNode(ISD::TRUNCATE, DL, VT, In);
  }
  assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
         "Invalid TRUNCATE operation");

  if (InVT.is512BitVector() || VT.getVectorElementType() == MVT::i1) {
    if (VT.getVectorElementType().getSizeInBits() >= 8)
      return DAG.getNode(X86ISD::VTRUNC, DL, VT, In);

    assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type");
    unsigned NumElts = InVT.getVectorNumElements();
    assert((NumElts == 8 || NumElts == 16) && "Unexpected vector type");
    if (InVT.getSizeInBits() < 512) {
      MVT ExtVT = (NumElts == 16)? MVT::v16i32 : MVT::v8i64;
      In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
      InVT = ExtVT;
    }
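
    // Truncating to a vector of i1 keeps only bit 0 of each element: AND
    // with a broadcast 1 isolates that bit, and TESTM then sets mask bit i
    // exactly when element i of (And & And) is nonzero.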
    SDValue Cst = DAG.getTargetConstant(1, InVT.getVectorElementType());
    const Constant *C = (dyn_cast<ConstantSDNode>(Cst))->getConstantIntValue();
    SDValue CP = DAG.getConstantPool(C, getPointerTy());
    unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
    SDValue Ld = DAG.getLoad(Cst.getValueType(), DL, DAG.getEntryNode(), CP,
                             MachinePointerInfo::getConstantPool(),
                             false, false, false, Alignment);
    SDValue OneV = DAG.getNode(X86ISD::VBROADCAST, DL, InVT, Ld);
    SDValue And = DAG.getNode(ISD::AND, DL, InVT, OneV, In);
    return DAG.getNode(X86ISD::TESTM, DL, VT, And, And);
  }

  if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
    // On AVX2, v4i64 -> v4i32 becomes VPERMD.
    if (Subtarget->hasInt256()) {
      static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
      In = DAG.getNode(ISD::BITCAST, DL, MVT::v8i32, In);
      In = DAG.getVectorShuffle(MVT::v8i32, DL, In, DAG.getUNDEF(MVT::v8i32),
                                ShufMask);
      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
                         DAG.getIntPtrConstant(0));
    }

    SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
                               DAG.getIntPtrConstant(0));
    SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
                               DAG.getIntPtrConstant(2));
    OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpLo);
    OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpHi);
    static const int ShufMask[] = {0, 2, 4, 6};
    return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
  }

  if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
    // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
    if (Subtarget->hasInt256()) {
      In = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, In);

      SmallVector<SDValue,32> pshufbMask;
      for (unsigned i = 0; i < 2; ++i) {
        pshufbMask.push_back(DAG.getConstant(0x0, MVT::i8));
        pshufbMask.push_back(DAG.getConstant(0x1, MVT::i8));
        pshufbMask.push_back(DAG.getConstant(0x4, MVT::i8));
        pshufbMask.push_back(DAG.getConstant(0x5, MVT::i8));
        pshufbMask.push_back(DAG.getConstant(0x8, MVT::i8));
        pshufbMask.push_back(DAG.getConstant(0x9, MVT::i8));
        pshufbMask.push_back(DAG.getConstant(0xc, MVT::i8));
        pshufbMask.push_back(DAG.getConstant(0xd, MVT::i8));
        for (unsigned j = 0; j < 8; ++j)
          pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
      }
      SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, pshufbMask);
      In = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, In, BV);
      In = DAG.getNode(ISD::BITCAST, DL, MVT::v4i64, In);

      static const int ShufMask[] = {0, 2, -1, -1};
      In = DAG.getVectorShuffle(MVT::v4i64, DL, In, DAG.getUNDEF(MVT::v4i64),
                                &ShufMask[0]);
      In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
                       DAG.getIntPtrConstant(0));
      return DAG.getNode(ISD::BITCAST, DL, VT, In);
    }

    SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
                               DAG.getIntPtrConstant(0));

    SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
                               DAG.getIntPtrConstant(4));

    OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, OpLo);
    OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, OpHi);

    // The PSHUFB mask:
    static const int ShufMask1[] = {0,  1,  4,  5,  8,  9, 12, 13,
                                    -1, -1, -1, -1, -1, -1, -1, -1};

    SDValue Undef = DAG.getUNDEF(MVT::v16i8);
    OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, Undef, ShufMask1);
    OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, Undef, ShufMask1);

    OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpLo);
    OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpHi);

    // The MOVLHPS Mask:
    static const int ShufMask2[] = {0, 1, 4, 5};
    SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
    return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, res);
  }

  // Handle truncation of V256 to V128 using shuffles.
  if (!VT.is128BitVector() || !InVT.is256BitVector())
    return SDValue();

  assert(Subtarget->hasFp256() && "256-bit vector without AVX!");

  unsigned NumElems = VT.getVectorNumElements();
  MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2);

  SmallVector<int, 16> MaskVec(NumElems * 2, -1);
  // Prepare truncation shuffle mask
  for (unsigned i = 0; i != NumElems; ++i)
    MaskVec[i] = i * 2;
  SDValue V = DAG.getVectorShuffle(NVT, DL,
                                   DAG.getNode(ISD::BITCAST, DL, NVT, In),
                                   DAG.getUNDEF(NVT), &MaskVec[0]);
  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
                     DAG.getIntPtrConstant(0));
}

SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op,
                                           SelectionDAG &DAG) const {
  assert(!Op.getSimpleValueType().isVector());

  std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
    /*IsSigned=*/ true, /*IsReplace=*/ false);
  SDValue FIST = Vals.first, StackSlot = Vals.second;
  // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
  if (!FIST.getNode()) return Op;

  if (StackSlot.getNode())
    // Load the result.
    return DAG.getLoad(Op.getValueType(), SDLoc(Op),
                       FIST, StackSlot, MachinePointerInfo(),
                       false, false, false, 0);

  // The node is the result.
  return FIST;
}

SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op,
                                           SelectionDAG &DAG) const {
  std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
    /*IsSigned=*/ false, /*IsReplace=*/ false);
  SDValue FIST = Vals.first, StackSlot = Vals.second;
  assert(FIST.getNode() && "Unexpected failure");

  if (StackSlot.getNode())
    // Load the result.
    return DAG.getLoad(Op.getValueType(), SDLoc(Op),
                       FIST, StackSlot, MachinePointerInfo(),
                       false, false, false, 0);

  // The node is the result.
  return FIST;
}

static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
  SDLoc DL(Op);
  MVT VT = Op.getSimpleValueType();
  SDValue In = Op.getOperand(0);
  MVT SVT = In.getSimpleValueType();

  assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");

  return DAG.getNode(X86ISD::VFPEXT, DL, VT,
                     DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
                                 In, DAG.getUNDEF(SVT)));
}

/// The only differences between FABS and FNEG are the mask and the logic op.
/// FNEG also has a folding opportunity for FNEG(FABS(x)).
static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
  assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
         "Wrong opcode for lowering FABS or FNEG.");

  bool IsFABS = (Op.getOpcode() == ISD::FABS);

  // If this is a FABS and it has an FNEG user, bail out to fold the combination
  // into an FNABS. We'll lower the FABS after that if it is still in use.
  if (IsFABS)
    for (SDNode *User : Op->uses())
      if (User->getOpcode() == ISD::FNEG)
        return Op;

  SDValue Op0 = Op.getOperand(0);
  bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);

  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();
  // Assume scalar op for initialization; update for vector if needed.
  // Note that there are no scalar bitwise logical SSE/AVX instructions, so we
  // generate a 16-byte vector constant and logic op even for the scalar case.
  // Using a 16-byte mask allows folding the load of the mask with
  // the logic op, so it can save (~4 bytes) on code size.
  MVT EltVT = VT;
  unsigned NumElts = VT == MVT::f64 ? 2 : 4;
  // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
  // decide if we should generate a 16-byte constant mask when we only need 4 or
  // 8 bytes for the scalar case.
  if (VT.isVector()) {
    EltVT = VT.getVectorElementType();
    NumElts = VT.getVectorNumElements();
  }

  unsigned EltBits = EltVT.getSizeInBits();
  LLVMContext *Context = DAG.getContext();
  // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
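  // (e.g., for f32: FABS ANDs with 0x7fffffff to clear the sign bit, FNEG
  // XORs with 0x80000000 to flip it, and the folded FNABS ORs with
  // 0x80000000 to set it.)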
  APInt MaskElt =
    IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignBit(EltBits);
  Constant *C = ConstantInt::get(*Context, MaskElt);
  C = ConstantVector::getSplat(NumElts, C);
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy());
  unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
  SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
                             MachinePointerInfo::getConstantPool(),
                             false, false, false, Alignment);

  if (VT.isVector()) {
    // For a vector, cast operands to a vector type, perform the logic op,
    // and cast the result back to the original value type.
    MVT VecVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
    SDValue MaskCasted = DAG.getNode(ISD::BITCAST, dl, VecVT, Mask);
    SDValue Operand = IsFNABS ?
      DAG.getNode(ISD::BITCAST, dl, VecVT, Op0.getOperand(0)) :
      DAG.getNode(ISD::BITCAST, dl, VecVT, Op0);
    unsigned BitOp = IsFABS ? ISD::AND : IsFNABS ? ISD::OR : ISD::XOR;
    return DAG.getNode(ISD::BITCAST, dl, VT,
                       DAG.getNode(BitOp, dl, VecVT, Operand, MaskCasted));
  }

  // If not vector, then scalar.
  unsigned BitOp = IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
  SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
  return DAG.getNode(BitOp, dl, VT, Operand, Mask);
}

static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  LLVMContext *Context = DAG.getContext();
  SDValue Op0 = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();
  MVT SrcVT = Op1.getSimpleValueType();

  // If second operand is smaller, extend it first.
  if (SrcVT.bitsLT(VT)) {
    Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1);
    SrcVT = VT;
  }
  // And if it is bigger, shrink it first.
  if (SrcVT.bitsGT(VT)) {
    Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1));
    SrcVT = VT;
  }

  // At this point the operands and the result should have the same
  // type, and that won't be f80 since that is not custom lowered.

  const fltSemantics &Sem =
      VT == MVT::f64 ? APFloat::IEEEdouble : APFloat::IEEEsingle;
  const unsigned SizeInBits = VT.getSizeInBits();

  SmallVector<Constant *, 4> CV(
      VT == MVT::f64 ? 2 : 4,
      ConstantFP::get(*Context, APFloat(Sem, APInt(SizeInBits, 0))));

  // First, clear all bits but the sign bit from the second operand (sign).
  CV[0] = ConstantFP::get(*Context,
                          APFloat(Sem, APInt::getHighBitsSet(SizeInBits, 1)));
  Constant *C = ConstantVector::get(CV);
  SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(), 16);
  SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx,
                              MachinePointerInfo::getConstantPool(),
                              false, false, false, 16);
  SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1);

  // Next, clear the sign bit from the first operand (magnitude).
  // If it's a constant, we can clear it here.
  if (ConstantFPSDNode *Op0CN = dyn_cast<ConstantFPSDNode>(Op0)) {
    APFloat APF = Op0CN->getValueAPF();
    // If the magnitude is a positive zero, the sign bit alone is enough.
    if (APF.isPosZero())
      return SignBit;
    APF.clearSign();
    CV[0] = ConstantFP::get(*Context, APF);
  } else {
    CV[0] = ConstantFP::get(
        *Context,
        APFloat(Sem, APInt::getLowBitsSet(SizeInBits, SizeInBits - 1)));
  }
  C = ConstantVector::get(CV);
  CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(), 16);
  SDValue Val = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
                            MachinePointerInfo::getConstantPool(),
                            false, false, false, 16);
  // If the magnitude operand wasn't a constant, we need to AND out the sign.
  if (!isa<ConstantFPSDNode>(Op0))
    Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Val);

  // OR the magnitude value with the sign bit.
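  // For instance, copysign(1.0, -2.0): SignBit holds just the sign of -2.0,
  // Val holds the magnitude 1.0, and the FOR below combines them into -1.0.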
  return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit);
}

static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
  SDValue N0 = Op.getOperand(0);
  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();

  // Lower ISD::FGETSIGN to (AND (X86ISD::FGETSIGNx86 ...) 1).
  SDValue xFGETSIGN = DAG.getNode(X86ISD::FGETSIGNx86, dl, VT, N0,
                                  DAG.getConstant(1, VT));
  return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, VT));
}

// Check whether an OR'd tree is PTEST-able.
static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget *Subtarget,
                                      SelectionDAG &DAG) {
  assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");

  if (!Subtarget->hasSSE41())
    return SDValue();

  if (!Op->hasOneUse())
    return SDValue();

  SDNode *N = Op.getNode();
  SDLoc DL(N);

  SmallVector<SDValue, 8> Opnds;
  DenseMap<SDValue, unsigned> VecInMap;
  SmallVector<SDValue, 8> VecIns;
  EVT VT = MVT::Other;

  // Recognize a special case where a vector is casted into wide integer to
  // test all 0s.
  Opnds.push_back(N->getOperand(0));
  Opnds.push_back(N->getOperand(1));

  for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
    SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
    // BFS traverse all OR'd operands.
    if (I->getOpcode() == ISD::OR) {
      Opnds.push_back(I->getOperand(0));
      Opnds.push_back(I->getOperand(1));
      // Re-evaluate the number of nodes to be traversed.
      e += 2; // 2 more nodes (LHS and RHS) are pushed.
      continue;
    }

    // Quit if a non-EXTRACT_VECTOR_ELT
    if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      return SDValue();

    // Quit if without a constant index.
    SDValue Idx = I->getOperand(1);
    if (!isa<ConstantSDNode>(Idx))
      return SDValue();

    SDValue ExtractedFromVec = I->getOperand(0);
    DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);
    if (M == VecInMap.end()) {
      VT = ExtractedFromVec.getValueType();
      // Quit if not 128/256-bit vector.
      if (!VT.is128BitVector() && !VT.is256BitVector())
        return SDValue();
      // Quit if not the same type.
      if (VecInMap.begin() != VecInMap.end() &&
          VT != VecInMap.begin()->first.getValueType())
        return SDValue();
      M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
      VecIns.push_back(ExtractedFromVec);
    }
    M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
  }

  assert((VT.is128BitVector() || VT.is256BitVector()) &&
         "Not extracted from 128-/256-bit vector.");

  unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;

  for (DenseMap<SDValue, unsigned>::const_iterator
       I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
    // Quit if not all elements are used.
    if (I->second != FullMask)
      return SDValue();
  }

  EVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;

  // Cast all vectors into TestVT for PTEST.
  for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
    VecIns[i] = DAG.getNode(ISD::BITCAST, DL, TestVT, VecIns[i]);

  // If more than one full vectors are evaluated, OR them first before PTEST.
  for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
    // Each iteration will OR 2 nodes and append the result until there is only
    // 1 node left, i.e. the final OR'd value of all vectors.
    SDValue LHS = VecIns[Slot];
    SDValue RHS = VecIns[Slot + 1];
    VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
  }
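
  // PTEST sets ZF when the AND of its two operands is all zeroes; testing
  // the OR'd vector against itself therefore makes ZF mean "every element
  // is zero", which is what the COND_E/COND_NE users of this value check.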
  return DAG.getNode(X86ISD::PTEST, DL, MVT::i32,
                     VecIns.back(), VecIns.back());
}

15347 /// \brief return true if \c Op has a use that doesn't just read flags.
15348 static bool hasNonFlagsUse(SDValue Op) {
15349 for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
15351 SDNode *User = *UI;
15352 unsigned UOpNo = UI.getOperandNo();
15353 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
15354 // Look pass truncate.
15355 UOpNo = User->use_begin().getOperandNo();
15356 User = *User->use_begin();
15359 if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
15360 !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
15366 /// Emit nodes that will be selected as "test Op0,Op0", or something
15368 SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl,
15369 SelectionDAG &DAG) const {
15370 if (Op.getValueType() == MVT::i1) {
15371 SDValue ExtOp = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, Op);
15372 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, ExtOp,
15373 DAG.getConstant(0, MVT::i8));
15375 // CF and OF aren't always set the way we want. Determine which
15376 // of these we need.
15377 bool NeedCF = false;
15378 bool NeedOF = false;
15381 case X86::COND_A: case X86::COND_AE:
15382 case X86::COND_B: case X86::COND_BE:
15385 case X86::COND_G: case X86::COND_GE:
15386 case X86::COND_L: case X86::COND_LE:
15387 case X86::COND_O: case X86::COND_NO: {
15388 // Check if we really need to set the
15389 // Overflow flag. If NoSignedWrap is present
15390 // that is not actually needed.
15391 switch (Op->getOpcode()) {
15396 const BinaryWithFlagsSDNode *BinNode =
15397 cast<BinaryWithFlagsSDNode>(Op.getNode());
15398 if (BinNode->hasNoSignedWrap())
15408 // See if we can use the EFLAGS value from the operand instead of
15409 // doing a separate TEST. TEST always sets OF and CF to 0, so unless
15410 // we prove that the arithmetic won't overflow, we can't use OF or CF.
15411 if (Op.getResNo() != 0 || NeedOF || NeedCF) {
15412 // Emit a CMP with 0, which is the TEST pattern.
15413 //if (Op.getValueType() == MVT::i1)
15414 // return DAG.getNode(X86ISD::CMP, dl, MVT::i1, Op,
15415 // DAG.getConstant(0, MVT::i1));
15416 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
15417 DAG.getConstant(0, Op.getValueType()));
15419 unsigned Opcode = 0;
15420 unsigned NumOperands = 0;
15422 // Truncate operations may prevent the merge of the SETCC instruction
15423 // and the arithmetic instruction before it. Attempt to truncate the operands
15424 // of the arithmetic instruction and use a reduced bit-width instruction.
15425 bool NeedTruncation = false;
15426 SDValue ArithOp = Op;
15427 if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
15428 SDValue Arith = Op->getOperand(0);
15429 // Both the trunc and the arithmetic op need to have one user each.
15430 if (Arith->hasOneUse())
15431 switch (Arith.getOpcode()) {
15438 NeedTruncation = true;
15444 // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
15445 // which may be the result of a CAST. We use the variable 'Op', which is the
15446 // non-casted variable when we check for possible users.
15447 switch (ArithOp.getOpcode()) {
15449 // Due to an isel shortcoming, be conservative if this add is likely to be
15450 // selected as part of a load-modify-store instruction. When the root node
15451 // in a match is a store, isel doesn't know how to remap non-chain non-flag
15452 // uses of other nodes in the match, such as the ADD in this case. This
15453 // leads to the ADD being left around and reselected, with the result being
15454 // two adds in the output. Alas, even if none our users are stores, that
15455 // doesn't prove we're O.K. Ergo, if we have any parents that aren't
15456 // CopyToReg or SETCC, eschew INC/DEC. A better fix seems to require
15457 // climbing the DAG back to the root, and it doesn't seem to be worth the
15459 for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
15460 UE = Op.getNode()->use_end(); UI != UE; ++UI)
15461 if (UI->getOpcode() != ISD::CopyToReg &&
15462 UI->getOpcode() != ISD::SETCC &&
15463 UI->getOpcode() != ISD::STORE)
15466 if (ConstantSDNode *C =
15467 dyn_cast<ConstantSDNode>(ArithOp.getNode()->getOperand(1))) {
15468 // An add of one will be selected as an INC.
15469 if (C->getAPIntValue() == 1 && !Subtarget->slowIncDec()) {
15470 Opcode = X86ISD::INC;
15475 // An add of negative one (subtract of one) will be selected as a DEC.
15476 if (C->getAPIntValue().isAllOnesValue() && !Subtarget->slowIncDec()) {
15477 Opcode = X86ISD::DEC;
15483 // Otherwise use a regular EFLAGS-setting add.
15484 Opcode = X86ISD::ADD;
15489 // If we have a constant logical shift that's only used in a comparison
15490 // against zero turn it into an equivalent AND. This allows turning it into
15491 // a TEST instruction later.
    if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) && Op->hasOneUse() &&
        isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) {
      EVT VT = Op.getValueType();
      unsigned BitWidth = VT.getSizeInBits();
      unsigned ShAmt = Op->getConstantOperandVal(1);
      if (ShAmt >= BitWidth) // Avoid undefined shifts.
        break;
      APInt Mask = ArithOp.getOpcode() == ISD::SRL
                       ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt)
                       : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);
      if (!Mask.isSignedIntN(32)) // Avoid large immediates.
        break;
      SDValue New = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0),
                                DAG.getConstant(Mask, VT));
      DAG.ReplaceAllUsesWith(Op, New);
      Op = New;
    }
    break;

  case ISD::AND:
    // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
    // because a TEST instruction will be better.
    if (!hasNonFlagsUse(Op))
      break;
    // FALL THROUGH
  case ISD::SUB:
  case ISD::OR:
  case ISD::XOR:
    // Due to the ISEL shortcoming noted above, be conservative if this op is
    // likely to be selected as part of a load-modify-store instruction.
    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
         UE = Op.getNode()->use_end(); UI != UE; ++UI)
      if (UI->getOpcode() == ISD::STORE)
        goto default_case;

    // Otherwise use a regular EFLAGS-setting instruction.
    switch (ArithOp.getOpcode()) {
    default: llvm_unreachable("unexpected operator!");
    case ISD::SUB: Opcode = X86ISD::SUB; break;
    case ISD::XOR: Opcode = X86ISD::XOR; break;
    case ISD::AND: Opcode = X86ISD::AND; break;
    case ISD::OR: {
      if (!NeedTruncation && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
        SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG);
        if (EFLAGS.getNode())
          return EFLAGS;
      }
      Opcode = X86ISD::OR;
      break;
    }
    }

    NumOperands = 2;
    break;
  case X86ISD::ADD:
  case X86ISD::SUB:
  case X86ISD::INC:
  case X86ISD::DEC:
  case X86ISD::OR:
  case X86ISD::XOR:
  case X86ISD::AND:
    return SDValue(Op.getNode(), 1);
  default:
  default_case:
    break;
  }

  // If we found that truncation is beneficial, perform the truncation and
  // update 'Op'.
  if (NeedTruncation) {
    EVT VT = Op.getValueType();
    SDValue WideVal = Op->getOperand(0);
    EVT WideVT = WideVal.getValueType();
    unsigned ConvertedOp = 0;
    // Use a target machine opcode to prevent further DAGCombine
    // optimizations that may separate the arithmetic operations
    // from the setcc node.
    switch (WideVal.getOpcode()) {
    default: break;
    case ISD::ADD: ConvertedOp = X86ISD::ADD; break;
    case ISD::SUB: ConvertedOp = X86ISD::SUB; break;
    case ISD::AND: ConvertedOp = X86ISD::AND; break;
    case ISD::OR:  ConvertedOp = X86ISD::OR;  break;
    case ISD::XOR: ConvertedOp = X86ISD::XOR; break;
    }

    if (ConvertedOp) {
      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
      if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
        SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
        SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
        Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1);
      }
    }
  }

  if (Opcode == 0)
    // Emit a CMP with 0, which is the TEST pattern.
    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
                       DAG.getConstant(0, Op.getValueType()));

  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
  SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);

  SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
  DAG.ReplaceAllUsesWith(Op, New);
  return SDValue(New.getNode(), 1);
}

/// Emit nodes that will be selected as "cmp Op0,Op1", or something
/// equivalent.
SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
                                   SDLoc dl, SelectionDAG &DAG) const {
  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1)) {
    if (C->getAPIntValue() == 0)
      return EmitTest(Op0, X86CC, dl, DAG);

    if (Op0.getValueType() == MVT::i1)
      llvm_unreachable("Unexpected comparison operation for MVT::i1 operands");
  }

  if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
       Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
    // Do the comparison at i32 if it's smaller, besides the Atom case.
    // This avoids subregister aliasing issues. Keep the smaller reference
    // if we're optimizing for size, however, as that'll allow better folding
    // of memory operations.
    if (Op0.getValueType() != MVT::i32 && Op0.getValueType() != MVT::i64 &&
        !DAG.getMachineFunction().getFunction()->hasFnAttribute(
            Attribute::MinSize) &&
        !Subtarget->isAtom()) {
      unsigned ExtendOp =
          isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
      Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0);
      Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1);
    }
    // Use SUB instead of CMP to enable CSE between SUB and CMP.
    SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
    SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1);
    return SDValue(Sub.getNode(), 1);
  }
  return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
}

/// Convert a comparison if required by the subtarget.
SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
                                                 SelectionDAG &DAG) const {
  // If the subtarget does not support the FUCOMI instruction, floating-point
  // comparisons have to be converted.
  if (Subtarget->hasCMov() ||
      Cmp.getOpcode() != X86ISD::CMP ||
      !Cmp.getOperand(0).getValueType().isFloatingPoint() ||
      !Cmp.getOperand(1).getValueType().isFloatingPoint())
    return Cmp;

  // The instruction selector will select an FUCOM instruction instead of
  // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS.
  // Hence build an SDNode sequence that transfers the result from FPSW
  // into EFLAGS:
  // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
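  // After the 8-bit shift, the FPSW condition bits C0, C2 and C3 land in the
  // low byte exactly where SAHF expects CF, PF and ZF, so the FP result
  // becomes testable with the usual unsigned-compare condition codes.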
  SDLoc dl(Cmp);
  SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
  SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
  SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
                            DAG.getConstant(8, MVT::i8));
  SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
  return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
}

/// The minimum architected relative accuracy is 2^-12. We need one
/// Newton-Raphson step to have a good float result (24 bits of precision).
SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op,
                                            DAGCombinerInfo &DCI,
                                            unsigned &RefinementSteps,
                                            bool &UseOneConstNR) const {
  // FIXME: We should use instruction latency models to calculate the cost of
  // each potential sequence, but this is very hard to do reliably because
  // at least Intel's Core* chips have variable timing based on the number of
  // significant digits in the divisor and/or sqrt operand.
  if (!Subtarget->useSqrtEst())
    return SDValue();

  EVT VT = Op.getValueType();

  // SSE1 has rsqrtss and rsqrtps.
  // TODO: Add support for AVX512 (v16f32).
  // It is likely not profitable to do this for f64 because a double-precision
  // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
  // instructions: convert to single, rsqrtss, convert back to double, refine
  // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the
  // ISA along with FMA, this could be a throughput win.
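  // For reference, one Newton-Raphson step refines an estimate E of
  // 1/sqrt(X) as E' = E * (1.5 - 0.5 * X * E * E).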
  if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
      (Subtarget->hasAVX() && VT == MVT::v8f32)) {
    RefinementSteps = 1;
    UseOneConstNR = false;
    return DCI.DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
  }
  return SDValue();
}

/// The minimum architected relative accuracy is 2^-12. We need one
/// Newton-Raphson step to have a good float result (24 bits of precision).
SDValue X86TargetLowering::getRecipEstimate(SDValue Op,
                                            DAGCombinerInfo &DCI,
                                            unsigned &RefinementSteps) const {
  // FIXME: We should use instruction latency models to calculate the cost of
  // each potential sequence, but this is very hard to do reliably because
  // at least Intel's Core* chips have variable timing based on the number of
  // significant digits in the divisor.
  if (!Subtarget->useReciprocalEst())
    return SDValue();

  EVT VT = Op.getValueType();

  // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
  // TODO: Add support for AVX512 (v16f32).
  // It is likely not profitable to do this for f64 because a double-precision
  // reciprocal estimate with refinement on x86 prior to FMA requires
  // 15 instructions: convert to single, rcpss, convert back to double, refine
  // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
  // along with FMA, this could be a throughput win.
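  // For reference, one Newton-Raphson step refines an estimate E of 1/X as
  // E' = E * (2 - X * E).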
  if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
      (Subtarget->hasAVX() && VT == MVT::v8f32)) {
    RefinementSteps = ReciprocalEstimateRefinementSteps;
    return DCI.DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
  }
  return SDValue();
}

static bool isAllOnes(SDValue V) {
  ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
  return C && C->isAllOnesValue();
}

/// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT
/// node if possible.
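/// For example, (X & (1 << N)) == 0 becomes (BT X, N) read through SETAE,
/// and the != 0 form is read through SETB.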
SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
                                     SDLoc dl, SelectionDAG &DAG) const {
  SDValue Op0 = And.getOperand(0);
  SDValue Op1 = And.getOperand(1);
  if (Op0.getOpcode() == ISD::TRUNCATE)
    Op0 = Op0.getOperand(0);
  if (Op1.getOpcode() == ISD::TRUNCATE)
    Op1 = Op1.getOperand(0);

  SDValue LHS, RHS;
  if (Op1.getOpcode() == ISD::SHL)
    std::swap(Op0, Op1);
  if (Op0.getOpcode() == ISD::SHL) {
    if (ConstantSDNode *And00C = dyn_cast<ConstantSDNode>(Op0.getOperand(0)))
      if (And00C->getZExtValue() == 1) {
        // If we looked past a truncate, check that it's only truncating away
        // known zeros.
        unsigned BitWidth = Op0.getValueSizeInBits();
        unsigned AndBitWidth = And.getValueSizeInBits();
        if (BitWidth > AndBitWidth) {
          APInt Zeros, Ones;
          DAG.computeKnownBits(Op0, Zeros, Ones);
          if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth)
            return SDValue();
        }
        LHS = Op1;
        RHS = Op0.getOperand(1);
      }
  } else if (Op1.getOpcode() == ISD::Constant) {
    ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
    uint64_t AndRHSVal = AndRHS->getZExtValue();
    SDValue AndLHS = Op0;

    if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
      LHS = AndLHS.getOperand(0);
      RHS = AndLHS.getOperand(1);
    }

    // Use BT if the immediate can't be encoded in a TEST instruction.
    if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) {
      LHS = AndLHS;
      RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), LHS.getValueType());
    }
  }

  if (LHS.getNode()) {
    // If LHS is i8, promote it to i32 with any_extend. There is no i8 BT
    // instruction. Since the shift amount is in-range-or-undefined, we know
    // that doing a bittest on the i32 value is ok. We extend to i32 because
    // the encoding for the i16 version is larger than the i32 version.
    // Also promote i16 to i32 for performance / code size reasons.
    if (LHS.getValueType() == MVT::i8 ||
        LHS.getValueType() == MVT::i16)
      LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);

    // If the operand types disagree, extend the shift amount to match. Since
    // BT ignores high bits (like shifts) we can use anyextend.
    if (LHS.getValueType() != RHS.getValueType())
      RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS);

    SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS);
    X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
    return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
                       DAG.getConstant(Cond, MVT::i8), BT);
  }

  return SDValue();
}

/// \brief Turns an ISD::CondCode into a value suitable for SSE floating point
/// mask CMPs.
static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
                              SDValue &Op1) {
  unsigned SSECC;
  bool Swap = false;

  // SSE Condition code mapping:
  //  0 - EQ,  1 - LT,  2 - LE,  3 - UNORD,
  //  4 - NEQ, 5 - NLT, 6 - NLE, 7 - ORD
  switch (SetCCOpcode) {
  default: llvm_unreachable("Unexpected SETCC condition");
  case ISD::SETOEQ:
  case ISD::SETEQ:  SSECC = 0; break;
  case ISD::SETOGT:
  case ISD::SETGT:  Swap = true; // Fallthrough
  case ISD::SETLT:
  case ISD::SETOLT: SSECC = 1; break;
  case ISD::SETOGE:
  case ISD::SETGE:  Swap = true; // Fallthrough
  case ISD::SETLE:
  case ISD::SETOLE: SSECC = 2; break;
  case ISD::SETUO:  SSECC = 3; break;
  case ISD::SETUNE:
  case ISD::SETNE:  SSECC = 4; break;
  case ISD::SETULE: Swap = true; // Fallthrough
  case ISD::SETUGE: SSECC = 5; break;
  case ISD::SETULT: Swap = true; // Fallthrough
  case ISD::SETUGT: SSECC = 6; break;
  case ISD::SETO:   SSECC = 7; break;
  case ISD::SETUEQ:
  case ISD::SETONE: SSECC = 8; break;
  }
  if (Swap)
    std::swap(Op0, Op1);

  return SSECC;
}

// Lower256IntVSETCC - Break a 256-bit integer VSETCC into two new 128-bit
// ones, and then concatenate the result back.
static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();

  assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
         "Unsupported value type for operation");

  unsigned NumElems = VT.getVectorNumElements();
  SDLoc dl(Op);
  SDValue CC = Op.getOperand(2);

  // Extract the LHS vectors.
  SDValue LHS = Op.getOperand(0);
  SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
  SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);

  // Extract the RHS vectors.
  SDValue RHS = Op.getOperand(1);
  SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl);
  SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl);

  // Issue the operation on the smaller types and concatenate the result back.
  MVT EltVT = VT.getVectorElementType();
  MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
}

static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG,
                                     const X86Subtarget *Subtarget) {
  SDValue Op0 = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  SDValue CC = Op.getOperand(2);
  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);

  assert(Op0.getValueType().getVectorElementType().getSizeInBits() >= 8 &&
         Op.getValueType().getScalarType() == MVT::i1 &&
         "Cannot set masked compare for this operation");

  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
  unsigned Opc = 0;
  bool Unsigned = false;
  bool Swap = false;
  unsigned SSECC;
  switch (SetCCOpcode) {
  default: llvm_unreachable("Unexpected SETCC condition");
  case ISD::SETNE:  SSECC = 4; break;
  case ISD::SETEQ:  Opc = X86ISD::PCMPEQM; break;
  case ISD::SETUGT: SSECC = 6; Unsigned = true; break;
  case ISD::SETLT:  Swap = true; // fall-through
  case ISD::SETGT:  Opc = X86ISD::PCMPGTM; break;
  case ISD::SETULT: SSECC = 1; Unsigned = true; break;
  case ISD::SETUGE: SSECC = 5; Unsigned = true; break; // NLT
  case ISD::SETGE:  Swap = true; SSECC = 2; break;     // LE + swap
  case ISD::SETULE: Unsigned = true; // fall-through
  case ISD::SETLE:  SSECC = 2; break;
  }

  if (Swap)
    std::swap(Op0, Op1);
  if (Opc)
    return DAG.getNode(Opc, dl, VT, Op0, Op1);
  Opc = Unsigned ? X86ISD::CMPMU : X86ISD::CMPM;
  return DAG.getNode(Opc, dl, VT, Op0, Op1,
                     DAG.getConstant(SSECC, MVT::i8));
}

/// \brief Try to turn a VSETULT into a VSETULE by modifying its second
/// operand \p Op1. If non-trivial (for example because it's not constant)
/// return an empty value.
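/// For example, X u< <2,2,2,2> becomes X u<= <1,1,1,1>; elements equal to
/// zero bail out because subtracting one from them would underflow.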
static SDValue ChangeVSETULTtoVSETULE(SDLoc dl, SDValue Op1, SelectionDAG &DAG)
{
  BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1.getNode());
  if (!BV)
    return SDValue();

  MVT VT = Op1.getSimpleValueType();
  MVT EVT = VT.getVectorElementType();
  unsigned n = VT.getVectorNumElements();
  SmallVector<SDValue, 8> ULTOp1;

  for (unsigned i = 0; i < n; ++i) {
    ConstantSDNode *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
    if (!Elt || Elt->isOpaque() || Elt->getValueType(0) != EVT)
      return SDValue();

    // Avoid underflow.
    APInt Val = Elt->getAPIntValue();
    if (Val == 0)
      return SDValue();

    ULTOp1.push_back(DAG.getConstant(Val - 1, EVT));
  }

  return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, ULTOp1);
}

static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
                           SelectionDAG &DAG) {
  SDValue Op0 = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  SDValue CC = Op.getOperand(2);
  MVT VT = Op.getSimpleValueType();
  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
  bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
  SDLoc dl(Op);

  if (isFP) {
#ifndef NDEBUG
    MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
    assert(EltVT == MVT::f32 || EltVT == MVT::f64);
#endif

    unsigned SSECC = translateX86FSETCC(SetCCOpcode, Op0, Op1);
    unsigned Opc = X86ISD::CMPP;
    if (Subtarget->hasAVX512() && VT.getVectorElementType() == MVT::i1) {
      assert(VT.getVectorNumElements() <= 16);
      Opc = X86ISD::CMPM;
    }
    // In the two special cases we can't handle, emit two comparisons.
    if (SSECC == 8) {
      unsigned CC0, CC1;
      unsigned CombineOpc;
      if (SetCCOpcode == ISD::SETUEQ) {
        CC0 = 3; CC1 = 0; CombineOpc = ISD::OR;
      } else {
        assert(SetCCOpcode == ISD::SETONE);
        CC0 = 7; CC1 = 4; CombineOpc = ISD::AND;
      }

      SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
                                 DAG.getConstant(CC0, MVT::i8));
      SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
                                 DAG.getConstant(CC1, MVT::i8));
      return DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
    }
    // Handle all other FP comparisons here.
    return DAG.getNode(Opc, dl, VT, Op0, Op1,
                       DAG.getConstant(SSECC, MVT::i8));
  }

  // Break 256-bit integer vector compares into smaller ones.
  if (VT.is256BitVector() && !Subtarget->hasInt256())
    return Lower256IntVSETCC(Op, DAG);

  bool MaskResult = (VT.getVectorElementType() == MVT::i1);
  EVT OpVT = Op1.getValueType();
  if (Subtarget->hasAVX512()) {
    if (Op1.getValueType().is512BitVector() ||
        (Subtarget->hasBWI() && Subtarget->hasVLX()) ||
        (MaskResult && OpVT.getVectorElementType().getSizeInBits() >= 32))
      return LowerIntVSETCC_AVX512(Op, DAG, Subtarget);

    // In the AVX-512 architecture setcc returns a mask with i1 elements,
    // but there is no compare instruction for i8 and i16 elements in KNL.
    // 512-bit operands are not a concern here; those types are illegal.
    if (MaskResult &&
        (OpVT.getVectorElementType().getSizeInBits() < 32 &&
         OpVT.getVectorElementType().getSizeInBits() >= 8))
      return DAG.getNode(ISD::TRUNCATE, dl, VT,
                         DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC));
  }

  // We are handling one of the integer comparisons here. Since SSE only has
  // GT and EQ comparisons for integers, swapping operands and multiple
  // operations may be required for some comparisons.
  unsigned Opc;
  bool Swap = false, Invert = false, FlipSigns = false, MinMax = false;
  bool Subus = false;

  switch (SetCCOpcode) {
  default: llvm_unreachable("Unexpected SETCC condition");
  case ISD::SETNE:  Invert = true;
  case ISD::SETEQ:  Opc = X86ISD::PCMPEQ; break;
  case ISD::SETLT:  Swap = true;
  case ISD::SETGT:  Opc = X86ISD::PCMPGT; break;
  case ISD::SETGE:  Swap = true;
  case ISD::SETLE:  Opc = X86ISD::PCMPGT;
                    Invert = true; break;
  case ISD::SETULT: Swap = true;
  case ISD::SETUGT: Opc = X86ISD::PCMPGT;
                    FlipSigns = true; break;
  case ISD::SETUGE: Swap = true;
  case ISD::SETULE: Opc = X86ISD::PCMPGT;
                    FlipSigns = true; Invert = true; break;
  }

  // Special case: Use min/max operations for SETULE/SETUGE.
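  // This relies on a u<= b being equivalent to umin(a, b) == a (and u>= to
  // umax(a, b) == a); the PCMPEQ against Op0 at the bottom of this function
  // performs that equality check.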
  MVT VET = VT.getVectorElementType();
  bool hasMinMax =
       (Subtarget->hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32))
    || (Subtarget->hasSSE2()  && (VET == MVT::i8));

  if (hasMinMax) {
    switch (SetCCOpcode) {
    default: break;
    case ISD::SETULE: Opc = X86ISD::UMIN; MinMax = true; break;
    case ISD::SETUGE: Opc = X86ISD::UMAX; MinMax = true; break;
    }

    if (MinMax) { Swap = false; Invert = false; FlipSigns = false; }
  }

  bool hasSubus = Subtarget->hasSSE2() && (VET == MVT::i8 || VET == MVT::i16);
  if (!MinMax && hasSubus) {
    // As another special case, use PSUBUS[BW] when it's profitable. E.g. for
    // Op0 u<= Op1:
    //   t = psubus Op0, Op1
    //   pcmpeq t, <0..0>
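    // This works because psubus saturates at zero: a lane of the subtraction
    // is zero exactly when that lane of Op0 is u<= the matching lane of Op1,
    // so the pcmpeq produces the desired mask.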
    switch (SetCCOpcode) {
    default: break;
    case ISD::SETULT: {
      // If the comparison is against a constant we can turn this into a
      // setule. With psubus, setule does not require a swap. This is
      // beneficial because the constant in the register is no longer
      // clobbered as the destination, so it can be hoisted out of a loop.
      // Only do this pre-AVX since vpcmp* is no longer destructive.
      if (Subtarget->hasAVX())
        break;
      SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG);
      if (ULEOp1.getNode()) {
        Op1 = ULEOp1;
        Subus = true; Invert = false; Swap = false;
      }
      break;
    }
    // Psubus is better than flip-sign because it requires no inversion.
    case ISD::SETUGE: Subus = true; Invert = false; Swap = true;  break;
    case ISD::SETULE: Subus = true; Invert = false; Swap = false; break;
    }

    if (Subus) {
      Opc = X86ISD::SUBUS;
      FlipSigns = false;
    }
  }

  if (Swap)
    std::swap(Op0, Op1);

  // Check that the operation in question is available (most are plain SSE2,
  // but PCMPGTQ and PCMPEQQ have different requirements).
  if (VT == MVT::v2i64) {
    if (Opc == X86ISD::PCMPGT && !Subtarget->hasSSE42()) {
      assert(Subtarget->hasSSE2() && "Don't know how to lower!");

      // First cast everything to the right type.
      Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0);
      Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1);

      // Since SSE has no unsigned integer comparisons, we need to flip the
      // sign bits of the inputs before performing those operations. The lower
      // compare is always unsigned.
      SDValue SB;
      if (FlipSigns) {
        SB = DAG.getConstant(0x80000000U, MVT::v4i32);
      } else {
        SDValue Sign = DAG.getConstant(0x80000000U, MVT::i32);
        SDValue Zero = DAG.getConstant(0x00000000U, MVT::i32);
        SB = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
                         Sign, Zero, Sign, Zero);
      }
      Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB);
      Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB);

      // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2)).
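      // After the two v4i32 compares, lanes {1,3} hold the high-half results
      // and lanes {0,2} the low-half results; the MaskHi/MaskLo shuffles
      // below broadcast each half across its 64-bit lane so the AND/OR
      // combine operates on whole 64-bit elements.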
      SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
      SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);

      // Create masks for only the low parts/high parts of the 64 bit integers.
      static const int MaskHi[] = { 1, 1, 3, 3 };
      static const int MaskLo[] = { 0, 0, 2, 2 };
      SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
      SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
      SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);

      SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
      Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);

      if (Invert)
        Result = DAG.getNOT(dl, Result, MVT::v4i32);

      return DAG.getNode(ISD::BITCAST, dl, VT, Result);
    }

    if (Opc == X86ISD::PCMPEQ && !Subtarget->hasSSE41()) {
      // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
      // pcmpeqd + pshufd + pand.
      assert(Subtarget->hasSSE2() && !FlipSigns && "Don't know how to lower!");

      // First cast everything to the right type.
      Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0);
      Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1);

      // Do the compare.
      SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);

      // Make sure the lower and upper halves are both all-ones.
      static const int Mask[] = { 1, 0, 3, 2 };
      SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
      Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);

      if (Invert)
        Result = DAG.getNOT(dl, Result, MVT::v4i32);

      return DAG.getNode(ISD::BITCAST, dl, VT, Result);
    }
  }

  // Since SSE has no unsigned integer comparisons, we need to flip the sign
  // bits of the inputs before performing those operations.
  if (FlipSigns) {
    EVT EltVT = VT.getVectorElementType();
    SDValue SB = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), VT);
    Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SB);
    Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SB);
  }

  SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);

  // If the logical-not of the result is required, perform that now.
  if (Invert)
    Result = DAG.getNOT(dl, Result, VT);

  if (MinMax)
    Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);

  if (Subus)
    Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
                         getZeroVector(VT, Subtarget, DAG, dl));

  return Result;
}

SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {

  MVT VT = Op.getSimpleValueType();

  if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);

  assert(((!Subtarget->hasAVX512() && VT == MVT::i8) || (VT == MVT::i1))
         && "SetCC type must be 8-bit or 1-bit integer");
  SDValue Op0 = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  SDLoc dl(Op);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();

  // Optimize to BT if possible.
  // Lower (X & (1 << N)) == 0 to BT(X, N).
  // Lower ((X >>u N) & 1) != 0 to BT(X, N).
  // Lower ((X >>s N) & 1) != 0 to BT(X, N).
  if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() &&
      Op1.getOpcode() == ISD::Constant &&
      cast<ConstantSDNode>(Op1)->isNullValue() &&
      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
    SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG);
    if (NewSetCC.getNode()) {
      if (VT == MVT::i1)
        return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewSetCC);
      return NewSetCC;
    }
  }

  // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms
  // of these.
  if (Op1.getOpcode() == ISD::Constant &&
      (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 ||
       cast<ConstantSDNode>(Op1)->isNullValue()) &&
      (CC == ISD::SETEQ || CC == ISD::SETNE)) {

    // If the input is a setcc, then reuse the input setcc or use a new one
    // with the inverted condition.
    if (Op0.getOpcode() == X86ISD::SETCC) {
      X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
      bool Invert = (CC == ISD::SETNE) ^
        cast<ConstantSDNode>(Op1)->isNullValue();
      if (!Invert)
        return Op0;

      CCode = X86::GetOppositeBranchCondition(CCode);
      SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
                                  DAG.getConstant(CCode, MVT::i8),
                                  Op0.getOperand(1));
      if (VT == MVT::i1)
        return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
      return SetCC;
    }
  }
  if ((Op0.getValueType() == MVT::i1) && (Op1.getOpcode() == ISD::Constant) &&
      (cast<ConstantSDNode>(Op1)->getZExtValue() == 1) &&
      (CC == ISD::SETEQ || CC == ISD::SETNE)) {

    ISD::CondCode NewCC = ISD::getSetCCInverse(CC, true);
    return DAG.getSetCC(dl, VT, Op0, DAG.getConstant(0, MVT::i1), NewCC);
  }

  bool isFP = Op1.getSimpleValueType().isFloatingPoint();
  unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG);
  if (X86CC == X86::COND_INVALID)
    return SDValue();

  SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG);
  EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
  SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
                              DAG.getConstant(X86CC, MVT::i8), EFLAGS);
  if (VT == MVT::i1)
    return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
  return SetCC;
}

// isX86LogicalCmp - Return true if opcode is a X86 logical comparison.
static bool isX86LogicalCmp(SDValue Op) {
  unsigned Opc = Op.getNode()->getOpcode();
  if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
      Opc == X86ISD::SAHF)
    return true;
  if (Op.getResNo() == 1 &&
      (Opc == X86ISD::ADD ||
       Opc == X86ISD::SUB ||
       Opc == X86ISD::ADC ||
       Opc == X86ISD::SBB ||
       Opc == X86ISD::SMUL ||
       Opc == X86ISD::UMUL ||
       Opc == X86ISD::INC ||
       Opc == X86ISD::DEC ||
       Opc == X86ISD::OR ||
       Opc == X86ISD::XOR ||
       Opc == X86ISD::AND))
    return true;

  if (Op.getResNo() == 2 && Opc == X86ISD::UMUL)
    return true;

  return false;
}

static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
  if (V.getOpcode() != ISD::TRUNCATE)
    return false;

  SDValue VOp0 = V.getOperand(0);
  unsigned InBits = VOp0.getValueSizeInBits();
  unsigned Bits = V.getValueSizeInBits();
  return DAG.MaskedValueIsZero(VOp0,
                               APInt::getHighBitsSet(InBits, InBits - Bits));
}

SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
  bool addTest = true;
  SDValue Cond  = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  SDValue Op2 = Op.getOperand(2);
  SDLoc DL(Op);
  EVT VT = Op1.getValueType();
  SDValue CC;

  // Lower fp selects into a CMP/AND/ANDN/OR sequence when the necessary SSE
  // ops are available. Otherwise fp cmovs get lowered into a less efficient
  // branch sequence later on.
  if (Cond.getOpcode() == ISD::SETCC &&
      ((Subtarget->hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) ||
       (Subtarget->hasSSE1() && VT == MVT::f32)) &&
      VT == Cond.getOperand(0).getValueType() && Cond->hasOneUse()) {
    SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
    int SSECC = translateX86FSETCC(
        cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);

    if (SSECC != 8) {
      if (Subtarget->hasAVX512()) {
        SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CondOp0,
                                  CondOp1, DAG.getConstant(SSECC, MVT::i8));
        return DAG.getNode(X86ISD::SELECT, DL, VT, Cmp, Op1, Op2);
      }
      SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
                                DAG.getConstant(SSECC, MVT::i8));
      SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
      SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
      return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
    }
  }

  if (Cond.getOpcode() == ISD::SETCC) {
    SDValue NewCond = LowerSETCC(Cond, DAG);
    if (NewCond.getNode())
      Cond = NewCond;
  }

  // (select (x == 0), -1,  y) -> (sign_bit (x - 1)) | y
  // (select (x == 0),  y, -1) -> ~(sign_bit (x - 1)) | y
  // (select (x != 0),  y, -1) -> (sign_bit (x - 1)) | y
  // (select (x != 0), -1,  y) -> ~(sign_bit (x - 1)) | y
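  // This works because CMP x, 1 sets the carry flag exactly when x == 0
  // (unsigned x < 1), and SETCC_CARRY (sbb reg, reg) then materializes
  // either all-ones or zero, which is OR'd with y below.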
  if (Cond.getOpcode() == X86ISD::SETCC &&
      Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
      isZero(Cond.getOperand(1).getOperand(1))) {
    SDValue Cmp = Cond.getOperand(1);

    unsigned CondCode =
        cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();

    if ((isAllOnes(Op1) || isAllOnes(Op2)) &&
        (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
      SDValue Y = isAllOnes(Op2) ? Op1 : Op2;

      SDValue CmpOp0 = Cmp.getOperand(0);
      // Apply further optimizations for special cases:
      // (select (x != 0), -1, 0) -> neg & sbb
      // (select (x == 0), 0, -1) -> neg & sbb
      if (ConstantSDNode *YC = dyn_cast<ConstantSDNode>(Y))
        if (YC->isNullValue() &&
            (isAllOnes(Op1) == (CondCode == X86::COND_NE))) {
          SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
          SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs,
                                    DAG.getConstant(0, CmpOp0.getValueType()),
                                    CmpOp0);
          SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
                                    DAG.getConstant(X86::COND_B, MVT::i8),
                                    SDValue(Neg.getNode(), 1));
          return Res;
        }

      Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
                        CmpOp0, DAG.getConstant(1, CmpOp0.getValueType()));
      Cmp = ConvertCmpIfNecessary(Cmp, DAG);

      SDValue Res =   // Res = 0 or -1.
        DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
                    DAG.getConstant(X86::COND_B, MVT::i8), Cmp);

      if (isAllOnes(Op1) != (CondCode == X86::COND_E))
        Res = DAG.getNOT(DL, Res, Res.getValueType());

      ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2);
      if (!N2C || !N2C->isNullValue())
        Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
      return Res;
    }
  }

  // Look past (and (setcc_carry (cmp ...)), 1).
  if (Cond.getOpcode() == ISD::AND &&
      Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
    if (C && C->getAPIntValue() == 1)
      Cond = Cond.getOperand(0);
  }

  // If condition flag is set by a X86ISD::CMP, then use it as the condition
  // setting operand in place of the X86ISD::SETCC.
  unsigned CondOpcode = Cond.getOpcode();
  if (CondOpcode == X86ISD::SETCC ||
      CondOpcode == X86ISD::SETCC_CARRY) {
    CC = Cond.getOperand(0);

    SDValue Cmp = Cond.getOperand(1);
    unsigned Opc = Cmp.getOpcode();
    MVT VT = Op.getSimpleValueType();

    bool IllegalFPCMov = false;
    if (VT.isFloatingPoint() && !VT.isVector() &&
        !isScalarFPTypeInSSEReg(VT))  // FPStack?
      IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());

    if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
        Opc == X86ISD::BT) { // FIXME
      Cond = Cmp;
      addTest = false;
    }
  } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
             CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
             ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
              Cond.getOperand(0).getValueType() != MVT::i8)) {
    SDValue LHS = Cond.getOperand(0);
    SDValue RHS = Cond.getOperand(1);
    unsigned X86Opcode;
    unsigned X86Cond;
    SDVTList VTs;
    switch (CondOpcode) {
    case ISD::UADDO: X86Opcode = X86ISD::ADD;  X86Cond = X86::COND_B; break;
    case ISD::SADDO: X86Opcode = X86ISD::ADD;  X86Cond = X86::COND_O; break;
    case ISD::USUBO: X86Opcode = X86ISD::SUB;  X86Cond = X86::COND_B; break;
    case ISD::SSUBO: X86Opcode = X86ISD::SUB;  X86Cond = X86::COND_O; break;
    case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
    case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
    default: llvm_unreachable("unexpected overflowing operator");
    }
    if (CondOpcode == ISD::UMULO)
      VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
                          MVT::i32);
    else
      VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);

    SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS);

    if (CondOpcode == ISD::UMULO)
      Cond = X86Op.getValue(2);
    else
      Cond = X86Op.getValue(1);

    CC = DAG.getConstant(X86Cond, MVT::i8);
    addTest = false;
  }

  if (addTest) {
    // Look past the truncate if the high bits are known zero.
    if (isTruncWithZeroHighBitsInput(Cond, DAG))
      Cond = Cond.getOperand(0);

    // We know the result of AND is compared against zero. Try to match
    // it to BT.
    if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
      SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG);
      if (NewSetCC.getNode()) {
        CC = NewSetCC.getOperand(0);
        Cond = NewSetCC.getOperand(1);
        addTest = false;
      }
    }
  }

  if (addTest) {
    CC = DAG.getConstant(X86::COND_NE, MVT::i8);
    Cond = EmitTest(Cond, X86::COND_NE, DL, DAG);
  }

  // a <  b ? -1 :  0 -> RES = ~setcc_carry
  // a <  b ?  0 : -1 -> RES = setcc_carry
  // a >= b ? -1 :  0 -> RES = setcc_carry
  // a >= b ?  0 : -1 -> RES = ~setcc_carry
  if (Cond.getOpcode() == X86ISD::SUB) {
    Cond = ConvertCmpIfNecessary(Cond, DAG);
    unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();

    if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
        (isAllOnes(Op1) || isAllOnes(Op2)) && (isZero(Op1) || isZero(Op2))) {
      SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
                                DAG.getConstant(X86::COND_B, MVT::i8), Cond);
      if (isAllOnes(Op1) != (CondCode == X86::COND_B))
        return DAG.getNOT(DL, Res, Res.getValueType());
      return Res;
    }
  }

  // X86 doesn't have an i8 cmov. If both operands are the result of a
  // truncate, widen the cmov and push the truncate through. This avoids
  // introducing a new branch during isel and doesn't add any extensions.
  if (Op.getValueType() == MVT::i8 &&
      Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
    SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
    if (T1.getValueType() == T2.getValueType() &&
        // Blacklist CopyFromReg to avoid partial register stalls.
        T1.getOpcode() != ISD::CopyFromReg &&
        T2.getOpcode() != ISD::CopyFromReg) {
      SDVTList VTs = DAG.getVTList(T1.getValueType(), MVT::Glue);
      SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VTs, T2, T1, CC, Cond);
      return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
    }
  }

  // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
  // condition is true.
  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
  SDValue Ops[] = { Op2, Op1, CC, Cond };
  return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops);
}

static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op,
                                       const X86Subtarget *Subtarget,
                                       SelectionDAG &DAG) {
  MVT VT = Op->getSimpleValueType(0);
  SDValue In = Op->getOperand(0);
  MVT InVT = In.getSimpleValueType();
  MVT VTElt = VT.getVectorElementType();
  MVT InVTElt = InVT.getVectorElementType();
  SDLoc dl(Op);

  // SKX processor
  if ((InVTElt == MVT::i1) &&
      (((Subtarget->hasBWI() && Subtarget->hasVLX() &&
        VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() <= 16)) ||

       ((Subtarget->hasBWI() && VT.is512BitVector() &&
        VTElt.getSizeInBits() <= 16)) ||

       ((Subtarget->hasDQI() && Subtarget->hasVLX() &&
        VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() >= 32)) ||

       ((Subtarget->hasDQI() && VT.is512BitVector() &&
        VTElt.getSizeInBits() >= 32))))
    return DAG.getNode(X86ISD::VSEXT, dl, VT, In);

  unsigned int NumElts = VT.getVectorNumElements();

  if (NumElts != 8 && NumElts != 16)
    return SDValue();

  if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1) {
    if (In.getOpcode() == X86ISD::VSEXT || In.getOpcode() == X86ISD::VZEXT)
      return DAG.getNode(In.getOpcode(), dl, VT, In.getOperand(0));
    return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
  }

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected vector type");

  MVT ExtVT = (NumElts == 8) ? MVT::v8i64 : MVT::v16i32;
  Constant *C = ConstantInt::get(*DAG.getContext(),
      APInt::getAllOnesValue(ExtVT.getScalarType().getSizeInBits()));

  SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
  unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
  SDValue Ld = DAG.getLoad(ExtVT.getScalarType(), dl, DAG.getEntryNode(), CP,
                           MachinePointerInfo::getConstantPool(),
                           false, false, false, Alignment);
  SDValue Brcst = DAG.getNode(X86ISD::VBROADCASTM, dl, ExtVT, In, Ld);
  if (VT.is512BitVector())
    return Brcst;
  return DAG.getNode(X86ISD::VTRUNC, dl, VT, Brcst);
}

static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
                                SelectionDAG &DAG) {
  MVT VT = Op->getSimpleValueType(0);
  SDValue In = Op->getOperand(0);
  MVT InVT = In.getSimpleValueType();
  SDLoc dl(Op);

  if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
    return LowerSIGN_EXTEND_AVX512(Op, Subtarget, DAG);

  if ((VT != MVT::v4i64 || InVT != MVT::v4i32) &&
      (VT != MVT::v8i32 || InVT != MVT::v8i16) &&
      (VT != MVT::v16i16 || InVT != MVT::v16i8))
    return SDValue();

  if (Subtarget->hasInt256())
    return DAG.getNode(X86ISD::VSEXT, dl, VT, In);

  // Optimize vectors in AVX mode:
  // sign extend v8i16 to v8i32 and v4i32 to v4i64.
  //
  // Divide the input vector into two parts;
  // for v4i32 the shuffle masks will be { 0, 1, -1, -1 } and { 2, 3, -1, -1 }.
  // Use the vpmovsx instruction to extend v4i32 -> v2i64 and v8i16 -> v4i32,
  // then concat the vectors back to the original VT.
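  // For example, for v8i16 -> v8i32: OpLo sign-extends elements 0..3 and
  // OpHi elements 4..7, and the CONCAT_VECTORS below stitches the two v4i32
  // halves back into a v8i32.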
  unsigned NumElems = InVT.getVectorNumElements();
  SDValue Undef = DAG.getUNDEF(InVT);

  SmallVector<int,8> ShufMask1(NumElems, -1);
  for (unsigned i = 0; i != NumElems/2; ++i)
    ShufMask1[i] = i;

  SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask1[0]);

  SmallVector<int,8> ShufMask2(NumElems, -1);
  for (unsigned i = 0; i != NumElems/2; ++i)
    ShufMask2[i] = i + NumElems/2;

  SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask2[0]);

  MVT HalfVT = MVT::getVectorVT(VT.getScalarType(),
                                VT.getVectorNumElements()/2);

  OpLo = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpLo);
  OpHi = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpHi);

  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
}

// Lower vector extended loads using a shuffle. If SSSE3 is not available we
// may emit an illegal shuffle but the expansion is still better than scalar
// code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise
// we'll emit a shuffle and an arithmetic shift.
// FIXME: Is the expansion actually better than scalar code? It doesn't seem
// so.
// TODO: It is possible to support ZExt by zeroing the undef values during
// the shuffle phase or after the shuffle.
static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget,
                                 SelectionDAG &DAG) {
  MVT RegVT = Op.getSimpleValueType();
  assert(RegVT.isVector() && "We only custom lower vector sext loads.");
  assert(RegVT.isInteger() &&
         "We only custom lower integer vector sext loads.");

  // Nothing useful we can do without SSE2 shuffles.
  assert(Subtarget->hasSSE2() && "We only custom lower sext loads with SSE2.");

  LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
  SDLoc dl(Ld);
  EVT MemVT = Ld->getMemoryVT();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  unsigned RegSz = RegVT.getSizeInBits();

  ISD::LoadExtType Ext = Ld->getExtensionType();

  assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)
         && "Only anyext and sext are currently implemented.");
  assert(MemVT != RegVT && "Cannot extend to the same type");
  assert(MemVT.isVector() && "Must load a vector from memory");

  unsigned NumElems = RegVT.getVectorNumElements();
  unsigned MemSz = MemVT.getSizeInBits();
  assert(RegSz > MemSz && "Register size must be greater than the mem size");

  if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget->hasInt256()) {
    // The only way in which we have a legal 256-bit vector result but not the
    // integer 256-bit operations needed to directly lower a sextload is if we
    // have AVX1 but not AVX2. In that case, we can always emit a sextload to
    // a 128-bit vector and a normal sign_extend to 256-bits that should get
    // correctly legalized. We do this late to allow the canonical form of
    // sextload to persist throughout the rest of the DAG combiner -- it wants
    // to fold together any extensions it can, and so will fuse a sign_extend
    // of an sextload into a sextload targeting a wider value.
    SDValue Load;
    if (MemSz == 128) {
      // Just switch this to a normal load.
      assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, "
                                       "it must be a legal 128-bit vector "
                                       "type!");
      Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(),
                         Ld->getPointerInfo(), Ld->isVolatile(),
                         Ld->isNonTemporal(), Ld->isInvariant(),
                         Ld->getAlignment());
    } else {
      assert(MemSz < 128 &&
             "Can't extend a type wider than 128 bits to a 256 bit vector!");
      // Do an sext load to a 128-bit vector type. We want to use the same
      // number of elements, but elements half as wide. This will end up being
      // recursively lowered by this routine, but will succeed as we definitely
      // have all the necessary features if we're using AVX1.
      EVT HalfEltVT =
          EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2);
      EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems);
      Load =
          DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(),
                         Ld->getPointerInfo(), MemVT, Ld->isVolatile(),
                         Ld->isNonTemporal(), Ld->isInvariant(),
                         Ld->getAlignment());
    }

    // Replace chain users with the new chain.
    assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
    DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));

    // Finally, do a normal sign-extend to the desired register.
    return DAG.getSExtOrTrunc(Load, dl, RegVT);
  }

  // All sizes must be a power of two.
  assert(isPowerOf2_32(RegSz * MemSz * NumElems) &&
         "Non-power-of-two elements are not custom lowered!");

  // Attempt to load the original value using scalar loads.
  // Find the largest scalar type that divides the total loaded size.
  MVT SclrLoadTy = MVT::i8;
  for (MVT Tp : MVT::integer_valuetypes()) {
    if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
      SclrLoadTy = Tp;
    }
  }

  // On 32-bit systems, we can't save 64-bit integers. Try bitcasting to F64.
  if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
      (64 <= MemSz))
    SclrLoadTy = MVT::f64;

  // Calculate the number of scalar loads that we need to perform
  // in order to load our vector from memory.
  unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();

  assert((Ext != ISD::SEXTLOAD || NumLoads == 1) &&
         "Can only lower sext loads with a single scalar load!");

  unsigned LoadRegSize = RegSz;
  if (Ext == ISD::SEXTLOAD && RegSz == 256)
    LoadRegSize = 128;

  // Represent our vector as a sequence of elements which are the
  // largest scalar that we can load.
  EVT LoadUnitVecVT = EVT::getVectorVT(
      *DAG.getContext(), SclrLoadTy, LoadRegSize / SclrLoadTy.getSizeInBits());

  // Represent the data using the same element type that is stored in
  // memory. In practice, we "widen" MemVT.
  EVT WideVecVT =
      EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
                       LoadRegSize / MemVT.getScalarType().getSizeInBits());

  assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
         "Invalid vector type");

  // We can't shuffle using an illegal type.
  assert(TLI.isTypeLegal(WideVecVT) &&
         "We only lower types that form legal widened vector types");

  SmallVector<SDValue, 8> Chains;
  SDValue Ptr = Ld->getBasePtr();
  SDValue Increment =
      DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, TLI.getPointerTy());
  SDValue Res = DAG.getUNDEF(LoadUnitVecVT);

  for (unsigned i = 0; i < NumLoads; ++i) {
    // Perform a single load.
    SDValue ScalarLoad =
        DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
                    Ld->isVolatile(), Ld->isNonTemporal(), Ld->isInvariant(),
                    Ld->getAlignment());
    Chains.push_back(ScalarLoad.getValue(1));
    // Create the first element type using SCALAR_TO_VECTOR in order to avoid
    // another round of DAGCombining.
    if (i == 0)
      Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
    else
      Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
                        ScalarLoad, DAG.getIntPtrConstant(i));

    Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
  }

  SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);

  // Bitcast the loaded value to a vector of the original element type, in
  // the size of the target vector type.
  SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Res);
  unsigned SizeRatio = RegSz / MemSz;

  if (Ext == ISD::SEXTLOAD) {
    // If we have SSE4.1, we can directly emit a VSEXT node.
    if (Subtarget->hasSSE41()) {
      SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec);
      DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
      return Sext;
    }

    // Otherwise we'll shuffle the small elements in the high bits of the
    // larger type and perform an arithmetic shift. If the shift is not legal
    // it's better to scalarize.
    assert(TLI.isOperationLegalOrCustom(ISD::SRA, RegVT) &&
           "We can't implement a sext load without an arithmetic right shift!");

    // Redistribute the loaded elements into the different locations.
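    // For example, when extending v4i8 to v4i32 (SizeRatio == 4), byte i is
    // placed in lane 4*i+3, the most significant byte of 32-bit element i,
    // so the arithmetic shift below completes the sign extension.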
    SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
    for (unsigned i = 0; i != NumElems; ++i)
      ShuffleVec[i * SizeRatio + SizeRatio - 1] = i;

    SDValue Shuff = DAG.getVectorShuffle(
        WideVecVT, dl, SlicedVec, DAG.getUNDEF(WideVecVT), &ShuffleVec[0]);

    Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);

    // Build the arithmetic shift.
    unsigned Amt = RegVT.getVectorElementType().getSizeInBits() -
                   MemVT.getVectorElementType().getSizeInBits();
    Shuff =
        DAG.getNode(ISD::SRA, dl, RegVT, Shuff, DAG.getConstant(Amt, RegVT));

    DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
    return Shuff;
  }

  // Redistribute the loaded elements into the different locations.
  SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
  for (unsigned i = 0; i != NumElems; ++i)
    ShuffleVec[i * SizeRatio] = i;

  SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
                                       DAG.getUNDEF(WideVecVT), &ShuffleVec[0]);

  // Bitcast to the requested type.
  Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
  DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
  return Shuff;
}

// isAndOrOfSingleUseSetCCs - Return true if node is an ISD::AND or
// ISD::OR of two X86ISD::SETCC nodes each of which has no other use apart
// from the AND / OR.
static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
  Opc = Op.getOpcode();
  if (Opc != ISD::OR && Opc != ISD::AND)
    return false;
  return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
          Op.getOperand(0).hasOneUse() &&
          Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
          Op.getOperand(1).hasOneUse());
}

// isXor1OfSetCC - Return true if node is an ISD::XOR of a X86ISD::SETCC and
// 1, and that the SETCC node has a single use.
static bool isXor1OfSetCC(SDValue Op) {
  if (Op.getOpcode() != ISD::XOR)
    return false;
  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
  if (N1C && N1C->getAPIntValue() == 1) {
    return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
           Op.getOperand(0).hasOneUse();
  }
  return false;
}

SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
  bool addTest = true;
  SDValue Chain = Op.getOperand(0);
  SDValue Cond  = Op.getOperand(1);
  SDValue Dest  = Op.getOperand(2);
  SDLoc dl(Op);
  SDValue CC;
  bool Inverted = false;

  if (Cond.getOpcode() == ISD::SETCC) {
    // Check for setcc([su]{add,sub,mul}o == 0).
    if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
        isa<ConstantSDNode>(Cond.getOperand(1)) &&
        cast<ConstantSDNode>(Cond.getOperand(1))->isNullValue() &&
        Cond.getOperand(0).getResNo() == 1 &&
        (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
         Cond.getOperand(0).getOpcode() == ISD::UADDO ||
         Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
         Cond.getOperand(0).getOpcode() == ISD::USUBO ||
         Cond.getOperand(0).getOpcode() == ISD::SMULO ||
         Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
      Inverted = true;
      Cond = Cond.getOperand(0);
    } else {
      SDValue NewCond = LowerSETCC(Cond, DAG);
      if (NewCond.getNode())
        Cond = NewCond;
    }
  }
#if 0
  // FIXME: LowerXALUO doesn't handle these!!
  else if (Cond.getOpcode() == X86ISD::ADD ||
           Cond.getOpcode() == X86ISD::SUB ||
           Cond.getOpcode() == X86ISD::SMUL ||
           Cond.getOpcode() == X86ISD::UMUL)
    Cond = LowerXALUO(Cond, DAG);
#endif

  // Look past (and (setcc_carry (cmp ...)), 1).
  if (Cond.getOpcode() == ISD::AND &&
      Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
    if (C && C->getAPIntValue() == 1)
      Cond = Cond.getOperand(0);
  }

  // If condition flag is set by a X86ISD::CMP, then use it as the condition
  // setting operand in place of the X86ISD::SETCC.
  unsigned CondOpcode = Cond.getOpcode();
  if (CondOpcode == X86ISD::SETCC ||
      CondOpcode == X86ISD::SETCC_CARRY) {
    CC = Cond.getOperand(0);

    SDValue Cmp = Cond.getOperand(1);
    unsigned Opc = Cmp.getOpcode();
    // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
    if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
      Cond = Cmp;
      addTest = false;
    } else {
      switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
      default: break;
      case X86::COND_O:
      case X86::COND_B:
        // These can only come from an arithmetic instruction with overflow,
        // e.g. SADDO, UADDO.
        Cond = Cond.getNode()->getOperand(1);
        addTest = false;
        break;
      }
    }
  }
  CondOpcode = Cond.getOpcode();
  if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
      CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
      ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
       Cond.getOperand(0).getValueType() != MVT::i8)) {
    SDValue LHS = Cond.getOperand(0);
    SDValue RHS = Cond.getOperand(1);
    unsigned X86Opcode;
    unsigned X86Cond;
    SDVTList VTs;
    // Keep this in sync with LowerXALUO, otherwise we might create redundant
    // instructions that can't be removed afterwards (i.e. X86ISD::ADD and
    // X86ISD::INC).
    switch (CondOpcode) {
    case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
    case ISD::SADDO:
      if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
        if (C->isOne()) {
          X86Opcode = X86ISD::INC; X86Cond = X86::COND_O;
          break;
        }
      X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
    case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
    case ISD::SSUBO:
      if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
        if (C->isOne()) {
          X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O;
          break;
        }
      X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
    case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
    case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
    default: llvm_unreachable("unexpected overflowing operator");
    }
    if (Inverted)
      X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond);
    if (CondOpcode == ISD::UMULO)
      VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
                          MVT::i32);
    else
      VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);

    SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS);

    if (CondOpcode == ISD::UMULO)
      Cond = X86Op.getValue(2);
    else
      Cond = X86Op.getValue(1);

    CC = DAG.getConstant(X86Cond, MVT::i8);
    addTest = false;
  } else {
    unsigned CondOpc;
16966 if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
16967 SDValue Cmp = Cond.getOperand(0).getOperand(1);
16968 if (CondOpc == ISD::OR) {
16969 // Also, recognize the pattern generated by an FCMP_UNE. We can emit
16970 // two branches instead of an explicit OR instruction with a
16972 if (Cmp == Cond.getOperand(1).getOperand(1) &&
16973 isX86LogicalCmp(Cmp)) {
16974 CC = Cond.getOperand(0).getOperand(0);
16975 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16976 Chain, Dest, CC, Cmp);
16977 CC = Cond.getOperand(1).getOperand(0);
16981 } else { // ISD::AND
16982 // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
16983 // two branches instead of an explicit AND instruction with a
16984 // separate test. However, we only do this if this block doesn't
16985 // have a fall-through edge, because this requires an explicit
16986 // jmp when the condition is false.
16987 if (Cmp == Cond.getOperand(1).getOperand(1) &&
16988 isX86LogicalCmp(Cmp) &&
16989 Op.getNode()->hasOneUse()) {
16990 X86::CondCode CCode =
16991 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
16992 CCode = X86::GetOppositeBranchCondition(CCode);
16993 CC = DAG.getConstant(CCode, MVT::i8);
16994 SDNode *User = *Op.getNode()->use_begin();
16995 // Look for an unconditional branch following this conditional branch.
16996 // We need this because we need to reverse the successors in order
16997 // to implement FCMP_OEQ.
16998 if (User->getOpcode() == ISD::BR) {
16999 SDValue FalseBB = User->getOperand(1);
17001 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
17002 assert(NewBR == User);
17006 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
17007 Chain, Dest, CC, Cmp);
17008 X86::CondCode CCode =
17009 (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
17010 CCode = X86::GetOppositeBranchCondition(CCode);
17011 CC = DAG.getConstant(CCode, MVT::i8);
17017 } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
17018 // Recognize for xorb (setcc), 1 patterns. The xor inverts the condition.
17019 // It should be transformed during dag combiner except when the condition
17020 // is set by a arithmetics with overflow node.
17021 X86::CondCode CCode =
17022 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
17023 CCode = X86::GetOppositeBranchCondition(CCode);
17024 CC = DAG.getConstant(CCode, MVT::i8);
17025 Cond = Cond.getOperand(0).getOperand(1);
17027 } else if (Cond.getOpcode() == ISD::SETCC &&
17028 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
17029 // For FCMP_OEQ, we can emit
17030 // two branches instead of an explicit AND instruction with a
17031 // separate test. However, we only do this if this block doesn't
17032 // have a fall-through edge, because this requires an explicit
17033 // jmp when the condition is false.
17034 if (Op.getNode()->hasOneUse()) {
17035 SDNode *User = *Op.getNode()->use_begin();
17036 // Look for an unconditional branch following this conditional branch.
17037 // We need this because we need to reverse the successors in order
17038 // to implement FCMP_OEQ.
17039 if (User->getOpcode() == ISD::BR) {
17040 SDValue FalseBB = User->getOperand(1);
17042 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
17043 assert(NewBR == User);
17047 SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
17048 Cond.getOperand(0), Cond.getOperand(1));
17049 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
17050 CC = DAG.getConstant(X86::COND_NE, MVT::i8);
17051 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
17052 Chain, Dest, CC, Cmp);
17053 CC = DAG.getConstant(X86::COND_P, MVT::i8);
17058 } else if (Cond.getOpcode() == ISD::SETCC &&
17059 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
17060 // For FCMP_UNE, we can emit
17061 // two branches instead of an explicit AND instruction with a
17062 // separate test. However, we only do this if this block doesn't
17063 // have a fall-through edge, because this requires an explicit
17064 // jmp when the condition is false.
17065 if (Op.getNode()->hasOneUse()) {
17066 SDNode *User = *Op.getNode()->use_begin();
17067 // Look for an unconditional branch following this conditional branch.
17068 // We need this because we need to reverse the successors in order
17069 // to implement FCMP_UNE.
17070 if (User->getOpcode() == ISD::BR) {
17071 SDValue FalseBB = User->getOperand(1);
SDNode *NewBR =
    DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
assert(NewBR == User);
(void)NewBR;
Dest = FalseBB;
17077 SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
17078 Cond.getOperand(0), Cond.getOperand(1));
17079 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
17080 CC = DAG.getConstant(X86::COND_NE, MVT::i8);
17081 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
17082 Chain, Dest, CC, Cmp);
17083 CC = DAG.getConstant(X86::COND_NP, MVT::i8);
// Look past the truncate if the high bits are known zero.
17094 if (isTruncWithZeroHighBitsInput(Cond, DAG))
17095 Cond = Cond.getOperand(0);
// We know the result of AND is compared against zero. Try to match
// it to BT.
17099 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
17100 SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG);
17101 if (NewSetCC.getNode()) {
17102 CC = NewSetCC.getOperand(0);
17103 Cond = NewSetCC.getOperand(1);
17110 X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
17111 CC = DAG.getConstant(X86Cond, MVT::i8);
17112 Cond = EmitTest(Cond, X86Cond, dl, DAG);
17114 Cond = ConvertCmpIfNecessary(Cond, DAG);
17115 return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
17116 Chain, Dest, CC, Cond);
17119 // Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
17120 // Calls to _alloca are needed to probe the stack when allocating more than 4k
17121 // bytes in one go. Touching the stack at 4K increments is necessary to ensure
17122 // that the guard pages used by the OS virtual memory manager are allocated in
17123 // correct sequence.
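// For example, a single 16K allocation could move the stack pointer past the
// guard page without ever touching it, so the OS would never commit the next
// stack region; probing each 4K page in turn keeps the guard page moving
// ahead of the allocation.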
17125 X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
17126 SelectionDAG &DAG) const {
17127 MachineFunction &MF = DAG.getMachineFunction();
17128 bool SplitStack = MF.shouldSplitStack();
bool Lower = (Subtarget->isOSWindows() && !Subtarget->isTargetMachO()) ||
             SplitStack;
SDLoc dl(Op);

if (!Lower) {
17134 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
17135 SDNode* Node = Op.getNode();
17137 unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
17138 assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
17139 " not tell us which reg is the stack pointer!");
17140 EVT VT = Node->getValueType(0);
17141 SDValue Tmp1 = SDValue(Node, 0);
17142 SDValue Tmp2 = SDValue(Node, 1);
17143 SDValue Tmp3 = Node->getOperand(2);
17144 SDValue Chain = Tmp1.getOperand(0);
17146 // Chain the dynamic stack allocation so that it doesn't modify the stack
17147 // pointer when other instructions are using the stack.
Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, true),
                             SDLoc(Node));
17151 SDValue Size = Tmp2.getOperand(1);
17152 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
17153 Chain = SP.getValue(1);
17154 unsigned Align = cast<ConstantSDNode>(Tmp3)->getZExtValue();
17155 const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
17156 unsigned StackAlign = TFI.getStackAlignment();
17157 Tmp1 = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
17158 if (Align > StackAlign)
17159 Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1,
17160 DAG.getConstant(-(uint64_t)Align, VT));
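// e.g. with Align == 32, ANDing with -32 (binary ...1110'0000) clears the
// low five bits and rounds the new stack pointer down to a 32-byte boundary.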
17161 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain
17163 Tmp2 = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, true),
DAG.getIntPtrConstant(0, true), SDValue(),
SDLoc(Node));
17167 SDValue Ops[2] = { Tmp1, Tmp2 };
return DAG.getMergeValues(Ops, dl);
}

// Get the inputs.
17172 SDValue Chain = Op.getOperand(0);
17173 SDValue Size = Op.getOperand(1);
17174 unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
17175 EVT VT = Op.getNode()->getValueType(0);
17177 bool Is64Bit = Subtarget->is64Bit();
EVT SPTy = getPointerTy();

if (SplitStack) {
17181 MachineRegisterInfo &MRI = MF.getRegInfo();
if (Is64Bit) {
  // The 64-bit implementation of segmented stacks needs to clobber both r10
  // and r11. This makes it impossible to use it along with nested parameters.
17186 const Function *F = MF.getFunction();
for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
     I != E; ++I)
17190 if (I->hasNestAttr())
17191 report_fatal_error("Cannot use segmented stacks with functions that "
17192 "have nested arguments.");
17195 const TargetRegisterClass *AddrRegClass =
17196 getRegClassFor(getPointerTy());
17197 unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
17198 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
17199 SDValue Value = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
17200 DAG.getRegister(Vreg, SPTy));
17201 SDValue Ops1[2] = { Value, Chain };
17202 return DAG.getMergeValues(Ops1, dl);
}

SDValue Flag;
const unsigned Reg = (Subtarget->isTarget64BitLP64() ? X86::RAX : X86::EAX);
17207 Chain = DAG.getCopyToReg(Chain, dl, Reg, Size, Flag);
17208 Flag = Chain.getValue(1);
17209 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
17211 Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag);
17213 const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
17214 unsigned SPReg = RegInfo->getStackRegister();
17215 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
17216 Chain = SP.getValue(1);
if (Align) {
  SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
                   DAG.getConstant(-(uint64_t)Align, VT));
  Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
}
17224 SDValue Ops1[2] = { SP, Chain };
17225 return DAG.getMergeValues(Ops1, dl);
17229 SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
17230 MachineFunction &MF = DAG.getMachineFunction();
17231 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
SDLoc DL(Op);
17236 if (!Subtarget->is64Bit() || Subtarget->isTargetWin64()) {
17237 // vastart just stores the address of the VarArgsFrameIndex slot into the
17238 // memory location argument.
SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
                               getPointerTy());
17241 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
MachinePointerInfo(SV), false, false, 0);
}
// __va_list_tag:
//   gp_offset         (0 - 6 * 8)
//   fp_offset         (48 - 48 + 8 * 16)
//   overflow_arg_area (point to parameters coming in memory).
//   reg_save_area
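// For reference, the SysV AMD64 ABI declares this layout in C as:
//   typedef struct {
//     unsigned int gp_offset;
//     unsigned int fp_offset;
//     void *overflow_arg_area;
//     void *reg_save_area;
//   } __va_list_tag;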
17250 SmallVector<SDValue, 8> MemOps;
17251 SDValue FIN = Op.getOperand(1);
// Store gp_offset
SDValue Store = DAG.getStore(Op.getOperand(0), DL,
                             DAG.getConstant(FuncInfo->getVarArgsGPOffset(),
                                             MVT::i32),
                             FIN, MachinePointerInfo(SV), false, false, 0);
17257 MemOps.push_back(Store);
// Store fp_offset
FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
17261 FIN, DAG.getIntPtrConstant(4));
Store = DAG.getStore(Op.getOperand(0), DL,
                     DAG.getConstant(FuncInfo->getVarArgsFPOffset(),
                                     MVT::i32),
                     FIN, MachinePointerInfo(SV, 4), false, false, 0);
17266 MemOps.push_back(Store);
17268 // Store ptr to overflow_arg_area
17269 FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
17270 FIN, DAG.getIntPtrConstant(4));
SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
                                  getPointerTy());
Store = DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN,
                     MachinePointerInfo(SV, 8),
                     false, false, 0);
17276 MemOps.push_back(Store);
17278 // Store ptr to reg_save_area.
17279 FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
17280 FIN, DAG.getIntPtrConstant(8));
SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
                                  getPointerTy());
17283 Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN,
17284 MachinePointerInfo(SV, 16), false, false, 0);
17285 MemOps.push_back(Store);
return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
}
17289 SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
17290 assert(Subtarget->is64Bit() &&
17291 "LowerVAARG only handles 64-bit va_arg!");
17292 assert((Subtarget->isTargetLinux() ||
17293 Subtarget->isTargetDarwin()) &&
17294 "Unhandled target in LowerVAARG");
17295 assert(Op.getNode()->getNumOperands() == 4);
17296 SDValue Chain = Op.getOperand(0);
17297 SDValue SrcPtr = Op.getOperand(1);
17298 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
unsigned Align = Op.getConstantOperandVal(3);
SDLoc dl(Op);
17302 EVT ArgVT = Op.getNode()->getValueType(0);
17303 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
uint32_t ArgSize = getDataLayout()->getTypeAllocSize(ArgTy);
uint8_t ArgMode;
17307 // Decide which area this value should be read from.
17308 // TODO: Implement the AMD64 ABI in its entirety. This simple
17309 // selection mechanism works only for the basic types.
17310 if (ArgVT == MVT::f80) {
17311 llvm_unreachable("va_arg for f80 not yet implemented");
17312 } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
17313 ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
17314 } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
17315 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
} else {
  llvm_unreachable("Unhandled argument type in LowerVAARG");
}
17320 if (ArgMode == 2) {
17321 // Sanity Check: Make sure using fp_offset makes sense.
17322 assert(!DAG.getTarget().Options.UseSoftFloat &&
17323 !(DAG.getMachineFunction().getFunction()->hasFnAttribute(
17324 Attribute::NoImplicitFloat)) &&
Subtarget->hasSSE1());
}
17328 // Insert VAARG_64 node into the DAG
17329 // VAARG_64 returns two values: Variable Argument Address, Chain
17330 SmallVector<SDValue, 11> InstOps;
17331 InstOps.push_back(Chain);
17332 InstOps.push_back(SrcPtr);
17333 InstOps.push_back(DAG.getConstant(ArgSize, MVT::i32));
17334 InstOps.push_back(DAG.getConstant(ArgMode, MVT::i8));
17335 InstOps.push_back(DAG.getConstant(Align, MVT::i32));
17336 SDVTList VTs = DAG.getVTList(getPointerTy(), MVT::Other);
17337 SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl,
17338 VTs, InstOps, MVT::i64,
MachinePointerInfo(SV),
/*Align=*/0,
/*Volatile=*/false,
/*ReadMem=*/true,
/*WriteMem=*/true);
17344 Chain = VAARG.getValue(1);
17346 // Load the next argument and return it
return DAG.getLoad(ArgVT, dl,
                   Chain,
                   VAARG,
                   MachinePointerInfo(),
                   false, false, false, 0);
}
17354 static SDValue LowerVACOPY(SDValue Op, const X86Subtarget *Subtarget,
17355 SelectionDAG &DAG) {
17356 // X86-64 va_list is a struct { i32, i32, i8*, i8* }.
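// That is 4 + 4 + 8 + 8 = 24 bytes on LP64, which is why the memcpy below
// copies exactly 24 bytes with 8-byte alignment.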
17357 assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!");
17358 SDValue Chain = Op.getOperand(0);
17359 SDValue DstPtr = Op.getOperand(1);
17360 SDValue SrcPtr = Op.getOperand(2);
17361 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
SDLoc DL(Op);
17365 return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
17366 DAG.getIntPtrConstant(24), 8, /*isVolatile*/false,
/*AlwaysInline=*/false,
MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
}
17371 // getTargetVShiftByConstNode - Handle vector element shifts where the shift
17372 // amount is a constant. Takes immediate version of shift as input.
17373 static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, MVT VT,
17374 SDValue SrcOp, uint64_t ShiftAmt,
17375 SelectionDAG &DAG) {
17376 MVT ElementType = VT.getVectorElementType();
// Fold this packed shift into its first operand if ShiftAmt is 0.
if (ShiftAmt == 0)
  return SrcOp;
17382 // Check for ShiftAmt >= element width
17383 if (ShiftAmt >= ElementType.getSizeInBits()) {
if (Opc == X86ISD::VSRAI)
  ShiftAmt = ElementType.getSizeInBits() - 1;
else
  return DAG.getConstant(0, VT);
}
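// e.g. for v8i16, a logical shift by 16 or more always yields zero, while an
// arithmetic shift by 16 or more is equivalent to shifting by 15: the lanes
// are already all sign bits.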
17390 assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
17391 && "Unknown target vector shift-by-constant node");
17393 // Fold this packed vector shift into a build vector if SrcOp is a
17394 // vector of Constants or UNDEFs, and SrcOp valuetype is the same as VT.
17395 if (VT == SrcOp.getSimpleValueType() &&
17396 ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
17397 SmallVector<SDValue, 8> Elts;
unsigned NumElts = SrcOp->getNumOperands();
ConstantSDNode *ND;

switch (Opc) {
17402 default: llvm_unreachable(nullptr);
case X86ISD::VSHLI:
  for (unsigned i = 0; i != NumElts; ++i) {
    SDValue CurrentOp = SrcOp->getOperand(i);
    if (CurrentOp->getOpcode() == ISD::UNDEF) {
      Elts.push_back(CurrentOp);
      continue;
    }
    ND = cast<ConstantSDNode>(CurrentOp);
    const APInt &C = ND->getAPIntValue();
    Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), ElementType));
  }
  break;
case X86ISD::VSRLI:
  for (unsigned i = 0; i != NumElts; ++i) {
    SDValue CurrentOp = SrcOp->getOperand(i);
    if (CurrentOp->getOpcode() == ISD::UNDEF) {
      Elts.push_back(CurrentOp);
      continue;
    }
    ND = cast<ConstantSDNode>(CurrentOp);
    const APInt &C = ND->getAPIntValue();
    Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), ElementType));
  }
  break;
case X86ISD::VSRAI:
  for (unsigned i = 0; i != NumElts; ++i) {
    SDValue CurrentOp = SrcOp->getOperand(i);
    if (CurrentOp->getOpcode() == ISD::UNDEF) {
      Elts.push_back(CurrentOp);
      continue;
    }
    ND = cast<ConstantSDNode>(CurrentOp);
    const APInt &C = ND->getAPIntValue();
    Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), ElementType));
  }
  break;
}
return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Elts);
}
17444 return DAG.getNode(Opc, dl, VT, SrcOp, DAG.getConstant(ShiftAmt, MVT::i8));
17447 // getTargetVShiftNode - Handle vector element shifts where the shift amount
17448 // may or may not be a constant. Takes immediate version of shift as input.
17449 static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT,
17450 SDValue SrcOp, SDValue ShAmt,
17451 SelectionDAG &DAG) {
17452 MVT SVT = ShAmt.getSimpleValueType();
17453 assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");
17455 // Catch shift-by-constant.
17456 if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
17457 return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
17458 CShAmt->getZExtValue(), DAG);
// Change opcode to non-immediate version.
switch (Opc) {
default: llvm_unreachable("Unknown target vector shift node");
17463 case X86ISD::VSHLI: Opc = X86ISD::VSHL; break;
17464 case X86ISD::VSRLI: Opc = X86ISD::VSRL; break;
case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;
}
17468 const X86Subtarget &Subtarget =
17469 static_cast<const X86Subtarget &>(DAG.getSubtarget());
17470 if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
17471 ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) {
17472 // Let the shuffle legalizer expand this shift amount node.
17473 SDValue Op0 = ShAmt.getOperand(0);
17474 Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(Op0), MVT::v8i16, Op0);
17475 ShAmt = getShuffleVectorZeroOrUndef(Op0, 0, true, &Subtarget, DAG);
} else {
// Need to build a vector containing the shift amount.
17478 // SSE/AVX packed shifts only use the lower 64-bit of the shift count.
17479 SmallVector<SDValue, 4> ShOps;
17480 ShOps.push_back(ShAmt);
17481 if (SVT == MVT::i32) {
17482 ShOps.push_back(DAG.getConstant(0, SVT));
ShOps.push_back(DAG.getUNDEF(SVT));
}
ShOps.push_back(DAG.getUNDEF(SVT));
17487 MVT BVT = SVT == MVT::i32 ? MVT::v4i32 : MVT::v2i64;
ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, BVT, ShOps);
}
17491 // The return type has to be a 128-bit type with the same element
17492 // type as the input type.
17493 MVT EltVT = VT.getVectorElementType();
17494 EVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());
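// e.g. for VT == v8i32 the count is bitcast to v4i32: the hardware takes the
// shift amount from the low 64 bits of a 128-bit vector regardless of VT.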
17496 ShAmt = DAG.getNode(ISD::BITCAST, dl, ShVT, ShAmt);
17497 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
17500 /// \brief Return (and \p Op, \p Mask) for compare instructions or
17501 /// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
17502 /// necessary casting for \p Mask when lowering masking intrinsics.
17503 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
17504 SDValue PreservedSrc,
17505 const X86Subtarget *Subtarget,
17506 SelectionDAG &DAG) {
17507 EVT VT = Op.getValueType();
17508 EVT MaskVT = EVT::getVectorVT(*DAG.getContext(),
17509 MVT::i1, VT.getVectorNumElements());
17510 EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17511 Mask.getValueType().getSizeInBits());
SDLoc dl(Op);

assert(MaskVT.isSimple() && "invalid mask type");

if (isAllOnes(Mask))
  return Op;
17519 // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements
17520 // are extracted by EXTRACT_SUBVECTOR.
17521 SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
17522 DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
17523 DAG.getIntPtrConstant(0));
switch (Op.getOpcode()) {
default: break;
case X86ISD::PCMPEQM:
case X86ISD::PCMPGTM:
case X86ISD::CMPM:
case X86ISD::CMPMU:
  return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
}
17533 if (PreservedSrc.getOpcode() == ISD::UNDEF)
17534 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
17535 return DAG.getNode(ISD::VSELECT, dl, VT, VMask, Op, PreservedSrc);
17538 /// \brief Creates an SDNode for a predicated scalar operation.
17539 /// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
/// The mask comes in as MVT::i8 and must be truncated
/// to MVT::i1 while lowering masking intrinsics.
/// The main difference between ScalarMaskingNode and VectorMaskingNode is that
/// the former uses "X86select" instead of "vselect": we simply cannot create
/// the "vselect" node for a scalar instruction.
17545 static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
17546 SDValue PreservedSrc,
17547 const X86Subtarget *Subtarget,
17548 SelectionDAG &DAG) {
if (isAllOnes(Mask))
  return Op;

EVT VT = Op.getValueType();
SDLoc dl(Op);
17554 // The mask should be of type MVT::i1
17555 SDValue IMask = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Mask);
17557 if (PreservedSrc.getOpcode() == ISD::UNDEF)
17558 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
17559 return DAG.getNode(X86ISD::SELECT, dl, VT, IMask, Op, PreservedSrc);
17562 static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
17563 SelectionDAG &DAG) {
SDLoc dl(Op);
unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
17566 EVT VT = Op.getValueType();
const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
if (IntrData) {
  switch(IntrData->Type) {
17570 case INTR_TYPE_1OP:
17571 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1));
17572 case INTR_TYPE_2OP:
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
                   Op.getOperand(2));
17575 case INTR_TYPE_3OP:
17576 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
17577 Op.getOperand(2), Op.getOperand(3));
17578 case INTR_TYPE_1OP_MASK_RM: {
17579 SDValue Src = Op.getOperand(1);
17580 SDValue Src0 = Op.getOperand(2);
17581 SDValue Mask = Op.getOperand(3);
17582 SDValue RoundingMode = Op.getOperand(4);
return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
                                        RoundingMode),
                            Mask, Src0, Subtarget, DAG);
}
17587 case INTR_TYPE_SCALAR_MASK_RM: {
17588 SDValue Src1 = Op.getOperand(1);
17589 SDValue Src2 = Op.getOperand(2);
17590 SDValue Src0 = Op.getOperand(3);
17591 SDValue Mask = Op.getOperand(4);
17592 SDValue RoundingMode = Op.getOperand(5);
return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
                                        RoundingMode),
                            Mask, Src0, Subtarget, DAG);
}
17597 case INTR_TYPE_2OP_MASK: {
17598 SDValue Mask = Op.getOperand(4);
17599 SDValue PassThru = Op.getOperand(3);
17600 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
17601 if (IntrWithRoundingModeOpcode != 0) {
17602 unsigned Round = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue();
17603 if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) {
17604 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
17605 dl, Op.getValueType(),
17606 Op.getOperand(1), Op.getOperand(2),
17607 Op.getOperand(3), Op.getOperand(5)),
Mask, PassThru, Subtarget, DAG);
}
}
return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
                                        Op.getOperand(1),
                                        Op.getOperand(2)),
                            Mask, PassThru, Subtarget, DAG);
}
17616 case FMA_OP_MASK: {
17617 SDValue Src1 = Op.getOperand(1);
17618 SDValue Src2 = Op.getOperand(2);
17619 SDValue Src3 = Op.getOperand(3);
17620 SDValue Mask = Op.getOperand(4);
17621 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
17622 if (IntrWithRoundingModeOpcode != 0) {
17623 SDValue Rnd = Op.getOperand(5);
17624 if (cast<ConstantSDNode>(Rnd)->getZExtValue() !=
17625 X86::STATIC_ROUNDING::CUR_DIRECTION)
17626 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
17627 dl, Op.getValueType(),
17628 Src1, Src2, Src3, Rnd),
Mask, Src1, Subtarget, DAG);
}
return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
                                        dl, Op.getValueType(),
                                        Src1, Src2, Src3),
                            Mask, Src1, Subtarget, DAG);
}
case CMP_MASK:
case CMP_MASK_CC: {
17638 // Comparison intrinsics with masks.
17639 // Example of transformation:
17640 // (i8 (int_x86_avx512_mask_pcmpeq_q_128
17641 // (v2i64 %a), (v2i64 %b), (i8 %mask))) ->
// (i8 (bitcast
//   (v8i1 (insert_subvector undef,
17644 // (v2i1 (and (PCMPEQM %a, %b),
17645 // (extract_subvector
17646 // (v8i1 (bitcast %mask)), 0))), 0))))
17647 EVT VT = Op.getOperand(1).getValueType();
17648 EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17649 VT.getVectorNumElements());
17650 SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3);
17651 EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17652 Mask.getValueType().getSizeInBits());
SDValue Cmp;
if (IntrData->Type == CMP_MASK_CC) {
17655 Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
17656 Op.getOperand(2), Op.getOperand(3));
} else {
  assert(IntrData->Type == CMP_MASK && "Unexpected intrinsic type!");
  Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
                    Op.getOperand(2));
}
17662 SDValue CmpMask = getVectorMaskingNode(Cmp, Mask,
DAG.getTargetConstant(0, MaskVT),
Subtarget, DAG);
17665 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
17666 DAG.getUNDEF(BitcastVT), CmpMask,
17667 DAG.getIntPtrConstant(0));
17668 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
17670 case COMI: { // Comparison intrinsics
17671 ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
17672 SDValue LHS = Op.getOperand(1);
17673 SDValue RHS = Op.getOperand(2);
17674 unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG);
17675 assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!");
17676 SDValue Cond = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
17677 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17678 DAG.getConstant(X86CC, MVT::i8), Cond);
17679 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
}
case VSHIFT:
  return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
17683 Op.getOperand(1), Op.getOperand(2), DAG);
case VSHIFT_MASK:
  return getVectorMaskingNode(getTargetVShiftNode(IntrData->Opc0, dl,
                                                  Op.getSimpleValueType(),
                                                  Op.getOperand(1),
                                                  Op.getOperand(2), DAG),
                              Op.getOperand(4), Op.getOperand(3), Subtarget,
                              DAG);
17691 case COMPRESS_EXPAND_IN_REG: {
17692 SDValue Mask = Op.getOperand(3);
17693 SDValue DataToCompress = Op.getOperand(1);
17694 SDValue PassThru = Op.getOperand(2);
17695 if (isAllOnes(Mask)) // return data as is
17696 return Op.getOperand(1);
17697 EVT VT = Op.getValueType();
17698 EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17699 VT.getVectorNumElements());
17700 EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17701 Mask.getValueType().getSizeInBits());
17703 SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
17704 DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
17705 DAG.getIntPtrConstant(0));
return DAG.getNode(IntrData->Opc0, dl, VT, VMask, DataToCompress,
                   PassThru);
}
case BLEND: {
17711 SDValue Mask = Op.getOperand(3);
17712 EVT VT = Op.getValueType();
17713 EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17714 VT.getVectorNumElements());
17715 EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17716 Mask.getValueType().getSizeInBits());
17718 SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
17719 DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
17720 DAG.getIntPtrConstant(0));
return DAG.getNode(IntrData->Opc0, dl, VT, VMask, Op.getOperand(1),
                   Op.getOperand(2));
}
default:
  break;
}
}

switch (IntNo) {
default: return SDValue();    // Don't custom lower most intrinsics.
17732 case Intrinsic::x86_avx512_mask_valign_q_512:
17733 case Intrinsic::x86_avx512_mask_valign_d_512:
17734 // Vector source operands are swapped.
17735 return getVectorMaskingNode(DAG.getNode(X86ISD::VALIGN, dl,
Op.getValueType(), Op.getOperand(2),
Op.getOperand(1),
Op.getOperand(3)),
Op.getOperand(5), Op.getOperand(4),
Subtarget, DAG);
// ptest and testp intrinsics. The intrinsics these come from are designed to
// return an integer value, not just an instruction, so lower them to the
// ptest or testp pattern and a setcc for the result.
17745 case Intrinsic::x86_sse41_ptestz:
17746 case Intrinsic::x86_sse41_ptestc:
17747 case Intrinsic::x86_sse41_ptestnzc:
17748 case Intrinsic::x86_avx_ptestz_256:
17749 case Intrinsic::x86_avx_ptestc_256:
17750 case Intrinsic::x86_avx_ptestnzc_256:
17751 case Intrinsic::x86_avx_vtestz_ps:
17752 case Intrinsic::x86_avx_vtestc_ps:
17753 case Intrinsic::x86_avx_vtestnzc_ps:
17754 case Intrinsic::x86_avx_vtestz_pd:
17755 case Intrinsic::x86_avx_vtestc_pd:
17756 case Intrinsic::x86_avx_vtestnzc_pd:
17757 case Intrinsic::x86_avx_vtestz_ps_256:
17758 case Intrinsic::x86_avx_vtestc_ps_256:
17759 case Intrinsic::x86_avx_vtestnzc_ps_256:
17760 case Intrinsic::x86_avx_vtestz_pd_256:
17761 case Intrinsic::x86_avx_vtestc_pd_256:
17762 case Intrinsic::x86_avx_vtestnzc_pd_256: {
bool IsTestPacked = false;
unsigned X86CC;
switch (IntNo) {
17766 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
17767 case Intrinsic::x86_avx_vtestz_ps:
17768 case Intrinsic::x86_avx_vtestz_pd:
17769 case Intrinsic::x86_avx_vtestz_ps_256:
17770 case Intrinsic::x86_avx_vtestz_pd_256:
17771 IsTestPacked = true; // Fallthrough
17772 case Intrinsic::x86_sse41_ptestz:
17773 case Intrinsic::x86_avx_ptestz_256:
// ZF = 1
X86CC = X86::COND_E;
break;
17777 case Intrinsic::x86_avx_vtestc_ps:
17778 case Intrinsic::x86_avx_vtestc_pd:
17779 case Intrinsic::x86_avx_vtestc_ps_256:
17780 case Intrinsic::x86_avx_vtestc_pd_256:
17781 IsTestPacked = true; // Fallthrough
17782 case Intrinsic::x86_sse41_ptestc:
17783 case Intrinsic::x86_avx_ptestc_256:
// CF = 1
X86CC = X86::COND_B;
break;
17787 case Intrinsic::x86_avx_vtestnzc_ps:
17788 case Intrinsic::x86_avx_vtestnzc_pd:
17789 case Intrinsic::x86_avx_vtestnzc_ps_256:
17790 case Intrinsic::x86_avx_vtestnzc_pd_256:
17791 IsTestPacked = true; // Fallthrough
17792 case Intrinsic::x86_sse41_ptestnzc:
17793 case Intrinsic::x86_avx_ptestnzc_256:
// ZF and CF = 0
X86CC = X86::COND_A;
break;
}
17799 SDValue LHS = Op.getOperand(1);
17800 SDValue RHS = Op.getOperand(2);
17801 unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST;
17802 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
17803 SDValue CC = DAG.getConstant(X86CC, MVT::i8);
17804 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test);
return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
}
17807 case Intrinsic::x86_avx512_kortestz_w:
17808 case Intrinsic::x86_avx512_kortestc_w: {
17809 unsigned X86CC = (IntNo == Intrinsic::x86_avx512_kortestz_w)? X86::COND_E: X86::COND_B;
17810 SDValue LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, Op.getOperand(1));
17811 SDValue RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, Op.getOperand(2));
17812 SDValue CC = DAG.getConstant(X86CC, MVT::i8);
17813 SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
17814 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i1, CC, Test);
return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
}
17818 case Intrinsic::x86_sse42_pcmpistria128:
17819 case Intrinsic::x86_sse42_pcmpestria128:
17820 case Intrinsic::x86_sse42_pcmpistric128:
17821 case Intrinsic::x86_sse42_pcmpestric128:
17822 case Intrinsic::x86_sse42_pcmpistrio128:
17823 case Intrinsic::x86_sse42_pcmpestrio128:
17824 case Intrinsic::x86_sse42_pcmpistris128:
17825 case Intrinsic::x86_sse42_pcmpestris128:
17826 case Intrinsic::x86_sse42_pcmpistriz128:
17827 case Intrinsic::x86_sse42_pcmpestriz128: {
unsigned Opcode;
unsigned X86CC;
switch (IntNo) {
default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
case Intrinsic::x86_sse42_pcmpistria128:
  Opcode = X86ISD::PCMPISTRI;
  X86CC = X86::COND_A;
  break;
case Intrinsic::x86_sse42_pcmpestria128:
  Opcode = X86ISD::PCMPESTRI;
  X86CC = X86::COND_A;
  break;
case Intrinsic::x86_sse42_pcmpistric128:
  Opcode = X86ISD::PCMPISTRI;
  X86CC = X86::COND_B;
  break;
case Intrinsic::x86_sse42_pcmpestric128:
  Opcode = X86ISD::PCMPESTRI;
  X86CC = X86::COND_B;
  break;
case Intrinsic::x86_sse42_pcmpistrio128:
  Opcode = X86ISD::PCMPISTRI;
  X86CC = X86::COND_O;
  break;
case Intrinsic::x86_sse42_pcmpestrio128:
  Opcode = X86ISD::PCMPESTRI;
  X86CC = X86::COND_O;
  break;
case Intrinsic::x86_sse42_pcmpistris128:
  Opcode = X86ISD::PCMPISTRI;
  X86CC = X86::COND_S;
  break;
case Intrinsic::x86_sse42_pcmpestris128:
  Opcode = X86ISD::PCMPESTRI;
  X86CC = X86::COND_S;
  break;
case Intrinsic::x86_sse42_pcmpistriz128:
  Opcode = X86ISD::PCMPISTRI;
  X86CC = X86::COND_E;
  break;
case Intrinsic::x86_sse42_pcmpestriz128:
  Opcode = X86ISD::PCMPESTRI;
  X86CC = X86::COND_E;
  break;
}
17873 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
17874 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
17875 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps);
17876 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17877 DAG.getConstant(X86CC, MVT::i8),
17878 SDValue(PCMP.getNode(), 1));
return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
}
17882 case Intrinsic::x86_sse42_pcmpistri128:
17883 case Intrinsic::x86_sse42_pcmpestri128: {
unsigned Opcode;
if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
  Opcode = X86ISD::PCMPISTRI;
else
  Opcode = X86ISD::PCMPESTRI;
17890 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
17891 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
return DAG.getNode(Opcode, dl, VTs, NewOps);
}
}
}
17897 static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
17898 SDValue Src, SDValue Mask, SDValue Base,
17899 SDValue Index, SDValue ScaleOp, SDValue Chain,
17900 const X86Subtarget * Subtarget) {
SDLoc dl(Op);
ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
17903 assert(C && "Invalid scale type");
17904 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
17905 EVT MaskVT = MVT::getVectorVT(MVT::i1,
17906 Index.getSimpleValueType().getVectorNumElements());
SDValue MaskInReg;
ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
if (MaskC)
  MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT);
else
  MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
17913 SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
17914 SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
17915 SDValue Segment = DAG.getRegister(0, MVT::i32);
17916 if (Src.getOpcode() == ISD::UNDEF)
17917 Src = getZeroVector(Op.getValueType(), Subtarget, DAG, dl);
17918 SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain};
17919 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
17920 SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
17921 return DAG.getMergeValues(RetOps, dl);
17924 static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
17925 SDValue Src, SDValue Mask, SDValue Base,
17926 SDValue Index, SDValue ScaleOp, SDValue Chain) {
SDLoc dl(Op);
ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
17929 assert(C && "Invalid scale type");
17930 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
17931 SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
17932 SDValue Segment = DAG.getRegister(0, MVT::i32);
17933 EVT MaskVT = MVT::getVectorVT(MVT::i1,
17934 Index.getSimpleValueType().getVectorNumElements());
SDValue MaskInReg;
ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
if (MaskC)
  MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT);
else
  MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
17941 SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
17942 SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain};
17943 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
17944 return SDValue(Res, 1);
17947 static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
17948 SDValue Mask, SDValue Base, SDValue Index,
17949 SDValue ScaleOp, SDValue Chain) {
SDLoc dl(Op);
ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
17952 assert(C && "Invalid scale type");
17953 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
17954 SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
17955 SDValue Segment = DAG.getRegister(0, MVT::i32);
EVT MaskVT =
    MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
SDValue MaskInReg;
ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
if (MaskC)
  MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT);
else
  MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
17965 SDValue Ops[] = {MaskInReg, Base, Scale, Index, Disp, Segment, Chain};
17966 SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
17967 return SDValue(Res, 0);
17970 // getReadPerformanceCounter - Handles the lowering of builtin intrinsics that
17971 // read performance monitor counters (x86_rdpmc).
17972 static void getReadPerformanceCounter(SDNode *N, SDLoc DL,
17973 SelectionDAG &DAG, const X86Subtarget *Subtarget,
17974 SmallVectorImpl<SDValue> &Results) {
17975 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
17976 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
SDValue LO, HI;

// The ECX register is used to select the index of the performance counter
// to read.
SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX,
                                 N->getOperand(2));
17983 SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain);
17985 // Reads the content of a 64-bit performance counter and returns it in the
17986 // registers EDX:EAX.
17987 if (Subtarget->is64Bit()) {
17988 LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
                        LO.getValue(2));
} else {
  LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
  HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
                          LO.getValue(2));
}
17996 Chain = HI.getValue(1);
17998 if (Subtarget->is64Bit()) {
17999 // The EAX register is loaded with the low-order 32 bits. The EDX register
18000 // is loaded with the supported high-order bits of the counter.
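// Reassembling (HI << 32) | LO is safe here: on x86-64, RDPMC zero-extends
// the two 32-bit halves into RAX and RDX, so the OR sees no stray high bits.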
18001 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
18002 DAG.getConstant(32, MVT::i8));
18003 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
Results.push_back(Chain);
return;
}
18008 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
18009 SDValue Ops[] = { LO, HI };
18010 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
18011 Results.push_back(Pair);
Results.push_back(Chain);
}
18015 // getReadTimeStampCounter - Handles the lowering of builtin intrinsics that
18016 // read the time stamp counter (x86_rdtsc and x86_rdtscp). This function is
18017 // also used to custom lower READCYCLECOUNTER nodes.
18018 static void getReadTimeStampCounter(SDNode *N, SDLoc DL, unsigned Opcode,
18019 SelectionDAG &DAG, const X86Subtarget *Subtarget,
18020 SmallVectorImpl<SDValue> &Results) {
18021 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0));
SDValue LO, HI;
18025 // The processor's time-stamp counter (a 64-bit MSR) is stored into the
18026 // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
18027 // and the EAX register is loaded with the low-order 32 bits.
18028 if (Subtarget->is64Bit()) {
18029 LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
                        LO.getValue(2));
} else {
  LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
  HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
                          LO.getValue(2));
}
18037 SDValue Chain = HI.getValue(1);
18039 if (Opcode == X86ISD::RDTSCP_DAG) {
18040 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
18042 // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
18043 // the ECX register. Add 'ecx' explicitly to the chain.
SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32,
                                 HI.getValue(2));
18046 // Explicitly store the content of ECX at the location passed in input
18047 // to the 'rdtscp' intrinsic.
18048 Chain = DAG.getStore(ecx.getValue(1), DL, ecx, N->getOperand(2),
MachinePointerInfo(), false, false, 0);
}
18052 if (Subtarget->is64Bit()) {
18053 // The EDX register is loaded with the high-order 32 bits of the MSR, and
18054 // the EAX register is loaded with the low-order 32 bits.
18055 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
18056 DAG.getConstant(32, MVT::i8));
18057 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
Results.push_back(Chain);
return;
}
18062 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
18063 SDValue Ops[] = { LO, HI };
18064 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
18065 Results.push_back(Pair);
Results.push_back(Chain);
}
18069 static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget,
18070 SelectionDAG &DAG) {
18071 SmallVector<SDValue, 2> Results;
SDLoc DL(Op);
getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget,
                        Results);
return DAG.getMergeValues(Results, DL);
}
18079 static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
18080 SelectionDAG &DAG) {
18081 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
const IntrinsicData* IntrData = getIntrinsicWithChain(IntNo);
if (!IntrData)
  return SDValue();

SDLoc dl(Op);
switch(IntrData->Type) {
default:
  llvm_unreachable("Unknown Intrinsic Type");
case RDSEED:
case RDRAND: {
18094 // Emit the node with the right value type.
18095 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other);
18096 SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
18098 // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
18099 // Otherwise return the value from Rand, which is always 0, casted to i32.
18100 SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
18101 DAG.getConstant(1, Op->getValueType(1)),
18102 DAG.getConstant(X86::COND_B, MVT::i32),
18103 SDValue(Result.getNode(), 1) };
18104 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl,
DAG.getVTList(Op->getValueType(1), MVT::Glue),
Ops);
18108 // Return { result, isValid, chain }.
18109 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
18110 SDValue(Result.getNode(), 2));
}
case GATHER: {
  //gather(v1, mask, index, base, scale);
18114 SDValue Chain = Op.getOperand(0);
18115 SDValue Src = Op.getOperand(2);
18116 SDValue Base = Op.getOperand(3);
18117 SDValue Index = Op.getOperand(4);
18118 SDValue Mask = Op.getOperand(5);
18119 SDValue Scale = Op.getOperand(6);
return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale,
                     Chain, Subtarget);
}
case SCATTER: {
  //scatter(base, mask, index, v1, scale);
18125 SDValue Chain = Op.getOperand(0);
18126 SDValue Base = Op.getOperand(2);
18127 SDValue Mask = Op.getOperand(3);
18128 SDValue Index = Op.getOperand(4);
18129 SDValue Src = Op.getOperand(5);
18130 SDValue Scale = Op.getOperand(6);
18131 return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain);
}
case PREFETCH: {
  SDValue Hint = Op.getOperand(6);
  unsigned HintVal;
18136 if (dyn_cast<ConstantSDNode> (Hint) == nullptr ||
18137 (HintVal = dyn_cast<ConstantSDNode> (Hint)->getZExtValue()) > 1)
18138 llvm_unreachable("Wrong prefetch hint in intrinsic: should be 0 or 1");
18139 unsigned Opcode = (HintVal ? IntrData->Opc1 : IntrData->Opc0);
18140 SDValue Chain = Op.getOperand(0);
18141 SDValue Mask = Op.getOperand(2);
18142 SDValue Index = Op.getOperand(3);
18143 SDValue Base = Op.getOperand(4);
18144 SDValue Scale = Op.getOperand(5);
return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain);
}
// Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
case RDTSC: {
  SmallVector<SDValue, 2> Results;
18150 getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget, Results);
return DAG.getMergeValues(Results, dl);
}
// Read Performance Monitoring Counters.
case RDPMC: {
  SmallVector<SDValue, 2> Results;
18156 getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results);
return DAG.getMergeValues(Results, dl);
}
// XTEST intrinsics.
case XTEST: {
  SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
18162 SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
                            DAG.getConstant(X86::COND_NE, MVT::i8),
                            InTrans);
18166 SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
18167 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
18168 Ret, SDValue(InTrans.getNode(), 1));
}
// ADC/ADCX/SBB
case ADX: {
  SmallVector<SDValue, 2> Results;
18173 SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
18174 SDVTList VTs = DAG.getVTList(Op.getOperand(3)->getValueType(0), MVT::Other);
18175 SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2),
18176 DAG.getConstant(-1, MVT::i8));
18177 SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3),
18178 Op.getOperand(4), GenCF.getValue(1));
18179 SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0),
Op.getOperand(5), MachinePointerInfo(),
false, false, 0);
18182 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
DAG.getConstant(X86::COND_B, MVT::i8),
Res.getValue(1));
18185 Results.push_back(SetCC);
18186 Results.push_back(Store);
18187 return DAG.getMergeValues(Results, dl);
}
case COMPRESS_TO_MEM: {
18191 SDValue Mask = Op.getOperand(4);
18192 SDValue DataToCompress = Op.getOperand(3);
18193 SDValue Addr = Op.getOperand(2);
18194 SDValue Chain = Op.getOperand(0);
18196 if (isAllOnes(Mask)) // return just a store
18197 return DAG.getStore(Chain, dl, DataToCompress, Addr,
18198 MachinePointerInfo(), false, false, 0);
18200 EVT VT = DataToCompress.getValueType();
18201 EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
18202 VT.getVectorNumElements());
18203 EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
18204 Mask.getValueType().getSizeInBits());
18205 SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
18206 DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
18207 DAG.getIntPtrConstant(0));
18209 SDValue Compressed = DAG.getNode(IntrData->Opc0, dl, VT, VMask,
18210 DataToCompress, DAG.getUNDEF(VT));
18211 return DAG.getStore(Chain, dl, Compressed, Addr,
18212 MachinePointerInfo(), false, false, 0);
}
case EXPAND_FROM_MEM: {
18216 SDValue Mask = Op.getOperand(4);
SDValue PassThru = Op.getOperand(3);
18218 SDValue Addr = Op.getOperand(2);
18219 SDValue Chain = Op.getOperand(0);
18220 EVT VT = Op.getValueType();
18222 if (isAllOnes(Mask)) // return just a load
return DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(), false, false,
                   false, 0);
18225 EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
18226 VT.getVectorNumElements());
18227 EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
18228 Mask.getValueType().getSizeInBits());
18229 SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
18230 DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
18231 DAG.getIntPtrConstant(0));
18233 SDValue DataToExpand = DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(),
18234 false, false, false, 0);
18236 SmallVector<SDValue, 2> Results;
Results.push_back(DAG.getNode(IntrData->Opc0, dl, VT, VMask, DataToExpand,
                              PassThru));
Results.push_back(Chain);
return DAG.getMergeValues(Results, dl);
}
}
}
18245 SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
18246 SelectionDAG &DAG) const {
18247 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
18248 MFI->setReturnAddressIsTaken(true);
if (verifyReturnAddressArgumentIsConstant(Op, DAG))
  return SDValue();
unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
SDLoc dl(Op);
EVT PtrVT = getPointerTy();

if (Depth > 0) {
  SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
18259 const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
18260 SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), PtrVT);
18261 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
18262 DAG.getNode(ISD::ADD, dl, PtrVT,
18263 FrameAddr, Offset),
MachinePointerInfo(), false, false, false, 0);
}
18267 // Just load the return address.
18268 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
18269 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
RetAddrFI, MachinePointerInfo(), false, false, false, 0);
}
18273 SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
18274 MachineFunction &MF = DAG.getMachineFunction();
18275 MachineFrameInfo *MFI = MF.getFrameInfo();
18276 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
18277 const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
18278 EVT VT = Op.getValueType();
18280 MFI->setFrameAddressIsTaken(true);
18282 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
18283 // Depth > 0 makes no sense on targets which use Windows unwind codes. It
// is not possible to crawl up the stack without looking at the unwind codes
// simultaneously.
18286 int FrameAddrIndex = FuncInfo->getFAIndex();
18287 if (!FrameAddrIndex) {
18288 // Set up a frame object for the return address.
18289 unsigned SlotSize = RegInfo->getSlotSize();
18290 FrameAddrIndex = MF.getFrameInfo()->CreateFixedObject(
18291 SlotSize, /*Offset=*/INT64_MIN, /*IsImmutable=*/false);
18292 FuncInfo->setFAIndex(FrameAddrIndex);
return DAG.getFrameIndex(FrameAddrIndex, VT);
}
18297 unsigned FrameReg =
18298 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
18299 SDLoc dl(Op); // FIXME probably not meaningful
18300 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
18301 assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
18302 (FrameReg == X86::EBP && VT == MVT::i32)) &&
18303 "Invalid Frame Register!");
18304 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
while (Depth--)
  FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
                          MachinePointerInfo(),
                          false, false, false, 0);
return FrameAddr;
}
18312 // FIXME? Maybe this could be a TableGen attribute on some registers and
18313 // this table could be generated automatically from RegInfo.
unsigned X86TargetLowering::getRegisterByName(const char* RegName,
                                              EVT VT) const {
  unsigned Reg = StringSwitch<unsigned>(RegName)
                     .Case("esp", X86::ESP)
                     .Case("rsp", X86::RSP)
                     .Default(0);
  if (Reg)
    return Reg;
  report_fatal_error("Invalid register name global variable");
}
18325 SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
18326 SelectionDAG &DAG) const {
18327 const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
18328 return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize());
18331 SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
18332 SDValue Chain = Op.getOperand(0);
18333 SDValue Offset = Op.getOperand(1);
18334 SDValue Handler = Op.getOperand(2);
SDLoc dl(Op);

EVT PtrVT = getPointerTy();
18338 const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
18339 unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
18340 assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
18341 (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
18342 "Invalid Frame Register!");
18343 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
18344 unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
18346 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
18347 DAG.getIntPtrConstant(RegInfo->getSlotSize()));
18348 StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(),
                     false, false, 0);
18351 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
18353 return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
DAG.getRegister(StoreAddrReg, PtrVT));
}
18357 SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
18358 SelectionDAG &DAG) const {
SDLoc DL(Op);
return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
18361 DAG.getVTList(MVT::i32, MVT::Other),
18362 Op.getOperand(0), Op.getOperand(1));
18365 SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
18366 SelectionDAG &DAG) const {
SDLoc DL(Op);
return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
18369 Op.getOperand(0), Op.getOperand(1));
18372 static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
18373 return Op.getOperand(0);
18376 SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
18377 SelectionDAG &DAG) const {
18378 SDValue Root = Op.getOperand(0);
18379 SDValue Trmp = Op.getOperand(1); // trampoline
18380 SDValue FPtr = Op.getOperand(2); // nested function
SDValue Nest = Op.getOperand(3); // 'nest' parameter value
SDLoc dl(Op);
18384 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
18385 const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
18387 if (Subtarget->is64Bit()) {
18388 SDValue OutChains[6];
18390 // Large code-model.
18391 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
18392 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
18394 const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
18395 const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
18397 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
18399 // Load the pointer to the nested function into R11.
18400 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
18401 SDValue Addr = Trmp;
18402 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
Addr, MachinePointerInfo(TrmpAddr),
false, false, 0);
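// Stored as a little-endian i16, the bytes land in memory as 0x49 0xBB:
// the REX.WB prefix (0x40|0x08|0x01) followed by the B8+rd opcode for r11,
// i.e. the first two bytes of "movabsq $<fptr>, %r11".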
18406 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
18407 DAG.getConstant(2, MVT::i64));
18408 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
MachinePointerInfo(TrmpAddr, 2),
false, false, 2);
18412 // Load the 'nest' parameter value into R10.
18413 // R10 is specified in X86CallingConv.td
18414 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
18415 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
18416 DAG.getConstant(10, MVT::i64));
18417 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
Addr, MachinePointerInfo(TrmpAddr, 10),
false, false, 0);
18421 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
18422 DAG.getConstant(12, MVT::i64));
18423 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
MachinePointerInfo(TrmpAddr, 12),
false, false, 2);
18427 // Jump to the nested function.
18428 OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
18429 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
18430 DAG.getConstant(20, MVT::i64));
18431 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
Addr, MachinePointerInfo(TrmpAddr, 20),
false, false, 0);
18435 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
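// mod=0b11 (register-direct), reg=4 (the /4 opcode extension selects JMP),
// rm=r11 gives ModRM byte 0xE3; together with opcode 0xFF this encodes
// "jmpq *%r11".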
18436 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
18437 DAG.getConstant(22, MVT::i64));
18438 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr,
MachinePointerInfo(TrmpAddr, 22),
false, false, 0);
18442 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
} else {
const Function *Func =
18445 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
CallingConv::ID CC = Func->getCallingConv();
unsigned NestReg;

switch (CC) {
default:
  llvm_unreachable("Unsupported calling convention");
18452 case CallingConv::C:
18453 case CallingConv::X86_StdCall: {
18454 // Pass 'nest' parameter in ECX.
18455 // Must be kept in sync with X86CallingConv.td
18456 NestReg = X86::ECX;
18458 // Check that ECX wasn't needed by an 'inreg' parameter.
18459 FunctionType *FTy = Func->getFunctionType();
18460 const AttributeSet &Attrs = Func->getAttributes();
18462 if (!Attrs.isEmpty() && !Func->isVarArg()) {
unsigned InRegCount = 0;
unsigned Idx = 1;
18466 for (FunctionType::param_iterator I = FTy->param_begin(),
18467 E = FTy->param_end(); I != E; ++I, ++Idx)
18468 if (Attrs.hasAttribute(Idx, Attribute::InReg))
18469 // FIXME: should only count parameters that are lowered to integers.
18470 InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32;
if (InRegCount > 2) {
  report_fatal_error("Nest register in use - reduce number of inreg"
                     " parameters!");
}
}
break;
}
18479 case CallingConv::X86_FastCall:
18480 case CallingConv::X86_ThisCall:
18481 case CallingConv::Fast:
18482 // Pass 'nest' parameter in EAX.
18483 // Must be kept in sync with X86CallingConv.td
NestReg = X86::EAX;
break;
}
18488 SDValue OutChains[4];
18489 SDValue Addr, Disp;
18491 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
18492 DAG.getConstant(10, MVT::i32));
18493 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
18495 // This is storing the opcode for MOV32ri.
18496 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
18497 const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
18498 OutChains[0] = DAG.getStore(Root, dl,
18499 DAG.getConstant(MOV32ri|N86Reg, MVT::i8),
Trmp, MachinePointerInfo(TrmpAddr),
false, false, 0);
18503 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
18504 DAG.getConstant(1, MVT::i32));
18505 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
MachinePointerInfo(TrmpAddr, 1),
false, false, 1);
18509 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
18510 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
18511 DAG.getConstant(5, MVT::i32));
18512 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr,
MachinePointerInfo(TrmpAddr, 5),
false, false, 1);
18516 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
18517 DAG.getConstant(6, MVT::i32));
18518 OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
MachinePointerInfo(TrmpAddr, 6),
false, false, 1);

return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
}
}
18526 SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
18527 SelectionDAG &DAG) const {
/*
 The rounding mode is in bits 11:10 of FPSR, and has the following
 settings:
   00 Round to nearest
   01 Round to -inf
   10 Round to +inf
   11 Round to 0

 FLT_ROUNDS, on the other hand, expects the following:
  -1 Undefined
   0 Round to 0
   1 Round to nearest
   2 Round to +inf
   3 Round to -inf

 To perform the conversion, we do:
   (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3)
*/
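// Quick check of the formula, mode by mode (bit 11 = 0x800, bit 10 = 0x400):
//   00 -> (0 | 0) + 1 = 1 & 3 = 1  (nearest)
//   01 -> (0 | 2) + 1 = 3 & 3 = 3  (-inf)
//   10 -> (1 | 0) + 1 = 2 & 3 = 2  (+inf)
//   11 -> (1 | 2) + 1 = 4 & 3 = 0  (to zero)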
18547 MachineFunction &MF = DAG.getMachineFunction();
18548 const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
18549 unsigned StackAlignment = TFI.getStackAlignment();
18550 MVT VT = Op.getSimpleValueType();
18553 // Save FP Control Word to stack slot
18554 int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false);
18555 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
18557 MachineMemOperand *MMO =
18558 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
18559 MachineMemOperand::MOStore, 2, 2);
18561 SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
18562 SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
18563 DAG.getVTList(MVT::Other),
18564 Ops, MVT::i16, MMO);
18566 // Load FP Control Word from stack slot
18567 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot,
18568 MachinePointerInfo(), false, false, false, 0);
// Transform as necessary
SDValue CWD1 =
    DAG.getNode(ISD::SRL, DL, MVT::i16,
18573 DAG.getNode(ISD::AND, DL, MVT::i16,
18574 CWD, DAG.getConstant(0x800, MVT::i16)),
18575 DAG.getConstant(11, MVT::i8));
SDValue CWD2 =
    DAG.getNode(ISD::SRL, DL, MVT::i16,
18578 DAG.getNode(ISD::AND, DL, MVT::i16,
18579 CWD, DAG.getConstant(0x400, MVT::i16)),
18580 DAG.getConstant(9, MVT::i8));
SDValue RetVal =
    DAG.getNode(ISD::AND, DL, MVT::i16,
18584 DAG.getNode(ISD::ADD, DL, MVT::i16,
18585 DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),
18586 DAG.getConstant(1, MVT::i16)),
18587 DAG.getConstant(3, MVT::i16));
18589 return DAG.getNode((VT.getSizeInBits() < 16 ?
ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
}
18593 static SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) {
18594 MVT VT = Op.getSimpleValueType();
EVT OpVT = VT;
unsigned NumBits = VT.getSizeInBits();
SDLoc dl(Op);
18599 Op = Op.getOperand(0);
18600 if (VT == MVT::i8) {
// Zero extend to i32 since there is not an i8 bsr.
OpVT = MVT::i32;
Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
}
18606 // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
18607 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
18608 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
18610 // If src is zero (i.e. bsr sets ZF), returns NumBits.
SDValue Ops[] = {
  Op,
  DAG.getConstant(NumBits+NumBits-1, OpVT),
  DAG.getConstant(X86::COND_E, MVT::i8),
  Op.getValue(1)
};
Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
18619 // Finally xor with NumBits-1.
18620 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));
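// For these power-of-two widths, (NumBits-1) - bsr(x) == bsr(x) ^ (NumBits-1),
// so one XOR turns the bit index into a leading-zero count; the 2*NumBits-1
// constant selected above likewise XORs down to NumBits for a zero input.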
if (VT == MVT::i8)
  Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
return Op;
}
18627 static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) {
18628 MVT VT = Op.getSimpleValueType();
EVT OpVT = VT;
unsigned NumBits = VT.getSizeInBits();
SDLoc dl(Op);
18633 Op = Op.getOperand(0);
18634 if (VT == MVT::i8) {
// Zero extend to i32 since there is not an i8 bsr.
OpVT = MVT::i32;
Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
}
18640 // Issue a bsr (scan bits in reverse).
18641 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
18642 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
18644 // And xor with NumBits-1.
18645 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));
18648 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  unsigned NumBits = VT.getSizeInBits();
  SDLoc dl(Op);
  Op = Op.getOperand(0);

  // Issue a bsf (scan bits forward) which also sets EFLAGS.
  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
  Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op);

  // If src is zero (i.e. bsf sets ZF), returns NumBits.
  SDValue Ops[] = {
    Op,
    DAG.getConstant(NumBits, VT),
    DAG.getConstant(X86::COND_E, MVT::i8),
    Op.getValue(1)
  };
  return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
}
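// Illustrative sketch (hypothetical helper): BSF returns the index of the
// lowest set bit, which is exactly cttz for nonzero inputs; the CMOV above
// substitutes NumBits when the input is zero (BSF sets ZF in that case).
LLVM_ATTRIBUTE_UNUSED static unsigned hostCttz32ViaBsf(uint32_t X) {
  if (X == 0)
    return 32;                 // CMOV on ZF supplies NumBits.
  unsigned Bsf = 0;
  while (!(X & (1u << Bsf)))   // Index of the lowest set bit.
    ++Bsf;
  return Bsf;
}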
// Lower256IntArith - Break a 256-bit integer operation into two new 128-bit
// ones, and then concatenate the result back.
static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();

  assert(VT.is256BitVector() && VT.isInteger() &&
         "Unsupported value type for operation");

  unsigned NumElems = VT.getVectorNumElements();
  SDLoc dl(Op);

  // Extract the LHS vectors
  SDValue LHS = Op.getOperand(0);
  SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
  SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);

  // Extract the RHS vectors
  SDValue RHS = Op.getOperand(1);
  SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl);
  SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl);

  MVT EltVT = VT.getVectorElementType();
  MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);

  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
}
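// For example (illustrative only), with AVX but not AVX2 an 8 x i32 add is
// decomposed by the helper above as:
//   (v8i32 add A, B)
//     -> (concat_vectors (v4i32 add A.lo, B.lo), (v4i32 add A.hi, B.hi))
// so each half maps onto a legal 128-bit operation.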
static SDValue LowerADD(SDValue Op, SelectionDAG &DAG) {
  assert(Op.getSimpleValueType().is256BitVector() &&
         Op.getSimpleValueType().isInteger() &&
         "Only handle AVX 256-bit vector integer operation");
  return Lower256IntArith(Op, DAG);
}

static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) {
  assert(Op.getSimpleValueType().is256BitVector() &&
         Op.getSimpleValueType().isInteger() &&
         "Only handle AVX 256-bit vector integer operation");
  return Lower256IntArith(Op, DAG);
}
static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
                        SelectionDAG &DAG) {
  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();

  // Decompose 256-bit ops into smaller 128-bit ops.
  if (VT.is256BitVector() && !Subtarget->hasInt256())
    return Lower256IntArith(Op, DAG);

  SDValue A = Op.getOperand(0);
  SDValue B = Op.getOperand(1);

  // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
  if (VT == MVT::v4i32) {
    assert(Subtarget->hasSSE2() && !Subtarget->hasSSE41() &&
           "Should not custom lower when pmuldq is available!");

    // Extract the odd parts.
    static const int UnpackMask[] = { 1, -1, 3, -1 };
    SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
    SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);

    // Multiply the even parts.
    SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B);
    // Now multiply odd parts.
    SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds);

    Evens = DAG.getNode(ISD::BITCAST, dl, VT, Evens);
    Odds = DAG.getNode(ISD::BITCAST, dl, VT, Odds);

    // Merge the two vectors back together with a shuffle. This expands into 2
    // shuffles.
    static const int ShufMask[] = { 0, 4, 2, 6 };
    return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
  }

  assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
         "Only know how to lower V2I64/V4I64/V8I64 multiply");

  //  Ahi = psrlqi(a, 32);
  //  Bhi = psrlqi(b, 32);
  //
  //  AloBlo = pmuludq(a, b);
  //  AloBhi = pmuludq(a, Bhi);
  //  AhiBlo = pmuludq(Ahi, b);
  //
  //  AloBhi = psllqi(AloBhi, 32);
  //  AhiBlo = psllqi(AhiBlo, 32);
  //  return AloBlo + AloBhi + AhiBlo;

  SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
  SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);

  // Bit cast to 32-bit vectors for MULUDQ
  EVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 :
              (VT == MVT::v4i64) ? MVT::v8i32 : MVT::v16i32;
  A = DAG.getNode(ISD::BITCAST, dl, MulVT, A);
  B = DAG.getNode(ISD::BITCAST, dl, MulVT, B);
  Ahi = DAG.getNode(ISD::BITCAST, dl, MulVT, Ahi);
  Bhi = DAG.getNode(ISD::BITCAST, dl, MulVT, Bhi);

  SDValue AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
  SDValue AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
  SDValue AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);

  AloBhi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AloBhi, 32, DAG);
  AhiBlo = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AhiBlo, 32, DAG);

  SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi);
  return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
}
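// Illustrative sketch (hypothetical helper): the PMULUDQ decomposition above
// is the schoolbook 32x32->64 expansion. With a = Ahi*2^32 + Alo and
// b = Bhi*2^32 + Blo, the low 64 bits of a*b are
//   Alo*Blo + ((Alo*Bhi + Ahi*Blo) << 32)
// since the Ahi*Bhi term is shifted out entirely (arithmetic is mod 2^64).
LLVM_ATTRIBUTE_UNUSED static uint64_t hostMul64ViaPmuludq(uint64_t A,
                                                          uint64_t B) {
  uint64_t Alo = A & 0xffffffffu, Ahi = A >> 32;
  uint64_t Blo = B & 0xffffffffu, Bhi = B >> 32;
  uint64_t AloBlo = Alo * Blo;   // pmuludq(a, b)
  uint64_t AloBhi = Alo * Bhi;   // pmuludq(a, Bhi)
  uint64_t AhiBlo = Ahi * Blo;   // pmuludq(Ahi, b)
  return AloBlo + ((AloBhi + AhiBlo) << 32);
}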
SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
  assert(Subtarget->isTargetWin64() && "Unexpected target");
  EVT VT = Op.getValueType();
  assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
         "Unexpected return type for lowering");

  RTLIB::Libcall LC;
  bool isSigned;
  switch (Op->getOpcode()) {
  default: llvm_unreachable("Unexpected request for libcall!");
  case ISD::SDIV:    isSigned = true;  LC = RTLIB::SDIV_I128;    break;
  case ISD::UDIV:    isSigned = false; LC = RTLIB::UDIV_I128;    break;
  case ISD::SREM:    isSigned = true;  LC = RTLIB::SREM_I128;    break;
  case ISD::UREM:    isSigned = false; LC = RTLIB::UREM_I128;    break;
  case ISD::SDIVREM: isSigned = true;  LC = RTLIB::SDIVREM_I128; break;
  case ISD::UDIVREM: isSigned = false; LC = RTLIB::UDIVREM_I128; break;
  }

  SDLoc dl(Op);
  SDValue InChain = DAG.getEntryNode();

  TargetLowering::ArgListTy Args;
  TargetLowering::ArgListEntry Entry;
  for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
    EVT ArgVT = Op->getOperand(i).getValueType();
    assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
           "Unexpected argument type for lowering");
    SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
    Entry.Node = StackPtr;
    InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr,
                           MachinePointerInfo(), false, false, 16);
    Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
    Entry.Ty = PointerType::get(ArgTy,0);
    Entry.isSExt = false;
    Entry.isZExt = false;
    Args.push_back(Entry);
  }

  SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
                                         getPointerTy());

  TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl).setChain(InChain)
    .setCallee(getLibcallCallingConv(LC),
               static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()),
               Callee, std::move(Args), 0)
    .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);

  std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
  return DAG.getNode(ISD::BITCAST, dl, VT, CallInfo.first);
}
static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget,
                             SelectionDAG &DAG) {
  SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
  EVT VT = Op0.getValueType();
  SDLoc dl(Op);

  assert((VT == MVT::v4i32 && Subtarget->hasSSE2()) ||
         (VT == MVT::v8i32 && Subtarget->hasInt256()));

  // PMULxD operations multiply each even value (starting at 0) of LHS with
  // the related value of RHS and produce a widened result.
  // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
  // => <2 x i64> <ae|cg>
  //
  // In other words, to have all the results, we need to perform two PMULxD:
  // 1. one with the even values.
  // 2. one with the odd values.
  // To achieve #2, we need to place the odd values at an even position.
  //
  // Place the odd value at an even position (basically, shift all values 1
  // step to the left):
  const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1};
  // <a|b|c|d> => <b|undef|d|undef>
  SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0, Mask);
  // <e|f|g|h> => <f|undef|h|undef>
  SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1, Mask);

  // Emit two multiplies, one for the lower 2 ints and one for the higher 2
  // ints.
  MVT MulVT = VT == MVT::v4i32 ? MVT::v2i64 : MVT::v4i64;
  bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
  unsigned Opcode =
      (!IsSigned || !Subtarget->hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
  // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
  // => <2 x i64> <ae|cg>
  SDValue Mul1 = DAG.getNode(ISD::BITCAST, dl, VT,
                             DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
  // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
  // => <2 x i64> <bf|dh>
  SDValue Mul2 = DAG.getNode(ISD::BITCAST, dl, VT,
                             DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1));

  // Shuffle it back into the right order.
  SDValue Highs, Lows;
  if (VT == MVT::v8i32) {
    const int HighMask[] = {1, 9, 3, 11, 5, 13, 7, 15};
    Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
    const int LowMask[] = {0, 8, 2, 10, 4, 12, 6, 14};
    Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
  } else {
    const int HighMask[] = {1, 5, 3, 7};
    Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
    const int LowMask[] = {0, 4, 2, 6};
    Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
  }

  // If we have a signed multiply but no PMULDQ, fix up the high parts of an
  // unsigned multiply.
  if (IsSigned && !Subtarget->hasSSE41()) {
    SDValue ShAmt =
        DAG.getConstant(31, DAG.getTargetLoweringInfo().getShiftAmountTy(VT));
    SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
                             DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
    SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
                             DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0);

    SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
    Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
  }

  // The first result of MUL_LOHI is actually the low value, followed by the
  // high value.
  SDValue Ops[] = {Lows, Highs};
  return DAG.getMergeValues(Ops, dl);
}
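// Illustrative sketch (hypothetical helper): deriving the signed high half
// from the unsigned one. Reinterpreting a and b as signed changes the 64-bit
// product by -(a < 0 ? b : 0)*2^32 - (b < 0 ? a : 0)*2^32 relative to the
// unsigned product, which is exactly the ((a >> 31) & b) + ((b >> 31) & a)
// fixup subtracted from Highs above.
LLVM_ATTRIBUTE_UNUSED static int32_t hostSMulHi32(int32_t A, int32_t B) {
  uint32_t UA = (uint32_t)A, UB = (uint32_t)B;
  uint32_t HiU = (uint32_t)(((uint64_t)UA * UB) >> 32); // unsigned high half
  uint32_t Fixup = ((UA >> 31) ? UB : 0) + ((UB >> 31) ? UA : 0);
  return (int32_t)(HiU - Fixup);
}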
static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
                                         const X86Subtarget *Subtarget) {
  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);
  SDValue R = Op.getOperand(0);
  SDValue Amt = Op.getOperand(1);

  // Optimize shl/srl/sra with constant shift amount.
  if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
    if (auto *ShiftConst = BVAmt->getConstantSplatNode()) {
      uint64_t ShiftAmt = ShiftConst->getZExtValue();

      if (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 ||
          (Subtarget->hasInt256() &&
           (VT == MVT::v4i64 || VT == MVT::v8i32 || VT == MVT::v16i16)) ||
          (Subtarget->hasAVX512() &&
           (VT == MVT::v8i64 || VT == MVT::v16i32))) {
        if (Op.getOpcode() == ISD::SHL)
          return getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, R, ShiftAmt,
                                            DAG);
        if (Op.getOpcode() == ISD::SRL)
          return getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt,
                                            DAG);
        if (Op.getOpcode() == ISD::SRA && VT != MVT::v2i64 && VT != MVT::v4i64)
          return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, ShiftAmt,
                                            DAG);
      }

      if (VT == MVT::v16i8) {
        if (Op.getOpcode() == ISD::SHL) {
          // Make a large shift.
          SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl,
                                                   MVT::v8i16, R, ShiftAmt,
                                                   DAG);
          SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL);
          // Zero out the rightmost bits.
          SmallVector<SDValue, 16> V(16,
                                     DAG.getConstant(uint8_t(-1U << ShiftAmt),
                                                     MVT::i8));
          return DAG.getNode(ISD::AND, dl, VT, SHL,
                             DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
        }
        if (Op.getOpcode() == ISD::SRL) {
          // Make a large shift.
          SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl,
                                                   MVT::v8i16, R, ShiftAmt,
                                                   DAG);
          SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL);
          // Zero out the leftmost bits.
          SmallVector<SDValue, 16> V(16,
                                     DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
                                                     MVT::i8));
          return DAG.getNode(ISD::AND, dl, VT, SRL,
                             DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
        }
        if (Op.getOpcode() == ISD::SRA) {
          if (ShiftAmt == 7) {
            // R s>> 7  ===  R s< 0
            SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
            return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
          }

          // R s>> a === ((R u>> a) ^ m) - m
          SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
          SmallVector<SDValue, 16> V(16, DAG.getConstant(128 >> ShiftAmt,
                                                         MVT::i8));
          SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V);
          Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
          Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
          return Res;
        }
        llvm_unreachable("Unknown shift opcode.");
      }

      if (Subtarget->hasInt256() && VT == MVT::v32i8) {
        if (Op.getOpcode() == ISD::SHL) {
          // Make a large shift.
          SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl,
                                                   MVT::v16i16, R, ShiftAmt,
                                                   DAG);
          SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL);
          // Zero out the rightmost bits.
          SmallVector<SDValue, 32> V(32,
                                     DAG.getConstant(uint8_t(-1U << ShiftAmt),
                                                     MVT::i8));
          return DAG.getNode(ISD::AND, dl, VT, SHL,
                             DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
        }
        if (Op.getOpcode() == ISD::SRL) {
          // Make a large shift.
          SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl,
                                                   MVT::v16i16, R, ShiftAmt,
                                                   DAG);
          SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL);
          // Zero out the leftmost bits.
          SmallVector<SDValue, 32> V(32,
                                     DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
                                                     MVT::i8));
          return DAG.getNode(ISD::AND, dl, VT, SRL,
                             DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
        }
        if (Op.getOpcode() == ISD::SRA) {
          if (ShiftAmt == 7) {
            // R s>> 7  ===  R s< 0
            SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
            return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
          }

          // R s>> a === ((R u>> a) ^ m) - m
          SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
          SmallVector<SDValue, 32> V(32, DAG.getConstant(128 >> ShiftAmt,
                                                         MVT::i8));
          SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V);
          Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
          Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
          return Res;
        }
        llvm_unreachable("Unknown shift opcode.");
      }
    }
  }

  // Special case in 32-bit mode, where i64 is expanded into high and low parts.
  if (!Subtarget->is64Bit() &&
      (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64)) &&
      Amt.getOpcode() == ISD::BITCAST &&
      Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
    Amt = Amt.getOperand(0);
    unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
                     VT.getVectorNumElements();
    unsigned RatioInLog2 = Log2_32_Ceil(Ratio);
    uint64_t ShiftAmt = 0;
    for (unsigned i = 0; i != Ratio; ++i) {
      ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i));
      if (!C)
        return SDValue();
      // 6 == Log2(64)
      ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2)));
    }
    // Check remaining shift amounts.
    for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
      uint64_t ShAmt = 0;
      for (unsigned j = 0; j != Ratio; ++j) {
        ConstantSDNode *C =
          dyn_cast<ConstantSDNode>(Amt.getOperand(i + j));
        if (!C)
          return SDValue();
        // 6 == Log2(64)
        ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2)));
      }
      if (ShAmt != ShiftAmt)
        return SDValue();
    }
    switch (Op.getOpcode()) {
    default:
      llvm_unreachable("Unknown shift opcode!");
    case ISD::SHL:
      return getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, R, ShiftAmt,
                                        DAG);
    case ISD::SRL:
      return getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt,
                                        DAG);
    case ISD::SRA:
      return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, ShiftAmt,
                                        DAG);
    }
  }

  return SDValue();
}
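// Illustrative sketch (hypothetical helper): the "((R u>> a) ^ m) - m" trick
// used above for vXi8 arithmetic shifts, on a plain byte. m = 128 >> a is
// where the shifted sign bit lands; XOR-then-subtract sign-extends it.
LLVM_ATTRIBUTE_UNUSED static int8_t hostSra8ViaLogicalShift(int8_t X,
                                                            unsigned A) {
  uint8_t M = (uint8_t)(128 >> A);     // Shifted sign-bit position.
  uint8_t Logical = (uint8_t)X >> A;   // u>> fills with zeros up top.
  return (int8_t)((Logical ^ M) - M);  // Flip + borrow == sign extension.
}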
static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
                                        const X86Subtarget* Subtarget) {
  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);
  SDValue R = Op.getOperand(0);
  SDValue Amt = Op.getOperand(1);

  if ((VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) ||
      VT == MVT::v4i32 || VT == MVT::v8i16 ||
      (Subtarget->hasInt256() &&
       ((VT == MVT::v4i64 && Op.getOpcode() != ISD::SRA) ||
        VT == MVT::v8i32 || VT == MVT::v16i16)) ||
      (Subtarget->hasAVX512() && (VT == MVT::v8i64 || VT == MVT::v16i32))) {
    SDValue BaseShAmt;
    EVT EltVT = VT.getVectorElementType();

    if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Amt)) {
      // Check if this build_vector node is doing a splat.
      // If so, then set BaseShAmt equal to the splat value.
      BaseShAmt = BV->getSplatValue();
      if (BaseShAmt && BaseShAmt.getOpcode() == ISD::UNDEF)
        BaseShAmt = SDValue();
    } else {
      if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR)
        Amt = Amt.getOperand(0);

      ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt);
      if (SVN && SVN->isSplat()) {
        unsigned SplatIdx = (unsigned)SVN->getSplatIndex();
        SDValue InVec = Amt.getOperand(0);
        if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
          assert((SplatIdx < InVec.getValueType().getVectorNumElements()) &&
                 "Unexpected shuffle index found!");
          BaseShAmt = InVec.getOperand(SplatIdx);
        } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
          if (ConstantSDNode *C =
                  dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
            if (C->getZExtValue() == SplatIdx)
              BaseShAmt = InVec.getOperand(1);
          }
        }

        if (!BaseShAmt)
          // Avoid introducing an extract element from a shuffle.
          BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InVec,
                                  DAG.getIntPtrConstant(SplatIdx));
      }
    }

    if (BaseShAmt.getNode()) {
      assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
      if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
        BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
      else if (EltVT.bitsLT(MVT::i32))
        BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);

      switch (Op.getOpcode()) {
      default:
        llvm_unreachable("Unknown shift opcode!");
      case ISD::SHL:
        switch (VT.SimpleTy) {
        default: return SDValue();
        case MVT::v2i64:
        case MVT::v4i32:
        case MVT::v8i16:
        case MVT::v4i64:
        case MVT::v8i32:
        case MVT::v16i16:
        case MVT::v16i32:
        case MVT::v8i64:
          return getTargetVShiftNode(X86ISD::VSHLI, dl, VT, R, BaseShAmt, DAG);
        }
      case ISD::SRA:
        switch (VT.SimpleTy) {
        default: return SDValue();
        case MVT::v4i32:
        case MVT::v8i16:
        case MVT::v8i32:
        case MVT::v16i16:
        case MVT::v16i32:
        case MVT::v8i64:
          return getTargetVShiftNode(X86ISD::VSRAI, dl, VT, R, BaseShAmt, DAG);
        }
      case ISD::SRL:
        switch (VT.SimpleTy) {
        default: return SDValue();
        case MVT::v2i64:
        case MVT::v4i32:
        case MVT::v8i16:
        case MVT::v4i64:
        case MVT::v8i32:
        case MVT::v16i16:
        case MVT::v16i32:
        case MVT::v8i64:
          return getTargetVShiftNode(X86ISD::VSRLI, dl, VT, R, BaseShAmt, DAG);
        }
      }
    }
  }

  // Special case in 32-bit mode, where i64 is expanded into high and low parts.
  if (!Subtarget->is64Bit() &&
      (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64) ||
       (Subtarget->hasAVX512() && VT == MVT::v8i64)) &&
      Amt.getOpcode() == ISD::BITCAST &&
      Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
    Amt = Amt.getOperand(0);
    unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
                     VT.getVectorNumElements();
    std::vector<SDValue> Vals(Ratio);
    for (unsigned i = 0; i != Ratio; ++i)
      Vals[i] = Amt.getOperand(i);
    for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
      for (unsigned j = 0; j != Ratio; ++j)
        if (Vals[j] != Amt.getOperand(i + j))
          return SDValue();
    }

    switch (Op.getOpcode()) {
    default:
      llvm_unreachable("Unknown shift opcode!");
    case ISD::SHL:
      return DAG.getNode(X86ISD::VSHL, dl, VT, R, Op.getOperand(1));
    case ISD::SRL:
      return DAG.getNode(X86ISD::VSRL, dl, VT, R, Op.getOperand(1));
    case ISD::SRA:
      return DAG.getNode(X86ISD::VSRA, dl, VT, R, Op.getOperand(1));
    }
  }

  return SDValue();
}
static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
                          SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);
  SDValue R = Op.getOperand(0);
  SDValue Amt = Op.getOperand(1);
  SDValue V;

  assert(VT.isVector() && "Custom lowering only for vector shifts!");
  assert(Subtarget->hasSSE2() && "Only custom lower when we have SSE2!");

  V = LowerScalarImmediateShift(Op, DAG, Subtarget);
  if (V.getNode())
    return V;

  V = LowerScalarVariableShift(Op, DAG, Subtarget);
  if (V.getNode())
    return V;

  if (Subtarget->hasAVX512() && (VT == MVT::v16i32 || VT == MVT::v8i64))
    return Op;
  // AVX2 has VPSLLV/VPSRAV/VPSRLV.
  if (Subtarget->hasInt256()) {
    if (Op.getOpcode() == ISD::SRL &&
        (VT == MVT::v2i64 || VT == MVT::v4i32 ||
         VT == MVT::v4i64 || VT == MVT::v8i32))
      return Op;
    if (Op.getOpcode() == ISD::SHL &&
        (VT == MVT::v2i64 || VT == MVT::v4i32 ||
         VT == MVT::v4i64 || VT == MVT::v8i32))
      return Op;
    if (Op.getOpcode() == ISD::SRA && (VT == MVT::v4i32 || VT == MVT::v8i32))
      return Op;
  }

  // If possible, lower this packed shift into a vector multiply instead of
  // expanding it into a sequence of scalar shifts.
  // Do this only if the vector shift count is a constant build_vector.
  if (Op.getOpcode() == ISD::SHL &&
      (VT == MVT::v8i16 || VT == MVT::v4i32 ||
       (Subtarget->hasInt256() && VT == MVT::v16i16)) &&
      ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
    SmallVector<SDValue, 8> Elts;
    EVT SVT = VT.getScalarType();
    unsigned SVTBits = SVT.getSizeInBits();
    const APInt &One = APInt(SVTBits, 1);
    unsigned NumElems = VT.getVectorNumElements();

    for (unsigned i=0; i !=NumElems; ++i) {
      SDValue Op = Amt->getOperand(i);
      if (Op->getOpcode() == ISD::UNDEF) {
        Elts.push_back(Op);
        continue;
      }

      ConstantSDNode *ND = cast<ConstantSDNode>(Op);
      const APInt &C = APInt(SVTBits, ND->getAPIntValue().getZExtValue());
      uint64_t ShAmt = C.getZExtValue();
      if (ShAmt >= SVTBits) {
        Elts.push_back(DAG.getUNDEF(SVT));
        continue;
      }
      Elts.push_back(DAG.getConstant(One.shl(ShAmt), SVT));
    }
    SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Elts);
    return DAG.getNode(ISD::MUL, dl, VT, R, BV);
  }

  // Lower SHL with variable shift amount.
  if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
    Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, VT));

    Op = DAG.getNode(ISD::ADD, dl, VT, Op, DAG.getConstant(0x3f800000U, VT));
    Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, Op);
    Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
    return DAG.getNode(ISD::MUL, dl, VT, Op, R);
  }
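  // Worked example of the trick above (illustrative): for a lane shift amount
  // s in [0, 31], (s << 23) + 0x3f800000 is the IEEE-754 encoding of the
  // float 2^s (biased exponent 127 + s, zero mantissa), e.g. s = 4 gives
  // 0x41800000 == 16.0f. FP_TO_SINT recovers the exact integer 2^s, so the
  // final MUL computes R << s lane by lane.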
  // If possible, lower this shift as a sequence of two shifts by
  // constant plus a MOVSS/MOVSD instead of scalarizing it.
  // Example:
  //   (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
  //
  // Could be rewritten as:
  //   (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
  //
  // The advantage is that the two shifts from the example would be
  // lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing
  // the vector shift into four scalar shifts plus four pairs of vector
  // insert/extract.
  if ((VT == MVT::v8i16 || VT == MVT::v4i32) &&
      ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
    unsigned TargetOpcode = X86ISD::MOVSS;
    bool CanBeSimplified;
    // The splat value for the first packed shift (the 'X' from the example).
    SDValue Amt1 = Amt->getOperand(0);
    // The splat value for the second packed shift (the 'Y' from the example).
    SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) :
                                        Amt->getOperand(2);

    // See if it is possible to replace this node with a sequence of
    // two shifts followed by a MOVSS/MOVSD
    if (VT == MVT::v4i32) {
      // Check if it is legal to use a MOVSS.
      CanBeSimplified = Amt2 == Amt->getOperand(2) &&
                        Amt2 == Amt->getOperand(3);
      if (!CanBeSimplified) {
        // Otherwise, check if we can still simplify this node using a MOVSD.
        CanBeSimplified = Amt1 == Amt->getOperand(1) &&
                          Amt->getOperand(2) == Amt->getOperand(3);
        TargetOpcode = X86ISD::MOVSD;
        Amt2 = Amt->getOperand(2);
      }
    } else {
      // Do similar checks for the case where the machine value type
      // is MVT::v8i16.
      CanBeSimplified = Amt1 == Amt->getOperand(1);
      for (unsigned i=3; i != 8 && CanBeSimplified; ++i)
        CanBeSimplified = Amt2 == Amt->getOperand(i);

      if (!CanBeSimplified) {
        TargetOpcode = X86ISD::MOVSD;
        CanBeSimplified = true;
        Amt2 = Amt->getOperand(4);
        for (unsigned i=0; i != 4 && CanBeSimplified; ++i)
          CanBeSimplified = Amt1 == Amt->getOperand(i);
        for (unsigned j=4; j != 8 && CanBeSimplified; ++j)
          CanBeSimplified = Amt2 == Amt->getOperand(j);
      }
    }

    if (CanBeSimplified && isa<ConstantSDNode>(Amt1) &&
        isa<ConstantSDNode>(Amt2)) {
      // Replace this node with two shifts followed by a MOVSS/MOVSD.
      EVT CastVT = MVT::v4i32;
      SDValue Splat1 =
        DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), VT);
      SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
      SDValue Splat2 =
        DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), VT);
      SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
      if (TargetOpcode == X86ISD::MOVSD)
        CastVT = MVT::v2i64;
      SDValue BitCast1 = DAG.getNode(ISD::BITCAST, dl, CastVT, Shift1);
      SDValue BitCast2 = DAG.getNode(ISD::BITCAST, dl, CastVT, Shift2);
      SDValue Result = getTargetShuffleNode(TargetOpcode, dl, CastVT, BitCast2,
                                            BitCast1, DAG);
      return DAG.getNode(ISD::BITCAST, dl, VT, Result);
    }
  }

  if (VT == MVT::v16i8 && Op->getOpcode() == ISD::SHL) {
    assert(Subtarget->hasSSE2() && "Need SSE2 for pslli/pcmpeq.");

    // a = a << 5;
    Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(5, VT));
    Op = DAG.getNode(ISD::BITCAST, dl, VT, Op);

    // Turn 'a' into a mask suitable for VSELECT
    SDValue VSelM = DAG.getConstant(0x80, VT);
    SDValue OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
    OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);

    SDValue CM1 = DAG.getConstant(0x0f, VT);
    SDValue CM2 = DAG.getConstant(0x3f, VT);

    // r = VSELECT(r, psllw(r & (char16)15, 4), a);
    SDValue M = DAG.getNode(ISD::AND, dl, VT, R, CM1);
    M = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, MVT::v8i16, M, 4, DAG);
    M = DAG.getNode(ISD::BITCAST, dl, VT, M);
    R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R);

    // a += a
    Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);
    OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
    OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);

    // r = VSELECT(r, psllw(r & (char16)63, 2), a);
    M = DAG.getNode(ISD::AND, dl, VT, R, CM2);
    M = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, MVT::v8i16, M, 2, DAG);
    M = DAG.getNode(ISD::BITCAST, dl, VT, M);
    R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R);

    // a += a
    Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);
    OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
    OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);

    // return VSELECT(r, r+r, a);
    R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel,
                    DAG.getNode(ISD::ADD, dl, VT, R, R), R);
    return R;
  }

  // It's worth extending once and using the v8i32 shifts for 16-bit types, but
  // the extra overheads to get from v16i8 to v8i32 make the existing SSE
  // solution better.
  if (Subtarget->hasInt256() && VT == MVT::v8i16) {
    MVT NewVT = VT == MVT::v8i16 ? MVT::v8i32 : MVT::v16i16;
    unsigned ExtOpc =
        Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
    R = DAG.getNode(ExtOpc, dl, NewVT, R);
    Amt = DAG.getNode(ISD::ANY_EXTEND, dl, NewVT, Amt);
    return DAG.getNode(ISD::TRUNCATE, dl, VT,
                       DAG.getNode(Op.getOpcode(), dl, NewVT, R, Amt));
  }

  // Decompose 256-bit shifts into smaller 128-bit shifts.
  if (VT.is256BitVector()) {
    unsigned NumElems = VT.getVectorNumElements();
    MVT EltVT = VT.getVectorElementType();
    EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);

    // Extract the two vectors
    SDValue V1 = Extract128BitVector(R, 0, DAG, dl);
    SDValue V2 = Extract128BitVector(R, NumElems/2, DAG, dl);

    // Recreate the shift amount vectors
    SDValue Amt1, Amt2;
    if (Amt.getOpcode() == ISD::BUILD_VECTOR) {
      // Constant shift amount
      SmallVector<SDValue, 4> Amt1Csts;
      SmallVector<SDValue, 4> Amt2Csts;
      for (unsigned i = 0; i != NumElems/2; ++i)
        Amt1Csts.push_back(Amt->getOperand(i));
      for (unsigned i = NumElems/2; i != NumElems; ++i)
        Amt2Csts.push_back(Amt->getOperand(i));

      Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt1Csts);
      Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt2Csts);
    } else {
      // Variable shift amount
      Amt1 = Extract128BitVector(Amt, 0, DAG, dl);
      Amt2 = Extract128BitVector(Amt, NumElems/2, DAG, dl);
    }

    // Issue new vector shifts for the smaller types
    V1 = DAG.getNode(Op.getOpcode(), dl, NewVT, V1, Amt1);
    V2 = DAG.getNode(Op.getOpcode(), dl, NewVT, V2, Amt2);

    // Concatenate the result back
    return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, V1, V2);
  }

  return SDValue();
}
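// Illustrative sketch (hypothetical helper): the v16i8 SHL ladder above,
// expressed per byte. Each round conditionally applies a power-of-two shift
// keyed off one bit of the pre-shifted amount, mirroring the three VSELECT
// rounds (shift by 4, by 2, then by 1 via r + r).
LLVM_ATTRIBUTE_UNUSED static uint8_t hostShl8ViaSelectLadder(uint8_t R,
                                                             uint8_t Amt) {
  uint8_t A = (uint8_t)(Amt << 5);    // a = a << 5: amount bit 2 -> sign bit.
  if (A & 0x80) R = (uint8_t)(R << 4);  // Round 1: psllw(r & 15, 4).
  A = (uint8_t)(A + A);                 // a += a.
  if (A & 0x80) R = (uint8_t)(R << 2);  // Round 2: psllw(r & 63, 2).
  A = (uint8_t)(A + A);                 // a += a.
  if (A & 0x80) R = (uint8_t)(R + R);   // Round 3: r + r.
  return R;
}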
static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
  // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
  // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
  // looks for this combo and may remove the "setcc" instruction if the "setcc"
  // has only one use.
  SDNode *N = Op.getNode();
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  unsigned BaseOp = 0;
  unsigned Cond = 0;
  SDLoc DL(Op);
  switch (Op.getOpcode()) {
  default: llvm_unreachable("Unknown ovf instruction!");
  case ISD::SADDO:
    // An add of one will be selected as an INC. Note that INC doesn't
    // set CF, so we can't do this for UADDO.
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
      if (C->isOne()) {
        BaseOp = X86ISD::INC;
        Cond = X86::COND_O;
        break;
      }
    BaseOp = X86ISD::ADD;
    Cond = X86::COND_O;
    break;
  case ISD::UADDO:
    BaseOp = X86ISD::ADD;
    Cond = X86::COND_B;
    break;
  case ISD::SSUBO:
    // A subtract of one will be selected as a DEC. Note that DEC doesn't
    // set CF, so we can't do this for USUBO.
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
      if (C->isOne()) {
        BaseOp = X86ISD::DEC;
        Cond = X86::COND_O;
        break;
      }
    BaseOp = X86ISD::SUB;
    Cond = X86::COND_O;
    break;
  case ISD::USUBO:
    BaseOp = X86ISD::SUB;
    Cond = X86::COND_B;
    break;
  case ISD::SMULO:
    BaseOp = N->getValueType(0) == MVT::i8 ? X86ISD::SMUL8 : X86ISD::SMUL;
    Cond = X86::COND_O;
    break;
  case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
    if (N->getValueType(0) == MVT::i8) {
      BaseOp = X86ISD::UMUL8;
      Cond = X86::COND_O;
      break;
    }
    SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0),
                                 MVT::i32);
    SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS);

    SDValue SetCC =
      DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
                  DAG.getConstant(X86::COND_O, MVT::i32),
                  SDValue(Sum.getNode(), 2));

    return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
  }
  }

  // Also sets EFLAGS.
  SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
  SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);

  SDValue SetCC =
    DAG.getNode(X86ISD::SETCC, DL, N->getValueType(1),
                DAG.getConstant(Cond, MVT::i32),
                SDValue(Sum.getNode(), 1));

  return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
}
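// Illustrative sketch (hypothetical helper): what the EFLAGS-based lowering
// computes for UADDO. On x86 the unsigned carry out of ADD is CF
// (X86::COND_B above); in portable terms, the add overflowed iff the result
// wrapped below either operand.
LLVM_ATTRIBUTE_UNUSED static bool hostUAddOverflow32(uint32_t A, uint32_t B,
                                                     uint32_t &Sum) {
  Sum = A + B;      // The ADD itself; CF is modeled by the comparison below.
  return Sum < A;   // setb: carry out of the top bit.
}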
// Sign extension of the low part of vector elements. This may be used either
// when sign extend instructions are not available or if the vector element
// sizes already match the sign-extended size. If the vector elements are in
// their pre-extended size and sign extend instructions are available, that will
// be handled by LowerSIGN_EXTEND.
SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
                                                  SelectionDAG &DAG) const {
  SDLoc dl(Op);
  EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
  MVT VT = Op.getSimpleValueType();

  if (!Subtarget->hasSSE2() || !VT.isVector())
    return SDValue();

  unsigned BitsDiff = VT.getScalarType().getSizeInBits() -
                      ExtraVT.getScalarType().getSizeInBits();

  switch (VT.SimpleTy) {
  default: return SDValue();
  case MVT::v8i32:
  case MVT::v16i16:
    if (!Subtarget->hasFp256())
      return SDValue();
    if (!Subtarget->hasInt256()) {
      // needs to be split
      unsigned NumElems = VT.getVectorNumElements();

      // Extract the LHS vectors
      SDValue LHS = Op.getOperand(0);
      SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
      SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);

      MVT EltVT = VT.getVectorElementType();
      EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);

      EVT ExtraEltVT = ExtraVT.getVectorElementType();
      unsigned ExtraNumElems = ExtraVT.getVectorNumElements();
      ExtraVT = EVT::getVectorVT(*DAG.getContext(), ExtraEltVT,
                                 ExtraNumElems/2);
      SDValue Extra = DAG.getValueType(ExtraVT);

      LHS1 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, Extra);
      LHS2 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, Extra);

      return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LHS1, LHS2);
    }
    // fall through
  case MVT::v4i32:
  case MVT::v8i16: {
    SDValue Op0 = Op.getOperand(0);

    // This is a sign extension of some low part of vector elements without
    // changing the size of the vector elements themselves:
    // Shift-Left + Shift-Right-Algebraic.
    SDValue Shl = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Op0,
                                             BitsDiff, DAG);
    return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Shl, BitsDiff,
                                      DAG);
  }
  }
}
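// Illustrative sketch (hypothetical helper): sign_extend_inreg of the low
// Src bits of a Dst-bit lane as Shift-Left + Shift-Right-Algebraic. For
// extending the low 8 bits of an i32 lane, BitsDiff = 32 - 8 = 24.
LLVM_ATTRIBUTE_UNUSED static int32_t hostSextInReg8To32(int32_t X) {
  const unsigned BitsDiff = 32 - 8;
  return (int32_t)((uint32_t)X << BitsDiff) >> BitsDiff;
}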
/// Returns true if the operand type is exactly twice the native width, and
/// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
/// Used to know whether to use cmpxchg8/16b when expanding atomic operations
/// (otherwise we leave them alone to become __sync_fetch_and_... calls).
bool X86TargetLowering::needsCmpXchgNb(const Type *MemType) const {
  unsigned OpWidth = MemType->getPrimitiveSizeInBits();

  if (OpWidth == 64)
    return !Subtarget->is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b
  else if (OpWidth == 128)
    return Subtarget->hasCmpxchg16b();
  else
    return false;
}

bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
  return needsCmpXchgNb(SI->getValueOperand()->getType());
}

// Note: this turns large loads into lock cmpxchg8b/16b.
// FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b.
bool X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
  auto PTy = cast<PointerType>(LI->getPointerOperand()->getType());
  return needsCmpXchgNb(PTy->getElementType());
}

bool X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
  unsigned NativeWidth = Subtarget->is64Bit() ? 64 : 32;
  const Type *MemType = AI->getType();

  // If the operand is too big, we must see if cmpxchg8/16b is available
  // and default to library calls otherwise.
  if (MemType->getPrimitiveSizeInBits() > NativeWidth)
    return needsCmpXchgNb(MemType);

  AtomicRMWInst::BinOp Op = AI->getOperation();
  switch (Op) {
  default:
    llvm_unreachable("Unknown atomic operation");
  case AtomicRMWInst::Xchg:
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
    // It's better to use xadd, xsub or xchg for these in all cases.
    return false;
  case AtomicRMWInst::Or:
  case AtomicRMWInst::And:
  case AtomicRMWInst::Xor:
    // If the atomicrmw's result isn't actually used, we can just add a "lock"
    // prefix to a normal instruction for these operations.
    return !AI->use_empty();
  case AtomicRMWInst::Nand:
  case AtomicRMWInst::Max:
  case AtomicRMWInst::Min:
  case AtomicRMWInst::UMax:
  case AtomicRMWInst::UMin:
    // These always require a non-trivial set of data operations on x86. We must
    // use a cmpxchg loop.
    return true;
  }
}

static bool hasMFENCE(const X86Subtarget& Subtarget) {
  // Use mfence if we have SSE2 or we're on x86-64 (even if we asked for
  // no-sse2). There isn't any reason to disable it if the target processor
  // supports it.
  return Subtarget.hasSSE2() || Subtarget.is64Bit();
}
LoadInst *
X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
  unsigned NativeWidth = Subtarget->is64Bit() ? 64 : 32;
  const Type *MemType = AI->getType();
  // Accesses larger than the native width are turned into cmpxchg/libcalls, so
  // there is no benefit in turning such RMWs into loads, and it is actually
  // harmful as it introduces a mfence.
  if (MemType->getPrimitiveSizeInBits() > NativeWidth)
    return nullptr;

  auto Builder = IRBuilder<>(AI);
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  auto SynchScope = AI->getSynchScope();
  // We must restrict the ordering to avoid generating loads with Release or
  // ReleaseAcquire orderings.
  auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
  auto Ptr = AI->getPointerOperand();

  // Before the load we need a fence. Here is an example lifted from
  // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
  // is required:
  // Thread 0:
  //   x.store(1, relaxed);
  //   r1 = y.fetch_add(0, release);
  // Thread 1:
  //   y.fetch_add(42, acquire);
  //   r2 = x.load(relaxed);
  // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
  // lowered to just a load without a fence. A mfence flushes the store buffer,
  // making the optimization clearly correct.
  // FIXME: it is required if isAtLeastRelease(Order) but it is not clear
  // otherwise, we might be able to be more aggressive on relaxed idempotent
  // rmw. In practice, they do not look useful, so we don't try to be
  // especially clever.
  if (SynchScope == SingleThread) {
    // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
    // the IR level, so we must wrap it in an intrinsic.
    return nullptr;
  } else if (hasMFENCE(*Subtarget)) {
    Function *MFence = llvm::Intrinsic::getDeclaration(M,
            Intrinsic::x86_sse2_mfence);
    Builder.CreateCall(MFence);
  } else {
    // FIXME: it might make sense to use a locked operation here but on a
    // different cache-line to prevent cache-line bouncing. In practice it
    // is probably a small win, and x86 processors without mfence are rare
    // enough that we do not bother.
    return nullptr;
  }

  // Finally we can emit the atomic load.
  LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr,
          AI->getType()->getPrimitiveSizeInBits());
  Loaded->setAtomic(Order, SynchScope);
  AI->replaceAllUsesWith(Loaded);
  AI->eraseFromParent();
  return Loaded;
}
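// For example (illustrative, approximate IR syntax of this era), on an SSE2
// target an idempotent RMW such as
//   %old = atomicrmw add i32* %p, i32 0 seq_cst
// becomes roughly
//   call void @llvm.x86.sse2.mfence()
//   %old = load atomic i32* %p seq_cst, align 4
// since a fetch-add of zero only needs the fence plus the load itself.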
static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget,
                                 SelectionDAG &DAG) {
  SDLoc dl(Op);
  AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
    cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
  SynchronizationScope FenceScope = static_cast<SynchronizationScope>(
    cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());

  // The only fence that needs an instruction is a sequentially-consistent
  // cross-thread fence.
  if (FenceOrdering == SequentiallyConsistent && FenceScope == CrossThread) {
    if (hasMFENCE(*Subtarget))
      return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));

    SDValue Chain = Op.getOperand(0);
    SDValue Zero = DAG.getConstant(0, MVT::i32);
    SDValue Ops[] = {
      DAG.getRegister(X86::ESP, MVT::i32),     // Base
      DAG.getTargetConstant(1, MVT::i8),       // Scale
      DAG.getRegister(0, MVT::i32),            // Index
      DAG.getTargetConstant(0, MVT::i32),      // Disp
      DAG.getRegister(0, MVT::i32),            // Segment.
      Zero,
      Chain
    };
    SDNode *Res = DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops);
    return SDValue(Res, 0);
  }

  // MEMBARRIER is a compiler barrier; it codegens to a no-op.
  return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
}
static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget,
                             SelectionDAG &DAG) {
  MVT T = Op.getSimpleValueType();
  SDLoc DL(Op);
  unsigned Reg = 0;
  unsigned size = 0;
  switch(T.SimpleTy) {
  default: llvm_unreachable("Invalid value type!");
  case MVT::i8:  Reg = X86::AL;  size = 1; break;
  case MVT::i16: Reg = X86::AX;  size = 2; break;
  case MVT::i32: Reg = X86::EAX; size = 4; break;
  case MVT::i64:
    assert(Subtarget->is64Bit() && "Node not type legal!");
    Reg = X86::RAX; size = 8;
    break;
  }
  SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
                                  Op.getOperand(2), SDValue());
  SDValue Ops[] = { cpIn.getValue(0),
                    Op.getOperand(1),
                    Op.getOperand(3),
                    DAG.getTargetConstant(size, MVT::i8),
                    cpIn.getValue(1) };
  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
  MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
  SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
                                           Ops, T, MMO);

  SDValue cpOut =
    DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
  SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
                                      MVT::i32, cpOut.getValue(2));
  SDValue Success = DAG.getNode(X86ISD::SETCC, DL, Op->getValueType(1),
                                DAG.getConstant(X86::COND_E, MVT::i8), EFLAGS);

  DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut);
  DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
  DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1));
  return SDValue();
}
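// Illustrative sketch (hypothetical helper): the register protocol LCMPXCHG
// implements. The expected value goes in (r/e)ax, CMPXCHG writes the old
// memory value back to (r/e)ax, and ZF (X86::COND_E above) reports success.
LLVM_ATTRIBUTE_UNUSED static bool hostCmpxchg32(uint32_t &Mem,
                                                uint32_t &Eax, // in: expected
                                                uint32_t New) {
  uint32_t Old = Mem;
  bool ZF = (Old == Eax);
  if (ZF)
    Mem = New;   // Success: store the new value.
  Eax = Old;     // cpOut: the prior memory value, success or not.
  return ZF;
}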
static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget,
                            SelectionDAG &DAG) {
  MVT SrcVT = Op.getOperand(0).getSimpleValueType();
  MVT DstVT = Op.getSimpleValueType();

  if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) {
    assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
    if (DstVT != MVT::f64)
      // This conversion needs to be expanded.
      return SDValue();

    SDValue InVec = Op->getOperand(0);
    SDLoc dl(Op);
    unsigned NumElts = SrcVT.getVectorNumElements();
    EVT SVT = SrcVT.getVectorElementType();

    // Widen the vector in input in the case of MVT::v2i32.
    // Example: from MVT::v2i32 to MVT::v4i32.
    SmallVector<SDValue, 16> Elts;
    for (unsigned i = 0, e = NumElts; i != e; ++i)
      Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, InVec,
                                 DAG.getIntPtrConstant(i)));

    // Explicitly mark the extra elements as Undef.
    Elts.append(NumElts, DAG.getUNDEF(SVT));

    EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
    SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Elts);
    SDValue ToV2F64 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, BV);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64,
                       DAG.getIntPtrConstant(0));
  }

  assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() &&
         Subtarget->hasMMX() && "Unexpected custom BITCAST");
  assert((DstVT == MVT::i64 ||
          (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
         "Unexpected custom BITCAST");
  // i64 <=> MMX conversions are Legal.
  if (SrcVT==MVT::i64 && DstVT.isVector())
    return Op;
  if (DstVT==MVT::i64 && SrcVT.isVector())
    return Op;
  // MMX <=> MMX conversions are Legal.
  if (SrcVT.isVector() && DstVT.isVector())
    return Op;
  // All other conversions need to be expanded.
  return SDValue();
}
static SDValue LowerCTPOP(SDValue Op, const X86Subtarget *Subtarget,
                          SelectionDAG &DAG) {
  SDNode *Node = Op.getNode();
  SDLoc dl(Node);
  Op = Op.getOperand(0);
  EVT VT = Op.getValueType();
  assert((VT.is128BitVector() || VT.is256BitVector()) &&
         "CTPOP lowering only implemented for 128/256-bit wide vector types");

  unsigned NumElts = VT.getVectorNumElements();
  EVT EltVT = VT.getVectorElementType();
  unsigned Len = EltVT.getSizeInBits();

  // This is the vectorized version of the "best" algorithm from
  // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
  // with a minor tweak to use a series of adds + shifts instead of vector
  // multiplications. Implemented for the v2i64, v4i64, v4i32, v8i32 types:
  //
  // v2i64, v4i64, v4i32 => Only profitable w/ popcnt disabled
  // v8i32 => Always profitable
  //
  // FIXME: There are a couple of possible improvements:
  //
  // 1) Support for i8 and i16 vectors (needs measurements if popcnt enabled).
  // 2) Use strategies from http://wm.ite.pl/articles/sse-popcount.html
  //
  assert(EltVT.isInteger() && (Len == 32 || Len == 64) && Len % 8 == 0 &&
         "CTPOP not implemented for this vector element type.");

  // X86 canonicalize ANDs to vXi64, generate the appropriate bitcasts to avoid
  // extra legalization.
  bool NeedsBitcast = EltVT == MVT::i32;
  MVT BitcastVT = VT.is256BitVector() ? MVT::v4i64 : MVT::v2i64;

  SDValue Cst55 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x55)), EltVT);
  SDValue Cst33 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x33)), EltVT);
  SDValue Cst0F = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x0F)), EltVT);

  // v = v - ((v >> 1) & 0x55555555...)
  SmallVector<SDValue, 8> Ones(NumElts, DAG.getConstant(1, EltVT));
  SDValue OnesV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ones);
  SDValue Srl = DAG.getNode(ISD::SRL, dl, VT, Op, OnesV);
  if (NeedsBitcast)
    Srl = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Srl);

  SmallVector<SDValue, 8> Mask55(NumElts, Cst55);
  SDValue M55 = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask55);
  if (NeedsBitcast)
    M55 = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M55);

  SDValue And = DAG.getNode(ISD::AND, dl, Srl.getValueType(), Srl, M55);
  if (VT != And.getValueType())
    And = DAG.getNode(ISD::BITCAST, dl, VT, And);
  SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, Op, And);

  // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
  SmallVector<SDValue, 8> Mask33(NumElts, Cst33);
  SDValue M33 = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask33);
  SmallVector<SDValue, 8> Twos(NumElts, DAG.getConstant(2, EltVT));
  SDValue TwosV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Twos);

  Srl = DAG.getNode(ISD::SRL, dl, VT, Sub, TwosV);
  if (NeedsBitcast) {
    Srl = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Srl);
    M33 = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M33);
    Sub = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Sub);
  }

  SDValue AndRHS = DAG.getNode(ISD::AND, dl, M33.getValueType(), Srl, M33);
  SDValue AndLHS = DAG.getNode(ISD::AND, dl, M33.getValueType(), Sub, M33);
  if (VT != AndRHS.getValueType()) {
    AndRHS = DAG.getNode(ISD::BITCAST, dl, VT, AndRHS);
    AndLHS = DAG.getNode(ISD::BITCAST, dl, VT, AndLHS);
  }
  SDValue Add = DAG.getNode(ISD::ADD, dl, VT, AndLHS, AndRHS);

  // v = (v + (v >> 4)) & 0x0F0F0F0F...
  SmallVector<SDValue, 8> Fours(NumElts, DAG.getConstant(4, EltVT));
  SDValue FoursV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Fours);
  Srl = DAG.getNode(ISD::SRL, dl, VT, Add, FoursV);
  Add = DAG.getNode(ISD::ADD, dl, VT, Add, Srl);

  SmallVector<SDValue, 8> Mask0F(NumElts, Cst0F);
  SDValue M0F = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask0F);
  if (NeedsBitcast) {
    Add = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Add);
    M0F = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M0F);
  }
  And = DAG.getNode(ISD::AND, dl, M0F.getValueType(), Add, M0F);
  if (VT != And.getValueType())
    And = DAG.getNode(ISD::BITCAST, dl, VT, And);

  // The algorithm mentioned above uses:
  //    v = (v * 0x01010101...) >> (Len - 8)
  //
  // Change it to use vector adds + vector shifts which yield faster results on
  // Haswell than using vector integer multiplication.
  //
  // For i32 elements:
  //    v = v + (v >> 8)
  //    v = v + (v >> 16)
  //
  // For i64 elements:
  //    v = v + (v >> 8)
  //    v = v + (v >> 16)
  //    v = v + (v >> 32)
  //
  Add = And;
  SmallVector<SDValue, 8> Csts;
  for (unsigned i = 8; i <= Len/2; i *= 2) {
    Csts.assign(NumElts, DAG.getConstant(i, EltVT));
    SDValue CstsV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Csts);
    Srl = DAG.getNode(ISD::SRL, dl, VT, Add, CstsV);
    Add = DAG.getNode(ISD::ADD, dl, VT, Add, Srl);
    Csts.clear();
  }

  // The result is on the least significant 6-bits on i32 and 7-bits on i64.
  SDValue Cst3F = DAG.getConstant(APInt(Len, Len == 32 ? 0x3F : 0x7F), EltVT);
  SmallVector<SDValue, 8> Cst3FV(NumElts, Cst3F);
  SDValue M3F = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Cst3FV);
  if (NeedsBitcast) {
    Add = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Add);
    M3F = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M3F);
  }
  And = DAG.getNode(ISD::AND, dl, M3F.getValueType(), Add, M3F);
  if (VT != And.getValueType())
    And = DAG.getNode(ISD::BITCAST, dl, VT, And);

  return And;
}
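// Illustrative sketch (hypothetical helper): the scalar form of the bithack
// implemented above, with the final multiply replaced by the same
// add-and-shift ladder the vector code uses.
LLVM_ATTRIBUTE_UNUSED static unsigned hostPopcount32(uint32_t V) {
  V = V - ((V >> 1) & 0x55555555);                 // 2-bit partial sums
  V = (V & 0x33333333) + ((V >> 2) & 0x33333333);  // 4-bit partial sums
  V = (V + (V >> 4)) & 0x0F0F0F0F;                 // 8-bit partial sums
  V = V + (V >> 8);                                // accumulate bytes
  V = V + (V >> 16);
  return V & 0x3F;                                 // result fits in 6 bits
}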
static SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) {
  SDNode *Node = Op.getNode();
  SDLoc dl(Node);
  EVT T = Node->getValueType(0);
  SDValue negOp = DAG.getNode(ISD::SUB, dl, T,
                              DAG.getConstant(0, T), Node->getOperand(2));
  return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl,
                       cast<AtomicSDNode>(Node)->getMemoryVT(),
                       Node->getOperand(0),
                       Node->getOperand(1), negOp,
                       cast<AtomicSDNode>(Node)->getMemOperand(),
                       cast<AtomicSDNode>(Node)->getOrdering(),
                       cast<AtomicSDNode>(Node)->getSynchScope());
}
static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
  SDNode *Node = Op.getNode();
  SDLoc dl(Node);
  EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();

  // Convert seq_cst store -> xchg
  // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
  // FIXME: On 32-bit, store -> fist or movq would be more efficient
  //        (The only way to get a 16-byte store is cmpxchg16b)
  // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
  if (cast<AtomicSDNode>(Node)->getOrdering() == SequentiallyConsistent ||
      !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
    SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
                                 cast<AtomicSDNode>(Node)->getMemoryVT(),
                                 Node->getOperand(0),
                                 Node->getOperand(1), Node->getOperand(2),
                                 cast<AtomicSDNode>(Node)->getMemOperand(),
                                 cast<AtomicSDNode>(Node)->getOrdering(),
                                 cast<AtomicSDNode>(Node)->getSynchScope());
    return Swap.getValue(1);
  }
  // Other atomic stores have a simple pattern.
  return Op;
}
static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
  EVT VT = Op.getNode()->getSimpleValueType(0);

  // Let legalize expand this if it isn't a legal type yet.
  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  SDVTList VTs = DAG.getVTList(VT, MVT::i32);

  unsigned Opc;
  bool ExtraOp = false;
  switch (Op.getOpcode()) {
  default: llvm_unreachable("Invalid code");
  case ISD::ADDC: Opc = X86ISD::ADD; break;
  case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break;
  case ISD::SUBC: Opc = X86ISD::SUB; break;
  case ISD::SUBE: Opc = X86ISD::SBB; ExtraOp = true; break;
  }

  if (!ExtraOp)
    return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
                       Op.getOperand(1));
  return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
                     Op.getOperand(1), Op.getOperand(2));
}
static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget *Subtarget,
                            SelectionDAG &DAG) {
  assert(Subtarget->isTargetDarwin() && Subtarget->is64Bit());

  // For MacOSX, we want to call an alternative entry point: __sincos_stret,
  // which returns the values as { float, float } (in XMM0) or
  // { double, double } (which is returned in XMM0, XMM1).
  SDLoc dl(Op);
  SDValue Arg = Op.getOperand(0);
  EVT ArgVT = Arg.getValueType();
  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());

  TargetLowering::ArgListTy Args;
  TargetLowering::ArgListEntry Entry;

  Entry.Node = Arg;
  Entry.Ty = ArgTy;
  Entry.isSExt = false;
  Entry.isZExt = false;
  Args.push_back(Entry);

  bool isF64 = ArgVT == MVT::f64;
  // Only optimize x86_64 for now. i386 is a bit messy. For f32,
  // the small struct {f32, f32} is returned in (eax, edx). For f64,
  // the results are returned via SRet in memory.
  const char *LibcallName = isF64 ? "__sincos_stret" : "__sincosf_stret";
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  SDValue Callee = DAG.getExternalSymbol(LibcallName, TLI.getPointerTy());

  Type *RetTy = isF64
    ? (Type*)StructType::get(ArgTy, ArgTy, nullptr)
    : (Type*)VectorType::get(ArgTy, 4);

  TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
    .setCallee(CallingConv::C, RetTy, Callee, std::move(Args), 0);

  std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);

  if (isF64)
    // Returned in xmm0 and xmm1.
    return CallResult.first;

  // Returned in bits 0:31 and 32:63 of xmm0.
  SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
                               CallResult.first, DAG.getIntPtrConstant(0));
  SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
                               CallResult.first, DAG.getIntPtrConstant(1));
  SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
  return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
}
20109 /// LowerOperation - Provide custom lowering hooks for some operations.
20111 SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
20112 switch (Op.getOpcode()) {
20113 default: llvm_unreachable("Should not custom lower this!");
20114 case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op,DAG);
20115 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
20116 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
20117 return LowerCMP_SWAP(Op, Subtarget, DAG);
20118 case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
20119 case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG);
20120 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op,DAG);
20121 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
20122 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
20123 case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG);
  case ISD::VSELECT:            return LowerVSELECT(Op, DAG);
  case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
  case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
  case ISD::EXTRACT_SUBVECTOR:  return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
  case ISD::INSERT_SUBVECTOR:   return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
  case ISD::SCALAR_TO_VECTOR:   return LowerSCALAR_TO_VECTOR(Op, DAG);
  case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
  case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
  case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
  case ISD::ExternalSymbol:     return LowerExternalSymbol(Op, DAG);
  case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
  case ISD::SHL_PARTS:
  case ISD::SRA_PARTS:
  case ISD::SRL_PARTS:          return LowerShiftParts(Op, DAG);
  case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG);
  case ISD::UINT_TO_FP:         return LowerUINT_TO_FP(Op, DAG);
  case ISD::TRUNCATE:           return LowerTRUNCATE(Op, DAG);
  case ISD::ZERO_EXTEND:        return LowerZERO_EXTEND(Op, Subtarget, DAG);
  case ISD::SIGN_EXTEND:        return LowerSIGN_EXTEND(Op, Subtarget, DAG);
  case ISD::ANY_EXTEND:         return LowerANY_EXTEND(Op, Subtarget, DAG);
  case ISD::FP_TO_SINT:         return LowerFP_TO_SINT(Op, DAG);
  case ISD::FP_TO_UINT:         return LowerFP_TO_UINT(Op, DAG);
  case ISD::FP_EXTEND:          return LowerFP_EXTEND(Op, DAG);
  case ISD::LOAD:               return LowerExtendedLoad(Op, Subtarget, DAG);
  case ISD::FABS:
  case ISD::FNEG:               return LowerFABSorFNEG(Op, DAG);
  case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
  case ISD::FGETSIGN:           return LowerFGETSIGN(Op, DAG);
  case ISD::SETCC:              return LowerSETCC(Op, DAG);
  case ISD::SELECT:             return LowerSELECT(Op, DAG);
  case ISD::BRCOND:             return LowerBRCOND(Op, DAG);
  case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
  case ISD::VASTART:            return LowerVASTART(Op, DAG);
  case ISD::VAARG:              return LowerVAARG(Op, DAG);
  case ISD::VACOPY:             return LowerVACOPY(Op, Subtarget, DAG);
  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, Subtarget, DAG);
  case ISD::INTRINSIC_VOID:
  case ISD::INTRINSIC_W_CHAIN:  return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
  case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
  case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
  case ISD::FRAME_TO_ARGS_OFFSET:
                                return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
  case ISD::EH_RETURN:          return LowerEH_RETURN(Op, DAG);
  case ISD::EH_SJLJ_SETJMP:     return lowerEH_SJLJ_SETJMP(Op, DAG);
  case ISD::EH_SJLJ_LONGJMP:    return lowerEH_SJLJ_LONGJMP(Op, DAG);
  case ISD::INIT_TRAMPOLINE:    return LowerINIT_TRAMPOLINE(Op, DAG);
  case ISD::ADJUST_TRAMPOLINE:  return LowerADJUST_TRAMPOLINE(Op, DAG);
  case ISD::FLT_ROUNDS_:        return LowerFLT_ROUNDS_(Op, DAG);
  case ISD::CTLZ:               return LowerCTLZ(Op, DAG);
  case ISD::CTLZ_ZERO_UNDEF:    return LowerCTLZ_ZERO_UNDEF(Op, DAG);
  case ISD::CTTZ:               return LowerCTTZ(Op, DAG);
  case ISD::MUL:                return LowerMUL(Op, Subtarget, DAG);
  case ISD::UMUL_LOHI:
  case ISD::SMUL_LOHI:          return LowerMUL_LOHI(Op, Subtarget, DAG);
  case ISD::SRA:
  case ISD::SRL:
  case ISD::SHL:                return LowerShift(Op, Subtarget, DAG);
  case ISD::SADDO:
  case ISD::UADDO:
  case ISD::SSUBO:
  case ISD::USUBO:
  case ISD::SMULO:
  case ISD::UMULO:              return LowerXALUO(Op, DAG);
  case ISD::READCYCLECOUNTER:   return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
  case ISD::BITCAST:            return LowerBITCAST(Op, Subtarget, DAG);
  case ISD::ADDC:
  case ISD::ADDE:
  case ISD::SUBC:
  case ISD::SUBE:               return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
  case ISD::ADD:                return LowerADD(Op, DAG);
  case ISD::SUB:                return LowerSUB(Op, DAG);
  case ISD::FSINCOS:            return LowerFSINCOS(Op, Subtarget, DAG);
  }
}
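
// Note (illustrative): only opcodes marked Custom in the constructor's
// setOperationAction tables reach LowerOperation; anything else is expanded
// or promoted by the generic legalizer before it gets here.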
/// ReplaceNodeResults - Replace a node with an illegal result type
/// with a new node built out of custom code.
void X86TargetLowering::ReplaceNodeResults(SDNode *N,
                                           SmallVectorImpl<SDValue>&Results,
                                           SelectionDAG &DAG) const {
  SDLoc dl(N);
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  switch (N->getOpcode()) {
  default:
    llvm_unreachable("Do not know how to custom type legalize this operation!");
  // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
  case X86ISD::FMINC:
  case X86ISD::FMIN:
  case X86ISD::FMAXC:
  case X86ISD::FMAX: {
    EVT VT = N->getValueType(0);
    if (VT != MVT::v2f32)
      llvm_unreachable("Unexpected type (!= v2f32) on FMIN/FMAX.");
    SDValue UNDEF = DAG.getUNDEF(VT);
    SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
                              N->getOperand(0), UNDEF);
    SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
                              N->getOperand(1), UNDEF);
    Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
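    // For example (illustrative SDAG, not emitted verbatim):
    //   (v2f32 (fmin %a, %b))
    // becomes
    //   (v4f32 (fmin (concat_vectors %a, undef), (concat_vectors %b, undef)))
    // and only the low two lanes carry meaningful results.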
    return;
  }
  case ISD::SIGN_EXTEND_INREG:
  case ISD::ADDC:
  case ISD::ADDE:
  case ISD::SUBC:
  case ISD::SUBE:
    // We don't want to expand or promote these.
    return;
  case ISD::SDIV:
  case ISD::UDIV:
  case ISD::SREM:
  case ISD::UREM:
  case ISD::SDIVREM:
  case ISD::UDIVREM: {
    SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
    Results.push_back(V);
    return;
  }
  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT: {
    bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;

    if (!IsSigned && !isIntegerTypeFTOL(SDValue(N, 0).getValueType()))
      return;

    std::pair<SDValue,SDValue> Vals =
        FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
    SDValue FIST = Vals.first, StackSlot = Vals.second;
    if (FIST.getNode()) {
      EVT VT = N->getValueType(0);
      // Return a load from the stack slot.
      if (StackSlot.getNode())
        Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot,
                                      MachinePointerInfo(),
                                      false, false, false, 0));
      else
        Results.push_back(FIST);
    }
    return;
  }
  case ISD::UINT_TO_FP: {
    assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
    if (N->getOperand(0).getValueType() != MVT::v2i32 ||
        N->getValueType(0) != MVT::v2f32)
      return;
    SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64,
                                 N->getOperand(0));
    SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
                                     MVT::f64);
    SDValue VBias = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2f64, Bias, Bias);
    SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
                             DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, VBias));
    Or = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or);
    SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
    Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
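    // Note on the constant above: 0x4330000000000000 is the IEEE-754 bit
    // pattern of 2^52. OR-ing a zero-extended 32-bit integer x into the low
    // mantissa bits of 2^52 yields the exact double 2^52 + x, so the FSUB of
    // the bias recovers x converted to double. E.g. x = 7 gives
    // 0x4330000000000007 == 2^52 + 7, and (2^52 + 7) - 2^52 == 7.0.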
    return;
  }
  case ISD::FP_ROUND: {
    if (!TLI.isTypeLegal(N->getOperand(0).getValueType()))
      return;
    SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
    Results.push_back(V);
    return;
  }
  case ISD::INTRINSIC_W_CHAIN: {
    unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
    switch (IntNo) {
    default : llvm_unreachable("Do not know how to custom type "
                               "legalize this intrinsic operation!");
    case Intrinsic::x86_rdtsc:
      return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
                                     Results);
    case Intrinsic::x86_rdtscp:
      return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget,
                                     Results);
    case Intrinsic::x86_rdpmc:
      return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results);
    }
  }
  case ISD::READCYCLECOUNTER: {
    return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
                                   Results);
  }
  case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
    EVT T = N->getValueType(0);
    assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
    bool Regs64bit = T == MVT::i128;
    EVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
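    // Hardware contract for CMPXCHG8B/CMPXCHG16B (illustrative): the expected
    // value goes in EDX:EAX (RDX:RAX), the replacement in ECX:EBX (RCX:RBX),
    // and ZF reports success. The copies below just marshal the two halves of
    // each i64/i128 operand into those fixed register pairs.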
    SDValue cpInL, cpInH;
    cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
                        DAG.getConstant(0, HalfT));
    cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
                        DAG.getConstant(1, HalfT));
    cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
                             Regs64bit ? X86::RAX : X86::EAX,
                             cpInL, SDValue());
    cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
                             Regs64bit ? X86::RDX : X86::EDX,
                             cpInH, cpInL.getValue(1));
    SDValue swapInL, swapInH;
    swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
                          DAG.getConstant(0, HalfT));
    swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
                          DAG.getConstant(1, HalfT));
    swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl,
                               Regs64bit ? X86::RBX : X86::EBX,
                               swapInL, cpInH.getValue(1));
    swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl,
                               Regs64bit ? X86::RCX : X86::ECX,
                               swapInH, swapInL.getValue(1));
    SDValue Ops[] = { swapInH.getValue(0),
                      N->getOperand(1),
                      swapInH.getValue(1) };
    SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
    MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
    unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_DAG :
                                  X86ISD::LCMPXCHG8_DAG;
    SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
    SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
                                        Regs64bit ? X86::RAX : X86::EAX,
                                        HalfT, Result.getValue(1));
    SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
                                        Regs64bit ? X86::RDX : X86::EDX,
                                        HalfT, cpOutL.getValue(2));
    SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};

    SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
                                        MVT::i32, cpOutH.getValue(2));
    SDValue Success =
        DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
                    DAG.getConstant(X86::COND_E, MVT::i8), EFLAGS);
    Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));

    Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
    Results.push_back(Success);
    Results.push_back(EFLAGS.getValue(1));
    return;
  }
  case ISD::ATOMIC_SWAP:
  case ISD::ATOMIC_LOAD_ADD:
  case ISD::ATOMIC_LOAD_SUB:
  case ISD::ATOMIC_LOAD_AND:
  case ISD::ATOMIC_LOAD_OR:
  case ISD::ATOMIC_LOAD_XOR:
  case ISD::ATOMIC_LOAD_NAND:
  case ISD::ATOMIC_LOAD_MIN:
  case ISD::ATOMIC_LOAD_MAX:
  case ISD::ATOMIC_LOAD_UMIN:
  case ISD::ATOMIC_LOAD_UMAX:
  case ISD::ATOMIC_LOAD: {
    // Delegate to generic TypeLegalization. Situations we can really handle
    // should have already been dealt with by AtomicExpandPass.cpp.
    break;
  }
  case ISD::BITCAST: {
    assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
    EVT DstVT = N->getValueType(0);
    EVT SrcVT = N->getOperand(0)->getValueType(0);

    if (SrcVT != MVT::f64 ||
        (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8))
      return;

    unsigned NumElts = DstVT.getVectorNumElements();
    EVT SVT = DstVT.getVectorElementType();
    EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
    SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
                                   MVT::v2f64, N->getOperand(0));
    SDValue ToVecInt = DAG.getNode(ISD::BITCAST, dl, WiderVT, Expanded);

    if (ExperimentalVectorWideningLegalization) {
      // If we are legalizing vectors by widening, we already have the desired
      // legal vector type, just return it.
      Results.push_back(ToVecInt);
      return;
    }

    SmallVector<SDValue, 8> Elts;
    for (unsigned i = 0, e = NumElts; i != e; ++i)
      Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT,
                                 ToVecInt, DAG.getIntPtrConstant(i)));

    Results.push_back(DAG.getNode(ISD::BUILD_VECTOR, dl, DstVT, Elts));
  }
  }
}
const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
  switch (Opcode) {
  default: return nullptr;
  case X86ISD::BSF: return "X86ISD::BSF";
  case X86ISD::BSR: return "X86ISD::BSR";
  case X86ISD::SHLD: return "X86ISD::SHLD";
  case X86ISD::SHRD: return "X86ISD::SHRD";
  case X86ISD::FAND: return "X86ISD::FAND";
  case X86ISD::FANDN: return "X86ISD::FANDN";
  case X86ISD::FOR: return "X86ISD::FOR";
  case X86ISD::FXOR: return "X86ISD::FXOR";
  case X86ISD::FSRL: return "X86ISD::FSRL";
  case X86ISD::FILD: return "X86ISD::FILD";
  case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG";
  case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
  case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
  case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
  case X86ISD::FLD: return "X86ISD::FLD";
  case X86ISD::FST: return "X86ISD::FST";
  case X86ISD::CALL: return "X86ISD::CALL";
  case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG";
  case X86ISD::RDTSCP_DAG: return "X86ISD::RDTSCP_DAG";
  case X86ISD::RDPMC_DAG: return "X86ISD::RDPMC_DAG";
  case X86ISD::BT: return "X86ISD::BT";
  case X86ISD::CMP: return "X86ISD::CMP";
  case X86ISD::COMI: return "X86ISD::COMI";
  case X86ISD::UCOMI: return "X86ISD::UCOMI";
  case X86ISD::CMPM: return "X86ISD::CMPM";
  case X86ISD::CMPMU: return "X86ISD::CMPMU";
  case X86ISD::SETCC: return "X86ISD::SETCC";
  case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY";
  case X86ISD::FSETCC: return "X86ISD::FSETCC";
  case X86ISD::CMOV: return "X86ISD::CMOV";
  case X86ISD::BRCOND: return "X86ISD::BRCOND";
  case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG";
  case X86ISD::REP_STOS: return "X86ISD::REP_STOS";
  case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS";
  case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg";
  case X86ISD::Wrapper: return "X86ISD::Wrapper";
  case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP";
  case X86ISD::PEXTRB: return "X86ISD::PEXTRB";
  case X86ISD::PEXTRW: return "X86ISD::PEXTRW";
  case X86ISD::INSERTPS: return "X86ISD::INSERTPS";
  case X86ISD::PINSRB: return "X86ISD::PINSRB";
  case X86ISD::PINSRW: return "X86ISD::PINSRW";
  case X86ISD::PSHUFB: return "X86ISD::PSHUFB";
  case X86ISD::ANDNP: return "X86ISD::ANDNP";
  case X86ISD::PSIGN: return "X86ISD::PSIGN";
  case X86ISD::BLENDI: return "X86ISD::BLENDI";
  case X86ISD::SHRUNKBLEND: return "X86ISD::SHRUNKBLEND";
  case X86ISD::SUBUS: return "X86ISD::SUBUS";
  case X86ISD::HADD: return "X86ISD::HADD";
  case X86ISD::HSUB: return "X86ISD::HSUB";
  case X86ISD::FHADD: return "X86ISD::FHADD";
  case X86ISD::FHSUB: return "X86ISD::FHSUB";
  case X86ISD::UMAX: return "X86ISD::UMAX";
  case X86ISD::UMIN: return "X86ISD::UMIN";
  case X86ISD::SMAX: return "X86ISD::SMAX";
  case X86ISD::SMIN: return "X86ISD::SMIN";
  case X86ISD::FMAX: return "X86ISD::FMAX";
  case X86ISD::FMIN: return "X86ISD::FMIN";
  case X86ISD::FMAXC: return "X86ISD::FMAXC";
  case X86ISD::FMINC: return "X86ISD::FMINC";
  case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
  case X86ISD::FRCP: return "X86ISD::FRCP";
  case X86ISD::TLSADDR: return "X86ISD::TLSADDR";
  case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR";
  case X86ISD::TLSCALL: return "X86ISD::TLSCALL";
  case X86ISD::EH_SJLJ_SETJMP: return "X86ISD::EH_SJLJ_SETJMP";
  case X86ISD::EH_SJLJ_LONGJMP: return "X86ISD::EH_SJLJ_LONGJMP";
  case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN";
  case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN";
  case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m";
  case X86ISD::FNSTSW16r: return "X86ISD::FNSTSW16r";
  case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG";
  case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG";
  case X86ISD::LCMPXCHG16_DAG: return "X86ISD::LCMPXCHG16_DAG";
  case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL";
  case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD";
  case X86ISD::VZEXT: return "X86ISD::VZEXT";
  case X86ISD::VSEXT: return "X86ISD::VSEXT";
  case X86ISD::VTRUNC: return "X86ISD::VTRUNC";
  case X86ISD::VTRUNCM: return "X86ISD::VTRUNCM";
  case X86ISD::VINSERT: return "X86ISD::VINSERT";
  case X86ISD::VFPEXT: return "X86ISD::VFPEXT";
  case X86ISD::VFPROUND: return "X86ISD::VFPROUND";
  case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ";
  case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ";
  case X86ISD::VSHL: return "X86ISD::VSHL";
  case X86ISD::VSRL: return "X86ISD::VSRL";
  case X86ISD::VSRA: return "X86ISD::VSRA";
  case X86ISD::VSHLI: return "X86ISD::VSHLI";
  case X86ISD::VSRLI: return "X86ISD::VSRLI";
  case X86ISD::VSRAI: return "X86ISD::VSRAI";
  case X86ISD::CMPP: return "X86ISD::CMPP";
  case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ";
  case X86ISD::PCMPGT: return "X86ISD::PCMPGT";
  case X86ISD::PCMPEQM: return "X86ISD::PCMPEQM";
  case X86ISD::PCMPGTM: return "X86ISD::PCMPGTM";
  case X86ISD::ADD: return "X86ISD::ADD";
  case X86ISD::SUB: return "X86ISD::SUB";
  case X86ISD::ADC: return "X86ISD::ADC";
  case X86ISD::SBB: return "X86ISD::SBB";
  case X86ISD::SMUL: return "X86ISD::SMUL";
  case X86ISD::UMUL: return "X86ISD::UMUL";
  case X86ISD::SMUL8: return "X86ISD::SMUL8";
  case X86ISD::UMUL8: return "X86ISD::UMUL8";
  case X86ISD::SDIVREM8_SEXT_HREG: return "X86ISD::SDIVREM8_SEXT_HREG";
  case X86ISD::UDIVREM8_ZEXT_HREG: return "X86ISD::UDIVREM8_ZEXT_HREG";
  case X86ISD::INC: return "X86ISD::INC";
  case X86ISD::DEC: return "X86ISD::DEC";
  case X86ISD::OR: return "X86ISD::OR";
  case X86ISD::XOR: return "X86ISD::XOR";
  case X86ISD::AND: return "X86ISD::AND";
  case X86ISD::BEXTR: return "X86ISD::BEXTR";
  case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM";
  case X86ISD::PTEST: return "X86ISD::PTEST";
  case X86ISD::TESTP: return "X86ISD::TESTP";
  case X86ISD::TESTM: return "X86ISD::TESTM";
  case X86ISD::TESTNM: return "X86ISD::TESTNM";
  case X86ISD::KORTEST: return "X86ISD::KORTEST";
  case X86ISD::PACKSS: return "X86ISD::PACKSS";
  case X86ISD::PACKUS: return "X86ISD::PACKUS";
  case X86ISD::PALIGNR: return "X86ISD::PALIGNR";
  case X86ISD::VALIGN: return "X86ISD::VALIGN";
  case X86ISD::PSHUFD: return "X86ISD::PSHUFD";
  case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW";
  case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW";
  case X86ISD::SHUFP: return "X86ISD::SHUFP";
  case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS";
  case X86ISD::MOVLHPD: return "X86ISD::MOVLHPD";
  case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS";
  case X86ISD::MOVLPS: return "X86ISD::MOVLPS";
  case X86ISD::MOVLPD: return "X86ISD::MOVLPD";
  case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP";
  case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP";
  case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP";
  case X86ISD::MOVSD: return "X86ISD::MOVSD";
  case X86ISD::MOVSS: return "X86ISD::MOVSS";
  case X86ISD::UNPCKL: return "X86ISD::UNPCKL";
  case X86ISD::UNPCKH: return "X86ISD::UNPCKH";
  case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST";
  case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM";
  case X86ISD::VEXTRACT: return "X86ISD::VEXTRACT";
  case X86ISD::VPERMILPI: return "X86ISD::VPERMILPI";
  case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128";
  case X86ISD::VPERMV: return "X86ISD::VPERMV";
  case X86ISD::VPERMV3: return "X86ISD::VPERMV3";
  case X86ISD::VPERMIV3: return "X86ISD::VPERMIV3";
  case X86ISD::VPERMI: return "X86ISD::VPERMI";
  case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ";
  case X86ISD::PMULDQ: return "X86ISD::PMULDQ";
  case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
  case X86ISD::VAARG_64: return "X86ISD::VAARG_64";
  case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA";
  case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER";
  case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA";
  case X86ISD::WIN_FTOL: return "X86ISD::WIN_FTOL";
  case X86ISD::SAHF: return "X86ISD::SAHF";
  case X86ISD::RDRAND: return "X86ISD::RDRAND";
  case X86ISD::RDSEED: return "X86ISD::RDSEED";
  case X86ISD::FMADD: return "X86ISD::FMADD";
  case X86ISD::FMSUB: return "X86ISD::FMSUB";
  case X86ISD::FNMADD: return "X86ISD::FNMADD";
  case X86ISD::FNMSUB: return "X86ISD::FNMSUB";
  case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB";
  case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD";
  case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI";
  case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI";
  case X86ISD::XTEST: return "X86ISD::XTEST";
  case X86ISD::COMPRESS: return "X86ISD::COMPRESS";
  case X86ISD::EXPAND: return "X86ISD::EXPAND";
  case X86ISD::SELECT: return "X86ISD::SELECT";
  case X86ISD::ADDSUB: return "X86ISD::ADDSUB";
  case X86ISD::RCP28: return "X86ISD::RCP28";
  case X86ISD::RSQRT28: return "X86ISD::RSQRT28";
  }
}
// isLegalAddressingMode - Return true if the addressing mode represented
// by AM is legal for this target, for a load/store of the specified type.
bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM,
                                              Type *Ty) const {
  // X86 supports extremely general addressing modes.
  CodeModel::Model M = getTargetMachine().getCodeModel();
  Reloc::Model R = getTargetMachine().getRelocationModel();

  // X86 allows a sign-extended 32-bit immediate field as a displacement.
  if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
    return false;

  if (AM.BaseGV) {
    unsigned GVFlags =
      Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine());

    // If a reference to this global requires an extra load, we can't fold it.
    if (isGlobalStubReference(GVFlags))
      return false;

    // If BaseGV requires a register for the PIC base, we cannot also have a
    // BaseReg specified.
    if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
      return false;

    // If lower 4G is not available, then we must use rip-relative addressing.
    if ((M != CodeModel::Small || R != Reloc::Static) &&
        Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1))
      return false;
  }

  switch (AM.Scale) {
  case 0:
  case 1:
  case 2:
  case 4:
  case 8:
    // These scales always work.
    break;
  case 3:
  case 5:
  case 9:
    // These scales are formed with basereg+scalereg. Only accept if there is
    // no basereg yet.
    if (AM.HasBaseReg)
      return false;
    break;
  default:  // Other stuff never works.
    return false;
  }

  return true;
}
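
// Example (illustrative): the most general form accepted above is
//   BaseGV/BaseReg + Scale*IndexReg + Disp,  e.g.  movl 42(%rdi,%rcx,4), %eax
// Scales of 3, 5, and 9 pass because LEA can synthesize them as
// base + scale*index with base == index (e.g. lea (%rax,%rax,2), %rcx
// computes 3*%rax), but only when no other base register is already in use.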
bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
  unsigned Bits = Ty->getScalarSizeInBits();

  // 8-bit shifts are always expensive, but versions with a scalar amount aren't
  // particularly cheaper than those without.
  if (Bits == 8)
    return false;

  // On AVX2 there are new vpsllv[dq] instructions (and other shifts), that make
  // variable shifts just as cheap as scalar ones.
  if (Subtarget->hasInt256() && (Bits == 32 || Bits == 64))
    return false;

  // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
  // fully general vector.
  return true;
}
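
// Example (illustrative): pre-AVX2, "pslld %xmm1, %xmm0" shifts every lane of
// %xmm0 by the single amount held in the low 64 bits of %xmm1, whereas a
// per-element variable shift has to be scalarized. AVX2's vpsllvd/vpsllvq
// remove that asymmetry for 32- and 64-bit elements, which is why the hook
// returns false there.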
bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
    return false;
  unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
  unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
  return NumBits1 > NumBits2;
}

bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
    return false;

  if (!isTypeLegal(EVT::getEVT(Ty1)))
    return false;

  assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");

  // Assuming the caller doesn't have a zeroext or signext return parameter,
  // truncation all the way down to i1 is valid.
  return true;
}

bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
  return isInt<32>(Imm);
}

bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
  // Can also use sub to handle negated immediates.
  return isInt<32>(Imm);
}
bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
  if (!VT1.isInteger() || !VT2.isInteger())
    return false;
  unsigned NumBits1 = VT1.getSizeInBits();
  unsigned NumBits2 = VT2.getSizeInBits();
  return NumBits1 > NumBits2;
}

bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
  // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
  return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit();
}

bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
  // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
  return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit();
}

bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
  EVT VT1 = Val.getValueType();
  if (isZExtFree(VT1, VT2))
    return true;

  if (Val.getOpcode() != ISD::LOAD)
    return false;

  if (!VT1.isSimple() || !VT1.isInteger() ||
      !VT2.isSimple() || !VT2.isInteger())
    return false;

  switch (VT1.getSimpleVT().SimpleTy) {
  default: break;
  case MVT::i8:
  case MVT::i16:
  case MVT::i32:
    // X86 has 8, 16, and 32-bit zero-extending loads.
    return true;
  }

  return false;
}
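
// Example (illustrative): "movl (%rdi), %eax" already leaves bits 63:32 of
// %rax zeroed, and "movzbl (%rdi), %eax" zero-extends an i8 load for free, so
// an explicit zext of such a load costs nothing.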
bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const { return true; }

bool
X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
  if (!(Subtarget->hasFMA() || Subtarget->hasFMA4()))
    return false;

  VT = VT.getScalarType();

  if (!VT.isSimple())
    return false;

  switch (VT.getSimpleVT().SimpleTy) {
  case MVT::f32:
  case MVT::f64:
    return true;
  default:
    break;
  }

  return false;
}
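
// Example (illustrative): with FMA available, a*b+c can be emitted as a single
// "vfmadd213ss %xmm2, %xmm1, %xmm0" rather than a vmulss followed by a vaddss,
// which is typically both faster and more accurate (one rounding step instead
// of two).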
bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
  // i16 instructions are longer (0x66 prefix) and potentially slower.
  return !(VT1 == MVT::i32 && VT2 == MVT::i16);
}

/// isShuffleMaskLegal - Targets can use this to indicate that they only
/// support *some* VECTOR_SHUFFLE operations, those with specific masks.
/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
/// are assumed to be legal.
bool
X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
                                      EVT VT) const {
  if (!VT.isSimple())
    return false;

  MVT SVT = VT.getSimpleVT();

  // Very little shuffling can be done for 64-bit vectors right now.
  if (VT.getSizeInBits() == 64)
    return false;

  // This is an experimental legality test that is tailored to match the
  // legality test of the experimental lowering more closely. They are gated
  // separately to ease testing of performance differences.
  if (ExperimentalVectorShuffleLegality)
    // We only care that the types being shuffled are legal. The lowering can
    // handle any possible shuffle mask that results.
    return isTypeLegal(SVT);

  // If this is a single-input shuffle with no 128 bit lane crossings we can
  // lower it into pshufb.
  if ((SVT.is128BitVector() && Subtarget->hasSSSE3()) ||
      (SVT.is256BitVector() && Subtarget->hasInt256())) {
    bool isLegal = true;
    for (unsigned I = 0, E = M.size(); I != E; ++I) {
      if (M[I] >= (int)SVT.getVectorNumElements() ||
          ShuffleCrosses128bitLane(SVT, I, M[I])) {
        isLegal = false;
        break;
      }
    }
    if (isLegal)
      return true;
  }

  // FIXME: blends, shifts.
  return (SVT.getVectorNumElements() == 2 ||
          ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
          isMOVLMask(M, SVT) ||
          isCommutedMOVLMask(M, SVT) ||
          isMOVHLPSMask(M, SVT) ||
          isSHUFPMask(M, SVT) ||
          isSHUFPMask(M, SVT, /* Commuted */ true) ||
          isPSHUFDMask(M, SVT) ||
          isPSHUFDMask(M, SVT, /* SecondOperand */ true) ||
          isPSHUFHWMask(M, SVT, Subtarget->hasInt256()) ||
          isPSHUFLWMask(M, SVT, Subtarget->hasInt256()) ||
          isPALIGNRMask(M, SVT, Subtarget) ||
          isUNPCKLMask(M, SVT, Subtarget->hasInt256()) ||
          isUNPCKHMask(M, SVT, Subtarget->hasInt256()) ||
          isUNPCKL_v_undef_Mask(M, SVT, Subtarget->hasInt256()) ||
          isUNPCKH_v_undef_Mask(M, SVT, Subtarget->hasInt256()) ||
          isBlendMask(M, SVT, Subtarget->hasSSE41(), Subtarget->hasInt256()) ||
          (Subtarget->hasSSE41() && isINSERTPSMask(M, SVT)));
}
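
// Example (illustrative): on v8i16, mask <0,8,1,9,2,10,3,11> interleaves the
// low halves of the two inputs and is matched by isUNPCKLMask (punpcklwd).
// With SSSE3, any single-input mask that stays within one 128-bit lane is
// accepted via the pshufb path above.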
bool
X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
                                          EVT VT) const {
  if (!VT.isSimple())
    return false;

  MVT SVT = VT.getSimpleVT();

  // This is an experimental legality test that is tailored to match the
  // legality test of the experimental lowering more closely. They are gated
  // separately to ease testing of performance differences.
  if (ExperimentalVectorShuffleLegality)
    // The new vector shuffle lowering is very good at managing zero-inputs.
    return isShuffleMaskLegal(Mask, VT);

  unsigned NumElts = SVT.getVectorNumElements();
  // FIXME: This collection of masks seems suspect.
  if (NumElts == 2)
    return true;
  if (NumElts == 4 && SVT.is128BitVector()) {
    return (isMOVLMask(Mask, SVT) ||
            isCommutedMOVLMask(Mask, SVT, true) ||
            isSHUFPMask(Mask, SVT) ||
            isSHUFPMask(Mask, SVT, /* Commuted */ true) ||
            isBlendMask(Mask, SVT, Subtarget->hasSSE41(),
                        Subtarget->hasInt256()));
  }
  return false;
}

//===----------------------------------------------------------------------===//
//                           X86 Scheduler Hooks
//===----------------------------------------------------------------------===//

/// Utility function to emit xbegin specifying the start of an RTM region.
static MachineBasicBlock *EmitXBegin(MachineInstr *MI, MachineBasicBlock *MBB,
                                     const TargetInstrInfo *TII) {
  DebugLoc DL = MI->getDebugLoc();

  const BasicBlock *BB = MBB->getBasicBlock();
  MachineFunction::iterator I = MBB;
  ++I;

  // For the v = xbegin(), we generate
  //
  // thisMBB:
  //  xbegin sinkMBB
  //
  // mainMBB:
  //  eax = -1
  //
  // sinkMBB:
  //  v = eax

  MachineBasicBlock *thisMBB = MBB;
  MachineFunction *MF = MBB->getParent();
  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
  MF->insert(I, mainMBB);
  MF->insert(I, sinkMBB);

  // Transfer the remainder of BB and its successor edges to sinkMBB.
  sinkMBB->splice(sinkMBB->begin(), MBB,
                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);

  // thisMBB:
  //  xbegin sinkMBB
  //  # fallthrough to mainMBB
  //  # abortion to sinkMBB
  BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(sinkMBB);
  thisMBB->addSuccessor(mainMBB);
  thisMBB->addSuccessor(sinkMBB);

  // mainMBB:
  //  EAX = -1
  BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), X86::EAX).addImm(-1);
  mainMBB->addSuccessor(sinkMBB);

  // sinkMBB:
  // EAX is live into the sinkMBB
  sinkMBB->addLiveIn(X86::EAX);
  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
          TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
    .addReg(X86::EAX);

  MI->eraseFromParent();
  return sinkMBB;
}
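
// Note (illustrative): this matches RTM's XBEGIN semantics. On a successful
// start the hardware leaves EAX untouched, so mainMBB materializes the
// defined "started" value of -1; on an abort, control resumes at the fallback
// label (sinkMBB) with the abort status already in EAX, which is why the
// result copy reads EAX on both paths.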
// FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
// or XMM0_V32I8 in AVX all of this code can be replaced with that
// in the .td file.
static MachineBasicBlock *EmitPCMPSTRM(MachineInstr *MI, MachineBasicBlock *BB,
                                       const TargetInstrInfo *TII) {
  unsigned Opc;
  switch (MI->getOpcode()) {
  default: llvm_unreachable("illegal opcode!");
  case X86::PCMPISTRM128REG: Opc = X86::PCMPISTRM128rr; break;
  case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break;
  case X86::PCMPISTRM128MEM: Opc = X86::PCMPISTRM128rm; break;
  case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break;
  case X86::PCMPESTRM128REG: Opc = X86::PCMPESTRM128rr; break;
  case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break;
  case X86::PCMPESTRM128MEM: Opc = X86::PCMPESTRM128rm; break;
  case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break;
  }

  DebugLoc dl = MI->getDebugLoc();
  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));

  unsigned NumArgs = MI->getNumOperands();
  for (unsigned i = 1; i < NumArgs; ++i) {
    MachineOperand &Op = MI->getOperand(i);
    if (!(Op.isReg() && Op.isImplicit()))
      MIB.addOperand(Op);
  }
  if (MI->hasOneMemOperand())
    MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());

  BuildMI(*BB, MI, dl,
          TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
    .addReg(X86::XMM0);

  MI->eraseFromParent();
  return BB;
}
// FIXME: Custom handling because TableGen doesn't support multiple implicit
// defs in an instruction pattern
static MachineBasicBlock *EmitPCMPSTRI(MachineInstr *MI, MachineBasicBlock *BB,
                                       const TargetInstrInfo *TII) {
  unsigned Opc;
  switch (MI->getOpcode()) {
  default: llvm_unreachable("illegal opcode!");
  case X86::PCMPISTRIREG: Opc = X86::PCMPISTRIrr; break;
  case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break;
  case X86::PCMPISTRIMEM: Opc = X86::PCMPISTRIrm; break;
  case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break;
  case X86::PCMPESTRIREG: Opc = X86::PCMPESTRIrr; break;
  case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break;
  case X86::PCMPESTRIMEM: Opc = X86::PCMPESTRIrm; break;
  case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break;
  }

  DebugLoc dl = MI->getDebugLoc();
  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));

  unsigned NumArgs = MI->getNumOperands(); // remove the results
  for (unsigned i = 1; i < NumArgs; ++i) {
    MachineOperand &Op = MI->getOperand(i);
    if (!(Op.isReg() && Op.isImplicit()))
      MIB.addOperand(Op);
  }
  if (MI->hasOneMemOperand())
    MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());

  BuildMI(*BB, MI, dl,
          TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
    .addReg(X86::ECX);

  MI->eraseFromParent();
  return BB;
}
static MachineBasicBlock *EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB,
                                      const X86Subtarget *Subtarget) {
  DebugLoc dl = MI->getDebugLoc();
  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  // Address into RAX/EAX, other two args into ECX, EDX.
  unsigned MemOpc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r;
  unsigned MemReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
  for (int i = 0; i < X86::AddrNumOperands; ++i)
    MIB.addOperand(MI->getOperand(i));

  unsigned ValOps = X86::AddrNumOperands;
  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
    .addReg(MI->getOperand(ValOps).getReg());
  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
    .addReg(MI->getOperand(ValOps+1).getReg());

  // The instruction doesn't actually take any operands though.
  BuildMI(*BB, MI, dl, TII->get(X86::MONITORrrr));

  MI->eraseFromParent(); // The pseudo is gone now.
  return BB;
}
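
// Note (illustrative): MONITOR's operands are all implicit - the linear
// address in RAX/EAX, extensions in ECX, and hints in EDX - so the expansion
// above computes the address with LEA and copies the remaining pseudo
// operands into the fixed registers before emitting the bare instruction.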
MachineBasicBlock *
X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr *MI,
                                                 MachineBasicBlock *MBB) const {
  // Emit va_arg instruction on X86-64.

  // Operands to this pseudo-instruction:
  // 0  ) Output        : destination address (reg)
  // 1-5) Input         : va_list address (addr, i64mem)
  // 6  ) ArgSize       : Size (in bytes) of vararg type
  // 7  ) ArgMode       : 0=overflow only, 1=use gp_offset, 2=use fp_offset
  // 8  ) Align         : Alignment of type
  // 9  ) EFLAGS (implicit-def)

  assert(MI->getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
  assert(X86::AddrNumOperands == 5 && "VAARG_64 assumes 5 address operands");

  unsigned DestReg = MI->getOperand(0).getReg();
  MachineOperand &Base = MI->getOperand(1);
  MachineOperand &Scale = MI->getOperand(2);
  MachineOperand &Index = MI->getOperand(3);
  MachineOperand &Disp = MI->getOperand(4);
  MachineOperand &Segment = MI->getOperand(5);
  unsigned ArgSize = MI->getOperand(6).getImm();
  unsigned ArgMode = MI->getOperand(7).getImm();
  unsigned Align = MI->getOperand(8).getImm();

  // Memory Reference
  assert(MI->hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
  MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
  MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();

  // Machine Information
  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
  const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
  const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
  DebugLoc DL = MI->getDebugLoc();

  // struct va_list {
  //   i32   gp_offset
  //   i32   fp_offset
  //   i64   overflow_area (address)
  //   i64   reg_save_area (address)
  // }
  // sizeof(va_list) = 24
  // alignment(va_list) = 8

  unsigned TotalNumIntRegs = 6;
  unsigned TotalNumXMMRegs = 8;
  bool UseGPOffset = (ArgMode == 1);
  bool UseFPOffset = (ArgMode == 2);
  unsigned MaxOffset = TotalNumIntRegs * 8 +
                       (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
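  // Worked example (illustrative): for an i32 argument in GP mode, ArgSizeA8
  // below is 8 and MaxOffset is 48, so the range check compares gp_offset
  // against 48 + 8 - 8 = 48; gp_offset >= 48 means all six integer registers
  // (6 * 8 bytes) are spoken for and the argument must come from
  // overflow_area instead.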
  /* Align ArgSize to a multiple of 8 */
  unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
  bool NeedsAlign = (Align > 8);

  MachineBasicBlock *thisMBB = MBB;
  MachineBasicBlock *overflowMBB;
  MachineBasicBlock *offsetMBB;
  MachineBasicBlock *endMBB;

  unsigned OffsetDestReg = 0;    // Argument address computed by offsetMBB
  unsigned OverflowDestReg = 0;  // Argument address computed by overflowMBB
  unsigned OffsetReg = 0;

  if (!UseGPOffset && !UseFPOffset) {
    // If we only pull from the overflow region, we don't create a branch.
    // We don't need to alter control flow.
    OffsetDestReg = 0; // unused
    OverflowDestReg = DestReg;

    offsetMBB = nullptr;
    overflowMBB = thisMBB;
    endMBB = thisMBB;
  } else {
    // First emit code to check if gp_offset (or fp_offset) is below the bound.
    // If so, pull the argument from reg_save_area. (branch to offsetMBB)
    // If not, pull from overflow_area. (branch to overflowMBB)
    //
    // thisMBB
    //   |     .
    //   |        .
    //   offsetMBB   overflowMBB
    //   |        .
    //   |     .
    //   endMBB

    // Registers for the PHI in endMBB
    OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
    OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);

    const BasicBlock *LLVM_BB = MBB->getBasicBlock();
    MachineFunction *MF = MBB->getParent();
    overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
    offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
    endMBB = MF->CreateMachineBasicBlock(LLVM_BB);

    MachineFunction::iterator MBBIter = MBB;
    ++MBBIter;

    // Insert the new basic blocks
    MF->insert(MBBIter, offsetMBB);
    MF->insert(MBBIter, overflowMBB);
    MF->insert(MBBIter, endMBB);

    // Transfer the remainder of MBB and its successor edges to endMBB.
    endMBB->splice(endMBB->begin(), thisMBB,
                   std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
    endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);

    // Make offsetMBB and overflowMBB successors of thisMBB
    thisMBB->addSuccessor(offsetMBB);
    thisMBB->addSuccessor(overflowMBB);

    // endMBB is a successor of both offsetMBB and overflowMBB
    offsetMBB->addSuccessor(endMBB);
    overflowMBB->addSuccessor(endMBB);

    // Load the offset value into a register
    OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
    BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
      .addOperand(Base)
      .addOperand(Scale)
      .addOperand(Index)
      .addDisp(Disp, UseFPOffset ? 4 : 0)
      .addOperand(Segment)
      .setMemRefs(MMOBegin, MMOEnd);

    // Check if there is enough room left to pull this argument.
    BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
      .addReg(OffsetReg)
      .addImm(MaxOffset + 8 - ArgSizeA8);

    // Branch to "overflowMBB" if offset >= max
    // Fall through to "offsetMBB" otherwise
    BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
      .addMBB(overflowMBB);
  }

  // In offsetMBB, emit code to use the reg_save_area.
  if (offsetMBB) {
    assert(OffsetReg != 0);

    // Read the reg_save_area address.
    unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
    BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
      .addOperand(Base)
      .addOperand(Scale)
      .addOperand(Index)
      .addDisp(Disp, 16)
      .addOperand(Segment)
      .setMemRefs(MMOBegin, MMOEnd);

    // Zero-extend the offset
    unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
    BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
      .addImm(0)
      .addReg(OffsetReg)
      .addImm(X86::sub_32bit);

    // Add the offset to the reg_save_area to get the final address.
    BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
      .addReg(OffsetReg64)
      .addReg(RegSaveReg);

    // Compute the offset for the next argument
    unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
    BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
      .addReg(OffsetReg)
      .addImm(UseFPOffset ? 16 : 8);

    // Store it back into the va_list.
    BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
      .addOperand(Base)
      .addOperand(Scale)
      .addOperand(Index)
      .addDisp(Disp, UseFPOffset ? 4 : 0)
      .addOperand(Segment)
      .addReg(NextOffsetReg)
      .setMemRefs(MMOBegin, MMOEnd);

    // Jump to endMBB
    BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
      .addMBB(endMBB);
  }

  //
  // Emit code to use overflow area
  //

  // Load the overflow_area address into a register.
  unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
  BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
    .addOperand(Base)
    .addOperand(Scale)
    .addOperand(Index)
    .addDisp(Disp, 8)
    .addOperand(Segment)
    .setMemRefs(MMOBegin, MMOEnd);

  // If we need to align it, do so. Otherwise, just copy the address
  // to OverflowDestReg.
  if (NeedsAlign) {
    // Align the overflow address
    assert((Align & (Align-1)) == 0 && "Alignment must be a power of 2");
    unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);

    // aligned_addr = (addr + (align-1)) & ~(align-1)
    BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
      .addReg(OverflowAddrReg)
      .addImm(Align-1);

    BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
      .addReg(TmpReg)
      .addImm(~(uint64_t)(Align-1));
  } else {
    BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
      .addReg(OverflowAddrReg);
  }

  // Compute the next overflow address after this argument.
  // (the overflow address should be kept 8-byte aligned)
  unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
  BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
    .addReg(OverflowDestReg)
    .addImm(ArgSizeA8);

  // Store the new overflow address.
  BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
    .addOperand(Base)
    .addOperand(Scale)
    .addOperand(Index)
    .addDisp(Disp, 8)
    .addOperand(Segment)
    .addReg(NextAddrReg)
    .setMemRefs(MMOBegin, MMOEnd);

  // If we branched, emit the PHI to the front of endMBB.
  if (offsetMBB) {
    BuildMI(*endMBB, endMBB->begin(), DL,
            TII->get(X86::PHI), DestReg)
      .addReg(OffsetDestReg).addMBB(offsetMBB)
      .addReg(OverflowDestReg).addMBB(overflowMBB);
  }

  // Erase the pseudo instruction
  MI->eraseFromParent();

  return endMBB;
}
MachineBasicBlock *
X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
                                                 MachineInstr *MI,
                                                 MachineBasicBlock *MBB) const {
  // Emit code to save XMM registers to the stack. The ABI says that the
  // number of registers to save is given in %al, so it's theoretically
  // possible to do an indirect jump trick to avoid saving all of them,
  // however this code takes a simpler approach and just executes all
  // of the stores if %al is non-zero. It's less code, and it's probably
  // easier on the hardware branch predictor, and stores aren't all that
  // expensive anyway.
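  //
  // Example (illustrative): at a call like printf("%f", x), the caller sets
  // %al to the number of vector registers used (here "movb $1, %al"), so a
  // zero %al lets the callee skip all eight XMM spills below.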
  // Create the new basic blocks. One block contains all the XMM stores,
  // and one block is the final destination regardless of whether any
  // stores were performed.
  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
  MachineFunction *F = MBB->getParent();
  MachineFunction::iterator MBBIter = MBB;
  ++MBBIter;
  MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
  F->insert(MBBIter, XMMSaveMBB);
  F->insert(MBBIter, EndMBB);

  // Transfer the remainder of MBB and its successor edges to EndMBB.
  EndMBB->splice(EndMBB->begin(), MBB,
                 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
  EndMBB->transferSuccessorsAndUpdatePHIs(MBB);

  // The original block will now fall through to the XMM save block.
  MBB->addSuccessor(XMMSaveMBB);
  // The XMMSaveMBB will fall through to the end block.
  XMMSaveMBB->addSuccessor(EndMBB);

  // Now add the instructions.
  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  DebugLoc DL = MI->getDebugLoc();

  unsigned CountReg = MI->getOperand(0).getReg();
  int64_t RegSaveFrameIndex = MI->getOperand(1).getImm();
  int64_t VarArgsFPOffset = MI->getOperand(2).getImm();

  if (!Subtarget->isTargetWin64()) {
    // If %al is 0, branch around the XMM save block.
    BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
    BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB);
    MBB->addSuccessor(EndMBB);
  }

  // Make sure the last operand is EFLAGS, which gets clobbered by the branch
  // that was just emitted, but clearly shouldn't be "saved".
  assert((MI->getNumOperands() <= 3 ||
          !MI->getOperand(MI->getNumOperands() - 1).isReg() ||
          MI->getOperand(MI->getNumOperands() - 1).getReg() == X86::EFLAGS)
         && "Expected last argument to be EFLAGS");
  unsigned MOVOpc = Subtarget->hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr;
  // In the XMM save block, save all the XMM argument registers.
  for (int i = 3, e = MI->getNumOperands() - 1; i != e; ++i) {
    int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
    MachineMemOperand *MMO =
      F->getMachineMemOperand(
          MachinePointerInfo::getFixedStack(RegSaveFrameIndex, Offset),
          MachineMemOperand::MOStore,
          /*Size=*/16, /*Align=*/16);
    BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
      .addFrameIndex(RegSaveFrameIndex)
      .addImm(/*Scale=*/1)
      .addReg(/*IndexReg=*/0)
      .addImm(/*Disp=*/Offset)
      .addReg(/*Segment=*/0)
      .addReg(MI->getOperand(i).getReg())
      .addMemOperand(MMO);
  }

  MI->eraseFromParent(); // The pseudo instruction is gone now.

  return EndMBB;
}
// The EFLAGS operand of SelectItr might be missing a kill marker
// because there were multiple uses of EFLAGS, and ISel didn't know
// which to mark. Figure out whether SelectItr should have had a
// kill marker, and set it if it should. Returns the correct kill
// marker value.
static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
                                     MachineBasicBlock* BB,
                                     const TargetRegisterInfo* TRI) {
  // Scan forward through BB for a use/def of EFLAGS.
  MachineBasicBlock::iterator miI(std::next(SelectItr));
  for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
    const MachineInstr& mi = *miI;
    if (mi.readsRegister(X86::EFLAGS))
      return false;
    if (mi.definesRegister(X86::EFLAGS))
      break; // Should have kill-flag - update below.
  }

  // If we hit the end of the block, check whether EFLAGS is live into a
  // successor.
  if (miI == BB->end()) {
    for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
                                          sEnd = BB->succ_end();
         sItr != sEnd; ++sItr) {
      MachineBasicBlock* succ = *sItr;
      if (succ->isLiveIn(X86::EFLAGS))
        return false;
    }
  }

  // We found a def, or hit the end of the basic block and EFLAGS wasn't live
  // out. SelectMI should have a kill flag on EFLAGS.
  SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
  return true;
}
MachineBasicBlock *
X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
                                     MachineBasicBlock *BB) const {
  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  DebugLoc DL = MI->getDebugLoc();

  // To "insert" a SELECT_CC instruction, we actually have to insert the
  // diamond control-flow pattern. The incoming instruction knows the
  // destination vreg to set, the condition code register to branch on, the
  // true/false values to select between, and a branch opcode to use.
  const BasicBlock *LLVM_BB = BB->getBasicBlock();
  MachineFunction::iterator It = BB;
  ++It;

  //  thisMBB:
  //  ...
  //   TrueVal = ...
  //   cmpTY ccX, r1, r2
  //   bCC copy1MBB
  //   fallthrough --> copy0MBB
  MachineBasicBlock *thisMBB = BB;
  MachineFunction *F = BB->getParent();
  MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
  F->insert(It, copy0MBB);
  F->insert(It, sinkMBB);

  // If the EFLAGS register isn't dead in the terminator, then claim that it's
  // live into the sink and copy blocks.
  const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
  if (!MI->killsRegister(X86::EFLAGS) &&
      !checkAndUpdateEFLAGSKill(MI, BB, TRI)) {
    copy0MBB->addLiveIn(X86::EFLAGS);
    sinkMBB->addLiveIn(X86::EFLAGS);
  }

  // Transfer the remainder of BB and its successor edges to sinkMBB.
  sinkMBB->splice(sinkMBB->begin(), BB,
                  std::next(MachineBasicBlock::iterator(MI)), BB->end());
  sinkMBB->transferSuccessorsAndUpdatePHIs(BB);

  // Add the true and fallthrough blocks as its successors.
  BB->addSuccessor(copy0MBB);
  BB->addSuccessor(sinkMBB);

  // Create the conditional branch instruction.
  unsigned Opc =
    X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm());
  BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);

  //  copy0MBB:
  //   %FalseValue = ...
  //   # fallthrough to sinkMBB
  copy0MBB->addSuccessor(sinkMBB);

  //  sinkMBB:
  //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
  //  ...
  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
          TII->get(X86::PHI), MI->getOperand(0).getReg())
    .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
    .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);

  MI->eraseFromParent(); // The pseudo instruction is gone now.
  return sinkMBB;
}
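
// Note (illustrative): this inserter expands the CMOV-style select pseudos
// for values the hardware cannot cmov directly (e.g. floating-point selects
// when no suitable conditional move exists); the select simply becomes a
// branch diamond with a PHI in the join block.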
MachineBasicBlock *
X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI,
                                        MachineBasicBlock *BB) const {
  MachineFunction *MF = BB->getParent();
  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  DebugLoc DL = MI->getDebugLoc();
  const BasicBlock *LLVM_BB = BB->getBasicBlock();

  assert(MF->shouldSplitStack());

  const bool Is64Bit = Subtarget->is64Bit();
  const bool IsLP64 = Subtarget->isTarget64BitLP64();

  const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
  const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
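  // Note (illustrative): these are the TCB slots the split-stack ABI reserves
  // for the stack limit - e.g. %fs:0x70 on x86-64 LP64 - so the comparison
  // emitted below behaves like "cmpq %fs:0x70, <sp - size>; jg mallocMBB",
  // taking the runtime path when the new stack pointer would cross the limit.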

  // BB:
  //  ... [Till the alloca]
  // If stacklet is not large enough, jump to mallocMBB
  //
  // bumpMBB:
  //  Allocate by subtracting from RSP
  //  Jump to continueMBB
  //
  // mallocMBB:
  //  Allocate by call to runtime
  //
  // continueMBB:
  //  ...
  //  [rest of original BB]
  //

  MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);

  MachineRegisterInfo &MRI = MF->getRegInfo();
  const TargetRegisterClass *AddrRegClass =
    getRegClassFor(getPointerTy());

  unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
           bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
           tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
           SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
           sizeVReg = MI->getOperand(1).getReg(),
           physSPReg = IsLP64 || Subtarget->isTargetNaCl64() ? X86::RSP : X86::ESP;

  MachineFunction::iterator MBBIter = BB;
  ++MBBIter;

  MF->insert(MBBIter, bumpMBB);
  MF->insert(MBBIter, mallocMBB);
  MF->insert(MBBIter, continueMBB);

  continueMBB->splice(continueMBB->begin(), BB,
                      std::next(MachineBasicBlock::iterator(MI)), BB->end());
  continueMBB->transferSuccessorsAndUpdatePHIs(BB);

  // Add code to the main basic block to check if the stack limit has been hit,
  // and if so, jump to mallocMBB otherwise to bumpMBB.
  BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
  BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
    .addReg(tmpSPVReg).addReg(sizeVReg);
  BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
    .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
    .addReg(SPLimitVReg);
  BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB);

  // bumpMBB simply decreases the stack pointer, since we know the current
  // stacklet has enough space.
  BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
    .addReg(SPLimitVReg);
  BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
    .addReg(SPLimitVReg);
  BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);

  // Calls into a routine in libgcc to allocate more space from the heap.
  const uint32_t *RegMask =
    Subtarget->getRegisterInfo()->getCallPreservedMask(CallingConv::C);
  if (IsLP64) {
    BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
      .addReg(sizeVReg);
    BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
      .addExternalSymbol("__morestack_allocate_stack_space")
      .addRegMask(RegMask)
      .addReg(X86::RDI, RegState::Implicit)
      .addReg(X86::RAX, RegState::ImplicitDefine);
  } else if (Is64Bit) {
    BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
      .addReg(sizeVReg);
    BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
      .addExternalSymbol("__morestack_allocate_stack_space")
      .addRegMask(RegMask)
      .addReg(X86::EDI, RegState::Implicit)
      .addReg(X86::EAX, RegState::ImplicitDefine);
  } else {
    BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
      .addImm(12);
    BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
    BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
      .addExternalSymbol("__morestack_allocate_stack_space")
      .addRegMask(RegMask)
      .addReg(X86::EAX, RegState::ImplicitDefine);
  }

  if (!Is64Bit)
    BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
      .addImm(16);

  BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
    .addReg(IsLP64 ? X86::RAX : X86::EAX);
  BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);

  // Set up the CFG correctly.
  BB->addSuccessor(bumpMBB);
  BB->addSuccessor(mallocMBB);
  mallocMBB->addSuccessor(continueMBB);
  bumpMBB->addSuccessor(continueMBB);

  // Take care of the PHI nodes.
  BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
          MI->getOperand(0).getReg())
    .addReg(mallocPtrVReg).addMBB(mallocMBB)
    .addReg(bumpSPPtrVReg).addMBB(bumpMBB);

  // Delete the original pseudo instruction.
  MI->eraseFromParent();

  // And we're done.
  return continueMBB;
}
MachineBasicBlock *
X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI,
                                        MachineBasicBlock *BB) const {
  DebugLoc DL = MI->getDebugLoc();

  assert(!Subtarget->isTargetMachO());

  X86FrameLowering::emitStackProbeCall(*BB->getParent(), *BB, MI, DL);

  MI->eraseFromParent(); // The pseudo instruction is gone now.
  return BB;
}
MachineBasicBlock *
X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI,
                                      MachineBasicBlock *BB) const {
  // This is pretty easy. We're taking the value that we received from
  // our load from the relocation, sticking it in either RDI (x86-64)
  // or EAX and doing an indirect call. The return value will then
  // be in the normal return register.
  MachineFunction *F = BB->getParent();
  const X86InstrInfo *TII = Subtarget->getInstrInfo();
  DebugLoc DL = MI->getDebugLoc();

  assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?");
  assert(MI->getOperand(3).isGlobal() && "This should be a global");

  // Get a register mask for the lowered call.
  // FIXME: The 32-bit calls have non-standard calling conventions. Use a
  // proper register mask.
  const uint32_t *RegMask =
    Subtarget->getRegisterInfo()->getCallPreservedMask(CallingConv::C);
  if (Subtarget->is64Bit()) {
    MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
                                      TII->get(X86::MOV64rm), X86::RDI)
    .addReg(X86::RIP)
    .addImm(0).addReg(0)
    .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
                      MI->getOperand(3).getTargetFlags())
    .addReg(0);
    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
    addDirectMem(MIB, X86::RDI);
    MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
  } else if (F->getTarget().getRelocationModel() != Reloc::PIC_) {
    MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
                                      TII->get(X86::MOV32rm), X86::EAX)
    .addReg(0)
    .addImm(0).addReg(0)
    .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
                      MI->getOperand(3).getTargetFlags())
    .addReg(0);
    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
    addDirectMem(MIB, X86::EAX);
    MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
  } else {
    MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
                                      TII->get(X86::MOV32rm), X86::EAX)
    .addReg(TII->getGlobalBaseReg(F))
    .addImm(0).addReg(0)
    .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
                      MI->getOperand(3).getTargetFlags())
    .addReg(0);
    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
    addDirectMem(MIB, X86::EAX);
    MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
  }

  MI->eraseFromParent(); // The pseudo instruction is gone now.
  return BB;
}
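
// Example (illustrative): on Darwin x86-64 this produces roughly
//   movq _var@TLVP(%rip), %rdi
//   callq *(%rdi)
// i.e. the TLV descriptor's getter is invoked with the descriptor itself in
// %rdi, and the address of the thread-local variable comes back in %rax.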
21652 MachineBasicBlock *
21653 X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
21654 MachineBasicBlock *MBB) const {
21655 DebugLoc DL = MI->getDebugLoc();
21656 MachineFunction *MF = MBB->getParent();
21657 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
21658 MachineRegisterInfo &MRI = MF->getRegInfo();
21660 const BasicBlock *BB = MBB->getBasicBlock();
21661 MachineFunction::iterator I = MBB;
21664 // Memory Reference
21665 MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
21666 MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
21669 unsigned MemOpndSlot = 0;
21671 unsigned CurOp = 0;
21673 DstReg = MI->getOperand(CurOp++).getReg();
21674 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
21675 assert(RC->hasType(MVT::i32) && "Invalid destination!");
21676 unsigned mainDstReg = MRI.createVirtualRegister(RC);
21677 unsigned restoreDstReg = MRI.createVirtualRegister(RC);
21679 MemOpndSlot = CurOp;
21681 MVT PVT = getPointerTy();
21682 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
21683 "Invalid Pointer Size!");
21685 // For v = setjmp(buf), we generate
21688 // buf[LabelOffset] = restoreMBB
21689 // SjLjSetup restoreMBB
21695 // v = phi(main, restore)
21698 // if base pointer being used, load it from frame
21701 MachineBasicBlock *thisMBB = MBB;
21702 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
21703 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
21704 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
21705 MF->insert(I, mainMBB);
21706 MF->insert(I, sinkMBB);
21707 MF->push_back(restoreMBB);
21709 MachineInstrBuilder MIB;
21711 // Transfer the remainder of BB and its successor edges to sinkMBB.
21712 sinkMBB->splice(sinkMBB->begin(), MBB,
21713 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
21714 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
21717 unsigned PtrStoreOpc = 0;
21718 unsigned LabelReg = 0;
21719 const int64_t LabelOffset = 1 * PVT.getStoreSize();
21720 Reloc::Model RM = MF->getTarget().getRelocationModel();
21721 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
21722 (RM == Reloc::Static || RM == Reloc::DynamicNoPIC);
21724 // Prepare IP either in reg or imm.
21725 if (!UseImmLabel) {
21726 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
21727 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
21728 LabelReg = MRI.createVirtualRegister(PtrRC);
21729 if (Subtarget->is64Bit()) {
21730 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
21734 .addMBB(restoreMBB)
21737 const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
21738 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
21739 .addReg(XII->getGlobalBaseReg(MF))
21742 .addMBB(restoreMBB, Subtarget->ClassifyBlockAddressReference())
21746 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
21748 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
21749 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
21750 if (i == X86::AddrDisp)
21751 MIB.addDisp(MI->getOperand(MemOpndSlot + i), LabelOffset);
21753 MIB.addOperand(MI->getOperand(MemOpndSlot + i));
21756 MIB.addReg(LabelReg);
21758 MIB.addMBB(restoreMBB);
21759 MIB.setMemRefs(MMOBegin, MMOEnd);
21761 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
21762 .addMBB(restoreMBB);
21764 const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
21765 MIB.addRegMask(RegInfo->getNoPreservedMask());
21766 thisMBB->addSuccessor(mainMBB);
21767 thisMBB->addSuccessor(restoreMBB);
21771 BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
21772 mainMBB->addSuccessor(sinkMBB);
21775 BuildMI(*sinkMBB, sinkMBB->begin(), DL,
21776 TII->get(X86::PHI), DstReg)
21777 .addReg(mainDstReg).addMBB(mainMBB)
21778 .addReg(restoreDstReg).addMBB(restoreMBB);
21781 if (RegInfo->hasBasePointer(*MF)) {
21782 const bool Uses64BitFramePtr =
21783 Subtarget->isTarget64BitLP64() || Subtarget->isTargetNaCl64();
21784 X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
21785 X86FI->setRestoreBasePointer(MF);
21786 unsigned FramePtr = RegInfo->getFrameRegister(*MF);
21787 unsigned BasePtr = RegInfo->getBaseRegister();
21788 unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
21789 addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
21790 FramePtr, true, X86FI->getRestoreBasePointerOffset())
        .setMIFlag(MachineInstr::FrameSetup);
  }
21793 BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
21794 BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
21795 restoreMBB->addSuccessor(sinkMBB);
  MI->eraseFromParent();
  return sinkMBB;
}

21801 MachineBasicBlock *
21802 X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
21803 MachineBasicBlock *MBB) const {
21804 DebugLoc DL = MI->getDebugLoc();
21805 MachineFunction *MF = MBB->getParent();
21806 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
21807 MachineRegisterInfo &MRI = MF->getRegInfo();
21809 // Memory Reference
21810 MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
21811 MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
21813 MVT PVT = getPointerTy();
21814 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
21815 "Invalid Pointer Size!");
21817 const TargetRegisterClass *RC =
21818 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
21819 unsigned Tmp = MRI.createVirtualRegister(RC);
21820 // Since FP is only updated here but NOT referenced, it's treated as GPR.
21821 const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
21822 unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
21823 unsigned SP = RegInfo->getStackRegister();
21825 MachineInstrBuilder MIB;
21827 const int64_t LabelOffset = 1 * PVT.getStoreSize();
21828 const int64_t SPOffset = 2 * PVT.getStoreSize();
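  // The jump buffer holds, in pointer-sized slots, the frame pointer at
  // offset 0, the resume IP at offset 1, and the stack pointer at offset 2.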
21830 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
21831 unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
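
  // Reload FP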
21834 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP);
21835 for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
21836 MIB.addOperand(MI->getOperand(i));
21837 MIB.setMemRefs(MMOBegin, MMOEnd);
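  // Reload IP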
21839 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
21840 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
21841 if (i == X86::AddrDisp)
21842 MIB.addDisp(MI->getOperand(i), LabelOffset);
    else
      MIB.addOperand(MI->getOperand(i));
  }
21846 MIB.setMemRefs(MMOBegin, MMOEnd);
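  // Reload SP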
21848 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP);
21849 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
21850 if (i == X86::AddrDisp)
21851 MIB.addDisp(MI->getOperand(i), SPOffset);
    else
      MIB.addOperand(MI->getOperand(i));
  }
21855 MIB.setMemRefs(MMOBegin, MMOEnd);
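  // Jump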
21857 BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
  MI->eraseFromParent();
  return MBB;
}

21863 // Replace 213-type (isel default) FMA3 instructions with 231-type for
21864 // accumulator loops. Writing back to the accumulator allows the coalescer
21865 // to remove extra copies in the loop.
21866 MachineBasicBlock *
21867 X86TargetLowering::emitFMA3Instr(MachineInstr *MI,
21868 MachineBasicBlock *MBB) const {
21869 MachineOperand &AddendOp = MI->getOperand(3);
21871 // Bail out early if the addend isn't a register - we can't switch these.
  if (!AddendOp.isReg())
    return MBB;
21875 MachineFunction &MF = *MBB->getParent();
21876 MachineRegisterInfo &MRI = MF.getRegInfo();
21878 // Check whether the addend is defined by a PHI:
21879 assert(MRI.hasOneDef(AddendOp.getReg()) && "Multiple defs in SSA?");
21880 MachineInstr &AddendDef = *MRI.def_instr_begin(AddendOp.getReg());
  if (!AddendDef.isPHI())
    return MBB;
  // Look for the following pattern:
  //
  //   %addend = phi [%entry, 0], [%loop, %result]
  //   ...
  //   %result<tied1> = FMA213 %m2<tied0>, %m1, %addend
  //
  // and transform it into:
  //
  //   %addend = phi [%entry, 0], [%loop, %result]
  //   ...
  //   %result<tied1> = FMA231 %addend<tied0>, %m1, %m2
21896 for (unsigned i = 1, e = AddendDef.getNumOperands(); i < e; i += 2) {
21897 assert(AddendDef.getOperand(i).isReg());
21898 MachineOperand PHISrcOp = AddendDef.getOperand(i);
21899 MachineInstr &PHISrcInst = *MRI.def_instr_begin(PHISrcOp.getReg());
21900 if (&PHISrcInst == MI) {
21901 // Found a matching instruction.
21902 unsigned NewFMAOpc = 0;
21903 switch (MI->getOpcode()) {
21904 case X86::VFMADDPDr213r: NewFMAOpc = X86::VFMADDPDr231r; break;
21905 case X86::VFMADDPSr213r: NewFMAOpc = X86::VFMADDPSr231r; break;
21906 case X86::VFMADDSDr213r: NewFMAOpc = X86::VFMADDSDr231r; break;
21907 case X86::VFMADDSSr213r: NewFMAOpc = X86::VFMADDSSr231r; break;
21908 case X86::VFMSUBPDr213r: NewFMAOpc = X86::VFMSUBPDr231r; break;
21909 case X86::VFMSUBPSr213r: NewFMAOpc = X86::VFMSUBPSr231r; break;
21910 case X86::VFMSUBSDr213r: NewFMAOpc = X86::VFMSUBSDr231r; break;
21911 case X86::VFMSUBSSr213r: NewFMAOpc = X86::VFMSUBSSr231r; break;
21912 case X86::VFNMADDPDr213r: NewFMAOpc = X86::VFNMADDPDr231r; break;
21913 case X86::VFNMADDPSr213r: NewFMAOpc = X86::VFNMADDPSr231r; break;
21914 case X86::VFNMADDSDr213r: NewFMAOpc = X86::VFNMADDSDr231r; break;
21915 case X86::VFNMADDSSr213r: NewFMAOpc = X86::VFNMADDSSr231r; break;
21916 case X86::VFNMSUBPDr213r: NewFMAOpc = X86::VFNMSUBPDr231r; break;
21917 case X86::VFNMSUBPSr213r: NewFMAOpc = X86::VFNMSUBPSr231r; break;
21918 case X86::VFNMSUBSDr213r: NewFMAOpc = X86::VFNMSUBSDr231r; break;
21919 case X86::VFNMSUBSSr213r: NewFMAOpc = X86::VFNMSUBSSr231r; break;
21920 case X86::VFMADDSUBPDr213r: NewFMAOpc = X86::VFMADDSUBPDr231r; break;
21921 case X86::VFMADDSUBPSr213r: NewFMAOpc = X86::VFMADDSUBPSr231r; break;
21922 case X86::VFMSUBADDPDr213r: NewFMAOpc = X86::VFMSUBADDPDr231r; break;
21923 case X86::VFMSUBADDPSr213r: NewFMAOpc = X86::VFMSUBADDPSr231r; break;
21925 case X86::VFMADDPDr213rY: NewFMAOpc = X86::VFMADDPDr231rY; break;
21926 case X86::VFMADDPSr213rY: NewFMAOpc = X86::VFMADDPSr231rY; break;
21927 case X86::VFMSUBPDr213rY: NewFMAOpc = X86::VFMSUBPDr231rY; break;
21928 case X86::VFMSUBPSr213rY: NewFMAOpc = X86::VFMSUBPSr231rY; break;
21929 case X86::VFNMADDPDr213rY: NewFMAOpc = X86::VFNMADDPDr231rY; break;
21930 case X86::VFNMADDPSr213rY: NewFMAOpc = X86::VFNMADDPSr231rY; break;
21931 case X86::VFNMSUBPDr213rY: NewFMAOpc = X86::VFNMSUBPDr231rY; break;
21932 case X86::VFNMSUBPSr213rY: NewFMAOpc = X86::VFNMSUBPSr231rY; break;
21933 case X86::VFMADDSUBPDr213rY: NewFMAOpc = X86::VFMADDSUBPDr231rY; break;
21934 case X86::VFMADDSUBPSr213rY: NewFMAOpc = X86::VFMADDSUBPSr231rY; break;
21935 case X86::VFMSUBADDPDr213rY: NewFMAOpc = X86::VFMSUBADDPDr231rY; break;
21936 case X86::VFMSUBADDPSr213rY: NewFMAOpc = X86::VFMSUBADDPSr231rY; break;
      default: llvm_unreachable("Unrecognized FMA variant.");
      }
21940 const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
21941 MachineInstrBuilder MIB =
21942 BuildMI(MF, MI->getDebugLoc(), TII.get(NewFMAOpc))
21943 .addOperand(MI->getOperand(0))
21944 .addOperand(MI->getOperand(3))
21945 .addOperand(MI->getOperand(2))
21946 .addOperand(MI->getOperand(1));
21947 MBB->insert(MachineBasicBlock::iterator(MI), MIB);
      MI->eraseFromParent();
    }
  }

  return MBB;
}

21955 MachineBasicBlock *
21956 X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
21957 MachineBasicBlock *BB) const {
21958 switch (MI->getOpcode()) {
21959 default: llvm_unreachable("Unexpected instr type to insert");
21960 case X86::TAILJMPd64:
21961 case X86::TAILJMPr64:
21962 case X86::TAILJMPm64:
21963 case X86::TAILJMPd64_REX:
21964 case X86::TAILJMPr64_REX:
21965 case X86::TAILJMPm64_REX:
21966 llvm_unreachable("TAILJMP64 would not be touched here.");
21967 case X86::TCRETURNdi64:
21968 case X86::TCRETURNri64:
  case X86::TCRETURNmi64:
    return BB;
21971 case X86::WIN_ALLOCA:
21972 return EmitLoweredWinAlloca(MI, BB);
21973 case X86::SEG_ALLOCA_32:
21974 case X86::SEG_ALLOCA_64:
21975 return EmitLoweredSegAlloca(MI, BB);
21976 case X86::TLSCall_32:
21977 case X86::TLSCall_64:
21978 return EmitLoweredTLSCall(MI, BB);
21979 case X86::CMOV_GR8:
21980 case X86::CMOV_FR32:
21981 case X86::CMOV_FR64:
21982 case X86::CMOV_V4F32:
21983 case X86::CMOV_V2F64:
21984 case X86::CMOV_V2I64:
21985 case X86::CMOV_V8F32:
21986 case X86::CMOV_V4F64:
21987 case X86::CMOV_V4I64:
21988 case X86::CMOV_V16F32:
21989 case X86::CMOV_V8F64:
21990 case X86::CMOV_V8I64:
21991 case X86::CMOV_GR16:
21992 case X86::CMOV_GR32:
21993 case X86::CMOV_RFP32:
21994 case X86::CMOV_RFP64:
21995 case X86::CMOV_RFP80:
21996 return EmitLoweredSelect(MI, BB);
21998 case X86::FP32_TO_INT16_IN_MEM:
21999 case X86::FP32_TO_INT32_IN_MEM:
22000 case X86::FP32_TO_INT64_IN_MEM:
22001 case X86::FP64_TO_INT16_IN_MEM:
22002 case X86::FP64_TO_INT32_IN_MEM:
22003 case X86::FP64_TO_INT64_IN_MEM:
22004 case X86::FP80_TO_INT16_IN_MEM:
22005 case X86::FP80_TO_INT32_IN_MEM:
22006 case X86::FP80_TO_INT64_IN_MEM: {
22007 MachineFunction *F = BB->getParent();
22008 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
22009 DebugLoc DL = MI->getDebugLoc();
22011 // Change the floating point control register to use "round towards zero"
22012 // mode when truncating to an integer value.
22013 int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false);
22014 addFrameReference(BuildMI(*BB, MI, DL,
22015 TII->get(X86::FNSTCW16m)), CWFrameIdx);
22017 // Load the old value of the high byte of the control word...
    unsigned OldCW =
      F->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
                      CWFrameIdx);
22023 // Set the high part to be round to zero...
    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
      .addImm(0xC7F);
22027 // Reload the modified control word now...
22028 addFrameReference(BuildMI(*BB, MI, DL,
22029 TII->get(X86::FLDCW16m)), CWFrameIdx);
22031 // Restore the memory image of control word to original value
    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
      .addReg(OldCW);
22035 // Get the X86 opcode to use.
    unsigned Opc;
    switch (MI->getOpcode()) {
22038 default: llvm_unreachable("illegal opcode!");
22039 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
22040 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
22041 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
22042 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
22043 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
22044 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
22045 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
22046 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
    case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
    }

    X86AddressMode AM;
    MachineOperand &Op = MI->getOperand(0);
    if (Op.isReg()) {
      AM.BaseType = X86AddressMode::RegBase;
      AM.Base.Reg = Op.getReg();
    } else {
      AM.BaseType = X86AddressMode::FrameIndexBase;
      AM.Base.FrameIndex = Op.getIndex();
    }
    Op = MI->getOperand(1);
    if (Op.isImm())
      AM.Scale = Op.getImm();
    Op = MI->getOperand(2);
    if (Op.isImm())
      AM.IndexReg = Op.getImm();
    Op = MI->getOperand(3);
    if (Op.isGlobal()) {
      AM.GV = Op.getGlobal();
    } else {
      AM.Disp = Op.getImm();
    }
22071 addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
22072 .addReg(MI->getOperand(X86::AddrNumOperands).getReg());
22074 // Reload the original control word now.
22075 addFrameReference(BuildMI(*BB, MI, DL,
22076 TII->get(X86::FLDCW16m)), CWFrameIdx);
    MI->eraseFromParent(); // The pseudo instruction is gone now.
    return BB;
  }

22081 // String/text processing lowering.
22082 case X86::PCMPISTRM128REG:
22083 case X86::VPCMPISTRM128REG:
22084 case X86::PCMPISTRM128MEM:
22085 case X86::VPCMPISTRM128MEM:
22086 case X86::PCMPESTRM128REG:
22087 case X86::VPCMPESTRM128REG:
22088 case X86::PCMPESTRM128MEM:
22089 case X86::VPCMPESTRM128MEM:
22090 assert(Subtarget->hasSSE42() &&
22091 "Target must have SSE4.2 or AVX features enabled");
22092 return EmitPCMPSTRM(MI, BB, Subtarget->getInstrInfo());
22094 // String/text processing lowering.
22095 case X86::PCMPISTRIREG:
22096 case X86::VPCMPISTRIREG:
22097 case X86::PCMPISTRIMEM:
22098 case X86::VPCMPISTRIMEM:
22099 case X86::PCMPESTRIREG:
22100 case X86::VPCMPESTRIREG:
22101 case X86::PCMPESTRIMEM:
22102 case X86::VPCMPESTRIMEM:
22103 assert(Subtarget->hasSSE42() &&
22104 "Target must have SSE4.2 or AVX features enabled");
22105 return EmitPCMPSTRI(MI, BB, Subtarget->getInstrInfo());
22107 // Thread synchronization.
  case X86::MONITOR:
    return EmitMonitor(MI, BB, Subtarget);
    // xbegin
  case X86::XBEGIN:
    return EmitXBegin(MI, BB, Subtarget->getInstrInfo());
22115 case X86::VASTART_SAVE_XMM_REGS:
22116 return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
22118 case X86::VAARG_64:
22119 return EmitVAARG64WithCustomInserter(MI, BB);
22121 case X86::EH_SjLj_SetJmp32:
22122 case X86::EH_SjLj_SetJmp64:
22123 return emitEHSjLjSetJmp(MI, BB);
22125 case X86::EH_SjLj_LongJmp32:
22126 case X86::EH_SjLj_LongJmp64:
22127 return emitEHSjLjLongJmp(MI, BB);
22129 case TargetOpcode::STATEPOINT:
22130 // As an implementation detail, STATEPOINT shares the STACKMAP format at
22131 // this point in the process. We diverge later.
22132 return emitPatchPoint(MI, BB);
22134 case TargetOpcode::STACKMAP:
22135 case TargetOpcode::PATCHPOINT:
22136 return emitPatchPoint(MI, BB);
22138 case X86::VFMADDPDr213r:
22139 case X86::VFMADDPSr213r:
22140 case X86::VFMADDSDr213r:
22141 case X86::VFMADDSSr213r:
22142 case X86::VFMSUBPDr213r:
22143 case X86::VFMSUBPSr213r:
22144 case X86::VFMSUBSDr213r:
22145 case X86::VFMSUBSSr213r:
22146 case X86::VFNMADDPDr213r:
22147 case X86::VFNMADDPSr213r:
22148 case X86::VFNMADDSDr213r:
22149 case X86::VFNMADDSSr213r:
22150 case X86::VFNMSUBPDr213r:
22151 case X86::VFNMSUBPSr213r:
22152 case X86::VFNMSUBSDr213r:
22153 case X86::VFNMSUBSSr213r:
22154 case X86::VFMADDSUBPDr213r:
22155 case X86::VFMADDSUBPSr213r:
22156 case X86::VFMSUBADDPDr213r:
22157 case X86::VFMSUBADDPSr213r:
22158 case X86::VFMADDPDr213rY:
22159 case X86::VFMADDPSr213rY:
22160 case X86::VFMSUBPDr213rY:
22161 case X86::VFMSUBPSr213rY:
22162 case X86::VFNMADDPDr213rY:
22163 case X86::VFNMADDPSr213rY:
22164 case X86::VFNMSUBPDr213rY:
22165 case X86::VFNMSUBPSr213rY:
22166 case X86::VFMADDSUBPDr213rY:
22167 case X86::VFMADDSUBPSr213rY:
22168 case X86::VFMSUBADDPDr213rY:
22169 case X86::VFMSUBADDPSr213rY:
    return emitFMA3Instr(MI, BB);
  }
}

22174 //===----------------------------------------------------------------------===//
22175 // X86 Optimization Hooks
22176 //===----------------------------------------------------------------------===//
void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
                                                      APInt &KnownZero,
                                                      APInt &KnownOne,
                                                      const SelectionDAG &DAG,
                                                      unsigned Depth) const {
22183 unsigned BitWidth = KnownZero.getBitWidth();
22184 unsigned Opc = Op.getOpcode();
22185 assert((Opc >= ISD::BUILTIN_OP_END ||
22186 Opc == ISD::INTRINSIC_WO_CHAIN ||
22187 Opc == ISD::INTRINSIC_W_CHAIN ||
22188 Opc == ISD::INTRINSIC_VOID) &&
22189 "Should use MaskedValueIsZero if you don't know whether Op"
22190 " is a target node!");
22192 KnownZero = KnownOne = APInt(BitWidth, 0); // Don't know anything.
  switch (Opc) {
  default: break;
  case X86ISD::ADD:
  case X86ISD::SUB:
  case X86ISD::ADC:
  case X86ISD::SBB:
  case X86ISD::SMUL:
  case X86ISD::UMUL:
  case X86ISD::INC:
  case X86ISD::DEC:
  case X86ISD::OR:
  case X86ISD::XOR:
  case X86ISD::AND:
    // These nodes' second result is a boolean.
    if (Op.getResNo() == 0)
      break;
    // Fallthrough
22210 case X86ISD::SETCC:
    KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
    break;
22213 case ISD::INTRINSIC_WO_CHAIN: {
22214 unsigned IntId = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
    unsigned NumLoBits = 0;
    switch (IntId) {
    default: break;
    case Intrinsic::x86_sse_movmsk_ps:
22219 case Intrinsic::x86_avx_movmsk_ps_256:
22220 case Intrinsic::x86_sse2_movmsk_pd:
22221 case Intrinsic::x86_avx_movmsk_pd_256:
22222 case Intrinsic::x86_mmx_pmovmskb:
22223 case Intrinsic::x86_sse2_pmovmskb_128:
22224 case Intrinsic::x86_avx2_pmovmskb: {
      // High bits of movmskp{s|d}, pmovmskb are known zero.
      switch (IntId) {
      default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
22228 case Intrinsic::x86_sse_movmsk_ps: NumLoBits = 4; break;
22229 case Intrinsic::x86_avx_movmsk_ps_256: NumLoBits = 8; break;
22230 case Intrinsic::x86_sse2_movmsk_pd: NumLoBits = 2; break;
22231 case Intrinsic::x86_avx_movmsk_pd_256: NumLoBits = 4; break;
22232 case Intrinsic::x86_mmx_pmovmskb: NumLoBits = 8; break;
22233 case Intrinsic::x86_sse2_pmovmskb_128: NumLoBits = 16; break;
22234 case Intrinsic::x86_avx2_pmovmskb: NumLoBits = 32; break;
      KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits);
      break;
    }
    }
    break;
  }
  }
}

unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
    SDValue Op,
    const SelectionDAG &,
    unsigned Depth) const {
22248 unsigned Depth) const {
22249 // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
22250 if (Op.getOpcode() == X86ISD::SETCC_CARRY)
    return Op.getValueType().getScalarType().getSizeInBits();

  // Fallback case.
  return 1;
}

22257 /// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the
22258 /// node is a GlobalAddress + offset.
22259 bool X86TargetLowering::isGAPlusOffset(SDNode *N,
22260 const GlobalValue* &GA,
22261 int64_t &Offset) const {
22262 if (N->getOpcode() == X86ISD::Wrapper) {
22263 if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
22264 GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
      Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
      return true;
    }
  }
  return TargetLowering::isGAPlusOffset(N, GA, Offset);
}

22272 /// isShuffleHigh128VectorInsertLow - Checks whether the shuffle node is the
22273 /// same as extracting the high 128-bit part of 256-bit vector and then
22274 /// inserting the result into the low part of a new 256-bit vector
22275 static bool isShuffleHigh128VectorInsertLow(ShuffleVectorSDNode *SVOp) {
22276 EVT VT = SVOp->getValueType(0);
22277 unsigned NumElems = VT.getVectorNumElements();
22279 // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
22280 for (unsigned i = 0, j = NumElems/2; i != NumElems/2; ++i, ++j)
22281 if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
        SVOp->getMaskElt(j) >= 0)
      return false;

  return true;
}

22288 /// isShuffleLow128VectorInsertHigh - Checks whether the shuffle node is the
22289 /// same as extracting the low 128-bit part of 256-bit vector and then
22290 /// inserting the result into the high part of a new 256-bit vector
22291 static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) {
22292 EVT VT = SVOp->getValueType(0);
22293 unsigned NumElems = VT.getVectorNumElements();
22295 // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
22296 for (unsigned i = NumElems/2, j = 0; i != NumElems; ++i, ++j)
22297 if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
        SVOp->getMaskElt(j) >= 0)
      return false;

  return true;
}

22304 /// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors.
22305 static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
22306 TargetLowering::DAGCombinerInfo &DCI,
22307 const X86Subtarget* Subtarget) {
  SDLoc dl(N);
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
22310 SDValue V1 = SVOp->getOperand(0);
22311 SDValue V2 = SVOp->getOperand(1);
22312 EVT VT = SVOp->getValueType(0);
22313 unsigned NumElems = VT.getVectorNumElements();
22315 if (V1.getOpcode() == ISD::CONCAT_VECTORS &&
22316 V2.getOpcode() == ISD::CONCAT_VECTORS) {
    //
    //                   0,0,0,...
    //                      |
    //    V      UNDEF    BUILD_VECTOR    UNDEF
    //     \      /           \           /
    //  CONCAT_VECTOR         CONCAT_VECTOR
    //         \                  /
    //          \                /
    //          RESULT: V + zero extended
    //
22327 if (V2.getOperand(0).getOpcode() != ISD::BUILD_VECTOR ||
22328 V2.getOperand(1).getOpcode() != ISD::UNDEF ||
        V1.getOperand(1).getOpcode() != ISD::UNDEF)
      return SDValue();

    if (!ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode()))
      return SDValue();
22335 // To match the shuffle mask, the first half of the mask should
22336 // be exactly the first vector, and all the rest a splat with the
22337 // first element of the second one.
22338 for (unsigned i = 0; i != NumElems/2; ++i)
22339 if (!isUndefOrEqual(SVOp->getMaskElt(i), i) ||
          !isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems))
        return SDValue();
22343 // If V1 is coming from a vector load then just fold to a VZEXT_LOAD.
22344 if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(V1.getOperand(0))) {
22345 if (Ld->hasNUsesOfValue(1, 0)) {
22346 SDVTList Tys = DAG.getVTList(MVT::v4i64, MVT::Other);
22347 SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() };
        SDValue ResNode =
          DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
                                  Ld->getMemoryVT(),
                                  Ld->getPointerInfo(),
22352 Ld->getAlignment(),
22353 false/*isVolatile*/, true/*ReadMem*/,
22354 false/*WriteMem*/);
22356 // Make sure the newly-created LOAD is in the same position as Ld in
22357 // terms of dependency. We create a TokenFactor for Ld and ResNode,
22358 // and update uses of Ld's output chain to use the TokenFactor.
22359 if (Ld->hasAnyUseOfValue(1)) {
22360 SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
22361 SDValue(Ld, 1), SDValue(ResNode.getNode(), 1));
22362 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
22363 DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(Ld, 1),
22364 SDValue(ResNode.getNode(), 1));
        }

        return DAG.getNode(ISD::BITCAST, dl, VT, ResNode);
      }
    }

    // Emit a zeroed vector and insert the desired subvector on its
    // first half.
22373 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
22374 SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0), 0, DAG, dl);
    return DCI.CombineTo(N, InsV);
  }

22378 //===--------------------------------------------------------------------===//
22379 // Combine some shuffles into subvector extracts and inserts:
22382 // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
22383 if (isShuffleHigh128VectorInsertLow(SVOp)) {
22384 SDValue V = Extract128BitVector(V1, NumElems/2, DAG, dl);
22385 SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, 0, DAG, dl);
22386 return DCI.CombineTo(N, InsV);
22389 // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
22390 if (isShuffleLow128VectorInsertHigh(SVOp)) {
22391 SDValue V = Extract128BitVector(V1, 0, DAG, dl);
22392 SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, NumElems/2, DAG, dl);
    return DCI.CombineTo(N, InsV);
  }

  return SDValue();
}

/// \brief Combine an arbitrary chain of shuffles into a single instruction if
/// possible.
///
/// This is the leaf of the recursive combine below. When we have found some
22403 /// chain of single-use x86 shuffle instructions and accumulated the combined
22404 /// shuffle mask represented by them, this will try to pattern match that mask
22405 /// into either a single instruction if there is a special purpose instruction
22406 /// for this operation, or into a PSHUFB instruction which is a fully general
22407 /// instruction but should only be used to replace chains over a certain depth.
22408 static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
22409 int Depth, bool HasPSHUFB, SelectionDAG &DAG,
22410 TargetLowering::DAGCombinerInfo &DCI,
22411 const X86Subtarget *Subtarget) {
22412 assert(!Mask.empty() && "Cannot combine an empty shuffle mask!");
22414 // Find the operand that enters the chain. Note that multiple uses are OK
22415 // here, we're not going to remove the operand we find.
22416 SDValue Input = Op.getOperand(0);
22417 while (Input.getOpcode() == ISD::BITCAST)
22418 Input = Input.getOperand(0);
22420 MVT VT = Input.getSimpleValueType();
  MVT RootVT = Root.getSimpleValueType();
  SDLoc DL(Root);
22424 // Just remove no-op shuffle masks.
22425 if (Mask.size() == 1) {
    DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Input),
                  /*AddTo*/ true);
    return true;
  }
22431 // Use the float domain if the operand type is a floating point type.
22432 bool FloatDomain = VT.isFloatingPoint();
22434 // For floating point shuffles, we don't have free copies in the shuffle
22435 // instructions or the ability to load as part of the instruction, so
22436 // canonicalize their shuffles to UNPCK or MOV variants.
22438 // Note that even with AVX we prefer the PSHUFD form of shuffle for integer
22439 // vectors because it can have a load folded into it that UNPCK cannot. This
22440 // doesn't preclude something switching to the shorter encoding post-RA.
  if (FloatDomain) {
    if (Mask.equals(0, 0) || Mask.equals(1, 1)) {
      bool Lo = Mask.equals(0, 0);
      unsigned Shuffle;
      MVT ShuffleVT;
22446 // Check if we have SSE3 which will let us use MOVDDUP. That instruction
22447 // is no slower than UNPCKLPD but has the option to fold the input operand
22448 // into even an unaligned memory load.
22449 if (Lo && Subtarget->hasSSE3()) {
22450 Shuffle = X86ISD::MOVDDUP;
        ShuffleVT = MVT::v2f64;
      } else {
22453 // We have MOVLHPS and MOVHLPS throughout SSE and they encode smaller
22454 // than the UNPCK variants.
22455 Shuffle = Lo ? X86ISD::MOVLHPS : X86ISD::MOVHLPS;
        ShuffleVT = MVT::v4f32;
      }
22458 if (Depth == 1 && Root->getOpcode() == Shuffle)
22459 return false; // Nothing to do!
22460 Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
22461 DCI.AddToWorklist(Op.getNode());
22462 if (Shuffle == X86ISD::MOVDDUP)
22463 Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op);
      else
        Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
22466 DCI.AddToWorklist(Op.getNode());
      DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
                    /*AddTo*/ true);
      return true;
    }
22471 if (Subtarget->hasSSE3() &&
22472 (Mask.equals(0, 0, 2, 2) || Mask.equals(1, 1, 3, 3))) {
22473 bool Lo = Mask.equals(0, 0, 2, 2);
22474 unsigned Shuffle = Lo ? X86ISD::MOVSLDUP : X86ISD::MOVSHDUP;
22475 MVT ShuffleVT = MVT::v4f32;
22476 if (Depth == 1 && Root->getOpcode() == Shuffle)
22477 return false; // Nothing to do!
22478 Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
22479 DCI.AddToWorklist(Op.getNode());
22480 Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op);
22481 DCI.AddToWorklist(Op.getNode());
      DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
                    /*AddTo*/ true);
      return true;
    }
22486 if (Mask.equals(0, 0, 1, 1) || Mask.equals(2, 2, 3, 3)) {
22487 bool Lo = Mask.equals(0, 0, 1, 1);
22488 unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
22489 MVT ShuffleVT = MVT::v4f32;
22490 if (Depth == 1 && Root->getOpcode() == Shuffle)
22491 return false; // Nothing to do!
22492 Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
22493 DCI.AddToWorklist(Op.getNode());
22494 Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
22495 DCI.AddToWorklist(Op.getNode());
      DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
                    /*AddTo*/ true);
      return true;
    }
  }

22502 // We always canonicalize the 8 x i16 and 16 x i8 shuffles into their UNPCK
22503 // variants as none of these have single-instruction variants that are
22504 // superior to the UNPCK formulation.
22505 if (!FloatDomain &&
22506 (Mask.equals(0, 0, 1, 1, 2, 2, 3, 3) ||
22507 Mask.equals(4, 4, 5, 5, 6, 6, 7, 7) ||
22508 Mask.equals(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7) ||
       Mask.equals(8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15,
                   15))) {
22511 bool Lo = Mask[0] == 0;
22512 unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
22513 if (Depth == 1 && Root->getOpcode() == Shuffle)
22514 return false; // Nothing to do!
    MVT ShuffleVT;
    switch (Mask.size()) {
    case 8:
      ShuffleVT = MVT::v8i16;
      break;
    case 16:
      ShuffleVT = MVT::v16i8;
      break;
    default:
      llvm_unreachable("Impossible mask size!");
    };
22526 Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
22527 DCI.AddToWorklist(Op.getNode());
22528 Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
22529 DCI.AddToWorklist(Op.getNode());
    DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
                  /*AddTo*/ true);
    return true;
  }
22535 // Don't try to re-form single instruction chains under any circumstances now
  // that we've done encoding canonicalization for them.
  if (Depth < 2)
    return false;
22540 // If we have 3 or more shuffle instructions or a chain involving PSHUFB, we
22541 // can replace them with a single PSHUFB instruction profitably. Intel's
22542 // manuals suggest only using PSHUFB if doing so replacing 5 instructions, but
22543 // in practice PSHUFB tends to be *very* fast so we're more aggressive.
22544 if ((Depth >= 3 || HasPSHUFB) && Subtarget->hasSSSE3()) {
22545 SmallVector<SDValue, 16> PSHUFBMask;
22546 assert(Mask.size() <= 16 && "Can't shuffle elements smaller than bytes!");
22547 int Ratio = 16 / Mask.size();
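    // Each logical mask element covers Ratio contiguous bytes of the 16-byte
    // PSHUFB control mask.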
22548 for (unsigned i = 0; i < 16; ++i) {
22549 if (Mask[i / Ratio] == SM_SentinelUndef) {
        PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
        continue;
      }
      int M = Mask[i / Ratio] != SM_SentinelZero
                  ? Ratio * Mask[i / Ratio] + i % Ratio
                  : 255;
      PSHUFBMask.push_back(DAG.getConstant(M, MVT::i8));
    }
22558 Op = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Input);
22559 DCI.AddToWorklist(Op.getNode());
22560 SDValue PSHUFBMaskOp =
22561 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, PSHUFBMask);
22562 DCI.AddToWorklist(PSHUFBMaskOp.getNode());
22563 Op = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, Op, PSHUFBMaskOp);
22564 DCI.AddToWorklist(Op.getNode());
    DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
                  /*AddTo*/ true);
    return true;
  }

  // Failed to find any combines.
  return false;
}

22574 /// \brief Fully generic combining of x86 shuffle instructions.
22576 /// This should be the last combine run over the x86 shuffle instructions. Once
22577 /// they have been fully optimized, this will recursively consider all chains
22578 /// of single-use shuffle instructions, build a generic model of the cumulative
22579 /// shuffle operation, and check for simpler instructions which implement this
22580 /// operation. We use this primarily for two purposes:
22582 /// 1) Collapse generic shuffles to specialized single instructions when
22583 /// equivalent. In most cases, this is just an encoding size win, but
22584 /// sometimes we will collapse multiple generic shuffles into a single
22585 /// special-purpose shuffle.
22586 /// 2) Look for sequences of shuffle instructions with 3 or more total
22587 /// instructions, and replace them with the slightly more expensive SSSE3
22588 /// PSHUFB instruction if available. We do this as the last combining step
22589 /// to ensure we avoid using PSHUFB if we can implement the shuffle with
/// a suitable short sequence of other instructions. The PSHUFB will either
22591 /// use a register or have to read from memory and so is slightly (but only
22592 /// slightly) more expensive than the other shuffle instructions.
22594 /// Because this is inherently a quadratic operation (for each shuffle in
22595 /// a chain, we recurse up the chain), the depth is limited to 8 instructions.
22596 /// This should never be an issue in practice as the shuffle lowering doesn't
22597 /// produce sequences of more than 8 instructions.
22599 /// FIXME: We will currently miss some cases where the redundant shuffling
22600 /// would simplify under the threshold for PSHUFB formation because of
22601 /// combine-ordering. To fix this, we should do the redundant instruction
22602 /// combining in this recursive walk.
22603 static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,
22604 ArrayRef<int> RootMask,
                                          int Depth, bool HasPSHUFB,
                                          SelectionDAG &DAG,
22607 TargetLowering::DAGCombinerInfo &DCI,
22608 const X86Subtarget *Subtarget) {
22609 // Bound the depth of our recursive combine because this is ultimately
  // quadratic in nature.
  if (Depth > 8)
    return false;
22614 // Directly rip through bitcasts to find the underlying operand.
22615 while (Op.getOpcode() == ISD::BITCAST && Op.getOperand(0).hasOneUse())
22616 Op = Op.getOperand(0);
22618 MVT VT = Op.getSimpleValueType();
22619 if (!VT.isVector())
22620 return false; // Bail if we hit a non-vector.
22621 // FIXME: This routine should be taught about 256-bit shuffles, or a 256-bit
22622 // version should be added.
  if (VT.getSizeInBits() != 128)
    return false;
22626 assert(Root.getSimpleValueType().isVector() &&
22627 "Shuffles operate on vector types!");
22628 assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() &&
22629 "Can only combine shuffles of the same vector register size.");
  if (!isTargetShuffle(Op.getOpcode()))
    return false;
  SmallVector<int, 16> OpMask;
  bool IsUnary;
22635 bool HaveMask = getTargetShuffleMask(Op.getNode(), VT, OpMask, IsUnary);
22636 // We only can combine unary shuffles which we can decode the mask for.
  if (!HaveMask || !IsUnary)
    return false;
22640 assert(VT.getVectorNumElements() == OpMask.size() &&
22641 "Different mask size from vector size!");
22642 assert(((RootMask.size() > OpMask.size() &&
22643 RootMask.size() % OpMask.size() == 0) ||
22644 (OpMask.size() > RootMask.size() &&
22645 OpMask.size() % RootMask.size() == 0) ||
22646 OpMask.size() == RootMask.size()) &&
22647 "The smaller number of elements must divide the larger.");
22648 int RootRatio = std::max<int>(1, OpMask.size() / RootMask.size());
22649 int OpRatio = std::max<int>(1, RootMask.size() / OpMask.size());
22650 assert(((RootRatio == 1 && OpRatio == 1) ||
22651 (RootRatio == 1) != (OpRatio == 1)) &&
22652 "Must not have a ratio for both incoming and op masks!");
22654 SmallVector<int, 16> Mask;
22655 Mask.reserve(std::max(OpMask.size(), RootMask.size()));
22657 // Merge this shuffle operation's mask into our accumulated mask. Note that
22658 // this shuffle's mask will be the first applied to the input, followed by the
22659 // root mask to get us all the way to the root value arrangement. The reason
22660 // for this order is that we are recursing up the operation chain.
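  //
  // For example, with equal mask sizes, a root mask <0,2,1,3> applied on top
  // of an op mask <1,0,3,2> merges to <1,3,0,2>: the op mask is applied
  // first, and the root mask then indexes into its result.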
22661 for (int i = 0, e = std::max(OpMask.size(), RootMask.size()); i < e; ++i) {
22662 int RootIdx = i / RootRatio;
22663 if (RootMask[RootIdx] < 0) {
22664 // This is a zero or undef lane, we're done.
      Mask.push_back(RootMask[RootIdx]);
      continue;
    }
22669 int RootMaskedIdx = RootMask[RootIdx] * RootRatio + i % RootRatio;
22670 int OpIdx = RootMaskedIdx / OpRatio;
22671 if (OpMask[OpIdx] < 0) {
      // The incoming lanes are zero or undef, it doesn't matter which ones we
      // are using.
      Mask.push_back(OpMask[OpIdx]);
      continue;
    }
22678 // Ok, we have non-zero lanes, map them through.
22679 Mask.push_back(OpMask[OpIdx] * OpRatio +
                   RootMaskedIdx % OpRatio);
  }
22683 // See if we can recurse into the operand to combine more things.
22684 switch (Op.getOpcode()) {
  case X86ISD::PSHUFB:
    HasPSHUFB = true;
22687 case X86ISD::PSHUFD:
22688 case X86ISD::PSHUFHW:
22689 case X86ISD::PSHUFLW:
22690 if (Op.getOperand(0).hasOneUse() &&
22691 combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1,
                                      HasPSHUFB, DAG, DCI, Subtarget))
      return true;
    break;
22696 case X86ISD::UNPCKL:
22697 case X86ISD::UNPCKH:
22698 assert(Op.getOperand(0) == Op.getOperand(1) && "We only combine unary shuffles!");
22699 // We can't check for single use, we have to check that this shuffle is the only user.
22700 if (Op->isOnlyUserOf(Op.getOperand(0).getNode()) &&
22701 combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1,
                                      HasPSHUFB, DAG, DCI, Subtarget))
      return true;
    break;
  }
22707 // Minor canonicalization of the accumulated shuffle mask to make it easier
  // to match below. All this does is detect masks with sequential pairs of
22709 // elements, and shrink them to the half-width mask. It does this in a loop
22710 // so it will reduce the size of the mask to the minimal width mask which
22711 // performs an equivalent shuffle.
22712 SmallVector<int, 16> WidenedMask;
22713 while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
22714 Mask = std::move(WidenedMask);
    WidenedMask.clear();
  }

  return combineX86ShuffleChain(Op, Root, Mask, Depth, HasPSHUFB, DAG, DCI,
                                Subtarget);
}

22722 /// \brief Get the PSHUF-style mask from PSHUF node.
/// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
22725 /// PSHUF-style masks that can be reused with such instructions.
22726 static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
  SmallVector<int, 4> Mask;
  bool IsUnary;
  bool HaveMask = getTargetShuffleMask(N.getNode(), N.getSimpleValueType(), Mask, IsUnary);
  (void)HaveMask;
  assert(HaveMask);

  switch (N.getOpcode()) {
  case X86ISD::PSHUFD:
    return Mask;
  case X86ISD::PSHUFLW:
    Mask.resize(4);
    return Mask;
  case X86ISD::PSHUFHW:
    Mask.erase(Mask.begin(), Mask.begin() + 4);
    for (int &M : Mask)
      M -= 4;
    return Mask;
  default:
    llvm_unreachable("No valid shuffle instruction found!");
  }
}

22749 /// \brief Search for a combinable shuffle across a chain ending in pshufd.
22751 /// We walk up the chain and look for a combinable shuffle, skipping over
22752 /// shuffles that we could hoist this shuffle's transformation past without
22753 /// altering anything.
static SDValue
combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
                             SelectionDAG &DAG,
                             TargetLowering::DAGCombinerInfo &DCI) {
22758 assert(N.getOpcode() == X86ISD::PSHUFD &&
22759 "Called with something other than an x86 128-bit half shuffle!");
22762 // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
  // of the shuffles in the chain so that we can form a fresh chain to replace
  // this one.
22765 SmallVector<SDValue, 8> Chain;
22766 SDValue V = N.getOperand(0);
22767 for (; V.hasOneUse(); V = V.getOperand(0)) {
22768 switch (V.getOpcode()) {
    default:
      return SDValue(); // Nothing combined!

    case ISD::BITCAST:
      // Skip bitcasts as we always know the type for the target specific
      // instructions.
      continue;

22777 case X86ISD::PSHUFD:
      // Found another dword shuffle.
      break;

22781 case X86ISD::PSHUFLW:
22782 // Check that the low words (being shuffled) are the identity in the
22783 // dword shuffle, and the high words are self-contained.
22784 if (Mask[0] != 0 || Mask[1] != 1 ||
          !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
        return SDValue();

      Chain.push_back(V);
      continue;

22791 case X86ISD::PSHUFHW:
22792 // Check that the high words (being shuffled) are the identity in the
22793 // dword shuffle, and the low words are self-contained.
22794 if (Mask[2] != 2 || Mask[3] != 3 ||
          !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
        return SDValue();

      Chain.push_back(V);
      continue;

22801 case X86ISD::UNPCKL:
22802 case X86ISD::UNPCKH:
22803 // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
22804 // shuffle into a preceding word shuffle.
      if (V.getValueType() != MVT::v16i8 && V.getValueType() != MVT::v8i16)
        return SDValue();

22808 // Search for a half-shuffle which we can combine with.
22809 unsigned CombineOp =
22810 V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
22811 if (V.getOperand(0) != V.getOperand(1) ||
          !V->isOnlyUserOf(V.getOperand(0).getNode()))
        return SDValue();
22814 Chain.push_back(V);
      V = V.getOperand(0);
      do {
        switch (V.getOpcode()) {
        default:
          return SDValue(); // Nothing to combine.

        case X86ISD::PSHUFLW:
        case X86ISD::PSHUFHW:
          if (V.getOpcode() == CombineOp)
            break;

          Chain.push_back(V);

          // Fallthrough!
        case ISD::BITCAST:
          V = V.getOperand(0);
          continue;
        }
        break;
      } while (V.hasOneUse());
      break;
    }

    // Break out of the loop if we break out of the switch.
    break;
  }

22841 if (!V.hasOneUse())
    // We fell out of the loop without finding a viable combining instruction.
    return SDValue();
22845 // Merge this node's mask and our incoming mask.
22846 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
  for (int &M : Mask)
    M = VMask[M];
22849 V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
22850 getV4X86ShuffleImm8ForMask(Mask, DAG));
22852 // Rebuild the chain around this new shuffle.
22853 while (!Chain.empty()) {
22854 SDValue W = Chain.pop_back_val();
22856 if (V.getValueType() != W.getOperand(0).getValueType())
22857 V = DAG.getNode(ISD::BITCAST, DL, W.getOperand(0).getValueType(), V);
22859 switch (W.getOpcode()) {
22861 llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
22863 case X86ISD::UNPCKL:
22864 case X86ISD::UNPCKH:
      V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
      break;
22868 case X86ISD::PSHUFD:
22869 case X86ISD::PSHUFLW:
22870 case X86ISD::PSHUFHW:
      V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
      break;
    }
  }
22875 if (V.getValueType() != N.getValueType())
22876 V = DAG.getNode(ISD::BITCAST, DL, N.getValueType(), V);
  // Return the new chain to replace N.
  return V;
}

22882 /// \brief Search for a combinable shuffle across a chain ending in pshuflw or pshufhw.
22884 /// We walk up the chain, skipping shuffles of the other half and looking
22885 /// through shuffles which switch halves trying to find a shuffle of the same
22886 /// pair of dwords.
22887 static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask,
                                         SelectionDAG &DAG,
                                         TargetLowering::DAGCombinerInfo &DCI) {
  assert(
      (N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) &&
      "Called with something other than an x86 128-bit half shuffle!");
  SDLoc DL(N);
22894 unsigned CombineOpcode = N.getOpcode();
22896 // Walk up a single-use chain looking for a combinable shuffle.
22897 SDValue V = N.getOperand(0);
22898 for (; V.hasOneUse(); V = V.getOperand(0)) {
22899 switch (V.getOpcode()) {
    default:
      return false; // Nothing combined!

    case ISD::BITCAST:
      // Skip bitcasts as we always know the type for the target specific
      // instructions.
      continue;

22908 case X86ISD::PSHUFLW:
22909 case X86ISD::PSHUFHW:
      if (V.getOpcode() == CombineOpcode)
        break;

      // Other-half shuffles are no-ops.
      continue;
    }
    // Break out of the loop if we break out of the switch.
    break;
  }

22920 if (!V.hasOneUse())
    // We fell out of the loop without finding a viable combining instruction.
    return false;
22924 // Combine away the bottom node as its shuffle will be accumulated into
22925 // a preceding shuffle.
22926 DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
  // Record the old value.
  SDValue Old = V;
22931 // Merge this node's mask and our incoming mask (adjusted to account for all
22932 // the pshufd instructions encountered).
22933 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
  for (int &M : Mask)
    M = VMask[M];
22936 V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0),
22937 getV4X86ShuffleImm8ForMask(Mask, DAG));
22939 // Check that the shuffles didn't cancel each other out. If not, we need to
  // combine to the new one.
  if (Old != V)
22942 // Replace the combinable shuffle with the combined one, updating all users
22943 // so that we re-evaluate the chain here.
    DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true);

  return true;
}

22949 /// \brief Try to combine x86 target specific shuffles.
22950 static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,
22951 TargetLowering::DAGCombinerInfo &DCI,
22952 const X86Subtarget *Subtarget) {
  SDLoc DL(N);
  MVT VT = N.getSimpleValueType();
22955 SmallVector<int, 4> Mask;
22957 switch (N.getOpcode()) {
22958 case X86ISD::PSHUFD:
22959 case X86ISD::PSHUFLW:
22960 case X86ISD::PSHUFHW:
22961 Mask = getPSHUFShuffleMask(N);
    assert(Mask.size() == 4);
    break;
  default:
    return SDValue();
  }

22968 // Nuke no-op shuffles that show up after combining.
22969 if (isNoopShuffleMask(Mask))
22970 return DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
22972 // Look for simplifications involving one or two shuffle instructions.
22973 SDValue V = N.getOperand(0);
  switch (N.getOpcode()) {
  default:
    break;
22977 case X86ISD::PSHUFLW:
22978 case X86ISD::PSHUFHW:
    assert(VT == MVT::v8i16);
    (void)VT;

22982 if (combineRedundantHalfShuffle(N, Mask, DAG, DCI))
22983 return SDValue(); // We combined away this shuffle, so we're done.
22985 // See if this reduces to a PSHUFD which is no more expensive and can
22986 // combine with more operations. Note that it has to at least flip the
22987 // dwords as otherwise it would have been removed as a no-op.
22988 if (Mask[0] == 2 && Mask[1] == 3 && Mask[2] == 0 && Mask[3] == 1) {
22989 int DMask[] = {0, 1, 2, 3};
22990 int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
22991 DMask[DOffset + 0] = DOffset + 1;
22992 DMask[DOffset + 1] = DOffset + 0;
22993 V = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V);
22994 DCI.AddToWorklist(V.getNode());
22995 V = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V,
22996 getV4X86ShuffleImm8ForMask(DMask, DAG));
22997 DCI.AddToWorklist(V.getNode());
22998 return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V);
23001 // Look for shuffle patterns which can be implemented as a single unpack.
23002 // FIXME: This doesn't handle the location of the PSHUFD generically, and
23003 // only works when we have a PSHUFD followed by two half-shuffles.
23004 if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
23005 (V.getOpcode() == X86ISD::PSHUFLW ||
23006 V.getOpcode() == X86ISD::PSHUFHW) &&
        V.getOpcode() != N.getOpcode() &&
        V.hasOneUse()) {
23009 SDValue D = V.getOperand(0);
23010 while (D.getOpcode() == ISD::BITCAST && D.hasOneUse())
23011 D = D.getOperand(0);
23012 if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {
23013 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
23014 SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
23015 int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
23016 int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
      int WordMask[8];
      for (int i = 0; i < 4; ++i) {
23019 WordMask[i + NOffset] = Mask[i] + NOffset;
        WordMask[i + VOffset] = VMask[i] + VOffset;
      }
      // Map the word mask through the DWord mask.
      int MappedMask[8];
      for (int i = 0; i < 8; ++i)
23025 MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
23026 const int UnpackLoMask[] = {0, 0, 1, 1, 2, 2, 3, 3};
23027 const int UnpackHiMask[] = {4, 4, 5, 5, 6, 6, 7, 7};
23028 if (std::equal(std::begin(MappedMask), std::end(MappedMask),
23029 std::begin(UnpackLoMask)) ||
23030 std::equal(std::begin(MappedMask), std::end(MappedMask),
23031 std::begin(UnpackHiMask))) {
23032 // We can replace all three shuffles with an unpack.
23033 V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, D.getOperand(0));
23034 DCI.AddToWorklist(V.getNode());
          return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
                                                : X86ISD::UNPCKH,
                             DL, MVT::v8i16, V, V);
        }
      }
    }

    break;

23044 case X86ISD::PSHUFD:
    if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG, DCI))
      return NewN;

    break;
  }

  return SDValue();
}

23054 /// \brief Try to combine a shuffle into a target-specific add-sub node.
23056 /// We combine this directly on the abstract vector shuffle nodes so it is
23057 /// easier to generically match. We also insert dummy vector shuffle nodes for
23058 /// the operands which explicitly discard the lanes which are unused by this
23059 /// operation to try to flow through the rest of the combiner the fact that
23060 /// they're unused.
23061 static SDValue combineShuffleToAddSub(SDNode *N, SelectionDAG &DAG) {
  SDLoc DL(N);
  EVT VT = N->getValueType(0);
23065 // We only handle target-independent shuffles.
23066 // FIXME: It would be easy and harmless to use the target shuffle mask
23067 // extraction tool to support more.
  if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
    return SDValue();
23071 auto *SVN = cast<ShuffleVectorSDNode>(N);
23072 ArrayRef<int> Mask = SVN->getMask();
23073 SDValue V1 = N->getOperand(0);
23074 SDValue V2 = N->getOperand(1);
23076 // We require the first shuffle operand to be the SUB node, and the second to
23077 // be the ADD node.
23078 // FIXME: We should support the commuted patterns.
  if (V1->getOpcode() != ISD::FSUB || V2->getOpcode() != ISD::FADD)
    return SDValue();
23082 // If there are other uses of these operations we can't fold them.
  if (!V1->hasOneUse() || !V2->hasOneUse())
    return SDValue();
23086 // Ensure that both operations have the same operands. Note that we can
23087 // commute the FADD operands.
23088 SDValue LHS = V1->getOperand(0), RHS = V1->getOperand(1);
23089 if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
      (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
    return SDValue();
23093 // We're looking for blends between FADD and FSUB nodes. We insist on these
23094 // nodes being lined up in a specific expected pattern.
23095 if (!(isShuffleEquivalent(V1, V2, Mask, 0, 3) ||
23096 isShuffleEquivalent(V1, V2, Mask, 0, 5, 2, 7) ||
        isShuffleEquivalent(V1, V2, Mask, 0, 9, 2, 11, 4, 13, 6, 15)))
    return SDValue();
23100 // Only specific types are legal at this point, assert so we notice if and
23101 // when these change.
23102 assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v8f32 ||
23103 VT == MVT::v4f64) &&
23104 "Unknown vector type encountered!");
  return DAG.getNode(X86ISD::ADDSUB, DL, VT, LHS, RHS);
}

23109 /// PerformShuffleCombine - Performs several different shuffle combines.
23110 static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
23111 TargetLowering::DAGCombinerInfo &DCI,
23112 const X86Subtarget *Subtarget) {
  SDLoc dl(N);
  SDValue N0 = N->getOperand(0);
23115 SDValue N1 = N->getOperand(1);
23116 EVT VT = N->getValueType(0);
23118 // Don't create instructions with illegal types after legalize types has run.
23119 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType()))
    return SDValue();
23123 // If we have legalized the vector types, look for blends of FADD and FSUB
23124 // nodes that we can fuse into an ADDSUB node.
23125 if (TLI.isTypeLegal(VT) && Subtarget->hasSSE3())
    if (SDValue AddSub = combineShuffleToAddSub(N, DAG))
      return AddSub;
23129 // Combine 256-bit vector shuffles. This is only profitable when in AVX mode
23130 if (Subtarget->hasFp256() && VT.is256BitVector() &&
23131 N->getOpcode() == ISD::VECTOR_SHUFFLE)
23132 return PerformShuffleCombine256(N, DAG, DCI, Subtarget);
23134 // During Type Legalization, when promoting illegal vector types,
23135 // the backend might introduce new shuffle dag nodes and bitcasts.
23137 // This code performs the following transformation:
23138 // fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) ->
23139 // (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>)
23141 // We do this only if both the bitcast and the BINOP dag nodes have
23142 // one use. Also, perform this transformation only if the new binary
23143 // operation is legal. This is to avoid introducing dag nodes that
23144 // potentially need to be further expanded (or custom lowered) into a
23145 // less optimal sequence of dag nodes.
23146 if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() &&
23147 N1.getOpcode() == ISD::UNDEF && N0.hasOneUse() &&
23148 N0.getOpcode() == ISD::BITCAST) {
23149 SDValue BC0 = N0.getOperand(0);
23150 EVT SVT = BC0.getValueType();
23151 unsigned Opcode = BC0.getOpcode();
23152 unsigned NumElts = VT.getVectorNumElements();
23154 if (BC0.hasOneUse() && SVT.isVector() &&
23155 SVT.getVectorNumElements() * 2 == NumElts &&
23156 TLI.isOperationLegal(Opcode, VT)) {
      bool CanFold = false;
      switch (Opcode) {
      default : break;
      case ISD::ADD :
      case ISD::FADD :
      case ISD::SUB :
      case ISD::FSUB :
      case ISD::MUL :
      case ISD::FMUL :
        CanFold = true;
      }

23169 unsigned SVTNumElts = SVT.getVectorNumElements();
23170 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
23171 for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i)
23172 CanFold = SVOp->getMaskElt(i) == (int)(i * 2);
23173 for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i)
23174 CanFold = SVOp->getMaskElt(i) < 0;
      if (CanFold) {
        SDValue BC00 = DAG.getNode(ISD::BITCAST, dl, VT, BC0.getOperand(0));
        SDValue BC01 = DAG.getNode(ISD::BITCAST, dl, VT, BC0.getOperand(1));
        SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01);
        return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, &SVOp->getMask()[0]);
      }
    }
  }

  // Only handle 128-bit wide vectors from here on.
  if (!VT.is128BitVector())
    return SDValue();
23189 // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
23190 // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
23191 // consecutive, non-overlapping, and in the right order.
23192 SmallVector<SDValue, 16> Elts;
23193 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
23194 Elts.push_back(getShuffleScalarElt(N, i, DAG, 0));
  SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true);
  if (LD.getNode())
    return LD;

23200 if (isTargetShuffle(N->getOpcode())) {
    SDValue Shuffle =
        PerformTargetShuffleCombine(SDValue(N, 0), DAG, DCI, Subtarget);
    if (Shuffle.getNode())
      return Shuffle;
23206 // Try recursively combining arbitrary sequences of x86 shuffle
23207 // instructions into higher-order shuffles. We do this after combining
23208 // specific PSHUF instruction sequences into their minimal form so that we
23209 // can evaluate how many specialized shuffle instructions are involved in
23210 // a particular chain.
23211 SmallVector<int, 1> NonceMask; // Just a placeholder.
23212 NonceMask.push_back(0);
23213 if (combineX86ShufflesRecursively(SDValue(N, 0), SDValue(N, 0), NonceMask,
23214 /*Depth*/ 1, /*HasPSHUFB*/ false, DAG,
                                      DCI, Subtarget))
      return SDValue(); // This routine will use CombineTo to replace N.
  }

  return SDValue();
}

23222 /// PerformTruncateCombine - Converts truncate operation to
23223 /// a sequence of vector shuffle operations.
23224 /// It is possible when we truncate 256-bit vector to 128-bit vector
23225 static SDValue PerformTruncateCombine(SDNode *N, SelectionDAG &DAG,
23226 TargetLowering::DAGCombinerInfo &DCI,
                                      const X86Subtarget *Subtarget) {
  return SDValue();
}

23231 /// XFormVExtractWithShuffleIntoLoad - Check if a vector extract from a target
23232 /// specific shuffle of a load can be folded into a single element load.
23233 /// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
23234 /// shuffles have been custom lowered so we need to handle those here.
23235 static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
23236 TargetLowering::DAGCombinerInfo &DCI) {
  if (DCI.isBeforeLegalizeOps())
    return SDValue();
23240 SDValue InVec = N->getOperand(0);
23241 SDValue EltNo = N->getOperand(1);
  if (!isa<ConstantSDNode>(EltNo))
    return SDValue();
23246 EVT OriginalVT = InVec.getValueType();
23248 if (InVec.getOpcode() == ISD::BITCAST) {
23249 // Don't duplicate a load with other uses.
    if (!InVec.hasOneUse())
      return SDValue();
23252 EVT BCVT = InVec.getOperand(0).getValueType();
    if (BCVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
      return SDValue();
    InVec = InVec.getOperand(0);
  }
23258 EVT CurrentVT = InVec.getValueType();
  if (!isTargetShuffle(InVec.getOpcode()))
    return SDValue();
23263 // Don't duplicate a load with other uses.
  if (!InVec.hasOneUse())
    return SDValue();
  SmallVector<int, 16> ShuffleMask;
  bool UnaryShuffle;
23269 if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(),
                            ShuffleMask, UnaryShuffle))
    return SDValue();
23273 // Select the input vector, guarding against out of range extract vector.
23274 unsigned NumElems = CurrentVT.getVectorNumElements();
23275 int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
23276 int Idx = (Elt > (int)NumElems) ? -1 : ShuffleMask[Elt];
23277 SDValue LdNode = (Idx < (int)NumElems) ? InVec.getOperand(0)
23278 : InVec.getOperand(1);
23280 // If inputs to shuffle are the same for both ops, then allow 2 uses
23281 unsigned AllowedUses = InVec.getNumOperands() > 1 &&
23282 InVec.getOperand(0) == InVec.getOperand(1) ? 2 : 1;
23284 if (LdNode.getOpcode() == ISD::BITCAST) {
23285 // Don't duplicate a load with other uses.
    if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
      return SDValue();
23289 AllowedUses = 1; // only allow 1 load use if we have a bitcast
    LdNode = LdNode.getOperand(0);
  }
  if (!ISD::isNormalLoad(LdNode.getNode()))
    return SDValue();
23296 LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);
  if (!LN0 || !LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
    return SDValue();
23301 EVT EltVT = N->getValueType(0);
23302 // If there's a bitcast before the shuffle, check if the load type and
23303 // alignment is valid.
23304 unsigned Align = LN0->getAlignment();
23305 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23306 unsigned NewAlign = TLI.getDataLayout()->getABITypeAlignment(
23307 EltVT.getTypeForEVT(*DAG.getContext()));
  if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT))
    return SDValue();
23312 // All checks match so transform back to vector_shuffle so that DAG combiner
  // can finish the job
  SDLoc dl(N);
23316 // Create shuffle node taking into account the case that its a unary shuffle
23317 SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT)
23318 : InVec.getOperand(1);
23319 Shuffle = DAG.getVectorShuffle(CurrentVT, dl,
                                 InVec.getOperand(0), Shuffle,
                                 &ShuffleMask[0]);
23322 Shuffle = DAG.getNode(ISD::BITCAST, dl, OriginalVT, Shuffle);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
                     EltNo);
}

23327 /// \brief Detect bitcasts between i32 to x86mmx low word. Since MMX types are
23328 /// special and don't usually play with other vector types, it's better to
23329 /// handle them early to be sure we emit efficient code by avoiding
23330 /// store-load conversions.
23331 static SDValue PerformBITCASTCombine(SDNode *N, SelectionDAG &DAG) {
23332 if (N->getValueType(0) != MVT::x86mmx ||
23333 N->getOperand(0)->getOpcode() != ISD::BUILD_VECTOR ||
      N->getOperand(0)->getValueType(0) != MVT::v2i32)
    return SDValue();
23337 SDValue V = N->getOperand(0);
23338 ConstantSDNode *C = dyn_cast<ConstantSDNode>(V.getOperand(1));
23339 if (C && C->getZExtValue() == 0 && V.getOperand(0).getValueType() == MVT::i32)
23340 return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(V.getOperand(0)),
                       N->getValueType(0), V.getOperand(0));

  return SDValue();
}

23346 /// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index
23347 /// generation and convert it from being a bunch of shuffles and extracts
23348 /// into a somewhat faster sequence. For i686, the best sequence is apparently
23349 /// storing the value and loading scalars back, while for x64 we should
23350 /// use 64-bit extracts and shifts.
23351 static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
23352 TargetLowering::DAGCombinerInfo &DCI) {
23353 SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI);
23354 if (NewOp.getNode())
23357 SDValue InputVector = N->getOperand(0);
23359 // Detect mmx to i32 conversion through a v2i32 elt extract.
23360 if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
23361 N->getValueType(0) == MVT::i32 &&
23362 InputVector.getValueType() == MVT::v2i32) {
23364 // The bitcast source is a direct mmx result.
23365 SDValue MMXSrc = InputVector.getNode()->getOperand(0);
23366 if (MMXSrc.getValueType() == MVT::x86mmx)
23367 return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector),
23368 N->getValueType(0),
23369 InputVector.getNode()->getOperand(0));
23371 // The mmx is indirect: (i64 extract_elt (v1i64 bitcast (x86mmx ...))).
23372 SDValue MMXSrcOp = MMXSrc.getOperand(0);
23373 if (MMXSrc.getOpcode() == ISD::EXTRACT_VECTOR_ELT && MMXSrc.hasOneUse() &&
23374 MMXSrc.getValueType() == MVT::i64 && MMXSrcOp.hasOneUse() &&
23375 MMXSrcOp.getOpcode() == ISD::BITCAST &&
23376 MMXSrcOp.getValueType() == MVT::v1i64 &&
23377 MMXSrcOp.getOperand(0).getValueType() == MVT::x86mmx)
23378 return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector),
23379 N->getValueType(0),
23380 MMXSrcOp.getOperand(0));
23383 // Only operate on vectors of 4 elements, where the alternative shuffling
23384 // gets to be more expensive.
23385 if (InputVector.getValueType() != MVT::v4i32)
23388 // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
23389 // single use which is a sign-extend or zero-extend, and all elements are
23390 // used.
23391 SmallVector<SDNode *, 4> Uses;
23392 unsigned ExtractedElements = 0;
23393 for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
23394 UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
23395 if (UI.getUse().getResNo() != InputVector.getResNo())
23398 SDNode *Extract = *UI;
23399 if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
23402 if (Extract->getValueType(0) != MVT::i32)
23404 if (!Extract->hasOneUse())
23406 if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
23407 Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
23409 if (!isa<ConstantSDNode>(Extract->getOperand(1)))
23412 // Record which element was extracted.
23413 ExtractedElements |=
23414 1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue();
23416 Uses.push_back(Extract);
23419 // If not all the elements were used, this may not be worthwhile.
23420 if (ExtractedElements != 15)
23423 // Ok, we've now decided to do the transformation.
23424 // If 64-bit shifts are legal, use the extract-shift sequence,
23425 // otherwise bounce the vector off the cache.
23426 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23427 SDValue Vals[4];
23428 SDLoc dl(InputVector);
23430 if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) {
23431 SDValue Cst = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, InputVector);
23432 EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy();
23433 SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
23434 DAG.getConstant(0, VecIdxTy));
23435 SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
23436 DAG.getConstant(1, VecIdxTy));
23438 SDValue ShAmt = DAG.getConstant(32,
23439 DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64));
23440 Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf);
23441 Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
23442 DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt));
23443 Vals[2] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, TopHalf);
23444 Vals[3] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
23445 DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt));
23447 // Store the value to a temporary stack slot.
23448 SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
23449 SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
23450 MachinePointerInfo(), false, false, 0);
23452 EVT ElementType = InputVector.getValueType().getVectorElementType();
23453 unsigned EltSize = ElementType.getSizeInBits() / 8;
23455 // Replace each use (extract) with a load of the appropriate element.
23456 for (unsigned i = 0; i < 4; ++i) {
23457 uint64_t Offset = EltSize * i;
23458 SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy());
23460 SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(),
23461 StackPtr, OffsetVal);
23463 // Load the scalar.
23464 Vals[i] = DAG.getLoad(ElementType, dl, Ch,
23465 ScalarAddr, MachinePointerInfo(),
23466 false, false, false, 0);
23471 // Replace the extracts
23472 for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
23473 UE = Uses.end(); UI != UE; ++UI) {
23474 SDNode *Extract = *UI;
23476 SDValue Idx = Extract->getOperand(1);
23477 uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
23478 DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]);
23481 // The replacement was made in place; don't return anything.
23485 /// \brief Matches a VSELECT onto min/max or returns 0 if the node doesn't match.
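///
/// For example (illustrative):
///   (vselect (setcc X, Y, setult), X, Y) -> (X86ISD::UMIN X, Y)
///   (vselect (setcc X, Y, setult), Y, X) -> X86ISD::UMAX
/// The bool result asks the caller to split the vector when the subtarget
/// only handles the half-width type (e.g. 256-bit integers without AVX2).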
23486 static std::pair<unsigned, bool>
23487 matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, SDValue RHS,
23488 SelectionDAG &DAG, const X86Subtarget *Subtarget) {
23489 if (!VT.isVector())
23490 return std::make_pair(0, false);
23492 bool NeedSplit = false;
23493 switch (VT.getSimpleVT().SimpleTy) {
23494 default: return std::make_pair(0, false);
23497 if (!Subtarget->hasVLX())
23498 return std::make_pair(0, false);
23502 if (!Subtarget->hasBWI())
23503 return std::make_pair(0, false);
23507 if (!Subtarget->hasAVX512())
23508 return std::make_pair(0, false);
23513 if (!Subtarget->hasAVX2())
23515 if (!Subtarget->hasAVX())
23516 return std::make_pair(0, false);
23521 if (!Subtarget->hasSSE2())
23522 return std::make_pair(0, false);
23525 // SSE2 has only a small subset of the operations.
23526 bool hasUnsigned = Subtarget->hasSSE41() ||
23527 (Subtarget->hasSSE2() && VT == MVT::v16i8);
23528 bool hasSigned = Subtarget->hasSSE41() ||
23529 (Subtarget->hasSSE2() && VT == MVT::v8i16);
23531 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
23533 unsigned Opc = 0;
23534 // Check for x CC y ? x : y.
23535 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
23536 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
23541 Opc = hasUnsigned ? X86ISD::UMIN : 0; break;
23544 Opc = hasUnsigned ? X86ISD::UMAX : 0; break;
23547 Opc = hasSigned ? X86ISD::SMIN : 0; break;
23550 Opc = hasSigned ? X86ISD::SMAX : 0; break;
23552 // Check for x CC y ? y : x -- a min/max with reversed arms.
23553 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
23554 DAG.isEqualTo(RHS, Cond.getOperand(0))) {
23559 Opc = hasUnsigned ? X86ISD::UMAX : 0; break;
23562 Opc = hasUnsigned ? X86ISD::UMIN : 0; break;
23565 Opc = hasSigned ? X86ISD::SMAX : 0; break;
23568 Opc = hasSigned ? X86ISD::SMIN : 0; break;
23572 return std::make_pair(Opc, NeedSplit);
23573 }
23575 static SDValue
23576 transformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG,
23577 const X86Subtarget *Subtarget) {
23579 SDValue Cond = N->getOperand(0);
23580 SDValue LHS = N->getOperand(1);
23581 SDValue RHS = N->getOperand(2);
23583 if (Cond.getOpcode() == ISD::SIGN_EXTEND) {
23584 SDValue CondSrc = Cond->getOperand(0);
23585 if (CondSrc->getOpcode() == ISD::SIGN_EXTEND_INREG)
23586 Cond = CondSrc->getOperand(0);
23589 if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
23592 // A vselect where all conditions and data are constants can be optimized into
23593 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
23594 if (ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
23595 ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
23598 unsigned MaskValue = 0;
23599 if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue))
23602 MVT VT = N->getSimpleValueType(0);
23603 unsigned NumElems = VT.getVectorNumElements();
23604 SmallVector<int, 8> ShuffleMask(NumElems, -1);
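// For example (illustrative): with NumElems == 4 and MaskValue == 0b0101,
// the loop below builds the mask <4, 1, 6, 3>; a set bit i selects element
// i of the second operand (index i + NumElems), a clear bit keeps element i
// of the first.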
23605 for (unsigned i = 0; i < NumElems; ++i) {
23606 // Be sure we emit undef where we can.
23607 if (Cond.getOperand(i)->getOpcode() == ISD::UNDEF)
23608 ShuffleMask[i] = -1;
23609 else
23610 ShuffleMask[i] = i + NumElems * ((MaskValue >> i) & 1);
23613 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23614 if (!TLI.isShuffleMaskLegal(ShuffleMask, VT))
23616 return DAG.getVectorShuffle(VT, dl, LHS, RHS, &ShuffleMask[0]);
23619 /// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT
23620 /// nodes.
23621 static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
23622 TargetLowering::DAGCombinerInfo &DCI,
23623 const X86Subtarget *Subtarget) {
23625 SDValue Cond = N->getOperand(0);
23626 // Get the LHS/RHS of the select.
23627 SDValue LHS = N->getOperand(1);
23628 SDValue RHS = N->getOperand(2);
23629 EVT VT = LHS.getValueType();
23630 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23632 // If we have SSE[12] support, try to form min/max nodes. SSE min/max
23633 // instructions match the semantics of the common C idiom x<y?x:y but not
23634 // x<=y?x:y, because of how they handle negative zero (which can be
23635 // ignored in unsafe-math mode).
23636 // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
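// For example (illustrative): X < Y ? X : Y maps directly onto MINSS, but
// X <= Y ? X : Y needs the NaN / signed-zero checks below, since MINSS
// simply returns its second operand when the inputs compare equal or
// unordered.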
23637 if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
23638 VT != MVT::f80 && (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
23639 (Subtarget->hasSSE2() ||
23640 (Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) {
23641 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
23643 unsigned Opcode = 0;
23644 // Check for x CC y ? x : y.
23645 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
23646 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
23650 // Converting this to a min would handle NaNs incorrectly, and swapping
23651 // the operands would cause it to handle comparisons between positive
23652 // and negative zero incorrectly.
23653 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
23654 if (!DAG.getTarget().Options.UnsafeFPMath &&
23655 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
23657 std::swap(LHS, RHS);
23659 Opcode = X86ISD::FMIN;
23662 // Converting this to a min would handle comparisons between positive
23663 // and negative zero incorrectly.
23664 if (!DAG.getTarget().Options.UnsafeFPMath &&
23665 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
23667 Opcode = X86ISD::FMIN;
23670 // Converting this to a min would handle both negative zeros and NaNs
23671 // incorrectly, but we can swap the operands to fix both.
23672 std::swap(LHS, RHS);
23676 Opcode = X86ISD::FMIN;
23680 // Converting this to a max would handle comparisons between positive
23681 // and negative zero incorrectly.
23682 if (!DAG.getTarget().Options.UnsafeFPMath &&
23683 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
23685 Opcode = X86ISD::FMAX;
23688 // Converting this to a max would handle NaNs incorrectly, and swapping
23689 // the operands would cause it to handle comparisons between positive
23690 // and negative zero incorrectly.
23691 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
23692 if (!DAG.getTarget().Options.UnsafeFPMath &&
23693 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
23695 std::swap(LHS, RHS);
23697 Opcode = X86ISD::FMAX;
23700 // Converting this to a max would handle both negative zeros and NaNs
23701 // incorrectly, but we can swap the operands to fix both.
23702 std::swap(LHS, RHS);
23706 Opcode = X86ISD::FMAX;
23709 // Check for x CC y ? y : x -- a min/max with reversed arms.
23710 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
23711 DAG.isEqualTo(RHS, Cond.getOperand(0))) {
23715 // Converting this to a min would handle comparisons between positive
23716 // and negative zero incorrectly, and swapping the operands would
23717 // cause it to handle NaNs incorrectly.
23718 if (!DAG.getTarget().Options.UnsafeFPMath &&
23719 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
23720 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
23722 std::swap(LHS, RHS);
23724 Opcode = X86ISD::FMIN;
23727 // Converting this to a min would handle NaNs incorrectly.
23728 if (!DAG.getTarget().Options.UnsafeFPMath &&
23729 (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
23731 Opcode = X86ISD::FMIN;
23734 // Converting this to a min would handle both negative zeros and NaNs
23735 // incorrectly, but we can swap the operands to fix both.
23736 std::swap(LHS, RHS);
23740 Opcode = X86ISD::FMIN;
23744 // Converting this to a max would handle NaNs incorrectly.
23745 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
23747 Opcode = X86ISD::FMAX;
23750 // Converting this to a max would handle comparisons between positive
23751 // and negative zero incorrectly, and swapping the operands would
23752 // cause it to handle NaNs incorrectly.
23753 if (!DAG.getTarget().Options.UnsafeFPMath &&
23754 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
23755 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
23757 std::swap(LHS, RHS);
23759 Opcode = X86ISD::FMAX;
23762 // Converting this to a max would handle both negative zeros and NaNs
23763 // incorrectly, but we can swap the operands to fix both.
23764 std::swap(LHS, RHS);
23768 Opcode = X86ISD::FMAX;
23774 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
23777 EVT CondVT = Cond.getValueType();
23778 if (Subtarget->hasAVX512() && VT.isVector() && CondVT.isVector() &&
23779 CondVT.getVectorElementType() == MVT::i1) {
23780 // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
23781 // lowering on KNL. In this case we convert it to
23782 // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
23783 // The same situation for all 128 and 256-bit vectors of i8 and i16.
23784 // Since SKX these selects have a proper lowering.
23785 EVT OpVT = LHS.getValueType();
23786 if ((OpVT.is128BitVector() || OpVT.is256BitVector()) &&
23787 (OpVT.getVectorElementType() == MVT::i8 ||
23788 OpVT.getVectorElementType() == MVT::i16) &&
23789 !(Subtarget->hasBWI() && Subtarget->hasVLX())) {
23790 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, OpVT, Cond);
23791 DCI.AddToWorklist(Cond.getNode());
23792 return DAG.getNode(N->getOpcode(), DL, OpVT, Cond, LHS, RHS);
23795 // If this is a select between two integer constants, try to do some
23797 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) {
23798 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS))
23799 // Don't do this for crazy integer types.
23800 if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) {
23801 // If this is efficiently invertible, canonicalize the LHSC/RHSC values
23802 // so that TrueC (the true value) is larger than FalseC.
23803 bool NeedsCondInvert = false;
23805 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
23806 // Efficiently invertible.
23807 (Cond.getOpcode() == ISD::SETCC || // setcc -> invertible.
23808 (Cond.getOpcode() == ISD::XOR && // xor(X, C) -> invertible.
23809 isa<ConstantSDNode>(Cond.getOperand(1))))) {
23810 NeedsCondInvert = true;
23811 std::swap(TrueC, FalseC);
23814 // Optimize C ? 8 : 0 -> zext(C) << 3. Likewise for any pow2/0.
23815 if (FalseC->getAPIntValue() == 0 &&
23816 TrueC->getAPIntValue().isPowerOf2()) {
23817 if (NeedsCondInvert) // Invert the condition if needed.
23818 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
23819 DAG.getConstant(1, Cond.getValueType()));
23821 // Zero extend the condition if needed.
23822 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);
23824 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
23825 return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
23826 DAG.getConstant(ShAmt, MVT::i8));
23829 // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst.
23830 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
23831 if (NeedsCondInvert) // Invert the condition if needed.
23832 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
23833 DAG.getConstant(1, Cond.getValueType()));
23835 // Zero extend the condition if needed.
23836 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
23837 FalseC->getValueType(0), Cond);
23838 return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
23839 SDValue(FalseC, 0));
23842 // Optimize cases that will turn into an LEA instruction. This requires
23843 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
23844 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
23845 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
23846 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
23848 bool isFastMultiplier = false;
23850 switch ((unsigned char)Diff) {
23852 case 1: // result = add base, cond
23853 case 2: // result = lea base( , cond*2)
23854 case 3: // result = lea base(cond, cond*2)
23855 case 4: // result = lea base( , cond*4)
23856 case 5: // result = lea base(cond, cond*4)
23857 case 8: // result = lea base( , cond*8)
23858 case 9: // result = lea base(cond, cond*8)
23859 isFastMultiplier = true;
23864 if (isFastMultiplier) {
23865 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
23866 if (NeedsCondInvert) // Invert the condition if needed.
23867 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
23868 DAG.getConstant(1, Cond.getValueType()));
23870 // Zero extend the condition if needed.
23871 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
23873 // Scale the condition by the difference.
23875 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
23876 DAG.getConstant(Diff, Cond.getValueType()));
23878 // Add the base if non-zero.
23879 if (FalseC->getAPIntValue() != 0)
23880 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
23881 SDValue(FalseC, 0));
23888 // Canonicalize max and min:
23889 // (x > y) ? x : y -> (x >= y) ? x : y
23890 // (x < y) ? x : y -> (x <= y) ? x : y
23891 // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
23892 // the need for an extra compare
23893 // against zero. e.g.
23894 // (x - y) > 0 ? (x - y) : 0 -> (x - y) >= 0 ? (x - y) : 0
23895 //
23896 // x - y > 0:  testl %edi, %edi
23897 //             movl $0, %eax
23898 //             cmovgl %edi, %eax
23899 //
23900 // x - y >= 0: xorl %eax, %eax
23901 //             testl %edi, %edi
23902 //             cmovsl %eax, %edi
23903 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
23904 DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
23905 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
23906 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
23907 switch (CC) {
23908 default: break;
23909 case ISD::SETLT:
23910 case ISD::SETGT: {
23911 ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
23912 Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
23913 Cond.getOperand(0), Cond.getOperand(1), NewCC);
23914 return DAG.getNode(ISD::SELECT, DL, VT, Cond, LHS, RHS);
23919 // Early exit check
23920 if (!TLI.isTypeLegal(VT))
23923 // Match VSELECTs into subs with unsigned saturation.
23924 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
23925 // psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
23926 ((Subtarget->hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
23927 (Subtarget->hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
23928 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
23930 // Check if one of the arms of the VSELECT is a zero vector. If it's on the
23931 // left side invert the predicate to simplify logic below.
23932 SDValue Other;
23933 if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
23934 Other = RHS;
23935 CC = ISD::getSetCCInverse(CC, true);
23936 } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
23937 Other = LHS;
23938 }
23940 if (Other.getNode() && Other->getNumOperands() == 2 &&
23941 DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) {
23942 SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
23943 SDValue CondRHS = Cond->getOperand(1);
23945 // Look for a general sub with unsigned saturation first.
23946 // x >= y ? x-y : 0 --> subus x, y
23947 // x > y ? x-y : 0 --> subus x, y
23948 if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
23949 Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))
23950 return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
23952 if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS))
23953 if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) {
23954 if (auto *CondRHSBV = dyn_cast<BuildVectorSDNode>(CondRHS))
23955 if (auto *CondRHSConst = CondRHSBV->getConstantSplatNode())
23956 // If the RHS is a constant we have to reverse the const
23957 // canonicalization.
23958 // x > C-1 ? x + (-C) : 0 --> subus x, C
23959 if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
23960 CondRHSConst->getAPIntValue() ==
23961 (-OpRHSConst->getAPIntValue() - 1))
23962 return DAG.getNode(
23963 X86ISD::SUBUS, DL, VT, OpLHS,
23964 DAG.getConstant(-OpRHSConst->getAPIntValue(), VT));
23966 // Another special case: If C was a sign bit, the sub has been
23967 // canonicalized into a xor.
23968 // FIXME: Would it be better to use computeKnownBits to determine
23969 // whether it's safe to decanonicalize the xor?
23970 // x s< 0 ? x^C : 0 --> subus x, C
23971 if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
23972 ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
23973 OpRHSConst->getAPIntValue().isSignBit())
23974 // Note that we have to rebuild the RHS constant here to ensure we
23975 // don't rely on particular values of undef lanes.
23976 return DAG.getNode(
23977 X86ISD::SUBUS, DL, VT, OpLHS,
23978 DAG.getConstant(OpRHSConst->getAPIntValue(), VT));
23983 // Try to match a min/max vector operation.
23984 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC) {
23985 std::pair<unsigned, bool> ret = matchIntegerMINMAX(Cond, VT, LHS, RHS, DAG, Subtarget);
23986 unsigned Opc = ret.first;
23987 bool NeedSplit = ret.second;
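// Illustrative: on AVX without AVX2 there is no single 256-bit integer
// min/max instruction, so NeedSplit has us emit two 128-bit ops below and
// reassemble the halves with CONCAT_VECTORS.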
23989 if (Opc && NeedSplit) {
23990 unsigned NumElems = VT.getVectorNumElements();
23991 // Extract the LHS vectors
23992 SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, DL);
23993 SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, DL);
23995 // Extract the RHS vectors
23996 SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, DL);
23997 SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, DL);
23999 // Create min/max for each subvector
24000 LHS = DAG.getNode(Opc, DL, LHS1.getValueType(), LHS1, RHS1);
24001 RHS = DAG.getNode(Opc, DL, LHS2.getValueType(), LHS2, RHS2);
24003 // Merge the result
24004 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LHS, RHS);
24006 return DAG.getNode(Opc, DL, VT, LHS, RHS);
24009 // Simplify vector selection if condition value type matches vselect
24010 // operand type.
24011 if (N->getOpcode() == ISD::VSELECT && CondVT == VT) {
24012 assert(Cond.getValueType().isVector() &&
24013 "vector select expects a vector selector!");
24015 bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
24016 bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
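// The idea (illustrative): viewed in the condition's type,
// (vselect M, -1, 0) is just M, (vselect M, -1, X) is (or M, X), and
// (vselect M, X, 0) is (and M, X).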
24018 // Try to invert the condition if the true value is not all 1s and the false
24019 // value is not all 0s.
24020 if (!TValIsAllOnes && !FValIsAllZeros &&
24021 // Check if the selector will be produced by CMPP*/PCMP*
24022 Cond.getOpcode() == ISD::SETCC &&
24023 // Check if SETCC has already been promoted
24024 TLI.getSetCCResultType(*DAG.getContext(), VT) == CondVT) {
24025 bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
24026 bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
24028 if (TValIsAllZeros || FValIsAllOnes) {
24029 SDValue CC = Cond.getOperand(2);
24030 ISD::CondCode NewCC =
24031 ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
24032 Cond.getOperand(0).getValueType().isInteger());
24033 Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1), NewCC);
24034 std::swap(LHS, RHS);
24035 TValIsAllOnes = FValIsAllOnes;
24036 FValIsAllZeros = TValIsAllZeros;
24040 if (TValIsAllOnes || FValIsAllZeros) {
24042 SDValue Ret;
24043 if (TValIsAllOnes && FValIsAllZeros)
24044 Ret = Cond;
24045 else if (TValIsAllOnes)
24046 Ret = DAG.getNode(ISD::OR, DL, CondVT, Cond,
24047 DAG.getNode(ISD::BITCAST, DL, CondVT, RHS));
24048 else if (FValIsAllZeros)
24049 Ret = DAG.getNode(ISD::AND, DL, CondVT, Cond,
24050 DAG.getNode(ISD::BITCAST, DL, CondVT, LHS));
24052 return DAG.getNode(ISD::BITCAST, DL, VT, Ret);
24056 // If we know that this node is legal then we know that it is going to be
24057 // matched by one of the SSE/AVX BLEND instructions. These instructions only
24058 // depend on the highest bit in each word. Try to use SimplifyDemandedBits
24059 // to simplify previous instructions.
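// For example (illustrative): BLENDVPS reads only the sign bit of each
// condition lane, so any constant feeding Cond can be shrunk to its sign
// bits, which is exactly what the DemandedMask below requests.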
24060 if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
24061 !DCI.isBeforeLegalize() &&
24062 // We explicitly check against v8i16 and v16i16 because, although
24063 // they're marked as Custom, they might only be legal when Cond is a
24064 // build_vector of constants. This will be taken care of in a later
24065 // phase.
24066 (TLI.isOperationLegalOrCustom(ISD::VSELECT, VT) && VT != MVT::v16i16 &&
24067 VT != MVT::v8i16) &&
24068 // Don't optimize vector of constants. Those are handled by
24069 // the generic code and all the bits must be properly set for
24070 // the generic optimizer.
24071 !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
24072 unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits();
24074 // Don't optimize vector selects that map to mask-registers.
24075 if (BitWidth == 1)
24076 return SDValue();
24078 assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
24079 APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1);
24081 APInt KnownZero, KnownOne;
24082 TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
24083 DCI.isBeforeLegalizeOps());
24084 if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) ||
24085 TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne,
24086 TLO)) {
24087 // If we changed the computation somewhere in the DAG, this change
24088 // will affect all users of Cond.
24089 // Make sure it is fine and update all the nodes so that we do not
24090 // use the generic VSELECT anymore. Otherwise, we may perform
24091 // wrong optimizations as we messed up with the actual expectation
24092 // for the vector boolean values.
24093 if (Cond != TLO.Old) {
24094 // Check all uses of that condition operand to check whether it will be
24095 // consumed by non-BLEND instructions, which may depend on all bits being
24096 // set correctly.
24097 for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
24098 I != E; ++I)
24099 if (I->getOpcode() != ISD::VSELECT)
24100 // TODO: Add other opcodes eventually lowered into BLEND.
24101 return SDValue();
24103 // Update all the users of the condition, before committing the change,
24104 // so that the VSELECT optimizations that expect the correct vector
24105 // boolean value will not be triggered.
24106 for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
24107 I != E; ++I)
24108 DAG.ReplaceAllUsesOfValueWith(
24109 SDValue(*I, 0),
24110 DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(*I), I->getValueType(0),
24111 Cond, I->getOperand(1), I->getOperand(2)));
24112 DCI.CommitTargetLoweringOpt(TLO);
24115 // At this point, only Cond is changed. Change the condition
24116 // just for N to keep the opportunity to optimize all other
24117 // users their own way.
24118 DAG.ReplaceAllUsesOfValueWith(
24119 SDValue(N, 0),
24120 DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(N), N->getValueType(0),
24121 TLO.New, N->getOperand(1), N->getOperand(2)));
24126 // We should generate an X86ISD::BLENDI from a vselect if its argument
24127 // is a sign_extend_inreg of an any_extend of a BUILD_VECTOR of
24128 // constants. This specific pattern gets generated when we split a
24129 // selector for a 512 bit vector in a machine without AVX512 (but with
24130 // 256-bit vectors), during legalization:
24132 // (vselect (sign_extend (any_extend (BUILD_VECTOR)) i1) LHS RHS)
24134 // Iff we find this pattern and the build_vectors are built from
24135 // constants, we translate the vselect into a shuffle_vector that we
24136 // know will be matched by LowerVECTOR_SHUFFLEtoBlend.
24137 if ((N->getOpcode() == ISD::VSELECT ||
24138 N->getOpcode() == X86ISD::SHRUNKBLEND) &&
24139 !DCI.isBeforeLegalize()) {
24140 SDValue Shuffle = transformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget);
24141 if (Shuffle.getNode())
24142 return Shuffle;
24148 // Check whether a boolean test is testing a boolean value generated by
24149 // X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
24150 // code.
24152 // Simplify the following patterns:
24153 // (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
24154 // (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
24155 // to (Op EFLAGS Cond)
24157 // (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
24158 // (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
24159 // to (Op EFLAGS !Cond)
24161 // where Op could be BRCOND or CMOV.
24163 static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
24164 // Quit unless Cmp is a CMP, or a SUB whose value result is unused.
24165 if (Cmp.getOpcode() != X86ISD::CMP &&
24166 (Cmp.getOpcode() != X86ISD::SUB || Cmp.getNode()->hasAnyUseOfValue(0)))
24169 // Quit if not used as a boolean value.
24170 if (CC != X86::COND_E && CC != X86::COND_NE)
24173 // Check CMP operands. One of them should be 0 or 1 and the other should be
24174 // a SetCC or extended from it.
24175 SDValue Op1 = Cmp.getOperand(0);
24176 SDValue Op2 = Cmp.getOperand(1);
24178 SDValue SetCC;
24179 const ConstantSDNode *C = nullptr;
24180 bool needOppositeCond = (CC == X86::COND_E);
24181 bool checkAgainstTrue = false; // Is it a comparison against 1?
24183 if ((C = dyn_cast<ConstantSDNode>(Op1)))
24184 SetCC = Op2;
24185 else if ((C = dyn_cast<ConstantSDNode>(Op2)))
24186 SetCC = Op1;
24187 else // Quit if all operands are not constants.
24188 return SDValue();
24190 if (C->getZExtValue() == 1) {
24191 needOppositeCond = !needOppositeCond;
24192 checkAgainstTrue = true;
24193 } else if (C->getZExtValue() != 0)
24194 // Quit if the constant is neither 0 nor 1.
24195 return SDValue();
24197 bool truncatedToBoolWithAnd = false;
24198 // Skip (zext $x), (trunc $x), or (and $x, 1) node.
24199 while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
24200 SetCC.getOpcode() == ISD::TRUNCATE ||
24201 SetCC.getOpcode() == ISD::AND) {
24202 if (SetCC.getOpcode() == ISD::AND) {
24203 int OpIdx = -1;
24204 ConstantSDNode *CS;
24205 if ((CS = dyn_cast<ConstantSDNode>(SetCC.getOperand(0))) &&
24206 CS->getZExtValue() == 1)
24207 OpIdx = 1;
24208 if ((CS = dyn_cast<ConstantSDNode>(SetCC.getOperand(1))) &&
24209 CS->getZExtValue() == 1)
24210 OpIdx = 0;
24211 if (OpIdx == -1)
24212 break;
24213 SetCC = SetCC.getOperand(OpIdx);
24214 truncatedToBoolWithAnd = true;
24215 } else
24216 SetCC = SetCC.getOperand(0);
24217 }
24219 switch (SetCC.getOpcode()) {
24220 case X86ISD::SETCC_CARRY:
24221 // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
24222 // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
24223 // i.e. it's a comparison against true but the result of SETCC_CARRY is not
24224 // truncated to i1 using 'and'.
24225 if (checkAgainstTrue && !truncatedToBoolWithAnd)
24227 assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
24228 "Invalid use of SETCC_CARRY!");
24230 case X86ISD::SETCC:
24231 // Set the condition code or opposite one if necessary.
24232 CC = X86::CondCode(SetCC.getConstantOperandVal(0));
24233 if (needOppositeCond)
24234 CC = X86::GetOppositeBranchCondition(CC);
24235 return SetCC.getOperand(1);
24236 case X86ISD::CMOV: {
24237 // Check whether false/true value has canonical one, i.e. 0 or 1.
24238 ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
24239 ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
24240 // Quit if true value is not a constant.
24241 if (!TVal)
24242 return SDValue();
24243 // Quit if false value is not a constant.
24244 if (!FVal) {
24245 SDValue Op = SetCC.getOperand(0);
24246 // Skip 'zext' or 'trunc' node.
24247 if (Op.getOpcode() == ISD::ZERO_EXTEND ||
24248 Op.getOpcode() == ISD::TRUNCATE)
24249 Op = Op.getOperand(0);
24250 // A special case for rdrand/rdseed, where 0 is set if false cond is
24251 // found.
24252 if ((Op.getOpcode() != X86ISD::RDRAND &&
24253 Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
24256 // Quit if false value is not the constant 0 or 1.
24257 bool FValIsFalse = true;
24258 if (FVal && FVal->getZExtValue() != 0) {
24259 if (FVal->getZExtValue() != 1)
24261 // If FVal is 1, opposite cond is needed.
24262 needOppositeCond = !needOppositeCond;
24263 FValIsFalse = false;
24265 // Quit if TVal is not the constant opposite of FVal.
24266 if (FValIsFalse && TVal->getZExtValue() != 1)
24268 if (!FValIsFalse && TVal->getZExtValue() != 0)
24270 CC = X86::CondCode(SetCC.getConstantOperandVal(2));
24271 if (needOppositeCond)
24272 CC = X86::GetOppositeBranchCondition(CC);
24273 return SetCC.getOperand(3);
24280 /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
24281 static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
24282 TargetLowering::DAGCombinerInfo &DCI,
24283 const X86Subtarget *Subtarget) {
24286 // If the flag operand isn't dead, don't touch this CMOV.
24287 if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
24290 SDValue FalseOp = N->getOperand(0);
24291 SDValue TrueOp = N->getOperand(1);
24292 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
24293 SDValue Cond = N->getOperand(3);
24295 if (CC == X86::COND_E || CC == X86::COND_NE) {
24296 switch (Cond.getOpcode()) {
24297 default: break;
24298 case X86ISD::BSR:
24299 case X86ISD::BSF:
24300 // If the operand of BSR / BSF is proven never zero, then ZF cannot be set.
24301 if (DAG.isKnownNeverZero(Cond.getOperand(0)))
24302 return (CC == X86::COND_E) ? FalseOp : TrueOp;
24303 }
24304 }
24308 SDValue Flags = checkBoolTestSetCCCombine(Cond, CC);
24309 if (Flags.getNode() &&
24310 // Extra check as FCMOV only supports a subset of X86 cond.
24311 (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC))) {
24312 SDValue Ops[] = { FalseOp, TrueOp,
24313 DAG.getConstant(CC, MVT::i8), Flags };
24314 return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
24317 // If this is a select between two integer constants, try to do some
24318 // optimizations. Note that the operands are ordered the opposite of SELECT
24319 // operands.
24320 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
24321 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
24322 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
24323 // larger than FalseC (the false value).
24324 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
24325 CC = X86::GetOppositeBranchCondition(CC);
24326 std::swap(TrueC, FalseC);
24327 std::swap(TrueOp, FalseOp);
24330 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
24331 // This is efficient for any integer data type (including i8/i16) and
24332 // shift amount.
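// For example (illustrative): a cmov selecting between 8 and 0 becomes
// SETcc + ZERO_EXTEND + (SHL cond, 3), avoiding the conditional move.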
24333 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
24334 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
24335 DAG.getConstant(CC, MVT::i8), Cond);
24337 // Zero extend the condition if needed.
24338 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
24340 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
24341 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
24342 DAG.getConstant(ShAmt, MVT::i8));
24343 if (N->getNumValues() == 2) // Dead flag value?
24344 return DCI.CombineTo(N, Cond, SDValue());
24348 // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst. This is efficient
24349 // for any integer data type, including i8/i16.
24350 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
24351 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
24352 DAG.getConstant(CC, MVT::i8), Cond);
24354 // Zero extend the condition if needed.
24355 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
24356 FalseC->getValueType(0), Cond);
24357 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
24358 SDValue(FalseC, 0));
24360 if (N->getNumValues() == 2) // Dead flag value?
24361 return DCI.CombineTo(N, Cond, SDValue());
24365 // Optimize cases that will turn into an LEA instruction. This requires
24366 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
24367 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
24368 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
24369 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
24371 bool isFastMultiplier = false;
24373 switch ((unsigned char)Diff) {
24375 case 1: // result = add base, cond
24376 case 2: // result = lea base( , cond*2)
24377 case 3: // result = lea base(cond, cond*2)
24378 case 4: // result = lea base( , cond*4)
24379 case 5: // result = lea base(cond, cond*4)
24380 case 8: // result = lea base( , cond*8)
24381 case 9: // result = lea base(cond, cond*8)
24382 isFastMultiplier = true;
24387 if (isFastMultiplier) {
24388 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
24389 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
24390 DAG.getConstant(CC, MVT::i8), Cond);
24391 // Zero extend the condition if needed.
24392 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
24394 // Scale the condition by the difference.
24396 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
24397 DAG.getConstant(Diff, Cond.getValueType()));
24399 // Add the base if non-zero.
24400 if (FalseC->getAPIntValue() != 0)
24401 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
24402 SDValue(FalseC, 0));
24403 if (N->getNumValues() == 2) // Dead flag value?
24404 return DCI.CombineTo(N, Cond, SDValue());
24411 // Handle these cases:
24412 // (select (x != c), e, c) -> select (x != c), e, x),
24413 // (select (x == c), c, e) -> select (x == c), x, e)
24414 // where the c is an integer constant, and the "select" is the combination
24415 // of CMOV and CMP.
24417 // The rationale for this change is that the conditional-move from a constant
24418 // needs two instructions, however, conditional-move from a register needs
24419 // only one instruction.
24421 // CAVEAT: By replacing a constant with a symbolic value, it may obscure
24422 // some instruction-combining opportunities. This opt needs to be
24423 // postponed as late as possible.
24425 if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
24426 // The DCI.xxxx conditions are provided to postpone the optimization as
24427 // late as possible.
24429 ConstantSDNode *CmpAgainst = nullptr;
24430 if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
24431 (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
24432 !isa<ConstantSDNode>(Cond.getOperand(0))) {
24434 if (CC == X86::COND_NE &&
24435 CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
24436 CC = X86::GetOppositeBranchCondition(CC);
24437 std::swap(TrueOp, FalseOp);
24440 if (CC == X86::COND_E &&
24441 CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
24442 SDValue Ops[] = { FalseOp, Cond.getOperand(0),
24443 DAG.getConstant(CC, MVT::i8), Cond };
24444 return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
24452 static SDValue PerformINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG,
24453 const X86Subtarget *Subtarget) {
24454 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
24455 switch (IntNo) {
24456 default: return SDValue();
24457 // SSE/AVX/AVX2 blend intrinsics.
24458 case Intrinsic::x86_avx2_pblendvb:
24459 case Intrinsic::x86_avx2_pblendw:
24460 case Intrinsic::x86_avx2_pblendd_128:
24461 case Intrinsic::x86_avx2_pblendd_256:
24462 // Don't try to simplify this intrinsic if we don't have AVX2.
24463 if (!Subtarget->hasAVX2())
24466 case Intrinsic::x86_avx_blend_pd_256:
24467 case Intrinsic::x86_avx_blend_ps_256:
24468 case Intrinsic::x86_avx_blendv_pd_256:
24469 case Intrinsic::x86_avx_blendv_ps_256:
24470 // Don't try to simplify this intrinsic if we don't have AVX.
24471 if (!Subtarget->hasAVX())
24474 case Intrinsic::x86_sse41_pblendw:
24475 case Intrinsic::x86_sse41_blendpd:
24476 case Intrinsic::x86_sse41_blendps:
24477 case Intrinsic::x86_sse41_blendvps:
24478 case Intrinsic::x86_sse41_blendvpd:
24479 case Intrinsic::x86_sse41_pblendvb: {
24480 SDValue Op0 = N->getOperand(1);
24481 SDValue Op1 = N->getOperand(2);
24482 SDValue Mask = N->getOperand(3);
24484 // Don't try to simplify this intrinsic if we don't have SSE4.1.
24485 if (!Subtarget->hasSSE41())
24488 // fold (blend A, A, Mask) -> A
24489 if (Op0 == Op1)
24490 return Op0;
24491 // fold (blend A, B, allZeros) -> A
24492 if (ISD::isBuildVectorAllZeros(Mask.getNode()))
24493 return Op0;
24494 // fold (blend A, B, allOnes) -> B
24495 if (ISD::isBuildVectorAllOnes(Mask.getNode()))
24496 return Op1;
24498 // Simplify the case where the mask is a constant i32 value.
24499 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Mask)) {
24500 if (C->isNullValue())
24501 return Op0;
24502 if (C->isAllOnesValue())
24503 return Op1;
24504 }
24509 // Packed SSE2/AVX2 arithmetic shift immediate intrinsics.
24510 case Intrinsic::x86_sse2_psrai_w:
24511 case Intrinsic::x86_sse2_psrai_d:
24512 case Intrinsic::x86_avx2_psrai_w:
24513 case Intrinsic::x86_avx2_psrai_d:
24514 case Intrinsic::x86_sse2_psra_w:
24515 case Intrinsic::x86_sse2_psra_d:
24516 case Intrinsic::x86_avx2_psra_w:
24517 case Intrinsic::x86_avx2_psra_d: {
24518 SDValue Op0 = N->getOperand(1);
24519 SDValue Op1 = N->getOperand(2);
24520 EVT VT = Op0.getValueType();
24521 assert(VT.isVector() && "Expected a vector type!");
24523 if (isa<BuildVectorSDNode>(Op1))
24524 Op1 = Op1.getOperand(0);
24526 if (!isa<ConstantSDNode>(Op1))
24529 EVT SVT = VT.getVectorElementType();
24530 unsigned SVTBits = SVT.getSizeInBits();
24532 ConstantSDNode *CND = cast<ConstantSDNode>(Op1);
24533 const APInt &C = APInt(SVTBits, CND->getAPIntValue().getZExtValue());
24534 uint64_t ShAmt = C.getZExtValue();
24536 // Don't try to convert this shift into an ISD::SRA if the shift
24537 // count is bigger than or equal to the element size.
24538 if (ShAmt >= SVTBits)
24539 return SDValue();
24541 // Trivial case: if the shift count is zero, then fold this
24542 // into the first operand.
24543 if (ShAmt == 0)
24544 return Op0;
24546 // Replace this packed shift intrinsic with a target independent
24547 // shift dag node.
24548 SDValue Splat = DAG.getConstant(C, VT);
24549 return DAG.getNode(ISD::SRA, SDLoc(N), VT, Op0, Splat);
24554 /// PerformMulCombine - Optimize a single multiply with constant into two
24555 /// in order to implement it with two cheaper instructions, e.g.
24556 /// LEA + SHL, LEA + LEA.
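///
/// For example (illustrative): mul X, 45 becomes (X*9)*5, i.e. two LEAs,
/// while mul X, 40 becomes (X*5) << 3, i.e. an LEA plus a SHL.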
24557 static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
24558 TargetLowering::DAGCombinerInfo &DCI) {
24559 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
24562 EVT VT = N->getValueType(0);
24563 if (VT != MVT::i64 && VT != MVT::i32)
24566 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
24569 uint64_t MulAmt = C->getZExtValue();
24570 if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
24573 uint64_t MulAmt1 = 0;
24574 uint64_t MulAmt2 = 0;
24575 if ((MulAmt % 9) == 0) {
24576 MulAmt1 = 9;
24577 MulAmt2 = MulAmt / 9;
24578 } else if ((MulAmt % 5) == 0) {
24579 MulAmt1 = 5;
24580 MulAmt2 = MulAmt / 5;
24581 } else if ((MulAmt % 3) == 0) {
24582 MulAmt1 = 3;
24583 MulAmt2 = MulAmt / 3;
24584 }
24585 if (MulAmt2 &&
24586 (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)) {
24588 SDLoc DL(N);
24589 if (isPowerOf2_64(MulAmt2) &&
24590 !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
24591 // If the second multiplier is pow2, issue it first. We want the multiply
24592 // by 3, 5, or 9 to be folded into the addressing mode unless the lone use
24593 // is an add.
24594 std::swap(MulAmt1, MulAmt2);
24596 SDValue NewMul;
24597 if (isPowerOf2_64(MulAmt1))
24598 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
24599 DAG.getConstant(Log2_64(MulAmt1), MVT::i8));
24600 else
24601 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
24602 DAG.getConstant(MulAmt1, VT));
24604 if (isPowerOf2_64(MulAmt2))
24605 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
24606 DAG.getConstant(Log2_64(MulAmt2), MVT::i8));
24607 else
24608 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
24609 DAG.getConstant(MulAmt2, VT));
24611 // Do not add new nodes to DAG combiner worklist.
24612 DCI.CombineTo(N, NewMul, false);
24617 static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
24618 SDValue N0 = N->getOperand(0);
24619 SDValue N1 = N->getOperand(1);
24620 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
24621 EVT VT = N0.getValueType();
24623 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
24624 // since the result of setcc_c is all zero's or all ones.
24625 if (VT.isInteger() && !VT.isVector() &&
24626 N1C && N0.getOpcode() == ISD::AND &&
24627 N0.getOperand(1).getOpcode() == ISD::Constant) {
24628 SDValue N00 = N0.getOperand(0);
24629 if (N00.getOpcode() == X86ISD::SETCC_CARRY ||
24630 ((N00.getOpcode() == ISD::ANY_EXTEND ||
24631 N00.getOpcode() == ISD::ZERO_EXTEND) &&
24632 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) {
24633 APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
24634 APInt ShAmt = N1C->getAPIntValue();
24635 Mask = Mask.shl(ShAmt);
24637 return DAG.getNode(ISD::AND, SDLoc(N), VT,
24638 N00, DAG.getConstant(Mask, VT));
24642 // Hardware support for vector shifts is sparse which makes us scalarize the
24643 // vector operations in many cases. Also, on Sandy Bridge ADD is faster than
24644 // SHL:
24645 // (shl V, 1) -> add V,V
24646 if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
24647 if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
24648 assert(N0.getValueType().isVector() && "Invalid vector shift type");
24649 // We shift all of the values by one. In many cases we do not have
24650 // hardware support for this operation. This is better expressed as an ADD
24651 // of two values.
24652 if (N1SplatC->getZExtValue() == 1)
24653 return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
24659 /// \brief Returns a vector of 0s if the node in input is a vector logical
24660 /// shift by a constant amount which is known to be bigger than or equal
24661 /// to the vector element size in bits.
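///
/// For example (illustrative): (srl (v4i32 X), (splat 32)) folds to the
/// all-zeros vector, since each 32-bit lane is shifted out entirely.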
24662 static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
24663 const X86Subtarget *Subtarget) {
24664 EVT VT = N->getValueType(0);
24666 if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 &&
24667 (!Subtarget->hasInt256() ||
24668 (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16)))
24671 SDValue Amt = N->getOperand(1);
24673 if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Amt))
24674 if (auto *AmtSplat = AmtBV->getConstantSplatNode()) {
24675 APInt ShiftAmt = AmtSplat->getAPIntValue();
24676 unsigned MaxAmount = VT.getVectorElementType().getSizeInBits();
24678 // SSE2/AVX2 logical shifts always return a vector of 0s
24679 // if the shift amount is bigger than or equal to
24680 // the element size. The constant shift amount will be
24681 // encoded as an 8-bit immediate.
24682 if (ShiftAmt.trunc(8).uge(MaxAmount))
24683 return getZeroVector(VT, Subtarget, DAG, DL);
24689 /// PerformShiftCombine - Combine shifts.
24690 static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
24691 TargetLowering::DAGCombinerInfo &DCI,
24692 const X86Subtarget *Subtarget) {
24693 if (N->getOpcode() == ISD::SHL) {
24694 SDValue V = PerformSHLCombine(N, DAG);
24695 if (V.getNode()) return V;
24698 if (N->getOpcode() != ISD::SRA) {
24699 // Try to fold this logical shift into a zero vector.
24700 SDValue V = performShiftToAllZeros(N, DAG, Subtarget);
24701 if (V.getNode()) return V;
24707 // CMPEQCombine - Recognize the distinctive (AND (setcc ...) (setcc ..))
24708 // where both setccs reference the same FP CMP, and rewrite for CMPEQSS
24709 // and friends. Likewise for OR -> CMPNEQSS.
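// A sketch of the rewrite (illustrative):
//   (and (setcc_e (cmp X, Y)), (setcc_np (cmp X, Y)))
// becomes (FSETCC eq X, Y), whose all-ones/all-zeros lane is then masked
// down to a single bit of truth, replacing two EFLAGS reads with one
// CMPEQSS-style compare.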
24710 static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG,
24711 TargetLowering::DAGCombinerInfo &DCI,
24712 const X86Subtarget *Subtarget) {
24713 unsigned opcode;
24715 // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
24716 // we're requiring SSE2 for both.
24717 if (Subtarget->hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
24718 SDValue N0 = N->getOperand(0);
24719 SDValue N1 = N->getOperand(1);
24720 SDValue CMP0 = N0->getOperand(1);
24721 SDValue CMP1 = N1->getOperand(1);
24724 // The SETCCs should both refer to the same CMP.
24725 if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
24728 SDValue CMP00 = CMP0->getOperand(0);
24729 SDValue CMP01 = CMP0->getOperand(1);
24730 EVT VT = CMP00.getValueType();
24732 if (VT == MVT::f32 || VT == MVT::f64) {
24733 bool ExpectingFlags = false;
24734 // Check for any users that want flags:
24735 for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
24736 !ExpectingFlags && UI != UE; ++UI)
24737 switch (UI->getOpcode()) {
24738 default:
24739 case ISD::BR_CC:
24740 case ISD::BRCOND:
24741 case ISD::SELECT:
24742 ExpectingFlags = true;
24743 break;
24744 case ISD::CopyToReg:
24745 case ISD::SIGN_EXTEND:
24746 case ISD::ZERO_EXTEND:
24747 case ISD::ANY_EXTEND:
24748 break;
24749 }
24751 if (!ExpectingFlags) {
24752 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
24753 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
24755 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
24756 X86::CondCode tmp = cc0;
24757 cc0 = cc1;
24758 cc1 = tmp;
24759 }
24761 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
24762 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
24763 // FIXME: need symbolic constants for these magic numbers.
24764 // See X86ATTInstPrinter.cpp:printSSECC().
24765 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
24766 if (Subtarget->hasAVX512()) {
24767 SDValue FSetCC = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CMP00,
24768 CMP01, DAG.getConstant(x86cc, MVT::i8));
24769 if (N->getValueType(0) != MVT::i1)
24770 return DAG.getNode(ISD::ZERO_EXTEND, DL, N->getValueType(0),
24771 FSetCC);
24772 return FSetCC;
24773 }
24774 SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
24775 CMP00.getValueType(), CMP00, CMP01,
24776 DAG.getConstant(x86cc, MVT::i8));
24778 bool is64BitFP = (CMP00.getValueType() == MVT::f64);
24779 MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
24781 if (is64BitFP && !Subtarget->is64Bit()) {
24782 // On a 32-bit target, we cannot bitcast the 64-bit float to a
24783 // 64-bit integer, since that's not a legal type. Since
24784 // OnesOrZeroesF is all ones or all zeroes, we don't need all the
24785 // bits, but can do this little dance to extract the lowest 32 bits
24786 // and work with those going forward.
24787 SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
24788 OnesOrZeroesF);
24789 SDValue Vector32 = DAG.getNode(ISD::BITCAST, DL, MVT::v4f32,
24790 Vector64);
24791 OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
24792 Vector32, DAG.getIntPtrConstant(0));
24793 IntVT = MVT::i32;
24794 }
24796 SDValue OnesOrZeroesI = DAG.getNode(ISD::BITCAST, DL, IntVT, OnesOrZeroesF);
24797 SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
24798 DAG.getConstant(1, IntVT));
24799 SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ANDed);
24800 return OneBitOfTruth;
24808 /// CanFoldXORWithAllOnes - Test whether the XOR operand is an AllOnes vector
24809 /// so it can be folded inside ANDNP.
24810 static bool CanFoldXORWithAllOnes(const SDNode *N) {
24811 EVT VT = N->getValueType(0);
24813 // Match direct AllOnes for 128 and 256-bit vectors
24814 if (ISD::isBuildVectorAllOnes(N))
24815 return true;
24817 // Look through a bit convert.
24818 if (N->getOpcode() == ISD::BITCAST)
24819 N = N->getOperand(0).getNode();
24821 // Sometimes the operand may come from an insert_subvector building a 256-bit
24822 // allones vector.
24823 if (VT.is256BitVector() &&
24824 N->getOpcode() == ISD::INSERT_SUBVECTOR) {
24825 SDValue V1 = N->getOperand(0);
24826 SDValue V2 = N->getOperand(1);
24828 if (V1.getOpcode() == ISD::INSERT_SUBVECTOR &&
24829 V1.getOperand(0).getOpcode() == ISD::UNDEF &&
24830 ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) &&
24831 ISD::isBuildVectorAllOnes(V2.getNode()))
24832 return true;
24833 }
24835 return false;
24836 }
24838 // On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
24839 // register. In most cases we actually compare or select YMM-sized registers
24840 // and mixing the two types creates horrible code. This method optimizes
24841 // some of the transition sequences.
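// For example (illustrative): (v8i32 zext (v8i16 and (trunc A), (trunc B)))
// can instead AND the original wide v8i32 values and mask the result,
// keeping the computation in YMM-sized registers throughout.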
24842 static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
24843 TargetLowering::DAGCombinerInfo &DCI,
24844 const X86Subtarget *Subtarget) {
24845 EVT VT = N->getValueType(0);
24846 if (!VT.is256BitVector())
24849 assert((N->getOpcode() == ISD::ANY_EXTEND ||
24850 N->getOpcode() == ISD::ZERO_EXTEND ||
24851 N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
24853 SDValue Narrow = N->getOperand(0);
24854 EVT NarrowVT = Narrow->getValueType(0);
24855 if (!NarrowVT.is128BitVector())
24858 if (Narrow->getOpcode() != ISD::XOR &&
24859 Narrow->getOpcode() != ISD::AND &&
24860 Narrow->getOpcode() != ISD::OR)
24863 SDValue N0 = Narrow->getOperand(0);
24864 SDValue N1 = Narrow->getOperand(1);
24867 // The left side has to be a trunc.
24868 if (N0.getOpcode() != ISD::TRUNCATE)
24871 // The type of the truncated inputs.
24872 EVT WideVT = N0->getOperand(0)->getValueType(0);
24876 // The right side has to be a 'trunc' or a constant vector.
24877 bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE;
24878 ConstantSDNode *RHSConstSplat = nullptr;
24879 if (auto *RHSBV = dyn_cast<BuildVectorSDNode>(N1))
24880 RHSConstSplat = RHSBV->getConstantSplatNode();
24881 if (!RHSTrunc && !RHSConstSplat)
24884 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24886 if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), WideVT))
24889 // Set N0 and N1 to hold the inputs to the new wide operation.
24890 N0 = N0->getOperand(0);
24891 if (RHSConstSplat) {
24892 N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getScalarType(),
24893 SDValue(RHSConstSplat, 0));
24894 SmallVector<SDValue, 8> C(WideVT.getVectorNumElements(), N1);
24895 N1 = DAG.getNode(ISD::BUILD_VECTOR, DL, WideVT, C);
24896 } else if (RHSTrunc) {
24897 N1 = N1->getOperand(0);
24900 // Generate the wide operation.
24901 SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, WideVT, N0, N1);
24902 unsigned Opcode = N->getOpcode();
24903 switch (Opcode) {
24904 case ISD::ANY_EXTEND:
24905 return Op;
24906 case ISD::ZERO_EXTEND: {
24907 unsigned InBits = NarrowVT.getScalarType().getSizeInBits();
24908 APInt Mask = APInt::getAllOnesValue(InBits);
24909 Mask = Mask.zext(VT.getScalarType().getSizeInBits());
24910 return DAG.getNode(ISD::AND, DL, VT,
24911 Op, DAG.getConstant(Mask, VT));
24913 case ISD::SIGN_EXTEND:
24914 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
24915 Op, DAG.getValueType(NarrowVT));
24917 llvm_unreachable("Unexpected opcode");
24921 static SDValue VectorZextCombine(SDNode *N, SelectionDAG &DAG,
24922 TargetLowering::DAGCombinerInfo &DCI,
24923 const X86Subtarget *Subtarget) {
24924 SDValue N0 = N->getOperand(0);
24925 SDValue N1 = N->getOperand(1);
24928 // A vector zext_in_reg may be represented as a shuffle,
24929 // feeding into a bitcast (this represents anyext) feeding into
24930 // an and with a mask.
24931 // We'd like to try to combine that into a shuffle with zero
24932 // plus a bitcast, removing the and.
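// For example (illustrative): a v4i8 -> v4i32 zext may appear as
//   (and (v4i32 bitcast (v16i8 shuffle X, undef, <0,u,u,u,1,u,u,u,...>)),
//        (splat 0xFF))
// and is rebuilt below as the same shuffle with the undef lanes taken from
// a zero vector, which makes the and redundant.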
24933 if (N0.getOpcode() != ISD::BITCAST ||
24934 N0.getOperand(0).getOpcode() != ISD::VECTOR_SHUFFLE)
24937 // The other side of the AND should be a splat of 2^C, where C
24938 // is the number of bits in the source type.
24939 if (N1.getOpcode() == ISD::BITCAST)
24940 N1 = N1.getOperand(0);
24941 if (N1.getOpcode() != ISD::BUILD_VECTOR)
24943 BuildVectorSDNode *Vector = cast<BuildVectorSDNode>(N1);
24945 ShuffleVectorSDNode *Shuffle = cast<ShuffleVectorSDNode>(N0.getOperand(0));
24946 EVT SrcType = Shuffle->getValueType(0);
24948 // We expect a single-source shuffle
24949 if (Shuffle->getOperand(1)->getOpcode() != ISD::UNDEF)
24952 unsigned SrcSize = SrcType.getScalarSizeInBits();
24954 APInt SplatValue, SplatUndef;
24955 unsigned SplatBitSize;
24956 bool HasAnyUndefs;
24957 if (!Vector->isConstantSplat(SplatValue, SplatUndef,
24958 SplatBitSize, HasAnyUndefs))
24961 unsigned ResSize = N1.getValueType().getScalarSizeInBits();
24962 // Make sure the splat matches the mask we expect
24963 if (SplatBitSize > ResSize ||
24964 (SplatValue + 1).exactLogBase2() != (int)SrcSize)
24967 // Make sure the input and output size make sense
24968 if (SrcSize >= ResSize || ResSize % SrcSize)
24971 // We expect a shuffle of the form <0, u, u, u, 1, u, u, u...>
24972 // The number of u's between each two values depends on the ratio between
24973 // the source and dest type.
24974 unsigned ZextRatio = ResSize / SrcSize;
24975 bool IsZext = true;
24976 for (unsigned i = 0; i < SrcType.getVectorNumElements(); ++i) {
24977 if (i % ZextRatio) {
24978 if (Shuffle->getMaskElt(i) > 0) {
24984 if (Shuffle->getMaskElt(i) != (int)(i / ZextRatio)) {
24985 // Expected element number
24995 // Ok, perform the transformation - replace the shuffle with
24996 // a shuffle of the form <0, k, k, k, 1, k, k, k> with zero
24997 // (instead of undef) where the k elements come from the zero vector.
24998 SmallVector<int, 8> Mask;
24999 unsigned NumElems = SrcType.getVectorNumElements();
25000 for (unsigned i = 0; i < NumElems; ++i)
25002 Mask.push_back(NumElems);
25004 Mask.push_back(i / ZextRatio);
25006 SDValue NewShuffle = DAG.getVectorShuffle(Shuffle->getValueType(0), DL,
25007 Shuffle->getOperand(0), DAG.getConstant(0, SrcType), Mask);
25008 return DAG.getNode(ISD::BITCAST, DL, N0.getValueType(), NewShuffle);
25011 static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
25012 TargetLowering::DAGCombinerInfo &DCI,
25013 const X86Subtarget *Subtarget) {
25014 if (DCI.isBeforeLegalizeOps())
25017 SDValue Zext = VectorZextCombine(N, DAG, DCI, Subtarget);
25018 if (Zext.getNode())
25021 SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget);
25025 EVT VT = N->getValueType(0);
25026 SDValue N0 = N->getOperand(0);
25027 SDValue N1 = N->getOperand(1);
25030 // Create BEXTR instructions
25031 // BEXTR is ((X >> imm) & (2**size-1))
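// For example (illustrative): (and (srl X, 4), 0xFFF) extracts a 12-bit
// field starting at bit 4, so it becomes BEXTR with the control value
// (12 << 8) | 4.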
25032 if (VT == MVT::i32 || VT == MVT::i64) {
25033 // Check for BEXTR.
25034 if ((Subtarget->hasBMI() || Subtarget->hasTBM()) &&
25035 (N0.getOpcode() == ISD::SRA || N0.getOpcode() == ISD::SRL)) {
25036 ConstantSDNode *MaskNode = dyn_cast<ConstantSDNode>(N1);
25037 ConstantSDNode *ShiftNode = dyn_cast<ConstantSDNode>(N0.getOperand(1));
25038 if (MaskNode && ShiftNode) {
25039 uint64_t Mask = MaskNode->getZExtValue();
25040 uint64_t Shift = ShiftNode->getZExtValue();
25041 if (isMask_64(Mask)) {
25042 uint64_t MaskSize = countPopulation(Mask);
25043 if (Shift + MaskSize <= VT.getSizeInBits())
25044 return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0),
25045 DAG.getConstant(Shift | (MaskSize << 8), VT));
25053 // Want to form ANDNP nodes:
25054 // 1) In the hopes of then easily combining them with OR and AND nodes
25055 // to form PBLEND/PSIGN.
25056 // 2) To match ANDN packed intrinsics
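// For example (illustrative): (and (xor M, all-ones), X) is rewritten by
// the checks below to (X86ISD::ANDNP M, X), with either operand order.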
  if (VT != MVT::v2i64 && VT != MVT::v4i64)
    return SDValue();

  // Check LHS for vnot
  if (N0.getOpcode() == ISD::XOR &&
      //ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
      CanFoldXORWithAllOnes(N0.getOperand(1).getNode()))
    return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1);

  // Check RHS for vnot
  if (N1.getOpcode() == ISD::XOR &&
      //ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
      CanFoldXORWithAllOnes(N1.getOperand(1).getNode()))
    return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0);

  return SDValue();
}

static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
                                TargetLowering::DAGCombinerInfo &DCI,
                                const X86Subtarget *Subtarget) {
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget);
  if (R.getNode())
    return R;

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N->getValueType(0);

  // look for psign/blend
  if (VT == MVT::v2i64 || VT == MVT::v4i64) {
    if (!Subtarget->hasSSSE3() ||
        (VT == MVT::v4i64 && !Subtarget->hasInt256()))
      return SDValue();

    // Canonicalize pandn to RHS
    if (N0.getOpcode() == X86ISD::ANDNP)
      std::swap(N0, N1);
    // or (and (m, y), (pandn m, x))
    if (N0.getOpcode() == ISD::AND && N1.getOpcode() == X86ISD::ANDNP) {
      SDValue Mask = N1.getOperand(0);
      SDValue X = N1.getOperand(1);
      SDValue Y;
      if (N0.getOperand(0) == Mask)
        Y = N0.getOperand(1);
      if (N0.getOperand(1) == Mask)
        Y = N0.getOperand(0);

      // Check to see if the mask appeared in both the AND and ANDNP; if it
      // did not, this is not a blend pattern we can handle.
      if (!Y.getNode())
        return SDValue();

      // Validate that X, Y, and Mask are BIT_CONVERTS, and see through them.
      // Look through mask bitcast.
      if (Mask.getOpcode() == ISD::BITCAST)
        Mask = Mask.getOperand(0);
      if (X.getOpcode() == ISD::BITCAST)
        X = X.getOperand(0);
      if (Y.getOpcode() == ISD::BITCAST)
        Y = Y.getOperand(0);

      EVT MaskVT = Mask.getValueType();

      // Validate that the Mask operand is a vector sra node.
      // FIXME: what to do for bytes, since there is a psignb/pblendvb, but
      // there is no psrai.b
      unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits();
      unsigned SraAmt = ~0;
      if (Mask.getOpcode() == ISD::SRA) {
        if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Mask.getOperand(1)))
          if (auto *AmtConst = AmtBV->getConstantSplatNode())
            SraAmt = AmtConst->getZExtValue();
      } else if (Mask.getOpcode() == X86ISD::VSRAI) {
        SDValue SraC = Mask.getOperand(1);
        SraAmt = cast<ConstantSDNode>(SraC)->getZExtValue();
      }
      if ((SraAmt + 1) != EltBits)
        return SDValue();

      SDLoc DL(N);

      // Now we know we at least have a pblendvb with the mask val. See if
      // we can form a psignb/w/d.
      // psign = x.type == y.type == mask.type && y = sub(0, x);
      if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X &&
          ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) &&
          X.getValueType() == MaskVT && Y.getValueType() == MaskVT) {
        assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
               "Unsupported VT for PSIGN");
        Mask = DAG.getNode(X86ISD::PSIGN, DL, MaskVT, X, Mask.getOperand(0));
        return DAG.getNode(ISD::BITCAST, DL, VT, Mask);
      }
      // PBLENDVB is only available on SSE 4.1.
      if (!Subtarget->hasSSE41())
        return SDValue();

      EVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;

      X = DAG.getNode(ISD::BITCAST, DL, BlendVT, X);
      Y = DAG.getNode(ISD::BITCAST, DL, BlendVT, Y);
      Mask = DAG.getNode(ISD::BITCAST, DL, BlendVT, Mask);
      Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X);
      return DAG.getNode(ISD::BITCAST, DL, VT, Mask);
    }
  }

  if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
    return SDValue();

  // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
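  // e.g. for i32, (or (shl x, 8), (srl y, 24)) becomes (shld x, y, 8).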
  MachineFunction &MF = DAG.getMachineFunction();
  bool OptForSize =
      MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize);

  // SHLD/SHRD instructions have lower register pressure, but on some
  // platforms they have higher latency than the equivalent
  // series of shifts and ors that would otherwise be generated.
  // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
  // have higher latencies and we are not optimizing for size.
  if (!OptForSize && Subtarget->isSHLDSlow())
    return SDValue();

  if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
    std::swap(N0, N1);
  if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
    return SDValue();
  if (!N0.hasOneUse() || !N1.hasOneUse())
    return SDValue();

  SDValue ShAmt0 = N0.getOperand(1);
  if (ShAmt0.getValueType() != MVT::i8)
    return SDValue();
  SDValue ShAmt1 = N1.getOperand(1);
  if (ShAmt1.getValueType() != MVT::i8)
    return SDValue();
  if (ShAmt0.getOpcode() == ISD::TRUNCATE)
    ShAmt0 = ShAmt0.getOperand(0);
  if (ShAmt1.getOpcode() == ISD::TRUNCATE)
    ShAmt1 = ShAmt1.getOperand(0);

  SDLoc DL(N);
  unsigned Opc = X86ISD::SHLD;
  SDValue Op0 = N0.getOperand(0);
  SDValue Op1 = N1.getOperand(0);
  if (ShAmt0.getOpcode() == ISD::SUB) {
    Opc = X86ISD::SHRD;
    std::swap(Op0, Op1);
    std::swap(ShAmt0, ShAmt1);
  }

  unsigned Bits = VT.getSizeInBits();
  if (ShAmt1.getOpcode() == ISD::SUB) {
    SDValue Sum = ShAmt1.getOperand(0);
    if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
      SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
      if (ShAmt1Op1.getNode()->getOpcode() == ISD::TRUNCATE)
        ShAmt1Op1 = ShAmt1Op1.getOperand(0);
      if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
        return DAG.getNode(Opc, DL, VT,
                           Op0, Op1,
                           DAG.getNode(ISD::TRUNCATE, DL,
                                       MVT::i8, ShAmt0));
    }
  } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
    ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
    if (ShAmt0C &&
        ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == Bits)
      return DAG.getNode(Opc, DL, VT,
                         N0.getOperand(0), N1.getOperand(0),
                         DAG.getNode(ISD::TRUNCATE, DL,
                                     MVT::i8, ShAmt0));
  }

  return SDValue();
}

// Generate NEG and CMOV for integer abs.
static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
  EVT VT = N->getValueType(0);

  // Since X86 does not have CMOV for 8-bit integer, we don't convert
  // 8-bit integer abs to NEG and CMOV.
  if (VT.isInteger() && VT.getSizeInBits() == 8)
    return SDValue();

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDLoc DL(N);

  // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
  // and change it to SUB and CMOV.
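  // For i32 this is the classic branchless abs, y = sra(x, 31);
  // abs = xor(add(x, y), y), which we rewrite as (sub 0, x) plus a CMOV
  // that selects x when x >= 0 and 0-x otherwise.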
  if (VT.isInteger() && N->getOpcode() == ISD::XOR &&
      N0.getOpcode() == ISD::ADD &&
      N0.getOperand(1) == N1 &&
      N1.getOpcode() == ISD::SRA &&
      N1.getOperand(0) == N0.getOperand(0))
    if (ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1)))
      if (Y1C->getAPIntValue() == VT.getSizeInBits()-1) {
        // Generate SUB & CMOV.
        SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
                                  DAG.getConstant(0, VT), N0.getOperand(0));

        SDValue Ops[] = { N0.getOperand(0), Neg,
                          DAG.getConstant(X86::COND_GE, MVT::i8),
                          SDValue(Neg.getNode(), 1) };
        return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), Ops);
      }
  return SDValue();
}

// PerformXorCombine - Attempts to turn XOR nodes into BLSMSK nodes
static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const X86Subtarget *Subtarget) {
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  if (Subtarget->hasCMov()) {
    SDValue RV = performIntegerAbsCombine(N, DAG);
    if (RV.getNode())
      return RV;
  }

  return SDValue();
}

/// PerformLOADCombine - Do target-specific dag combines on LOAD nodes.
static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const X86Subtarget *Subtarget) {
  LoadSDNode *Ld = cast<LoadSDNode>(N);
  EVT RegVT = Ld->getValueType(0);
  EVT MemVT = Ld->getMemoryVT();
  SDLoc dl(Ld);
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  // For chips with slow 32-byte unaligned loads, break the 32-byte operation
  // into two 16-byte operations.
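  // For example, an unaligned v8f32 load on such a chip becomes two 16-byte
  // loads of the low and high halves, reassembled with 128-bit inserts.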
  ISD::LoadExtType Ext = Ld->getExtensionType();
  unsigned Alignment = Ld->getAlignment();
  bool IsAligned = Alignment == 0 || Alignment >= MemVT.getSizeInBits()/8;
  if (RegVT.is256BitVector() && Subtarget->isUnalignedMem32Slow() &&
      !DCI.isBeforeLegalizeOps() && !IsAligned && Ext == ISD::NON_EXTLOAD) {
    unsigned NumElems = RegVT.getVectorNumElements();
    if (NumElems < 2)
      return SDValue();

    SDValue Ptr = Ld->getBasePtr();
    SDValue Increment = DAG.getConstant(16, TLI.getPointerTy());

    EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
                                  NumElems/2);
    SDValue Load1 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
                                Ld->getPointerInfo(), Ld->isVolatile(),
                                Ld->isNonTemporal(), Ld->isInvariant(),
                                Alignment);
    Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
    SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
                                Ld->getPointerInfo(), Ld->isVolatile(),
                                Ld->isNonTemporal(), Ld->isInvariant(),
                                std::min(16U, Alignment));
    SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                             Load1.getValue(1),
                             Load2.getValue(1));

    SDValue NewVec = DAG.getUNDEF(RegVT);
    NewVec = Insert128BitVector(NewVec, Load1, 0, DAG, dl);
    NewVec = Insert128BitVector(NewVec, Load2, NumElems/2, DAG, dl);
    return DCI.CombineTo(N, NewVec, TF, true);
  }

  return SDValue();
}

/// PerformMLOADCombine - Resolve extending masked loads.
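/// For instance (illustrative), a v8i16->v8i32 masked sign-extending load is
/// rewritten as a v16i16 masked load whose low eight lanes carry the memory
/// values (the mask and pass-through value are shuffled to match), followed
/// by an X86ISD::VSEXT back to v8i32.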
static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const X86Subtarget *Subtarget) {
  MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N);
  if (Mld->getExtensionType() != ISD::SEXTLOAD)
    return SDValue();

  EVT VT = Mld->getValueType(0);
  unsigned NumElems = VT.getVectorNumElements();
  EVT LdVT = Mld->getMemoryVT();
  SDLoc dl(Mld);

  assert(LdVT != VT && "Cannot extend to the same type");
  unsigned ToSz = VT.getVectorElementType().getSizeInBits();
  unsigned FromSz = LdVT.getVectorElementType().getSizeInBits();
  // From, To sizes and ElemCount must be pow of two
  assert(isPowerOf2_32(NumElems * FromSz * ToSz) &&
         "Unexpected size for extending masked load");

  unsigned SizeRatio = ToSz / FromSz;
  assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits());

  // Create a type on which we perform the shuffle
  EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
                                   LdVT.getScalarType(), NumElems*SizeRatio);
  assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());

  // Convert Src0 value
  SDValue WideSrc0 = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mld->getSrc0());
  if (Mld->getSrc0().getOpcode() != ISD::UNDEF) {
    SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
    for (unsigned i = 0; i != NumElems; ++i)
      ShuffleVec[i] = i * SizeRatio;

    // Can't shuffle using an illegal type.
    assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
           "WideVecVT should be legal");
    WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0,
                                    DAG.getUNDEF(WideVecVT), &ShuffleVec[0]);
  }
  // Prepare the new mask
  SDValue NewMask;
  SDValue Mask = Mld->getMask();
  if (Mask.getValueType() == VT) {
    // Mask and original value have the same type
    NewMask = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mask);
    SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
    for (unsigned i = 0; i != NumElems; ++i)
      ShuffleVec[i] = i * SizeRatio;
    for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
      ShuffleVec[i] = NumElems*SizeRatio;
    NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
                                   DAG.getConstant(0, WideVecVT),
                                   &ShuffleVec[0]);
  }
  else {
    assert(Mask.getValueType().getVectorElementType() == MVT::i1);
    unsigned WidenNumElts = NumElems*SizeRatio;
    unsigned MaskNumElts = VT.getVectorNumElements();
    EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
                                     WidenNumElts);

    unsigned NumConcat = WidenNumElts / MaskNumElts;
    SmallVector<SDValue, 16> Ops(NumConcat);
    SDValue ZeroVal = DAG.getConstant(0, Mask.getValueType());
    Ops[0] = Mask;
    for (unsigned i = 1; i != NumConcat; ++i)
      Ops[i] = ZeroVal;

    NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
  }

  SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(),
                                     Mld->getBasePtr(), NewMask, WideSrc0,
                                     Mld->getMemoryVT(), Mld->getMemOperand(),
                                     ISD::NON_EXTLOAD);
  SDValue NewVec = DAG.getNode(X86ISD::VSEXT, dl, VT, WideLd);
  return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
}

/// PerformMSTORECombine - Resolve truncating masked stores.
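/// For instance (illustrative), a v8i32->v8i16 truncating masked store
/// bitcasts the value to v16i16, shuffles the eight truncated halves into the
/// low lanes, widens the mask the same way, and then emits a plain
/// (non-truncating) masked store.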
static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG,
                                    const X86Subtarget *Subtarget) {
  MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
  if (!Mst->isTruncatingStore())
    return SDValue();

  EVT VT = Mst->getValue().getValueType();
  unsigned NumElems = VT.getVectorNumElements();
  EVT StVT = Mst->getMemoryVT();
  SDLoc dl(Mst);

  assert(StVT != VT && "Cannot truncate to the same type");
  unsigned FromSz = VT.getVectorElementType().getSizeInBits();
  unsigned ToSz = StVT.getVectorElementType().getSizeInBits();

  // From, To sizes and ElemCount must be pow of two
  assert(isPowerOf2_32(NumElems * FromSz * ToSz) &&
         "Unexpected size for truncating masked store");
  // We are going to use the original vector elt for storing.
  // Accumulated smaller vector elements must be a multiple of the store size.
  assert(((NumElems * FromSz) % ToSz) == 0 &&
         "Unexpected ratio for truncating masked store");

  unsigned SizeRatio = FromSz / ToSz;
  assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());

  // Create a type on which we perform the shuffle
  EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
                                   StVT.getScalarType(), NumElems*SizeRatio);

  assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());

  SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mst->getValue());
  SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
  for (unsigned i = 0; i != NumElems; ++i)
    ShuffleVec[i] = i * SizeRatio;

  // Can't shuffle using an illegal type.
  assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
         "WideVecVT should be legal");

  SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
                                              DAG.getUNDEF(WideVecVT),
                                              &ShuffleVec[0]);

  SDValue NewMask;
  SDValue Mask = Mst->getMask();
  if (Mask.getValueType() == VT) {
    // Mask and original value have the same type
    NewMask = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mask);
    for (unsigned i = 0; i != NumElems; ++i)
      ShuffleVec[i] = i * SizeRatio;
    for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
      ShuffleVec[i] = NumElems*SizeRatio;
    NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
                                   DAG.getConstant(0, WideVecVT),
                                   &ShuffleVec[0]);
  }
  else {
    assert(Mask.getValueType().getVectorElementType() == MVT::i1);
    unsigned WidenNumElts = NumElems*SizeRatio;
    unsigned MaskNumElts = VT.getVectorNumElements();
    EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
                                     WidenNumElts);

    unsigned NumConcat = WidenNumElts / MaskNumElts;
    SmallVector<SDValue, 16> Ops(NumConcat);
    SDValue ZeroVal = DAG.getConstant(0, Mask.getValueType());
    Ops[0] = Mask;
    for (unsigned i = 1; i != NumConcat; ++i)
      Ops[i] = ZeroVal;

    NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
  }

  return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal, Mst->getBasePtr(),
                            NewMask, StVT, Mst->getMemOperand(), false);
}

/// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
                                   const X86Subtarget *Subtarget) {
  StoreSDNode *St = cast<StoreSDNode>(N);
  EVT VT = St->getValue().getValueType();
  EVT StVT = St->getMemoryVT();
  SDLoc dl(St);
  SDValue StoredVal = St->getOperand(1);
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  // If we are saving a concatenation of two XMM registers and 32-byte stores
  // are slow, such as on Sandy Bridge, perform two 16-byte stores.
  unsigned Alignment = St->getAlignment();
  bool IsAligned = Alignment == 0 || Alignment >= VT.getSizeInBits()/8;
  if (VT.is256BitVector() && Subtarget->isUnalignedMem32Slow() &&
      StVT == VT && !IsAligned) {
    unsigned NumElems = VT.getVectorNumElements();
    if (NumElems < 2)
      return SDValue();

    SDValue Value0 = Extract128BitVector(StoredVal, 0, DAG, dl);
    SDValue Value1 = Extract128BitVector(StoredVal, NumElems/2, DAG, dl);

    SDValue Stride = DAG.getConstant(16, TLI.getPointerTy());
    SDValue Ptr0 = St->getBasePtr();
    SDValue Ptr1 = DAG.getNode(ISD::ADD, dl, Ptr0.getValueType(), Ptr0, Stride);

    SDValue Ch0 = DAG.getStore(St->getChain(), dl, Value0, Ptr0,
                               St->getPointerInfo(), St->isVolatile(),
                               St->isNonTemporal(), Alignment);
    SDValue Ch1 = DAG.getStore(St->getChain(), dl, Value1, Ptr1,
                               St->getPointerInfo(), St->isVolatile(),
                               St->isNonTemporal(),
                               std::min(16U, Alignment));
    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
  }

  // Optimize trunc store (of multiple scalars) to shuffle and store.
  // First, pack all of the elements in one place. Next, store to memory
  // in fewer chunks.
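  // For example, a v8i32->v8i16 truncating store shuffles the eight truncated
  // halves into the low lanes of a v16i16 and then emits two 64-bit stores
  // (i64, or f64 on 32-bit targets).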
  if (St->isTruncatingStore() && VT.isVector()) {
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    unsigned NumElems = VT.getVectorNumElements();
    assert(StVT != VT && "Cannot truncate to the same type");
    unsigned FromSz = VT.getVectorElementType().getSizeInBits();
    unsigned ToSz = StVT.getVectorElementType().getSizeInBits();

    // From, To sizes and ElemCount must be pow of two
    if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
    // We are going to use the original vector elt for storing.
    // Accumulated smaller vector elements must be a multiple of the store size.
    if (0 != (NumElems * FromSz) % ToSz) return SDValue();

    unsigned SizeRatio = FromSz / ToSz;

    assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());

    // Create a type on which we perform the shuffle
    EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
                                     StVT.getScalarType(), NumElems*SizeRatio);

    assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());

    SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, St->getValue());
    SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
    for (unsigned i = 0; i != NumElems; ++i)
      ShuffleVec[i] = i * SizeRatio;

    // Can't shuffle using an illegal type.
    if (!TLI.isTypeLegal(WideVecVT))
      return SDValue();

    SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
                                         DAG.getUNDEF(WideVecVT),
                                         &ShuffleVec[0]);
    // At this point all of the data is stored at the bottom of the
    // register. We now need to save it to mem.

    // Find the largest store unit
    MVT StoreType = MVT::i8;
    for (MVT Tp : MVT::integer_valuetypes()) {
      if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
        StoreType = Tp;
    }

    // On 32-bit systems, we can't save 64-bit integers. Try bitcasting to f64.
    if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
        (64 <= NumElems * ToSz))
      StoreType = MVT::f64;

    // Bitcast the original vector into a vector of store-size units
    EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
                StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
    assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
    SDValue ShuffWide = DAG.getNode(ISD::BITCAST, dl, StoreVecVT, Shuff);
    SmallVector<SDValue, 8> Chains;
    SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8,
                                        TLI.getPointerTy());
    SDValue Ptr = St->getBasePtr();

    // Perform one or more big stores into memory.
    for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) {
      SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
                                   StoreType, ShuffWide,
                                   DAG.getIntPtrConstant(i));
      SDValue Ch = DAG.getStore(St->getChain(), dl, SubVec, Ptr,
                                St->getPointerInfo(), St->isVolatile(),
                                St->isNonTemporal(), St->getAlignment());
      Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
      Chains.push_back(Ch);
    }

    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
  }

  // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
  // the FP state in cases where an emms may be missing.
  // A preferable solution to the general problem is to figure out the right
  // places to insert EMMS. This qualifies as a quick hack.

  // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
  if (VT.getSizeInBits() != 64)
    return SDValue();

  const Function *F = DAG.getMachineFunction().getFunction();
  bool NoImplicitFloatOps = F->hasFnAttribute(Attribute::NoImplicitFloat);
  bool F64IsLegal = !DAG.getTarget().Options.UseSoftFloat &&
                    !NoImplicitFloatOps && Subtarget->hasSSE2();
  if ((VT.isVector() ||
       (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) &&
      isa<LoadSDNode>(St->getValue()) &&
      !cast<LoadSDNode>(St->getValue())->isVolatile() &&
      St->getChain().hasOneUse() && !St->isVolatile()) {
    SDNode* LdVal = St->getValue().getNode();
    LoadSDNode *Ld = nullptr;
    int TokenFactorIndex = -1;
    SmallVector<SDValue, 8> Ops;
    SDNode* ChainVal = St->getChain().getNode();
    // Must be a store of a load. We currently handle two cases: the load
    // is a direct child, and it's under an intervening TokenFactor. It is
    // possible to dig deeper under nested TokenFactors.
    if (ChainVal == LdVal)
      Ld = cast<LoadSDNode>(St->getChain());
    else if (St->getValue().hasOneUse() &&
             ChainVal->getOpcode() == ISD::TokenFactor) {
      for (unsigned i = 0, e = ChainVal->getNumOperands(); i != e; ++i) {
        if (ChainVal->getOperand(i).getNode() == LdVal) {
          TokenFactorIndex = i;
          Ld = cast<LoadSDNode>(St->getValue());
        } else
          Ops.push_back(ChainVal->getOperand(i));
      }
    }

    if (!Ld || !ISD::isNormalLoad(Ld))
      return SDValue();

    // If this is not the MMX case, i.e. we are just turning i64 load/store
    // into f64 load/store, avoid the transformation if there are multiple
    // uses of the loaded value.
    if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
      return SDValue();

    SDLoc LdDL(Ld);
    SDLoc StDL(N);
    // If we are a 64-bit capable x86, lower to a single movq load/store pair.
    // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
    // pair instead.
    if (Subtarget->is64Bit() || F64IsLegal) {
      EVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64;
      SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
                                  Ld->getPointerInfo(), Ld->isVolatile(),
                                  Ld->isNonTemporal(), Ld->isInvariant(),
                                  Ld->getAlignment());
      SDValue NewChain = NewLd.getValue(1);
      if (TokenFactorIndex != -1) {
        Ops.push_back(NewChain);
        NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
      }
      return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
                          St->getPointerInfo(),
                          St->isVolatile(), St->isNonTemporal(),
                          St->getAlignment());
    }

    // Otherwise, lower to two pairs of 32-bit loads / stores.
    SDValue LoAddr = Ld->getBasePtr();
    SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr,
                                 DAG.getConstant(4, MVT::i32));

    SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
                               Ld->getPointerInfo(),
                               Ld->isVolatile(), Ld->isNonTemporal(),
                               Ld->isInvariant(), Ld->getAlignment());
    SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
                               Ld->getPointerInfo().getWithOffset(4),
                               Ld->isVolatile(), Ld->isNonTemporal(),
                               Ld->isInvariant(),
                               MinAlign(Ld->getAlignment(), 4));

    SDValue NewChain = LoLd.getValue(1);
    if (TokenFactorIndex != -1) {
      Ops.push_back(LoLd);
      Ops.push_back(HiLd);
      NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
    }

    LoAddr = St->getBasePtr();
    HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr,
                         DAG.getConstant(4, MVT::i32));

    SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr,
                                St->getPointerInfo(),
                                St->isVolatile(), St->isNonTemporal(),
                                St->getAlignment());
    SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr,
                                St->getPointerInfo().getWithOffset(4),
                                St->isVolatile(),
                                St->isNonTemporal(),
                                MinAlign(St->getAlignment(), 4));
    return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
  }
  return SDValue();
}

/// Return 'true' if this vector operation is "horizontal"
/// and return the operands for the horizontal operation in LHS and RHS. A
/// horizontal operation performs the binary operation on successive elements
/// of its first operand, then on successive elements of its second operand,
/// returning the resulting values in a vector. For example, if
///   A = < float a0, float a1, float a2, float a3 >
/// and
///   B = < float b0, float b1, float b2, float b3 >
/// then the result of doing a horizontal operation on A and B is
///   A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
/// A horizontal-op B, for some already available A and B, and if so then LHS is
/// set to A, RHS to B, and the routine returns 'true'.
/// Note that the binary operation should have the property that if one of the
/// operands is UNDEF then the result is UNDEF.
static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
  // Look for the following pattern: if
  //   A = < float a0, float a1, float a2, float a3 >
  //   B = < float b0, float b1, float b2, float b3 >
  // and
  //   LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
  //   RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
  // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
  // which is A horizontal-op B.

  // At least one of the operands should be a vector shuffle.
  if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
      RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
    return false;

  MVT VT = LHS.getSimpleValueType();

  assert((VT.is128BitVector() || VT.is256BitVector()) &&
         "Unsupported vector type for horizontal add/sub");

  // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to
  // operate independently on 128-bit lanes.
  unsigned NumElts = VT.getVectorNumElements();
  unsigned NumLanes = VT.getSizeInBits()/128;
  unsigned NumLaneElts = NumElts / NumLanes;
  assert((NumLaneElts % 2 == 0) &&
         "Vector type should have an even number of elements in each lane");
  unsigned HalfLaneElts = NumLaneElts/2;

  // View LHS in the form
  //   LHS = VECTOR_SHUFFLE A, B, LMask
  // If LHS is not a shuffle then pretend it is the shuffle
  //   LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
  // NOTE: in what follows a default initialized SDValue represents an UNDEF of
  // type VT.
  SDValue A, B;
  SmallVector<int, 16> LMask(NumElts);
  if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
    if (LHS.getOperand(0).getOpcode() != ISD::UNDEF)
      A = LHS.getOperand(0);
    if (LHS.getOperand(1).getOpcode() != ISD::UNDEF)
      B = LHS.getOperand(1);
    ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
    std::copy(Mask.begin(), Mask.end(), LMask.begin());
  } else {
    if (LHS.getOpcode() != ISD::UNDEF)
      A = LHS;
    for (unsigned i = 0; i != NumElts; ++i)
      LMask[i] = i;
  }

  // Likewise, view RHS in the form
  //   RHS = VECTOR_SHUFFLE C, D, RMask
  SDValue C, D;
  SmallVector<int, 16> RMask(NumElts);
  if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
    if (RHS.getOperand(0).getOpcode() != ISD::UNDEF)
      C = RHS.getOperand(0);
    if (RHS.getOperand(1).getOpcode() != ISD::UNDEF)
      D = RHS.getOperand(1);
    ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
    std::copy(Mask.begin(), Mask.end(), RMask.begin());
  } else {
    if (RHS.getOpcode() != ISD::UNDEF)
      C = RHS;
    for (unsigned i = 0; i != NumElts; ++i)
      RMask[i] = i;
  }

  // Check that the shuffles are both shuffling the same vectors.
  if (!(A == C && B == D) && !(A == D && B == C))
    return false;

  // If everything is UNDEF then bail out: it would be better to fold to UNDEF.
  if (!A.getNode() && !B.getNode())
    return false;

  // If A and B occur in reverse order in RHS, then "swap" them (which means
  // rewriting the mask).
  if (A != C)
    CommuteVectorShuffleMask(RMask, NumElts);

  // At this point LHS and RHS are equivalent to
  //   LHS = VECTOR_SHUFFLE A, B, LMask
  //   RHS = VECTOR_SHUFFLE A, B, RMask
  // Check that the masks correspond to performing a horizontal operation.
  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
    for (unsigned i = 0; i != NumLaneElts; ++i) {
      int LIdx = LMask[i+l], RIdx = RMask[i+l];

      // Ignore any UNDEF components.
      if (LIdx < 0 || RIdx < 0 ||
          (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
          (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
        continue;

      // Check that successive elements are being operated on. If not, this is
      // not a horizontal operation.
      unsigned Src = (i/HalfLaneElts); // each lane is split between srcs
      int Index = 2*(i%HalfLaneElts) + NumElts*Src + l;
      if (!(LIdx == Index && RIdx == Index + 1) &&
          !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
        return false;
    }
  }

  LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
  RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
  return true;
}

/// Do target-specific dag combines on floating point adds.
static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG,
                                  const X86Subtarget *Subtarget) {
  EVT VT = N->getValueType(0);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  // Try to synthesize horizontal adds from adds of shuffles.
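  // e.g. (fadd (shuffle A, B, <0,2,4,6>), (shuffle A, B, <1,3,5,7>))
  //        --> (fhadd A, B)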
  if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
       (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
      isHorizontalBinOp(LHS, RHS, true))
    return DAG.getNode(X86ISD::FHADD, SDLoc(N), VT, LHS, RHS);

  return SDValue();
}

/// Do target-specific dag combines on floating point subs.
static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG,
                                  const X86Subtarget *Subtarget) {
  EVT VT = N->getValueType(0);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  // Try to synthesize horizontal subs from subs of shuffles.
  if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
       (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
      isHorizontalBinOp(LHS, RHS, false))
    return DAG.getNode(X86ISD::FHSUB, SDLoc(N), VT, LHS, RHS);

  return SDValue();
}

/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) {
  assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);

  // F[X]OR(0.0, x) -> x
  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
    if (C->getValueAPF().isPosZero())
      return N->getOperand(1);

  // F[X]OR(x, 0.0) -> x
  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
    if (C->getValueAPF().isPosZero())
      return N->getOperand(0);

  return SDValue();
}

/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) {
  assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);

  // Only perform optimizations if UnsafeMath is used.
  if (!DAG.getTarget().Options.UnsafeFPMath)
    return SDValue();

  // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
  // into FMINC and FMAXC, which are commutative operations.
  unsigned NewOp = 0;
  switch (N->getOpcode()) {
    default: llvm_unreachable("unknown opcode");
    case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
    case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
  }

  return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
                     N->getOperand(0), N->getOperand(1));
}

/// Do target-specific dag combines on X86ISD::FAND nodes.
static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
  // FAND(0.0, x) -> 0.0
  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
    if (C->getValueAPF().isPosZero())
      return N->getOperand(0);

  // FAND(x, 0.0) -> 0.0
  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
    if (C->getValueAPF().isPosZero())
      return N->getOperand(1);

  return SDValue();
}

/// Do target-specific dag combines on X86ISD::FANDN nodes.
static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG) {
  // FANDN(0.0, x) -> x
  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
    if (C->getValueAPF().isPosZero())
      return N->getOperand(1);

  // FANDN(x, 0.0) -> 0.0
  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
    if (C->getValueAPF().isPosZero())
      return N->getOperand(1);

  return SDValue();
}

static SDValue PerformBTCombine(SDNode *N,
                                SelectionDAG &DAG,
                                TargetLowering::DAGCombinerInfo &DCI) {
  // BT ignores high bits in the bit index operand.
  SDValue Op1 = N->getOperand(1);
  if (Op1.hasOneUse()) {
    unsigned BitWidth = Op1.getValueSizeInBits();
    APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
    APInt KnownZero, KnownOne;
    TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
                                          !DCI.isBeforeLegalizeOps());
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) ||
        TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO))
      DCI.CommitTargetLoweringOpt(TLO);
  }
  return SDValue();
}

static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) {
  SDValue Op = N->getOperand(0);
  if (Op.getOpcode() == ISD::BITCAST)
    Op = Op.getOperand(0);
  EVT VT = N->getValueType(0), OpVT = Op.getValueType();
  if (Op.getOpcode() == X86ISD::VZEXT_LOAD &&
      VT.getVectorElementType().getSizeInBits() ==
      OpVT.getVectorElementType().getSizeInBits()) {
    return DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
  }
  return SDValue();
}

static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG,
                                               const X86Subtarget *Subtarget) {
  EVT VT = N->getValueType(0);
  if (!VT.isVector())
    return SDValue();

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
  SDLoc dl(N);

  // SIGN_EXTEND_INREG to v4i64 is an expensive operation on both SSE and
  // AVX2, since there is no sign-extending shift-right operation on a
  // vector with 64-bit elements.
  // (sext_in_reg (v4i64 anyext (v4i32 x)), ExtraVT) ->
  //   (v4i64 sext (v4i32 sext_in_reg (v4i32 x, ExtraVT)))
  if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
      N0.getOpcode() == ISD::SIGN_EXTEND)) {
    SDValue N00 = N0.getOperand(0);

    // EXTLOAD has a better solution on AVX2,
    // it may be replaced with X86ISD::VSEXT node.
    if (N00.getOpcode() == ISD::LOAD && Subtarget->hasInt256())
      if (!ISD::isNormalLoad(N00.getNode()))
        return SDValue();

    if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
      SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
                                N00, N1);
      return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
    }
  }
  return SDValue();
}

static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const X86Subtarget *Subtarget) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);

  // (i8,i32 sext (sdivrem (i8 x, i8 y))) ->
  // (i8,i32 (sdivrem_sext_hreg (i8 x, i8 y)))
  // This exposes the sext to the sdivrem lowering, so that it directly extends
  // from AH (which we otherwise need to do contortions to access).
  if (N0.getOpcode() == ISD::SDIVREM && N0.getResNo() == 1 &&
      N0.getValueType() == MVT::i8 && VT == MVT::i32) {
    SDLoc dl(N);
    SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
    SDValue R = DAG.getNode(X86ISD::SDIVREM8_SEXT_HREG, dl, NodeTys,
                            N0.getOperand(0), N0.getOperand(1));
    DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
    return R.getValue(1);
  }

  if (!DCI.isBeforeLegalizeOps())
    return SDValue();

  if (!Subtarget->hasFp256())
    return SDValue();

  if (VT.isVector() && VT.getSizeInBits() == 256) {
    SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget);
    if (R.getNode())
      return R;
  }

  return SDValue();
}

static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG,
                                 const X86Subtarget* Subtarget) {
  SDLoc dl(N);
  EVT VT = N->getValueType(0);

  // Let legalize expand this if it isn't a legal type yet.
  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  EVT ScalarVT = VT.getScalarType();
  if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
      (!Subtarget->hasFMA() && !Subtarget->hasFMA4()))
    return SDValue();

  SDValue A = N->getOperand(0);
  SDValue B = N->getOperand(1);
  SDValue C = N->getOperand(2);

  bool NegA = (A.getOpcode() == ISD::FNEG);
  bool NegB = (B.getOpcode() == ISD::FNEG);
  bool NegC = (C.getOpcode() == ISD::FNEG);

  // The multiplication is negated when exactly one of NegA and NegB is set.
  bool NegMul = (NegA != NegB);
  if (NegA)
    A = A.getOperand(0);
  if (NegB)
    B = B.getOperand(0);
  if (NegC)
    C = C.getOperand(0);

  unsigned Opcode;
  if (!NegMul)
    Opcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB;
  else
    Opcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;

  return DAG.getNode(Opcode, dl, VT, A, B, C);
}

static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const X86Subtarget *Subtarget) {
  // (i32 zext (and (i8  x86isd::setcc_carry), 1)) ->
  //           (and (i32 x86isd::setcc_carry), 1)
  // This eliminates the zext. This transformation is necessary because
  // ISD::SETCC is always legalized to i8.
  SDLoc dl(N);
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);

  if (N0.getOpcode() == ISD::AND &&
      N0.hasOneUse() &&
      N0.getOperand(0).hasOneUse()) {
    SDValue N00 = N0.getOperand(0);
    if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
      ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
      if (!C || C->getZExtValue() != 1)
        return SDValue();
      return DAG.getNode(ISD::AND, dl, VT,
                         DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
                                     N00.getOperand(0), N00.getOperand(1)),
                         DAG.getConstant(1, VT));
    }
  }

  if (N0.getOpcode() == ISD::TRUNCATE &&
      N0.hasOneUse() &&
      N0.getOperand(0).hasOneUse()) {
    SDValue N00 = N0.getOperand(0);
    if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
      return DAG.getNode(ISD::AND, dl, VT,
                         DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
                                     N00.getOperand(0), N00.getOperand(1)),
                         DAG.getConstant(1, VT));
    }
  }
  if (VT.is256BitVector()) {
    SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget);
    if (R.getNode())
      return R;
  }

  // (i8,i32 zext (udivrem (i8 x, i8 y))) ->
  // (i8,i32 (udivrem_zext_hreg (i8 x, i8 y)))
  // This exposes the zext to the udivrem lowering, so that it directly extends
  // from AH (which we otherwise need to do contortions to access).
  if (N0.getOpcode() == ISD::UDIVREM &&
      N0.getResNo() == 1 && N0.getValueType() == MVT::i8 &&
      (VT == MVT::i32 || VT == MVT::i64)) {
    SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
    SDValue R = DAG.getNode(X86ISD::UDIVREM8_ZEXT_HREG, dl, NodeTys,
                            N0.getOperand(0), N0.getOperand(1));
    DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
    return R.getValue(1);
  }

  return SDValue();
}

// Optimize  x == -y --> x+y == 0
//           x != -y --> x+y != 0
static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG,
                                      const X86Subtarget* Subtarget) {
  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  EVT VT = N->getValueType(0);
  SDLoc DL(N);

  if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB)
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(LHS.getOperand(0)))
      if (C->getAPIntValue() == 0 && LHS.hasOneUse()) {
        SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N),
                                   LHS.getValueType(), RHS, LHS.getOperand(1));
        return DAG.getSetCC(SDLoc(N), N->getValueType(0),
                            addV, DAG.getConstant(0, addV.getValueType()), CC);
      }
  if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB)
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS.getOperand(0)))
      if (C->getAPIntValue() == 0 && RHS.hasOneUse()) {
        SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N),
                                   RHS.getValueType(), LHS, RHS.getOperand(1));
        return DAG.getSetCC(SDLoc(N), N->getValueType(0),
                            addV, DAG.getConstant(0, addV.getValueType()), CC);
      }

  if (VT.getScalarType() == MVT::i1) {
    bool IsSEXT0 = (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
      (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
    bool IsVZero0 = ISD::isBuildVectorAllZeros(LHS.getNode());
    if (!IsSEXT0 && !IsVZero0)
      return SDValue();
    bool IsSEXT1 = (RHS.getOpcode() == ISD::SIGN_EXTEND) &&
      (RHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
    bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());

    if (!IsSEXT1 && !IsVZero1)
      return SDValue();

    if (IsSEXT0 && IsVZero1) {
      assert(VT == LHS.getOperand(0).getValueType() &&
             "Unexpected operand type");
      if (CC == ISD::SETEQ)
        return DAG.getNOT(DL, LHS.getOperand(0), VT);
      return LHS.getOperand(0);
    }
    if (IsSEXT1 && IsVZero0) {
      assert(VT == RHS.getOperand(0).getValueType() &&
             "Unexpected operand type");
      if (CC == ISD::SETEQ)
        return DAG.getNOT(DL, RHS.getOperand(0), VT);
      return RHS.getOperand(0);
    }
  }

  return SDValue();
}

static SDValue PerformINSERTPSCombine(SDNode *N, SelectionDAG &DAG,
                                      const X86Subtarget *Subtarget) {
  SDLoc dl(N);
  MVT VT = N->getOperand(1)->getSimpleValueType(0);
  assert((VT == MVT::v4f32 || VT == MVT::v4i32) &&
         "X86insertps is only defined for v4x32");

  SDValue Ld = N->getOperand(1);
  if (MayFoldLoad(Ld)) {
    // Extract the countS bits from the immediate so we can get the proper
    // address when narrowing the vector load to a specific element.
    // When the second source op is a memory address, insertps doesn't use
    // countS and just gets an f32 from that address.
    unsigned DestIndex =
        cast<ConstantSDNode>(N->getOperand(2))->getZExtValue() >> 6;
    Ld = NarrowVectorLoadToElement(cast<LoadSDNode>(Ld), DestIndex, DAG);
  } else
    return SDValue();

  // Create this as a scalar to vector to match the instruction pattern.
  SDValue LoadScalarToVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Ld);
  // countS bits are ignored when loading from memory on insertps, which
  // means we don't need to explicitly set them to 0.
  return DAG.getNode(X86ISD::INSERTPS, dl, VT, N->getOperand(0),
                     LoadScalarToVector, N->getOperand(2));
}

// Helper function of PerformSETCCCombine. It materializes "setb reg"
// as "sbb reg,reg", since it can be extended without zext and produces
// an all-ones bit which is more useful than 0/1 in some cases.
static SDValue MaterializeSETB(SDLoc DL, SDValue EFLAGS, SelectionDAG &DAG,
                               MVT VT) {
  if (VT == MVT::i8)
    return DAG.getNode(ISD::AND, DL, VT,
                       DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
                                   DAG.getConstant(X86::COND_B, MVT::i8), EFLAGS),
                       DAG.getConstant(1, VT));
  assert (VT == MVT::i1 && "Unexpected type for SETCC node");
  return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1,
                     DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
                                 DAG.getConstant(X86::COND_B, MVT::i8), EFLAGS));
}

// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const X86Subtarget *Subtarget) {
  SDLoc DL(N);
  X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
  SDValue EFLAGS = N->getOperand(1);

  if (CC == X86::COND_A) {
    // Try to convert COND_A into COND_B in an attempt to facilitate
    // materializing "setb reg".
    //
    // Do not flip "e > c", where "c" is a constant, because Cmp instruction
    // cannot take an immediate as its first operand.
    //
    if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
        EFLAGS.getValueType().isInteger() &&
        !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
      SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
                                   EFLAGS.getNode()->getVTList(),
                                   EFLAGS.getOperand(1), EFLAGS.getOperand(0));
      SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
      return MaterializeSETB(DL, NewEFLAGS, DAG, N->getSimpleValueType(0));
    }
  }

  // Materialize "setb reg" as "sbb reg,reg", since it can be extended without
  // a zext and produces an all-ones bit which is more useful than 0/1 in some
  // cases.
  if (CC == X86::COND_B)
    return MaterializeSETB(DL, EFLAGS, DAG, N->getSimpleValueType(0));

  SDValue Flags;

  Flags = checkBoolTestSetCCCombine(EFLAGS, CC);
  if (Flags.getNode()) {
    SDValue Cond = DAG.getConstant(CC, MVT::i8);
    return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags);
  }

  return SDValue();
}

// Optimize branch condition evaluation.
//
static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG,
                                    TargetLowering::DAGCombinerInfo &DCI,
                                    const X86Subtarget *Subtarget) {
  SDLoc DL(N);
  SDValue Chain = N->getOperand(0);
  SDValue Dest = N->getOperand(1);
  SDValue EFLAGS = N->getOperand(3);
  X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));

  SDValue Flags;

  Flags = checkBoolTestSetCCCombine(EFLAGS, CC);
  if (Flags.getNode()) {
    SDValue Cond = DAG.getConstant(CC, MVT::i8);
    return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), Chain, Dest, Cond,
                       Flags);
  }

  return SDValue();
}

static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
                                                         SelectionDAG &DAG) {
  // Take advantage of vector comparisons producing 0 or -1 in each lane to
  // optimize away operation when it's from a constant.
  //
  // The general transformation is:
  //    UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
  //       AND(VECTOR_CMP(x,y), constant2)
  //    constant2 = UNARYOP(constant)
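  //
  // For example, (sint_to_fp (and (setcc x, y), <1,1,1,1>)) becomes an AND
  // of the comparison with the bit pattern of <1.0,1.0,1.0,1.0>: each lane
  // of the compare is all-zeros or all-ones, so ANDing with the
  // pre-converted constant gives the same result as converting after the
  // AND.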

  // Early exit if this isn't a vector operation, the operand of the
  // unary operation isn't a bitwise AND, or if the sizes of the operations
  // aren't the same.
  EVT VT = N->getValueType(0);
  if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
      N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
      VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
    return SDValue();

  // Now check that the other operand of the AND is a constant. We could
  // make the transformation for non-constant splats as well, but it's unclear
  // that would be a benefit as it would not eliminate any operations, just
  // perform one more step in scalar code before moving to the vector unit.
  if (BuildVectorSDNode *BV =
          dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
    // Bail out if the vector isn't a constant.
    if (!BV->isConstant())
      return SDValue();

    // Everything checks out. Build up the new and improved node.
    SDLoc DL(N);
    EVT IntVT = BV->getValueType(0);
    // Create a new constant of the appropriate type for the transformed
    // DAG.
    SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
    // The AND node needs bitcasts to/from an integer vector type around it.
    SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
    SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
                                 N->getOperand(0)->getOperand(0), MaskConst);
    SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
    return Res;
  }

  return SDValue();
}

static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
                                        const X86Subtarget *Subtarget) {
  // First try to optimize away the conversion entirely when it's
  // conditionally from a constant. Vectors only.
  SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG);
  if (Res != SDValue())
    return Res;

  // Now move on to more general possibilities.
  SDValue Op0 = N->getOperand(0);
  EVT InVT = Op0->getValueType(0);

  // SINT_TO_FP(v4i8) -> SINT_TO_FP(SEXT(v4i8 to v4i32))
  if (InVT == MVT::v8i8 || InVT == MVT::v4i8) {
    SDLoc dl(N);
    MVT DstVT = InVT == MVT::v4i8 ? MVT::v4i32 : MVT::v8i32;
    SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
    return DAG.getNode(ISD::SINT_TO_FP, dl, N->getValueType(0), P);
  }

  // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
  // a 32-bit target where SSE doesn't support i64->FP operations.
  if (Op0.getOpcode() == ISD::LOAD) {
    LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
    EVT VT = Ld->getValueType(0);
    if (!Ld->isVolatile() && !N->getValueType(0).isVector() &&
        ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
        !Subtarget->is64Bit() && VT == MVT::i64) {
      SDValue FILDChain = Subtarget->getTargetLowering()->BuildFILD(
          SDValue(N, 0), Ld->getValueType(0), Ld->getChain(), Op0, DAG);
      DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
      return FILDChain;
    }
  }
  return SDValue();
}

// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG,
                                 X86TargetLowering::DAGCombinerInfo &DCI) {
  // If the LHS and RHS of the ADC node are zero, then it can't overflow and
  // the result is either zero or one (depending on the input carry bit).
  // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
  if (X86::isZeroNode(N->getOperand(0)) &&
      X86::isZeroNode(N->getOperand(1)) &&
      // We don't have a good way to replace an EFLAGS use, so only do this
      // when it's dead right now.
      SDValue(N, 1).use_empty()) {
    SDLoc DL(N);
    EVT VT = N->getValueType(0);
    SDValue CarryOut = DAG.getConstant(0, N->getValueType(1));
    SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
                               DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
                                           DAG.getConstant(X86::COND_B, MVT::i8),
                                           N->getOperand(2)),
                               DAG.getConstant(1, VT));
    return DCI.CombineTo(N, Res1, CarryOut);
  }

  return SDValue();
}

// fold (add Y, (sete  X, 0)) -> adc  0, Y
//      (add Y, (setne X, 0)) -> sbb -1, Y
//      (sub (sete  X, 0), Y) -> sbb  0, Y
//      (sub (setne X, 0), Y) -> adc -1, Y
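// This works because (cmp X, 1) sets the carry flag exactly when X == 0
// (unsigned X < 1), so the setcc value folds into the add/sub as an
// ADC/SBB with a 0 or -1 immediate.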
static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) {
  SDLoc DL(N);

  // Look through ZExts.
  SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 1 : 0);
  if (Ext.getOpcode() != ISD::ZERO_EXTEND || !Ext.hasOneUse())
    return SDValue();

  SDValue SetCC = Ext.getOperand(0);
  if (SetCC.getOpcode() != X86ISD::SETCC || !SetCC.hasOneUse())
    return SDValue();

  X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0);
  if (CC != X86::COND_E && CC != X86::COND_NE)
    return SDValue();

  SDValue Cmp = SetCC.getOperand(1);
  if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
      !X86::isZeroNode(Cmp.getOperand(1)) ||
      !Cmp.getOperand(0).getValueType().isInteger())
    return SDValue();

  SDValue CmpOp0 = Cmp.getOperand(0);
  SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0,
                               DAG.getConstant(1, CmpOp0.getValueType()));

  SDValue OtherVal = N->getOperand(N->getOpcode() == ISD::SUB ? 0 : 1);
  if (CC == X86::COND_NE)
    return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::ADC : X86ISD::SBB,
                       DL, OtherVal.getValueType(), OtherVal,
                       DAG.getConstant(-1ULL, OtherVal.getValueType()), NewCmp);
  return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::SBB : X86ISD::ADC,
                     DL, OtherVal.getValueType(), OtherVal,
                     DAG.getConstant(0, OtherVal.getValueType()), NewCmp);
}

/// PerformADDCombine - Do target-specific dag combines on integer adds.
static SDValue PerformAddCombine(SDNode *N, SelectionDAG &DAG,
                                 const X86Subtarget *Subtarget) {
  EVT VT = N->getValueType(0);
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);

  // Try to synthesize horizontal adds from adds of shuffles.
  if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
       (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
      isHorizontalBinOp(Op0, Op1, true))
    return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);

  return OptimizeConditionalInDecrement(N, DAG);
}

static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG,
                                 const X86Subtarget *Subtarget) {
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);

  // X86 can't encode an immediate LHS of a sub. See if we can push the
  // negation into a preceding instruction.
  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
    // If the RHS of the sub is a XOR with one use and a constant, invert the
    // immediate. Then add one to the LHS of the sub so we can turn
    // X-Y -> X+~Y+1, saving one register.
    if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
        isa<ConstantSDNode>(Op1.getOperand(1))) {
      APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
      EVT VT = Op0.getValueType();
      SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
                                   Op1.getOperand(0),
                                   DAG.getConstant(~XorC, VT));
      return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
                         DAG.getConstant(C->getAPIntValue()+1, VT));
    }
  }

  // Try to synthesize horizontal subs from subs of shuffles.
  EVT VT = N->getValueType(0);
  if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
       (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
      isHorizontalBinOp(Op0, Op1, true))
    return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);

  return OptimizeConditionalInDecrement(N, DAG);
}

/// performVZEXTCombine - Performs VZEXT combines.
static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const X86Subtarget *Subtarget) {
  SDLoc DL(N);
  MVT VT = N->getSimpleValueType(0);
  SDValue Op = N->getOperand(0);
  MVT OpVT = Op.getSimpleValueType();
  MVT OpEltVT = OpVT.getVectorElementType();
  unsigned InputBits = OpEltVT.getSizeInBits() * VT.getVectorNumElements();

  // (vzext (bitcast (vzext (x)))) -> (vzext x)
  SDValue V = Op;
  while (V.getOpcode() == ISD::BITCAST)
    V = V.getOperand(0);

  if (V != Op && V.getOpcode() == X86ISD::VZEXT) {
    MVT InnerVT = V.getSimpleValueType();
    MVT InnerEltVT = InnerVT.getVectorElementType();

    // If the element sizes match exactly, we can just do one larger vzext.
    // This is always an exact type match as vzext operates on integer types.
    if (OpEltVT == InnerEltVT) {
      assert(OpVT == InnerVT && "Types must match for vzext!");
      return DAG.getNode(X86ISD::VZEXT, DL, VT, V.getOperand(0));
    }

    // The only other way we can combine them is if only a single element of
    // the inner vzext is used in the input to the outer vzext.
    if (InnerEltVT.getSizeInBits() < InputBits)
      return SDValue();

    // In this case, the inner vzext is completely dead because we're going to
    // only look at bits inside of the low element. Just do the outer vzext on
    // a bitcast of the input to the inner.
    return DAG.getNode(X86ISD::VZEXT, DL, VT,
                       DAG.getNode(ISD::BITCAST, DL, OpVT, V));
  }

  // Check if we can bypass extracting and re-inserting an element of an input
  // vector. Essentially:
  // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
  if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
      V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
      V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) {
    SDValue ExtractedV = V.getOperand(0);
    SDValue OrigV = ExtractedV.getOperand(0);
    if (auto *ExtractIdx = dyn_cast<ConstantSDNode>(ExtractedV.getOperand(1)))
      if (ExtractIdx->getZExtValue() == 0) {
        MVT OrigVT = OrigV.getSimpleValueType();
        // Extract a subvector if necessary...
        if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) {
          int Ratio = OrigVT.getSizeInBits() / OpVT.getSizeInBits();
          OrigVT = MVT::getVectorVT(OrigVT.getVectorElementType(),
                                    OrigVT.getVectorNumElements() / Ratio);
          OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV,
                              DAG.getIntPtrConstant(0));
        }
        Op = DAG.getNode(ISD::BITCAST, DL, OpVT, OrigV);
        return DAG.getNode(X86ISD::VZEXT, DL, VT, Op);
      }
  }

  return SDValue();
}

SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  switch (N->getOpcode()) {
  default: break;
  case ISD::EXTRACT_VECTOR_ELT:
    return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, DCI);
  case ISD::VSELECT:
  case ISD::SELECT:
  case X86ISD::SHRUNKBLEND:
    return PerformSELECTCombine(N, DAG, DCI, Subtarget);
  case ISD::BITCAST:        return PerformBITCASTCombine(N, DAG);
  case X86ISD::CMOV:        return PerformCMOVCombine(N, DAG, DCI, Subtarget);
  case ISD::ADD:            return PerformAddCombine(N, DAG, Subtarget);
  case ISD::SUB:            return PerformSubCombine(N, DAG, Subtarget);
  case X86ISD::ADC:         return PerformADCCombine(N, DAG, DCI);
  case ISD::MUL:            return PerformMulCombine(N, DAG, DCI);
  case ISD::SHL:
  case ISD::SRA:
  case ISD::SRL:            return PerformShiftCombine(N, DAG, DCI, Subtarget);
  case ISD::AND:            return PerformAndCombine(N, DAG, DCI, Subtarget);
  case ISD::OR:             return PerformOrCombine(N, DAG, DCI, Subtarget);
  case ISD::XOR:            return PerformXorCombine(N, DAG, DCI, Subtarget);
  case ISD::LOAD:           return PerformLOADCombine(N, DAG, DCI, Subtarget);
  case ISD::MLOAD:          return PerformMLOADCombine(N, DAG, DCI, Subtarget);
  case ISD::STORE:          return PerformSTORECombine(N, DAG, Subtarget);
  case ISD::MSTORE:         return PerformMSTORECombine(N, DAG, Subtarget);
  case ISD::SINT_TO_FP:     return PerformSINT_TO_FPCombine(N, DAG, Subtarget);
  case ISD::FADD:           return PerformFADDCombine(N, DAG, Subtarget);
  case ISD::FSUB:           return PerformFSUBCombine(N, DAG, Subtarget);
  case X86ISD::FXOR:
  case X86ISD::FOR:         return PerformFORCombine(N, DAG);
  case X86ISD::FMIN:
  case X86ISD::FMAX:        return PerformFMinFMaxCombine(N, DAG);
  case X86ISD::FAND:        return PerformFANDCombine(N, DAG);
  case X86ISD::FANDN:       return PerformFANDNCombine(N, DAG);
  case X86ISD::BT:          return PerformBTCombine(N, DAG, DCI);
  case X86ISD::VZEXT_MOVL:  return PerformVZEXT_MOVLCombine(N, DAG);
  case ISD::ANY_EXTEND:
  case ISD::ZERO_EXTEND:    return PerformZExtCombine(N, DAG, DCI, Subtarget);
  case ISD::SIGN_EXTEND:    return PerformSExtCombine(N, DAG, DCI, Subtarget);
  case ISD::SIGN_EXTEND_INREG:
    return PerformSIGN_EXTEND_INREGCombine(N, DAG, Subtarget);
  case ISD::TRUNCATE:       return PerformTruncateCombine(N, DAG, DCI, Subtarget);
  case ISD::SETCC:          return PerformISDSETCCCombine(N, DAG, Subtarget);
  case X86ISD::SETCC:       return PerformSETCCCombine(N, DAG, DCI, Subtarget);
  case X86ISD::BRCOND:      return PerformBrCondCombine(N, DAG, DCI, Subtarget);
  case X86ISD::VZEXT:       return performVZEXTCombine(N, DAG, DCI, Subtarget);
  case X86ISD::SHUFP:       // Handle all target specific shuffles
  case X86ISD::PALIGNR:
  case X86ISD::UNPCKH:
  case X86ISD::UNPCKL:
  case X86ISD::MOVHLPS:
  case X86ISD::MOVLHPS:
  case X86ISD::PSHUFB:
  case X86ISD::PSHUFD:
  case X86ISD::PSHUFHW:
  case X86ISD::PSHUFLW:
  case X86ISD::MOVSS:
  case X86ISD::MOVSD:
  case X86ISD::VPERMILPI:
  case X86ISD::VPERM2X128:
  case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI, Subtarget);
  case ISD::FMA:            return PerformFMACombine(N, DAG, Subtarget);
  case ISD::INTRINSIC_WO_CHAIN:
    return PerformINTRINSIC_WO_CHAINCombine(N, DAG, Subtarget);
  case X86ISD::INSERTPS: {
    if (getTargetMachine().getOptLevel() > CodeGenOpt::None)
      return PerformINSERTPSCombine(N, DAG, Subtarget);
    break;
  }
  case ISD::BUILD_VECTOR:   return PerformBUILD_VECTORCombine(N, DAG, Subtarget);
  }

  return SDValue();
}

/// isTypeDesirableForOp - Return true if the target has native support for
/// the specified value type and it is 'desirable' to use the type for the
/// given node type. e.g. On x86 i16 is legal, but undesirable since i16
/// instruction encodings are longer and some i16 instructions are slow.
bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
  if (!isTypeLegal(VT))
    return false;
  if (VT != MVT::i16)
    return true;

  switch (Opc) {
  default:
    return true;
  case ISD::LOAD:
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
  case ISD::ANY_EXTEND:
  case ISD::SHL: case ISD::SRL:
  case ISD::SUB: case ISD::ADD:
  case ISD::MUL:
  case ISD::AND: case ISD::OR: case ISD::XOR:
    return false;
  }
}
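
// Illustrative encoding comparison: "addw $1, %ax" encodes as 66 83 C0 01
// (an extra 0x66 operand-size prefix) while "addl $1, %eax" is just
// 83 C0 01, which is why i16 is rejected above for common ALU opcodes even
// though the type is legal.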

/// IsDesirableToPromoteOp - This method queries the target whether it is
/// beneficial for dag combiner to promote the specified node. If true, it
/// should return the desired promotion type by reference.
bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
  EVT VT = Op.getValueType();
  if (VT != MVT::i16)
    return false;

  bool Promote = false;
  bool Commute = false;
  switch (Op.getOpcode()) {
  default: break;
  case ISD::LOAD: {
    LoadSDNode *LD = cast<LoadSDNode>(Op);
    // If the non-extending load has a single use and it's not live out, then it
    // might be folded.
    if (LD->getExtensionType() == ISD::NON_EXTLOAD /*&&
                                                     Op.hasOneUse()*/) {
      for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
             UE = Op.getNode()->use_end(); UI != UE; ++UI) {
        // The only case where we'd want to promote LOAD (rather than having it
        // promoted as an operand) is when its only use is a liveout.
        if (UI->getOpcode() != ISD::CopyToReg)
          return false;
      }
    }
    Promote = true;
    break;
  }
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
  case ISD::ANY_EXTEND:
    Promote = true;
    break;
  case ISD::SHL:
  case ISD::SRL: {
    SDValue N0 = Op.getOperand(0);
    // Look out for (store (shl (load), x)).
    if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
      return false;
    Promote = true;
    break;
  }
  case ISD::ADD:
  case ISD::MUL:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    Commute = true;
    // fallthrough
  case ISD::SUB: {
    SDValue N0 = Op.getOperand(0);
    SDValue N1 = Op.getOperand(1);
    if (!Commute && MayFoldLoad(N1))
      return false;
    // Avoid disabling potential load folding opportunities.
    if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
      return false;
    if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
      return false;
    Promote = true;
  }
  }

  PVT = MVT::i32;
  return Promote;
}
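
// Illustrative result (the exact shape depends on the DAG combiner): an
// (i16 (add %a, %b)) whose operands are not foldable loads is rewritten as
//   (trunc (i32 (add (any_extend %a), (any_extend %b))))
// with PVT = MVT::i32, trading an essentially free truncate for the shorter,
// faster 32-bit encodings.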

//===----------------------------------------------------------------------===//
//                           X86 Inline Assembly Support
//===----------------------------------------------------------------------===//

namespace {
// Helper to match a string separated by whitespace.
bool matchAsmImpl(StringRef s, ArrayRef<const StringRef *> args) {
  s = s.substr(s.find_first_not_of(" \t")); // Skip leading whitespace.

  for (unsigned i = 0, e = args.size(); i != e; ++i) {
    StringRef piece(*args[i]);
    if (!s.startswith(piece)) // Check if the piece matches.
      return false;

    s = s.substr(piece.size());
    StringRef::size_type pos = s.find_first_not_of(" \t");
    if (pos == 0) // We matched a prefix.
      return false;

    s = s.substr(pos);
  }

  return s.empty();
}
const VariadicFunction1<bool, StringRef, StringRef, matchAsmImpl> matchAsm = {};
} // end anonymous namespace
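
// Usage note: matchAsm(Piece, "bswap", "$0") accepts "  bswap   $0" (any
// amount of leading or intervening whitespace) but rejects "bswapx $0",
// since each matched piece must end exactly at a whitespace boundary.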

static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {

  if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
    if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
        std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
        std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {

      if (AsmPieces.size() == 3)
        return true;
      else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
        return true;
    }
  }
  return false;
}

bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
  InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());

  std::string AsmStr = IA->getAsmString();

  IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
  if (!Ty || Ty->getBitWidth() % 16 != 0)
    return false;

  // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
  SmallVector<StringRef, 4> AsmPieces;
  SplitString(AsmStr, AsmPieces, ";\n");

  switch (AsmPieces.size()) {
  default: return false;
  case 1:
    // FIXME: this should verify that we are targeting a 486 or better. If not,
    // we will turn this bswap into something that will be lowered to logical
    // ops instead of emitting the bswap asm. For now, we don't support 486 or
    // lower so don't worry about this.
    // bswap $0
    if (matchAsm(AsmPieces[0], "bswap", "$0") ||
        matchAsm(AsmPieces[0], "bswapl", "$0") ||
        matchAsm(AsmPieces[0], "bswapq", "$0") ||
        matchAsm(AsmPieces[0], "bswap", "${0:q}") ||
        matchAsm(AsmPieces[0], "bswapl", "${0:q}") ||
        matchAsm(AsmPieces[0], "bswapq", "${0:q}")) {
      // No need to check constraints, nothing other than the equivalent of
      // "=r,0" would be valid here.
      return IntrinsicLowering::LowerToByteSwap(CI);
    }

    // rorw $$8, ${0:w}  -->  llvm.bswap.i16
    if (CI->getType()->isIntegerTy(16) &&
        IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
        (matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") ||
         matchAsm(AsmPieces[0], "rolw", "$$8,", "${0:w}"))) {
      AsmPieces.clear();
      const std::string &ConstraintsStr = IA->getConstraintString();
      SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
      array_pod_sort(AsmPieces.begin(), AsmPieces.end());
      if (clobbersFlagRegisters(AsmPieces))
        return IntrinsicLowering::LowerToByteSwap(CI);
    }
    break;
  case 3:
    if (CI->getType()->isIntegerTy(32) &&
        IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
        matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") &&
        matchAsm(AsmPieces[1], "rorl", "$$16,", "$0") &&
        matchAsm(AsmPieces[2], "rorw", "$$8,", "${0:w}")) {
      AsmPieces.clear();
      const std::string &ConstraintsStr = IA->getConstraintString();
      SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
      array_pod_sort(AsmPieces.begin(), AsmPieces.end());
      if (clobbersFlagRegisters(AsmPieces))
        return IntrinsicLowering::LowerToByteSwap(CI);
    }

    if (CI->getType()->isIntegerTy(64)) {
      InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
      if (Constraints.size() >= 2 &&
          Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
          Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
        // bswap %eax / bswap %edx / xchgl %eax, %edx  -> llvm.bswap.i64
        if (matchAsm(AsmPieces[0], "bswap", "%eax") &&
            matchAsm(AsmPieces[1], "bswap", "%edx") &&
            matchAsm(AsmPieces[2], "xchgl", "%eax,", "%edx"))
          return IntrinsicLowering::LowerToByteSwap(CI);
      }
    }
    break;
  }
  return false;
}
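
// Illustrative IR that the case above recognizes (32-bit style i64 swap):
//   %r = call i64 asm "bswap %eax\0Abswap %edx\0Axchgl %eax, %edx",
//                     "=A,0"(i64 %x)
// It is rewritten into a call to @llvm.bswap.i64 so the backend can pick the
// best lowering itself (e.g. a single bswap on x86-64).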

/// getConstraintType - Given a constraint letter, return the type of
/// constraint it is for this target.
X86TargetLowering::ConstraintType
X86TargetLowering::getConstraintType(const std::string &Constraint) const {
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    case 'R': case 'q': case 'Q': case 'f': case 't':
    case 'u': case 'y': case 'x': case 'Y': case 'l':
      return C_RegisterClass;
    case 'a': case 'b': case 'c': case 'd': case 'S': case 'D': case 'A':
      return C_Register;
    case 'I': case 'J': case 'K': case 'L': case 'M':
    case 'N': case 'G': case 'C': case 'e': case 'Z':
      return C_Other;
    default:
      break;
    }
  }
  return TargetLowering::getConstraintType(Constraint);
}

/// Examine constraint type and operand type and determine a weight value.
/// This object must already have been set up with the operand type
/// and the current alternative constraint selected.
TargetLowering::ConstraintWeight
  X86TargetLowering::getSingleConstraintMatchWeight(
    AsmOperandInfo &info, const char *constraint) const {
  ConstraintWeight weight = CW_Invalid;
  Value *CallOperandVal = info.CallOperandVal;
  // If we don't have a value, we can't do a match,
  // but allow it at the lowest weight.
  if (!CallOperandVal)
    return CW_Default;
  Type *type = CallOperandVal->getType();
  // Look at the constraint type.
  switch (*constraint) {
  default:
    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
    break;
  case 'R': case 'q': case 'Q': case 'a': case 'b':
  case 'c': case 'd': case 'S': case 'D': case 'A':
    if (CallOperandVal->getType()->isIntegerTy())
      weight = CW_SpecificReg;
    break;
  case 'f': case 't': case 'u':
    if (type->isFloatingPointTy())
      weight = CW_SpecificReg;
    break;
  case 'y':
    if (type->isX86_MMXTy() && Subtarget->hasMMX())
      weight = CW_SpecificReg;
    break;
  case 'x': case 'Y':
    if (((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasSSE1()) ||
        ((type->getPrimitiveSizeInBits() == 256) && Subtarget->hasFp256()))
      weight = CW_Register;
    break;
  case 'I':
    if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
      if (C->getZExtValue() <= 31)
        weight = CW_Constant;
    }
    break;
  case 'J':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if (C->getZExtValue() <= 63)
        weight = CW_Constant;
    }
    break;
  case 'K':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
        weight = CW_Constant;
    }
    break;
  case 'L':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
        weight = CW_Constant;
    }
    break;
  case 'M':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if (C->getZExtValue() <= 3)
        weight = CW_Constant;
    }
    break;
  case 'N':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if (C->getZExtValue() <= 0xff)
        weight = CW_Constant;
    }
    break;
  case 'G': case 'C':
    if (isa<ConstantFP>(CallOperandVal)) {
      weight = CW_Constant;
    }
    break;
  case 'e':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if ((C->getSExtValue() >= -0x80000000LL) &&
          (C->getSExtValue() <= 0x7fffffffLL))
        weight = CW_Constant;
    }
    break;
  case 'Z':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if (C->getZExtValue() <= 0xffffffff)
        weight = CW_Constant;
    }
    break;
  }
  return weight;
}

/// LowerXConstraint - try to replace an X constraint, which matches anything,
/// with another that has more specific requirements based on the type of the
/// corresponding operand.
const char *X86TargetLowering::
LowerXConstraint(EVT ConstraintVT) const {
  // FP X constraints get lowered to SSE1/2 registers if available, otherwise
  // 'f' like normal targets.
  if (ConstraintVT.isFloatingPoint()) {
    if (Subtarget->hasSSE2())
      return "Y";
    if (Subtarget->hasSSE1())
      return "x";
  }

  return TargetLowering::LowerXConstraint(ConstraintVT);
}

/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
/// vector. If it is invalid, don't add anything to Ops.
void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
                                                     std::string &Constraint,
                                                     std::vector<SDValue>&Ops,
                                                     SelectionDAG &DAG) const {
  SDValue Result;

  // Only support length 1 constraints for now.
  if (Constraint.length() > 1) return;

  char ConstraintLetter = Constraint[0];
  switch (ConstraintLetter) {
  default: break;
  case 'I':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 31) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    return;
  case 'J':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 63) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    return;
  case 'K':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (isInt<8>(C->getSExtValue())) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    return;
  case 'L':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
          (Subtarget->is64Bit() && C->getZExtValue() == 0xffffffff)) {
        Result = DAG.getTargetConstant(C->getSExtValue(), Op.getValueType());
        break;
      }
    }
    return;
  case 'M':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 3) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    return;
  case 'N':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 255) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    return;
  case 'O':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 127) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    return;
  case 'e': {
    // 32-bit signed value
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
                                           C->getSExtValue())) {
        // Widen to 64 bits here to get it sign extended.
        Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64);
        break;
      }
    // FIXME gcc accepts some relocatable values here too, but only in certain
    // memory models; it's complicated.
    }
    return;
  }
  case 'Z': {
    // 32-bit unsigned value
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
                                           C->getZExtValue())) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    // FIXME gcc accepts some relocatable values here too, but only in certain
    // memory models; it's complicated.
    return;
  }
  case 'i': {
    // Literal immediates are always ok.
    if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
      // Widen to 64 bits here to get it sign extended.
      Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64);
      break;
    }

    // In any sort of PIC mode addresses need to be computed at runtime by
    // adding in a register or some sort of table lookup. These can't
    // be used as immediates.
    if (Subtarget->isPICStyleGOT() || Subtarget->isPICStyleStubPIC())
      return;

    // If we are in non-pic codegen mode, we allow the address of a global (with
    // an optional displacement) to be used with 'i'.
    GlobalAddressSDNode *GA = nullptr;
    int64_t Offset = 0;

    // Match either (GA), (GA+C), (GA+C1+C2), etc.
    while (1) {
      if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
        Offset += GA->getOffset();
        break;
      } else if (Op.getOpcode() == ISD::ADD) {
        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
          Offset += C->getZExtValue();
          Op = Op.getOperand(0);
          continue;
        }
      } else if (Op.getOpcode() == ISD::SUB) {
        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
          Offset += -C->getZExtValue();
          Op = Op.getOperand(0);
          continue;
        }
      }

      // Otherwise, this isn't something we can handle, reject it.
      return;
    }

    const GlobalValue *GV = GA->getGlobal();
    // If we require an extra load to get this address, as in PIC mode, we
    // can't accept it.
    if (isGlobalStubReference(
            Subtarget->ClassifyGlobalReference(GV, DAG.getTarget())))
      return;

    Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
                                        GA->getValueType(0), Offset);
    break;
  }
  }

  if (Result.getNode()) {
    Ops.push_back(Result);
    return;
  }
  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}
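
// Illustrative 'i' lowering: in non-PIC code an operand such as
//   (add (add (globaladdr @g), 8), 4)
// is walked by the loop above and folded to a single
// TargetGlobalAddress(@g, +12) immediate, while GOT/stub PIC styles reject
// it because the address then needs a runtime load.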

std::pair<unsigned, const TargetRegisterClass*>
X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
                                                MVT VT) const {
  // First, see if this is a constraint that directly corresponds to an LLVM
  // register class.
  if (Constraint.size() == 1) {
    // GCC Constraint Letters
    switch (Constraint[0]) {
    default: break;
    // TODO: Slight differences here in allocation order and leaving
    // RIP in the class. Do they matter any more here than they do
    // in the normal allocation?
    case 'q':   // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
      if (Subtarget->is64Bit()) {
        if (VT == MVT::i32 || VT == MVT::f32)
          return std::make_pair(0U, &X86::GR32RegClass);
        if (VT == MVT::i16)
          return std::make_pair(0U, &X86::GR16RegClass);
        if (VT == MVT::i8 || VT == MVT::i1)
          return std::make_pair(0U, &X86::GR8RegClass);
        if (VT == MVT::i64 || VT == MVT::f64)
          return std::make_pair(0U, &X86::GR64RegClass);
        break;
      }
      // 32-bit fallthrough
    case 'Q':   // Q_REGS
      if (VT == MVT::i32 || VT == MVT::f32)
        return std::make_pair(0U, &X86::GR32_ABCDRegClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, &X86::GR16_ABCDRegClass);
      if (VT == MVT::i8 || VT == MVT::i1)
        return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
      if (VT == MVT::i64)
        return std::make_pair(0U, &X86::GR64_ABCDRegClass);
      break;
    case 'r':   // GENERAL_REGS
    case 'l':   // INDEX_REGS
      if (VT == MVT::i8 || VT == MVT::i1)
        return std::make_pair(0U, &X86::GR8RegClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, &X86::GR16RegClass);
      if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget->is64Bit())
        return std::make_pair(0U, &X86::GR32RegClass);
      return std::make_pair(0U, &X86::GR64RegClass);
    case 'R':   // LEGACY_REGS
      if (VT == MVT::i8 || VT == MVT::i1)
        return std::make_pair(0U, &X86::GR8_NOREXRegClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, &X86::GR16_NOREXRegClass);
      if (VT == MVT::i32 || !Subtarget->is64Bit())
        return std::make_pair(0U, &X86::GR32_NOREXRegClass);
      return std::make_pair(0U, &X86::GR64_NOREXRegClass);
    case 'f':   // FP Stack registers.
      // If SSE is enabled for this VT, use f80 to ensure the isel moves the
      // value to the correct fpstack register class.
      if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, &X86::RFP32RegClass);
      if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, &X86::RFP64RegClass);
      return std::make_pair(0U, &X86::RFP80RegClass);
    case 'y':   // MMX_REGS if MMX allowed.
      if (!Subtarget->hasMMX()) break;
      return std::make_pair(0U, &X86::VR64RegClass);
    case 'Y':   // SSE_REGS if SSE2 allowed
      if (!Subtarget->hasSSE2()) break;
      // FALL THROUGH.
    case 'x':   // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
      if (!Subtarget->hasSSE1()) break;

      switch (VT.SimpleTy) {
      default: break;
      // Scalar SSE types.
      case MVT::f32: case MVT::i32:
        return std::make_pair(0U, &X86::FR32RegClass);
      case MVT::f64: case MVT::i64:
        return std::make_pair(0U, &X86::FR64RegClass);
      // Vector types.
      case MVT::v16i8: case MVT::v8i16: case MVT::v4i32:
      case MVT::v2i64: case MVT::v4f32: case MVT::v2f64:
        return std::make_pair(0U, &X86::VR128RegClass);
      // AVX types.
      case MVT::v32i8: case MVT::v16i16: case MVT::v8i32:
      case MVT::v4i64: case MVT::v8f32: case MVT::v4f64:
        return std::make_pair(0U, &X86::VR256RegClass);
      // AVX-512 types.
      case MVT::v8f64: case MVT::v16f32: case MVT::v16i32: case MVT::v8i64:
        return std::make_pair(0U, &X86::VR512RegClass);
      }
      break;
    }
  }

  // Use the default implementation in TargetLowering to convert the register
  // constraint into a member of a register class.
  std::pair<unsigned, const TargetRegisterClass*> Res;
  Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);

  // Not found as a standard register?
  if (!Res.second) {
    // Map st(0) .. st(7) -> ST0 .. ST7
    if (Constraint.size() == 7 && Constraint[0] == '{' &&
        tolower(Constraint[1]) == 's' &&
        tolower(Constraint[2]) == 't' &&
        Constraint[3] == '(' &&
        (Constraint[4] >= '0' && Constraint[4] <= '7') &&
        Constraint[5] == ')' &&
        Constraint[6] == '}') {
      Res.first = X86::FP0+Constraint[4]-'0';
      Res.second = &X86::RFP80RegClass;
      return Res;
    }

    // GCC allows "st(0)" to be called just plain "st".
    if (StringRef("{st}").equals_lower(Constraint)) {
      Res.first = X86::FP0;
      Res.second = &X86::RFP80RegClass;
      return Res;
    }

    // flags -> EFLAGS
    if (StringRef("{flags}").equals_lower(Constraint)) {
      Res.first = X86::EFLAGS;
      Res.second = &X86::CCRRegClass;
      return Res;
    }

    // 'A' means EAX + EDX.
    if (Constraint == "A") {
      Res.first = X86::EAX;
      Res.second = &X86::GR32_ADRegClass;
      return Res;
    }
    return Res;
  }

  // Otherwise, check to see if this is a register class of the wrong value
  // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
  // turn into {ax},{dx}.
  if (Res.second->hasType(VT))
    return Res;   // Correct type already, nothing to do.

  // All of the single-register GCC register classes map their values onto
  // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp". If we
  // really want an 8-bit or 32-bit register, map to the appropriate register
  // class and return the appropriate register.
  if (Res.second == &X86::GR16RegClass) {
    if (VT == MVT::i8 || VT == MVT::i1) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::AL; break;
      case X86::DX: DestReg = X86::DL; break;
      case X86::CX: DestReg = X86::CL; break;
      case X86::BX: DestReg = X86::BL; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = &X86::GR8RegClass;
      }
    } else if (VT == MVT::i32 || VT == MVT::f32) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::EAX; break;
      case X86::DX: DestReg = X86::EDX; break;
      case X86::CX: DestReg = X86::ECX; break;
      case X86::BX: DestReg = X86::EBX; break;
      case X86::SI: DestReg = X86::ESI; break;
      case X86::DI: DestReg = X86::EDI; break;
      case X86::BP: DestReg = X86::EBP; break;
      case X86::SP: DestReg = X86::ESP; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = &X86::GR32RegClass;
      }
    } else if (VT == MVT::i64 || VT == MVT::f64) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::RAX; break;
      case X86::DX: DestReg = X86::RDX; break;
      case X86::CX: DestReg = X86::RCX; break;
      case X86::BX: DestReg = X86::RBX; break;
      case X86::SI: DestReg = X86::RSI; break;
      case X86::DI: DestReg = X86::RDI; break;
      case X86::BP: DestReg = X86::RBP; break;
      case X86::SP: DestReg = X86::RSP; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = &X86::GR64RegClass;
      }
    }
  } else if (Res.second == &X86::FR32RegClass ||
             Res.second == &X86::FR64RegClass ||
             Res.second == &X86::VR128RegClass ||
             Res.second == &X86::VR256RegClass ||
             Res.second == &X86::FR32XRegClass ||
             Res.second == &X86::FR64XRegClass ||
             Res.second == &X86::VR128XRegClass ||
             Res.second == &X86::VR256XRegClass ||
             Res.second == &X86::VR512RegClass) {
    // Handle references to XMM physical registers that got mapped into the
    // wrong class. This can happen with constraints like {xmm0} where the
    // target independent register mapper will just pick the first match it can
    // find, ignoring the required type.

    if (VT == MVT::f32 || VT == MVT::i32)
      Res.second = &X86::FR32RegClass;
    else if (VT == MVT::f64 || VT == MVT::i64)
      Res.second = &X86::FR64RegClass;
    else if (X86::VR128RegClass.hasType(VT))
      Res.second = &X86::VR128RegClass;
    else if (X86::VR256RegClass.hasType(VT))
      Res.second = &X86::VR256RegClass;
    else if (X86::VR512RegClass.hasType(VT))
      Res.second = &X86::VR512RegClass;
  }

  return Res;
}
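
// Illustrative fixup: an "{xmm0}" operand carrying an f64 value first comes
// back from the generic mapper as XMM0 in VR128; the retyping above switches
// the class to FR64 so the scalar double is handled in the correct class.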

int X86TargetLowering::getScalingFactorCost(const AddrMode &AM,
                                            Type *Ty) const {
  // Scaling factors are not free at all.
  // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
  // will take 2 allocations in the out of order engine instead of 1
  // for plain addressing mode, i.e. inst (reg1).
  // E.g.,
  // vaddps (%rsi,%rdx), %ymm0, %ymm1
  // Requires two allocations (one for the load, one for the computation)
  // whereas:
  // vaddps (%rsi), %ymm0, %ymm1
  // Requires just 1 allocation, i.e., freeing allocations for other operations
  // and having fewer micro operations to execute.
  //
  // For some X86 architectures, this is even worse because for instance for
  // stores, the complex addressing mode forces the instruction to use the
  // "load" ports instead of the dedicated "store" port.
  // E.g., on Haswell:
  // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
  // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
  if (isLegalAddressingMode(AM, Ty))
    // Scale represents reg2 * scale, thus account for 1
    // as soon as we use a second register.
    return AM.Scale != 0;
  return -1;
}

bool X86TargetLowering::isTargetFTOL() const {
  return Subtarget->isTargetKnownWindowsMSVC() && !Subtarget->is64Bit();
}