lib/Target/X86/X86ISelLowering.cpp

   1 //===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 // This file defines the interfaces that X86 uses to lower LLVM code into a
  11 // selection DAG.
  12 //
  13 //===----------------------------------------------------------------------===//
  14
  15 #include "X86ISelLowering.h"
  16 #include "Utils/X86ShuffleDecode.h"
  17 #include "X86CallingConv.h"
  18 #include "X86FrameLowering.h"
  19 #include "X86InstrBuilder.h"
  20 #include "X86MachineFunctionInfo.h"
  21 #include "X86TargetMachine.h"
  22 #include "X86TargetObjectFile.h"
  23 #include "llvm/ADT/SmallBitVector.h"
  24 #include "llvm/ADT/SmallSet.h"
  25 #include "llvm/ADT/Statistic.h"
  26 #include "llvm/ADT/StringExtras.h"
  27 #include "llvm/ADT/StringSwitch.h"
  28 #include "llvm/ADT/VariadicFunction.h"
  29 #include "llvm/CodeGen/IntrinsicLowering.h"
  30 #include "llvm/CodeGen/MachineFrameInfo.h"
  31 #include "llvm/CodeGen/MachineFunction.h"
  32 #include "llvm/CodeGen/MachineInstrBuilder.h"
  33 #include "llvm/CodeGen/MachineJumpTableInfo.h"
  34 #include "llvm/CodeGen/MachineModuleInfo.h"
  35 #include "llvm/CodeGen/MachineRegisterInfo.h"
  36 #include "llvm/IR/CallSite.h"
  37 #include "llvm/IR/CallingConv.h"
  38 #include "llvm/IR/Constants.h"
  39 #include "llvm/IR/DerivedTypes.h"
  40 #include "llvm/IR/Function.h"
  41 #include "llvm/IR/GlobalAlias.h"
  42 #include "llvm/IR/GlobalVariable.h"
  43 #include "llvm/IR/Instructions.h"
  44 #include "llvm/IR/Intrinsics.h"
  45 #include "llvm/MC/MCAsmInfo.h"
  46 #include "llvm/MC/MCContext.h"
  47 #include "llvm/MC/MCExpr.h"
  48 #include "llvm/MC/MCSymbol.h"
  49 #include "llvm/Support/CommandLine.h"
  50 #include "llvm/Support/Debug.h"
  51 #include "llvm/Support/ErrorHandling.h"
  52 #include "llvm/Support/MathExtras.h"
  53 #include "llvm/Target/TargetOptions.h"
  54 #include "X86IntrinsicsInfo.h"
  55 #include <bitset>
  56 #include <numeric>
  57 #include <cctype>
  58 using namespace llvm;
  59
  60 #define DEBUG_TYPE "x86-isel"
  61
  62 STATISTIC(NumTailCalls, "Number of tail calls");
  63
  64 static cl::opt<bool> ExperimentalVectorWideningLegalization(
  65     "x86-experimental-vector-widening-legalization", cl::init(false),
  66     cl::desc("Enable an experimental vector type legalization through widening "
  67              "rather than promotion."),
  68     cl::Hidden);
  69
  70 static cl::opt<bool> ExperimentalVectorShuffleLowering(
  71     "x86-experimental-vector-shuffle-lowering", cl::init(true),
  72     cl::desc("Enable an experimental vector shuffle lowering code path."),
  73     cl::Hidden);
  74
  75 static cl::opt<bool> ExperimentalVectorShuffleLegality(
  76     "x86-experimental-vector-shuffle-legality", cl::init(false),
  77     cl::desc("Enable experimental shuffle legality based on the experimental "
  78              "shuffle lowering. Should only be used with the experimental "
  79              "shuffle lowering."),
  80     cl::Hidden);
  81
  82 static cl::opt<int> ReciprocalEstimateRefinementSteps(
  83     "x86-recip-refinement-steps", cl::init(1),
  84     cl::desc("Specify the number of Newton-Raphson iterations applied to the "
  85              "result of the hardware reciprocal estimate instruction."),
  86     cl::NotHidden);
  87
  88 // Forward declarations.
  89 static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
  90                        SDValue V2);
  91
  92 static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal,
  93                                 SelectionDAG &DAG, SDLoc dl,
  94                                 unsigned vectorWidth) {
  95   assert((vectorWidth == 128 || vectorWidth == 256) &&
  96          "Unsupported vector width");
  97   EVT VT = Vec.getValueType();
  98   EVT ElVT = VT.getVectorElementType();
  99   unsigned Factor = VT.getSizeInBits()/vectorWidth;
 100   EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
 101                                   VT.getVectorNumElements()/Factor);
 102
 103   // Extract from UNDEF is UNDEF.
 104   if (Vec.getOpcode() == ISD::UNDEF)
 105     return DAG.getUNDEF(ResultVT);
 106
 107   // Extract the relevant vectorWidth bits.  Generate an EXTRACT_SUBVECTOR
 108   unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
 109
 110   // This is the index of the first element of the vectorWidth-bit chunk
 111   // we want.
 112   unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / vectorWidth)
 113                                * ElemsPerChunk);
 114
 115   // If the input is a buildvector just emit a smaller one.
 116   if (Vec.getOpcode() == ISD::BUILD_VECTOR)
 117     return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
 118                        makeArrayRef(Vec->op_begin() + NormalizedIdxVal,
 119                                     ElemsPerChunk));
 120
 121   SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
 122   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
 123 }
 124
 125 /// Generate a DAG to grab 128-bits from a vector > 128 bits.  This
 126 /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
 127 /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
 128 /// instructions or a simple subregister reference. Idx is an index in the
 129 /// 128 bits we want.  It need not be aligned to a 128-bit boundary.  That makes
 130 /// lowering EXTRACT_VECTOR_ELT operations easier.
 131 static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
 132                                    SelectionDAG &DAG, SDLoc dl) {
 133   assert((Vec.getValueType().is256BitVector() ||
 134           Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
 135   return ExtractSubVector(Vec, IdxVal, DAG, dl, 128);
 136 }
 137
 138 /// Generate a DAG to grab 256-bits from a 512-bit vector.
 139 static SDValue Extract256BitVector(SDValue Vec, unsigned IdxVal,
 140                                    SelectionDAG &DAG, SDLoc dl) {
 141   assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
 142   return ExtractSubVector(Vec, IdxVal, DAG, dl, 256);
 143 }
 144
 145 static SDValue InsertSubVector(SDValue Result, SDValue Vec,
 146                                unsigned IdxVal, SelectionDAG &DAG,
 147                                SDLoc dl, unsigned vectorWidth) {
 148   assert((vectorWidth == 128 || vectorWidth == 256) &&
 149          "Unsupported vector width");
 150   // Inserting UNDEF is Result
 151   if (Vec.getOpcode() == ISD::UNDEF)
 152     return Result;
 153   EVT VT = Vec.getValueType();
 154   EVT ElVT = VT.getVectorElementType();
 155   EVT ResultVT = Result.getValueType();
 156
 157   // Insert the relevant vectorWidth bits.
 158   unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
 159
 160   // This is the index of the first element of the vectorWidth-bit chunk
 161   // we want.
 162   unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/vectorWidth)
 163                                * ElemsPerChunk);
 164
 165   SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
 166   return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
 167 }
 168
 169 /// Generate a DAG to put 128-bits into a vector > 128 bits.  This
 170 /// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
 171 /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
 172 /// simple superregister reference.  Idx is an index in the 128 bits
 173 /// we want.  It need not be aligned to a 128-bit boundary.  That makes
 174 /// lowering INSERT_VECTOR_ELT operations easier.
 175 static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
 176                                   SelectionDAG &DAG,SDLoc dl) {
 177   assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
 178   return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
 179 }
 180
 181 static SDValue Insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
 182                                   SelectionDAG &DAG, SDLoc dl) {
 183   assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
 184   return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
 185 }
 186
 187 /// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128
 188 /// instructions. This is used because creating CONCAT_VECTOR nodes of
 189 /// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
 190 /// large BUILD_VECTORS.
 191 static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
 192                                    unsigned NumElems, SelectionDAG &DAG,
 193                                    SDLoc dl) {
 194   SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
 195   return Insert128BitVector(V, V2, NumElems/2, DAG, dl);
 196 }
 197
 198 static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
 199                                    unsigned NumElems, SelectionDAG &DAG,
 200                                    SDLoc dl) {
 201   SDValue V = Insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
 202   return Insert256BitVector(V, V2, NumElems/2, DAG, dl);
 203 }
 204
 205 X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
 206                                      const X86Subtarget &STI)
 207     : TargetLowering(TM), Subtarget(&STI) {
 208   X86ScalarSSEf64 = Subtarget->hasSSE2();
 209   X86ScalarSSEf32 = Subtarget->hasSSE1();
 210   TD = getDataLayout();
 211
 212   // Set up the TargetLowering object.
 213   static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };
 214
 215   // X86 is weird. It always uses i8 for shift amounts and setcc results.
 216   setBooleanContents(ZeroOrOneBooleanContent);
 217   // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
 218   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
 219
 220   // For 64-bit, since we have so many registers, use the ILP scheduler.
 221   // For 32-bit, use the register pressure specific scheduling.
 222   // For Atom, always use ILP scheduling.
 223   if (Subtarget->isAtom())
 224     setSchedulingPreference(Sched::ILP);
 225   else if (Subtarget->is64Bit())
 226     setSchedulingPreference(Sched::ILP);
 227   else
 228     setSchedulingPreference(Sched::RegPressure);
 229   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
 230   setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
 231
 232   // Bypass expensive divides on Atom when compiling with O2.
 233   if (TM.getOptLevel() >= CodeGenOpt::Default) {
 234     if (Subtarget->hasSlowDivide32())
 235       addBypassSlowDiv(32, 8);
 236     if (Subtarget->hasSlowDivide64() && Subtarget->is64Bit())
 237       addBypassSlowDiv(64, 16);
 238   }
 239
 240   if (Subtarget->isTargetKnownWindowsMSVC()) {
 241     // Setup Windows compiler runtime calls.
 242     setLibcallName(RTLIB::SDIV_I64, "_alldiv");
 243     setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
 244     setLibcallName(RTLIB::SREM_I64, "_allrem");
 245     setLibcallName(RTLIB::UREM_I64, "_aullrem");
 246     setLibcallName(RTLIB::MUL_I64, "_allmul");
 247     setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
 248     setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
 249     setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
 250     setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
 251     setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
 252
 253     // The _ftol2 runtime function has an unusual calling conv, which
 254     // is modeled by a special pseudo-instruction.
 255     setLibcallName(RTLIB::FPTOUINT_F64_I64, nullptr);
 256     setLibcallName(RTLIB::FPTOUINT_F32_I64, nullptr);
 257     setLibcallName(RTLIB::FPTOUINT_F64_I32, nullptr);
 258     setLibcallName(RTLIB::FPTOUINT_F32_I32, nullptr);
 259   }
 260
 261   if (Subtarget->isTargetDarwin()) {
 262     // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
 263     setUseUnderscoreSetJmp(false);
 264     setUseUnderscoreLongJmp(false);
 265   } else if (Subtarget->isTargetWindowsGNU()) {
 266     // MS runtime is weird: it exports _setjmp, but longjmp!
 267     setUseUnderscoreSetJmp(true);
 268     setUseUnderscoreLongJmp(false);
 269   } else {
 270     setUseUnderscoreSetJmp(true);
 271     setUseUnderscoreLongJmp(true);
 272   }
 273
 274   // Set up the register classes.
 275   addRegisterClass(MVT::i8, &X86::GR8RegClass);
 276   addRegisterClass(MVT::i16, &X86::GR16RegClass);
 277   addRegisterClass(MVT::i32, &X86::GR32RegClass);
 278   if (Subtarget->is64Bit())
 279     addRegisterClass(MVT::i64, &X86::GR64RegClass);
 280
 281   for (MVT VT : MVT::integer_valuetypes())
 282     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
 283
 284   // We don't accept any truncstore of integer registers.
 285   setTruncStoreAction(MVT::i64, MVT::i32, Expand);
 286   setTruncStoreAction(MVT::i64, MVT::i16, Expand);
 287   setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
 288   setTruncStoreAction(MVT::i32, MVT::i16, Expand);
 289   setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
 290   setTruncStoreAction(MVT::i16, MVT::i8,  Expand);
 291
 292   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
 293
 294   // SETOEQ and SETUNE require checking two conditions.
 295   setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
 296   setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
 297   setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
 298   setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
 299   setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
 300   setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
 301
 302   // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
 303   // operation.
 304   setOperationAction(ISD::UINT_TO_FP       , MVT::i1   , Promote);
 305   setOperationAction(ISD::UINT_TO_FP       , MVT::i8   , Promote);
 306   setOperationAction(ISD::UINT_TO_FP       , MVT::i16  , Promote);
 307
 308   if (Subtarget->is64Bit()) {
 309     setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Promote);
 310     setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
 311   } else if (!TM.Options.UseSoftFloat) {
 312     // We have an algorithm for SSE2->double, and we turn this into a
 313     // 64-bit FILD followed by conditional FADD for other targets.
 314     setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
 315     // We have an algorithm for SSE2, and we turn this into a 64-bit
 316     // FILD for other targets.
 317     setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Custom);
 318   }
 319
 320   // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
 321   // this operation.
 322   setOperationAction(ISD::SINT_TO_FP       , MVT::i1   , Promote);
 323   setOperationAction(ISD::SINT_TO_FP       , MVT::i8   , Promote);
 324
 325   if (!TM.Options.UseSoftFloat) {
 326     // SSE has no i16 to fp conversion, only i32
 327     if (X86ScalarSSEf32) {
 328       setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
 329       // f32 and f64 cases are Legal, f80 case is not
 330       setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
 331     } else {
 332       setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Custom);
 333       setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
 334     }
 335   } else {
 336     setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
 337     setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Promote);
 338   }
 339
 340   // In 32-bit mode these are custom lowered.  In 64-bit mode F32 and F64
 341   // are Legal, f80 is custom lowered.
 342   setOperationAction(ISD::FP_TO_SINT     , MVT::i64  , Custom);
 343   setOperationAction(ISD::SINT_TO_FP     , MVT::i64  , Custom);
 344
 345   // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
 346   // this operation.
 347   setOperationAction(ISD::FP_TO_SINT       , MVT::i1   , Promote);
 348   setOperationAction(ISD::FP_TO_SINT       , MVT::i8   , Promote);
 349
 350   if (X86ScalarSSEf32) {
 351     setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
 352     // f32 and f64 cases are Legal, f80 case is not
 353     setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
 354   } else {
 355     setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Custom);
 356     setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
 357   }
 358
 359   // Handle FP_TO_UINT by promoting the destination to a larger signed
 360   // conversion.
 361   setOperationAction(ISD::FP_TO_UINT       , MVT::i1   , Promote);
 362   setOperationAction(ISD::FP_TO_UINT       , MVT::i8   , Promote);
 363   setOperationAction(ISD::FP_TO_UINT       , MVT::i16  , Promote);
 364
 365   if (Subtarget->is64Bit()) {
 366     setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Expand);
 367     setOperationAction(ISD::FP_TO_UINT     , MVT::i32  , Promote);
 368   } else if (!TM.Options.UseSoftFloat) {
 369     // Since AVX is a superset of SSE3, only check for SSE here.
 370     if (Subtarget->hasSSE1() && !Subtarget->hasSSE3())
 371       // Expand FP_TO_UINT into a select.
 372       // FIXME: We would like to use a Custom expander here eventually to do
 373       // the optimal thing for SSE vs. the default expansion in the legalizer.
 374       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Expand);
 375     else
 376       // With SSE3 we can use fisttpll to convert to a signed i64; without
 377       // SSE, we're stuck with a fistpll.
 378       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Custom);
 379   }
 380
 381   if (isTargetFTOL()) {
 382     // Use the _ftol2 runtime function, which has a pseudo-instruction
 383     // to handle its weird calling convention.
 384     setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Custom);
 385   }
 386
 387   // TODO: when we have SSE, these could be more efficient, by using movd/movq.
 388   if (!X86ScalarSSEf64) {
 389     setOperationAction(ISD::BITCAST        , MVT::f32  , Expand);
 390     setOperationAction(ISD::BITCAST        , MVT::i32  , Expand);
 391     if (Subtarget->is64Bit()) {
 392       setOperationAction(ISD::BITCAST      , MVT::f64  , Expand);
 393       // Without SSE, i64->f64 goes through memory.
 394       setOperationAction(ISD::BITCAST      , MVT::i64  , Expand);
 395     }
 396   }
 397
 398   // Scalar integer divide and remainder are lowered to use operations that
 399   // produce two results, to match the available instructions. This exposes
 400   // the two-result form to trivial CSE, which is able to combine x/y and x%y
 401   // into a single instruction.
 402   //
 403   // Scalar integer multiply-high is also lowered to use two-result
 404   // operations, to match the available instructions. However, plain multiply
 405   // (low) operations are left as Legal, as there are single-result
 406   // instructions for this in x86. Using the two-result multiply instructions
 407   // when both high and low results are needed must be arranged by dagcombine.
 408   for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
 409     MVT VT = IntVTs[i];
 410     setOperationAction(ISD::MULHS, VT, Expand);
 411     setOperationAction(ISD::MULHU, VT, Expand);
 412     setOperationAction(ISD::SDIV, VT, Expand);
 413     setOperationAction(ISD::UDIV, VT, Expand);
 414     setOperationAction(ISD::SREM, VT, Expand);
 415     setOperationAction(ISD::UREM, VT, Expand);
 416
 417     // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences.
 418     setOperationAction(ISD::ADDC, VT, Custom);
 419     setOperationAction(ISD::ADDE, VT, Custom);
 420     setOperationAction(ISD::SUBC, VT, Custom);
 421     setOperationAction(ISD::SUBE, VT, Custom);
 422   }
 423
 424   setOperationAction(ISD::BR_JT            , MVT::Other, Expand);
 425   setOperationAction(ISD::BRCOND           , MVT::Other, Custom);
 426   setOperationAction(ISD::BR_CC            , MVT::f32,   Expand);
 427   setOperationAction(ISD::BR_CC            , MVT::f64,   Expand);
 428   setOperationAction(ISD::BR_CC            , MVT::f80,   Expand);
 429   setOperationAction(ISD::BR_CC            , MVT::i8,    Expand);
 430   setOperationAction(ISD::BR_CC            , MVT::i16,   Expand);
 431   setOperationAction(ISD::BR_CC            , MVT::i32,   Expand);
 432   setOperationAction(ISD::BR_CC            , MVT::i64,   Expand);
 433   setOperationAction(ISD::SELECT_CC        , MVT::f32,   Expand);
 434   setOperationAction(ISD::SELECT_CC        , MVT::f64,   Expand);
 435   setOperationAction(ISD::SELECT_CC        , MVT::f80,   Expand);
 436   setOperationAction(ISD::SELECT_CC        , MVT::i8,    Expand);
 437   setOperationAction(ISD::SELECT_CC        , MVT::i16,   Expand);
 438   setOperationAction(ISD::SELECT_CC        , MVT::i32,   Expand);
 439   setOperationAction(ISD::SELECT_CC        , MVT::i64,   Expand);
 440   if (Subtarget->is64Bit())
 441     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
 442   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16  , Legal);
 443   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8   , Legal);
 444   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1   , Expand);
 445   setOperationAction(ISD::FP_ROUND_INREG   , MVT::f32  , Expand);
 446   setOperationAction(ISD::FREM             , MVT::f32  , Expand);
 447   setOperationAction(ISD::FREM             , MVT::f64  , Expand);
 448   setOperationAction(ISD::FREM             , MVT::f80  , Expand);
 449   setOperationAction(ISD::FLT_ROUNDS_      , MVT::i32  , Custom);
 450
 451   // Promote the i8 variants and force them on up to i32 which has a shorter
 452   // encoding.
 453   setOperationAction(ISD::CTTZ             , MVT::i8   , Promote);
 454   AddPromotedToType (ISD::CTTZ             , MVT::i8   , MVT::i32);
 455   setOperationAction(ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , Promote);
 456   AddPromotedToType (ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , MVT::i32);
 457   if (Subtarget->hasBMI()) {
 458     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16  , Expand);
 459     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32  , Expand);
 460     if (Subtarget->is64Bit())
 461       setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
 462   } else {
 463     setOperationAction(ISD::CTTZ           , MVT::i16  , Custom);
 464     setOperationAction(ISD::CTTZ           , MVT::i32  , Custom);
 465     if (Subtarget->is64Bit())
 466       setOperationAction(ISD::CTTZ         , MVT::i64  , Custom);
 467   }
 468
 469   if (Subtarget->hasLZCNT()) {
 470     // When promoting the i8 variants, force them to i32 for a shorter
 471     // encoding.
 472     setOperationAction(ISD::CTLZ           , MVT::i8   , Promote);
 473     AddPromotedToType (ISD::CTLZ           , MVT::i8   , MVT::i32);
 474     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Promote);
 475     AddPromotedToType (ISD::CTLZ_ZERO_UNDEF, MVT::i8   , MVT::i32);
 476     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Expand);
 477     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Expand);
 478     if (Subtarget->is64Bit())
 479       setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
 480   } else {
 481     setOperationAction(ISD::CTLZ           , MVT::i8   , Custom);
 482     setOperationAction(ISD::CTLZ           , MVT::i16  , Custom);
 483     setOperationAction(ISD::CTLZ           , MVT::i32  , Custom);
 484     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Custom);
 485     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Custom);
 486     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Custom);
 487     if (Subtarget->is64Bit()) {
 488       setOperationAction(ISD::CTLZ         , MVT::i64  , Custom);
 489       setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
 490     }
 491   }
 492
 493   // Special handling for half-precision floating point conversions.
 494   // If we don't have F16C support, then lower half float conversions
 495   // into library calls.
 496   if (TM.Options.UseSoftFloat || !Subtarget->hasF16C()) {
 497     setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
 498     setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
 499   }
 500
 501   // There's never any support for operations beyond MVT::f32.
 502   setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
 503   setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
 504   setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
 505   setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);
 506
 507   setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
 508   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
 509   setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
 510   setTruncStoreAction(MVT::f32, MVT::f16, Expand);
 511   setTruncStoreAction(MVT::f64, MVT::f16, Expand);
 512   setTruncStoreAction(MVT::f80, MVT::f16, Expand);
 513
 514   if (Subtarget->hasPOPCNT()) {
 515     setOperationAction(ISD::CTPOP          , MVT::i8   , Promote);
 516   } else {
 517     setOperationAction(ISD::CTPOP          , MVT::i8   , Expand);
 518     setOperationAction(ISD::CTPOP          , MVT::i16  , Expand);
 519     setOperationAction(ISD::CTPOP          , MVT::i32  , Expand);
 520     if (Subtarget->is64Bit())
 521       setOperationAction(ISD::CTPOP        , MVT::i64  , Expand);
 522   }
 523
 524   setOperationAction(ISD::READCYCLECOUNTER , MVT::i64  , Custom);
 525
 526   if (!Subtarget->hasMOVBE())
 527     setOperationAction(ISD::BSWAP          , MVT::i16  , Expand);
 528
 529   // These should be promoted to a larger select which is supported.
 530   setOperationAction(ISD::SELECT          , MVT::i1   , Promote);
 531   // X86 wants to expand cmov itself.
 532   setOperationAction(ISD::SELECT          , MVT::i8   , Custom);
 533   setOperationAction(ISD::SELECT          , MVT::i16  , Custom);
 534   setOperationAction(ISD::SELECT          , MVT::i32  , Custom);
 535   setOperationAction(ISD::SELECT          , MVT::f32  , Custom);
 536   setOperationAction(ISD::SELECT          , MVT::f64  , Custom);
 537   setOperationAction(ISD::SELECT          , MVT::f80  , Custom);
 538   setOperationAction(ISD::SETCC           , MVT::i8   , Custom);
 539   setOperationAction(ISD::SETCC           , MVT::i16  , Custom);
 540   setOperationAction(ISD::SETCC           , MVT::i32  , Custom);
 541   setOperationAction(ISD::SETCC           , MVT::f32  , Custom);
 542   setOperationAction(ISD::SETCC           , MVT::f64  , Custom);
 543   setOperationAction(ISD::SETCC           , MVT::f80  , Custom);
 544   if (Subtarget->is64Bit()) {
 545     setOperationAction(ISD::SELECT        , MVT::i64  , Custom);
 546     setOperationAction(ISD::SETCC         , MVT::i64  , Custom);
 547   }
 548   setOperationAction(ISD::EH_RETURN       , MVT::Other, Custom);
 549   // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support
 550   // SjLj exception handling but a light-weight setjmp/longjmp replacement to
 551   // support continuation, user-level threading, and etc.. As a result, no
 552   // other SjLj exception interfaces are implemented and please don't build
 553   // your own exception handling based on them.
 554   // LLVM/Clang supports zero-cost DWARF exception handling.
 555   setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
 556   setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
 557
 558   // Darwin ABI issue.
 559   setOperationAction(ISD::ConstantPool    , MVT::i32  , Custom);
 560   setOperationAction(ISD::JumpTable       , MVT::i32  , Custom);
 561   setOperationAction(ISD::GlobalAddress   , MVT::i32  , Custom);
 562   setOperationAction(ISD::GlobalTLSAddress, MVT::i32  , Custom);
 563   if (Subtarget->is64Bit())
 564     setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
 565   setOperationAction(ISD::ExternalSymbol  , MVT::i32  , Custom);
 566   setOperationAction(ISD::BlockAddress    , MVT::i32  , Custom);
 567   if (Subtarget->is64Bit()) {
 568     setOperationAction(ISD::ConstantPool  , MVT::i64  , Custom);
 569     setOperationAction(ISD::JumpTable     , MVT::i64  , Custom);
 570     setOperationAction(ISD::GlobalAddress , MVT::i64  , Custom);
 571     setOperationAction(ISD::ExternalSymbol, MVT::i64  , Custom);
 572     setOperationAction(ISD::BlockAddress  , MVT::i64  , Custom);
 573   }
  574   // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
 575   setOperationAction(ISD::SHL_PARTS       , MVT::i32  , Custom);
 576   setOperationAction(ISD::SRA_PARTS       , MVT::i32  , Custom);
 577   setOperationAction(ISD::SRL_PARTS       , MVT::i32  , Custom);
 578   if (Subtarget->is64Bit()) {
 579     setOperationAction(ISD::SHL_PARTS     , MVT::i64  , Custom);
 580     setOperationAction(ISD::SRA_PARTS     , MVT::i64  , Custom);
 581     setOperationAction(ISD::SRL_PARTS     , MVT::i64  , Custom);
 582   }
 583
 584   if (Subtarget->hasSSE1())
 585     setOperationAction(ISD::PREFETCH      , MVT::Other, Legal);
 586
 587   setOperationAction(ISD::ATOMIC_FENCE  , MVT::Other, Custom);
 588
 589   // Expand certain atomics
 590   for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
 591     MVT VT = IntVTs[i];
 592     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
 593     setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
 594     setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
 595   }
 596
 597   if (Subtarget->hasCmpxchg16b()) {
 598     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
 599   }
 600
 601   // FIXME - use subtarget debug flags
 602   if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetELF() &&
 603       !Subtarget->isTargetCygMing() && !Subtarget->isTargetWin64()) {
 604     setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
 605   }
 606
 607   if (Subtarget->is64Bit()) {
 608     setExceptionPointerRegister(X86::RAX);
 609     setExceptionSelectorRegister(X86::RDX);
 610   } else {
 611     setExceptionPointerRegister(X86::EAX);
 612     setExceptionSelectorRegister(X86::EDX);
 613   }
 614   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
 615   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
 616
 617   setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
 618   setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
 619
 620   setOperationAction(ISD::TRAP, MVT::Other, Legal);
 621   setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
 622
 623   // VASTART needs to be custom lowered to use the VarArgsFrameIndex
 624   setOperationAction(ISD::VASTART           , MVT::Other, Custom);
 625   setOperationAction(ISD::VAEND             , MVT::Other, Expand);
 626   if (Subtarget->is64Bit() && !Subtarget->isTargetWin64()) {
 627     // TargetInfo::X86_64ABIBuiltinVaList
 628     setOperationAction(ISD::VAARG           , MVT::Other, Custom);
 629     setOperationAction(ISD::VACOPY          , MVT::Other, Custom);
 630   } else {
 631     // TargetInfo::CharPtrBuiltinVaList
 632     setOperationAction(ISD::VAARG           , MVT::Other, Expand);
 633     setOperationAction(ISD::VACOPY          , MVT::Other, Expand);
 634   }
 635
 636   setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
 637   setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
 638
 639   setOperationAction(ISD::DYNAMIC_STACKALLOC, getPointerTy(), Custom);
 640
 641   if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) {
 642     // f32 and f64 use SSE.
 643     // Set up the FP register classes.
 644     addRegisterClass(MVT::f32, &X86::FR32RegClass);
 645     addRegisterClass(MVT::f64, &X86::FR64RegClass);
 646
 647     // Use ANDPD to simulate FABS.
 648     setOperationAction(ISD::FABS , MVT::f64, Custom);
 649     setOperationAction(ISD::FABS , MVT::f32, Custom);
 650
 651     // Use XORP to simulate FNEG.
 652     setOperationAction(ISD::FNEG , MVT::f64, Custom);
 653     setOperationAction(ISD::FNEG , MVT::f32, Custom);
 654
 655     // Use ANDPD and ORPD to simulate FCOPYSIGN.
 656     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
 657     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
 658
 659     // Lower this to FGETSIGNx86 plus an AND.
 660     setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
 661     setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
 662
 663     // We don't support sin/cos/fmod
 664     setOperationAction(ISD::FSIN   , MVT::f64, Expand);
 665     setOperationAction(ISD::FCOS   , MVT::f64, Expand);
 666     setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
 667     setOperationAction(ISD::FSIN   , MVT::f32, Expand);
 668     setOperationAction(ISD::FCOS   , MVT::f32, Expand);
 669     setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
 670
 671     // Expand FP immediates into loads from the stack, except for the special
 672     // cases we handle.
 673     addLegalFPImmediate(APFloat(+0.0)); // xorpd
 674     addLegalFPImmediate(APFloat(+0.0f)); // xorps
 675   } else if (!TM.Options.UseSoftFloat && X86ScalarSSEf32) {
 676     // Use SSE for f32, x87 for f64.
 677     // Set up the FP register classes.
 678     addRegisterClass(MVT::f32, &X86::FR32RegClass);
 679     addRegisterClass(MVT::f64, &X86::RFP64RegClass);
 680
 681     // Use ANDPS to simulate FABS.
 682     setOperationAction(ISD::FABS , MVT::f32, Custom);
 683
 684     // Use XORP to simulate FNEG.
 685     setOperationAction(ISD::FNEG , MVT::f32, Custom);
 686
 687     setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
 688
 689     // Use ANDPS and ORPS to simulate FCOPYSIGN.
 690     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
 691     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
 692
 693     // We don't support sin/cos/fmod
 694     setOperationAction(ISD::FSIN   , MVT::f32, Expand);
 695     setOperationAction(ISD::FCOS   , MVT::f32, Expand);
 696     setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
 697
 698     // Special cases we handle for FP constants.
 699     addLegalFPImmediate(APFloat(+0.0f)); // xorps
 700     addLegalFPImmediate(APFloat(+0.0)); // FLD0
 701     addLegalFPImmediate(APFloat(+1.0)); // FLD1
 702     addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
 703     addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
 704
 705     if (!TM.Options.UnsafeFPMath) {
 706       setOperationAction(ISD::FSIN   , MVT::f64, Expand);
 707       setOperationAction(ISD::FCOS   , MVT::f64, Expand);
 708       setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
 709     }
 710   } else if (!TM.Options.UseSoftFloat) {
 711     // f32 and f64 in x87.
 712     // Set up the FP register classes.
 713     addRegisterClass(MVT::f64, &X86::RFP64RegClass);
 714     addRegisterClass(MVT::f32, &X86::RFP32RegClass);
 715
 716     setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
 717     setOperationAction(ISD::UNDEF,     MVT::f32, Expand);
 718     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
 719     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
 720
 721     if (!TM.Options.UnsafeFPMath) {
 722       setOperationAction(ISD::FSIN   , MVT::f64, Expand);
 723       setOperationAction(ISD::FSIN   , MVT::f32, Expand);
 724       setOperationAction(ISD::FCOS   , MVT::f64, Expand);
 725       setOperationAction(ISD::FCOS   , MVT::f32, Expand);
 726       setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
 727       setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
 728     }
 729     addLegalFPImmediate(APFloat(+0.0)); // FLD0
 730     addLegalFPImmediate(APFloat(+1.0)); // FLD1
 731     addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
 732     addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
 733     addLegalFPImmediate(APFloat(+0.0f)); // FLD0
 734     addLegalFPImmediate(APFloat(+1.0f)); // FLD1
 735     addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
 736     addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
 737   }
 738
 739   // We don't support FMA.
 740   setOperationAction(ISD::FMA, MVT::f64, Expand);
 741   setOperationAction(ISD::FMA, MVT::f32, Expand);
 742
 743   // Long double always uses X87.
 744   if (!TM.Options.UseSoftFloat) {
 745     addRegisterClass(MVT::f80, &X86::RFP80RegClass);
 746     setOperationAction(ISD::UNDEF,     MVT::f80, Expand);
 747     setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
 748     {
 749       APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended);
 750       addLegalFPImmediate(TmpFlt);  // FLD0
 751       TmpFlt.changeSign();
 752       addLegalFPImmediate(TmpFlt);  // FLD0/FCHS
 753
 754       bool ignored;
 755       APFloat TmpFlt2(+1.0);
 756       TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
 757                       &ignored);
 758       addLegalFPImmediate(TmpFlt2);  // FLD1
 759       TmpFlt2.changeSign();
 760       addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
 761     }
 762
 763     if (!TM.Options.UnsafeFPMath) {
 764       setOperationAction(ISD::FSIN   , MVT::f80, Expand);
 765       setOperationAction(ISD::FCOS   , MVT::f80, Expand);
 766       setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
 767     }
 768
 769     setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
 770     setOperationAction(ISD::FCEIL,  MVT::f80, Expand);
 771     setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
 772     setOperationAction(ISD::FRINT,  MVT::f80, Expand);
 773     setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
 774     setOperationAction(ISD::FMA, MVT::f80, Expand);
 775   }
 776
 777   // Always use a library call for pow.
 778   setOperationAction(ISD::FPOW             , MVT::f32  , Expand);
 779   setOperationAction(ISD::FPOW             , MVT::f64  , Expand);
 780   setOperationAction(ISD::FPOW             , MVT::f80  , Expand);
 781
 782   setOperationAction(ISD::FLOG, MVT::f80, Expand);
 783   setOperationAction(ISD::FLOG2, MVT::f80, Expand);
 784   setOperationAction(ISD::FLOG10, MVT::f80, Expand);
 785   setOperationAction(ISD::FEXP, MVT::f80, Expand);
 786   setOperationAction(ISD::FEXP2, MVT::f80, Expand);
 787   setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
 788   setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
 789
 790   // First set operation action for all vector types to either promote
 791   // (for widening) or expand (for scalarization). Then we will selectively
 792   // turn on ones that can be effectively codegen'd.
 793   for (MVT VT : MVT::vector_valuetypes()) {
 794     setOperationAction(ISD::ADD , VT, Expand);
 795     setOperationAction(ISD::SUB , VT, Expand);
 796     setOperationAction(ISD::FADD, VT, Expand);
 797     setOperationAction(ISD::FNEG, VT, Expand);
 798     setOperationAction(ISD::FSUB, VT, Expand);
 799     setOperationAction(ISD::MUL , VT, Expand);
 800     setOperationAction(ISD::FMUL, VT, Expand);
 801     setOperationAction(ISD::SDIV, VT, Expand);
 802     setOperationAction(ISD::UDIV, VT, Expand);
 803     setOperationAction(ISD::FDIV, VT, Expand);
 804     setOperationAction(ISD::SREM, VT, Expand);
 805     setOperationAction(ISD::UREM, VT, Expand);
 806     setOperationAction(ISD::LOAD, VT, Expand);
 807     setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
 808     setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
 809     setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
 810     setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
 811     setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
 812     setOperationAction(ISD::FABS, VT, Expand);
 813     setOperationAction(ISD::FSIN, VT, Expand);
 814     setOperationAction(ISD::FSINCOS, VT, Expand);
 815     setOperationAction(ISD::FCOS, VT, Expand);
 816     setOperationAction(ISD::FSINCOS, VT, Expand);
 817     setOperationAction(ISD::FREM, VT, Expand);
 818     setOperationAction(ISD::FMA,  VT, Expand);
 819     setOperationAction(ISD::FPOWI, VT, Expand);
 820     setOperationAction(ISD::FSQRT, VT, Expand);
 821     setOperationAction(ISD::FCOPYSIGN, VT, Expand);
 822     setOperationAction(ISD::FFLOOR, VT, Expand);
 823     setOperationAction(ISD::FCEIL, VT, Expand);
 824     setOperationAction(ISD::FTRUNC, VT, Expand);
 825     setOperationAction(ISD::FRINT, VT, Expand);
 826     setOperationAction(ISD::FNEARBYINT, VT, Expand);
 827     setOperationAction(ISD::SMUL_LOHI, VT, Expand);
 828     setOperationAction(ISD::MULHS, VT, Expand);
 829     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
 830     setOperationAction(ISD::MULHU, VT, Expand);
 831     setOperationAction(ISD::SDIVREM, VT, Expand);
 832     setOperationAction(ISD::UDIVREM, VT, Expand);
 833     setOperationAction(ISD::FPOW, VT, Expand);
 834     setOperationAction(ISD::CTPOP, VT, Expand);
 835     setOperationAction(ISD::CTTZ, VT, Expand);
 836     setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
 837     setOperationAction(ISD::CTLZ, VT, Expand);
 838     setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
 839     setOperationAction(ISD::SHL, VT, Expand);
 840     setOperationAction(ISD::SRA, VT, Expand);
 841     setOperationAction(ISD::SRL, VT, Expand);
 842     setOperationAction(ISD::ROTL, VT, Expand);
 843     setOperationAction(ISD::ROTR, VT, Expand);
 844     setOperationAction(ISD::BSWAP, VT, Expand);
 845     setOperationAction(ISD::SETCC, VT, Expand);
 846     setOperationAction(ISD::FLOG, VT, Expand);
 847     setOperationAction(ISD::FLOG2, VT, Expand);
 848     setOperationAction(ISD::FLOG10, VT, Expand);
 849     setOperationAction(ISD::FEXP, VT, Expand);
 850     setOperationAction(ISD::FEXP2, VT, Expand);
 851     setOperationAction(ISD::FP_TO_UINT, VT, Expand);
 852     setOperationAction(ISD::FP_TO_SINT, VT, Expand);
 853     setOperationAction(ISD::UINT_TO_FP, VT, Expand);
 854     setOperationAction(ISD::SINT_TO_FP, VT, Expand);
 855     setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
 856     setOperationAction(ISD::TRUNCATE, VT, Expand);
 857     setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
 858     setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
 859     setOperationAction(ISD::ANY_EXTEND, VT, Expand);
 860     setOperationAction(ISD::VSELECT, VT, Expand);
 861     setOperationAction(ISD::SELECT_CC, VT, Expand);
 862     for (MVT InnerVT : MVT::vector_valuetypes()) {
 863       setTruncStoreAction(InnerVT, VT, Expand);
 864
 865       setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
 866       setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
 867
 868       // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
 869       // types, we have to deal with them whether we ask for Expansion or not.
 870       // Setting Expand causes its own optimisation problems though, so leave
 871       // them legal.
 872       if (VT.getVectorElementType() == MVT::i1)
 873         setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
 874     }
 875   }
 876
 877   // FIXME: In order to prevent SSE instructions being expanded to MMX ones
 878   // with -msoft-float, disable use of MMX as well.
 879   if (!TM.Options.UseSoftFloat && Subtarget->hasMMX()) {
 880     addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
 881     // No operations on x86mmx supported, everything uses intrinsics.
 882   }
 883
 884   // MMX-sized vectors (other than x86mmx) are expected to be expanded
 885   // into smaller operations.
 886   setOperationAction(ISD::MULHS,              MVT::v8i8,  Expand);
 887   setOperationAction(ISD::MULHS,              MVT::v4i16, Expand);
 888   setOperationAction(ISD::MULHS,              MVT::v2i32, Expand);
 889   setOperationAction(ISD::MULHS,              MVT::v1i64, Expand);
 890   setOperationAction(ISD::AND,                MVT::v8i8,  Expand);
 891   setOperationAction(ISD::AND,                MVT::v4i16, Expand);
 892   setOperationAction(ISD::AND,                MVT::v2i32, Expand);
 893   setOperationAction(ISD::AND,                MVT::v1i64, Expand);
 894   setOperationAction(ISD::OR,                 MVT::v8i8,  Expand);
 895   setOperationAction(ISD::OR,                 MVT::v4i16, Expand);
 896   setOperationAction(ISD::OR,                 MVT::v2i32, Expand);
 897   setOperationAction(ISD::OR,                 MVT::v1i64, Expand);
 898   setOperationAction(ISD::XOR,                MVT::v8i8,  Expand);
 899   setOperationAction(ISD::XOR,                MVT::v4i16, Expand);
 900   setOperationAction(ISD::XOR,                MVT::v2i32, Expand);
 901   setOperationAction(ISD::XOR,                MVT::v1i64, Expand);
 902   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i8,  Expand);
 903   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v4i16, Expand);
 904   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v2i32, Expand);
 905   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v1i64, Expand);
 906   setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v1i64, Expand);
 907   setOperationAction(ISD::SELECT,             MVT::v8i8,  Expand);
 908   setOperationAction(ISD::SELECT,             MVT::v4i16, Expand);
 909   setOperationAction(ISD::SELECT,             MVT::v2i32, Expand);
 910   setOperationAction(ISD::SELECT,             MVT::v1i64, Expand);
 911   setOperationAction(ISD::BITCAST,            MVT::v8i8,  Expand);
 912   setOperationAction(ISD::BITCAST,            MVT::v4i16, Expand);
 913   setOperationAction(ISD::BITCAST,            MVT::v2i32, Expand);
 914   setOperationAction(ISD::BITCAST,            MVT::v1i64, Expand);
 915
 916   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE1()) {
 917     addRegisterClass(MVT::v4f32, &X86::VR128RegClass);
 918
 919     setOperationAction(ISD::FADD,               MVT::v4f32, Legal);
 920     setOperationAction(ISD::FSUB,               MVT::v4f32, Legal);
 921     setOperationAction(ISD::FMUL,               MVT::v4f32, Legal);
 922     setOperationAction(ISD::FDIV,               MVT::v4f32, Legal);
 923     setOperationAction(ISD::FSQRT,              MVT::v4f32, Legal);
 924     setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
 925     setOperationAction(ISD::FABS,               MVT::v4f32, Custom);
 926     setOperationAction(ISD::LOAD,               MVT::v4f32, Legal);
 927     setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
 928     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
 929     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
 930     setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
 931     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Custom);
 932   }
 933
 934   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE2()) {
 935     addRegisterClass(MVT::v2f64, &X86::VR128RegClass);
 936
 937     // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
 938     // registers cannot be used even for integer operations.
 939     addRegisterClass(MVT::v16i8, &X86::VR128RegClass);
 940     addRegisterClass(MVT::v8i16, &X86::VR128RegClass);
 941     addRegisterClass(MVT::v4i32, &X86::VR128RegClass);
 942     addRegisterClass(MVT::v2i64, &X86::VR128RegClass);
 943
 944     setOperationAction(ISD::ADD,                MVT::v16i8, Legal);
 945     setOperationAction(ISD::ADD,                MVT::v8i16, Legal);
 946     setOperationAction(ISD::ADD,                MVT::v4i32, Legal);
 947     setOperationAction(ISD::ADD,                MVT::v2i64, Legal);
 948     setOperationAction(ISD::MUL,                MVT::v4i32, Custom);
 949     setOperationAction(ISD::MUL,                MVT::v2i64, Custom);
 950     setOperationAction(ISD::UMUL_LOHI,          MVT::v4i32, Custom);
 951     setOperationAction(ISD::SMUL_LOHI,          MVT::v4i32, Custom);
 952     setOperationAction(ISD::MULHU,              MVT::v8i16, Legal);
 953     setOperationAction(ISD::MULHS,              MVT::v8i16, Legal);
 954     setOperationAction(ISD::SUB,                MVT::v16i8, Legal);
 955     setOperationAction(ISD::SUB,                MVT::v8i16, Legal);
 956     setOperationAction(ISD::SUB,                MVT::v4i32, Legal);
 957     setOperationAction(ISD::SUB,                MVT::v2i64, Legal);
 958     setOperationAction(ISD::MUL,                MVT::v8i16, Legal);
 959     setOperationAction(ISD::FADD,               MVT::v2f64, Legal);
 960     setOperationAction(ISD::FSUB,               MVT::v2f64, Legal);
 961     setOperationAction(ISD::FMUL,               MVT::v2f64, Legal);
 962     setOperationAction(ISD::FDIV,               MVT::v2f64, Legal);
 963     setOperationAction(ISD::FSQRT,              MVT::v2f64, Legal);
 964     setOperationAction(ISD::FNEG,               MVT::v2f64, Custom);
 965     setOperationAction(ISD::FABS,               MVT::v2f64, Custom);
 966
 967     setOperationAction(ISD::SETCC,              MVT::v2i64, Custom);
 968     setOperationAction(ISD::SETCC,              MVT::v16i8, Custom);
 969     setOperationAction(ISD::SETCC,              MVT::v8i16, Custom);
 970     setOperationAction(ISD::SETCC,              MVT::v4i32, Custom);
 971
 972     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v16i8, Custom);
 973     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i16, Custom);
 974     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
 975     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
 976     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
 977
 978     // Only provide customized ctpop vector bit twiddling for vector types we
 979     // know to perform better than using the popcnt instructions on each vector
 980     // element. If popcnt isn't supported, always provide the custom version.
 981     if (!Subtarget->hasPOPCNT()) {
 982       setOperationAction(ISD::CTPOP,            MVT::v4i32, Custom);
 983       setOperationAction(ISD::CTPOP,            MVT::v2i64, Custom);
 984     }
 985
 986     // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
 987     for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
 988       MVT VT = (MVT::SimpleValueType)i;
 989       // Do not attempt to custom lower non-power-of-2 vectors
 990       if (!isPowerOf2_32(VT.getVectorNumElements()))
 991         continue;
 992       // Do not attempt to custom lower non-128-bit vectors
 993       if (!VT.is128BitVector())
 994         continue;
 995       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
 996       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
 997       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
 998     }
 999
1000     // We support custom legalizing of sext and anyext loads for specific
1001     // memory vector types which we can load as a scalar (or sequence of
1002     // scalars) and extend in-register to a legal 128-bit vector type. For sext
1003     // loads these must work with a single scalar load.
1004     for (MVT VT : MVT::integer_vector_valuetypes()) {
1005       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom);
1006       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom);
1007       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom);
1008       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom);
1009       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom);
1010       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom);
1011       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom);
1012       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom);
1013       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom);
1014     }
1015
1016     setOperationAction(ISD::BUILD_VECTOR,       MVT::v2f64, Custom);
1017     setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i64, Custom);
1018     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2f64, Custom);
1019     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i64, Custom);
1020     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2f64, Custom);
1021     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
1022
1023     if (Subtarget->is64Bit()) {
1024       setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
1025       setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
1026     }
1027
1028     // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
1029     for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
1030       MVT VT = (MVT::SimpleValueType)i;
1031
1032       // Do not attempt to promote non-128-bit vectors
1033       if (!VT.is128BitVector())
1034         continue;
1035
1036       setOperationAction(ISD::AND,    VT, Promote);
1037       AddPromotedToType (ISD::AND,    VT, MVT::v2i64);
1038       setOperationAction(ISD::OR,     VT, Promote);
1039       AddPromotedToType (ISD::OR,     VT, MVT::v2i64);
1040       setOperationAction(ISD::XOR,    VT, Promote);
1041       AddPromotedToType (ISD::XOR,    VT, MVT::v2i64);
1042       setOperationAction(ISD::LOAD,   VT, Promote);
1043       AddPromotedToType (ISD::LOAD,   VT, MVT::v2i64);
1044       setOperationAction(ISD::SELECT, VT, Promote);
1045       AddPromotedToType (ISD::SELECT, VT, MVT::v2i64);
1046     }
1047
1048     // v2f64 and v2i64 loads are legal; their selects are custom lowered.
1049     setOperationAction(ISD::LOAD,               MVT::v2f64, Legal);
1050     setOperationAction(ISD::LOAD,               MVT::v2i64, Legal);
1051     setOperationAction(ISD::SELECT,             MVT::v2f64, Custom);
1052     setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);
1053
1054     setOperationAction(ISD::FP_TO_SINT,         MVT::v4i32, Legal);
1055     setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Legal);
1056
1057     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i8,  Custom);
1058     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i16, Custom);
1059     // As there is no 64-bit GPR available, we need to build a special custom
1060     // sequence to convert from v2i32 to v2f32.
1061     if (!Subtarget->is64Bit())
1062       setOperationAction(ISD::UINT_TO_FP,       MVT::v2f32, Custom);
1063
1064     setOperationAction(ISD::FP_EXTEND,          MVT::v2f32, Custom);
1065     setOperationAction(ISD::FP_ROUND,           MVT::v2f32, Custom);
1066
1067     for (MVT VT : MVT::fp_vector_valuetypes())
1068       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);
1069
1070     setOperationAction(ISD::BITCAST,            MVT::v2i32, Custom);
1071     setOperationAction(ISD::BITCAST,            MVT::v4i16, Custom);
1072     setOperationAction(ISD::BITCAST,            MVT::v8i8,  Custom);
1073   }
1074
1075   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE41()) {
1076     setOperationAction(ISD::FFLOOR,             MVT::f32,   Legal);
1077     setOperationAction(ISD::FCEIL,              MVT::f32,   Legal);
1078     setOperationAction(ISD::FTRUNC,             MVT::f32,   Legal);
1079     setOperationAction(ISD::FRINT,              MVT::f32,   Legal);
1080     setOperationAction(ISD::FNEARBYINT,         MVT::f32,   Legal);
1081     setOperationAction(ISD::FFLOOR,             MVT::f64,   Legal);
1082     setOperationAction(ISD::FCEIL,              MVT::f64,   Legal);
1083     setOperationAction(ISD::FTRUNC,             MVT::f64,   Legal);
1084     setOperationAction(ISD::FRINT,              MVT::f64,   Legal);
1085     setOperationAction(ISD::FNEARBYINT,         MVT::f64,   Legal);
1086
1087     setOperationAction(ISD::FFLOOR,             MVT::v4f32, Legal);
1088     setOperationAction(ISD::FCEIL,              MVT::v4f32, Legal);
1089     setOperationAction(ISD::FTRUNC,             MVT::v4f32, Legal);
1090     setOperationAction(ISD::FRINT,              MVT::v4f32, Legal);
1091     setOperationAction(ISD::FNEARBYINT,         MVT::v4f32, Legal);
1092     setOperationAction(ISD::FFLOOR,             MVT::v2f64, Legal);
1093     setOperationAction(ISD::FCEIL,              MVT::v2f64, Legal);
1094     setOperationAction(ISD::FTRUNC,             MVT::v2f64, Legal);
1095     setOperationAction(ISD::FRINT,              MVT::v2f64, Legal);
1096     setOperationAction(ISD::FNEARBYINT,         MVT::v2f64, Legal);
1097
1098     // FIXME: Do we need to handle scalar-to-vector here?
1099     setOperationAction(ISD::MUL,                MVT::v4i32, Legal);
1100
1101     setOperationAction(ISD::VSELECT,            MVT::v2f64, Custom);
1102     setOperationAction(ISD::VSELECT,            MVT::v2i64, Custom);
1103     setOperationAction(ISD::VSELECT,            MVT::v4i32, Custom);
1104     setOperationAction(ISD::VSELECT,            MVT::v4f32, Custom);
1105     setOperationAction(ISD::VSELECT,            MVT::v8i16, Custom);
1106     // There is no BLENDI for byte vectors. We don't need to custom lower
1107     // some vselects for now.
1108     setOperationAction(ISD::VSELECT,            MVT::v16i8, Legal);
1109
1110     // SSE41 brings specific instructions for doing vector sign extend even in
1111     // cases where we don't have SRA.
1112     for (MVT VT : MVT::integer_vector_valuetypes()) {
1113       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
1114       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
1115       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
1116     }
1117
1118     // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
1119     setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, MVT::v8i8,  Legal);
1120     setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8,  Legal);
1121     setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i8,  Legal);
1122     setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
1123     setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
1124     setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
1125
1126     setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i16, MVT::v8i8,  Legal);
1127     setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8,  Legal);
1128     setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i8,  Legal);
1129     setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
1130     setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
1131     setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
1132
1133     // i8 and i16 vectors are custom because the source register and source
1134     // memory operand types are not the same width.  f32 vectors are
1135     // custom since the immediate controlling the insert encodes additional
1136     // information.
1137     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i8, Custom);
1138     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
1139     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
1140     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
1141
1142     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
1143     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
1144     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
1145     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
1146
1147     // FIXME: these should be Legal, but that's only for the case where
1148     // the index is constant.  For now custom expand to deal with that.
1149     if (Subtarget->is64Bit()) {
1150       setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
1151       setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
1152     }
1153   }
1154
1155   if (Subtarget->hasSSE2()) {
1156     setOperationAction(ISD::SRL,               MVT::v8i16, Custom);
1157     setOperationAction(ISD::SRL,               MVT::v16i8, Custom);
1158
1159     setOperationAction(ISD::SHL,               MVT::v8i16, Custom);
1160     setOperationAction(ISD::SHL,               MVT::v16i8, Custom);
1161
1162     setOperationAction(ISD::SRA,               MVT::v8i16, Custom);
1163     setOperationAction(ISD::SRA,               MVT::v16i8, Custom);
1164
1165     // In the customized shift lowering, the legal cases in AVX2 will be
1166     // recognized.
1167     setOperationAction(ISD::SRL,               MVT::v2i64, Custom);
1168     setOperationAction(ISD::SRL,               MVT::v4i32, Custom);
1169
1170     setOperationAction(ISD::SHL,               MVT::v2i64, Custom);
1171     setOperationAction(ISD::SHL,               MVT::v4i32, Custom);
1172
1173     setOperationAction(ISD::SRA,               MVT::v4i32, Custom);
1174   }
1175
1176   if (!TM.Options.UseSoftFloat && Subtarget->hasFp256()) {
1177     addRegisterClass(MVT::v32i8,  &X86::VR256RegClass);
1178     addRegisterClass(MVT::v16i16, &X86::VR256RegClass);
1179     addRegisterClass(MVT::v8i32,  &X86::VR256RegClass);
1180     addRegisterClass(MVT::v8f32,  &X86::VR256RegClass);
1181     addRegisterClass(MVT::v4i64,  &X86::VR256RegClass);
1182     addRegisterClass(MVT::v4f64,  &X86::VR256RegClass);
1183
1184     setOperationAction(ISD::LOAD,               MVT::v8f32, Legal);
1185     setOperationAction(ISD::LOAD,               MVT::v4f64, Legal);
1186     setOperationAction(ISD::LOAD,               MVT::v4i64, Legal);
1187
1188     setOperationAction(ISD::FADD,               MVT::v8f32, Legal);
1189     setOperationAction(ISD::FSUB,               MVT::v8f32, Legal);
1190     setOperationAction(ISD::FMUL,               MVT::v8f32, Legal);
1191     setOperationAction(ISD::FDIV,               MVT::v8f32, Legal);
1192     setOperationAction(ISD::FSQRT,              MVT::v8f32, Legal);
1193     setOperationAction(ISD::FFLOOR,             MVT::v8f32, Legal);
1194     setOperationAction(ISD::FCEIL,              MVT::v8f32, Legal);
1195     setOperationAction(ISD::FTRUNC,             MVT::v8f32, Legal);
1196     setOperationAction(ISD::FRINT,              MVT::v8f32, Legal);
1197     setOperationAction(ISD::FNEARBYINT,         MVT::v8f32, Legal);
1198     setOperationAction(ISD::FNEG,               MVT::v8f32, Custom);
1199     setOperationAction(ISD::FABS,               MVT::v8f32, Custom);
1200
1201     setOperationAction(ISD::FADD,               MVT::v4f64, Legal);
1202     setOperationAction(ISD::FSUB,               MVT::v4f64, Legal);
1203     setOperationAction(ISD::FMUL,               MVT::v4f64, Legal);
1204     setOperationAction(ISD::FDIV,               MVT::v4f64, Legal);
1205     setOperationAction(ISD::FSQRT,              MVT::v4f64, Legal);
1206     setOperationAction(ISD::FFLOOR,             MVT::v4f64, Legal);
1207     setOperationAction(ISD::FCEIL,              MVT::v4f64, Legal);
1208     setOperationAction(ISD::FTRUNC,             MVT::v4f64, Legal);
1209     setOperationAction(ISD::FRINT,              MVT::v4f64, Legal);
1210     setOperationAction(ISD::FNEARBYINT,         MVT::v4f64, Legal);
1211     setOperationAction(ISD::FNEG,               MVT::v4f64, Custom);
1212     setOperationAction(ISD::FABS,               MVT::v4f64, Custom);
1213
1214     // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1215     // even though v8i16 is a legal type.
1216     setOperationAction(ISD::FP_TO_SINT,         MVT::v8i16, Promote);
1217     setOperationAction(ISD::FP_TO_UINT,         MVT::v8i16, Promote);
1218     setOperationAction(ISD::FP_TO_SINT,         MVT::v8i32, Legal);
1219
1220     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i16, Promote);
1221     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i32, Legal);
1222     setOperationAction(ISD::FP_ROUND,           MVT::v4f32, Legal);
1223
1224     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i8,  Custom);
1225     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i16, Custom);
1226
1227     for (MVT VT : MVT::fp_vector_valuetypes())
1228       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);
1229
1230     setOperationAction(ISD::SRL,               MVT::v16i16, Custom);
1231     setOperationAction(ISD::SRL,               MVT::v32i8, Custom);
1232
1233     setOperationAction(ISD::SHL,               MVT::v16i16, Custom);
1234     setOperationAction(ISD::SHL,               MVT::v32i8, Custom);
1235
1236     setOperationAction(ISD::SRA,               MVT::v16i16, Custom);
1237     setOperationAction(ISD::SRA,               MVT::v32i8, Custom);
1238
1239     setOperationAction(ISD::SETCC,             MVT::v32i8, Custom);
1240     setOperationAction(ISD::SETCC,             MVT::v16i16, Custom);
1241     setOperationAction(ISD::SETCC,             MVT::v8i32, Custom);
1242     setOperationAction(ISD::SETCC,             MVT::v4i64, Custom);
1243
1244     setOperationAction(ISD::SELECT,            MVT::v4f64, Custom);
1245     setOperationAction(ISD::SELECT,            MVT::v4i64, Custom);
1246     setOperationAction(ISD::SELECT,            MVT::v8f32, Custom);
1247
1248     setOperationAction(ISD::VSELECT,           MVT::v4f64, Custom);
1249     setOperationAction(ISD::VSELECT,           MVT::v4i64, Custom);
1250     setOperationAction(ISD::VSELECT,           MVT::v8i32, Custom);
1251     setOperationAction(ISD::VSELECT,           MVT::v8f32, Custom);
1252
1253     setOperationAction(ISD::SIGN_EXTEND,       MVT::v4i64, Custom);
1254     setOperationAction(ISD::SIGN_EXTEND,       MVT::v8i32, Custom);
1255     setOperationAction(ISD::SIGN_EXTEND,       MVT::v16i16, Custom);
1256     setOperationAction(ISD::ZERO_EXTEND,       MVT::v4i64, Custom);
1257     setOperationAction(ISD::ZERO_EXTEND,       MVT::v8i32, Custom);
1258     setOperationAction(ISD::ZERO_EXTEND,       MVT::v16i16, Custom);
1259     setOperationAction(ISD::ANY_EXTEND,        MVT::v4i64, Custom);
1260     setOperationAction(ISD::ANY_EXTEND,        MVT::v8i32, Custom);
1261     setOperationAction(ISD::ANY_EXTEND,        MVT::v16i16, Custom);
1262     setOperationAction(ISD::TRUNCATE,          MVT::v16i8, Custom);
1263     setOperationAction(ISD::TRUNCATE,          MVT::v8i16, Custom);
1264     setOperationAction(ISD::TRUNCATE,          MVT::v4i32, Custom);
1265
1266     if (Subtarget->hasFMA() || Subtarget->hasFMA4()) {
1267       setOperationAction(ISD::FMA,             MVT::v8f32, Legal);
1268       setOperationAction(ISD::FMA,             MVT::v4f64, Legal);
1269       setOperationAction(ISD::FMA,             MVT::v4f32, Legal);
1270       setOperationAction(ISD::FMA,             MVT::v2f64, Legal);
1271       setOperationAction(ISD::FMA,             MVT::f32, Legal);
1272       setOperationAction(ISD::FMA,             MVT::f64, Legal);
1273     }
1274
1275     if (Subtarget->hasInt256()) {
1276       setOperationAction(ISD::ADD,             MVT::v4i64, Legal);
1277       setOperationAction(ISD::ADD,             MVT::v8i32, Legal);
1278       setOperationAction(ISD::ADD,             MVT::v16i16, Legal);
1279       setOperationAction(ISD::ADD,             MVT::v32i8, Legal);
1280
1281       setOperationAction(ISD::SUB,             MVT::v4i64, Legal);
1282       setOperationAction(ISD::SUB,             MVT::v8i32, Legal);
1283       setOperationAction(ISD::SUB,             MVT::v16i16, Legal);
1284       setOperationAction(ISD::SUB,             MVT::v32i8, Legal);
1285
1286       setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
1287       setOperationAction(ISD::MUL,             MVT::v8i32, Legal);
1288       setOperationAction(ISD::MUL,             MVT::v16i16, Legal);
1289       // Don't lower v32i8 because there is no 128-bit byte mul
1290
1291       setOperationAction(ISD::UMUL_LOHI,       MVT::v8i32, Custom);
1292       setOperationAction(ISD::SMUL_LOHI,       MVT::v8i32, Custom);
1293       setOperationAction(ISD::MULHU,           MVT::v16i16, Legal);
1294       setOperationAction(ISD::MULHS,           MVT::v16i16, Legal);
1295
1296       setOperationAction(ISD::VSELECT,         MVT::v16i16, Custom);
1297       setOperationAction(ISD::VSELECT,         MVT::v32i8, Legal);
1298
1299       // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1300       // when we have a 256bit-wide blend with immediate.
1301       setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
1302
1303       // Only provide customized ctpop vector bit twiddling for vector types we
1304       // know to perform better than using the popcnt instructions on each
1305       // vector element. If popcnt isn't supported, always provide the custom
1306       // version.
1307       if (!Subtarget->hasPOPCNT())
1308         setOperationAction(ISD::CTPOP,           MVT::v4i64, Custom);
1309
1310       // Custom CTPOP always performs better on natively supported v8i32
1311       setOperationAction(ISD::CTPOP,             MVT::v8i32, Custom);
1312
1313       // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1314       setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
1315       setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32,  MVT::v8i8,  Legal);
1316       setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64,  MVT::v4i8,  Legal);
1317       setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32,  MVT::v8i16, Legal);
1318       setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64,  MVT::v4i16, Legal);
1319       setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64,  MVT::v4i32, Legal);
1320
1321       setLoadExtAction(ISD::ZEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
1322       setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32,  MVT::v8i8,  Legal);
1323       setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64,  MVT::v4i8,  Legal);
1324       setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32,  MVT::v8i16, Legal);
1325       setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64,  MVT::v4i16, Legal);
1326       setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64,  MVT::v4i32, Legal);
1327     } else {
1328       setOperationAction(ISD::ADD,             MVT::v4i64, Custom);
1329       setOperationAction(ISD::ADD,             MVT::v8i32, Custom);
1330       setOperationAction(ISD::ADD,             MVT::v16i16, Custom);
1331       setOperationAction(ISD::ADD,             MVT::v32i8, Custom);
1332
1333       setOperationAction(ISD::SUB,             MVT::v4i64, Custom);
1334       setOperationAction(ISD::SUB,             MVT::v8i32, Custom);
1335       setOperationAction(ISD::SUB,             MVT::v16i16, Custom);
1336       setOperationAction(ISD::SUB,             MVT::v32i8, Custom);
1337
1338       setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
1339       setOperationAction(ISD::MUL,             MVT::v8i32, Custom);
1340       setOperationAction(ISD::MUL,             MVT::v16i16, Custom);
1341       // Don't lower v32i8 because there is no 128-bit byte mul
1342     }
1343
1344     // In the customized shift lowering, the legal cases in AVX2 will be
1345     // recognized.
1346     setOperationAction(ISD::SRL,               MVT::v4i64, Custom);
1347     setOperationAction(ISD::SRL,               MVT::v8i32, Custom);
1348
1349     setOperationAction(ISD::SHL,               MVT::v4i64, Custom);
1350     setOperationAction(ISD::SHL,               MVT::v8i32, Custom);
1351
1352     setOperationAction(ISD::SRA,               MVT::v8i32, Custom);
1353
1354     // Custom lower several nodes for 256-bit types.
1355     for (MVT VT : MVT::vector_valuetypes()) {
1356       if (VT.getScalarSizeInBits() >= 32) {
1357         setOperationAction(ISD::MLOAD,  VT, Legal);
1358         setOperationAction(ISD::MSTORE, VT, Legal);
1359       }
1360       // Extract subvector is special because the value type
1361       // (result) is 128-bit but the source is 256-bit wide.
1362       if (VT.is128BitVector()) {
1363         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1364       }
1365       // Do not attempt to custom lower other non-256-bit vectors
1366       if (!VT.is256BitVector())
1367         continue;
1368
1369       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
1370       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
1371       setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
1372       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1373       setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
1374       setOperationAction(ISD::INSERT_SUBVECTOR,   VT, Custom);
1375       setOperationAction(ISD::CONCAT_VECTORS,     VT, Custom);
1376     }
1377
1378     // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
1379     for (int i = MVT::v32i8; i != MVT::v4i64; ++i) {
1380       MVT VT = (MVT::SimpleValueType)i;
1381
1382       // Do not attempt to promote non-256-bit vectors
1383       if (!VT.is256BitVector())
1384         continue;
1385
1386       setOperationAction(ISD::AND,    VT, Promote);
1387       AddPromotedToType (ISD::AND,    VT, MVT::v4i64);
1388       setOperationAction(ISD::OR,     VT, Promote);
1389       AddPromotedToType (ISD::OR,     VT, MVT::v4i64);
1390       setOperationAction(ISD::XOR,    VT, Promote);
1391       AddPromotedToType (ISD::XOR,    VT, MVT::v4i64);
1392       setOperationAction(ISD::LOAD,   VT, Promote);
1393       AddPromotedToType (ISD::LOAD,   VT, MVT::v4i64);
1394       setOperationAction(ISD::SELECT, VT, Promote);
1395       AddPromotedToType (ISD::SELECT, VT, MVT::v4i64);
1396     }
1397   }
1398
1399   if (!TM.Options.UseSoftFloat && Subtarget->hasAVX512()) {
1400     addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1401     addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1402     addRegisterClass(MVT::v8i64,  &X86::VR512RegClass);
1403     addRegisterClass(MVT::v8f64,  &X86::VR512RegClass);
1404
1405     addRegisterClass(MVT::i1,     &X86::VK1RegClass);
1406     addRegisterClass(MVT::v8i1,   &X86::VK8RegClass);
1407     addRegisterClass(MVT::v16i1,  &X86::VK16RegClass);
1408
1409     for (MVT VT : MVT::fp_vector_valuetypes())
1410       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);
1411
1412     setOperationAction(ISD::BR_CC,              MVT::i1,    Expand);
1413     setOperationAction(ISD::SETCC,              MVT::i1,    Custom);
1414     setOperationAction(ISD::XOR,                MVT::i1,    Legal);
1415     setOperationAction(ISD::OR,                 MVT::i1,    Legal);
1416     setOperationAction(ISD::AND,                MVT::i1,    Legal);
1417     setOperationAction(ISD::LOAD,               MVT::v16f32, Legal);
1418     setOperationAction(ISD::LOAD,               MVT::v8f64, Legal);
1419     setOperationAction(ISD::LOAD,               MVT::v8i64, Legal);
1420     setOperationAction(ISD::LOAD,               MVT::v16i32, Legal);
1421     setOperationAction(ISD::LOAD,               MVT::v16i1, Legal);
1422
1423     setOperationAction(ISD::FADD,               MVT::v16f32, Legal);
1424     setOperationAction(ISD::FSUB,               MVT::v16f32, Legal);
1425     setOperationAction(ISD::FMUL,               MVT::v16f32, Legal);
1426     setOperationAction(ISD::FDIV,               MVT::v16f32, Legal);
1427     setOperationAction(ISD::FSQRT,              MVT::v16f32, Legal);
1428     setOperationAction(ISD::FNEG,               MVT::v16f32, Custom);
1429
1430     setOperationAction(ISD::FADD,               MVT::v8f64, Legal);
1431     setOperationAction(ISD::FSUB,               MVT::v8f64, Legal);
1432     setOperationAction(ISD::FMUL,               MVT::v8f64, Legal);
1433     setOperationAction(ISD::FDIV,               MVT::v8f64, Legal);
1434     setOperationAction(ISD::FSQRT,              MVT::v8f64, Legal);
1435     setOperationAction(ISD::FNEG,               MVT::v8f64, Custom);
1436     setOperationAction(ISD::FMA,                MVT::v8f64, Legal);
1437     setOperationAction(ISD::FMA,                MVT::v16f32, Legal);
1438
1439     setOperationAction(ISD::FP_TO_SINT,         MVT::i32, Legal);
1440     setOperationAction(ISD::FP_TO_UINT,         MVT::i32, Legal);
1441     setOperationAction(ISD::SINT_TO_FP,         MVT::i32, Legal);
1442     setOperationAction(ISD::UINT_TO_FP,         MVT::i32, Legal);
1443     if (Subtarget->is64Bit()) {
1444       setOperationAction(ISD::FP_TO_UINT,       MVT::i64, Legal);
1445       setOperationAction(ISD::FP_TO_SINT,       MVT::i64, Legal);
1446       setOperationAction(ISD::SINT_TO_FP,       MVT::i64, Legal);
1447       setOperationAction(ISD::UINT_TO_FP,       MVT::i64, Legal);
1448     }
1449     setOperationAction(ISD::FP_TO_SINT,         MVT::v16i32, Legal);
1450     setOperationAction(ISD::FP_TO_UINT,         MVT::v16i32, Legal);
1451     setOperationAction(ISD::FP_TO_UINT,         MVT::v8i32, Legal);
1452     setOperationAction(ISD::FP_TO_UINT,         MVT::v4i32, Legal);
1453     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i32, Legal);
1454     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i1,   Custom);
1455     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i1,  Custom);
1456     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i8,  Promote);
1457     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i16, Promote);
1458     setOperationAction(ISD::UINT_TO_FP,         MVT::v16i32, Legal);
1459     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i32, Legal);
1460     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Legal);
1461     setOperationAction(ISD::FP_ROUND,           MVT::v8f32, Legal);
1462     setOperationAction(ISD::FP_EXTEND,          MVT::v8f32, Legal);
1463
1464     setOperationAction(ISD::TRUNCATE,           MVT::i1, Custom);
1465     setOperationAction(ISD::TRUNCATE,           MVT::v16i8, Custom);
1466     setOperationAction(ISD::TRUNCATE,           MVT::v8i32, Custom);
1467     setOperationAction(ISD::TRUNCATE,           MVT::v8i1, Custom);
1468     setOperationAction(ISD::TRUNCATE,           MVT::v16i1, Custom);
1469     setOperationAction(ISD::TRUNCATE,           MVT::v16i16, Custom);
1470     setOperationAction(ISD::ZERO_EXTEND,        MVT::v16i32, Custom);
1471     setOperationAction(ISD::ZERO_EXTEND,        MVT::v8i64, Custom);
1472     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i32, Custom);
1473     setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i64, Custom);
1474     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i8, Custom);
1475     setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i16, Custom);
1476     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i16, Custom);
1477
1478     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8f64,  Custom);
1479     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i64,  Custom);
1480     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16f32,  Custom);
1481     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i32,  Custom);
1482     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i1,    Custom);
1483     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i1, Legal);
1484
1485     setOperationAction(ISD::SETCC,              MVT::v16i1, Custom);
1486     setOperationAction(ISD::SETCC,              MVT::v8i1, Custom);
1487
1488     setOperationAction(ISD::MUL,              MVT::v8i64, Custom);
1489
1490     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1,  Custom);
1491     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom);
1492     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i1, Custom);
1493     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i1, Custom);
1494     setOperationAction(ISD::BUILD_VECTOR,       MVT::v8i1, Custom);
1495     setOperationAction(ISD::BUILD_VECTOR,       MVT::v16i1, Custom);
1496     setOperationAction(ISD::SELECT,             MVT::v8f64, Custom);
1497     setOperationAction(ISD::SELECT,             MVT::v8i64, Custom);
1498     setOperationAction(ISD::SELECT,             MVT::v16f32, Custom);
1499
1500     setOperationAction(ISD::ADD,                MVT::v8i64, Legal);
1501     setOperationAction(ISD::ADD,                MVT::v16i32, Legal);
1502
1503     setOperationAction(ISD::SUB,                MVT::v8i64, Legal);
1504     setOperationAction(ISD::SUB,                MVT::v16i32, Legal);
1505
1506     setOperationAction(ISD::MUL,                MVT::v16i32, Legal);
1507
1508     setOperationAction(ISD::SRL,                MVT::v8i64, Custom);
1509     setOperationAction(ISD::SRL,                MVT::v16i32, Custom);
1510
1511     setOperationAction(ISD::SHL,                MVT::v8i64, Custom);
1512     setOperationAction(ISD::SHL,                MVT::v16i32, Custom);
1513
1514     setOperationAction(ISD::SRA,                MVT::v8i64, Custom);
1515     setOperationAction(ISD::SRA,                MVT::v16i32, Custom);
1516
1517     setOperationAction(ISD::AND,                MVT::v8i64, Legal);
1518     setOperationAction(ISD::OR,                 MVT::v8i64, Legal);
1519     setOperationAction(ISD::XOR,                MVT::v8i64, Legal);
1520     setOperationAction(ISD::AND,                MVT::v16i32, Legal);
1521     setOperationAction(ISD::OR,                 MVT::v16i32, Legal);
1522     setOperationAction(ISD::XOR,                MVT::v16i32, Legal);
1523
1524     if (Subtarget->hasCDI()) {
1525       setOperationAction(ISD::CTLZ,             MVT::v8i64, Legal);
1526       setOperationAction(ISD::CTLZ,             MVT::v16i32, Legal);
1527     }
1528
1529     // Custom lower several nodes.
1530     for (MVT VT : MVT::vector_valuetypes()) {
1531       unsigned EltSize = VT.getVectorElementType().getSizeInBits();
1532       // Extract subvector is special because the value type
1533       // (result) is 256/128-bit but the source is 512-bit wide.
1534       if (VT.is128BitVector() || VT.is256BitVector()) {
1535         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1536       }
1537       if (VT.getVectorElementType() == MVT::i1)
1538         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1539
1540       // Do not attempt to custom lower other non-512-bit vectors
1541       if (!VT.is512BitVector())
1542         continue;
1543
1544       if ( EltSize >= 32) {
1545         setOperationAction(ISD::VECTOR_SHUFFLE,      VT, Custom);
1546         setOperationAction(ISD::INSERT_VECTOR_ELT,   VT, Custom);
1547         setOperationAction(ISD::BUILD_VECTOR,        VT, Custom);
1548         setOperationAction(ISD::VSELECT,             VT, Legal);
1549         setOperationAction(ISD::EXTRACT_VECTOR_ELT,  VT, Custom);
1550         setOperationAction(ISD::SCALAR_TO_VECTOR,    VT, Custom);
1551         setOperationAction(ISD::INSERT_SUBVECTOR,    VT, Custom);
1552         setOperationAction(ISD::MLOAD,               VT, Legal);
1553         setOperationAction(ISD::MSTORE,              VT, Legal);
1554       }
1555     }
1556     for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
1557       MVT VT = (MVT::SimpleValueType)i;
1558
1559       // Do not attempt to promote non-512-bit vectors.
1560       if (!VT.is512BitVector())
1561         continue;
1562
1563       setOperationAction(ISD::SELECT, VT, Promote);
1564       AddPromotedToType (ISD::SELECT, VT, MVT::v8i64);
1565     }
1566   }// has  AVX-512
1567
1568   if (!TM.Options.UseSoftFloat && Subtarget->hasBWI()) {
1569     addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1570     addRegisterClass(MVT::v64i8,  &X86::VR512RegClass);
1571
1572     addRegisterClass(MVT::v32i1,  &X86::VK32RegClass);
1573     addRegisterClass(MVT::v64i1,  &X86::VK64RegClass);
1574
1575     setOperationAction(ISD::LOAD,               MVT::v32i16, Legal);
1576     setOperationAction(ISD::LOAD,               MVT::v64i8, Legal);
1577     setOperationAction(ISD::SETCC,              MVT::v32i1, Custom);
1578     setOperationAction(ISD::SETCC,              MVT::v64i1, Custom);
1579     setOperationAction(ISD::ADD,                MVT::v32i16, Legal);
1580     setOperationAction(ISD::ADD,                MVT::v64i8, Legal);
1581     setOperationAction(ISD::SUB,                MVT::v32i16, Legal);
1582     setOperationAction(ISD::SUB,                MVT::v64i8, Legal);
1583     setOperationAction(ISD::MUL,                MVT::v32i16, Legal);
1584
1585     for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
1586       const MVT VT = (MVT::SimpleValueType)i;
1587
1588       const unsigned EltSize = VT.getVectorElementType().getSizeInBits();
1589
1590       // Do not attempt to promote non-512-bit vectors.
1591       if (!VT.is512BitVector())
1592         continue;
1593
1594       if (EltSize < 32) {
1595         setOperationAction(ISD::BUILD_VECTOR,        VT, Custom);
1596         setOperationAction(ISD::VSELECT,             VT, Legal);
1597       }
1598     }
1599   }
1600
1601   if (!TM.Options.UseSoftFloat && Subtarget->hasVLX()) {
1602     addRegisterClass(MVT::v4i1,   &X86::VK4RegClass);
1603     addRegisterClass(MVT::v2i1,   &X86::VK2RegClass);
1604
1605     setOperationAction(ISD::SETCC,              MVT::v4i1, Custom);
1606     setOperationAction(ISD::SETCC,              MVT::v2i1, Custom);
1607     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v8i1, Legal);
1608
1609     setOperationAction(ISD::AND,                MVT::v8i32, Legal);
1610     setOperationAction(ISD::OR,                 MVT::v8i32, Legal);
1611     setOperationAction(ISD::XOR,                MVT::v8i32, Legal);
1612     setOperationAction(ISD::AND,                MVT::v4i32, Legal);
1613     setOperationAction(ISD::OR,                 MVT::v4i32, Legal);
1614     setOperationAction(ISD::XOR,                MVT::v4i32, Legal);
1615   }
1616
1617   // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion
1618   // of this type with custom code.
1619   for (MVT VT : MVT::vector_valuetypes())
1620     setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Custom);
1621
1622   // We want to custom lower some of our intrinsics.
1623   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
1624   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
1625   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
1626   if (!Subtarget->is64Bit())
1627     setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
1628
1629   // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
1630   // handle type legalization for these operations here.
1631   //
1632   // FIXME: We really should do custom legalization for addition and
1633   // subtraction on x86-32 once PR3203 is fixed.  We really can't do much better
1634   // than generic legalization for 64-bit multiplication-with-overflow, though.
1635   for (unsigned i = 0, e = 3+Subtarget->is64Bit(); i != e; ++i) {
1636     // Add/Sub/Mul with overflow operations are custom lowered.
1637     MVT VT = IntVTs[i];
1638     setOperationAction(ISD::SADDO, VT, Custom);
1639     setOperationAction(ISD::UADDO, VT, Custom);
1640     setOperationAction(ISD::SSUBO, VT, Custom);
1641     setOperationAction(ISD::USUBO, VT, Custom);
1642     setOperationAction(ISD::SMULO, VT, Custom);
1643     setOperationAction(ISD::UMULO, VT, Custom);
1644   }
1645
1646
1647   if (!Subtarget->is64Bit()) {
1648     // These libcalls are not available in 32-bit.
1649     setLibcallName(RTLIB::SHL_I128, nullptr);
1650     setLibcallName(RTLIB::SRL_I128, nullptr);
1651     setLibcallName(RTLIB::SRA_I128, nullptr);
1652   }
1653
1654   // Combine sin / cos into one node or libcall if possible.
1655   if (Subtarget->hasSinCos()) {
1656     setLibcallName(RTLIB::SINCOS_F32, "sincosf");
1657     setLibcallName(RTLIB::SINCOS_F64, "sincos");
1658     if (Subtarget->isTargetDarwin()) {
1659       // For MacOSX, we don't want the normal expansion of a libcall to sincos.
1660       // We want to issue a libcall to __sincos_stret to avoid memory traffic.
1661       setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
1662       setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
1663     }
1664   }
1665
1666   if (Subtarget->isTargetWin64()) {
1667     setOperationAction(ISD::SDIV, MVT::i128, Custom);
1668     setOperationAction(ISD::UDIV, MVT::i128, Custom);
1669     setOperationAction(ISD::SREM, MVT::i128, Custom);
1670     setOperationAction(ISD::UREM, MVT::i128, Custom);
1671     setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
1672     setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
1673   }
1674
1675   // We have target-specific dag combine patterns for the following nodes:
1676   setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
1677   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
1678   setTargetDAGCombine(ISD::BITCAST);
1679   setTargetDAGCombine(ISD::VSELECT);
1680   setTargetDAGCombine(ISD::SELECT);
1681   setTargetDAGCombine(ISD::SHL);
1682   setTargetDAGCombine(ISD::SRA);
1683   setTargetDAGCombine(ISD::SRL);
1684   setTargetDAGCombine(ISD::OR);
1685   setTargetDAGCombine(ISD::AND);
1686   setTargetDAGCombine(ISD::ADD);
1687   setTargetDAGCombine(ISD::FADD);
1688   setTargetDAGCombine(ISD::FSUB);
1689   setTargetDAGCombine(ISD::FMA);
1690   setTargetDAGCombine(ISD::SUB);
1691   setTargetDAGCombine(ISD::LOAD);
1692   setTargetDAGCombine(ISD::MLOAD);
1693   setTargetDAGCombine(ISD::STORE);
1694   setTargetDAGCombine(ISD::MSTORE);
1695   setTargetDAGCombine(ISD::ZERO_EXTEND);
1696   setTargetDAGCombine(ISD::ANY_EXTEND);
1697   setTargetDAGCombine(ISD::SIGN_EXTEND);
1698   setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
1699   setTargetDAGCombine(ISD::TRUNCATE);
1700   setTargetDAGCombine(ISD::SINT_TO_FP);
1701   setTargetDAGCombine(ISD::SETCC);
1702   setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
1703   setTargetDAGCombine(ISD::BUILD_VECTOR);
1704   setTargetDAGCombine(ISD::MUL);
1705   setTargetDAGCombine(ISD::XOR);
1706
1707   computeRegisterProperties();
1708
1709   // On Darwin, -Os means optimize for size without hurting performance,
1710   // do not reduce the limit.
1711   MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
1712   MaxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8;
1713   MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
1714   MaxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
1715   MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
1716   MaxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
1717   setPrefLoopAlignment(4); // 2^4 bytes.
1718
1719   // Predictable cmov don't hurt on atom because it's in-order.
1720   PredictableSelectIsExpensive = !Subtarget->isAtom();
1721   EnableExtLdPromotion = true;
1722   setPrefFunctionAlignment(4); // 2^4 bytes.
1723
1724   verifyIntrinsicTables();
1725 }
1726
1727 // This has so far only been implemented for 64-bit MachO.
1728 bool X86TargetLowering::useLoadStackGuardNode() const {
1729   return Subtarget->isTargetMachO() && Subtarget->is64Bit();
1730 }
1731
1732 TargetLoweringBase::LegalizeTypeAction
1733 X86TargetLowering::getPreferredVectorAction(EVT VT) const {
1734   if (ExperimentalVectorWideningLegalization &&
1735       VT.getVectorNumElements() != 1 &&
1736       VT.getVectorElementType().getSimpleVT() != MVT::i1)
1737     return TypeWidenVector;
1738
1739   return TargetLoweringBase::getPreferredVectorAction(VT);
1740 }
1741
1742 EVT X86TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
1743   if (!VT.isVector())
1744     return Subtarget->hasAVX512() ? MVT::i1: MVT::i8;
1745
1746   const unsigned NumElts = VT.getVectorNumElements();
1747   const EVT EltVT = VT.getVectorElementType();
1748   if (VT.is512BitVector()) {
1749     if (Subtarget->hasAVX512())
1750       if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
1751           EltVT == MVT::f32 || EltVT == MVT::f64)
1752         switch(NumElts) {
1753         case  8: return MVT::v8i1;
1754         case 16: return MVT::v16i1;
1755       }
1756     if (Subtarget->hasBWI())
1757       if (EltVT == MVT::i8 || EltVT == MVT::i16)
1758         switch(NumElts) {
1759         case 32: return MVT::v32i1;
1760         case 64: return MVT::v64i1;
1761       }
1762   }
1763
1764   if (VT.is256BitVector() || VT.is128BitVector()) {
1765     if (Subtarget->hasVLX())
1766       if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
1767           EltVT == MVT::f32 || EltVT == MVT::f64)
1768         switch(NumElts) {
1769         case 2: return MVT::v2i1;
1770         case 4: return MVT::v4i1;
1771         case 8: return MVT::v8i1;
1772       }
1773     if (Subtarget->hasBWI() && Subtarget->hasVLX())
1774       if (EltVT == MVT::i8 || EltVT == MVT::i16)
1775         switch(NumElts) {
1776         case  8: return MVT::v8i1;
1777         case 16: return MVT::v16i1;
1778         case 32: return MVT::v32i1;
1779       }
1780   }
1781
1782   return VT.changeVectorElementTypeToInteger();
1783 }
1784
1785 /// Helper for getByValTypeAlignment to determine
1786 /// the desired ByVal argument alignment.
1787 static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
1788   if (MaxAlign == 16)
1789     return;
1790   if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
1791     if (VTy->getBitWidth() == 128)
1792       MaxAlign = 16;
1793   } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
1794     unsigned EltAlign = 0;
1795     getMaxByValAlign(ATy->getElementType(), EltAlign);
1796     if (EltAlign > MaxAlign)
1797       MaxAlign = EltAlign;
1798   } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
1799     for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
1800       unsigned EltAlign = 0;
1801       getMaxByValAlign(STy->getElementType(i), EltAlign);
1802       if (EltAlign > MaxAlign)
1803         MaxAlign = EltAlign;
1804       if (MaxAlign == 16)
1805         break;
1806     }
1807   }
1808 }
1809
1810 /// Return the desired alignment for ByVal aggregate
1811 /// function arguments in the caller parameter area. For X86, aggregates
1812 /// that contain SSE vectors are placed at 16-byte boundaries while the rest
1813 /// are at 4-byte boundaries.
1814 unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const {
1815   if (Subtarget->is64Bit()) {
1816     // Max of 8 and alignment of type.
1817     unsigned TyAlign = TD->getABITypeAlignment(Ty);
1818     if (TyAlign > 8)
1819       return TyAlign;
1820     return 8;
1821   }
1822
1823   unsigned Align = 4;
1824   if (Subtarget->hasSSE1())
1825     getMaxByValAlign(Ty, Align);
1826   return Align;
1827 }
1828
1829 /// Returns the target specific optimal type for load
1830 /// and store operations as a result of memset, memcpy, and memmove
1831 /// lowering. If DstAlign is zero that means it's safe to destination
1832 /// alignment can satisfy any constraint. Similarly if SrcAlign is zero it
1833 /// means there isn't a need to check it against alignment requirement,
1834 /// probably because the source does not need to be loaded. If 'IsMemset' is
1835 /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
1836 /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
1837 /// source is constant so it does not need to be loaded.
1838 /// It returns EVT::Other if the type should be determined using generic
1839 /// target-independent logic.
1840 EVT
1841 X86TargetLowering::getOptimalMemOpType(uint64_t Size,
1842                                        unsigned DstAlign, unsigned SrcAlign,
1843                                        bool IsMemset, bool ZeroMemset,
1844                                        bool MemcpyStrSrc,
1845                                        MachineFunction &MF) const {
1846   const Function *F = MF.getFunction();
1847   if ((!IsMemset || ZeroMemset) &&
1848       !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
1849                                        Attribute::NoImplicitFloat)) {
1850     if (Size >= 16 &&
1851         (Subtarget->isUnalignedMemAccessFast() ||
1852          ((DstAlign == 0 || DstAlign >= 16) &&
1853           (SrcAlign == 0 || SrcAlign >= 16)))) {
1854       if (Size >= 32) {
1855         if (Subtarget->hasInt256())
1856           return MVT::v8i32;
1857         if (Subtarget->hasFp256())
1858           return MVT::v8f32;
1859       }
1860       if (Subtarget->hasSSE2())
1861         return MVT::v4i32;
1862       if (Subtarget->hasSSE1())
1863         return MVT::v4f32;
1864     } else if (!MemcpyStrSrc && Size >= 8 &&
1865                !Subtarget->is64Bit() &&
1866                Subtarget->hasSSE2()) {
1867       // Do not use f64 to lower memcpy if source is string constant. It's
1868       // better to use i32 to avoid the loads.
1869       return MVT::f64;
1870     }
1871   }
1872   if (Subtarget->is64Bit() && Size >= 8)
1873     return MVT::i64;
1874   return MVT::i32;
1875 }
1876
1877 bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
1878   if (VT == MVT::f32)
1879     return X86ScalarSSEf32;
1880   else if (VT == MVT::f64)
1881     return X86ScalarSSEf64;
1882   return true;
1883 }
1884
1885 bool
1886 X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
1887                                                   unsigned,
1888                                                   unsigned,
1889                                                   bool *Fast) const {
1890   if (Fast)
1891     *Fast = Subtarget->isUnalignedMemAccessFast();
1892   return true;
1893 }
1894
1895 /// Return the entry encoding for a jump table in the
1896 /// current function.  The returned value is a member of the
1897 /// MachineJumpTableInfo::JTEntryKind enum.
1898 unsigned X86TargetLowering::getJumpTableEncoding() const {
1899   // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
1900   // symbol.
1901   if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
1902       Subtarget->isPICStyleGOT())
1903     return MachineJumpTableInfo::EK_Custom32;
1904
1905   // Otherwise, use the normal jump table encoding heuristics.
1906   return TargetLowering::getJumpTableEncoding();
1907 }
1908
1909 const MCExpr *
1910 X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
1911                                              const MachineBasicBlock *MBB,
1912                                              unsigned uid,MCContext &Ctx) const{
1913   assert(MBB->getParent()->getTarget().getRelocationModel() == Reloc::PIC_ &&
1914          Subtarget->isPICStyleGOT());
1915   // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
1916   // entries.
1917   return MCSymbolRefExpr::Create(MBB->getSymbol(),
1918                                  MCSymbolRefExpr::VK_GOTOFF, Ctx);
1919 }
1920
1921 /// Returns relocation base for the given PIC jumptable.
1922 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
1923                                                     SelectionDAG &DAG) const {
1924   if (!Subtarget->is64Bit())
1925     // This doesn't have SDLoc associated with it, but is not really the
1926     // same as a Register.
1927     return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy());
1928   return Table;
1929 }
1930
1931 /// This returns the relocation base for the given PIC jumptable,
1932 /// the same as getPICJumpTableRelocBase, but as an MCExpr.
1933 const MCExpr *X86TargetLowering::
1934 getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
1935                              MCContext &Ctx) const {
1936   // X86-64 uses RIP relative addressing based on the jump table label.
1937   if (Subtarget->isPICStyleRIPRel())
1938     return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
1939
1940   // Otherwise, the reference is relative to the PIC base.
1941   return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx);
1942 }
1943
1944 // FIXME: Why this routine is here? Move to RegInfo!
1945 std::pair<const TargetRegisterClass*, uint8_t>
1946 X86TargetLowering::findRepresentativeClass(MVT VT) const{
1947   const TargetRegisterClass *RRC = nullptr;
1948   uint8_t Cost = 1;
1949   switch (VT.SimpleTy) {
1950   default:
1951     return TargetLowering::findRepresentativeClass(VT);
1952   case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
1953     RRC = Subtarget->is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
1954     break;
1955   case MVT::x86mmx:
1956     RRC = &X86::VR64RegClass;
1957     break;
1958   case MVT::f32: case MVT::f64:
1959   case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
1960   case MVT::v4f32: case MVT::v2f64:
1961   case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32:
1962   case MVT::v4f64:
1963     RRC = &X86::VR128RegClass;
1964     break;
1965   }
1966   return std::make_pair(RRC, Cost);
1967 }
1968
1969 bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
1970                                                unsigned &Offset) const {
1971   if (!Subtarget->isTargetLinux())
1972     return false;
1973
1974   if (Subtarget->is64Bit()) {
1975     // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs:
1976     Offset = 0x28;
1977     if (getTargetMachine().getCodeModel() == CodeModel::Kernel)
1978       AddressSpace = 256;
1979     else
1980       AddressSpace = 257;
1981   } else {
1982     // %gs:0x14 on i386
1983     Offset = 0x14;
1984     AddressSpace = 256;
1985   }
1986   return true;
1987 }
1988
1989 bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
1990                                             unsigned DestAS) const {
1991   assert(SrcAS != DestAS && "Expected different address spaces!");
1992
1993   return SrcAS < 256 && DestAS < 256;
1994 }
1995
1996 //===----------------------------------------------------------------------===//
1997 //               Return Value Calling Convention Implementation
1998 //===----------------------------------------------------------------------===//
1999
2000 #include "X86GenCallingConv.inc"
2001
2002 bool
2003 X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv,
2004                                   MachineFunction &MF, bool isVarArg,
2005                         const SmallVectorImpl<ISD::OutputArg> &Outs,
2006                         LLVMContext &Context) const {
2007   SmallVector<CCValAssign, 16> RVLocs;
2008   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
2009   return CCInfo.CheckReturn(Outs, RetCC_X86);
2010 }
2011
2012 const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
2013   static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
2014   return ScratchRegs;
2015 }
2016
2017 SDValue
2018 X86TargetLowering::LowerReturn(SDValue Chain,
2019                                CallingConv::ID CallConv, bool isVarArg,
2020                                const SmallVectorImpl<ISD::OutputArg> &Outs,
2021                                const SmallVectorImpl<SDValue> &OutVals,
2022                                SDLoc dl, SelectionDAG &DAG) const {
2023   MachineFunction &MF = DAG.getMachineFunction();
2024   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2025
2026   SmallVector<CCValAssign, 16> RVLocs;
2027   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
2028   CCInfo.AnalyzeReturn(Outs, RetCC_X86);
2029
2030   SDValue Flag;
2031   SmallVector<SDValue, 6> RetOps;
2032   RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2033   // Operand #1 = Bytes To Pop
2034   RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(),
2035                    MVT::i16));
2036
2037   // Copy the result values into the output registers.
2038   for (unsigned i = 0; i != RVLocs.size(); ++i) {
2039     CCValAssign &VA = RVLocs[i];
2040     assert(VA.isRegLoc() && "Can only return in registers!");
2041     SDValue ValToCopy = OutVals[i];
2042     EVT ValVT = ValToCopy.getValueType();
2043
2044     // Promote values to the appropriate types.
2045     if (VA.getLocInfo() == CCValAssign::SExt)
2046       ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
2047     else if (VA.getLocInfo() == CCValAssign::ZExt)
2048       ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
2049     else if (VA.getLocInfo() == CCValAssign::AExt)
2050       ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
2051     else if (VA.getLocInfo() == CCValAssign::BCvt)
2052       ValToCopy = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), ValToCopy);
2053
2054     assert(VA.getLocInfo() != CCValAssign::FPExt &&
2055            "Unexpected FP-extend for return value.");
2056
2057     // If this is x86-64, and we disabled SSE, we can't return FP values,
2058     // or SSE or MMX vectors.
2059     if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
2060          VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
2061           (Subtarget->is64Bit() && !Subtarget->hasSSE1())) {
2062       report_fatal_error("SSE register return with SSE disabled");
2063     }
2064     // Likewise we can't return F64 values with SSE1 only.  gcc does so, but
2065     // llvm-gcc has never done it right and no one has noticed, so this
2066     // should be OK for now.
2067     if (ValVT == MVT::f64 &&
2068         (Subtarget->is64Bit() && !Subtarget->hasSSE2()))
2069       report_fatal_error("SSE2 register return with SSE2 disabled");
2070
2071     // Returns in ST0/ST1 are handled specially: these are pushed as operands to
2072     // the RET instruction and handled by the FP Stackifier.
2073     if (VA.getLocReg() == X86::FP0 ||
2074         VA.getLocReg() == X86::FP1) {
2075       // If this is a copy from an xmm register to ST(0), use an FPExtend to
2076       // change the value to the FP stack register class.
2077       if (isScalarFPTypeInSSEReg(VA.getValVT()))
2078         ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
2079       RetOps.push_back(ValToCopy);
2080       // Don't emit a copytoreg.
2081       continue;
2082     }
2083
2084     // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
2085     // which is returned in RAX / RDX.
2086     if (Subtarget->is64Bit()) {
2087       if (ValVT == MVT::x86mmx) {
2088         if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
2089           ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ValToCopy);
2090           ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
2091                                   ValToCopy);
2092           // If we don't have SSE2 available, convert to v4f32 so the generated
2093           // register is legal.
2094           if (!Subtarget->hasSSE2())
2095             ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,ValToCopy);
2096         }
2097       }
2098     }
2099
2100     Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
2101     Flag = Chain.getValue(1);
2102     RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
2103   }
2104
2105   // The x86-64 ABIs require that for returning structs by value we copy
2106   // the sret argument into %rax/%eax (depending on ABI) for the return.
2107   // Win32 requires us to put the sret argument to %eax as well.
2108   // We saved the argument into a virtual register in the entry block,
2109   // so now we copy the value out and into %rax/%eax.
2110   //
2111   // Checking Function.hasStructRetAttr() here is insufficient because the IR
2112   // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
2113   // false, then an sret argument may be implicitly inserted in the SelDAG. In
2114   // either case FuncInfo->setSRetReturnReg() will have been called.
2115   if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
2116     assert((Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC()) &&
2117            "No need for an sret register");
2118     SDValue Val = DAG.getCopyFromReg(Chain, dl, SRetReg, getPointerTy());
2119
2120     unsigned RetValReg
2121         = (Subtarget->is64Bit() && !Subtarget->isTarget64BitILP32()) ?
2122           X86::RAX : X86::EAX;
2123     Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
2124     Flag = Chain.getValue(1);
2125
2126     // RAX/EAX now acts like a return value.
2127     RetOps.push_back(DAG.getRegister(RetValReg, getPointerTy()));
2128   }
2129
2130   RetOps[0] = Chain;  // Update chain.
2131
2132   // Add the flag if we have it.
2133   if (Flag.getNode())
2134     RetOps.push_back(Flag);
2135
2136   return DAG.getNode(X86ISD::RET_FLAG, dl, MVT::Other, RetOps);
2137 }
2138
2139 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
2140   if (N->getNumValues() != 1)
2141     return false;
2142   if (!N->hasNUsesOfValue(1, 0))
2143     return false;
2144
2145   SDValue TCChain = Chain;
2146   SDNode *Copy = *N->use_begin();
2147   if (Copy->getOpcode() == ISD::CopyToReg) {
2148     // If the copy has a glue operand, we conservatively assume it isn't safe to
2149     // perform a tail call.
2150     if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
2151       return false;
2152     TCChain = Copy->getOperand(0);
2153   } else if (Copy->getOpcode() != ISD::FP_EXTEND)
2154     return false;
2155
2156   bool HasRet = false;
2157   for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
2158        UI != UE; ++UI) {
2159     if (UI->getOpcode() != X86ISD::RET_FLAG)
2160       return false;
2161     // If we are returning more than one value, we can definitely
2162     // not make a tail call see PR19530
2163     if (UI->getNumOperands() > 4)
2164       return false;
2165     if (UI->getNumOperands() == 4 &&
2166         UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
2167       return false;
2168     HasRet = true;
2169   }
2170
2171   if (!HasRet)
2172     return false;
2173
2174   Chain = TCChain;
2175   return true;
2176 }
2177
2178 EVT
2179 X86TargetLowering::getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT,
2180                                             ISD::NodeType ExtendKind) const {
2181   MVT ReturnMVT;
2182   // TODO: Is this also valid on 32-bit?
2183   if (Subtarget->is64Bit() && VT == MVT::i1 && ExtendKind == ISD::ZERO_EXTEND)
2184     ReturnMVT = MVT::i8;
2185   else
2186     ReturnMVT = MVT::i32;
2187
2188   EVT MinVT = getRegisterType(Context, ReturnMVT);
2189   return VT.bitsLT(MinVT) ? MinVT : VT;
2190 }
2191
2192 /// Lower the result values of a call into the
2193 /// appropriate copies out of appropriate physical registers.
2194 ///
2195 SDValue
2196 X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
2197                                    CallingConv::ID CallConv, bool isVarArg,
2198                                    const SmallVectorImpl<ISD::InputArg> &Ins,
2199                                    SDLoc dl, SelectionDAG &DAG,
2200                                    SmallVectorImpl<SDValue> &InVals) const {
2201
2202   // Assign locations to each value returned by this call.
2203   SmallVector<CCValAssign, 16> RVLocs;
2204   bool Is64Bit = Subtarget->is64Bit();
2205   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2206                  *DAG.getContext());
2207   CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
2208
2209   // Copy all of the result registers out of their specified physreg.
2210   for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
2211     CCValAssign &VA = RVLocs[i];
2212     EVT CopyVT = VA.getValVT();
2213
2214     // If this is x86-64, and we disabled SSE, we can't return FP values
2215     if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
2216         ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
2217       report_fatal_error("SSE register return with SSE disabled");
2218     }
2219
2220     // If we prefer to use the value in xmm registers, copy it out as f80 and
2221     // use a truncate to move it from fp stack reg to xmm reg.
2222     if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
2223         isScalarFPTypeInSSEReg(VA.getValVT()))
2224       CopyVT = MVT::f80;
2225
2226     Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
2227                                CopyVT, InFlag).getValue(1);
2228     SDValue Val = Chain.getValue(0);
2229
2230     if (CopyVT != VA.getValVT())
2231       Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
2232                         // This truncation won't change the value.
2233                         DAG.getIntPtrConstant(1));
2234
2235     InFlag = Chain.getValue(2);
2236     InVals.push_back(Val);
2237   }
2238
2239   return Chain;
2240 }
2241
2242 //===----------------------------------------------------------------------===//
2243 //                C & StdCall & Fast Calling Convention implementation
2244 //===----------------------------------------------------------------------===//
//  The StdCall calling convention is the standard for many Windows API
//  routines. It differs from the C calling convention only slightly: the
//  callee, not the caller, cleans up the stack, and symbols are decorated in
//  a special way. It does not support any vector arguments.
//  For information on the fast calling convention, see the Fast Calling
//  Convention (tail call) implementation, LowerX86_32FastCCCallTo.
2251
2252 /// CallIsStructReturn - Determines whether a call uses struct return
2253 /// semantics.
2254 enum StructReturnType {
2255   NotStructReturn,
2256   RegStructReturn,
2257   StackStructReturn
2258 };
2259 static StructReturnType
2260 callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
2261   if (Outs.empty())
2262     return NotStructReturn;
2263
2264   const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
2265   if (!Flags.isSRet())
2266     return NotStructReturn;
2267   if (Flags.isInReg())
2268     return RegStructReturn;
2269   return StackStructReturn;
2270 }
2271
2272 /// Determines whether a function uses struct return semantics.
2273 static StructReturnType
2274 argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
2275   if (Ins.empty())
2276     return NotStructReturn;
2277
2278   const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
2279   if (!Flags.isSRet())
2280     return NotStructReturn;
2281   if (Flags.isInReg())
2282     return RegStructReturn;
2283   return StackStructReturn;
2284 }
2285
2286 /// Make a copy of an aggregate at address specified by "Src" to address
2287 /// "Dst" with size and alignment information specified by the specific
2288 /// parameter attribute. The copy will be passed as a byval function parameter.
2289 static SDValue
2290 CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
2291                           ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
2292                           SDLoc dl) {
2293   SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);
2294
2295   return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
2296                        /*isVolatile*/false, /*AlwaysInline=*/true,
2297                        MachinePointerInfo(), MachinePointerInfo());
2298 }
2299
2300 /// Return true if the calling convention is one that
2301 /// supports tail call optimization.
2302 static bool IsTailCallConvention(CallingConv::ID CC) {
2303   return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
2304           CC == CallingConv::HiPE);
2305 }
2306
2307 /// \brief Return true if the calling convention is a C calling convention.
2308 static bool IsCCallConvention(CallingConv::ID CC) {
2309   return (CC == CallingConv::C || CC == CallingConv::X86_64_Win64 ||
2310           CC == CallingConv::X86_64_SysV);
2311 }
2312
2313 bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
2314   if (!CI->isTailCall() || getTargetMachine().Options.DisableTailCalls)
2315     return false;
2316
2317   CallSite CS(CI);
2318   CallingConv::ID CalleeCC = CS.getCallingConv();
2319   if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
2320     return false;
2321
2322   return true;
2323 }
2324
2325 /// Return true if the function is being made into
2326 /// a tailcall target by changing its ABI.
2327 static bool FuncIsMadeTailCallSafe(CallingConv::ID CC,
2328                                    bool GuaranteedTailCallOpt) {
2329   return GuaranteedTailCallOpt && IsTailCallConvention(CC);
2330 }
2331
2332 SDValue
2333 X86TargetLowering::LowerMemArgument(SDValue Chain,
2334                                     CallingConv::ID CallConv,
2335                                     const SmallVectorImpl<ISD::InputArg> &Ins,
2336                                     SDLoc dl, SelectionDAG &DAG,
2337                                     const CCValAssign &VA,
2338                                     MachineFrameInfo *MFI,
2339                                     unsigned i) const {
2340   // Create the nodes corresponding to a load from this parameter slot.
2341   ISD::ArgFlagsTy Flags = Ins[i].Flags;
2342   bool AlwaysUseMutable = FuncIsMadeTailCallSafe(
2343       CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
2344   bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
2345   EVT ValVT;
2346
2347   // If value is passed by pointer we have address passed instead of the value
2348   // itself.
2349   if (VA.getLocInfo() == CCValAssign::Indirect)
2350     ValVT = VA.getLocVT();
2351   else
2352     ValVT = VA.getValVT();
2353
2354   // FIXME: For now, all byval parameter objects are marked mutable. This can be
2355   // changed with more analysis.
2356   // In case of tail call optimization mark all arguments mutable. Since they
2357   // could be overwritten by lowering of arguments in case of a tail call.
2358   if (Flags.isByVal()) {
2359     unsigned Bytes = Flags.getByValSize();
2360     if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
2361     int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
2362     return DAG.getFrameIndex(FI, getPointerTy());
2363   } else {
2364     int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
2365                                     VA.getLocMemOffset(), isImmutable);
2366     SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
2367     return DAG.getLoad(ValVT, dl, Chain, FIN,
2368                        MachinePointerInfo::getFixedStack(FI),
2369                        false, false, false, 0);
2370   }
2371 }
2372
2373 // FIXME: Get this from tablegen.
2374 static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
2375                                                 const X86Subtarget *Subtarget) {
2376   assert(Subtarget->is64Bit());
2377
2378   if (Subtarget->isCallingConvWin64(CallConv)) {
2379     static const MCPhysReg GPR64ArgRegsWin64[] = {
2380       X86::RCX, X86::RDX, X86::R8,  X86::R9
2381     };
2382     return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
2383   }
2384
2385   static const MCPhysReg GPR64ArgRegs64Bit[] = {
2386     X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
2387   };
2388   return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
2389 }
2390
2391 // FIXME: Get this from tablegen.
2392 static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
2393                                                 CallingConv::ID CallConv,
2394                                                 const X86Subtarget *Subtarget) {
2395   assert(Subtarget->is64Bit());
2396   if (Subtarget->isCallingConvWin64(CallConv)) {
2397     // The XMM registers which might contain var arg parameters are shadowed
2398     // in their paired GPR.  So we only need to save the GPR to their home
2399     // slots.
2400     // TODO: __vectorcall will change this.
2401     return None;
2402   }
2403
2404   const Function *Fn = MF.getFunction();
2405   bool NoImplicitFloatOps = Fn->getAttributes().
2406       hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat);
2407   assert(!(MF.getTarget().Options.UseSoftFloat && NoImplicitFloatOps) &&
2408          "SSE register cannot be used when SSE is disabled!");
2409   if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps ||
2410       !Subtarget->hasSSE1())
2411     // Kernel mode asks for SSE to be disabled, so there are no XMM argument
2412     // registers.
2413     return None;
2414
2415   static const MCPhysReg XMMArgRegs64Bit[] = {
2416     X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2417     X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2418   };
2419   return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
2420 }
2421
2422 SDValue
2423 X86TargetLowering::LowerFormalArguments(SDValue Chain,
2424                                         CallingConv::ID CallConv,
2425                                         bool isVarArg,
2426                                       const SmallVectorImpl<ISD::InputArg> &Ins,
2427                                         SDLoc dl,
2428                                         SelectionDAG &DAG,
2429                                         SmallVectorImpl<SDValue> &InVals)
2430                                           const {
2431   MachineFunction &MF = DAG.getMachineFunction();
2432   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2433
2434   const Function* Fn = MF.getFunction();
2435   if (Fn->hasExternalLinkage() &&
2436       Subtarget->isTargetCygMing() &&
2437       Fn->getName() == "main")
2438     FuncInfo->setForceFramePointer(true);
2439
2440   MachineFrameInfo *MFI = MF.getFrameInfo();
2441   bool Is64Bit = Subtarget->is64Bit();
2442   bool IsWin64 = Subtarget->isCallingConvWin64(CallConv);
2443
2444   assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
2445          "Var args not supported with calling convention fastcc, ghc or hipe");
2446
2447   // Assign locations to all of the incoming arguments.
2448   SmallVector<CCValAssign, 16> ArgLocs;
2449   CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
2450
2451   // Allocate shadow area for Win64
2452   if (IsWin64)
2453     CCInfo.AllocateStack(32, 8);
2454
2455   CCInfo.AnalyzeFormalArguments(Ins, CC_X86);
2456
2457   unsigned LastVal = ~0U;
2458   SDValue ArgValue;
2459   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2460     CCValAssign &VA = ArgLocs[i];
2461     // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
2462     // places.
2463     assert(VA.getValNo() != LastVal &&
2464            "Don't support value assigned to multiple locs yet");
2465     (void)LastVal;
2466     LastVal = VA.getValNo();
2467
2468     if (VA.isRegLoc()) {
2469       EVT RegVT = VA.getLocVT();
2470       const TargetRegisterClass *RC;
2471       if (RegVT == MVT::i32)
2472         RC = &X86::GR32RegClass;
2473       else if (Is64Bit && RegVT == MVT::i64)
2474         RC = &X86::GR64RegClass;
2475       else if (RegVT == MVT::f32)
2476         RC = &X86::FR32RegClass;
2477       else if (RegVT == MVT::f64)
2478         RC = &X86::FR64RegClass;
2479       else if (RegVT.is512BitVector())
2480         RC = &X86::VR512RegClass;
2481       else if (RegVT.is256BitVector())
2482         RC = &X86::VR256RegClass;
2483       else if (RegVT.is128BitVector())
2484         RC = &X86::VR128RegClass;
2485       else if (RegVT == MVT::x86mmx)
2486         RC = &X86::VR64RegClass;
2487       else if (RegVT == MVT::i1)
2488         RC = &X86::VK1RegClass;
2489       else if (RegVT == MVT::v8i1)
2490         RC = &X86::VK8RegClass;
2491       else if (RegVT == MVT::v16i1)
2492         RC = &X86::VK16RegClass;
2493       else if (RegVT == MVT::v32i1)
2494         RC = &X86::VK32RegClass;
2495       else if (RegVT == MVT::v64i1)
2496         RC = &X86::VK64RegClass;
2497       else
2498         llvm_unreachable("Unknown argument type!");
2499
2500       unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
2501       ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
2502
2503       // If this is an 8 or 16-bit value, it is really passed promoted to 32
2504       // bits.  Insert an assert[sz]ext to capture this, then truncate to the
2505       // right size.
2506       if (VA.getLocInfo() == CCValAssign::SExt)
2507         ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
2508                                DAG.getValueType(VA.getValVT()));
2509       else if (VA.getLocInfo() == CCValAssign::ZExt)
2510         ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
2511                                DAG.getValueType(VA.getValVT()));
2512       else if (VA.getLocInfo() == CCValAssign::BCvt)
2513         ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
2514
2515       if (VA.isExtInLoc()) {
2516         // Handle MMX values passed in XMM regs.
2517         if (RegVT.isVector())
2518           ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
2519         else
2520           ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
2521       }
2522     } else {
2523       assert(VA.isMemLoc());
2524       ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i);
2525     }
2526
2527     // If value is passed via pointer - do a load.
2528     if (VA.getLocInfo() == CCValAssign::Indirect)
2529       ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue,
2530                              MachinePointerInfo(), false, false, false, 0);
2531
2532     InVals.push_back(ArgValue);
2533   }
2534
2535   if (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC()) {
2536     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2537       // The x86-64 ABIs require that for returning structs by value we copy
2538       // the sret argument into %rax/%eax (depending on ABI) for the return.
2539       // Win32 requires us to put the sret argument to %eax as well.
2540       // Save the argument into a virtual register so that we can access it
2541       // from the return points.
2542       if (Ins[i].Flags.isSRet()) {
2543         unsigned Reg = FuncInfo->getSRetReturnReg();
2544         if (!Reg) {
2545           MVT PtrTy = getPointerTy();
2546           Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
2547           FuncInfo->setSRetReturnReg(Reg);
2548         }
2549         SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[i]);
2550         Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
2551         break;
2552       }
2553     }
2554   }
2555
2556   unsigned StackSize = CCInfo.getNextStackOffset();
2557   // Align stack specially for tail calls.
2558   if (FuncIsMadeTailCallSafe(CallConv,
2559                              MF.getTarget().Options.GuaranteedTailCallOpt))
2560     StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
2561
2562   // If the function takes variable number of arguments, make a frame index for
2563   // the start of the first vararg value... for expansion of llvm.va_start. We
2564   // can skip this if there are no va_start calls.
2565   if (MFI->hasVAStart() &&
2566       (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
2567                    CallConv != CallingConv::X86_ThisCall))) {
2568     FuncInfo->setVarArgsFrameIndex(
2569         MFI->CreateFixedObject(1, StackSize, true));
2570   }
2571
2572   // Figure out if XMM registers are in use.
2573   assert(!(MF.getTarget().Options.UseSoftFloat &&
2574            Fn->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
2575                                             Attribute::NoImplicitFloat)) &&
2576          "SSE register cannot be used when SSE is disabled!");
2577
2578   // 64-bit calling conventions support varargs and register parameters, so we
2579   // have to do extra work to spill them in the prologue.
2580   if (Is64Bit && isVarArg && MFI->hasVAStart()) {
2581     // Find the first unallocated argument registers.
2582     ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
2583     ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
2584     unsigned NumIntRegs =
2585         CCInfo.getFirstUnallocated(ArgGPRs.data(), ArgGPRs.size());
2586     unsigned NumXMMRegs =
2587         CCInfo.getFirstUnallocated(ArgXMMs.data(), ArgXMMs.size());
2588     assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
2589            "SSE register cannot be used when SSE is disabled!");
2590
2591     // Gather all the live in physical registers.
2592     SmallVector<SDValue, 6> LiveGPRs;
2593     SmallVector<SDValue, 8> LiveXMMRegs;
2594     SDValue ALVal;
2595     for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
2596       unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
2597       LiveGPRs.push_back(
2598           DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
2599     }
2600     if (!ArgXMMs.empty()) {
2601       unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
2602       ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
2603       for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
2604         unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
2605         LiveXMMRegs.push_back(
2606             DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
2607       }
2608     }
2609
2610     if (IsWin64) {
2611       const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
2612       // Get to the caller-allocated home save location.  Add 8 to account
2613       // for the return address.
2614       int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
2615       FuncInfo->setRegSaveFrameIndex(
2616           MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
2617       // Fixup to set vararg frame on shadow area (4 x i64).
2618       if (NumIntRegs < 4)
2619         FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
2620     } else {
2621       // For X86-64, if there are vararg parameters that are passed via
2622       // registers, then we must store them to their spots on the stack so
2623       // they may be loaded by deferencing the result of va_next.
2624       FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
2625       FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
2626       FuncInfo->setRegSaveFrameIndex(MFI->CreateStackObject(
2627           ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
2628     }
2629
2630     // Store the integer parameter registers.
2631     SmallVector<SDValue, 8> MemOps;
2632     SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
2633                                       getPointerTy());
2634     unsigned Offset = FuncInfo->getVarArgsGPOffset();
2635     for (SDValue Val : LiveGPRs) {
2636       SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
2637                                 DAG.getIntPtrConstant(Offset));
2638       SDValue Store =
2639         DAG.getStore(Val.getValue(1), dl, Val, FIN,
2640                      MachinePointerInfo::getFixedStack(
2641                        FuncInfo->getRegSaveFrameIndex(), Offset),
2642                      false, false, 0);
2643       MemOps.push_back(Store);
2644       Offset += 8;
2645     }
2646
2647     if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
2648       // Now store the XMM (fp + vector) parameter registers.
2649       SmallVector<SDValue, 12> SaveXMMOps;
2650       SaveXMMOps.push_back(Chain);
2651       SaveXMMOps.push_back(ALVal);
2652       SaveXMMOps.push_back(DAG.getIntPtrConstant(
2653                              FuncInfo->getRegSaveFrameIndex()));
2654       SaveXMMOps.push_back(DAG.getIntPtrConstant(
2655                              FuncInfo->getVarArgsFPOffset()));
2656       SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
2657                         LiveXMMRegs.end());
2658       MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
2659                                    MVT::Other, SaveXMMOps));
2660     }
2661
2662     if (!MemOps.empty())
2663       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
2664   }
2665
2666   if (isVarArg && MFI->hasMustTailInVarArgFunc()) {
2667     // Find the largest legal vector type.
2668     MVT VecVT = MVT::Other;
2669     // FIXME: Only some x86_32 calling conventions support AVX512.
2670     if (Subtarget->hasAVX512() &&
2671         (Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
2672                      CallConv == CallingConv::Intel_OCL_BI)))
2673       VecVT = MVT::v16f32;
2674     else if (Subtarget->hasAVX())
2675       VecVT = MVT::v8f32;
2676     else if (Subtarget->hasSSE2())
2677       VecVT = MVT::v4f32;
2678
2679     // We forward some GPRs and some vector types.
2680     SmallVector<MVT, 2> RegParmTypes;
2681     MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
2682     RegParmTypes.push_back(IntVT);
2683     if (VecVT != MVT::Other)
2684       RegParmTypes.push_back(VecVT);
2685
2686     // Compute the set of forwarded registers. The rest are scratch.
2687     SmallVectorImpl<ForwardedRegister> &Forwards =
2688         FuncInfo->getForwardedMustTailRegParms();
2689     CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
2690
2691     // Conservatively forward AL on x86_64, since it might be used for varargs.
2692     if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
2693       unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
2694       Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
2695     }
2696
2697     // Copy all forwards from physical to virtual registers.
2698     for (ForwardedRegister &F : Forwards) {
2699       // FIXME: Can we use a less constrained schedule?
2700       SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
2701       F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT));
2702       Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal);
2703     }
2704   }
2705
2706   // Some CCs need callee pop.
2707   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
2708                        MF.getTarget().Options.GuaranteedTailCallOpt)) {
2709     FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
2710   } else {
2711     FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
2712     // If this is an sret function, the return should pop the hidden pointer.
2713     if (!Is64Bit && !IsTailCallConvention(CallConv) &&
2714         !Subtarget->getTargetTriple().isOSMSVCRT() &&
2715         argsAreStructReturn(Ins) == StackStructReturn)
2716       FuncInfo->setBytesToPopOnReturn(4);
2717   }
2718
2719   if (!Is64Bit) {
2720     // RegSaveFrameIndex is X86-64 only.
2721     FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
2722     if (CallConv == CallingConv::X86_FastCall ||
2723         CallConv == CallingConv::X86_ThisCall)
2724       // fastcc functions can't have varargs.
2725       FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
2726   }
2727
2728   FuncInfo->setArgumentStackSize(StackSize);
2729
2730   return Chain;
2731 }
2732
2733 SDValue
2734 X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
2735                                     SDValue StackPtr, SDValue Arg,
2736                                     SDLoc dl, SelectionDAG &DAG,
2737                                     const CCValAssign &VA,
2738                                     ISD::ArgFlagsTy Flags) const {
2739   unsigned LocMemOffset = VA.getLocMemOffset();
2740   SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
2741   PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
2742   if (Flags.isByVal())
2743     return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
2744
2745   return DAG.getStore(Chain, dl, Arg, PtrOff,
2746                       MachinePointerInfo::getStack(LocMemOffset),
2747                       false, false, 0);
2748 }
2749
2750 /// Emit a load of return address if tail call
2751 /// optimization is performed and it is required.
2752 SDValue
2753 X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
2754                                            SDValue &OutRetAddr, SDValue Chain,
2755                                            bool IsTailCall, bool Is64Bit,
2756                                            int FPDiff, SDLoc dl) const {
2757   // Adjust the Return address stack slot.
2758   EVT VT = getPointerTy();
2759   OutRetAddr = getReturnAddressFrameIndex(DAG);
2760
2761   // Load the "old" Return address.
2762   OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(),
2763                            false, false, false, 0);
2764   return SDValue(OutRetAddr.getNode(), 1);
2765 }
2766
2767 /// Emit a store of the return address if tail call
2768 /// optimization is performed and it is required (FPDiff!=0).
2769 static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
2770                                         SDValue Chain, SDValue RetAddrFrIdx,
2771                                         EVT PtrVT, unsigned SlotSize,
2772                                         int FPDiff, SDLoc dl) {
2773   // Store the return address to the appropriate stack slot.
2774   if (!FPDiff) return Chain;
2775   // Calculate the new stack slot for the return address.
2776   int NewReturnAddrFI =
2777     MF.getFrameInfo()->CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
2778                                          false);
2779   SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
2780   Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
2781                        MachinePointerInfo::getFixedStack(NewReturnAddrFI),
2782                        false, false, 0);
2783   return Chain;
2784 }
2785
2786 SDValue
2787 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
2788                              SmallVectorImpl<SDValue> &InVals) const {
       // Lowers the outgoing call described by CLI into SelectionDAG nodes.
       //
       // CLI supplies the DAG, the debug location, the outgoing argument
       // descriptions and values, the incoming (result) descriptions, the
       // chain, the callee, the calling convention and the vararg / tail-call
       // flags.  InVals is passed on to LowerCallResult for non-tail calls.
       //
       // In order, this function:
       //   1. decides whether the call is emitted as a tail call and whether it
       //      is a sibcall, possibly clearing or forcing CLI.IsTailCall;
       //   2. assigns argument locations with CC_X86 and, unless this is a
       //      sibcall, opens the call sequence with CALLSEQ_START;
       //   3. promotes each argument and either queues it in RegsToPass or
       //      stores it to its stack location;
       //   4. for non-sibcall tail calls, stores memory arguments to fixed
       //      frame objects and re-stores the return address;
       //   5. copies the queued values into their registers and rewrites the
       //      callee into a target node where applicable;
       //   6. returns an X86ISD::TC_RETURN node for tail calls; otherwise emits
       //      X86ISD::CALL, closes the call sequence (unless sibcall) and
       //      returns the result of LowerCallResult.
2789   SelectionDAG &DAG                     = CLI.DAG;
2790   SDLoc &dl                             = CLI.DL;
2791   SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
2792   SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
2793   SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
2794   SDValue Chain                         = CLI.Chain;
2795   SDValue Callee                        = CLI.Callee;
2796   CallingConv::ID CallConv              = CLI.CallConv;
       // isTailCall is a reference to CLI.IsTailCall, so the decisions made
       // below are written back into CLI.
2797   bool &isTailCall                      = CLI.IsTailCall;
2798   bool isVarArg                         = CLI.IsVarArg;
2799
2800   MachineFunction &MF = DAG.getMachineFunction();
2801   bool Is64Bit        = Subtarget->is64Bit();
2802   bool IsWin64        = Subtarget->isCallingConvWin64(CallConv);
2803   StructReturnType SR = callIsStructReturn(Outs);
2804   bool IsSibcall      = false;
2805   X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
2806
2807   if (MF.getTarget().Options.DisableTailCalls)
2808     isTailCall = false;
2809
2810   bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall();
2811   if (IsMustTail) {
2812     // Force this to be a tail call.  The verifier rules are enough to ensure
2813     // that we can lower this successfully without moving the return address
2814     // around.
2815     isTailCall = true;
2816   } else if (isTailCall) {
2817     // Check if it's really possible to do a tail call.
2818     isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
2819                     isVarArg, SR != NotStructReturn,
2820                     MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
2821                     Outs, OutVals, Ins, DAG);
2822
2823     // Sibcalls are automatically detected tailcalls which do not require
2824     // ABI changes.
2825     if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
2826       IsSibcall = true;
2827
2828     if (isTailCall)
2829       ++NumTailCalls;
2830   }
2831
2832   assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
2833          "Var args not supported with calling convention fastcc, ghc or hipe");
2834
2835   // Analyze operands of the call, assigning locations to each operand.
2836   SmallVector<CCValAssign, 16> ArgLocs;
2837   CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
2838
2839   // Allocate shadow area for Win64
2840   if (IsWin64)
2841     CCInfo.AllocateStack(32, 8);
2842
2843   CCInfo.AnalyzeCallOperands(Outs, CC_X86);
2844
2845   // Get a count of how many bytes are to be pushed on the stack.
2846   unsigned NumBytes = CCInfo.getNextStackOffset();
2847   if (IsSibcall)
2848     // This is a sibcall. The memory operands are available in caller's
2849     // own caller's stack.
2850     NumBytes = 0;
2851   else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
2852            IsTailCallConvention(CallConv))
2853     NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
2854
       // FPDiff is only computed for tail calls that are neither sibcalls nor
       // musttail calls; it stays 0 in every other case.
2855   int FPDiff = 0;
2856   if (isTailCall && !IsSibcall && !IsMustTail) {
2857     // Lower arguments at fp - stackoffset + fpdiff.
2858     unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
2859
2860     FPDiff = NumBytesCallerPushed - NumBytes;
2861
2862     // Set the delta of movement of the returnaddr stackslot.
2863     // But only set if delta is greater than previous delta.
2864     if (FPDiff < X86Info->getTCReturnAddrDelta())
2865       X86Info->setTCReturnAddrDelta(FPDiff);
2866   }
2867
       // Both counts start out as NumBytes; only NumBytesToPush is reset (to 0)
       // by the inalloca handling below.
2868   unsigned NumBytesToPush = NumBytes;
2869   unsigned NumBytesToPop = NumBytes;
2870
2871   // If we have an inalloca argument, all stack space has already been allocated
2872   // for us and is right at the top of the stack.  We don't support multiple
2873   // arguments passed in memory when using inalloca.
2874   if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
2875     NumBytesToPush = 0;
2876     if (!ArgLocs.back().isMemLoc())
2877       report_fatal_error("cannot use inalloca attribute on a register "
2878                          "parameter");
2879     if (ArgLocs.back().getLocMemOffset() != 0)
2880       report_fatal_error("any parameter with the inalloca attribute must be "
2881                          "the only memory argument");
2882   }
2883
2884   if (!IsSibcall)
2885     Chain = DAG.getCALLSEQ_START(
2886         Chain, DAG.getIntPtrConstant(NumBytesToPush, true), dl);
2887
2888   SDValue RetAddrFrIdx;
2889   // Load return address for tail calls.
2890   if (isTailCall && FPDiff)
2891     Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
2892                                     Is64Bit, FPDiff, dl);
2893
2894   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
2895   SmallVector<SDValue, 8> MemOpChains;
2896   SDValue StackPtr;
2897
2898   // Walk the register/memloc assignments, inserting copies/loads.  In the case
2899   // of tail call optimization arguments are handled later.
2900   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
2901   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2902     // Skip inalloca arguments, they have already been written.
2903     ISD::ArgFlagsTy Flags = Outs[i].Flags;
2904     if (Flags.isInAlloca())
2905       continue;
2906
2907     CCValAssign &VA = ArgLocs[i];
2908     EVT RegVT = VA.getLocVT();
2909     SDValue Arg = OutVals[i];
2910     bool isByVal = Flags.isByVal();
2911
2912     // Promote the value if needed.
2913     switch (VA.getLocInfo()) {
2914     default: llvm_unreachable("Unknown loc info!");
2915     case CCValAssign::Full: break;
2916     case CCValAssign::SExt:
2917       Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
2918       break;
2919     case CCValAssign::ZExt:
2920       Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
2921       break;
2922     case CCValAssign::AExt:
2923       if (RegVT.is128BitVector()) {
2924         // Special case: passing MMX values in XMM registers.
2925         Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
2926         Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
2927         Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
2928       } else
2929         Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
2930       break;
2931     case CCValAssign::BCvt:
2932       Arg = DAG.getNode(ISD::BITCAST, dl, RegVT, Arg);
2933       break;
2934     case CCValAssign::Indirect: {
2935       // Store the argument.
             // The value is spilled to a stack temporary and the address of that
             // temporary is what gets passed on as the argument.
2936       SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
2937       int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
2938       Chain = DAG.getStore(Chain, dl, Arg, SpillSlot,
2939                            MachinePointerInfo::getFixedStack(FI),
2940                            false, false, 0);
2941       Arg = SpillSlot;
2942       break;
2943     }
2944     }
2945
2946     if (VA.isRegLoc()) {
2947       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
2948       if (isVarArg && IsWin64) {
2949         // Win64 ABI requires argument XMM reg to be copied to the corresponding
2950         // shadow reg if callee is a varargs function.
               // ShadowReg stays 0 (no shadow copy) unless the location register is
               // one of XMM0-XMM3.
2951         unsigned ShadowReg = 0;
2952         switch (VA.getLocReg()) {
2953         case X86::XMM0: ShadowReg = X86::RCX; break;
2954         case X86::XMM1: ShadowReg = X86::RDX; break;
2955         case X86::XMM2: ShadowReg = X86::R8; break;
2956         case X86::XMM3: ShadowReg = X86::R9; break;
2957         }
2958         if (ShadowReg)
2959           RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
2960       }
2961     } else if (!IsSibcall && (!isTailCall || isByVal)) {
2962       assert(VA.isMemLoc());
             // The stack pointer copy is created lazily, on the first memory
             // argument that needs it.
2963       if (!StackPtr.getNode())
2964         StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
2965                                       getPointerTy());
2966       MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
2967                                              dl, DAG, VA, Flags));
2968     }
2969   }
2970
2971   if (!MemOpChains.empty())
2972     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
2973
2974   if (Subtarget->isPICStyleGOT()) {
2975     // ELF / PIC requires GOT in the EBX register before function calls via PLT
2976     // GOT pointer.
2977     if (!isTailCall) {
2978       RegsToPass.push_back(std::make_pair(unsigned(X86::EBX),
2979                DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy())));
2980     } else {
2981       // If we are tail calling and generating PIC/GOT style code load the
2982       // address of the callee into ECX. The value in ecx is used as target of
2983       // the tail jump. This is done to circumvent the ebx/callee-saved problem
2984       // for tail calls on PIC/GOT architectures. Normally we would just put the
2985       // address of GOT into ebx and then call target@PLT. But for tail calls
2986       // ebx would be restored (since ebx is callee saved) before jumping to the
2987       // target@PLT.
2988
2989       // Note: The actual moving to ECX is done further down.
2990       GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
2991       if (G && !G->getGlobal()->hasHiddenVisibility() &&
2992           !G->getGlobal()->hasProtectedVisibility())
2993         Callee = LowerGlobalAddress(Callee, DAG);
2994       else if (isa<ExternalSymbolSDNode>(Callee))
2995         Callee = LowerExternalSymbol(Callee, DAG);
2996     }
2997   }
2998
2999   if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
3000     // From AMD64 ABI document:
3001     // For calls that may call functions that use varargs or stdargs
3002     // (prototype-less calls or calls to functions containing ellipsis (...) in
3003     // the declaration) %al is used as hidden argument to specify the number
3004     // of SSE registers used. The contents of %al do not need to match exactly
3005     // the number of registers, but must be an upper bound on the number of SSE
3006     // registers used and is in the range 0 - 8 inclusive.
3007
3008     // Count the number of XMM registers allocated.
3009     static const MCPhysReg XMMArgRegs[] = {
3010       X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3011       X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3012     };
3013     unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
3014     assert((Subtarget->hasSSE1() || !NumXMMRegs)
3015            && "SSE registers cannot be used when SSE is disabled");
3016
3017     RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
3018                                         DAG.getConstant(NumXMMRegs, MVT::i8)));
3019   }
3020
3021   if (isVarArg && IsMustTail) {
         // Pass along every register recorded in the function info's forwarded
         // musttail register list, reading each from its virtual register.
3022     const auto &Forwards = X86Info->getForwardedMustTailRegParms();
3023     for (const auto &F : Forwards) {
3024       SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
3025       RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
3026     }
3027   }
3028
3029   // For tail calls lower the arguments to the 'real' stack slots.  Sibcalls
3030   // don't need this because the eligibility check rejects calls that require
3031   // shuffling arguments passed in memory.
3032   if (!IsSibcall && isTailCall) {
3033     // Force all the incoming stack arguments to be loaded from the stack
3034     // before any new outgoing arguments are stored to the stack, because the
3035     // outgoing stack slots may alias the incoming argument stack slots, and
3036     // the alias isn't otherwise explicit. This is slightly more conservative
3037     // than necessary, because it means that each store effectively depends
3038     // on every argument instead of just those arguments it would clobber.
3039     SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
3040
3041     SmallVector<SDValue, 8> MemOpChains2;
3042     SDValue FIN;
3043     int FI = 0;
3044     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3045       CCValAssign &VA = ArgLocs[i];
3046       if (VA.isRegLoc())
3047         continue;
3048       assert(VA.isMemLoc());
3049       SDValue Arg = OutVals[i];
3050       ISD::ArgFlagsTy Flags = Outs[i].Flags;
3051       // Skip inalloca arguments.  They don't require any work.
3052       if (Flags.isInAlloca())
3053         continue;
3054       // Create frame index.
             // The fixed object sits at the location's memory offset shifted by
             // FPDiff; its size is the location type's bit width rounded up to
             // whole bytes.
3055       int32_t Offset = VA.getLocMemOffset()+FPDiff;
3056       uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
3057       FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
3058       FIN = DAG.getFrameIndex(FI, getPointerTy());
3059
3060       if (Flags.isByVal()) {
3061         // Copy relative to framepointer.
3062         SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
3063         if (!StackPtr.getNode())
3064           StackPtr = DAG.getCopyFromReg(Chain, dl,
3065                                         RegInfo->getStackRegister(),
3066                                         getPointerTy());
3067         Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
3068
3069         MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
3070                                                          ArgChain,
3071                                                          Flags, DAG, dl));
3072       } else {
3073         // Store relative to framepointer.
3074         MemOpChains2.push_back(
3075           DAG.getStore(ArgChain, dl, Arg, FIN,
3076                        MachinePointerInfo::getFixedStack(FI),
3077                        false, false, 0));
3078       }
3079     }
3080
3081     if (!MemOpChains2.empty())
3082       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
3083
3084     // Store the return address to the appropriate stack slot.
3085     Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
3086                                      getPointerTy(), RegInfo->getSlotSize(),
3087                                      FPDiff, dl);
3088   }
3089
3090   // Build a sequence of copy-to-reg nodes chained together with token chain
3091   // and flag operands which copy the outgoing args into registers.
3092   SDValue InFlag;
3093   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
3094     Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
3095                              RegsToPass[i].second, InFlag);
3096     InFlag = Chain.getValue(1);
3097   }
3098
3099   if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
3100     assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
3101     // In the 64-bit large code model, we have to make all calls
3102     // through a register, since the call instruction's 32-bit
3103     // pc-relative offset may not be large enough to hold the whole
3104     // address.
         // Callee is therefore left as is in this branch.
3105   } else if (Callee->getOpcode() == ISD::GlobalAddress) {
3106     // If the callee is a GlobalAddress node (quite common, every direct call
3107     // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
3108     // it.
3109     GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee);
3110
3111     // We should use extra load for direct calls to dllimported functions in
3112     // non-JIT mode.
3113     const GlobalValue *GV = G->getGlobal();
3114     if (!GV->hasDLLImportStorageClass()) {
           // Defaults: no operand flags, no extra load, and no wrapper node
           // (ISD::DELETED_NODE serves as the "no wrapper" marker).
3115       unsigned char OpFlags = 0;
3116       bool ExtraLoad = false;
3117       unsigned WrapperKind = ISD::DELETED_NODE;
3118
3119       // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
3120       // external symbols must go through the PLT in PIC mode.  If the symbol
3121       // has hidden or protected visibility, or if it is static or local, then
3122       // we don't need to use the PLT - we can directly call it.
3123       if (Subtarget->isTargetELF() &&
3124           DAG.getTarget().getRelocationModel() == Reloc::PIC_ &&
3125           GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
3126         OpFlags = X86II::MO_PLT;
3127       } else if (Subtarget->isPICStyleStubAny() &&
3128                  (GV->isDeclaration() || GV->isWeakForLinker()) &&
3129                  (!Subtarget->getTargetTriple().isMacOSX() ||
3130                   Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
3131         // PC-relative references to external symbols should go through $stub,
3132         // unless we're building with the leopard linker or later, which
3133         // automatically synthesizes these stubs.
3134         OpFlags = X86II::MO_DARWIN_STUB;
3135       } else if (Subtarget->isPICStyleRIPRel() &&
3136                  isa<Function>(GV) &&
3137                  cast<Function>(GV)->getAttributes().
3138                    hasAttribute(AttributeSet::FunctionIndex,
3139                                 Attribute::NonLazyBind)) {
3140         // If the function is marked as non-lazy, generate an indirect call
3141         // which loads from the GOT directly. This avoids runtime overhead
3142         // at the cost of eager binding (and one extra byte of encoding).
3143         OpFlags = X86II::MO_GOTPCREL;
3144         WrapperKind = X86ISD::WrapperRIP;
3145         ExtraLoad = true;
3146       }
3147
3148       Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(),
3149                                           G->getOffset(), OpFlags);
3150
3151       // Add a wrapper if needed.
           // NOTE: the node is built with X86ISD::WrapperRIP directly, which is
           // the only value WrapperKind is ever set to above.
3152       if (WrapperKind != ISD::DELETED_NODE)
3153         Callee = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Callee);
3154       // Add extra indirection if needed.
3155       if (ExtraLoad)
3156         Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee,
3157                              MachinePointerInfo::getGOT(),
3158                              false, false, false, 0);
3159     }
3160   } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
3161     unsigned char OpFlags = 0;
3162
3163     // On ELF targets, in either X86-64 or X86-32 mode, direct calls to
3164     // external symbols should go through the PLT.
3165     if (Subtarget->isTargetELF() &&
3166         DAG.getTarget().getRelocationModel() == Reloc::PIC_) {
3167       OpFlags = X86II::MO_PLT;
3168     } else if (Subtarget->isPICStyleStubAny() &&
3169                (!Subtarget->getTargetTriple().isMacOSX() ||
3170                 Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
3171       // PC-relative references to external symbols should go through $stub,
3172       // unless we're building with the leopard linker or later, which
3173       // automatically synthesizes these stubs.
3174       OpFlags = X86II::MO_DARWIN_STUB;
3175     }
3176
3177     Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
3178                                          OpFlags);
3179   } else if (Subtarget->isTarget64BitILP32() &&
3180              Callee->getValueType(0) == MVT::i32) {
3181     // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI
3182     Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
3183   }
3184
3185   // Returns a chain & a flag for retval copy to use.
3186   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3187   SmallVector<SDValue, 8> Ops;
3188
       // Non-sibcall tail calls close the call sequence here, before the
       // TC_RETURN node is built; the callee-pop amount passed is 0.
3189   if (!IsSibcall && isTailCall) {
3190     Chain = DAG.getCALLSEQ_END(Chain,
3191                                DAG.getIntPtrConstant(NumBytesToPop, true),
3192                                DAG.getIntPtrConstant(0, true), InFlag, dl);
3193     InFlag = Chain.getValue(1);
3194   }
3195
       // Operand order: chain, callee, [FPDiff for tail calls], argument
       // registers, register mask, [glue when present].
3196   Ops.push_back(Chain);
3197   Ops.push_back(Callee);
3198
3199   if (isTailCall)
3200     Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));
3201
3202   // Add argument registers to the end of the list so that they are known live
3203   // into the call.
3204   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
3205     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
3206                                   RegsToPass[i].second.getValueType()));
3207
3208   // Add a register mask operand representing the call-preserved registers.
3209   const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
3210   const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
3211   assert(Mask && "Missing call preserved mask for calling convention");
3212   Ops.push_back(DAG.getRegisterMask(Mask));
3213
3214   if (InFlag.getNode())
3215     Ops.push_back(InFlag);
3216
3217   if (isTailCall) {
3218     // We used to do:
3219     //// If this is the first return lowered for this function, add the regs
3220     //// to the liveout set for the function.
3221     // This isn't right, although it's probably harmless on x86; liveouts
3222     // should be computed from returns not tail calls.  Consider a void
3223     // function making a tail call to a function returning int.
3224     return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
3225   }
3226
3227   Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
3228   InFlag = Chain.getValue(1);
3229
3230   // Create the CALLSEQ_END node.
3231   unsigned NumBytesForCalleeToPop;
3232   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
3233                        DAG.getTarget().Options.GuaranteedTailCallOpt))
3234     NumBytesForCalleeToPop = NumBytes;    // Callee pops everything
3235   else if (!Is64Bit && !IsTailCallConvention(CallConv) &&
3236            !Subtarget->getTargetTriple().isOSMSVCRT() &&
3237            SR == StackStructReturn)
3238     // If this is a call to a struct-return function, the callee
3239     // pops the hidden struct pointer, so we have to push it back.
3240     // This is common for Darwin/X86, Linux & Mingw32 targets.
3241     // For MSVC Win32 targets, the caller pops the hidden struct pointer.
3242     NumBytesForCalleeToPop = 4;
3243   else
3244     NumBytesForCalleeToPop = 0;  // Callee pops nothing.
3245
3246   // Returns a flag for retval copy to use.
3247   if (!IsSibcall) {
3248     Chain = DAG.getCALLSEQ_END(Chain,
3249                                DAG.getIntPtrConstant(NumBytesToPop, true),
3250                                DAG.getIntPtrConstant(NumBytesForCalleeToPop,
3251                                                      true),
3252                                InFlag, dl);
3253     InFlag = Chain.getValue(1);
3254   }
3255
3256   // Handle result values, copying them out of physregs into vregs that we
3257   // return.
3258   return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
3259                          Ins, dl, DAG, InVals);
3260 }
3261
3262 //===----------------------------------------------------------------------===//
3263 //                Fast Calling Convention (tail call) implementation
3264 //===----------------------------------------------------------------------===//
3265
3266 //  Like the stdcall convention, the callee cleans up the arguments, except
3267 //  that ECX is reserved for storing the address of the tail-called function.
3268 //  Only 2 registers are free for argument passing (inreg). Tail call
3269 //  optimization is performed provided:
3270 //                * tailcallopt is enabled
3271 //                * caller/callee are fastcc
3272 //  On X86_64 architecture with GOT-style position independent code only local
3273 //  (within module) calls are supported at the moment.
3274 //  To keep the stack aligned according to platform abi the function
3275 //  GetAlignedArgumentStackSize ensures that argument delta is always multiples
3276 //  of stack alignment. (Dynamic linkers need this - darwin's dyld for example)
3277 //  If a tail called function callee has more arguments than the caller the
3278 //  caller needs to make sure that there is room to move the RETADDR to. This is
3279 //  achieved by reserving an area the size of the argument delta right after the
3280 //  original RETADDR, but before the saved framepointer or the spilled registers
3281 //  e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
3282 //  stack layout:
3283 //    arg1
3284 //    arg2
3285 //    RETADDR
3286 //    [ new RETADDR
3287 //      move area ]
3288 //    (possible EBP)
3289 //    ESI
3290 //    EDI
3291 //    local1 ..
3292
3293 /// GetAlignedArgumentStackSize - Make the stack size align e.g 16n + 12 aligned
3294 /// for a 16 byte align requirement.
3295 unsigned
3296 X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
3297                                                SelectionDAG& DAG) const {
3298   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
3299   const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
3300   unsigned StackAlignment = TFI.getStackAlignment();
3301   uint64_t AlignMask = StackAlignment - 1;
3302   int64_t Offset = StackSize;
3303   unsigned SlotSize = RegInfo->getSlotSize();
3304   if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
3305     // Number smaller than 12 so just add the difference.
3306     Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
3307   } else {
3308     // Mask out lower bits, add stackalignment once plus the 12 bytes.
3309     Offset = ((~AlignMask) & Offset) + StackAlignment +
3310       (StackAlignment-SlotSize);
3311   }
3312   return Offset;
3313 }
3314
3315 /// MatchingStackOffset - Return true if the given stack call argument is
3316 /// already available in the same position (relatively) of the caller's
3317 /// incoming argument stack.
3318 static
3319 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
3320                          MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
3321                          const X86InstrInfo *TII) {
3322   unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
3323   int FI = INT_MAX;
3324   if (Arg.getOpcode() == ISD::CopyFromReg) {
3325     unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
3326     if (!TargetRegisterInfo::isVirtualRegister(VR))
3327       return false;
3328     MachineInstr *Def = MRI->getVRegDef(VR);
3329     if (!Def)
3330       return false;
3331     if (!Flags.isByVal()) {
3332       if (!TII->isLoadFromStackSlot(Def, FI))
3333         return false;
3334     } else {
3335       unsigned Opcode = Def->getOpcode();
3336       if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
3337            Opcode == X86::LEA64_32r) &&
3338           Def->getOperand(1).isFI()) {
3339         FI = Def->getOperand(1).getIndex();
3340         Bytes = Flags.getByValSize();
3341       } else
3342         return false;
3343     }
3344   } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
3345     if (Flags.isByVal())
3346       // ByVal argument is passed in as a pointer but it's now being
3347       // dereferenced. e.g.
3348       // define @foo(%struct.X* %A) {
3349       //   tail call @bar(%struct.X* byval %A)
3350       // }
3351       return false;
3352     SDValue Ptr = Ld->getBasePtr();
3353     FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
3354     if (!FINode)
3355       return false;
3356     FI = FINode->getIndex();
3357   } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
3358     FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
3359     FI = FINode->getIndex();
3360     Bytes = Flags.getByValSize();
3361   } else
3362     return false;
3363
3364   assert(FI != INT_MAX);
3365   if (!MFI->isFixedObjectIndex(FI))
3366     return false;
3367   return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI);
3368 }
3369
3370 /// IsEligibleForTailCallOptimization - Check whether the call is eligible
3371 /// for tail call optimization. Targets which want to do tail call
3372 /// optimization should implement this function.
3373 bool
3374 X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
3375                                                      CallingConv::ID CalleeCC,
3376                                                      bool isVarArg,
3377                                                      bool isCalleeStructRet,
3378                                                      bool isCallerStructRet,
3379                                                      Type *RetTy,
3380                                     const SmallVectorImpl<ISD::OutputArg> &Outs,
3381                                     const SmallVectorImpl<SDValue> &OutVals,
3382                                     const SmallVectorImpl<ISD::InputArg> &Ins,
3383                                                      SelectionDAG &DAG) const {
3384   if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
3385     return false;
3386
3387   // If -tailcallopt is specified, make fastcc functions tail-callable.
3388   const MachineFunction &MF = DAG.getMachineFunction();
3389   const Function *CallerF = MF.getFunction();
3390
3391   // If the function return type is x86_fp80 and the callee return type is not,
3392   // then the FP_EXTEND of the call result is not a nop. It's not safe to
3393   // perform a tailcall optimization here.
3394   if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
3395     return false;
3396
3397   CallingConv::ID CallerCC = CallerF->getCallingConv();
3398   bool CCMatch = CallerCC == CalleeCC;
3399   bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC);
3400   bool IsCallerWin64 = Subtarget->isCallingConvWin64(CallerCC);
3401
3402   if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
3403     if (IsTailCallConvention(CalleeCC) && CCMatch)
3404       return true;
3405     return false;
3406   }
3407
3408   // Look for obvious safe cases to perform tail call optimization that do not
3409   // require ABI changes. This is what gcc calls sibcall.
3410
3411   // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
3412   // emit a special epilogue.
3413   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
3414   if (RegInfo->needsStackRealignment(MF))
3415     return false;
3416
3417   // Also avoid sibcall optimization if either caller or callee uses struct
3418   // return semantics.
3419   if (isCalleeStructRet || isCallerStructRet)
3420     return false;
3421
3422   // An stdcall/thiscall caller is expected to clean up its arguments; the
3423   // callee isn't going to do that.
3424   // FIXME: this is more restrictive than needed. We could produce a tailcall
3425   // when the stack adjustment matches. For example, with a thiscall that takes
3426   // only one argument.
3427   if (!CCMatch && (CallerCC == CallingConv::X86_StdCall ||
3428                    CallerCC == CallingConv::X86_ThisCall))
3429     return false;
3430
3431   // Do not sibcall optimize vararg calls unless all arguments are passed via
3432   // registers.
3433   if (isVarArg && !Outs.empty()) {
3434
3435     // Optimizing for varargs on Win64 is unlikely to be safe without
3436     // additional testing.
3437     if (IsCalleeWin64 || IsCallerWin64)
3438       return false;
3439
3440     SmallVector<CCValAssign, 16> ArgLocs;
3441     CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
3442                    *DAG.getContext());
3443
3444     CCInfo.AnalyzeCallOperands(Outs, CC_X86);
3445     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
3446       if (!ArgLocs[i].isRegLoc())
3447         return false;
3448   }
3449
3450   // If the call result is in ST0 / ST1, it needs to be popped off the x87
3451   // stack.  Therefore, if it's not used by the call it is not safe to optimize
3452   // this into a sibcall.
3453   bool Unused = false;
3454   for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
3455     if (!Ins[i].Used) {
3456       Unused = true;
3457       break;
3458     }
3459   }
3460   if (Unused) {
3461     SmallVector<CCValAssign, 16> RVLocs;
3462     CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(), RVLocs,
3463                    *DAG.getContext());
3464     CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
3465     for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
3466       CCValAssign &VA = RVLocs[i];
3467       if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
3468         return false;
3469     }
3470   }
3471
3472   // If the calling conventions do not match, then we'd better make sure the
3473   // results are returned in the same way as what the caller expects.
3474   if (!CCMatch) {
3475     SmallVector<CCValAssign, 16> RVLocs1;
3476     CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), RVLocs1,
3477                     *DAG.getContext());
3478     CCInfo1.AnalyzeCallResult(Ins, RetCC_X86);
3479
3480     SmallVector<CCValAssign, 16> RVLocs2;
3481     CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), RVLocs2,
3482                     *DAG.getContext());
3483     CCInfo2.AnalyzeCallResult(Ins, RetCC_X86);
3484
3485     if (RVLocs1.size() != RVLocs2.size())
3486       return false;
3487     for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
3488       if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
3489         return false;
3490       if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
3491         return false;
3492       if (RVLocs1[i].isRegLoc()) {
3493         if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
3494           return false;
3495       } else {
3496         if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
3497           return false;
3498       }
3499     }
3500   }
3501
3502   // If the callee takes no arguments then go on to check the results of the
3503   // call.
3504   if (!Outs.empty()) {
3505     // Check if stack adjustment is needed. For now, do not do this if any
3506     // argument is passed on the stack.
3507     SmallVector<CCValAssign, 16> ArgLocs;
3508     CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
3509                    *DAG.getContext());
3510
3511     // Allocate shadow area for Win64
3512     if (IsCalleeWin64)
3513       CCInfo.AllocateStack(32, 8);
3514
3515     CCInfo.AnalyzeCallOperands(Outs, CC_X86);
3516     if (CCInfo.getNextStackOffset()) {
3517       MachineFunction &MF = DAG.getMachineFunction();
3518       if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn())
3519         return false;
3520
3521       // Check if the arguments are already laid out in the right way as
3522       // the caller's fixed stack objects.
3523       MachineFrameInfo *MFI = MF.getFrameInfo();
3524       const MachineRegisterInfo *MRI = &MF.getRegInfo();
3525       const X86InstrInfo *TII = Subtarget->getInstrInfo();
3526       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3527         CCValAssign &VA = ArgLocs[i];
3528         SDValue Arg = OutVals[i];
3529         ISD::ArgFlagsTy Flags = Outs[i].Flags;
3530         if (VA.getLocInfo() == CCValAssign::Indirect)
3531           return false;
3532         if (!VA.isRegLoc()) {
3533           if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
3534                                    MFI, MRI, TII))
3535             return false;
3536         }
3537       }
3538     }
3539
3540     // If the tailcall address may be in a register, then make sure it's
3541     // possible to register allocate for it. In 32-bit, the call address can
3542     // only target EAX, EDX, or ECX since the tail call must be scheduled after
3543     // callee-saved registers are restored. These happen to be the same
3544     // registers used to pass 'inreg' arguments so watch out for those.
3545     if (!Subtarget->is64Bit() &&
3546         ((!isa<GlobalAddressSDNode>(Callee) &&
3547           !isa<ExternalSymbolSDNode>(Callee)) ||
3548          DAG.getTarget().getRelocationModel() == Reloc::PIC_)) {
3549       unsigned NumInRegs = 0;
3550       // In PIC we need an extra register to formulate the address computation
3551       // for the callee.
3552       unsigned MaxInRegs =
3553         (DAG.getTarget().getRelocationModel() == Reloc::PIC_) ? 2 : 3;
3554
3555       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3556         CCValAssign &VA = ArgLocs[i];
3557         if (!VA.isRegLoc())
3558           continue;
3559         unsigned Reg = VA.getLocReg();
3560         switch (Reg) {
3561         default: break;
3562         case X86::EAX: case X86::EDX: case X86::ECX:
3563           if (++NumInRegs == MaxInRegs)
3564             return false;
3565           break;
3566         }
3567       }
3568     }
3569   }
3570
3571   return true;
3572 }
3573
3574 FastISel *
3575 X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
3576                                   const TargetLibraryInfo *libInfo) const {
3577   return X86::createFastISel(funcInfo, libInfo);
3578 }
3579
3580 //===----------------------------------------------------------------------===//
3581 //                           Other Lowering Hooks
3582 //===----------------------------------------------------------------------===//
3583
3584 static bool MayFoldLoad(SDValue Op) {
3585   return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
3586 }
3587
3588 static bool MayFoldIntoStore(SDValue Op) {
3589   return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
3590 }
3591
3592 static bool isTargetShuffle(unsigned Opcode) {
3593   switch(Opcode) {
3594   default: return false;
3595   case X86ISD::BLENDI:
3596   case X86ISD::PSHUFB:
3597   case X86ISD::PSHUFD:
3598   case X86ISD::PSHUFHW:
3599   case X86ISD::PSHUFLW:
3600   case X86ISD::SHUFP:
3601   case X86ISD::PALIGNR:
3602   case X86ISD::MOVLHPS:
3603   case X86ISD::MOVLHPD:
3604   case X86ISD::MOVHLPS:
3605   case X86ISD::MOVLPS:
3606   case X86ISD::MOVLPD:
3607   case X86ISD::MOVSHDUP:
3608   case X86ISD::MOVSLDUP:
3609   case X86ISD::MOVDDUP:
3610   case X86ISD::MOVSS:
3611   case X86ISD::MOVSD:
3612   case X86ISD::UNPCKL:
3613   case X86ISD::UNPCKH:
3614   case X86ISD::VPERMILPI:
3615   case X86ISD::VPERM2X128:
3616   case X86ISD::VPERMI:
3617     return true;
3618   }
3619 }
3620
3621 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
3622                                     SDValue V1, SelectionDAG &DAG) {
3623   switch(Opc) {
3624   default: llvm_unreachable("Unknown x86 shuffle node");
3625   case X86ISD::MOVSHDUP:
3626   case X86ISD::MOVSLDUP:
3627   case X86ISD::MOVDDUP:
3628     return DAG.getNode(Opc, dl, VT, V1);
3629   }
3630 }
3631
3632 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
3633                                     SDValue V1, unsigned TargetMask,
3634                                     SelectionDAG &DAG) {
3635   switch(Opc) {
3636   default: llvm_unreachable("Unknown x86 shuffle node");
3637   case X86ISD::PSHUFD:
3638   case X86ISD::PSHUFHW:
3639   case X86ISD::PSHUFLW:
3640   case X86ISD::VPERMILPI:
3641   case X86ISD::VPERMI:
3642     return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8));
3643   }
3644 }
3645
3646 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
3647                                     SDValue V1, SDValue V2, unsigned TargetMask,
3648                                     SelectionDAG &DAG) {
3649   switch(Opc) {
3650   default: llvm_unreachable("Unknown x86 shuffle node");
3651   case X86ISD::PALIGNR:
3652   case X86ISD::VALIGN:
3653   case X86ISD::SHUFP:
3654   case X86ISD::VPERM2X128:
3655     return DAG.getNode(Opc, dl, VT, V1, V2,
3656                        DAG.getConstant(TargetMask, MVT::i8));
3657   }
3658 }
3659
3660 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
3661                                     SDValue V1, SDValue V2, SelectionDAG &DAG) {
3662   switch(Opc) {
3663   default: llvm_unreachable("Unknown x86 shuffle node");
3664   case X86ISD::MOVLHPS:
3665   case X86ISD::MOVLHPD:
3666   case X86ISD::MOVHLPS:
3667   case X86ISD::MOVLPS:
3668   case X86ISD::MOVLPD:
3669   case X86ISD::MOVSS:
3670   case X86ISD::MOVSD:
3671   case X86ISD::UNPCKL:
3672   case X86ISD::UNPCKH:
3673     return DAG.getNode(Opc, dl, VT, V1, V2);
3674   }
3675 }
3676
3677 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
3678   MachineFunction &MF = DAG.getMachineFunction();
3679   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
3680   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
3681   int ReturnAddrIndex = FuncInfo->getRAIndex();
3682
3683   if (ReturnAddrIndex == 0) {
3684     // Set up a frame object for the return address.
3685     unsigned SlotSize = RegInfo->getSlotSize();
3686     ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize,
3687                                                            -(int64_t)SlotSize,
3688                                                            false);
3689     FuncInfo->setRAIndex(ReturnAddrIndex);
3690   }
3691
3692   return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
3693 }
3694
3695 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
3696                                        bool hasSymbolicDisplacement) {
3697   // Offset should fit into 32 bit immediate field.
3698   if (!isInt<32>(Offset))
3699     return false;
3700
3701   // If we don't have a symbolic displacement - we don't have any extra
3702   // restrictions.
3703   if (!hasSymbolicDisplacement)
3704     return true;
3705
3706   // FIXME: Some tweaks might be needed for medium code model.
3707   if (M != CodeModel::Small && M != CodeModel::Kernel)
3708     return false;
3709
3710   // For small code model we assume that latest object is 16MB before end of 31
3711   // bits boundary. We may also accept pretty large negative constants knowing
3712   // that all objects are in the positive half of address space.
3713   if (M == CodeModel::Small && Offset < 16*1024*1024)
3714     return true;
3715
3716   // For kernel code model we know that all object resist in the negative half
3717   // of 32bits address space. We may not accept negative offsets, since they may
3718   // be just off and we may accept pretty large positive ones.
3719   if (M == CodeModel::Kernel && Offset >= 0)
3720     return true;
3721
3722   return false;
3723 }
3724
3725 /// isCalleePop - Determines whether the callee is required to pop its
3726 /// own arguments. Callee pop is necessary to support tail calls.
3727 bool X86::isCalleePop(CallingConv::ID CallingConv,
3728                       bool is64Bit, bool IsVarArg, bool TailCallOpt) {
3729   switch (CallingConv) {
3730   default:
3731     return false;
3732   case CallingConv::X86_StdCall:
3733   case CallingConv::X86_FastCall:
3734   case CallingConv::X86_ThisCall:
3735     return !is64Bit;
3736   case CallingConv::Fast:
3737   case CallingConv::GHC:
3738   case CallingConv::HiPE:
3739     if (IsVarArg)
3740       return false;
3741     return TailCallOpt;
3742   }
3743 }
3744
3745 /// \brief Return true if the condition is an unsigned comparison operation.
3746 static bool isX86CCUnsigned(unsigned X86CC) {
3747   switch (X86CC) {
3748   default: llvm_unreachable("Invalid integer condition!");
3749   case X86::COND_E:     return true;
3750   case X86::COND_G:     return false;
3751   case X86::COND_GE:    return false;
3752   case X86::COND_L:     return false;
3753   case X86::COND_LE:    return false;
3754   case X86::COND_NE:    return true;
3755   case X86::COND_B:     return true;
3756   case X86::COND_A:     return true;
3757   case X86::COND_BE:    return true;
3758   case X86::COND_AE:    return true;
3759   }
3760   llvm_unreachable("covered switch fell through?!");
3761 }
3762
3763 /// TranslateX86CC - do a one to one translation of a ISD::CondCode to the X86
3764 /// specific condition code, returning the condition code and the LHS/RHS of the
3765 /// comparison to make.
3766 static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
3767                                SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
3768   if (!isFP) {
3769     if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
3770       if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
3771         // X > -1   -> X == 0, jump !sign.
3772         RHS = DAG.getConstant(0, RHS.getValueType());
3773         return X86::COND_NS;
3774       }
3775       if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
3776         // X < 0   -> X == 0, jump on sign.
3777         return X86::COND_S;
3778       }
3779       if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
3780         // X < 1   -> X <= 0
3781         RHS = DAG.getConstant(0, RHS.getValueType());
3782         return X86::COND_LE;
3783       }
3784     }
3785
3786     switch (SetCCOpcode) {
3787     default: llvm_unreachable("Invalid integer condition!");
3788     case ISD::SETEQ:  return X86::COND_E;
3789     case ISD::SETGT:  return X86::COND_G;
3790     case ISD::SETGE:  return X86::COND_GE;
3791     case ISD::SETLT:  return X86::COND_L;
3792     case ISD::SETLE:  return X86::COND_LE;
3793     case ISD::SETNE:  return X86::COND_NE;
3794     case ISD::SETULT: return X86::COND_B;
3795     case ISD::SETUGT: return X86::COND_A;
3796     case ISD::SETULE: return X86::COND_BE;
3797     case ISD::SETUGE: return X86::COND_AE;
3798     }
3799   }
3800
3801   // First determine if it is required or is profitable to flip the operands.
3802
3803   // If LHS is a foldable load, but RHS is not, flip the condition.
3804   if (ISD::isNON_EXTLoad(LHS.getNode()) &&
3805       !ISD::isNON_EXTLoad(RHS.getNode())) {
3806     SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
3807     std::swap(LHS, RHS);
3808   }
3809
3810   switch (SetCCOpcode) {
3811   default: break;
3812   case ISD::SETOLT:
3813   case ISD::SETOLE:
3814   case ISD::SETUGT:
3815   case ISD::SETUGE:
3816     std::swap(LHS, RHS);
3817     break;
3818   }
3819
3820   // On a floating point condition, the flags are set as follows:
3821   // ZF  PF  CF   op
3822   //  0 | 0 | 0 | X > Y
3823   //  0 | 0 | 1 | X < Y
3824   //  1 | 0 | 0 | X == Y
3825   //  1 | 1 | 1 | unordered
3826   switch (SetCCOpcode) {
3827   default: llvm_unreachable("Condcode should be pre-legalized away");
3828   case ISD::SETUEQ:
3829   case ISD::SETEQ:   return X86::COND_E;
3830   case ISD::SETOLT:              // flipped
3831   case ISD::SETOGT:
3832   case ISD::SETGT:   return X86::COND_A;
3833   case ISD::SETOLE:              // flipped
3834   case ISD::SETOGE:
3835   case ISD::SETGE:   return X86::COND_AE;
3836   case ISD::SETUGT:              // flipped
3837   case ISD::SETULT:
3838   case ISD::SETLT:   return X86::COND_B;
3839   case ISD::SETUGE:              // flipped
3840   case ISD::SETULE:
3841   case ISD::SETLE:   return X86::COND_BE;
3842   case ISD::SETONE:
3843   case ISD::SETNE:   return X86::COND_NE;
3844   case ISD::SETUO:   return X86::COND_P;
3845   case ISD::SETO:    return X86::COND_NP;
3846   case ISD::SETOEQ:
3847   case ISD::SETUNE:  return X86::COND_INVALID;
3848   }
3849 }
3850
3851 /// hasFPCMov - is there a floating point cmov for the specific X86 condition
3852 /// code. Current x86 isa includes the following FP cmov instructions:
3853 /// fcmovb, fcomvbe, fcomve, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
3854 static bool hasFPCMov(unsigned X86CC) {
3855   switch (X86CC) {
3856   default:
3857     return false;
3858   case X86::COND_B:
3859   case X86::COND_BE:
3860   case X86::COND_E:
3861   case X86::COND_P:
3862   case X86::COND_A:
3863   case X86::COND_AE:
3864   case X86::COND_NE:
3865   case X86::COND_NP:
3866     return true;
3867   }
3868 }
3869
3870 /// isFPImmLegal - Returns true if the target can instruction select the
3871 /// specified FP immediate natively. If false, the legalizer will
3872 /// materialize the FP immediate as a load from a constant pool.
3873 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
3874   for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
3875     if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
3876       return true;
3877   }
3878   return false;
3879 }
3880
3881 bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
3882                                               ISD::LoadExtType ExtTy,
3883                                               EVT NewVT) const {
3884   // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
3885   // relocation target a movq or addq instruction: don't let the load shrink.
3886   SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
3887   if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
3888     if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
3889       return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
3890   return true;
3891 }
3892
3893 /// \brief Returns true if it is beneficial to convert a load of a constant
3894 /// to just the constant itself.
3895 bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
3896                                                           Type *Ty) const {
3897   assert(Ty->isIntegerTy());
3898
3899   unsigned BitSize = Ty->getPrimitiveSizeInBits();
3900   if (BitSize == 0 || BitSize > 64)
3901     return false;
3902   return true;
3903 }
3904
3905 bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT,
3906                                                 unsigned Index) const {
3907   if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
3908     return false;
3909
3910   return (Index == 0 || Index == ResVT.getVectorNumElements());
3911 }
3912
3913 bool X86TargetLowering::isCheapToSpeculateCttz() const {
3914   // Speculate cttz only if we can directly use TZCNT.
3915   return Subtarget->hasBMI();
3916 }
3917
3918 bool X86TargetLowering::isCheapToSpeculateCtlz() const {
3919   // Speculate ctlz only if we can directly use LZCNT.
3920   return Subtarget->hasLZCNT();
3921 }
3922
/// isUndefOrInRange - Return true if Val is undef (less than zero) or if its
/// value falls within the half-open range [Low, Hi): Low is included, Hi is
/// excluded.
static bool isUndefOrInRange(int Val, int Low, int Hi) {
  // Negative values stand for undef and are always accepted.
  if (Val < 0)
    return true;
  return Low <= Val && Val < Hi;
}
3928
/// isUndefOrEqual - Return true if Val is either undef (less than zero) or
/// equal to CmpVal.
static bool isUndefOrEqual(int Val, int CmpVal) {
  if (Val == CmpVal)
    return true;
  return Val < 0;
}
3934
3935 /// isSequentialOrUndefInRange - Return true if every element in Mask, beginning
3936 /// from position Pos and ending in Pos+Size, falls within the specified
3937 /// sequential range (Low, Low+Size]. or is undef.
3938 static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
3939                                        unsigned Pos, unsigned Size, int Low) {
3940   for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
3941     if (!isUndefOrEqual(Mask[i], Low))
3942       return false;
3943   return true;
3944 }
3945
3946 /// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
3947 /// is suitable for input to PSHUFD. That is, it doesn't reference the other
3948 /// operand - by default will match for first operand.
3949 static bool isPSHUFDMask(ArrayRef<int> Mask, MVT VT,
3950                          bool TestSecondOperand = false) {
3951   if (VT != MVT::v4f32 && VT != MVT::v4i32 &&
3952       VT != MVT::v2f64 && VT != MVT::v2i64)
3953     return false;
3954
3955   unsigned NumElems = VT.getVectorNumElements();
3956   unsigned Lo = TestSecondOperand ? NumElems : 0;
3957   unsigned Hi = Lo + NumElems;
3958
3959   for (unsigned i = 0; i < NumElems; ++i)
3960     if (!isUndefOrInRange(Mask[i], (int)Lo, (int)Hi))
3961       return false;
3962
3963   return true;
3964 }
3965
3966 /// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that
3967 /// is suitable for input to PSHUFHW.
3968 static bool isPSHUFHWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
3969   if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
3970     return false;
3971
3972   // Lower quadword copied in order or undef.
3973   if (!isSequentialOrUndefInRange(Mask, 0, 4, 0))
3974     return false;
3975
3976   // Upper quadword shuffled.
3977   for (unsigned i = 4; i != 8; ++i)
3978     if (!isUndefOrInRange(Mask[i], 4, 8))
3979       return false;
3980
3981   if (VT == MVT::v16i16) {
3982     // Lower quadword copied in order or undef.
3983     if (!isSequentialOrUndefInRange(Mask, 8, 4, 8))
3984       return false;
3985
3986     // Upper quadword shuffled.
3987     for (unsigned i = 12; i != 16; ++i)
3988       if (!isUndefOrInRange(Mask[i], 12, 16))
3989         return false;
3990   }
3991
3992   return true;
3993 }
3994
3995 /// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that
3996 /// is suitable for input to PSHUFLW.
3997 static bool isPSHUFLWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
3998   if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
3999     return false;
4000
4001   // Upper quadword copied in order.
4002   if (!isSequentialOrUndefInRange(Mask, 4, 4, 4))
4003     return false;
4004
4005   // Lower quadword shuffled.
4006   for (unsigned i = 0; i != 4; ++i)
4007     if (!isUndefOrInRange(Mask[i], 0, 4))
4008       return false;
4009
4010   if (VT == MVT::v16i16) {
4011     // Upper quadword copied in order.
4012     if (!isSequentialOrUndefInRange(Mask, 12, 4, 12))
4013       return false;
4014
4015     // Lower quadword shuffled.
4016     for (unsigned i = 8; i != 12; ++i)
4017       if (!isUndefOrInRange(Mask[i], 8, 12))
4018         return false;
4019   }
4020
4021   return true;
4022 }
4023
4024 /// \brief Return true if the mask specifies a shuffle of elements that is
4025 /// suitable for input to intralane (palignr) or interlane (valign) vector
4026 /// right-shift.
4027 static bool isAlignrMask(ArrayRef<int> Mask, MVT VT, bool InterLane) {
4028   unsigned NumElts = VT.getVectorNumElements();
4029   unsigned NumLanes = InterLane ? 1: VT.getSizeInBits()/128;
4030   unsigned NumLaneElts = NumElts/NumLanes;
4031
4032   // Do not handle 64-bit element shuffles with palignr.
4033   if (NumLaneElts == 2)
4034     return false;
4035
4036   for (unsigned l = 0; l != NumElts; l+=NumLaneElts) {
4037     unsigned i;
4038     for (i = 0; i != NumLaneElts; ++i) {
4039       if (Mask[i+l] >= 0)
4040         break;
4041     }
4042
4043     // Lane is all undef, go to next lane
4044     if (i == NumLaneElts)
4045       continue;
4046
4047     int Start = Mask[i+l];
4048
4049     // Make sure its in this lane in one of the sources
4050     if (!isUndefOrInRange(Start, l, l+NumLaneElts) &&
4051         !isUndefOrInRange(Start, l+NumElts, l+NumElts+NumLaneElts))
4052       return false;
4053
4054     // If not lane 0, then we must match lane 0
4055     if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Start, Mask[i]+l))
4056       return false;
4057
4058     // Correct second source to be contiguous with first source
4059     if (Start >= (int)NumElts)
4060       Start -= NumElts - NumLaneElts;
4061
4062     // Make sure we're shifting in the right direction.
4063     if (Start <= (int)(i+l))
4064       return false;
4065
4066     Start -= i;
4067
4068     // Check the rest of the elements to see if they are consecutive.
4069     for (++i; i != NumLaneElts; ++i) {
4070       int Idx = Mask[i+l];
4071
4072       // Make sure its in this lane
4073       if (!isUndefOrInRange(Idx, l, l+NumLaneElts) &&
4074           !isUndefOrInRange(Idx, l+NumElts, l+NumElts+NumLaneElts))
4075         return false;
4076
4077       // If not lane 0, then we must match lane 0
4078       if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Idx, Mask[i]+l))
4079         return false;
4080
4081       if (Idx >= (int)NumElts)
4082         Idx -= NumElts - NumLaneElts;
4083
4084       if (!isUndefOrEqual(Idx, Start+i))
4085         return false;
4086
4087     }
4088   }
4089
4090   return true;
4091 }
4092
4093 /// \brief Return true if the node specifies a shuffle of elements that is
4094 /// suitable for input to PALIGNR.
4095 static bool isPALIGNRMask(ArrayRef<int> Mask, MVT VT,
4096                           const X86Subtarget *Subtarget) {
4097   if ((VT.is128BitVector() && !Subtarget->hasSSSE3()) ||
4098       (VT.is256BitVector() && !Subtarget->hasInt256()) ||
4099       VT.is512BitVector())
4100     // FIXME: Add AVX512BW.
4101     return false;
4102
4103   return isAlignrMask(Mask, VT, false);
4104 }
4105
4106 /// \brief Return true if the node specifies a shuffle of elements that is
4107 /// suitable for input to VALIGN.
4108 static bool isVALIGNMask(ArrayRef<int> Mask, MVT VT,
4109                           const X86Subtarget *Subtarget) {
4110   // FIXME: Add AVX512VL.
4111   if (!VT.is512BitVector() || !Subtarget->hasAVX512())
4112     return false;
4113   return isAlignrMask(Mask, VT, true);
4114 }
4115
4116 /// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
4117 /// the two vector operands have swapped position.
4118 static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask,
4119                                      unsigned NumElems) {
4120   for (unsigned i = 0; i != NumElems; ++i) {
4121     int idx = Mask[i];
4122     if (idx < 0)
4123       continue;
4124     else if (idx < (int)NumElems)
4125       Mask[i] = idx + NumElems;
4126     else
4127       Mask[i] = idx - NumElems;
4128   }
4129 }
4130
4131 /// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
4132 /// specifies a shuffle of elements that is suitable for input to 128/256-bit
4133 /// SHUFPS and SHUFPD. If Commuted is true, then it checks for sources to be
4134 /// reverse of what x86 shuffles want.
4135 static bool isSHUFPMask(ArrayRef<int> Mask, MVT VT, bool Commuted = false) {
4136
4137   unsigned NumElems = VT.getVectorNumElements();
4138   unsigned NumLanes = VT.getSizeInBits()/128;
4139   unsigned NumLaneElems = NumElems/NumLanes;
4140
4141   if (NumLaneElems != 2 && NumLaneElems != 4)
4142     return false;
4143
4144   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
4145   bool symmetricMaskRequired =
4146     (VT.getSizeInBits() >= 256) && (EltSize == 32);
4147
4148   // VSHUFPSY divides the resulting vector into 4 chunks.
4149   // The sources are also splitted into 4 chunks, and each destination
4150   // chunk must come from a different source chunk.
4151   //
4152   //  SRC1 =>   X7    X6    X5    X4    X3    X2    X1    X0
4153   //  SRC2 =>   Y7    Y6    Y5    Y4    Y3    Y2    Y1    Y9
4154   //
4155   //  DST  =>  Y7..Y4,   Y7..Y4,   X7..X4,   X7..X4,
4156   //           Y3..Y0,   Y3..Y0,   X3..X0,   X3..X0
4157   //
4158   // VSHUFPDY divides the resulting vector into 4 chunks.
4159   // The sources are also splitted into 4 chunks, and each destination
4160   // chunk must come from a different source chunk.
4161   //
4162   //  SRC1 =>      X3       X2       X1       X0
4163   //  SRC2 =>      Y3       Y2       Y1       Y0
4164   //
4165   //  DST  =>  Y3..Y2,  X3..X2,  Y1..Y0,  X1..X0
4166   //
4167   SmallVector<int, 4> MaskVal(NumLaneElems, -1);
4168   unsigned HalfLaneElems = NumLaneElems/2;
4169   for (unsigned l = 0; l != NumElems; l += NumLaneElems) {
4170     for (unsigned i = 0; i != NumLaneElems; ++i) {
4171       int Idx = Mask[i+l];
4172       unsigned RngStart = l + ((Commuted == (i<HalfLaneElems)) ? NumElems : 0);
4173       if (!isUndefOrInRange(Idx, RngStart, RngStart+NumLaneElems))
4174         return false;
4175       // For VSHUFPSY, the mask of the second half must be the same as the
4176       // first but with the appropriate offsets. This works in the same way as
4177       // VPERMILPS works with masks.
4178       if (!symmetricMaskRequired || Idx < 0)
4179         continue;
4180       if (MaskVal[i] < 0) {
4181         MaskVal[i] = Idx - l;
4182         continue;
4183       }
4184       if ((signed)(Idx - l) != MaskVal[i])
4185         return false;
4186     }
4187   }
4188
4189   return true;
4190 }
4191
4192 /// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
4193 /// specifies a shuffle of elements that is suitable for input to MOVHLPS.
4194 static bool isMOVHLPSMask(ArrayRef<int> Mask, MVT VT) {
4195   if (!VT.is128BitVector())
4196     return false;
4197
4198   unsigned NumElems = VT.getVectorNumElements();
4199
4200   if (NumElems != 4)
4201     return false;
4202
4203   // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3
4204   return isUndefOrEqual(Mask[0], 6) &&
4205          isUndefOrEqual(Mask[1], 7) &&
4206          isUndefOrEqual(Mask[2], 2) &&
4207          isUndefOrEqual(Mask[3], 3);
4208 }
4209
4210 /// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
4211 /// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
4212 /// <2, 3, 2, 3>
4213 static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, MVT VT) {
4214   if (!VT.is128BitVector())
4215     return false;
4216
4217   unsigned NumElems = VT.getVectorNumElements();
4218
4219   if (NumElems != 4)
4220     return false;
4221
4222   return isUndefOrEqual(Mask[0], 2) &&
4223          isUndefOrEqual(Mask[1], 3) &&
4224          isUndefOrEqual(Mask[2], 2) &&
4225          isUndefOrEqual(Mask[3], 3);
4226 }
4227
4228 /// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
4229 /// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
4230 static bool isMOVLPMask(ArrayRef<int> Mask, MVT VT) {
4231   if (!VT.is128BitVector())
4232     return false;
4233
4234   unsigned NumElems = VT.getVectorNumElements();
4235
4236   if (NumElems != 2 && NumElems != 4)
4237     return false;
4238
4239   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
4240     if (!isUndefOrEqual(Mask[i], i + NumElems))
4241       return false;
4242
4243   for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
4244     if (!isUndefOrEqual(Mask[i], i))
4245       return false;
4246
4247   return true;
4248 }
4249
4250 /// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand
4251 /// specifies a shuffle of elements that is suitable for input to MOVLHPS.
4252 static bool isMOVLHPSMask(ArrayRef<int> Mask, MVT VT) {
4253   if (!VT.is128BitVector())
4254     return false;
4255
4256   unsigned NumElems = VT.getVectorNumElements();
4257
4258   if (NumElems != 2 && NumElems != 4)
4259     return false;
4260
4261   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
4262     if (!isUndefOrEqual(Mask[i], i))
4263       return false;
4264
4265   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
4266     if (!isUndefOrEqual(Mask[i + e], i + NumElems))
4267       return false;
4268
4269   return true;
4270 }
4271
4272 /// isINSERTPSMask - Return true if the specified VECTOR_SHUFFLE operand
4273 /// specifies a shuffle of elements that is suitable for input to INSERTPS.
4274 /// i. e: If all but one element come from the same vector.
4275 static bool isINSERTPSMask(ArrayRef<int> Mask, MVT VT) {
4276   // TODO: Deal with AVX's VINSERTPS
4277   if (!VT.is128BitVector() || (VT != MVT::v4f32 && VT != MVT::v4i32))
4278     return false;
4279
4280   unsigned CorrectPosV1 = 0;
4281   unsigned CorrectPosV2 = 0;
4282   for (int i = 0, e = (int)VT.getVectorNumElements(); i != e; ++i) {
4283     if (Mask[i] == -1) {
4284       ++CorrectPosV1;
4285       ++CorrectPosV2;
4286       continue;
4287     }
4288
4289     if (Mask[i] == i)
4290       ++CorrectPosV1;
4291     else if (Mask[i] == i + 4)
4292       ++CorrectPosV2;
4293   }
4294
4295   if (CorrectPosV1 == 3 || CorrectPosV2 == 3)
4296     // We have 3 elements (undefs count as elements from any vector) from one
4297     // vector, and one from another.
4298     return true;
4299
4300   return false;
4301 }
4302
4303 //
4304 // Some special combinations that can be optimized.
4305 //
4306 static
4307 SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp,
4308                                SelectionDAG &DAG) {
4309   MVT VT = SVOp->getSimpleValueType(0);
4310   SDLoc dl(SVOp);
4311
4312   if (VT != MVT::v8i32 && VT != MVT::v8f32)
4313     return SDValue();
4314
4315   ArrayRef<int> Mask = SVOp->getMask();
4316
4317   // These are the special masks that may be optimized.
4318   static const int MaskToOptimizeEven[] = {0, 8, 2, 10, 4, 12, 6, 14};
4319   static const int MaskToOptimizeOdd[]  = {1, 9, 3, 11, 5, 13, 7, 15};
4320   bool MatchEvenMask = true;
4321   bool MatchOddMask  = true;
4322   for (int i=0; i<8; ++i) {
4323     if (!isUndefOrEqual(Mask[i], MaskToOptimizeEven[i]))
4324       MatchEvenMask = false;
4325     if (!isUndefOrEqual(Mask[i], MaskToOptimizeOdd[i]))
4326       MatchOddMask = false;
4327   }
4328
4329   if (!MatchEvenMask && !MatchOddMask)
4330     return SDValue();
4331
4332   SDValue UndefNode = DAG.getNode(ISD::UNDEF, dl, VT);
4333
4334   SDValue Op0 = SVOp->getOperand(0);
4335   SDValue Op1 = SVOp->getOperand(1);
4336
4337   if (MatchEvenMask) {
4338     // Shift the second operand right to 32 bits.
4339     static const int ShiftRightMask[] = {-1, 0, -1, 2, -1, 4, -1, 6 };
4340     Op1 = DAG.getVectorShuffle(VT, dl, Op1, UndefNode, ShiftRightMask);
4341   } else {
4342     // Shift the first operand left to 32 bits.
4343     static const int ShiftLeftMask[] = {1, -1, 3, -1, 5, -1, 7, -1 };
4344     Op0 = DAG.getVectorShuffle(VT, dl, Op0, UndefNode, ShiftLeftMask);
4345   }
4346   static const int BlendMask[] = {0, 9, 2, 11, 4, 13, 6, 15};
4347   return DAG.getVectorShuffle(VT, dl, Op0, Op1, BlendMask);
4348 }
4349
4350 /// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
4351 /// specifies a shuffle of elements that is suitable for input to UNPCKL.
4352 static bool isUNPCKLMask(ArrayRef<int> Mask, MVT VT,
4353                          bool HasInt256, bool V2IsSplat = false) {
4354
4355   assert(VT.getSizeInBits() >= 128 &&
4356          "Unsupported vector type for unpckl");
4357
4358   unsigned NumElts = VT.getVectorNumElements();
4359   if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
4360       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
4361     return false;
4362
4363   assert((!VT.is512BitVector() || VT.getScalarType().getSizeInBits() >= 32) &&
4364          "Unsupported vector type for unpckh");
4365
4366   // AVX defines UNPCK* to operate independently on 128-bit lanes.
4367   unsigned NumLanes = VT.getSizeInBits()/128;
4368   unsigned NumLaneElts = NumElts/NumLanes;
4369
4370   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
4371     for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
4372       int BitI  = Mask[l+i];
4373       int BitI1 = Mask[l+i+1];
4374       if (!isUndefOrEqual(BitI, j))
4375         return false;
4376       if (V2IsSplat) {
4377         if (!isUndefOrEqual(BitI1, NumElts))
4378           return false;
4379       } else {
4380         if (!isUndefOrEqual(BitI1, j + NumElts))
4381           return false;
4382       }
4383     }
4384   }
4385
4386   return true;
4387 }
4388
4389 /// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
4390 /// specifies a shuffle of elements that is suitable for input to UNPCKH.
4391 static bool isUNPCKHMask(ArrayRef<int> Mask, MVT VT,
4392                          bool HasInt256, bool V2IsSplat = false) {
4393   assert(VT.getSizeInBits() >= 128 &&
4394          "Unsupported vector type for unpckh");
4395
4396   unsigned NumElts = VT.getVectorNumElements();
4397   if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
4398       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
4399     return false;
4400
4401   assert((!VT.is512BitVector() || VT.getScalarType().getSizeInBits() >= 32) &&
4402          "Unsupported vector type for unpckh");
4403
4404   // AVX defines UNPCK* to operate independently on 128-bit lanes.
4405   unsigned NumLanes = VT.getSizeInBits()/128;
4406   unsigned NumLaneElts = NumElts/NumLanes;
4407
4408   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
4409     for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
4410       int BitI  = Mask[l+i];
4411       int BitI1 = Mask[l+i+1];
4412       if (!isUndefOrEqual(BitI, j))
4413         return false;
4414       if (V2IsSplat) {
4415         if (isUndefOrEqual(BitI1, NumElts))
4416           return false;
4417       } else {
4418         if (!isUndefOrEqual(BitI1, j+NumElts))
4419           return false;
4420       }
4421     }
4422   }
4423   return true;
4424 }
4425
4426 /// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
4427 /// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
4428 /// <0, 0, 1, 1>
4429 static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
4430   unsigned NumElts = VT.getVectorNumElements();
4431   bool Is256BitVec = VT.is256BitVector();
4432
4433   if (VT.is512BitVector())
4434     return false;
4435   assert((VT.is128BitVector() || VT.is256BitVector()) &&
4436          "Unsupported vector type for unpckh");
4437
4438   if (Is256BitVec && NumElts != 4 && NumElts != 8 &&
4439       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
4440     return false;
4441
4442   // For 256-bit i64/f64, use MOVDDUPY instead, so reject the matching pattern
4443   // FIXME: Need a better way to get rid of this, there's no latency difference
4444   // between UNPCKLPD and MOVDDUP, the later should always be checked first and
4445   // the former later. We should also remove the "_undef" special mask.
4446   if (NumElts == 4 && Is256BitVec)
4447     return false;
4448
4449   // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
4450   // independently on 128-bit lanes.
4451   unsigned NumLanes = VT.getSizeInBits()/128;
4452   unsigned NumLaneElts = NumElts/NumLanes;
4453
4454   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
4455     for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
4456       int BitI  = Mask[l+i];
4457       int BitI1 = Mask[l+i+1];
4458
4459       if (!isUndefOrEqual(BitI, j))
4460         return false;
4461       if (!isUndefOrEqual(BitI1, j))
4462         return false;
4463     }
4464   }
4465
4466   return true;
4467 }
4468
4469 /// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
4470 /// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
4471 /// <2, 2, 3, 3>
4472 static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
4473   unsigned NumElts = VT.getVectorNumElements();
4474
4475   if (VT.is512BitVector())
4476     return false;
4477
4478   assert((VT.is128BitVector() || VT.is256BitVector()) &&
4479          "Unsupported vector type for unpckh");
4480
4481   if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
4482       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
4483     return false;
4484
4485   // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
4486   // independently on 128-bit lanes.
4487   unsigned NumLanes = VT.getSizeInBits()/128;
4488   unsigned NumLaneElts = NumElts/NumLanes;
4489
4490   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
4491     for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
4492       int BitI  = Mask[l+i];
4493       int BitI1 = Mask[l+i+1];
4494       if (!isUndefOrEqual(BitI, j))
4495         return false;
4496       if (!isUndefOrEqual(BitI1, j))
4497         return false;
4498     }
4499   }
4500   return true;
4501 }
4502
4503 // Match for INSERTI64x4 INSERTF64x4 instructions (src0[0], src1[0]) or
4504 // (src1[0], src0[1]), manipulation with 256-bit sub-vectors
4505 static bool isINSERT64x4Mask(ArrayRef<int> Mask, MVT VT, unsigned int *Imm) {
4506   if (!VT.is512BitVector())
4507     return false;
4508
4509   unsigned NumElts = VT.getVectorNumElements();
4510   unsigned HalfSize = NumElts/2;
4511   if (isSequentialOrUndefInRange(Mask, 0, HalfSize, 0)) {
4512     if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, NumElts)) {
4513       *Imm = 1;
4514       return true;
4515     }
4516   }
4517   if (isSequentialOrUndefInRange(Mask, 0, HalfSize, NumElts)) {
4518     if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, HalfSize)) {
4519       *Imm = 0;
4520       return true;
4521     }
4522   }
4523   return false;
4524 }
4525
4526 /// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
4527 /// specifies a shuffle of elements that is suitable for input to MOVSS,
4528 /// MOVSD, and MOVD, i.e. setting the lowest element.
4529 static bool isMOVLMask(ArrayRef<int> Mask, EVT VT) {
4530   if (VT.getVectorElementType().getSizeInBits() < 32)
4531     return false;
4532   if (!VT.is128BitVector())
4533     return false;
4534
4535   unsigned NumElts = VT.getVectorNumElements();
4536
4537   if (!isUndefOrEqual(Mask[0], NumElts))
4538     return false;
4539
4540   for (unsigned i = 1; i != NumElts; ++i)
4541     if (!isUndefOrEqual(Mask[i], i))
4542       return false;
4543
4544   return true;
4545 }
4546
4547 /// isVPERM2X128Mask - Match 256-bit shuffles where the elements are considered
4548 /// as permutations between 128-bit chunks or halves. As an example: this
4549 /// shuffle bellow:
4550 ///   vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15>
4551 /// The first half comes from the second half of V1 and the second half from the
4552 /// the second half of V2.
4553 static bool isVPERM2X128Mask(ArrayRef<int> Mask, MVT VT, bool HasFp256) {
4554   if (!HasFp256 || !VT.is256BitVector())
4555     return false;
4556
4557   // The shuffle result is divided into half A and half B. In total the two
4558   // sources have 4 halves, namely: C, D, E, F. The final values of A and
4559   // B must come from C, D, E or F.
4560   unsigned HalfSize = VT.getVectorNumElements()/2;
4561   bool MatchA = false, MatchB = false;
4562
4563   // Check if A comes from one of C, D, E, F.
4564   for (unsigned Half = 0; Half != 4; ++Half) {
4565     if (isSequentialOrUndefInRange(Mask, 0, HalfSize, Half*HalfSize)) {
4566       MatchA = true;
4567       break;
4568     }
4569   }
4570
4571   // Check if B comes from one of C, D, E, F.
4572   for (unsigned Half = 0; Half != 4; ++Half) {
4573     if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, Half*HalfSize)) {
4574       MatchB = true;
4575       break;
4576     }
4577   }
4578
4579   return MatchA && MatchB;
4580 }
4581
4582 /// getShuffleVPERM2X128Immediate - Return the appropriate immediate to shuffle
4583 /// the specified VECTOR_MASK mask with VPERM2F128/VPERM2I128 instructions.
4584 static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) {
4585   MVT VT = SVOp->getSimpleValueType(0);
4586
4587   unsigned HalfSize = VT.getVectorNumElements()/2;
4588
4589   unsigned FstHalf = 0, SndHalf = 0;
4590   for (unsigned i = 0; i < HalfSize; ++i) {
4591     if (SVOp->getMaskElt(i) > 0) {
4592       FstHalf = SVOp->getMaskElt(i)/HalfSize;
4593       break;
4594     }
4595   }
4596   for (unsigned i = HalfSize; i < HalfSize*2; ++i) {
4597     if (SVOp->getMaskElt(i) > 0) {
4598       SndHalf = SVOp->getMaskElt(i)/HalfSize;
4599       break;
4600     }
4601   }
4602
4603   return (FstHalf | (SndHalf << 4));
4604 }
4605
4606 // Symmetric in-lane mask. Each lane has 4 elements (for imm8)
4607 static bool isPermImmMask(ArrayRef<int> Mask, MVT VT, unsigned& Imm8) {
4608   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
4609   if (EltSize < 32)
4610     return false;
4611
4612   unsigned NumElts = VT.getVectorNumElements();
4613   Imm8 = 0;
4614   if (VT.is128BitVector() || (VT.is256BitVector() && EltSize == 64)) {
4615     for (unsigned i = 0; i != NumElts; ++i) {
4616       if (Mask[i] < 0)
4617         continue;
4618       Imm8 |= Mask[i] << (i*2);
4619     }
4620     return true;
4621   }
4622
4623   unsigned LaneSize = 4;
4624   SmallVector<int, 4> MaskVal(LaneSize, -1);
4625
4626   for (unsigned l = 0; l != NumElts; l += LaneSize) {
4627     for (unsigned i = 0; i != LaneSize; ++i) {
4628       if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
4629         return false;
4630       if (Mask[i+l] < 0)
4631         continue;
4632       if (MaskVal[i] < 0) {
4633         MaskVal[i] = Mask[i+l] - l;
4634         Imm8 |= MaskVal[i] << (i*2);
4635         continue;
4636       }
4637       if (Mask[i+l] != (signed)(MaskVal[i]+l))
4638         return false;
4639     }
4640   }
4641   return true;
4642 }
4643
4644 /// isVPERMILPMask - Return true if the specified VECTOR_SHUFFLE operand
4645 /// specifies a shuffle of elements that is suitable for input to VPERMILPD*.
4646 /// Note that VPERMIL mask matching is different depending whether theunderlying
4647 /// type is 32 or 64. In the VPERMILPS the high half of the mask should point
4648 /// to the same elements of the low, but to the higher half of the source.
4649 /// In VPERMILPD the two lanes could be shuffled independently of each other
4650 /// with the same restriction that lanes can't be crossed. Also handles PSHUFDY.
4651 static bool isVPERMILPMask(ArrayRef<int> Mask, MVT VT) {
4652   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
4653   if (VT.getSizeInBits() < 256 || EltSize < 32)
4654     return false;
4655   bool symmetricMaskRequired = (EltSize == 32);
4656   unsigned NumElts = VT.getVectorNumElements();
4657
4658   unsigned NumLanes = VT.getSizeInBits()/128;
4659   unsigned LaneSize = NumElts/NumLanes;
4660   // 2 or 4 elements in one lane
4661
4662   SmallVector<int, 4> ExpectedMaskVal(LaneSize, -1);
4663   for (unsigned l = 0; l != NumElts; l += LaneSize) {
4664     for (unsigned i = 0; i != LaneSize; ++i) {
4665       if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
4666         return false;
4667       if (symmetricMaskRequired) {
4668         if (ExpectedMaskVal[i] < 0 && Mask[i+l] >= 0) {
4669           ExpectedMaskVal[i] = Mask[i+l] - l;
4670           continue;
4671         }
4672         if (!isUndefOrEqual(Mask[i+l], ExpectedMaskVal[i]+l))
4673           return false;
4674       }
4675     }
4676   }
4677   return true;
4678 }
4679
4680 /// isCommutedMOVLMask - Returns true if the shuffle mask is except the reverse
4681 /// of what x86 movss want. X86 movs requires the lowest  element to be lowest
4682 /// element of vector 2 and the other elements to come from vector 1 in order.
4683 static bool isCommutedMOVLMask(ArrayRef<int> Mask, MVT VT,
4684                                bool V2IsSplat = false, bool V2IsUndef = false) {
4685   if (!VT.is128BitVector())
4686     return false;
4687
4688   unsigned NumOps = VT.getVectorNumElements();
4689   if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
4690     return false;
4691
4692   if (!isUndefOrEqual(Mask[0], 0))
4693     return false;
4694
4695   for (unsigned i = 1; i != NumOps; ++i)
4696     if (!(isUndefOrEqual(Mask[i], i+NumOps) ||
4697           (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) ||
4698           (V2IsSplat && isUndefOrEqual(Mask[i], NumOps))))
4699       return false;
4700
4701   return true;
4702 }
4703
4704 /// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
4705 /// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
4706 /// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7>
4707 static bool isMOVSHDUPMask(ArrayRef<int> Mask, MVT VT,
4708                            const X86Subtarget *Subtarget) {
4709   if (!Subtarget->hasSSE3())
4710     return false;
4711
4712   unsigned NumElems = VT.getVectorNumElements();
4713
4714   if ((VT.is128BitVector() && NumElems != 4) ||
4715       (VT.is256BitVector() && NumElems != 8) ||
4716       (VT.is512BitVector() && NumElems != 16))
4717     return false;
4718
4719   // "i+1" is the value the indexed mask element must have
4720   for (unsigned i = 0; i != NumElems; i += 2)
4721     if (!isUndefOrEqual(Mask[i], i+1) ||
4722         !isUndefOrEqual(Mask[i+1], i+1))
4723       return false;
4724
4725   return true;
4726 }
4727
4728 /// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
4729 /// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
4730 /// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6>
4731 static bool isMOVSLDUPMask(ArrayRef<int> Mask, MVT VT,
4732                            const X86Subtarget *Subtarget) {
4733   if (!Subtarget->hasSSE3())
4734     return false;
4735
4736   unsigned NumElems = VT.getVectorNumElements();
4737
4738   if ((VT.is128BitVector() && NumElems != 4) ||
4739       (VT.is256BitVector() && NumElems != 8) ||
4740       (VT.is512BitVector() && NumElems != 16))
4741     return false;
4742
4743   // "i" is the value the indexed mask element must have
4744   for (unsigned i = 0; i != NumElems; i += 2)
4745     if (!isUndefOrEqual(Mask[i], i) ||
4746         !isUndefOrEqual(Mask[i+1], i))
4747       return false;
4748
4749   return true;
4750 }
4751
4752 /// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand
4753 /// specifies a shuffle of elements that is suitable for input to 256-bit
4754 /// version of MOVDDUP.
4755 static bool isMOVDDUPYMask(ArrayRef<int> Mask, MVT VT, bool HasFp256) {
4756   if (!HasFp256 || !VT.is256BitVector())
4757     return false;
4758
4759   unsigned NumElts = VT.getVectorNumElements();
4760   if (NumElts != 4)
4761     return false;
4762
4763   for (unsigned i = 0; i != NumElts/2; ++i)
4764     if (!isUndefOrEqual(Mask[i], 0))
4765       return false;
4766   for (unsigned i = NumElts/2; i != NumElts; ++i)
4767     if (!isUndefOrEqual(Mask[i], NumElts/2))
4768       return false;
4769   return true;
4770 }
4771
4772 /// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand
4773 /// specifies a shuffle of elements that is suitable for input to 128-bit
4774 /// version of MOVDDUP.
4775 static bool isMOVDDUPMask(ArrayRef<int> Mask, MVT VT) {
4776   if (!VT.is128BitVector())
4777     return false;
4778
4779   unsigned e = VT.getVectorNumElements() / 2;
4780   for (unsigned i = 0; i != e; ++i)
4781     if (!isUndefOrEqual(Mask[i], i))
4782       return false;
4783   for (unsigned i = 0; i != e; ++i)
4784     if (!isUndefOrEqual(Mask[e+i], i))
4785       return false;
4786   return true;
4787 }
4788
4789 /// isVEXTRACTIndex - Return true if the specified
4790 /// EXTRACT_SUBVECTOR operand specifies a vector extract that is
4791 /// suitable for instruction that extract 128 or 256 bit vectors
4792 static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
4793   assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
4794   if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
4795     return false;
4796
4797   // The index should be aligned on a vecWidth-bit boundary.
4798   uint64_t Index =
4799     cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
4800
4801   MVT VT = N->getSimpleValueType(0);
4802   unsigned ElSize = VT.getVectorElementType().getSizeInBits();
4803   bool Result = (Index * ElSize) % vecWidth == 0;
4804
4805   return Result;
4806 }
4807
4808 /// isVINSERTIndex - Return true if the specified INSERT_SUBVECTOR
4809 /// operand specifies a subvector insert that is suitable for input to
4810 /// insertion of 128 or 256-bit subvectors
4811 static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) {
4812   assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
4813   if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
4814     return false;
4815   // The index should be aligned on a vecWidth-bit boundary.
4816   uint64_t Index =
4817     cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
4818
4819   MVT VT = N->getSimpleValueType(0);
4820   unsigned ElSize = VT.getVectorElementType().getSizeInBits();
4821   bool Result = (Index * ElSize) % vecWidth == 0;
4822
4823   return Result;
4824 }
4825
/// Return the result of isVINSERTIndex for \p N with a vector width of 128
/// bits.
bool X86::isVINSERT128Index(SDNode *N) {
  return isVINSERTIndex(N, 128);
}
4829
/// Return the result of isVINSERTIndex for \p N with a vector width of 256
/// bits.
bool X86::isVINSERT256Index(SDNode *N) {
  return isVINSERTIndex(N, 256);
}
4833
/// Return the result of isVEXTRACTIndex for \p N with a vector width of 128
/// bits.
bool X86::isVEXTRACT128Index(SDNode *N) {
  return isVEXTRACTIndex(N, 128);
}
4837
/// Return the result of isVEXTRACTIndex for \p N with a vector width of 256
/// bits.
bool X86::isVEXTRACT256Index(SDNode *N) {
  return isVEXTRACTIndex(N, 256);
}
4841
4842 /// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
4843 /// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions.
4844 /// Handles 128-bit and 256-bit.
4845 static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) {
4846   MVT VT = N->getSimpleValueType(0);
4847
4848   assert((VT.getSizeInBits() >= 128) &&
4849          "Unsupported vector type for PSHUF/SHUFP");
4850
4851   // Handle 128 and 256-bit vector lengths. AVX defines PSHUF/SHUFP to operate
4852   // independently on 128-bit lanes.
4853   unsigned NumElts = VT.getVectorNumElements();
4854   unsigned NumLanes = VT.getSizeInBits()/128;
4855   unsigned NumLaneElts = NumElts/NumLanes;
4856
4857   assert((NumLaneElts == 2 || NumLaneElts == 4 || NumLaneElts == 8) &&
4858          "Only supports 2, 4 or 8 elements per lane");
4859
4860   unsigned Shift = (NumLaneElts >= 4) ? 1 : 0;
4861   unsigned Mask = 0;
4862   for (unsigned i = 0; i != NumElts; ++i) {
4863     int Elt = N->getMaskElt(i);
4864     if (Elt < 0) continue;
4865     Elt &= NumLaneElts - 1;
4866     unsigned ShAmt = (i << Shift) % 8;
4867     Mask |= Elt << ShAmt;
4868   }
4869
4870   return Mask;
4871 }
4872
4873 /// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
4874 /// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction.
4875 static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) {
4876   MVT VT = N->getSimpleValueType(0);
4877
4878   assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
4879          "Unsupported vector type for PSHUFHW");
4880
4881   unsigned NumElts = VT.getVectorNumElements();
4882
4883   unsigned Mask = 0;
4884   for (unsigned l = 0; l != NumElts; l += 8) {
4885     // 8 nodes per lane, but we only care about the last 4.
4886     for (unsigned i = 0; i < 4; ++i) {
4887       int Elt = N->getMaskElt(l+i+4);
4888       if (Elt < 0) continue;
4889       Elt &= 0x3; // only 2-bits.
4890       Mask |= Elt << (i * 2);
4891     }
4892   }
4893
4894   return Mask;
4895 }
4896
4897 /// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
4898 /// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction.
4899 static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) {
4900   MVT VT = N->getSimpleValueType(0);
4901
4902   assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
4903          "Unsupported vector type for PSHUFHW");
4904
4905   unsigned NumElts = VT.getVectorNumElements();
4906
4907   unsigned Mask = 0;
4908   for (unsigned l = 0; l != NumElts; l += 8) {
4909     // 8 nodes per lane, but we only care about the first 4.
4910     for (unsigned i = 0; i < 4; ++i) {
4911       int Elt = N->getMaskElt(l+i);
4912       if (Elt < 0) continue;
4913       Elt &= 0x3; // only 2-bits
4914       Mask |= Elt << (i * 2);
4915     }
4916   }
4917
4918   return Mask;
4919 }
4920
4921 /// \brief Return the appropriate immediate to shuffle the specified
4922 /// VECTOR_SHUFFLE mask with the PALIGNR (if InterLane is false) or with
4923 /// VALIGN (if Interlane is true) instructions.
4924 static unsigned getShuffleAlignrImmediate(ShuffleVectorSDNode *SVOp,
4925                                            bool InterLane) {
4926   MVT VT = SVOp->getSimpleValueType(0);
4927   unsigned EltSize = InterLane ? 1 :
4928     VT.getVectorElementType().getSizeInBits() >> 3;
4929
4930   unsigned NumElts = VT.getVectorNumElements();
4931   unsigned NumLanes = VT.is512BitVector() ? 1 : VT.getSizeInBits()/128;
4932   unsigned NumLaneElts = NumElts/NumLanes;
4933
4934   int Val = 0;
4935   unsigned i;
4936   for (i = 0; i != NumElts; ++i) {
4937     Val = SVOp->getMaskElt(i);
4938     if (Val >= 0)
4939       break;
4940   }
4941   if (Val >= (int)NumElts)
4942     Val -= NumElts - NumLaneElts;
4943
4944   assert(Val - i > 0 && "PALIGNR imm should be positive");
4945   return (Val - i) * EltSize;
4946 }
4947
4948 /// \brief Return the appropriate immediate to shuffle the specified
4949 /// VECTOR_SHUFFLE mask with the PALIGNR instruction.
4950 static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) {
4951   return getShuffleAlignrImmediate(SVOp, false);
4952 }
4953
4954 /// \brief Return the appropriate immediate to shuffle the specified
4955 /// VECTOR_SHUFFLE mask with the VALIGN instruction.
4956 static unsigned getShuffleVALIGNImmediate(ShuffleVectorSDNode *SVOp) {
4957   return getShuffleAlignrImmediate(SVOp, true);
4958 }
4959
4960
4961 static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
4962   assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
4963   if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
4964     llvm_unreachable("Illegal extract subvector for VEXTRACT");
4965
4966   uint64_t Index =
4967     cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
4968
4969   MVT VecVT = N->getOperand(0).getSimpleValueType();
4970   MVT ElVT = VecVT.getVectorElementType();
4971
4972   unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
4973   return Index / NumElemsPerChunk;
4974 }
4975
4976 static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) {
4977   assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
4978   if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
4979     llvm_unreachable("Illegal insert subvector for VINSERT");
4980
4981   uint64_t Index =
4982     cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
4983
4984   MVT VecVT = N->getSimpleValueType(0);
4985   MVT ElVT = VecVT.getVectorElementType();
4986
4987   unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
4988   return Index / NumElemsPerChunk;
4989 }
4990
4991 /// getExtractVEXTRACT128Immediate - Return the appropriate immediate
4992 /// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128
4993 /// and VINSERTI128 instructions.
4994 unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) {
4995   return getExtractVEXTRACTImmediate(N, 128);
4996 }
4997
4998 /// getExtractVEXTRACT256Immediate - Return the appropriate immediate
4999 /// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF64x4
5000 /// and VINSERTI64x4 instructions.
5001 unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) {
5002   return getExtractVEXTRACTImmediate(N, 256);
5003 }
5004
5005 /// getInsertVINSERT128Immediate - Return the appropriate immediate
5006 /// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128
5007 /// and VINSERTI128 instructions.
5008 unsigned X86::getInsertVINSERT128Immediate(SDNode *N) {
5009   return getInsertVINSERTImmediate(N, 128);
5010 }
5011
5012 /// getInsertVINSERT256Immediate - Return the appropriate immediate
5013 /// to insert at the specified INSERT_SUBVECTOR index with VINSERTF46x4
5014 /// and VINSERTI64x4 instructions.
5015 unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
5016   return getInsertVINSERTImmediate(N, 256);
5017 }
5018
5019 /// isZero - Returns true if Elt is a constant integer zero
5020 static bool isZero(SDValue V) {
5021   ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
5022   return C && C->isNullValue();
5023 }
5024
5025 /// isZeroNode - Returns true if Elt is a constant zero or a floating point
5026 /// constant +0.0.
5027 bool X86::isZeroNode(SDValue Elt) {
5028   if (isZero(Elt))
5029     return true;
5030   if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Elt))
5031     return CFP->getValueAPF().isPosZero();
5032   return false;
5033 }
5034
5035 /// ShouldXformToMOVHLPS - Return true if the node should be transformed to
5036 /// match movhlps. The lower half elements should come from upper half of
5037 /// V1 (and in order), and the upper half elements should come from the upper
5038 /// half of V2 (and in order).
5039 static bool ShouldXformToMOVHLPS(ArrayRef<int> Mask, MVT VT) {
5040   if (!VT.is128BitVector())
5041     return false;
5042   if (VT.getVectorNumElements() != 4)
5043     return false;
5044   for (unsigned i = 0, e = 2; i != e; ++i)
5045     if (!isUndefOrEqual(Mask[i], i+2))
5046       return false;
5047   for (unsigned i = 2; i != 4; ++i)
5048     if (!isUndefOrEqual(Mask[i], i+4))
5049       return false;
5050   return true;
5051 }
5052
5053 /// isScalarLoadToVector - Returns true if the node is a scalar load that
5054 /// is promoted to a vector. It also returns the LoadSDNode by reference if
5055 /// required.
5056 static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = nullptr) {
5057   if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
5058     return false;
5059   N = N->getOperand(0).getNode();
5060   if (!ISD::isNON_EXTLoad(N))
5061     return false;
5062   if (LD)
5063     *LD = cast<LoadSDNode>(N);
5064   return true;
5065 }
5066
5067 // Test whether the given value is a vector value which will be legalized
5068 // into a load.
5069 static bool WillBeConstantPoolLoad(SDNode *N) {
5070   if (N->getOpcode() != ISD::BUILD_VECTOR)
5071     return false;
5072
5073   // Check for any non-constant elements.
5074   for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
5075     switch (N->getOperand(i).getNode()->getOpcode()) {
5076     case ISD::UNDEF:
5077     case ISD::ConstantFP:
5078     case ISD::Constant:
5079       break;
5080     default:
5081       return false;
5082     }
5083
5084   // Vectors of all-zeros and all-ones are materialized with special
5085   // instructions rather than being loaded.
5086   return !ISD::isBuildVectorAllZeros(N) &&
5087          !ISD::isBuildVectorAllOnes(N);
5088 }
5089
5090 /// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
5091 /// match movlp{s|d}. The lower half elements should come from lower half of
5092 /// V1 (and in order), and the upper half elements should come from the upper
5093 /// half of V2 (and in order). And since V1 will become the source of the
5094 /// MOVLP, it must be either a vector load or a scalar load to vector.
5095 static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
5096                                ArrayRef<int> Mask, MVT VT) {
5097   if (!VT.is128BitVector())
5098     return false;
5099
5100   if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
5101     return false;
5102   // Is V2 is a vector load, don't do this transformation. We will try to use
5103   // load folding shufps op.
5104   if (ISD::isNON_EXTLoad(V2) || WillBeConstantPoolLoad(V2))
5105     return false;
5106
5107   unsigned NumElems = VT.getVectorNumElements();
5108
5109   if (NumElems != 2 && NumElems != 4)
5110     return false;
5111   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
5112     if (!isUndefOrEqual(Mask[i], i))
5113       return false;
5114   for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
5115     if (!isUndefOrEqual(Mask[i], i+NumElems))
5116       return false;
5117   return true;
5118 }
5119
5120 /// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
5121 /// to an zero vector.
5122 /// FIXME: move to dag combiner / method on ShuffleVectorSDNode
5123 static bool isZeroShuffle(ShuffleVectorSDNode *N) {
5124   SDValue V1 = N->getOperand(0);
5125   SDValue V2 = N->getOperand(1);
5126   unsigned NumElems = N->getValueType(0).getVectorNumElements();
5127   for (unsigned i = 0; i != NumElems; ++i) {
5128     int Idx = N->getMaskElt(i);
5129     if (Idx >= (int)NumElems) {
5130       unsigned Opc = V2.getOpcode();
5131       if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode()))
5132         continue;
5133       if (Opc != ISD::BUILD_VECTOR ||
5134           !X86::isZeroNode(V2.getOperand(Idx-NumElems)))
5135         return false;
5136     } else if (Idx >= 0) {
5137       unsigned Opc = V1.getOpcode();
5138       if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode()))
5139         continue;
5140       if (Opc != ISD::BUILD_VECTOR ||
5141           !X86::isZeroNode(V1.getOperand(Idx)))
5142         return false;
5143     }
5144   }
5145   return true;
5146 }
5147
5148 /// getZeroVector - Returns a vector of specified type with all zero elements.
5149 ///
5150 static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
5151                              SelectionDAG &DAG, SDLoc dl) {
5152   assert(VT.isVector() && "Expected a vector type");
5153
5154   // Always build SSE zero vectors as <4 x i32> bitcasted
5155   // to their dest type. This ensures they get CSE'd.
5156   SDValue Vec;
5157   if (VT.is128BitVector()) {  // SSE
5158     if (Subtarget->hasSSE2()) {  // SSE2
5159       SDValue Cst = DAG.getConstant(0, MVT::i32);
5160       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
5161     } else { // SSE1
5162       SDValue Cst = DAG.getConstantFP(+0.0, MVT::f32);
5163       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
5164     }
5165   } else if (VT.is256BitVector()) { // AVX
5166     if (Subtarget->hasInt256()) { // AVX2
5167       SDValue Cst = DAG.getConstant(0, MVT::i32);
5168       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
5169       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
5170     } else {
5171       // 256-bit logic and arithmetic instructions in AVX are all
5172       // floating-point, no support for integer ops. Emit fp zeroed vectors.
5173       SDValue Cst = DAG.getConstantFP(+0.0, MVT::f32);
5174       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
5175       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops);
5176     }
5177   } else if (VT.is512BitVector()) { // AVX-512
5178       SDValue Cst = DAG.getConstant(0, MVT::i32);
5179       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
5180                         Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
5181       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops);
5182   } else if (VT.getScalarType() == MVT::i1) {
5183     assert(VT.getVectorNumElements() <= 16 && "Unexpected vector type");
5184     SDValue Cst = DAG.getConstant(0, MVT::i1);
5185     SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
5186     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
5187   } else
5188     llvm_unreachable("Unexpected vector type");
5189
5190   return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
5191 }
5192
5193 /// getOnesVector - Returns a vector of specified type with all bits set.
5194 /// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
5195 /// no AVX2 supprt, use two <4 x i32> inserted in a <8 x i32> appropriately.
5196 /// Then bitcast to their original type, ensuring they get CSE'd.
5197 static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG,
5198                              SDLoc dl) {
5199   assert(VT.isVector() && "Expected a vector type");
5200
5201   SDValue Cst = DAG.getConstant(~0U, MVT::i32);
5202   SDValue Vec;
5203   if (VT.is256BitVector()) {
5204     if (HasInt256) { // AVX2
5205       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
5206       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
5207     } else { // AVX
5208       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
5209       Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
5210     }
5211   } else if (VT.is128BitVector()) {
5212     Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
5213   } else
5214     llvm_unreachable("Unexpected vector type");
5215
5216   return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
5217 }
5218
5219 /// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements
5220 /// that point to V2 points to its first element.
5221 static void NormalizeMask(SmallVectorImpl<int> &Mask, unsigned NumElems) {
5222   for (unsigned i = 0; i != NumElems; ++i) {
5223     if (Mask[i] > (int)NumElems) {
5224       Mask[i] = NumElems;
5225     }
5226   }
5227 }
5228
5229 /// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd
5230 /// operation of specified width.
5231 static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
5232                        SDValue V2) {
5233   unsigned NumElems = VT.getVectorNumElements();
5234   SmallVector<int, 8> Mask;
5235   Mask.push_back(NumElems);
5236   for (unsigned i = 1; i != NumElems; ++i)
5237     Mask.push_back(i);
5238   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
5239 }
5240
5241 /// getUnpackl - Returns a vector_shuffle node for an unpackl operation.
5242 static SDValue getUnpackl(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
5243                           SDValue V2) {
5244   unsigned NumElems = VT.getVectorNumElements();
5245   SmallVector<int, 8> Mask;
5246   for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
5247     Mask.push_back(i);
5248     Mask.push_back(i + NumElems);
5249   }
5250   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
5251 }
5252
5253 /// getUnpackh - Returns a vector_shuffle node for an unpackh operation.
5254 static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
5255                           SDValue V2) {
5256   unsigned NumElems = VT.getVectorNumElements();
5257   SmallVector<int, 8> Mask;
5258   for (unsigned i = 0, Half = NumElems/2; i != Half; ++i) {
5259     Mask.push_back(i + Half);
5260     Mask.push_back(i + NumElems + Half);
5261   }
5262   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
5263 }
5264
5265 // PromoteSplati8i16 - All i16 and i8 vector types can't be used directly by
5266 // a generic shuffle instruction because the target has no such instructions.
5267 // Generate shuffles which repeat i16 and i8 several times until they can be
5268 // represented by v4f32 and then be manipulated by target suported shuffles.
5269 static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) {
5270   MVT VT = V.getSimpleValueType();
5271   int NumElems = VT.getVectorNumElements();
5272   SDLoc dl(V);
5273
5274   while (NumElems > 4) {
5275     if (EltNo < NumElems/2) {
5276       V = getUnpackl(DAG, dl, VT, V, V);
5277     } else {
5278       V = getUnpackh(DAG, dl, VT, V, V);
5279       EltNo -= NumElems/2;
5280     }
5281     NumElems >>= 1;
5282   }
5283   return V;
5284 }
5285
5286 /// getLegalSplat - Generate a legal splat with supported x86 shuffles
5287 static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
5288   MVT VT = V.getSimpleValueType();
5289   SDLoc dl(V);
5290
5291   if (VT.is128BitVector()) {
5292     V = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V);
5293     int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
5294     V = DAG.getVectorShuffle(MVT::v4f32, dl, V, DAG.getUNDEF(MVT::v4f32),
5295                              &SplatMask[0]);
5296   } else if (VT.is256BitVector()) {
5297     // To use VPERMILPS to splat scalars, the second half of indicies must
5298     // refer to the higher part, which is a duplication of the lower one,
5299     // because VPERMILPS can only handle in-lane permutations.
5300     int SplatMask[8] = { EltNo, EltNo, EltNo, EltNo,
5301                          EltNo+4, EltNo+4, EltNo+4, EltNo+4 };
5302
5303     V = DAG.getNode(ISD::BITCAST, dl, MVT::v8f32, V);
5304     V = DAG.getVectorShuffle(MVT::v8f32, dl, V, DAG.getUNDEF(MVT::v8f32),
5305                              &SplatMask[0]);
5306   } else
5307     llvm_unreachable("Vector size not supported");
5308
5309   return DAG.getNode(ISD::BITCAST, dl, VT, V);
5310 }
5311
5312 /// PromoteSplat - Splat is promoted to target supported vector shuffles.
5313 static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
5314   MVT SrcVT = SV->getSimpleValueType(0);
5315   SDValue V1 = SV->getOperand(0);
5316   SDLoc dl(SV);
5317
5318   int EltNo = SV->getSplatIndex();
5319   int NumElems = SrcVT.getVectorNumElements();
5320   bool Is256BitVec = SrcVT.is256BitVector();
5321
5322   assert(((SrcVT.is128BitVector() && NumElems > 4) || Is256BitVec) &&
5323          "Unknown how to promote splat for type");
5324
5325   // Extract the 128-bit part containing the splat element and update
5326   // the splat element index when it refers to the higher register.
5327   if (Is256BitVec) {
5328     V1 = Extract128BitVector(V1, EltNo, DAG, dl);
5329     if (EltNo >= NumElems/2)
5330       EltNo -= NumElems/2;
5331   }
5332
5333   // All i16 and i8 vector types can't be used directly by a generic shuffle
5334   // instruction because the target has no such instruction. Generate shuffles
5335   // which repeat i16 and i8 several times until they fit in i32, and then can
5336   // be manipulated by target suported shuffles.
5337   MVT EltVT = SrcVT.getVectorElementType();
5338   if (EltVT == MVT::i8 || EltVT == MVT::i16)
5339     V1 = PromoteSplati8i16(V1, DAG, EltNo);
5340
5341   // Recreate the 256-bit vector and place the same 128-bit vector
5342   // into the low and high part. This is necessary because we want
5343   // to use VPERM* to shuffle the vectors
5344   if (Is256BitVec) {
5345     V1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT, V1, V1);
5346   }
5347
5348   return getLegalSplat(DAG, V1, EltNo);
5349 }
5350
5351 /// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
5352 /// vector of zero or undef vector.  This produces a shuffle where the low
5353 /// element of V2 is swizzled into the zero/undef vector, landing at element
5354 /// Idx.  This produces a shuffle mask like 4,1,2,3 (idx=0) or  0,1,2,4 (idx=3).
5355 static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
5356                                            bool IsZero,
5357                                            const X86Subtarget *Subtarget,
5358                                            SelectionDAG &DAG) {
5359   MVT VT = V2.getSimpleValueType();
5360   SDValue V1 = IsZero
5361     ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
5362   unsigned NumElems = VT.getVectorNumElements();
5363   SmallVector<int, 16> MaskVec;
5364   for (unsigned i = 0; i != NumElems; ++i)
5365     // If this is the insertion idx, put the low elt of V2 here.
5366     MaskVec.push_back(i == Idx ? NumElems : i);
5367   return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, &MaskVec[0]);
5368 }
5369
/// getTargetShuffleMask - Calculates the shuffle mask corresponding to the
/// target specific opcode. Returns true if the Mask could be calculated. Sets
/// IsUnary to true if only uses one source. Note that this will set IsUnary for
/// shuffles which use a single input multiple times, and in those cases it will
/// adjust the mask to only have indices within that single input.
///
/// \param N        The target shuffle node to decode.
/// \param VT       The vector type handed to the mask decoders.
/// \param Mask     Receives the decoded mask (filled in by the Decode* helpers).
/// \param IsUnary  Always written; reset to false before decoding starts.
/// \returns false for opcodes whose decoding is not implemented, for PSHUFB
///          masks that are neither a constant build vector nor a plain
///          constant-pool load, and when a decoder leaves Mask empty where
///          that is checked. Unknown opcodes hit llvm_unreachable.
static bool getTargetShuffleMask(SDNode *N, MVT VT,
                                 SmallVectorImpl<int> &Mask, bool &IsUnary) {
  unsigned NumElems = VT.getVectorNumElements();
  SDValue ImmN;

  IsUnary = false;
  // Set when a two-operand shuffle is given the same node for both operands;
  // the mask is then remapped onto the first input at the end.
  bool IsFakeUnary = false;
  switch(N->getOpcode()) {
  // Opcodes that carry an immediate read it from the last operand.
  case X86ISD::BLENDI:
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    break;
  case X86ISD::SHUFP:
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::UNPCKH:
    DecodeUNPCKHMask(VT, Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::UNPCKL:
    DecodeUNPCKLMask(VT, Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::MOVHLPS:
    DecodeMOVHLPSMask(NumElems, Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::MOVLHPS:
    DecodeMOVLHPSMask(NumElems, Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::PALIGNR:
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    break;
  case X86ISD::PSHUFD:
  case X86ISD::VPERMILPI:
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = true;
    break;
  case X86ISD::PSHUFHW:
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = true;
    break;
  case X86ISD::PSHUFLW:
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = true;
    break;
  case X86ISD::PSHUFB: {
    // The control mask is operand 1. It is decoded either from a constant
    // BUILD_VECTOR or from a constant-pool load; anything else gives up.
    IsUnary = true;
    SDValue MaskNode = N->getOperand(1);
    // Look through any bitcasts wrapped around the mask.
    while (MaskNode->getOpcode() == ISD::BITCAST)
      MaskNode = MaskNode->getOperand(0);

    if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) {
      // If we have a build-vector, then things are easy.
      // Note: this local VT (the mask's type) shadows the VT parameter.
      EVT VT = MaskNode.getValueType();
      assert(VT.isVector() &&
             "Can't produce a non-vector with a build_vector!");
      if (!VT.isInteger())
        return false;

      int NumBytesPerElement = VT.getVectorElementType().getSizeInBits() / 8;

      // Flatten the build vector into one raw mask value per byte.
      SmallVector<uint64_t, 32> RawMask;
      for (int i = 0, e = MaskNode->getNumOperands(); i < e; ++i) {
        SDValue Op = MaskNode->getOperand(i);
        if (Op->getOpcode() == ISD::UNDEF) {
          // An undef operand contributes a single sentinel entry.
          RawMask.push_back((uint64_t)SM_SentinelUndef);
          continue;
        }
        auto *CN = dyn_cast<ConstantSDNode>(Op.getNode());
        if (!CN)
          return false;
        APInt MaskElement = CN->getAPIntValue();

        // We now have to decode the element which could be any integer size and
        // extract each byte of it.
        for (int j = 0; j < NumBytesPerElement; ++j) {
          // Note that this is x86 and so always little endian: the low byte is
          // the first byte of the mask.
          RawMask.push_back(MaskElement.getLoBits(8).getZExtValue());
          MaskElement = MaskElement.lshr(8);
        }
      }
      DecodePSHUFBMask(RawMask, Mask);
      break;
    }

    // Otherwise the mask must be loaded from a constant pool entry.
    auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode);
    if (!MaskLoad)
      return false;

    // Strip an X86ISD::Wrapper around the address, if present.
    SDValue Ptr = MaskLoad->getBasePtr();
    if (Ptr->getOpcode() == X86ISD::Wrapper)
      Ptr = Ptr->getOperand(0);

    auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr);
    if (!MaskCP || MaskCP->isMachineConstantPoolEntry())
      return false;

    if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) {
      DecodePSHUFBMask(C, Mask);
      // An empty mask after decoding is treated as failure.
      if (Mask.empty())
        return false;
      break;
    }

    return false;
  }
  case X86ISD::VPERMI:
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodeVPERMMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = true;
    break;
  case X86ISD::MOVSS:
  case X86ISD::MOVSD:
    DecodeScalarMoveMask(VT, /* IsLoad */ false, Mask);
    break;
  case X86ISD::VPERM2X128:
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    // An empty mask after decoding is treated as failure.
    if (Mask.empty()) return false;
    break;
  case X86ISD::MOVSLDUP:
    DecodeMOVSLDUPMask(VT, Mask);
    IsUnary = true;
    break;
  case X86ISD::MOVSHDUP:
    DecodeMOVSHDUPMask(VT, Mask);
    IsUnary = true;
    break;
  case X86ISD::MOVDDUP:
    DecodeMOVDDUPMask(VT, Mask);
    IsUnary = true;
    break;
  case X86ISD::MOVLHPD:
  case X86ISD::MOVLPD:
  case X86ISD::MOVLPS:
    // Not yet implemented
    return false;
  default: llvm_unreachable("unknown target shuffle node");
  }

  // If we have a fake unary shuffle, the shuffle mask is spread across two
  // inputs that are actually the same node. Re-map the mask to always point
  // into the first input.
  if (IsFakeUnary)
    for (int &M : Mask)
      if (M >= (int)Mask.size())
        M -= Mask.size();

  return true;
}
5534
5535 /// getShuffleScalarElt - Returns the scalar element that will make up the ith
5536 /// element of the result of the vector shuffle.
5537 static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
5538                                    unsigned Depth) {
5539   if (Depth == 6)
5540     return SDValue();  // Limit search depth.
5541
5542   SDValue V = SDValue(N, 0);
5543   EVT VT = V.getValueType();
5544   unsigned Opcode = V.getOpcode();
5545
5546   // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
5547   if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
5548     int Elt = SV->getMaskElt(Index);
5549
5550     if (Elt < 0)
5551       return DAG.getUNDEF(VT.getVectorElementType());
5552
5553     unsigned NumElems = VT.getVectorNumElements();
5554     SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
5555                                          : SV->getOperand(1);
5556     return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
5557   }
5558
5559   // Recurse into target specific vector shuffles to find scalars.
5560   if (isTargetShuffle(Opcode)) {
5561     MVT ShufVT = V.getSimpleValueType();
5562     unsigned NumElems = ShufVT.getVectorNumElements();
5563     SmallVector<int, 16> ShuffleMask;
5564     bool IsUnary;
5565
5566     if (!getTargetShuffleMask(N, ShufVT, ShuffleMask, IsUnary))
5567       return SDValue();
5568
5569     int Elt = ShuffleMask[Index];
5570     if (Elt < 0)
5571       return DAG.getUNDEF(ShufVT.getVectorElementType());
5572
5573     SDValue NewV = (Elt < (int)NumElems) ? N->getOperand(0)
5574                                          : N->getOperand(1);
5575     return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
5576                                Depth+1);
5577   }
5578
5579   // Actual nodes that may contain scalar elements
5580   if (Opcode == ISD::BITCAST) {
5581     V = V.getOperand(0);
5582     EVT SrcVT = V.getValueType();
5583     unsigned NumElems = VT.getVectorNumElements();
5584
5585     if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
5586       return SDValue();
5587   }
5588
5589   if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
5590     return (Index == 0) ? V.getOperand(0)
5591                         : DAG.getUNDEF(VT.getVectorElementType());
5592
5593   if (V.getOpcode() == ISD::BUILD_VECTOR)
5594     return V.getOperand(Index);
5595
5596   return SDValue();
5597 }
5598
5599 /// getNumOfConsecutiveZeros - Return the number of elements of a vector
5600 /// shuffle operation which come from a consecutively from a zero. The
5601 /// search can start in two different directions, from left or right.
5602 /// We count undefs as zeros until PreferredNum is reached.
5603 static unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp,
5604                                          unsigned NumElems, bool ZerosFromLeft,
5605                                          SelectionDAG &DAG,
5606                                          unsigned PreferredNum = -1U) {
5607   unsigned NumZeros = 0;
5608   for (unsigned i = 0; i != NumElems; ++i) {
5609     unsigned Index = ZerosFromLeft ? i : NumElems - i - 1;
5610     SDValue Elt = getShuffleScalarElt(SVOp, Index, DAG, 0);
5611     if (!Elt.getNode())
5612       break;
5613
5614     if (X86::isZeroNode(Elt))
5615       ++NumZeros;
5616     else if (Elt.getOpcode() == ISD::UNDEF) // Undef as zero up to PreferredNum.
5617       NumZeros = std::min(NumZeros + 1, PreferredNum);
5618     else
5619       break;
5620   }
5621
5622   return NumZeros;
5623 }
5624
5625 /// isShuffleMaskConsecutive - Check if the shuffle mask indicies [MaskI, MaskE)
5626 /// correspond consecutively to elements from one of the vector operands,
5627 /// starting from its index OpIdx. Also tell OpNum which source vector operand.
5628 static
5629 bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp,
5630                               unsigned MaskI, unsigned MaskE, unsigned OpIdx,
5631                               unsigned NumElems, unsigned &OpNum) {
5632   bool SeenV1 = false;
5633   bool SeenV2 = false;
5634
5635   for (unsigned i = MaskI; i != MaskE; ++i, ++OpIdx) {
5636     int Idx = SVOp->getMaskElt(i);
5637     // Ignore undef indicies
5638     if (Idx < 0)
5639       continue;
5640
5641     if (Idx < (int)NumElems)
5642       SeenV1 = true;
5643     else
5644       SeenV2 = true;
5645
5646     // Only accept consecutive elements from the same vector
5647     if ((Idx % NumElems != OpIdx) || (SeenV1 && SeenV2))
5648       return false;
5649   }
5650
5651   OpNum = SeenV1 ? 0 : 1;
5652   return true;
5653 }
5654
5655 /// isVectorShiftRight - Returns true if the shuffle can be implemented as a
5656 /// logical left shift of a vector.
5657 static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
5658                                bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
5659   unsigned NumElems =
5660     SVOp->getSimpleValueType(0).getVectorNumElements();
5661   unsigned NumZeros = getNumOfConsecutiveZeros(
5662       SVOp, NumElems, false /* check zeros from right */, DAG,
5663       SVOp->getMaskElt(0));
5664   unsigned OpSrc;
5665
5666   if (!NumZeros)
5667     return false;
5668
5669   // Considering the elements in the mask that are not consecutive zeros,
5670   // check if they consecutively come from only one of the source vectors.
5671   //
5672   //               V1 = {X, A, B, C}     0
5673   //                         \  \  \    /
5674   //   vector_shuffle V1, V2 <1, 2, 3, X>
5675   //
5676   if (!isShuffleMaskConsecutive(SVOp,
5677             0,                   // Mask Start Index
5678             NumElems-NumZeros,   // Mask End Index(exclusive)
5679             NumZeros,            // Where to start looking in the src vector
5680             NumElems,            // Number of elements in vector
5681             OpSrc))              // Which source operand ?
5682     return false;
5683
5684   isLeft = false;
5685   ShAmt = NumZeros;
5686   ShVal = SVOp->getOperand(OpSrc);
5687   return true;
5688 }
5689
5690 /// isVectorShiftLeft - Returns true if the shuffle can be implemented as a
5691 /// logical left shift of a vector.
5692 static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
5693                               bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
5694   unsigned NumElems =
5695     SVOp->getSimpleValueType(0).getVectorNumElements();
5696   unsigned NumZeros = getNumOfConsecutiveZeros(
5697       SVOp, NumElems, true /* check zeros from left */, DAG,
5698       NumElems - SVOp->getMaskElt(NumElems - 1) - 1);
5699   unsigned OpSrc;
5700
5701   if (!NumZeros)
5702     return false;
5703
5704   // Considering the elements in the mask that are not consecutive zeros,
5705   // check if they consecutively come from only one of the source vectors.
5706   //
5707   //                           0    { A, B, X, X } = V2
5708   //                          / \    /  /
5709   //   vector_shuffle V1, V2 <X, X, 4, 5>
5710   //
5711   if (!isShuffleMaskConsecutive(SVOp,
5712             NumZeros,     // Mask Start Index
5713             NumElems,     // Mask End Index(exclusive)
5714             0,            // Where to start looking in the src vector
5715             NumElems,     // Number of elements in vector
5716             OpSrc))       // Which source operand ?
5717     return false;
5718
5719   isLeft = true;
5720   ShAmt = NumZeros;
5721   ShVal = SVOp->getOperand(OpSrc);
5722   return true;
5723 }
5724
5725 /// isVectorShift - Returns true if the shuffle can be implemented as a
5726 /// logical left or right shift of a vector.
5727 static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
5728                           bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
5729   // Although the logic below support any bitwidth size, there are no
5730   // shift instructions which handle more than 128-bit vectors.
5731   if (!SVOp->getSimpleValueType(0).is128BitVector())
5732     return false;
5733
5734   if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) ||
5735       isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt))
5736     return true;
5737
5738   return false;
5739 }
5740
5741 /// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
5742 ///
5743 static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
5744                                        unsigned NumNonZero, unsigned NumZero,
5745                                        SelectionDAG &DAG,
5746                                        const X86Subtarget* Subtarget,
5747                                        const TargetLowering &TLI) {
5748   if (NumNonZero > 8)
5749     return SDValue();
5750
5751   SDLoc dl(Op);
5752   SDValue V;
5753   bool First = true;
5754   for (unsigned i = 0; i < 16; ++i) {
5755     bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
5756     if (ThisIsNonZero && First) {
5757       if (NumZero)
5758         V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
5759       else
5760         V = DAG.getUNDEF(MVT::v8i16);
5761       First = false;
5762     }
5763
5764     if ((i & 1) != 0) {
5765       SDValue ThisElt, LastElt;
5766       bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
5767       if (LastIsNonZero) {
5768         LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
5769                               MVT::i16, Op.getOperand(i-1));
5770       }
5771       if (ThisIsNonZero) {
5772         ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
5773         ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
5774                               ThisElt, DAG.getConstant(8, MVT::i8));
5775         if (LastIsNonZero)
5776           ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
5777       } else
5778         ThisElt = LastElt;
5779
5780       if (ThisElt.getNode())
5781         V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
5782                         DAG.getIntPtrConstant(i/2));
5783     }
5784   }
5785
5786   return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V);
5787 }
5788
5789 /// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16.
5790 ///
5791 static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
5792                                      unsigned NumNonZero, unsigned NumZero,
5793                                      SelectionDAG &DAG,
5794                                      const X86Subtarget* Subtarget,
5795                                      const TargetLowering &TLI) {
5796   if (NumNonZero > 4)
5797     return SDValue();
5798
5799   SDLoc dl(Op);
5800   SDValue V;
5801   bool First = true;
5802   for (unsigned i = 0; i < 8; ++i) {
5803     bool isNonZero = (NonZeros & (1 << i)) != 0;
5804     if (isNonZero) {
5805       if (First) {
5806         if (NumZero)
5807           V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
5808         else
5809           V = DAG.getUNDEF(MVT::v8i16);
5810         First = false;
5811       }
5812       V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
5813                       MVT::v8i16, V, Op.getOperand(i),
5814                       DAG.getIntPtrConstant(i));
5815     }
5816   }
5817
5818   return V;
5819 }
5820
/// LowerBuildVectorv4x32 - Custom lower build_vector of v4i32 or v4f32.
///
/// Two lowerings are attempted, in this order:
///  1. A shuffle of one source vector with a zero vector, when every
///     non-zeroable operand i is (extract_vector_elt V1, i) of the same V1.
///  2. An X86ISD::INSERTPS node (SSE4.1 only), when exactly one element comes
///     from a different vector and/or a different lane, and all remaining
///     non-zeroable elements are in-place extracts of a single vector.
///
/// Returns a null SDValue when neither form applies. The caller is expected
/// to pass a build_vector with at least two non-zeroable operands (asserted).
static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
                                     const X86Subtarget *Subtarget,
                                     const TargetLowering &TLI) {
  // Find all zeroable elements.
  // An operand counts as zeroable when it is undef or a zero node.
  bool Zeroable[4];
  for (int i=0; i < 4; ++i) {
    SDValue Elt = Op->getOperand(i);
    Zeroable[i] = (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt));
  }
  assert(std::count_if(&Zeroable[0], &Zeroable[4],
                       [](bool M) { return !M; }) > 1 &&
         "We expect at least two non-zero elements!");

  // We only know how to deal with build_vector nodes where elements are either
  // zeroable or extract_vector_elt with constant index.
  // While validating, remember the first non-zeroable operand and its
  // position; FirstNonZeroIdx is only meaningful once FirstNonZero is set.
  SDValue FirstNonZero;
  unsigned FirstNonZeroIdx;
  for (unsigned i=0; i < 4; ++i) {
    if (Zeroable[i])
      continue;
    SDValue Elt = Op->getOperand(i);
    if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        !isa<ConstantSDNode>(Elt.getOperand(1)))
      return SDValue();
    // Make sure that this node is extracting from a 128-bit vector.
    MVT VT = Elt.getOperand(0).getSimpleValueType();
    if (!VT.is128BitVector())
      return SDValue();
    if (!FirstNonZero.getNode()) {
      FirstNonZero = Elt;
      FirstNonZeroIdx = i;
    }
  }

  assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
  // V1 is the vector the first non-zeroable element is extracted from; VT is
  // that vector's type (not necessarily the type of Op).
  SDValue V1 = FirstNonZero.getOperand(0);
  MVT VT = V1.getSimpleValueType();

  // See if this build_vector can be lowered as a blend with zero.
  // The loop stops (break) at the first element that is not an in-place
  // extract from V1. Elt, EltIdx and EltMaskIdx keep describing that element
  // after the loop and are reused by the INSERTPS path below.
  SDValue Elt;
  unsigned EltMaskIdx, EltIdx;
  int Mask[4];
  for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
    if (Zeroable[EltIdx]) {
      // The zero vector will be on the right hand side.
      Mask[EltIdx] = EltIdx+4;
      continue;
    }

    Elt = Op->getOperand(EltIdx);
    // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
    EltMaskIdx = cast<ConstantSDNode>(Elt.getOperand(1))->getZExtValue();
    if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
      break;
    Mask[EltIdx] = EltIdx;
  }

  // EltIdx == 4 means the loop ran to completion: every element is either
  // zeroable or taken in place from V1.
  if (EltIdx == 4) {
    // Let the shuffle legalizer deal with blend operations.
    SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
    // NOTE(review): VT was initialised from V1 above and V1 has not been
    // reassigned since, so this comparison cannot be true on this path.
    if (V1.getSimpleValueType() != VT)
      V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), VT, V1);
    return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, &Mask[0]);
  }

  // See if we can lower this build_vector to a INSERTPS.
  if (!Subtarget->hasSSE41())
    return SDValue();

  // V2 is the source vector of the element that broke the blend pattern; it
  // becomes the second INSERTPS operand.
  SDValue V2 = Elt.getOperand(0);
  // If the offending element is the very element V1 was derived from, V1 is
  // not a valid base vector; clear it so the loop below picks the base from
  // the next non-zeroable element.
  if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
    V1 = SDValue();

  // Every remaining non-zeroable element must be an in-place extract from
  // the (possibly just chosen) base vector V1.
  bool CanFold = true;
  for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
    if (Zeroable[i])
      continue;

    SDValue Current = Op->getOperand(i);
    SDValue SrcVector = Current->getOperand(0);
    if (!V1.getNode())
      V1 = SrcVector;
    CanFold = SrcVector == V1 &&
      cast<ConstantSDNode>(Current.getOperand(1))->getZExtValue() == i;
  }

  if (!CanFold)
    return SDValue();

  assert(V1.getNode() && "Expected at least two non-zero elements!");
  // The INSERTPS node is built on v4f32; bitcast both inputs if necessary.
  if (V1.getSimpleValueType() != MVT::v4f32)
    V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), MVT::v4f32, V1);
  if (V2.getSimpleValueType() != MVT::v4f32)
    V2 = DAG.getNode(ISD::BITCAST, SDLoc(V2), MVT::v4f32, V2);

  // Ok, we can emit an INSERTPS instruction.
  // ZMask has bit i set for every zeroable result element.
  unsigned ZMask = 0;
  for (int i = 0; i < 4; ++i)
    if (Zeroable[i])
      ZMask |= 1 << i;

  // Immediate layout as assembled here: the index extracted from V2
  // (EltMaskIdx) starts at bit 6, the destination lane (EltIdx) starts at
  // bit 4, and the zero mask occupies the low bits.
  unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
  assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
  SDValue Result = DAG.getNode(X86ISD::INSERTPS, SDLoc(Op), MVT::v4f32, V1, V2,
                               DAG.getIntPtrConstant(InsertPSMask));
  // The result is bitcast to VT, i.e. the type of the first source vector.
  return DAG.getNode(ISD::BITCAST, SDLoc(Op), VT, Result);
}
5929
5930 /// Return a vector logical shift node.
5931 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
5932                          unsigned NumBits, SelectionDAG &DAG,
5933                          const TargetLowering &TLI, SDLoc dl) {
5934   assert(VT.is128BitVector() && "Unknown type for VShift");
5935   MVT ShVT = MVT::v2i64;
5936   unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
5937   SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp);
5938   MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(SrcOp.getValueType());
5939   SDValue ShiftVal = DAG.getConstant(NumBits, ScalarShiftTy);
5940   return DAG.getNode(ISD::BITCAST, dl, VT,
5941                      DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
5942 }
5943
5944 static SDValue
5945 LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) {
5946
5947   // Check if the scalar load can be widened into a vector load. And if
5948   // the address is "base + cst" see if the cst can be "absorbed" into
5949   // the shuffle mask.
5950   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
5951     SDValue Ptr = LD->getBasePtr();
5952     if (!ISD::isNormalLoad(LD) || LD->isVolatile())
5953       return SDValue();
5954     EVT PVT = LD->getValueType(0);
5955     if (PVT != MVT::i32 && PVT != MVT::f32)
5956       return SDValue();
5957
5958     int FI = -1;
5959     int64_t Offset = 0;
5960     if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
5961       FI = FINode->getIndex();
5962       Offset = 0;
5963     } else if (DAG.isBaseWithConstantOffset(Ptr) &&
5964                isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
5965       FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
5966       Offset = Ptr.getConstantOperandVal(1);
5967       Ptr = Ptr.getOperand(0);
5968     } else {
5969       return SDValue();
5970     }
5971
5972     // FIXME: 256-bit vector instructions don't require a strict alignment,
5973     // improve this code to support it better.
5974     unsigned RequiredAlign = VT.getSizeInBits()/8;
5975     SDValue Chain = LD->getChain();
5976     // Make sure the stack object alignment is at least 16 or 32.
5977     MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
5978     if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
5979       if (MFI->isFixedObjectIndex(FI)) {
5980         // Can't change the alignment. FIXME: It's possible to compute
5981         // the exact stack offset and reference FI + adjust offset instead.
5982         // If someone *really* cares about this. That's the way to implement it.
5983         return SDValue();
5984       } else {
5985         MFI->setObjectAlignment(FI, RequiredAlign);
5986       }
5987     }
5988
5989     // (Offset % 16 or 32) must be multiple of 4. Then address is then
5990     // Ptr + (Offset & ~15).
5991     if (Offset < 0)
5992       return SDValue();
5993     if ((Offset % RequiredAlign) & 3)
5994       return SDValue();
5995     int64_t StartOffset = Offset & ~(RequiredAlign-1);
5996     if (StartOffset)
5997       Ptr = DAG.getNode(ISD::ADD, SDLoc(Ptr), Ptr.getValueType(),
5998                         Ptr,DAG.getConstant(StartOffset, Ptr.getValueType()));
5999
6000     int EltNo = (Offset - StartOffset) >> 2;
6001     unsigned NumElems = VT.getVectorNumElements();
6002
6003     EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
6004     SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
6005                              LD->getPointerInfo().getWithOffset(StartOffset),
6006                              false, false, false, 0);
6007
6008     SmallVector<int, 8> Mask;
6009     for (unsigned i = 0; i != NumElems; ++i)
6010       Mask.push_back(EltNo);
6011
6012     return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]);
6013   }
6014
6015   return SDValue();
6016 }
6017
6018 /// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
6019 /// elements can be replaced by a single large load which has the same value as
6020 /// a build_vector or insert_subvector whose loaded operands are 'Elts'.
6021 ///
6022 /// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a
6023 ///
6024 /// FIXME: we'd also like to handle the case where the last elements are zero
6025 /// rather than undef via VZEXT_LOAD, but we do not detect that case today.
6026 /// There's even a handy isZeroNode for that purpose.
6027 static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
6028                                         SDLoc &DL, SelectionDAG &DAG,
6029                                         bool isAfterLegalize) {
6030   unsigned NumElems = Elts.size();
6031
6032   LoadSDNode *LDBase = nullptr;
6033   unsigned LastLoadedElt = -1U;
6034
6035   // For each element in the initializer, see if we've found a load or an undef.
6036   // If we don't find an initial load element, or later load elements are
6037   // non-consecutive, bail out.
6038   for (unsigned i = 0; i < NumElems; ++i) {
6039     SDValue Elt = Elts[i];
6040     // Look through a bitcast.
6041     if (Elt.getNode() && Elt.getOpcode() == ISD::BITCAST)
6042       Elt = Elt.getOperand(0);
6043     if (!Elt.getNode() ||
6044         (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
6045       return SDValue();
6046     if (!LDBase) {
6047       if (Elt.getNode()->getOpcode() == ISD::UNDEF)
6048         return SDValue();
6049       LDBase = cast<LoadSDNode>(Elt.getNode());
6050       LastLoadedElt = i;
6051       continue;
6052     }
6053     if (Elt.getOpcode() == ISD::UNDEF)
6054       continue;
6055
6056     LoadSDNode *LD = cast<LoadSDNode>(Elt);
6057     EVT LdVT = Elt.getValueType();
6058     // Each loaded element must be the correct fractional portion of the
6059     // requested vector load.
6060     if (LdVT.getSizeInBits() != VT.getSizeInBits() / NumElems)
6061       return SDValue();
6062     if (!DAG.isConsecutiveLoad(LD, LDBase, LdVT.getSizeInBits() / 8, i))
6063       return SDValue();
6064     LastLoadedElt = i;
6065   }
6066
6067   // If we have found an entire vector of loads and undefs, then return a large
6068   // load of the entire vector width starting at the base pointer.  If we found
6069   // consecutive loads for the low half, generate a vzext_load node.
6070   if (LastLoadedElt == NumElems - 1) {
6071     assert(LDBase && "Did not find base load for merging consecutive loads");
6072     EVT EltVT = LDBase->getValueType(0);
6073     // Ensure that the input vector size for the merged loads matches the
6074     // cumulative size of the input elements.
6075     if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems)
6076       return SDValue();
6077
6078     if (isAfterLegalize &&
6079         !DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, VT))
6080       return SDValue();
6081
6082     SDValue NewLd = SDValue();
6083
6084     NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
6085                         LDBase->getPointerInfo(), LDBase->isVolatile(),
6086                         LDBase->isNonTemporal(), LDBase->isInvariant(),
6087                         LDBase->getAlignment());
6088
6089     if (LDBase->hasAnyUseOfValue(1)) {
6090       SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
6091                                      SDValue(LDBase, 1),
6092                                      SDValue(NewLd.getNode(), 1));
6093       DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
6094       DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
6095                              SDValue(NewLd.getNode(), 1));
6096     }
6097
6098     return NewLd;
6099   }
6100
6101   //TODO: The code below fires only for for loading the low v2i32 / v2f32
6102   //of a v4i32 / v4f32. It's probably worth generalizing.
6103   EVT EltVT = VT.getVectorElementType();
6104   if (NumElems == 4 && LastLoadedElt == 1 && (EltVT.getSizeInBits() == 32) &&
6105       DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
6106     SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
6107     SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
6108     SDValue ResNode =
6109         DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, MVT::i64,
6110                                 LDBase->getPointerInfo(),
6111                                 LDBase->getAlignment(),
6112                                 false/*isVolatile*/, true/*ReadMem*/,
6113                                 false/*WriteMem*/);
6114
6115     // Make sure the newly-created LOAD is in the same position as LDBase in
6116     // terms of dependency. We create a TokenFactor for LDBase and ResNode, and
6117     // update uses of LDBase's output chain to use the TokenFactor.
6118     if (LDBase->hasAnyUseOfValue(1)) {
6119       SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
6120                              SDValue(LDBase, 1), SDValue(ResNode.getNode(), 1));
6121       DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
6122       DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
6123                              SDValue(ResNode.getNode(), 1));
6124     }
6125
6126     return DAG.getNode(ISD::BITCAST, DL, VT, ResNode);
6127   }
6128   return SDValue();
6129 }
6130
/// LowerVectorBroadcast - Attempt to use the vbroadcast instruction
/// to generate a splat value for the following cases:
/// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant.
/// 2. A splat shuffle which uses a scalar_to_vector node which comes from
/// a scalar load, or a constant.
/// The VBROADCAST node is returned when a pattern is found,
/// or SDValue() otherwise.
///
/// \param Op        the BUILD_VECTOR or VECTOR_SHUFFLE node to inspect; any
///                  other opcode yields SDValue().
/// \param Subtarget queried for AVX / AVX2 (hasInt256, hasAVX2) / AVX-512 /
///                  VLX availability.
/// \param DAG       used to create the new nodes and to query the current
///                  function's attributes.
static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
                                    SelectionDAG &DAG) {
  // VBROADCAST requires AVX.
  // TODO: Splats could be generated for non-AVX CPUs using SSE
  // instructions, but there's less potential gain for only 128-bit vectors.
  if (!Subtarget->hasAVX())
    return SDValue();

  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);

  assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
         "Unsupported vector type for broadcast.");

  // Ld is the scalar being splatted; ConstSplatVal records whether it is a
  // Constant/ConstantFP node. Both are assigned by every switch case that
  // falls out of the switch (the other paths return).
  SDValue Ld;
  bool ConstSplatVal;

  switch (Op.getOpcode()) {
    default:
      // Unknown pattern found.
      return SDValue();

    case ISD::BUILD_VECTOR: {
      auto *BVOp = cast<BuildVectorSDNode>(Op.getNode());
      BitVector UndefElements;
      SDValue Splat = BVOp->getSplatValue(&UndefElements);

      // We need a splat of a single value to use broadcast, and it doesn't
      // make any sense if the value is only in one element of the vector.
      if (!Splat || (VT.getVectorNumElements() - UndefElements.count()) <= 1)
        return SDValue();

      Ld = Splat;
      ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
                       Ld.getOpcode() == ISD::ConstantFP);

      // Make sure that all of the users of a non-constant load are from the
      // BUILD_VECTOR node.
      if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
        return SDValue();
      break;
    }

    case ISD::VECTOR_SHUFFLE: {
      ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);

      // Shuffles must have a splat mask where the first element is
      // broadcasted.
      if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0)
        return SDValue();

      SDValue Sc = Op.getOperand(0);
      // The shuffled vector is neither a scalar_to_vector nor a build_vector:
      // only the register form of the broadcast can be used.
      if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR &&
          Sc.getOpcode() != ISD::BUILD_VECTOR) {

        if (!Subtarget->hasInt256())
          return SDValue();

        // Use the register form of the broadcast instruction available on AVX2.
        // For 256-bit and wider types the low 128 bits are extracted first
        // and used as the broadcast source.
        if (VT.getSizeInBits() >= 256)
          Sc = Extract128BitVector(Sc, 0, DAG, dl);
        return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc);
      }

      // Operand 0 of the scalar_to_vector / build_vector is the splat source.
      Ld = Sc.getOperand(0);
      ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
                       Ld.getOpcode() == ISD::ConstantFP);

      // The scalar_to_vector node and the suspected
      // load node must have exactly one user.
      // Constants may have multiple users.

      // AVX-512 has register version of the broadcast
      // (taken into account only for 512-bit vectors with a scalar of at
      // least 32 bits); in that case the one-use requirement is waived.
      bool hasRegVer = Subtarget->hasAVX512() && VT.is512BitVector() &&
        Ld.getValueType().getSizeInBits() >= 32;
      if (!ConstSplatVal && ((!Sc.hasOneUse() || !Ld.hasOneUse()) &&
          !hasRegVer))
        return SDValue();
      break;
    }
  }

  unsigned ScalarSize = Ld.getValueType().getSizeInBits();
  bool IsGE256 = (VT.getSizeInBits() >= 256);

  // When optimizing for size, generate up to 5 extra bytes for a broadcast
  // instruction to save 8 or more bytes of constant pool data.
  // TODO: If multiple splats are generated to load the same constant,
  // it may be detrimental to overall size. There needs to be a way to detect
  // that condition to know if this is truly a size win.
  const Function *F = DAG.getMachineFunction().getFunction();
  bool OptForSize = F->getAttributes().
    hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);

  // Handle broadcasting a single constant scalar from the constant pool
  // into a vector.
  // On Sandybridge (no AVX2), it is still better to load a constant vector
  // from the constant pool and not to broadcast it from a scalar.
  // But override that restriction when optimizing for size.
  // TODO: Check if splatting is recommended for other AVX-capable CPUs.
  if (ConstSplatVal && (Subtarget->hasAVX2() || OptForSize)) {
    EVT CVT = Ld.getValueType();
    assert(!CVT.isVector() && "Must not broadcast a vector type");

    // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
    // For size optimization, also splat v2f64 and v2i64, and for size opt
    // with AVX2, also splat i8 and i16.
    // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
    if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
        (OptForSize && (ScalarSize == 64 || Subtarget->hasAVX2()))) {
      // Retrieve the IR constant behind the Constant/ConstantFP node.
      const Constant *C = nullptr;
      if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
        C = CI->getConstantIntValue();
      else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
        C = CF->getConstantFPValue();

      assert(C && "Invalid constant type");

      // Place the scalar in the constant pool and broadcast a load of it.
      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
      SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
      unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
      Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP,
                       MachinePointerInfo::getConstantPool(),
                       false, false, false, Alignment);

      return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
    }
  }

  bool IsLoad = ISD::isNormalLoad(Ld.getNode());

  // Handle AVX2 in-register broadcasts.
  if (!IsLoad && Subtarget->hasInt256() &&
      (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
    return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);

  // The scalar source must be a normal load.
  if (!IsLoad)
    return SDValue();

  // From here on Ld is a normal load: 32-bit scalars, 64-bit scalars into
  // 256-bit or wider vectors, and 64-bit scalars with VLX are broadcast
  // directly.
  if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
      (Subtarget->hasVLX() && ScalarSize == 64))
    return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);

  // The integer check is needed for the 64-bit into 128-bit so it doesn't match
  // double since there is no vbroadcastsd xmm
  if (Subtarget->hasInt256() && Ld.getValueType().isInteger()) {
    if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
      return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
  }

  // Unsupported broadcast.
  return SDValue();
}
6292
6293 /// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real
6294 /// underlying vector and index.
6295 ///
6296 /// Modifies \p ExtractedFromVec to the real vector and returns the real
6297 /// index.
6298 static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
6299                                          SDValue ExtIdx) {
6300   int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
6301   if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
6302     return Idx;
6303
6304   // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
6305   // lowered this:
6306   //   (extract_vector_elt (v8f32 %vreg1), Constant<6>)
6307   // to:
6308   //   (extract_vector_elt (vector_shuffle<2,u,u,u>
6309   //                           (extract_subvector (v8f32 %vreg0), Constant<4>),
6310   //                           undef)
6311   //                       Constant<0>)
6312   // In this case the vector is the extract_subvector expression and the index
6313   // is 2, as specified by the shuffle.
6314   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
6315   SDValue ShuffleVec = SVOp->getOperand(0);
6316   MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
6317   assert(ShuffleVecVT.getVectorElementType() ==
6318          ExtractedFromVec.getSimpleValueType().getVectorElementType());
6319
6320   int ShuffleIdx = SVOp->getMaskElt(Idx);
6321   if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
6322     ExtractedFromVec = ShuffleVec;
6323     return ShuffleIdx;
6324   }
6325   return Idx;
6326 }
6327
6328 static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
6329   MVT VT = Op.getSimpleValueType();
6330
6331   // Skip if insert_vec_elt is not supported.
6332   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6333   if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
6334     return SDValue();
6335
6336   SDLoc DL(Op);
6337   unsigned NumElems = Op.getNumOperands();
6338
6339   SDValue VecIn1;
6340   SDValue VecIn2;
6341   SmallVector<unsigned, 4> InsertIndices;
6342   SmallVector<int, 8> Mask(NumElems, -1);
6343
6344   for (unsigned i = 0; i != NumElems; ++i) {
6345     unsigned Opc = Op.getOperand(i).getOpcode();
6346
6347     if (Opc == ISD::UNDEF)
6348       continue;
6349
6350     if (Opc != ISD::EXTRACT_VECTOR_ELT) {
6351       // Quit if more than 1 elements need inserting.
6352       if (InsertIndices.size() > 1)
6353         return SDValue();
6354
6355       InsertIndices.push_back(i);
6356       continue;
6357     }
6358
6359     SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
6360     SDValue ExtIdx = Op.getOperand(i).getOperand(1);
6361     // Quit if non-constant index.
6362     if (!isa<ConstantSDNode>(ExtIdx))
6363       return SDValue();
6364     int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
6365
6366     // Quit if extracted from vector of different type.
6367     if (ExtractedFromVec.getValueType() != VT)
6368       return SDValue();
6369
6370     if (!VecIn1.getNode())
6371       VecIn1 = ExtractedFromVec;
6372     else if (VecIn1 != ExtractedFromVec) {
6373       if (!VecIn2.getNode())
6374         VecIn2 = ExtractedFromVec;
6375       else if (VecIn2 != ExtractedFromVec)
6376         // Quit if more than 2 vectors to shuffle
6377         return SDValue();
6378     }
6379
6380     if (ExtractedFromVec == VecIn1)
6381       Mask[i] = Idx;
6382     else if (ExtractedFromVec == VecIn2)
6383       Mask[i] = Idx + NumElems;
6384   }
6385
6386   if (!VecIn1.getNode())
6387     return SDValue();
6388
6389   VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
6390   SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, &Mask[0]);
6391   for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) {
6392     unsigned Idx = InsertIndices[i];
6393     NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
6394                      DAG.getIntPtrConstant(Idx));
6395   }
6396
6397   return NV;
6398 }
6399
6400 // Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
6401 SDValue
6402 X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
6403
6404   MVT VT = Op.getSimpleValueType();
6405   assert((VT.getVectorElementType() == MVT::i1) && (VT.getSizeInBits() <= 16) &&
6406          "Unexpected type in LowerBUILD_VECTORvXi1!");
6407
6408   SDLoc dl(Op);
6409   if (ISD::isBuildVectorAllZeros(Op.getNode())) {
6410     SDValue Cst = DAG.getTargetConstant(0, MVT::i1);
6411     SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
6412     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
6413   }
6414
6415   if (ISD::isBuildVectorAllOnes(Op.getNode())) {
6416     SDValue Cst = DAG.getTargetConstant(1, MVT::i1);
6417     SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
6418     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
6419   }
6420
6421   bool AllContants = true;
6422   uint64_t Immediate = 0;
6423   int NonConstIdx = -1;
6424   bool IsSplat = true;
6425   unsigned NumNonConsts = 0;
6426   unsigned NumConsts = 0;
6427   for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
6428     SDValue In = Op.getOperand(idx);
6429     if (In.getOpcode() == ISD::UNDEF)
6430       continue;
6431     if (!isa<ConstantSDNode>(In)) {
6432       AllContants = false;
6433       NonConstIdx = idx;
6434       NumNonConsts++;
6435     } else {
6436       NumConsts++;
6437       if (cast<ConstantSDNode>(In)->getZExtValue())
6438       Immediate |= (1ULL << idx);
6439     }
6440     if (In != Op.getOperand(0))
6441       IsSplat = false;
6442   }
6443
6444   if (AllContants) {
6445     SDValue FullMask = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1,
6446       DAG.getConstant(Immediate, MVT::i16));
6447     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, FullMask,
6448                        DAG.getIntPtrConstant(0));
6449   }
6450
6451   if (NumNonConsts == 1 && NonConstIdx != 0) {
6452     SDValue DstVec;
6453     if (NumConsts) {
6454       SDValue VecAsImm = DAG.getConstant(Immediate,
6455                                          MVT::getIntegerVT(VT.getSizeInBits()));
6456       DstVec = DAG.getNode(ISD::BITCAST, dl, VT, VecAsImm);
6457     }
6458     else
6459       DstVec = DAG.getUNDEF(VT);
6460     return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
6461                        Op.getOperand(NonConstIdx),
6462                        DAG.getIntPtrConstant(NonConstIdx));
6463   }
6464   if (!IsSplat && (NonConstIdx != 0))
6465     llvm_unreachable("Unsupported BUILD_VECTOR operation");
6466   MVT SelectVT = (VT == MVT::v16i1)? MVT::i16 : MVT::i8;
6467   SDValue Select;
6468   if (IsSplat)
6469     Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0),
6470                           DAG.getConstant(-1, SelectVT),
6471                           DAG.getConstant(0, SelectVT));
6472   else
6473     Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0),
6474                          DAG.getConstant((Immediate | 1), SelectVT),
6475                          DAG.getConstant(Immediate, SelectVT));
6476   return DAG.getNode(ISD::BITCAST, dl, VT, Select);
6477 }
6478
6479 /// \brief Return true if \p N implements a horizontal binop and return the
6480 /// operands for the horizontal binop into V0 and V1.
6481 ///
6482 /// This is a helper function of PerformBUILD_VECTORCombine.
6483 /// This function checks that the build_vector \p N in input implements a
6484 /// horizontal operation. Parameter \p Opcode defines the kind of horizontal
6485 /// operation to match.
6486 /// For example, if \p Opcode is equal to ISD::ADD, then this function
6487 /// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
6488 /// is equal to ISD::SUB, then this function checks if this is a horizontal
6489 /// arithmetic sub.
6490 ///
6491 /// This function only analyzes elements of \p N whose indices are
6492 /// in range [BaseIdx, LastIdx).
6493 static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
6494                               SelectionDAG &DAG,
6495                               unsigned BaseIdx, unsigned LastIdx,
6496                               SDValue &V0, SDValue &V1) {
6497   EVT VT = N->getValueType(0);
6498
6499   assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
6500   assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
6501          "Invalid Vector in input!");
6502
6503   bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
6504   bool CanFold = true;
6505   unsigned ExpectedVExtractIdx = BaseIdx;
6506   unsigned NumElts = LastIdx - BaseIdx;
6507   V0 = DAG.getUNDEF(VT);
6508   V1 = DAG.getUNDEF(VT);
6509
6510   // Check if N implements a horizontal binop.
6511   for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
6512     SDValue Op = N->getOperand(i + BaseIdx);
6513
6514     // Skip UNDEFs.
6515     if (Op->getOpcode() == ISD::UNDEF) {
6516       // Update the expected vector extract index.
6517       if (i * 2 == NumElts)
6518         ExpectedVExtractIdx = BaseIdx;
6519       ExpectedVExtractIdx += 2;
6520       continue;
6521     }
6522
6523     CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
6524
6525     if (!CanFold)
6526       break;
6527
6528     SDValue Op0 = Op.getOperand(0);
6529     SDValue Op1 = Op.getOperand(1);
6530
6531     // Try to match the following pattern:
6532     // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
6533     CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
6534         Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
6535         Op0.getOperand(0) == Op1.getOperand(0) &&
6536         isa<ConstantSDNode>(Op0.getOperand(1)) &&
6537         isa<ConstantSDNode>(Op1.getOperand(1)));
6538     if (!CanFold)
6539       break;
6540
6541     unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
6542     unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
6543
6544     if (i * 2 < NumElts) {
6545       if (V0.getOpcode() == ISD::UNDEF)
6546         V0 = Op0.getOperand(0);
6547     } else {
6548       if (V1.getOpcode() == ISD::UNDEF)
6549         V1 = Op0.getOperand(0);
6550       if (i * 2 == NumElts)
6551         ExpectedVExtractIdx = BaseIdx;
6552     }
6553
6554     SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
6555     if (I0 == ExpectedVExtractIdx)
6556       CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
6557     else if (IsCommutable && I1 == ExpectedVExtractIdx) {
6558       // Try to match the following dag sequence:
6559       // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
6560       CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
6561     } else
6562       CanFold = false;
6563
6564     ExpectedVExtractIdx += 2;
6565   }
6566
6567   return CanFold;
6568 }
6569
6570 /// \brief Emit a sequence of two 128-bit horizontal add/sub followed by
6571 /// a concat_vector.
6572 ///
6573 /// This is a helper function of PerformBUILD_VECTORCombine.
6574 /// This function expects two 256-bit vectors called V0 and V1.
6575 /// At first, each vector is split into two separate 128-bit vectors.
6576 /// Then, the resulting 128-bit vectors are used to implement two
6577 /// horizontal binary operations.
6578 ///
6579 /// The kind of horizontal binary operation is defined by \p X86Opcode.
6580 ///
6581 /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
6582 /// the two new horizontal binop.
6583 /// When Mode is set, the first horizontal binop dag node would take as input
6584 /// the lower 128-bit of V0 and the upper 128-bit of V0. The second
6585 /// horizontal binop dag node would take as input the lower 128-bit of V1
6586 /// and the upper 128-bit of V1.
6587 ///   Example:
6588 ///     HADD V0_LO, V0_HI
6589 ///     HADD V1_LO, V1_HI
6590 ///
6591 /// Otherwise, the first horizontal binop dag node takes as input the lower
6592 /// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
6593 /// dag node takes the the upper 128-bit of V0 and the upper 128-bit of V1.
6594 ///   Example:
6595 ///     HADD V0_LO, V1_LO
6596 ///     HADD V0_HI, V1_HI
6597 ///
6598 /// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
6599 /// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
6600 /// the upper 128-bits of the result.
6601 static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
6602                                      SDLoc DL, SelectionDAG &DAG,
6603                                      unsigned X86Opcode, bool Mode,
6604                                      bool isUndefLO, bool isUndefHI) {
6605   EVT VT = V0.getValueType();
6606   assert(VT.is256BitVector() && VT == V1.getValueType() &&
6607          "Invalid nodes in input!");
6608
6609   unsigned NumElts = VT.getVectorNumElements();
6610   SDValue V0_LO = Extract128BitVector(V0, 0, DAG, DL);
6611   SDValue V0_HI = Extract128BitVector(V0, NumElts/2, DAG, DL);
6612   SDValue V1_LO = Extract128BitVector(V1, 0, DAG, DL);
6613   SDValue V1_HI = Extract128BitVector(V1, NumElts/2, DAG, DL);
6614   EVT NewVT = V0_LO.getValueType();
6615
6616   SDValue LO = DAG.getUNDEF(NewVT);
6617   SDValue HI = DAG.getUNDEF(NewVT);
6618
6619   if (Mode) {
6620     // Don't emit a horizontal binop if the result is expected to be UNDEF.
6621     if (!isUndefLO && V0->getOpcode() != ISD::UNDEF)
6622       LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
6623     if (!isUndefHI && V1->getOpcode() != ISD::UNDEF)
6624       HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
6625   } else {
6626     // Don't emit a horizontal binop if the result is expected to be UNDEF.
6627     if (!isUndefLO && (V0_LO->getOpcode() != ISD::UNDEF ||
6628                        V1_LO->getOpcode() != ISD::UNDEF))
6629       LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
6630
6631     if (!isUndefHI && (V0_HI->getOpcode() != ISD::UNDEF ||
6632                        V1_HI->getOpcode() != ISD::UNDEF))
6633       HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
6634   }
6635
6636   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
6637 }
6638
6639 /// \brief Try to fold a build_vector that performs an 'addsub' into the
6640 /// sequence of 'vadd + vsub + blendi'.
6641 static SDValue matchAddSub(const BuildVectorSDNode *BV, SelectionDAG &DAG,
6642                            const X86Subtarget *Subtarget) {
6643   SDLoc DL(BV);
6644   EVT VT = BV->getValueType(0);
6645   unsigned NumElts = VT.getVectorNumElements();
6646   SDValue InVec0 = DAG.getUNDEF(VT);
6647   SDValue InVec1 = DAG.getUNDEF(VT);
6648
6649   assert((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
6650           VT == MVT::v2f64) && "build_vector with an invalid type found!");
6651
6652   // Odd-numbered elements in the input build vector are obtained from
6653   // adding two integer/float elements.
6654   // Even-numbered elements in the input build vector are obtained from
6655   // subtracting two integer/float elements.
6656   unsigned ExpectedOpcode = ISD::FSUB;
6657   unsigned NextExpectedOpcode = ISD::FADD;
6658   bool AddFound = false;
6659   bool SubFound = false;
6660
6661   for (unsigned i = 0, e = NumElts; i != e; i++) {
6662     SDValue Op = BV->getOperand(i);
6663
6664     // Skip 'undef' values.
6665     unsigned Opcode = Op.getOpcode();
6666     if (Opcode == ISD::UNDEF) {
6667       std::swap(ExpectedOpcode, NextExpectedOpcode);
6668       continue;
6669     }
6670
6671     // Early exit if we found an unexpected opcode.
6672     if (Opcode != ExpectedOpcode)
6673       return SDValue();
6674
6675     SDValue Op0 = Op.getOperand(0);
6676     SDValue Op1 = Op.getOperand(1);
6677
6678     // Try to match the following pattern:
6679     // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
6680     // Early exit if we cannot match that sequence.
6681     if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6682         Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6683         !isa<ConstantSDNode>(Op0.getOperand(1)) ||
6684         !isa<ConstantSDNode>(Op1.getOperand(1)) ||
6685         Op0.getOperand(1) != Op1.getOperand(1))
6686       return SDValue();
6687
6688     unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
6689     if (I0 != i)
6690       return SDValue();
6691
6692     // We found a valid add/sub node. Update the information accordingly.
6693     if (i & 1)
6694       AddFound = true;
6695     else
6696       SubFound = true;
6697
6698     // Update InVec0 and InVec1.
6699     if (InVec0.getOpcode() == ISD::UNDEF)
6700       InVec0 = Op0.getOperand(0);
6701     if (InVec1.getOpcode() == ISD::UNDEF)
6702       InVec1 = Op1.getOperand(0);
6703
6704     // Make sure that operands in input to each add/sub node always
6705     // come from a same pair of vectors.
6706     if (InVec0 != Op0.getOperand(0)) {
6707       if (ExpectedOpcode == ISD::FSUB)
6708         return SDValue();
6709
6710       // FADD is commutable. Try to commute the operands
6711       // and then test again.
6712       std::swap(Op0, Op1);
6713       if (InVec0 != Op0.getOperand(0))
6714         return SDValue();
6715     }
6716
6717     if (InVec1 != Op1.getOperand(0))
6718       return SDValue();
6719
6720     // Update the pair of expected opcodes.
6721     std::swap(ExpectedOpcode, NextExpectedOpcode);
6722   }
6723
6724   // Don't try to fold this build_vector into an ADDSUB if the inputs are undef.
6725   if (AddFound && SubFound && InVec0.getOpcode() != ISD::UNDEF &&
6726       InVec1.getOpcode() != ISD::UNDEF)
6727     return DAG.getNode(X86ISD::ADDSUB, DL, VT, InVec0, InVec1);
6728
6729   return SDValue();
6730 }
6731
6732 static SDValue PerformBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG,
6733                                           const X86Subtarget *Subtarget) {
6734   SDLoc DL(N);
6735   EVT VT = N->getValueType(0);
6736   unsigned NumElts = VT.getVectorNumElements();
6737   BuildVectorSDNode *BV = cast<BuildVectorSDNode>(N);
6738   SDValue InVec0, InVec1;
6739
6740   // Try to match an ADDSUB.
6741   if ((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
6742       (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
6743     SDValue Value = matchAddSub(BV, DAG, Subtarget);
6744     if (Value.getNode())
6745       return Value;
6746   }
6747
6748   // Try to match horizontal ADD/SUB.
6749   unsigned NumUndefsLO = 0;
6750   unsigned NumUndefsHI = 0;
6751   unsigned Half = NumElts/2;
6752
6753   // Count the number of UNDEF operands in the build_vector in input.
6754   for (unsigned i = 0, e = Half; i != e; ++i)
6755     if (BV->getOperand(i)->getOpcode() == ISD::UNDEF)
6756       NumUndefsLO++;
6757
6758   for (unsigned i = Half, e = NumElts; i != e; ++i)
6759     if (BV->getOperand(i)->getOpcode() == ISD::UNDEF)
6760       NumUndefsHI++;
6761
6762   // Early exit if this is either a build_vector of all UNDEFs or all the
6763   // operands but one are UNDEF.
6764   if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
6765     return SDValue();
6766
6767   if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget->hasSSE3()) {
6768     // Try to match an SSE3 float HADD/HSUB.
6769     if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
6770       return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
6771
6772     if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
6773       return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
6774   } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget->hasSSSE3()) {
6775     // Try to match an SSSE3 integer HADD/HSUB.
6776     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
6777       return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);
6778
6779     if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
6780       return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
6781   }
6782
6783   if (!Subtarget->hasAVX())
6784     return SDValue();
6785
6786   if ((VT == MVT::v8f32 || VT == MVT::v4f64)) {
6787     // Try to match an AVX horizontal add/sub of packed single/double
6788     // precision floating point values from 256-bit vectors.
6789     SDValue InVec2, InVec3;
6790     if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
6791         isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
6792         ((InVec0.getOpcode() == ISD::UNDEF ||
6793           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
6794         ((InVec1.getOpcode() == ISD::UNDEF ||
6795           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
6796       return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
6797
6798     if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
6799         isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
6800         ((InVec0.getOpcode() == ISD::UNDEF ||
6801           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
6802         ((InVec1.getOpcode() == ISD::UNDEF ||
6803           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
6804       return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
6805   } else if (VT == MVT::v8i32 || VT == MVT::v16i16) {
6806     // Try to match an AVX2 horizontal add/sub of signed integers.
6807     SDValue InVec2, InVec3;
6808     unsigned X86Opcode;
6809     bool CanFold = true;
6810
6811     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
6812         isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
6813         ((InVec0.getOpcode() == ISD::UNDEF ||
6814           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
6815         ((InVec1.getOpcode() == ISD::UNDEF ||
6816           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
6817       X86Opcode = X86ISD::HADD;
6818     else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
6819         isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) &&
6820         ((InVec0.getOpcode() == ISD::UNDEF ||
6821           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
6822         ((InVec1.getOpcode() == ISD::UNDEF ||
6823           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
6824       X86Opcode = X86ISD::HSUB;
6825     else
6826       CanFold = false;
6827
6828     if (CanFold) {
6829       // Fold this build_vector into a single horizontal add/sub.
6830       // Do this only if the target has AVX2.
6831       if (Subtarget->hasAVX2())
6832         return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);
6833
6834       // Do not try to expand this build_vector into a pair of horizontal
6835       // add/sub if we can emit a pair of scalar add/sub.
6836       if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
6837         return SDValue();
6838
6839       // Convert this build_vector into a pair of horizontal binop followed by
6840       // a concat vector.
6841       bool isUndefLO = NumUndefsLO == Half;
6842       bool isUndefHI = NumUndefsHI == Half;
6843       return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
6844                                    isUndefLO, isUndefHI);
6845     }
6846   }
6847
6848   if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
6849        VT == MVT::v16i16) && Subtarget->hasAVX()) {
6850     unsigned X86Opcode;
6851     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
6852       X86Opcode = X86ISD::HADD;
6853     else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
6854       X86Opcode = X86ISD::HSUB;
6855     else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
6856       X86Opcode = X86ISD::FHADD;
6857     else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
6858       X86Opcode = X86ISD::FHSUB;
6859     else
6860       return SDValue();
6861
6862     // Don't try to expand this build_vector into a pair of horizontal add/sub
6863     // if we can simply emit a pair of scalar add/sub.
6864     if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
6865       return SDValue();
6866
6867     // Convert this build_vector into two horizontal add/sub followed by
6868     // a concat vector.
6869     bool isUndefLO = NumUndefsLO == Half;
6870     bool isUndefHI = NumUndefsHI == Half;
6871     return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
6872                                  isUndefLO, isUndefHI);
6873   }
6874
6875   return SDValue();
6876 }
6877
6878 SDValue
6879 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
6880   SDLoc dl(Op);
6881
6882   MVT VT = Op.getSimpleValueType();
6883   MVT ExtVT = VT.getVectorElementType();
6884   unsigned NumElems = Op.getNumOperands();
6885
6886   // Generate vectors for predicate vectors.
6887   if (VT.getScalarType() == MVT::i1 && Subtarget->hasAVX512())
6888     return LowerBUILD_VECTORvXi1(Op, DAG);
6889
6890   // Vectors containing all zeros can be matched by pxor and xorps later
6891   if (ISD::isBuildVectorAllZeros(Op.getNode())) {
6892     // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
6893     // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
6894     if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
6895       return Op;
6896
6897     return getZeroVector(VT, Subtarget, DAG, dl);
6898   }
6899
6900   // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
6901   // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
6902   // vpcmpeqd on 256-bit vectors.
6903   if (Subtarget->hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
6904     if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasInt256()))
6905       return Op;
6906
6907     if (!VT.is512BitVector())
6908       return getOnesVector(VT, Subtarget->hasInt256(), DAG, dl);
6909   }
6910
6911   SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG);
6912   if (Broadcast.getNode())
6913     return Broadcast;
6914
6915   unsigned EVTBits = ExtVT.getSizeInBits();
6916
6917   unsigned NumZero  = 0;
6918   unsigned NumNonZero = 0;
6919   unsigned NonZeros = 0;
6920   bool IsAllConstants = true;
6921   SmallSet<SDValue, 8> Values;
6922   for (unsigned i = 0; i < NumElems; ++i) {
6923     SDValue Elt = Op.getOperand(i);
6924     if (Elt.getOpcode() == ISD::UNDEF)
6925       continue;
6926     Values.insert(Elt);
6927     if (Elt.getOpcode() != ISD::Constant &&
6928         Elt.getOpcode() != ISD::ConstantFP)
6929       IsAllConstants = false;
6930     if (X86::isZeroNode(Elt))
6931       NumZero++;
6932     else {
6933       NonZeros |= (1 << i);
6934       NumNonZero++;
6935     }
6936   }
6937
6938   // All undef vector. Return an UNDEF.  All zero vectors were handled above.
6939   if (NumNonZero == 0)
6940     return DAG.getUNDEF(VT);
6941
6942   // Special case for single non-zero, non-undef, element.
6943   if (NumNonZero == 1) {
6944     unsigned Idx = countTrailingZeros(NonZeros);
6945     SDValue Item = Op.getOperand(Idx);
6946
6947     // If this is an insertion of an i64 value on x86-32, and if the top bits of
6948     // the value are obviously zero, truncate the value to i32 and do the
6949     // insertion that way.  Only do this if the value is non-constant or if the
6950     // value is a constant being inserted into element 0.  It is cheaper to do
6951     // a constant pool load than it is to do a movd + shuffle.
6952     if (ExtVT == MVT::i64 && !Subtarget->is64Bit() &&
6953         (!IsAllConstants || Idx == 0)) {
6954       if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
6955         // Handle SSE only.
6956         assert(VT == MVT::v2i64 && "Expected an SSE value type!");
6957         EVT VecVT = MVT::v4i32;
6958         unsigned VecElts = 4;
6959
6960         // Truncate the value (which may itself be a constant) to i32, and
6961         // convert it to a vector with movd (S2V+shuffle to zero extend).
6962         Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
6963         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
6964
6965         // If using the new shuffle lowering, just directly insert this.
6966         if (ExperimentalVectorShuffleLowering)
6967           return DAG.getNode(
6968               ISD::BITCAST, dl, VT,
6969               getShuffleVectorZeroOrUndef(Item, Idx * 2, true, Subtarget, DAG));
6970
6971         Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
6972
6973         // Now we have our 32-bit value zero extended in the low element of
6974         // a vector.  If Idx != 0, swizzle it into place.
6975         if (Idx != 0) {
6976           SmallVector<int, 4> Mask;
6977           Mask.push_back(Idx);
6978           for (unsigned i = 1; i != VecElts; ++i)
6979             Mask.push_back(i);
6980           Item = DAG.getVectorShuffle(VecVT, dl, Item, DAG.getUNDEF(VecVT),
6981                                       &Mask[0]);
6982         }
6983         return DAG.getNode(ISD::BITCAST, dl, VT, Item);
6984       }
6985     }
6986
6987     // If we have a constant or non-constant insertion into the low element of
6988     // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
6989     // the rest of the elements.  This will be matched as movd/movq/movss/movsd
6990     // depending on what the source datatype is.
6991     if (Idx == 0) {
6992       if (NumZero == 0)
6993         return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
6994
6995       if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
6996           (ExtVT == MVT::i64 && Subtarget->is64Bit())) {
6997         if (VT.is256BitVector() || VT.is512BitVector()) {
6998           SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl);
6999           return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec,
7000                              Item, DAG.getIntPtrConstant(0));
7001         }
7002         assert(VT.is128BitVector() && "Expected an SSE value type!");
7003         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
7004         // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
7005         return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
7006       }
7007
7008       if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
7009         Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
7010         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
7011         if (VT.is256BitVector()) {
7012           SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl);
7013           Item = Insert128BitVector(ZeroVec, Item, 0, DAG, dl);
7014         } else {
7015           assert(VT.is128BitVector() && "Expected an SSE value type!");
7016           Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
7017         }
7018         return DAG.getNode(ISD::BITCAST, dl, VT, Item);
7019       }
7020     }
7021
7022     // Is it a vector logical left shift?
7023     if (NumElems == 2 && Idx == 1 &&
7024         X86::isZeroNode(Op.getOperand(0)) &&
7025         !X86::isZeroNode(Op.getOperand(1))) {
7026       unsigned NumBits = VT.getSizeInBits();
7027       return getVShift(true, VT,
7028                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
7029                                    VT, Op.getOperand(1)),
7030                        NumBits/2, DAG, *this, dl);
7031     }
7032
7033     if (IsAllConstants) // Otherwise, it's better to do a constpool load.
7034       return SDValue();
7035
7036     // Otherwise, if this is a vector with i32 or f32 elements, and the element
7037     // is a non-constant being inserted into an element other than the low one,
7038     // we can't use a constant pool load.  Instead, use SCALAR_TO_VECTOR (aka
7039     // movd/movss) to move this into the low element, then shuffle it into
7040     // place.
7041     if (EVTBits == 32) {
7042       Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
7043
7044       // If using the new shuffle lowering, just directly insert this.
7045       if (ExperimentalVectorShuffleLowering)
7046         return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
7047
7048       // Turn it into a shuffle of zero and zero-extended scalar to vector.
7049       Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, Subtarget, DAG);
7050       SmallVector<int, 8> MaskVec;
7051       for (unsigned i = 0; i != NumElems; ++i)
7052         MaskVec.push_back(i == Idx ? 0 : 1);
7053       return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]);
7054     }
7055   }
7056
7057   // Splat is obviously ok. Let legalizer expand it to a shuffle.
7058   if (Values.size() == 1) {
7059     if (EVTBits == 32) {
7060       // Instead of a shuffle like this:
7061       // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
7062       // Check if it's possible to issue this instead.
7063       // shuffle (vload ptr)), undef, <1, 1, 1, 1>
7064       unsigned Idx = countTrailingZeros(NonZeros);
7065       SDValue Item = Op.getOperand(Idx);
7066       if (Op.getNode()->isOnlyUserOf(Item.getNode()))
7067         return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
7068     }
7069     return SDValue();
7070   }
7071
7072   // A vector full of immediates; various special cases are already
7073   // handled, so this is best done with a single constant-pool load.
7074   if (IsAllConstants)
7075     return SDValue();
7076
7077   // For AVX-length vectors, see if we can use a vector load to get all of the
7078   // elements, otherwise build the individual 128-bit pieces and use
7079   // shuffles to put them in place.
7080   if (VT.is256BitVector() || VT.is512BitVector()) {
7081     SmallVector<SDValue, 64> V;
7082     for (unsigned i = 0; i != NumElems; ++i)
7083       V.push_back(Op.getOperand(i));
7084
7085     // Check for a build vector of consecutive loads.
7086     if (SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false))
7087       return LD;
7088
7089     EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
7090
7091     // Build both the lower and upper subvector.
7092     SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
7093                                 makeArrayRef(&V[0], NumElems/2));
7094     SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
7095                                 makeArrayRef(&V[NumElems / 2], NumElems/2));
7096
7097     // Recreate the wider vector with the lower and upper part.
7098     if (VT.is256BitVector())
7099       return Concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
7100     return Concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
7101   }
7102
7103   // Let legalizer expand 2-wide build_vectors.
7104   if (EVTBits == 64) {
7105     if (NumNonZero == 1) {
7106       // One half is zero or undef.
7107       unsigned Idx = countTrailingZeros(NonZeros);
7108       SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
7109                                  Op.getOperand(Idx));
7110       return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
7111     }
7112     return SDValue();
7113   }
7114
7115   // If element VT is < 32 bits, convert it to inserts into a zero vector.
7116   if (EVTBits == 8 && NumElems == 16) {
7117     SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG,
7118                                         Subtarget, *this);
7119     if (V.getNode()) return V;
7120   }
7121
7122   if (EVTBits == 16 && NumElems == 8) {
7123     SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG,
7124                                       Subtarget, *this);
7125     if (V.getNode()) return V;
7126   }
7127
7128   // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
7129   if (EVTBits == 32 && NumElems == 4) {
7130     SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget, *this);
7131     if (V.getNode())
7132       return V;
7133   }
7134
7135   // If element VT is == 32 bits, turn it into a number of shuffles.
7136   SmallVector<SDValue, 8> V(NumElems);
7137   if (NumElems == 4 && NumZero > 0) {
7138     for (unsigned i = 0; i < 4; ++i) {
7139       bool isZero = !(NonZeros & (1 << i));
7140       if (isZero)
7141         V[i] = getZeroVector(VT, Subtarget, DAG, dl);
7142       else
7143         V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
7144     }
7145
7146     for (unsigned i = 0; i < 2; ++i) {
7147       switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
7148         default: break;
7149         case 0:
7150           V[i] = V[i*2];  // Must be a zero vector.
7151           break;
7152         case 1:
7153           V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]);
7154           break;
7155         case 2:
7156           V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]);
7157           break;
7158         case 3:
7159           V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]);
7160           break;
7161       }
7162     }
7163
7164     bool Reverse1 = (NonZeros & 0x3) == 2;
7165     bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
7166     int MaskVec[] = {
7167       Reverse1 ? 1 : 0,
7168       Reverse1 ? 0 : 1,
7169       static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
7170       static_cast<int>(Reverse2 ? NumElems   : NumElems+1)
7171     };
7172     return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]);
7173   }
7174
7175   if (Values.size() > 1 && VT.is128BitVector()) {
7176     // Check for a build vector of consecutive loads.
7177     for (unsigned i = 0; i < NumElems; ++i)
7178       V[i] = Op.getOperand(i);
7179
7180     // Check for elements which are consecutive loads.
7181     SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false);
7182     if (LD.getNode())
7183       return LD;
7184
7185     // Check for a build vector from mostly shuffle plus few inserting.
7186     SDValue Sh = buildFromShuffleMostly(Op, DAG);
7187     if (Sh.getNode())
7188       return Sh;
7189
7190     // For SSE 4.1, use insertps to put the high elements into the low element.
7191     if (Subtarget->hasSSE41()) {
7192       SDValue Result;
7193       if (Op.getOperand(0).getOpcode() != ISD::UNDEF)
7194         Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
7195       else
7196         Result = DAG.getUNDEF(VT);
7197
7198       for (unsigned i = 1; i < NumElems; ++i) {
7199         if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue;
7200         Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
7201                              Op.getOperand(i), DAG.getIntPtrConstant(i));
7202       }
7203       return Result;
7204     }
7205
7206     // Otherwise, expand into a number of unpckl*, start by extending each of
7207     // our (non-undef) elements to the full vector width with the element in the
7208     // bottom slot of the vector (which generates no code for SSE).
7209     for (unsigned i = 0; i < NumElems; ++i) {
7210       if (Op.getOperand(i).getOpcode() != ISD::UNDEF)
7211         V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
7212       else
7213         V[i] = DAG.getUNDEF(VT);
7214     }
7215
7216     // Next, we iteratively mix elements, e.g. for v4f32:
7217     //   Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
7218     //         : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
7219     //   Step 2: unpcklps X, Y ==>    <3, 2, 1, 0>
7220     unsigned EltStride = NumElems >> 1;
7221     while (EltStride != 0) {
7222       for (unsigned i = 0; i < EltStride; ++i) {
7223         // If V[i+EltStride] is undef and this is the first round of mixing,
7224         // then it is safe to just drop this shuffle: V[i] is already in the
7225         // right place, the one element (since it's the first round) being
7226         // inserted as undef can be dropped.  This isn't safe for successive
7227         // rounds because they will permute elements within both vectors.
7228         if (V[i+EltStride].getOpcode() == ISD::UNDEF &&
7229             EltStride == NumElems/2)
7230           continue;
7231
7232         V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]);
7233       }
7234       EltStride >>= 1;
7235     }
7236     return V[0];
7237   }
7238   return SDValue();
7239 }
7240
7241 // LowerAVXCONCAT_VECTORS - 256-bit AVX can use the vinsertf128 instruction
7242 // to create 256-bit vectors from two other 128-bit ones.
7243 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
7244   SDLoc dl(Op);
7245   MVT ResVT = Op.getSimpleValueType();
7246
7247   assert((ResVT.is256BitVector() ||
7248           ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
7249
7250   SDValue V1 = Op.getOperand(0);
7251   SDValue V2 = Op.getOperand(1);
7252   unsigned NumElems = ResVT.getVectorNumElements();
7253   if(ResVT.is256BitVector())
7254     return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
7255
7256   if (Op.getNumOperands() == 4) {
7257     MVT HalfVT = MVT::getVectorVT(ResVT.getScalarType(),
7258                                 ResVT.getVectorNumElements()/2);
7259     SDValue V3 = Op.getOperand(2);
7260     SDValue V4 = Op.getOperand(3);
7261     return Concat256BitVectors(Concat128BitVectors(V1, V2, HalfVT, NumElems/2, DAG, dl),
7262       Concat128BitVectors(V3, V4, HalfVT, NumElems/2, DAG, dl), ResVT, NumElems, DAG, dl);
7263   }
7264   return Concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
7265 }
7266
7267 static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
7268   MVT LLVM_ATTRIBUTE_UNUSED VT = Op.getSimpleValueType();
7269   assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
7270          (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
7271           Op.getNumOperands() == 4)));
7272
7273   // AVX can use the vinsertf128 instruction to create 256-bit vectors
7274   // from two other 128-bit ones.
7275
7276   // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
7277   return LowerAVXCONCAT_VECTORS(Op, DAG);
7278 }
7279
7280
7281 //===----------------------------------------------------------------------===//
7282 // Vector shuffle lowering
7283 //
7284 // This is an experimental code path for lowering vector shuffles on x86. It is
7285 // designed to handle arbitrary vector shuffles and blends, gracefully
7286 // degrading performance as necessary. It works hard to recognize idiomatic
7287 // shuffles and lower them to optimal instruction patterns without leaving
7288 // a framework that allows reasonably efficient handling of all vector shuffle
7289 // patterns.
7290 //===----------------------------------------------------------------------===//
7291
7292 /// \brief Tiny helper function to identify a no-op mask.
7293 ///
7294 /// This is a somewhat boring predicate function. It checks whether the mask
7295 /// array input, which is assumed to be a single-input shuffle mask of the kind
7296 /// used by the X86 shuffle instructions (not a fully general
7297 /// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
7298 /// in-place shuffle are 'no-op's.
7299 static bool isNoopShuffleMask(ArrayRef<int> Mask) {
7300   for (int i = 0, Size = Mask.size(); i < Size; ++i)
7301     if (Mask[i] != -1 && Mask[i] != i)
7302       return false;
7303   return true;
7304 }
7305
7306 /// \brief Helper function to classify a mask as a single-input mask.
7307 ///
7308 /// This isn't a generic single-input test because in the vector shuffle
7309 /// lowering we canonicalize single inputs to be the first input operand. This
7310 /// means we can more quickly test for a single input by only checking whether
7311 /// an input from the second operand exists. We also assume that the size of
7312 /// mask corresponds to the size of the input vectors which isn't true in the
7313 /// fully general case.
7314 static bool isSingleInputShuffleMask(ArrayRef<int> Mask) {
7315   for (int M : Mask)
7316     if (M >= (int)Mask.size())
7317       return false;
7318   return true;
7319 }
7320
7321 /// \brief Test whether there are elements crossing 128-bit lanes in this
7322 /// shuffle mask.
7323 ///
7324 /// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
7325 /// and we routinely test for these.
7326 static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
7327   int LaneSize = 128 / VT.getScalarSizeInBits();
7328   int Size = Mask.size();
7329   for (int i = 0; i < Size; ++i)
7330     if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
7331       return true;
7332   return false;
7333 }
7334
7335 /// \brief Test whether a shuffle mask is equivalent within each 128-bit lane.
7336 ///
7337 /// This checks a shuffle mask to see if it is performing the same
7338 /// 128-bit lane-relative shuffle in each 128-bit lane. This trivially implies
7339 /// that it is also not lane-crossing. It may however involve a blend from the
7340 /// same lane of a second vector.
7341 ///
7342 /// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
7343 /// non-trivial to compute in the face of undef lanes. The representation is
7344 /// *not* suitable for use with existing 128-bit shuffles as it will contain
7345 /// entries from both V1 and V2 inputs to the wider mask.
7346 static bool
7347 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
7348                                 SmallVectorImpl<int> &RepeatedMask) {
7349   int LaneSize = 128 / VT.getScalarSizeInBits();
7350   RepeatedMask.resize(LaneSize, -1);
7351   int Size = Mask.size();
7352   for (int i = 0; i < Size; ++i) {
7353     if (Mask[i] < 0)
7354       continue;
7355     if ((Mask[i] % Size) / LaneSize != i / LaneSize)
7356       // This entry crosses lanes, so there is no way to model this shuffle.
7357       return false;
7358
7359     // Ok, handle the in-lane shuffles by detecting if and when they repeat.
7360     if (RepeatedMask[i % LaneSize] == -1)
7361       // This is the first non-undef entry in this slot of a 128-bit lane.
7362       RepeatedMask[i % LaneSize] =
7363           Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + Size;
7364     else if (RepeatedMask[i % LaneSize] + (i / LaneSize) * LaneSize != Mask[i])
7365       // Found a mismatch with the repeated mask.
7366       return false;
7367   }
7368   return true;
7369 }
7370
7371 // Hide this symbol with an anonymous namespace instead of 'static' so that MSVC
7372 // 2013 will allow us to use it as a non-type template parameter.
7373 namespace {
7374
7375 /// \brief Implementation of the \c isShuffleEquivalent variadic functor.
7376 ///
7377 /// See its documentation for details.
7378 bool isShuffleEquivalentImpl(ArrayRef<int> Mask, ArrayRef<const int *> Args) {
7379   if (Mask.size() != Args.size())
7380     return false;
7381   for (int i = 0, e = Mask.size(); i < e; ++i) {
7382     assert(*Args[i] >= 0 && "Arguments must be positive integers!");
7383     if (Mask[i] != -1 && Mask[i] != *Args[i])
7384       return false;
7385   }
7386   return true;
7387 }
7388
7389 } // namespace
7390
7391 /// \brief Checks whether a shuffle mask is equivalent to an explicit list of
7392 /// arguments.
7393 ///
7394 /// This is a fast way to test a shuffle mask against a fixed pattern:
7395 ///
7396 ///   if (isShuffleEquivalent(Mask, 3, 2, 1, 0)) { ... }
7397 ///
7398 /// It returns true if the mask is exactly as wide as the argument list, and
7399 /// each element of the mask is either -1 (signifying undef) or the value given
7400 /// in the argument.
7401 static const VariadicFunction1<
7402     bool, ArrayRef<int>, int, isShuffleEquivalentImpl> isShuffleEquivalent = {};
7403
7404 /// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
7405 ///
7406 /// This helper function produces an 8-bit shuffle immediate corresponding to
7407 /// the ubiquitous shuffle encoding scheme used in x86 instructions for
7408 /// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
7409 /// example.
7410 ///
7411 /// NB: We rely heavily on "undef" masks preserving the input lane.
7412 static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask,
7413                                           SelectionDAG &DAG) {
7414   assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
7415   assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
7416   assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
7417   assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
7418   assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
7419
7420   unsigned Imm = 0;
7421   Imm |= (Mask[0] == -1 ? 0 : Mask[0]) << 0;
7422   Imm |= (Mask[1] == -1 ? 1 : Mask[1]) << 2;
7423   Imm |= (Mask[2] == -1 ? 2 : Mask[2]) << 4;
7424   Imm |= (Mask[3] == -1 ? 3 : Mask[3]) << 6;
7425   return DAG.getConstant(Imm, MVT::i8);
7426 }
7427
7428 /// \brief Try to emit a blend instruction for a shuffle.
7429 ///
7430 /// This doesn't do any checks for the availability of instructions for blending
7431 /// these values. It relies on the availability of the X86ISD::BLENDI pattern to
7432 /// be matched in the backend with the type given. What it does check for is
7433 /// that the shuffle mask is in fact a blend.
7434 static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
7435                                          SDValue V2, ArrayRef<int> Mask,
7436                                          const X86Subtarget *Subtarget,
7437                                          SelectionDAG &DAG) {
7438
7439   unsigned BlendMask = 0;
7440   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7441     if (Mask[i] >= Size) {
7442       if (Mask[i] != i + Size)
7443         return SDValue(); // Shuffled V2 input!
7444       BlendMask |= 1u << i;
7445       continue;
7446     }
7447     if (Mask[i] >= 0 && Mask[i] != i)
7448       return SDValue(); // Shuffled V1 input!
7449   }
7450   switch (VT.SimpleTy) {
7451   case MVT::v2f64:
7452   case MVT::v4f32:
7453   case MVT::v4f64:
7454   case MVT::v8f32:
7455     return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
7456                        DAG.getConstant(BlendMask, MVT::i8));
7457
7458   case MVT::v4i64:
7459   case MVT::v8i32:
7460     assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
7461     // FALLTHROUGH
7462   case MVT::v2i64:
7463   case MVT::v4i32:
7464     // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
7465     // that instruction.
7466     if (Subtarget->hasAVX2()) {
7467       // Scale the blend by the number of 32-bit dwords per element.
7468       int Scale =  VT.getScalarSizeInBits() / 32;
7469       BlendMask = 0;
7470       for (int i = 0, Size = Mask.size(); i < Size; ++i)
7471         if (Mask[i] >= Size)
7472           for (int j = 0; j < Scale; ++j)
7473             BlendMask |= 1u << (i * Scale + j);
7474
7475       MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
7476       V1 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V1);
7477       V2 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V2);
7478       return DAG.getNode(ISD::BITCAST, DL, VT,
7479                          DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
7480                                      DAG.getConstant(BlendMask, MVT::i8)));
7481     }
7482     // FALLTHROUGH
7483   case MVT::v8i16: {
7484     // For integer shuffles we need to expand the mask and cast the inputs to
7485     // v8i16s prior to blending.
7486     int Scale = 8 / VT.getVectorNumElements();
7487     BlendMask = 0;
7488     for (int i = 0, Size = Mask.size(); i < Size; ++i)
7489       if (Mask[i] >= Size)
7490         for (int j = 0; j < Scale; ++j)
7491           BlendMask |= 1u << (i * Scale + j);
7492
7493     V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1);
7494     V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V2);
7495     return DAG.getNode(ISD::BITCAST, DL, VT,
7496                        DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
7497                                    DAG.getConstant(BlendMask, MVT::i8)));
7498   }
7499
7500   case MVT::v16i16: {
7501     assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
7502     SmallVector<int, 8> RepeatedMask;
7503     if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
7504       // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
7505       assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
7506       BlendMask = 0;
7507       for (int i = 0; i < 8; ++i)
7508         if (RepeatedMask[i] >= 16)
7509           BlendMask |= 1u << i;
7510       return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
7511                          DAG.getConstant(BlendMask, MVT::i8));
7512     }
7513   }
7514     // FALLTHROUGH
7515   case MVT::v32i8: {
7516     assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
7517     // Scale the blend by the number of bytes per element.
7518     int Scale =  VT.getScalarSizeInBits() / 8;
7519     assert(Mask.size() * Scale == 32 && "Not a 256-bit vector!");
7520
7521     // Compute the VSELECT mask. Note that VSELECT is really confusing in the
7522     // mix of LLVM's code generator and the x86 backend. We tell the code
7523     // generator that boolean values in the elements of an x86 vector register
7524     // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
7525     // mapping a select to operand #1, and 'false' mapping to operand #2. The
7526     // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
7527     // of the element (the remaining are ignored) and 0 in that high bit would
7528     // mean operand #1 while 1 in the high bit would mean operand #2. So while
7529     // the LLVM model for boolean values in vector elements gets the relevant
7530     // bit set, it is set backwards and over constrained relative to x86's
7531     // actual model.
7532     SDValue VSELECTMask[32];
7533     for (int i = 0, Size = Mask.size(); i < Size; ++i)
7534       for (int j = 0; j < Scale; ++j)
7535         VSELECTMask[Scale * i + j] =
7536             Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
7537                         : DAG.getConstant(Mask[i] < Size ? -1 : 0, MVT::i8);
7538
7539     V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V1);
7540     V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V2);
7541     return DAG.getNode(
7542         ISD::BITCAST, DL, VT,
7543         DAG.getNode(ISD::VSELECT, DL, MVT::v32i8,
7544                     DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, VSELECTMask),
7545                     V1, V2));
7546   }
7547
7548   default:
7549     llvm_unreachable("Not a supported integer vector type!");
7550   }
7551 }
7552
7553 /// \brief Generic routine to lower a shuffle and blend as a decomposed set of
7554 /// unblended shuffles followed by an unshuffled blend.
7555 ///
7556 /// This matches the extremely common pattern for handling combined
7557 /// shuffle+blend operations on newer X86 ISAs where we have very fast blend
7558 /// operations.
7559 static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(SDLoc DL, MVT VT,
7560                                                           SDValue V1,
7561                                                           SDValue V2,
7562                                                           ArrayRef<int> Mask,
7563                                                           SelectionDAG &DAG) {
7564   // Shuffle the input elements into the desired positions in V1 and V2 and
7565   // blend them together.
7566   SmallVector<int, 32> V1Mask(Mask.size(), -1);
7567   SmallVector<int, 32> V2Mask(Mask.size(), -1);
7568   SmallVector<int, 32> BlendMask(Mask.size(), -1);
7569   for (int i = 0, Size = Mask.size(); i < Size; ++i)
7570     if (Mask[i] >= 0 && Mask[i] < Size) {
7571       V1Mask[i] = Mask[i];
7572       BlendMask[i] = i;
7573     } else if (Mask[i] >= Size) {
7574       V2Mask[i] = Mask[i] - Size;
7575       BlendMask[i] = i + Size;
7576     }
7577
7578   V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
7579   V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
7580   return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
7581 }
7582
7583 /// \brief Try to lower a vector shuffle as a byte rotation.
7584 ///
7585 /// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
7586 /// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
7587 /// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
7588 /// try to generically lower a vector shuffle through such an pattern. It
7589 /// does not check for the profitability of lowering either as PALIGNR or
7590 /// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
7591 /// This matches shuffle vectors that look like:
7592 ///
7593 ///   v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
7594 ///
7595 /// Essentially it concatenates V1 and V2, shifts right by some number of
7596 /// elements, and takes the low elements as the result. Note that while this is
7597 /// specified as a *right shift* because x86 is little-endian, it is a *left
7598 /// rotate* of the vector lanes.
7599 ///
7600 /// Note that this only handles 128-bit vector widths currently.
7601 static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1,
7602                                               SDValue V2,
7603                                               ArrayRef<int> Mask,
7604                                               const X86Subtarget *Subtarget,
7605                                               SelectionDAG &DAG) {
7606   assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
7607
7608   // We need to detect various ways of spelling a rotation:
7609   //   [11, 12, 13, 14, 15,  0,  1,  2]
7610   //   [-1, 12, 13, 14, -1, -1,  1, -1]
7611   //   [-1, -1, -1, -1, -1, -1,  1,  2]
7612   //   [ 3,  4,  5,  6,  7,  8,  9, 10]
7613   //   [-1,  4,  5,  6, -1, -1,  9, -1]
7614   //   [-1,  4,  5,  6, -1, -1, -1, -1]
7615   int Rotation = 0;
7616   SDValue Lo, Hi;
7617   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7618     if (Mask[i] == -1)
7619       continue;
7620     assert(Mask[i] >= 0 && "Only -1 is a valid negative mask element!");
7621
7622     // Based on the mod-Size value of this mask element determine where
7623     // a rotated vector would have started.
7624     int StartIdx = i - (Mask[i] % Size);
7625     if (StartIdx == 0)
7626       // The identity rotation isn't interesting, stop.
7627       return SDValue();
7628
7629     // If we found the tail of a vector the rotation must be the missing
7630     // front. If we found the head of a vector, it must be how much of the head.
7631     int CandidateRotation = StartIdx < 0 ? -StartIdx : Size - StartIdx;
7632
7633     if (Rotation == 0)
7634       Rotation = CandidateRotation;
7635     else if (Rotation != CandidateRotation)
7636       // The rotations don't match, so we can't match this mask.
7637       return SDValue();
7638
7639     // Compute which value this mask is pointing at.
7640     SDValue MaskV = Mask[i] < Size ? V1 : V2;
7641
7642     // Compute which of the two target values this index should be assigned to.
7643     // This reflects whether the high elements are remaining or the low elements
7644     // are remaining.
7645     SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
7646
7647     // Either set up this value if we've not encountered it before, or check
7648     // that it remains consistent.
7649     if (!TargetV)
7650       TargetV = MaskV;
7651     else if (TargetV != MaskV)
7652       // This may be a rotation, but it pulls from the inputs in some
7653       // unsupported interleaving.
7654       return SDValue();
7655   }
7656
7657   // Check that we successfully analyzed the mask, and normalize the results.
7658   assert(Rotation != 0 && "Failed to locate a viable rotation!");
7659   assert((Lo || Hi) && "Failed to find a rotated input vector!");
7660   if (!Lo)
7661     Lo = Hi;
7662   else if (!Hi)
7663     Hi = Lo;
7664
7665   assert(VT.getSizeInBits() == 128 &&
7666          "Rotate-based lowering only supports 128-bit lowering!");
7667   assert(Mask.size() <= 16 &&
7668          "Can shuffle at most 16 bytes in a 128-bit vector!");
7669
7670   // The actual rotate instruction rotates bytes, so we need to scale the
7671   // rotation based on how many bytes are in the vector.
7672   int Scale = 16 / Mask.size();
7673
7674   // SSSE3 targets can use the palignr instruction
7675   if (Subtarget->hasSSSE3()) {
7676     // Cast the inputs to v16i8 to match PALIGNR.
7677     Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Lo);
7678     Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Hi);
7679
7680     return DAG.getNode(ISD::BITCAST, DL, VT,
7681                        DAG.getNode(X86ISD::PALIGNR, DL, MVT::v16i8, Hi, Lo,
7682                                    DAG.getConstant(Rotation * Scale, MVT::i8)));
7683   }
7684
7685   // Default SSE2 implementation
7686   int LoByteShift = 16 - Rotation * Scale;
7687   int HiByteShift = Rotation * Scale;
7688
7689   // Cast the inputs to v2i64 to match PSLLDQ/PSRLDQ.
7690   Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Lo);
7691   Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Hi);
7692
7693   SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, Lo,
7694                                 DAG.getConstant(8 * LoByteShift, MVT::i8));
7695   SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, Hi,
7696                                 DAG.getConstant(8 * HiByteShift, MVT::i8));
7697   return DAG.getNode(ISD::BITCAST, DL, VT,
7698                      DAG.getNode(ISD::OR, DL, MVT::v2i64, LoShift, HiShift));
7699 }
7700
7701 /// \brief Compute whether each element of a shuffle is zeroable.
7702 ///
7703 /// A "zeroable" vector shuffle element is one which can be lowered to zero.
7704 /// Either it is an undef element in the shuffle mask, the element of the input
7705 /// referenced is undef, or the element of the input referenced is known to be
7706 /// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
7707 /// as many lanes with this technique as possible to simplify the remaining
7708 /// shuffle.
7709 static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
7710                                                      SDValue V1, SDValue V2) {
7711   SmallBitVector Zeroable(Mask.size(), false);
7712
7713   bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
7714   bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
7715
7716   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7717     int M = Mask[i];
7718     // Handle the easy cases.
7719     if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
7720       Zeroable[i] = true;
7721       continue;
7722     }
7723
7724     // If this is an index into a build_vector node, dig out the input value and
7725     // use it.
7726     SDValue V = M < Size ? V1 : V2;
7727     if (V.getOpcode() != ISD::BUILD_VECTOR)
7728       continue;
7729
7730     SDValue Input = V.getOperand(M % Size);
7731     // The UNDEF opcode check really should be dead code here, but not quite
7732     // worth asserting on (it isn't invalid, just unexpected).
7733     if (Input.getOpcode() == ISD::UNDEF || X86::isZeroNode(Input))
7734       Zeroable[i] = true;
7735   }
7736
7737   return Zeroable;
7738 }
7739
7740 /// \brief Try to emit a bitmask instruction for a shuffle.
7741 ///
7742 /// This handles cases where we can model a blend exactly as a bitmask due to
7743 /// one of the inputs being zeroable.
7744 static SDValue lowerVectorShuffleAsBitMask(SDLoc DL, MVT VT, SDValue V1,
7745                                            SDValue V2, ArrayRef<int> Mask,
7746                                            SelectionDAG &DAG) {
7747   MVT EltVT = VT.getScalarType();
7748   int NumEltBits = EltVT.getSizeInBits();
7749   MVT IntEltVT = MVT::getIntegerVT(NumEltBits);
7750   SDValue Zero = DAG.getConstant(0, IntEltVT);
7751   SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), IntEltVT);
7752   if (EltVT.isFloatingPoint()) {
7753     Zero = DAG.getNode(ISD::BITCAST, DL, EltVT, Zero);
7754     AllOnes = DAG.getNode(ISD::BITCAST, DL, EltVT, AllOnes);
7755   }
7756   SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
7757   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
7758   SDValue V;
7759   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7760     if (Zeroable[i])
7761       continue;
7762     if (Mask[i] % Size != i)
7763       return SDValue(); // Not a blend.
7764     if (!V)
7765       V = Mask[i] < Size ? V1 : V2;
7766     else if (V != (Mask[i] < Size ? V1 : V2))
7767       return SDValue(); // Can only let one input through the mask.
7768
7769     VMaskOps[i] = AllOnes;
7770   }
7771   if (!V)
7772     return SDValue(); // No non-zeroable elements!
7773
7774   SDValue VMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, VMaskOps);
7775   V = DAG.getNode(VT.isFloatingPoint()
7776                   ? (unsigned) X86ISD::FAND : (unsigned) ISD::AND,
7777                   DL, VT, V, VMask);
7778   return V;
7779 }
7780
7781 /// \brief Try to lower a vector shuffle as a byte shift (shifts in zeros).
7782 ///
7783 /// Attempts to match a shuffle mask against the PSRLDQ and PSLLDQ SSE2
7784 /// byte-shift instructions. The mask must consist of a shifted sequential
7785 /// shuffle from one of the input vectors and zeroable elements for the
7786 /// remaining 'shifted in' elements.
7787 ///
7788 /// Note that this only handles 128-bit vector widths currently.
7789 static SDValue lowerVectorShuffleAsByteShift(SDLoc DL, MVT VT, SDValue V1,
7790                                              SDValue V2, ArrayRef<int> Mask,
7791                                              SelectionDAG &DAG) {
7792   assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
7793
7794   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
7795
7796   int Size = Mask.size();
7797   int Scale = 16 / Size;
7798
7799   for (int Shift = 1; Shift < Size; Shift++) {
7800     int ByteShift = Shift * Scale;
7801
7802     // PSRLDQ : (little-endian) right byte shift
7803     // [ 5,  6,  7, zz, zz, zz, zz, zz]
7804     // [ -1, 5,  6,  7, zz, zz, zz, zz]
7805     // [  1, 2, -1, -1, -1, -1, zz, zz]
7806     bool ZeroableRight = true;
7807     for (int i = Size - Shift; i < Size; i++) {
7808       ZeroableRight &= Zeroable[i];
7809     }
7810
7811     if (ZeroableRight) {
7812       bool ValidShiftRight1 =
7813           isSequentialOrUndefInRange(Mask, 0, Size - Shift, Shift);
7814       bool ValidShiftRight2 =
7815           isSequentialOrUndefInRange(Mask, 0, Size - Shift, Size + Shift);
7816
7817       if (ValidShiftRight1 || ValidShiftRight2) {
7818         // Cast the inputs to v2i64 to match PSRLDQ.
7819         SDValue &TargetV = ValidShiftRight1 ? V1 : V2;
7820         SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, TargetV);
7821         SDValue Shifted = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, V,
7822                                       DAG.getConstant(ByteShift * 8, MVT::i8));
7823         return DAG.getNode(ISD::BITCAST, DL, VT, Shifted);
7824       }
7825     }
7826
7827     // PSLLDQ : (little-endian) left byte shift
7828     // [ zz,  0,  1,  2,  3,  4,  5,  6]
7829     // [ zz, zz, -1, -1,  2,  3,  4, -1]
7830     // [ zz, zz, zz, zz, zz, zz, -1,  1]
7831     bool ZeroableLeft = true;
7832     for (int i = 0; i < Shift; i++) {
7833       ZeroableLeft &= Zeroable[i];
7834     }
7835
7836     if (ZeroableLeft) {
7837       bool ValidShiftLeft1 =
7838           isSequentialOrUndefInRange(Mask, Shift, Size - Shift, 0);
7839       bool ValidShiftLeft2 =
7840           isSequentialOrUndefInRange(Mask, Shift, Size - Shift, Size);
7841
7842       if (ValidShiftLeft1 || ValidShiftLeft2) {
7843         // Cast the inputs to v2i64 to match PSLLDQ.
7844         SDValue &TargetV = ValidShiftLeft1 ? V1 : V2;
7845         SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, TargetV);
7846         SDValue Shifted = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, V,
7847                                       DAG.getConstant(ByteShift * 8, MVT::i8));
7848         return DAG.getNode(ISD::BITCAST, DL, VT, Shifted);
7849       }
7850     }
7851   }
7852
7853   return SDValue();
7854 }
7855
7856 /// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros).
7857 ///
7858 /// Attempts to match a shuffle mask against the PSRL(W/D/Q) and PSLL(W/D/Q)
7859 /// SSE2 and AVX2 logical bit-shift instructions. The function matches
7860 /// elements from one of the input vectors shuffled to the left or right
7861 /// with zeroable elements 'shifted in'.
     ///
     /// The mask is examined in groups of \c Scale adjacent elements; each group
     /// is treated as one wider integer that is shifted by \c Shift whole
     /// elements of \p VT.
     ///
     /// \returns the shifted input bitcast back to \p VT, or an empty SDValue if
     /// no (Scale, Shift) combination matches.
7862 static SDValue lowerVectorShuffleAsBitShift(SDLoc DL, MVT VT, SDValue V1,
7863                                             SDValue V2, ArrayRef<int> Mask,
7864                                             SelectionDAG &DAG) {
7865   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
7866
7867   int Size = Mask.size();
7868   assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
7869
7870   // PSRL : (little-endian) right bit shift.
7871   // [  1, zz,  3, zz]
7872   // [ -1, -1,  7, zz]
7873   // PSLL : (little-endian) left bit shift.
7874   // [ zz, 0, zz,  2 ]
7875   // [ -1, 4, zz, -1 ]
       //
       // Tries a single (Shift, Scale) candidate. Returns the lowered node, or
       // an empty SDValue when the mask does not fit that candidate.
7876   auto MatchBitShift = [&](int Shift, int Scale) -> SDValue {
         // ShiftVT has elements Scale times as wide as VT's, and Size / Scale
         // of them.
7877     MVT ShiftSVT = MVT::getIntegerVT(VT.getScalarSizeInBits() * Scale);
7878     MVT ShiftVT = MVT::getVectorVT(ShiftSVT, Size / Scale);
7879     assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
7880            "Illegal integer vector type");
7881
         // A left shift needs the first Shift elements of every group to be
         // zeroable; a right shift needs the last Shift elements of every group
         // to be zeroable.
7882     bool MatchLeft = true, MatchRight = true;
7883     for (int i = 0; i != Size; i += Scale) {
7884       for (int j = 0; j != Shift; j++) {
7885         MatchLeft &= Zeroable[i + j];
7886       }
7887       for (int j = Scale - Shift; j != Scale; j++) {
7888         MatchRight &= Zeroable[i + j];
7889       }
7890     }
7891     if (!(MatchLeft || MatchRight))
7892       return SDValue();
7893
         // The remaining (Scale - Shift) elements of every group must be a
         // sequential (or undef) run taken from the same group of a single
         // input. When both directions have zeroable ends, only the left-shift
         // layout is checked because MatchLeft takes priority below.
7894     bool MatchV1 = true, MatchV2 = true;
7895     for (int i = 0; i != Size; i += Scale) {
           // Pos: first mask slot holding moved elements.
           // Low: first source element index expected at Pos.
7896       unsigned Pos = MatchLeft ? i + Shift : i;
7897       unsigned Low = MatchLeft ? i : i + Shift;
7898       unsigned Len = Scale - Shift;
7899       MatchV1 &= isSequentialOrUndefInRange(Mask, Pos, Len, Low);
7900       MatchV2 &= isSequentialOrUndefInRange(Mask, Pos, Len, Low + Size);
7901     }
7902     if (!(MatchV1 || MatchV2))
7903       return SDValue();
7904
7905     // Cast the inputs to ShiftVT to match VSRLI/VSHLI and back again.
         // ShiftAmt is expressed in bits: Shift elements of VT's scalar width.
         // V1 is preferred when both inputs match.
7906     unsigned OpCode = MatchLeft ? X86ISD::VSHLI : X86ISD::VSRLI;
7907     int ShiftAmt = Shift * VT.getScalarSizeInBits();
7908     SDValue V = MatchV1 ? V1 : V2;
7909     V = DAG.getNode(ISD::BITCAST, DL, ShiftVT, V);
7910     V = DAG.getNode(OpCode, DL, ShiftVT, V, DAG.getConstant(ShiftAmt, MVT::i8));
7911     return DAG.getNode(ISD::BITCAST, DL, VT, V);
7912   };
7913
7914   // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
7915   // keep doubling the size of the integer elements up to that. We can
7916   // then shift the elements of the integer vector by whole multiples of
7917   // their width within the elements of the larger integer vector. Test each
7918   // multiple to see if we can find a match with the moved element indices
7919   // and that the shifted in elements are all zeroable.
       // The first candidate that matches wins (smallest Scale, then smallest
       // Shift).
7920   for (int Scale = 2; Scale * VT.getScalarSizeInBits() <= 64; Scale *= 2)
7921     for (int Shift = 1; Shift != Scale; Shift++)
7922       if (SDValue BitShift = MatchBitShift(Shift, Scale))
7923         return BitShift;
7924
7925   // no match
7926   return SDValue();
7927 }
7928
7929 /// \brief Lower a vector shuffle as a zero or any extension.
7930 ///
7931 /// Given a specific number of elements, element bit width, and extension
7932 /// stride, produce either a zero or any extension based on the available
7933 /// features of the subtarget.
     ///
     /// \param Scale   extension stride; must be greater than 1 and
     ///                Scale * element-bits must not exceed 64 (both asserted).
     /// \param AnyExt  true when the extended (high) parts may be left
     ///                undefined instead of zeroed.
     /// \param InputV  the single source vector feeding the extension.
     /// \returns the extended value bitcast back to \p VT. Every path in this
     /// function produces a node; there is no failure return.
7934 static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
7935     SDLoc DL, MVT VT, int Scale, bool AnyExt, SDValue InputV,
7936     const X86Subtarget *Subtarget, SelectionDAG &DAG) {
7937   assert(Scale > 1 && "Need a scale to extend.");
7938   int NumElements = VT.getVectorNumElements();
7939   int EltBits = VT.getScalarSizeInBits();
7940   assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
7941          "Only 8, 16, and 32 bit elements can be extended.");
7942   assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
7943
7944   // Found a valid zext mask! Try various lowering strategies based on the
7945   // input type and available ISA extensions.
       // With SSE4.1 a single VZEXT node to the widened integer vector type is
       // used, whether or not AnyExt is set.
7946   if (Subtarget->hasSSE41()) {
7947     MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
7948                                  NumElements / Scale);
7949     return DAG.getNode(ISD::BITCAST, DL, VT,
7950                        DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV));
7951   }
7952
7953   // For any extends we can cheat for larger element sizes and use shuffle
7954   // instructions that can fold with a load and/or copy.
       // 32-bit elements: spread dwords 0 and 1 into the even result dwords and
       // leave the odd dwords undef (-1).
7955   if (AnyExt && EltBits == 32) {
7956     int PSHUFDMask[4] = {0, -1, 1, -1};
7957     return DAG.getNode(
7958         ISD::BITCAST, DL, VT,
7959         DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
7960                     DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, InputV),
7961                     getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
7962   }
       // 16-bit elements with Scale > 2: first copy dword 0 into dwords 0 and 2
       // with PSHUFD, then apply a PSHUFHW whose first mask entry is 1 (the
       // other entries are undef).
7963   if (AnyExt && EltBits == 16 && Scale > 2) {
7964     int PSHUFDMask[4] = {0, -1, 0, -1};
7965     InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
7966                          DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, InputV),
7967                          getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG));
7968     int PSHUFHWMask[4] = {1, -1, -1, -1};
7969     return DAG.getNode(
7970         ISD::BITCAST, DL, VT,
7971         DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16,
7972                     DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, InputV),
7973                     getV4X86ShuffleImm8ForMask(PSHUFHWMask, DAG)));
7974   }
7975
7976   // If this would require more than 2 unpack instructions to expand, use
7977   // pshufb when available. We can only use more than 2 unpack instructions
7978   // when zero extending i8 elements which also makes it easier to use pshufb.
7979   if (Scale > 4 && EltBits == 8 && Subtarget->hasSSSE3()) {
7980     assert(NumElements == 16 && "Unexpected byte vector width!");
         // Every Scale-th control byte selects source byte i / Scale; all other
         // control bytes are 0x80.
         // NOTE(review): 0x80 is presumably PSHUFB's "write zero" selector --
         // confirm against the ISA reference.
7981     SDValue PSHUFBMask[16];
7982     for (int i = 0; i < 16; ++i)
7983       PSHUFBMask[i] =
7984           DAG.getConstant((i % Scale == 0) ? i / Scale : 0x80, MVT::i8);
7985     InputV = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, InputV);
7986     return DAG.getNode(ISD::BITCAST, DL, VT,
7987                        DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
7988                                    DAG.getNode(ISD::BUILD_VECTOR, DL,
7989                                                MVT::v16i8, PSHUFBMask)));
7990   }
7991
7992   // Otherwise emit a sequence of unpacks.
       // Each iteration interleaves InputV (viewed at the current element width)
       // with undef (any-extend) or a zero vector (zero-extend) via UNPCKL,
       // which doubles the effective element width; the loop runs until Scale
       // has been halved down to 1.
7993   do {
7994     MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
7995     SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
7996                          : getZeroVector(InputVT, Subtarget, DAG, DL);
7997     InputV = DAG.getNode(ISD::BITCAST, DL, InputVT, InputV);
7998     InputV = DAG.getNode(X86ISD::UNPCKL, DL, InputVT, InputV, Ext);
7999     Scale /= 2;
8000     EltBits *= 2;
8001     NumElements /= 2;
8002   } while (Scale > 1);
8003   return DAG.getNode(ISD::BITCAST, DL, VT, InputV);
8004 }
8005
8006 /// \brief Try to lower a vector shuffle as a zero extension on any microarch.
8007 ///
8008 /// This routine will try to do everything in its power to cleverly lower
8009 /// a shuffle which happens to match the pattern of a zero extend. It doesn't
8010 /// check for the profitability of this lowering; it tries to aggressively
8011 /// match this pattern. It will use all of the micro-architectural details it
8012 /// can to emit an efficient lowering. It handles both blends with all-zero
8013 /// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
8014 /// masking out later).
8015 ///
8016 /// The reason we have dedicated lowering for zext-style shuffles is that they
8017 /// are both incredibly common and often quite performance sensitive.
     ///
     /// \returns the lowered node, or an empty SDValue if the mask matches
     /// neither an extension pattern nor (for 128-bit vectors) the MOVQ-style
     /// "copy low half, zero high half" pattern.
8018 static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
8019     SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
8020     const X86Subtarget *Subtarget, SelectionDAG &DAG) {
8021   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
8022
8023   int Bits = VT.getSizeInBits();
8024   int NumElements = VT.getVectorNumElements();
8025   assert(VT.getScalarSizeInBits() <= 32 &&
8026          "Exceeds 32-bit integer zero extension limit");
8027   assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
8028
8029   // Define a helper function to check a particular ext-scale and lower to it if
8030   // valid.
       // For a given Scale, mask slot i must be zeroable (or undef) when
       // i % Scale != 0, and must select element i / Scale of one consistent
       // input (or be undef) when i % Scale == 0.
8031   auto Lower = [&](int Scale) -> SDValue {
8032     SDValue InputV;
8033     bool AnyExt = true;
8034     for (int i = 0; i < NumElements; ++i) {
8035       if (Mask[i] == -1)
8036         continue; // Valid anywhere but doesn't tell us anything.
8037       if (i % Scale != 0) {
8038         // Each of the extended elements need to be zeroable.
8039         if (!Zeroable[i])
8040           return SDValue();
8041
8042         // We no longer are in the anyext case.
8043         AnyExt = false;
8044         continue;
8045       }
8046
8047       // Each of the base elements needs to be consecutive indices into the
8048       // same input vector.
8049       SDValue V = Mask[i] < NumElements ? V1 : V2;
8050       if (!InputV)
8051         InputV = V;
8052       else if (InputV != V)
8053         return SDValue(); // Flip-flopping inputs.
8054
           // Mask[i] % NumElements is the element index within whichever input
           // was selected.
8055       if (Mask[i] % NumElements != i / Scale)
8056         return SDValue(); // Non-consecutive strided elements.
8057     }
8058
8059     // If we fail to find an input, we have a zero-shuffle which should always
8060     // have already been handled.
8061     // FIXME: Maybe handle this here in case during blending we end up with one?
8062     if (!InputV)
8063       return SDValue();
8064
8065     return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
8066         DL, VT, Scale, AnyExt, InputV, Subtarget, DAG);
8067   };
8068
8069   // The widest scale possible for extending is to a 64-bit integer.
8070   assert(Bits % 64 == 0 &&
8071          "The number of bits in a vector must be divisible by 64 on x86!");
8072   int NumExtElements = Bits / 64;
8073
8074   // Each iteration, try extending the elements half as much, but into twice as
8075   // many elements.
       // The widest extension is tried first; the first Scale that matches wins.
8076   for (; NumExtElements < NumElements; NumExtElements *= 2) {
8077     assert(NumElements % NumExtElements == 0 &&
8078            "The input vector size must be divisible by the extended size.");
8079     if (SDValue V = Lower(NumElements / NumExtElements))
8080       return V;
8081   }
8082
8083   // General extends failed, but 128-bit vectors may be able to use MOVQ.
8084   if (Bits != 128)
8085     return SDValue();
8086
8087   // Returns one of the source operands if the shuffle can be reduced to a
8088   // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
       // Requires every element of the upper half to be zeroable and the lower
       // half to be the sequential (or undef) low half of V1 or of V2; V1 is
       // checked first. Returns an empty SDValue otherwise.
8089   auto CanZExtLowHalf = [&]() {
8090     for (int i = NumElements / 2; i != NumElements; i++)
8091       if (!Zeroable[i])
8092         return SDValue();
8093     if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
8094       return V1;
8095     if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
8096       return V2;
8097     return SDValue();
8098   };
8099
       // Emit the move as VZEXT_MOVL on the v2i64 view of the chosen input.
8100   if (SDValue V = CanZExtLowHalf()) {
8101     V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, V);
8102     V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
8103     return DAG.getNode(ISD::BITCAST, DL, VT, V);
8104   }
8105
8106   // No viable ext lowering found.
8107   return SDValue();
8108 }
8109
8110 /// \brief Try to get a scalar value for a specific element of a vector.
8111 ///
8112 /// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
     ///
     /// Any chain of BITCAST nodes on \p V is peeled first. The element type is
     /// taken from \p V as passed in (before peeling), and the scalar that is
     /// found is bitcast to that element type.
     ///
     /// \param Idx index of the wanted element. For a SCALAR_TO_VECTOR source
     ///            only index 0 is supported.
     /// \returns the scalar, or an empty SDValue if the underlying node is not
     /// a vector with the same scalar width or is not one of the two supported
     /// opcodes.
8113 static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
8114                                               SelectionDAG &DAG) {
8115   MVT VT = V.getSimpleValueType();
8116   MVT EltVT = VT.getVectorElementType();
8117   while (V.getOpcode() == ISD::BITCAST)
8118     V = V.getOperand(0);
8119   // If the bitcasts shift the element size, we can't extract an equivalent
8120   // element from it.
8121   MVT NewVT = V.getSimpleValueType();
8122   if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
8123     return SDValue();
8124
       // Operand Idx of the node is the scalar for element Idx.
       // NOTE(review): Idx is used directly as an operand index; callers are
       // presumably expected to pass a non-negative, in-range value -- confirm.
8125   if (V.getOpcode() == ISD::BUILD_VECTOR ||
8126       (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR))
8127     return DAG.getNode(ISD::BITCAST, SDLoc(V), EltVT, V.getOperand(Idx));
8128
8129   return SDValue();
8130 }
8131
8132 /// \brief Helper to test for a load that can be folded with x86 shuffles.
8133 ///
8134 /// This is particularly important because the set of instructions varies
8135 /// significantly based on whether the operand is a load or not.
     ///
     /// \returns true when \p V, after looking through any chain of BITCAST
     /// nodes, satisfies ISD::isNON_EXTLoad.
8136 static bool isShuffleFoldableLoad(SDValue V) {
       // Look through bitcasts to reach the underlying node.
8137   while (V.getOpcode() == ISD::BITCAST)
8138     V = V.getOperand(0);
8139
8140   return ISD::isNON_EXTLoad(V.getNode());
8141 }
8142
8143 /// \brief Try to lower insertion of a single element into a zero vector.
8144 ///
8145 /// This is a common pattern that we have especially efficient patterns to lower
8146 /// across all subtarget feature sets.
     ///
     /// The inserted element is the first mask entry that refers to \p V2 (an
     /// index >= Mask.size()). If the remaining lanes are not all zeroable, only
     /// a floating point insertion into lane 0 over an otherwise unpermuted
     /// \p V1 is handled, and only without SSE4.1.
     ///
     /// \returns the lowered node, or an empty SDValue when none of the
     /// supported patterns applies.
     ///
     /// NOTE(review): if no mask entry refers to V2, V2Index equals Mask.size()
     /// and Mask[V2Index] below reads past the mask. The callers visible in this
     /// file only call in with exactly one V2 element -- confirm for all callers.
8147 static SDValue lowerVectorShuffleAsElementInsertion(
8148     MVT VT, SDLoc DL, SDValue V1, SDValue V2, ArrayRef<int> Mask,
8149     const X86Subtarget *Subtarget, SelectionDAG &DAG) {
8150   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
8151   MVT ExtVT = VT;
8152   MVT EltVT = VT.getVectorElementType();
8153
       // Position in the mask of the first element taken from V2.
8154   int V2Index = std::find_if(Mask.begin(), Mask.end(),
8155                              [&Mask](int M) { return M >= (int)Mask.size(); }) -
8156                 Mask.begin();
       // True when every lane other than V2Index is zeroable.
8157   bool IsV1Zeroable = true;
8158   for (int i = 0, Size = Mask.size(); i < Size; ++i)
8159     if (i != V2Index && !Zeroable[i]) {
8160       IsV1Zeroable = false;
8161       break;
8162     }
8163
8164   // Check for a single input from a SCALAR_TO_VECTOR node.
8165   // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
8166   // all the smarts here sunk into that routine. However, the current
8167   // lowering of BUILD_VECTOR makes that nearly impossible until the old
8168   // vector shuffle lowering is dead.
8169   if (SDValue V2S = getScalarValueForVectorElement(
8170           V2, Mask[V2Index] - Mask.size(), DAG)) {
8171     // We need to zext the scalar if it is smaller than an i32.
8172     V2S = DAG.getNode(ISD::BITCAST, DL, EltVT, V2S);
8173     if (EltVT == MVT::i8 || EltVT == MVT::i16) {
8174       // Using zext to expand a narrow element won't work for non-zero
8175       // insertions.
8176       if (!IsV1Zeroable)
8177         return SDValue();
8178
8179       // Zero-extend directly to i32.
8180       ExtVT = MVT::v4i32;
8181       V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
8182     }
         // Rebuild V2 as a vector whose element 0 is the scalar being inserted.
8183     V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
8184   } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
8185              EltVT == MVT::i16) {
8186     // Either not inserting from the low element of the input or the input
8187     // element size is too small to use VZEXT_MOVL to clear the high bits.
8188     return SDValue();
8189   }
8190
8191   if (!IsV1Zeroable) {
8192     // If V1 can't be treated as a zero vector we have fewer options to lower
8193     // this. We can't support integer vectors or non-zero targets cheaply, and
8194     // the V1 elements can't be permuted in any way.
8195     assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
8196     if (!VT.isFloatingPoint() || V2Index != 0)
8197       return SDValue();
         // Ignoring the inserted lane, the rest of the mask must be a no-op
         // shuffle of V1.
8198     SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
8199     V1Mask[V2Index] = -1;
8200     if (!isNoopShuffleMask(V1Mask))
8201       return SDValue();
8202     // This is essentially a special case blend operation, but if we have
8203     // general purpose blend operations, they are always faster. Bail and let
8204     // the rest of the lowering handle these as blends.
8205     if (Subtarget->hasSSE41())
8206       return SDValue();
8207
8208     // Otherwise, use MOVSD or MOVSS.
8209     assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
8210            "Only two types of floating point element types to handle!");
8211     return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
8212                        ExtVT, V1, V2);
8213   }
8214
       // All other lanes are zeroable: apply VZEXT_MOVL to V2, then view the
       // result as VT again if the scalar was widened above.
8215   V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
8216   if (ExtVT != VT)
8217     V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2);
8218
8219   if (V2Index != 0) {
8220     // If we have 4 or fewer lanes we can cheaply shuffle the element into
8221     // the desired position. Otherwise it is more efficient to do a vector
8222     // shift left. We know that we can do a vector shift left because all
8223     // the inputs are zero.
8224     if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
           // Every lane reads element 1 of V2 except lane V2Index, which reads
           // element 0 (the inserted value).
8225       SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
8226       V2Shuffle[V2Index] = 0;
8227       V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
8228     } else {
           // The shift amount is V2Index elements expressed in bits.
           // NOTE(review): the bitcast to v2i64 presumes VT is 128 bits wide on
           // this path -- confirm against callers.
8229       V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, V2);
8230       V2 = DAG.getNode(
8231           X86ISD::VSHLDQ, DL, MVT::v2i64, V2,
8232           DAG.getConstant(
8233               V2Index * EltVT.getSizeInBits(),
8234               DAG.getTargetLoweringInfo().getScalarShiftAmountTy(MVT::v2i64)));
8235       V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2);
8236     }
8237   }
8238   return V2;
8239 }
8240
8241 /// \brief Try to lower broadcast of a single element.
8242 ///
8243 /// For convenience, this code also bundles all of the subtarget feature set
8244 /// filtering. While a little annoying to re-dispatch on type here, there isn't
8245 /// a convenient way to factor it out.
8246 static SDValue lowerVectorShuffleAsBroadcast(MVT VT, SDLoc DL, SDValue V,
8247                                              ArrayRef<int> Mask,
8248                                              const X86Subtarget *Subtarget,
8249                                              SelectionDAG &DAG) {
8250   if (!Subtarget->hasAVX())
8251     return SDValue();
8252   if (VT.isInteger() && !Subtarget->hasAVX2())
8253     return SDValue();
8254
8255   // Check that the mask is a broadcast.
8256   int BroadcastIdx = -1;
8257   for (int M : Mask)
8258     if (M >= 0 && BroadcastIdx == -1)
8259       BroadcastIdx = M;
8260     else if (M >= 0 && M != BroadcastIdx)
8261       return SDValue();
8262
8263   assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
8264                                             "a sorted mask where the broadcast "
8265                                             "comes from V1.");
8266
8267   // Go up the chain of (vector) values to try and find a scalar load that
8268   // we can combine with the broadcast.
8269   for (;;) {
8270     switch (V.getOpcode()) {
8271     case ISD::CONCAT_VECTORS: {
8272       int OperandSize = Mask.size() / V.getNumOperands();
8273       V = V.getOperand(BroadcastIdx / OperandSize);
8274       BroadcastIdx %= OperandSize;
8275       continue;
8276     }
8277
8278     case ISD::INSERT_SUBVECTOR: {
8279       SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
8280       auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
8281       if (!ConstantIdx)
8282         break;
8283
8284       int BeginIdx = (int)ConstantIdx->getZExtValue();
8285       int EndIdx =
8286           BeginIdx + (int)VInner.getValueType().getVectorNumElements();
8287       if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) {
8288         BroadcastIdx -= BeginIdx;
8289         V = VInner;
8290       } else {
8291         V = VOuter;
8292       }
8293       continue;
8294     }
8295     }
8296     break;
8297   }
8298
8299   // Check if this is a broadcast of a scalar. We special case lowering
8300   // for scalars so that we can more effectively fold with loads.
8301   if (V.getOpcode() == ISD::BUILD_VECTOR ||
8302       (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
8303     V = V.getOperand(BroadcastIdx);
8304
8305     // If the scalar isn't a load we can't broadcast from it in AVX1, only with
8306     // AVX2.
8307     if (!Subtarget->hasAVX2() && !isShuffleFoldableLoad(V))
8308       return SDValue();
8309   } else if (BroadcastIdx != 0 || !Subtarget->hasAVX2()) {
8310     // We can't broadcast from a vector register w/o AVX2, and we can only
8311     // broadcast from the zero-element of a vector register.
8312     return SDValue();
8313   }
8314
8315   return DAG.getNode(X86ISD::VBROADCAST, DL, VT, V);
8316 }
8317
8318 // Check for whether we can use INSERTPS to perform the shuffle. We only use
8319 // INSERTPS when the V1 elements are already in the correct locations
8320 // because otherwise we can just always use two SHUFPS instructions which
8321 // are much smaller to encode than a SHUFPS and an INSERTPS. We can also
8322 // perform INSERTPS if a single V1 element is out of place and all V2
8323 // elements are zeroable.
     //
     // Returns an X86ISD::INSERTPS node, or an empty SDValue if more than one
     // non-zeroable element is out of place or there is no element to insert.
8324 static SDValue lowerVectorShuffleAsInsertPS(SDValue Op, SDValue V1, SDValue V2,
8325                                             ArrayRef<int> Mask,
8326                                             SelectionDAG &DAG) {
8327   assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!");
8328   assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
8329   assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
8330   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
8331
8332   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
8333
       // ZMask:      bit i set when result lane i is zeroable.
       // V1DstIndex: lane that needs an out-of-place V1 element (-1 if none).
       // V2DstIndex: lane that needs a V2 element (-1 if none).
8334   unsigned ZMask = 0;
8335   int V1DstIndex = -1;
8336   int V2DstIndex = -1;
8337   bool V1UsedInPlace = false;
8338
8339   for (int i = 0; i < 4; i++) {
8340     // Synthesize a zero mask from the zeroable elements (includes undefs).
8341     if (Zeroable[i]) {
8342       ZMask |= 1 << i;
8343       continue;
8344     }
8345
8346     // Flag if we use any V1 inputs in place.
8347     if (i == Mask[i]) {
8348       V1UsedInPlace = true;
8349       continue;
8350     }
8351
8352     // We can only insert a single non-zeroable element.
8353     if (V1DstIndex != -1 || V2DstIndex != -1)
8354       return SDValue();
8355
8356     if (Mask[i] < 4) {
8357       // V1 input out of place for insertion.
8358       V1DstIndex = i;
8359     } else {
8360       // V2 input for insertion.
8361       V2DstIndex = i;
8362     }
8363   }
8364
8365   // Don't bother if we have no (non-zeroable) element for insertion.
8366   if (V1DstIndex == -1 && V2DstIndex == -1)
8367     return SDValue();
8368
8369   // Determine element insertion src/dst indices. The src index is from the
8370   // start of the inserted vector, not the start of the concatenated vector.
8371   unsigned V2SrcIndex = 0;
8372   if (V1DstIndex != -1) {
8373     // If we have a V1 input out of place, we use V1 as the V2 element insertion
8374     // and don't use the original V2 at all.
8375     V2SrcIndex = Mask[V1DstIndex];
8376     V2DstIndex = V1DstIndex;
8377     V2 = V1;
8378   } else {
8379     V2SrcIndex = Mask[V2DstIndex] - 4;
8380   }
8381
8382   // If no V1 inputs are used in place, then the result is created only from
8383   // the zero mask and the V2 insertion - so remove V1 dependency.
8384   if (!V1UsedInPlace)
8385     V1 = DAG.getUNDEF(MVT::v4f32);
8386
       // Immediate layout as built here: source lane in bits 7:6, destination
       // lane in bits 5:4, zero mask in bits 3:0.
8387   unsigned InsertPSMask = V2SrcIndex << 6 | V2DstIndex << 4 | ZMask;
8388   assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
8389
8390   // Insert the V2 element into the desired position.
8391   SDLoc DL(Op);
8392   return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
8393                      DAG.getConstant(InsertPSMask, MVT::i8));
8394 }
8395
8396 /// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
8397 ///
8398 /// This is the basis function for the 2-lane 64-bit shuffles as we have full
8399 /// support for floating point shuffles but not integer shuffles. These
8400 /// instructions will incur a domain crossing penalty on some chips though so
8401 /// it is better to avoid lowering through this for integer vectors where
8402 /// possible.
     ///
     /// \p Op must be a ShuffleVectorSDNode of type v2f64 whose operands are
     /// \p V1 and \p V2. Strategies are tried in the order written and the first
     /// that applies is used; the final SHUFP fallback always produces a node.
8403 static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
8404                                        const X86Subtarget *Subtarget,
8405                                        SelectionDAG &DAG) {
8406   SDLoc DL(Op);
8407   assert(Op.getSimpleValueType() == MVT::v2f64 && "Bad shuffle type!");
8408   assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
8409   assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
8410   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
8411   ArrayRef<int> Mask = SVOp->getMask();
8412   assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
8413
8414   if (isSingleInputShuffleMask(Mask)) {
8415     // Use low duplicate instructions for masks that match their pattern.
8416     if (Subtarget->hasSSE3())
8417       if (isShuffleEquivalent(Mask, 0, 0))
8418         return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, V1);
8419
8420     // Straight shuffle of a single input vector. Simulate this by using the
8421     // single input as both of the "inputs" to this instruction.
         // Bit i of the immediate is set when result lane i reads element 1.
8422     unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
8423
8424     if (Subtarget->hasAVX()) {
8425       // If we have AVX, we can use VPERMILPS which will allow folding a load
8426       // into the shuffle.
8427       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
8428                          DAG.getConstant(SHUFPDMask, MVT::i8));
8429     }
8430
8431     return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V1,
8432                        DAG.getConstant(SHUFPDMask, MVT::i8));
8433   }
       // From here on lane 0 comes from V1 and lane 1 comes from V2.
8434   assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!");
8435   assert(Mask[1] >= 2 && "Non-canonicalized blend!");
8436
8437   // Use dedicated unpack instructions for masks that match their pattern.
8438   if (isShuffleEquivalent(Mask, 0, 2))
8439     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2f64, V1, V2);
8440   if (isShuffleEquivalent(Mask, 1, 3))
8441     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2f64, V1, V2);
8442
8443   // If we have a single input, insert that into V1 if we can do so cheaply.
8444   if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
8445     if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
8446             MVT::v2f64, DL, V1, V2, Mask, Subtarget, DAG))
8447       return Insertion;
8448     // Try inverting the insertion since for v2 masks it is easy to do and we
8449     // can't reliably sort the mask one way or the other.
         // XOR with 2 moves an index to the same element of the other input;
         // undef entries stay -1.
8450     int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
8451                           Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
8452     if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
8453             MVT::v2f64, DL, V2, V1, InverseMask, Subtarget, DAG))
8454       return Insertion;
8455   }
8456
8457   // Try to use one of the special instruction patterns to handle two common
8458   // blend patterns if a zero-blend above didn't work.
8459   if (isShuffleEquivalent(Mask, 0, 3) || isShuffleEquivalent(Mask, 1, 3))
8460     if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
8461       // We can either use a special instruction to load over the low double or
8462       // to move just the low double.
8463       return DAG.getNode(
8464           isShuffleFoldableLoad(V1S) ? X86ISD::MOVLPD : X86ISD::MOVSD,
8465           DL, MVT::v2f64, V2,
8466           DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
8467
8468   if (Subtarget->hasSSE41())
8469     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
8470                                                   Subtarget, DAG))
8471       return Blend;
8472
       // Fallback: SHUFP on both inputs. Bit 0 is set when lane 0 reads element 1
       // of V1; bit 1 is set when lane 1 reads element 1 of V2 (mask index 3).
8473   unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
8474   return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V2,
8475                      DAG.getConstant(SHUFPDMask, MVT::i8));
8476 }
8477
8478 /// \brief Handle lowering of 2-lane 64-bit integer shuffles.
8479 ///
8480 /// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
8481 /// the integer unit to minimize domain crossing penalties. However, for blends
8482 /// it falls back to the floating point shuffle operation with appropriate bit
8483 /// casting.
     ///
     /// \p Op must be a ShuffleVectorSDNode of type v2i64 whose operands are
     /// \p V1 and \p V2. Strategies are tried in the order written and the first
     /// that applies is used; the final v2f64 shuffle fallback always produces a
     /// node.
8484 static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
8485                                        const X86Subtarget *Subtarget,
8486                                        SelectionDAG &DAG) {
8487   SDLoc DL(Op);
8488   assert(Op.getSimpleValueType() == MVT::v2i64 && "Bad shuffle type!");
8489   assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
8490   assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
8491   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
8492   ArrayRef<int> Mask = SVOp->getMask();
8493   assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
8494
8495   if (isSingleInputShuffleMask(Mask)) {
8496     // Check for being able to broadcast a single element.
8497     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v2i64, DL, V1,
8498                                                           Mask, Subtarget, DAG))
8499       return Broadcast;
8500
8501     // Straight shuffle of a single input vector. For everything from SSE2
8502     // onward this has a single fast instruction with no scary immediates.
8503     // We have to map the mask as it is actually a v4i32 shuffle instruction.
         // Each 64-bit index M becomes the dword pair (2*M, 2*M + 1); undef (-1)
         // entries are clamped to element 0 by std::max.
8504     V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V1);
8505     int WidenedMask[4] = {
8506         std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
8507         std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
8508     return DAG.getNode(
8509         ISD::BITCAST, DL, MVT::v2i64,
8510         DAG.getNode(X86ISD::PSHUFD, SDLoc(Op), MVT::v4i32, V1,
8511                     getV4X86ShuffleImm8ForMask(WidenedMask, DAG)));
8512   }
8513
8514   // Try to use byte shift instructions.
8515   if (SDValue Shift = lowerVectorShuffleAsByteShift(
8516           DL, MVT::v2i64, V1, V2, Mask, DAG))
8517     return Shift;
8518
8519   // If we have a single input from V2 insert that into V1 if we can do so
8520   // cheaply.
8521   if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
8522     if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
8523             MVT::v2i64, DL, V1, V2, Mask, Subtarget, DAG))
8524       return Insertion;
8525     // Try inverting the insertion since for v2 masks it is easy to do and we
8526     // can't reliably sort the mask one way or the other.
         // XOR with 2 moves an index to the same element of the other input;
         // undef entries stay -1.
8527     int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
8528                           Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
8529     if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
8530             MVT::v2i64, DL, V2, V1, InverseMask, Subtarget, DAG))
8531       return Insertion;
8532   }
8533
8534   // Use dedicated unpack instructions for masks that match their pattern.
8535   if (isShuffleEquivalent(Mask, 0, 2))
8536     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, V1, V2);
8537   if (isShuffleEquivalent(Mask, 1, 3))
8538     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2i64, V1, V2);
8539
8540   if (Subtarget->hasSSE41())
8541     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
8542                                                   Subtarget, DAG))
8543       return Blend;
8544
8545   // Try to use byte rotation instructions.
8546   // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
8547   if (Subtarget->hasSSSE3())
8548     if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
8549             DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
8550       return Rotate;
8551
8552   // We implement this with SHUFPD which is pretty lame because it will likely
8553   // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
8554   // However, all the alternatives are still more cycles and newer chips don't
8555   // have this problem. It would be really nice if x86 had better shuffles here.
       // The shuffle is re-expressed on the v2f64 views of both inputs with the
       // same mask, and the result is bitcast back to v2i64.
8556   V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, V1);
8557   V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, V2);
8558   return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64,
8559                      DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
8560 }
8561
8562 /// \brief Lower a vector shuffle using the SHUFPS instruction.
8563 ///
8564 /// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
8565 /// It makes no assumptions about whether this is the *best* lowering, it simply
8566 /// uses it.
     ///
     /// \p Mask is treated as a four-lane mask: entries 0..3 are copied into a
     /// local mask and rewritten. Entries >= 4 are counted as selecting from
     /// \p V2; smaller entries are treated as selecting from \p V1 (one branch
     /// additionally tests for -1, which the comments there describe as undef).
     ///
     /// Only masks with exactly one or exactly two V2 entries are rewritten. For
     /// any other count the original mask and the operand order (V1, V2) are
     /// encoded as-is.
     /// NOTE(review): presumably callers never reach this with zero, three or
     /// four V2 entries -- confirm against the call sites.
     ///
     /// \returns an X86ISD::SHUFP node of type \p VT. At most one additional
     /// X86ISD::SHUFP node is created first, to gather elements of both inputs
     /// into a single vector.
8567 static SDValue lowerVectorShuffleWithSHUFPS(SDLoc DL, MVT VT,
8568                                             ArrayRef<int> Mask, SDValue V1,
8569                                             SDValue V2, SelectionDAG &DAG) {
       // LowV and HighV become the first and second operand of the final SHUFP
       // node; NewMask is the four-entry mask encoded into its immediate.
8570   SDValue LowV = V1, HighV = V2;
8571   int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};
8572
       // Number of mask entries that select from V2 (index >= 4).
8573   int NumV2Elements =
8574       std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
8575
8576   if (NumV2Elements == 1) {
         // Position (0..3) of the single V2 entry within the mask.
8577     int V2Index =
8578         std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) -
8579         Mask.begin();
8580
8581     // Compute the index adjacent to V2Index and in the same half by toggling
8582     // the low bit.
8583     int V2AdjIndex = V2Index ^ 1;
8584
8585     if (Mask[V2AdjIndex] == -1) {
8586       // Handles all the cases where we have a single V2 element and an undef.
8587       // This will only ever happen in the high lanes because we commute the
8588       // vector otherwise.
           // NOTE(review): the swap below still covers a V2 element in the low
           // half, so this path does not depend on the claim above.
8589       if (V2Index < 2)
8590         std::swap(LowV, HighV);
           // Rebase the V2 entry so that it indexes within V2 itself.
8591       NewMask[V2Index] -= 4;
8592     } else {
8593       // Handle the case where the V2 element ends up adjacent to a V1 element.
8594       // To make this work, blend them together as the first step.
8595       int V1Index = V2AdjIndex;
           // The temporary is built with V2 as first operand and V1 as second.
           // Entries 0 and 2 pick the wanted V2 and V1 elements; entries 1 and 3
           // are filler (index 0) because those lanes are not referenced by the
           // NewMask entries written below.
8596       int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
8597       V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
8598                        getV4X86ShuffleImm8ForMask(BlendMask, DAG));
8599
8600       // Now proceed to reconstruct the final blend as we have the necessary
8601       // high or low half formed.
8602       if (V2Index < 2) {
8603         LowV = V2;
8604         HighV = V1;
8605       } else {
             // LowV keeps its initial value, V1.
8606         HighV = V2;
8607       }
8608       NewMask[V1Index] = 2; // We put the V1 element in V2[2].
8609       NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
8610     }
8611   } else if (NumV2Elements == 2) {
8612     if (Mask[0] < 4 && Mask[1] < 4) {
8613       // Handle the easy case where we have V1 in the low lanes and V2 in the
8614       // high lanes.
8615       NewMask[2] -= 4;
8616       NewMask[3] -= 4;
8617     } else if (Mask[2] < 4 && Mask[3] < 4) {
8618       // We also handle the reversed case because this utility may get called
8619       // when we detect a SHUFPS pattern but can't easily commute the shuffle to
8620       // arrange things in the right direction.
8621       NewMask[0] -= 4;
8622       NewMask[1] -= 4;
8623       HighV = V1;
8624       LowV = V2;
8625     } else {
8626       // We have a mixture of V1 and V2 in both low and high lanes. Rather than
8627       // trying to place elements directly, just blend them and set up the final
8628       // shuffle to place them.
8629
8630       // The first two blend mask elements are for V1, the second two are for
8631       // V2.
           // Resulting entry layout of the blend mask:
           //   [0] V1 entry from the low half    [1] V1 entry from the high half
           //   [2] V2 entry from the low half    [3] V2 entry from the high half
           // (the V2 entries are rebased by -4 to index within V2).
8632       int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
8633                           Mask[2] < 4 ? Mask[2] : Mask[3],
8634                           (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
8635                           (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
8636       V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
8637                        getV4X86ShuffleImm8ForMask(BlendMask, DAG));
8638
8639       // Now we do a normal shuffle of V1 by giving V1 as both operands to
8640       // a blend.
8641       LowV = HighV = V1;
           // Low half reads blended lanes 0 and 2, high half reads lanes 1 and 3;
           // the order within each half depends on which slot held the V1 entry.
8642       NewMask[0] = Mask[0] < 4 ? 0 : 2;
8643       NewMask[1] = Mask[0] < 4 ? 2 : 0;
8644       NewMask[2] = Mask[2] < 4 ? 1 : 3;
8645       NewMask[3] = Mask[2] < 4 ? 3 : 1;
8646     }
8647   }
       // Emit the final SHUFP with whatever operands and mask were settled on.
8648   return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
8649                      getV4X86ShuffleImm8ForMask(NewMask, DAG));
8650 }
8651
8652 /// \brief Lower 4-lane 32-bit floating point shuffles.
8653 ///
8654 /// Uses instructions exclusively from the floating point unit to minimize
8655 /// domain crossing penalties, as these are sufficient to implement all v4f32
8656 /// shuffles.
     ///
     /// The strategies are tried in the order they appear in the body and the
     /// first one that yields a node is returned, so the order of the checks is
     /// part of the behavior:
     ///   - no V2 entries: broadcast, MOVSLDUP/MOVSHDUP (SSE3), VPERMILPI (AVX),
     ///     otherwise SHUFP with V1 as both operands;
     ///   - otherwise: UNPCKL/UNPCKH, element insertion (only when the single V2
     ///     entry is mask element 0), blend and INSERTPS (SSE4.1), and finally
     ///     lowerVectorShuffleWithSHUFPS.
     ///
     /// \param Op the shuffle node; it is cast to ShuffleVectorSDNode to read the
     ///        mask and must have type v4f32 (asserted).
     /// \param V1 first shuffle input, asserted to be v4f32.
     /// \param V2 second shuffle input, asserted to be v4f32.
     /// \param Subtarget queried for SSE3 / SSE4.1 / AVX availability and passed
     ///        on to the helper lowerings.
     /// \param DAG the SelectionDAG used to create the replacement nodes.
     /// \returns the node that replaces the shuffle.
8657 static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
8658                                        const X86Subtarget *Subtarget,
8659                                        SelectionDAG &DAG) {
8660   SDLoc DL(Op);
8661   assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!");
8662   assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
8663   assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
8664   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
8665   ArrayRef<int> Mask = SVOp->getMask();
8666   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
8667
       // Number of mask entries that select from V2 (index >= 4).
8668   int NumV2Elements =
8669       std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
8670
       // Single-input case: every strategy in this block uses only V1, and the
       // block always returns.
8671   if (NumV2Elements == 0) {
8672     // Check for being able to broadcast a single element.
8673     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4f32, DL, V1,
8674                                                           Mask, Subtarget, DAG))
8675       return Broadcast;
8676
8677     // Use even/odd duplicate instructions for masks that match their pattern.
8678     if (Subtarget->hasSSE3()) {
8679       if (isShuffleEquivalent(Mask, 0, 0, 2, 2))
8680         return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
8681       if (isShuffleEquivalent(Mask, 1, 1, 3, 3))
8682         return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
8683     }
8684
8685     if (Subtarget->hasAVX()) {
8686       // If we have AVX, we can use VPERMILPS which will allow folding a load
8687       // into the shuffle.
8688       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
8689                          getV4X86ShuffleImm8ForMask(Mask, DAG));
8690     }
8691
8692     // Otherwise, use a straight shuffle of a single input vector. We pass the
8693     // input vector to both operands to simulate this with a SHUFPS.
8694     return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
8695                        getV4X86ShuffleImm8ForMask(Mask, DAG));
8696   }
8697
       // From here on at least one mask entry selects from V2.
8698   // Use dedicated unpack instructions for masks that match their pattern.
8699   if (isShuffleEquivalent(Mask, 0, 4, 1, 5))
8700     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f32, V1, V2);
8701   if (isShuffleEquivalent(Mask, 2, 6, 3, 7))
8702     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V1, V2);
8703
8704   // There are special ways we can lower some single-element blends. However, we
8705   // have custom ways we can lower more complex single-element blends below that
8706   // we defer to if both this and BLENDPS fail to match, so restrict this to
8707   // when the V2 input is targeting element 0 of the mask -- that is the fast
8708   // case here.
8709   if (NumV2Elements == 1 && Mask[0] >= 4)
8710     if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v4f32, DL, V1, V2,
8711                                                          Mask, Subtarget, DAG))
8712       return V;
8713
       // Both SSE4.1 strategies may decline (return a null SDValue), in which
       // case control falls through to the SHUFPS fallback.
8714   if (Subtarget->hasSSE41()) {
8715     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
8716                                                   Subtarget, DAG))
8717       return Blend;
8718
8719     // Use INSERTPS if we can complete the shuffle efficiently.
8720     if (SDValue V = lowerVectorShuffleAsInsertPS(Op, V1, V2, Mask, DAG))
8721       return V;
8722   }
8723
8724   // Otherwise fall back to a SHUFPS lowering strategy.
8725   return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
8726 }
8727
8728 /// \brief Lower 4-lane i32 vector shuffles.
8729 ///
8730 /// We try to handle these with integer-domain shuffles where we can, but for
8731 /// blends we use the floating point domain blend instructions.
     ///
     /// The strategies are tried in the order they appear in the body and the
     /// first one that yields a node is returned, so the order of the checks is
     /// part of the behavior:
     ///   - zero/any extend (tried before anything else);
     ///   - no V2 entries: broadcast, otherwise PSHUFD;
     ///   - otherwise: bit shift, byte shift, element insertion (exactly one V2
     ///     entry), blend (SSE4.1), bit mask, UNPCKL/UNPCKH, byte rotate (SSSE3),
     ///     and finally a v4f32 vector shuffle wrapped in bitcasts.
     ///
     /// \param Op the shuffle node; it is cast to ShuffleVectorSDNode to read the
     ///        mask and must have type v4i32 (asserted).
     /// \param V1 first shuffle input, asserted to be v4i32.
     /// \param V2 second shuffle input, asserted to be v4i32.
     /// \param Subtarget queried for SSE4.1 / SSSE3 availability and passed on to
     ///        the helper lowerings.
     /// \param DAG the SelectionDAG used to create the replacement nodes.
     /// \returns the node that replaces the shuffle.
8732 static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
8733                                        const X86Subtarget *Subtarget,
8734                                        SelectionDAG &DAG) {
8735   SDLoc DL(Op);
8736   assert(Op.getSimpleValueType() == MVT::v4i32 && "Bad shuffle type!");
8737   assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
8738   assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
8739   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
8740   ArrayRef<int> Mask = SVOp->getMask();
8741   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
8742
8743   // Whenever we can lower this as a zext, that instruction is strictly faster
8744   // than any alternative. It also allows us to fold memory operands into the
8745   // shuffle in many cases.
8746   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2,
8747                                                          Mask, Subtarget, DAG))
8748     return ZExt;
8749
       // Number of mask entries that select from V2 (index >= 4).
8750   int NumV2Elements =
8751       std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
8752
       // Single-input case: this block uses only V1 and always returns.
8753   if (NumV2Elements == 0) {
8754     // Check for being able to broadcast a single element.
8755     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4i32, DL, V1,
8756                                                           Mask, Subtarget, DAG))
8757       return Broadcast;
8758
8759     // Straight shuffle of a single input vector. For everything from SSE2
8760     // onward this has a single fast instruction with no scary immediates.
8761     // We coerce the shuffle pattern to be compatible with UNPCK instructions
8762     // but we aren't actually going to use the UNPCK instruction because doing
8763     // so prevents folding a load into this instruction or making a copy.
         // Mask is an ArrayRef, so the assignments below only rebind the view to
         // these local arrays; the arrays live in this scope and are consumed by
         // the return statement at the end of it.
         // NOTE(review): presumably isShuffleEquivalent also accepts masks with
         // undef lanes, which is what makes the replacement meaningful -- confirm.
8764     const int UnpackLoMask[] = {0, 0, 1, 1};
8765     const int UnpackHiMask[] = {2, 2, 3, 3};
8766     if (isShuffleEquivalent(Mask, 0, 0, 1, 1))
8767       Mask = UnpackLoMask;
8768     else if (isShuffleEquivalent(Mask, 2, 2, 3, 3))
8769       Mask = UnpackHiMask;
8770
8771     return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
8772                        getV4X86ShuffleImm8ForMask(Mask, DAG));
8773   }
8774
       // From here on at least one mask entry selects from V2.
8775   // Try to use bit shift instructions.
8776   if (SDValue Shift = lowerVectorShuffleAsBitShift(
8777           DL, MVT::v4i32, V1, V2, Mask, DAG))
8778     return Shift;
8779
8780   // Try to use byte shift instructions.
8781   if (SDValue Shift = lowerVectorShuffleAsByteShift(
8782           DL, MVT::v4i32, V1, V2, Mask, DAG))
8783     return Shift;
8784
8785   // There are special ways we can lower some single-element blends.
8786   if (NumV2Elements == 1)
8787     if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v4i32, DL, V1, V2,
8788                                                          Mask, Subtarget, DAG))
8789       return V;
8790
8791   if (Subtarget->hasSSE41())
8792     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
8793                                                   Subtarget, DAG))
8794       return Blend;
8795
8796   if (SDValue Masked =
8797           lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask, DAG))
8798     return Masked;
8799
8800   // Use dedicated unpack instructions for masks that match their pattern.
8801   if (isShuffleEquivalent(Mask, 0, 4, 1, 5))
8802     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i32, V1, V2);
8803   if (isShuffleEquivalent(Mask, 2, 6, 3, 7))
8804     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V1, V2);
8805
8806   // Try to use byte rotation instructions.
8807   // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
8808   if (Subtarget->hasSSSE3())
8809     if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
8810             DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
8811       return Rotate;
8812
8813   // We implement this with SHUFPS because it can blend from two vectors.
8814   // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
8815   // up the inputs, bypassing domain shift penalties that we would incur if we
8816   // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
8817   // relevant.
       // Both inputs are bitcast to v4f32, shuffled there with the same mask via
       // a generic vector shuffle node, and the result is bitcast back to v4i32.
8818   return DAG.getNode(ISD::BITCAST, DL, MVT::v4i32,
8819                      DAG.getVectorShuffle(
8820                          MVT::v4f32, DL,
8821                          DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, V1),
8822                          DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, V2), Mask));
8823 }
8824
8825 /// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
8826 /// shuffle lowering, and the most complex part.
8827 ///
8828 /// The lowering strategy is to try to form pairs of input lanes which are
8829 /// targeted at the same half of the final vector, and then use a dword shuffle
8830 /// to place them onto the right half, and finally unpack the paired lanes into
8831 /// their final position.
8832 ///
8833 /// The exact breakdown of how to form these dword pairs and align them on the
8834 /// correct sides is really tricky. See the comments within the function for
8835 /// more of the details.
8836 static SDValue lowerV8I16SingleInputVectorShuffle(
8837     SDLoc DL, SDValue V, MutableArrayRef<int> Mask,
8838     const X86Subtarget *Subtarget, SelectionDAG &DAG) {
8839   assert(V.getSimpleValueType() == MVT::v8i16 && "Bad input type!");
8840   MutableArrayRef<int> LoMask = Mask.slice(0, 4);
8841   MutableArrayRef<int> HiMask = Mask.slice(4, 4);
8842
8843   SmallVector<int, 4> LoInputs;
8844   std::copy_if(LoMask.begin(), LoMask.end(), std::back_inserter(LoInputs),
8845                [](int M) { return M >= 0; });
8846   std::sort(LoInputs.begin(), LoInputs.end());
8847   LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
8848   SmallVector<int, 4> HiInputs;
8849   std::copy_if(HiMask.begin(), HiMask.end(), std::back_inserter(HiInputs),
8850                [](int M) { return M >= 0; });
8851   std::sort(HiInputs.begin(), HiInputs.end());
8852   HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
8853   int NumLToL =
8854       std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin();
8855   int NumHToL = LoInputs.size() - NumLToL;
8856   int NumLToH =
8857       std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin();
8858   int NumHToH = HiInputs.size() - NumLToH;
8859   MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
8860   MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
8861   MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
8862   MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
8863
8864   // Check for being able to broadcast a single element.
8865   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8i16, DL, V,
8866                                                         Mask, Subtarget, DAG))
8867     return Broadcast;
8868
8869   // Try to use bit shift instructions.
8870   if (SDValue Shift = lowerVectorShuffleAsBitShift(
8871           DL, MVT::v8i16, V, V, Mask, DAG))
8872     return Shift;
8873
8874   // Try to use byte shift instructions.
8875   if (SDValue Shift = lowerVectorShuffleAsByteShift(
8876           DL, MVT::v8i16, V, V, Mask, DAG))
8877     return Shift;
8878
8879   // Use dedicated unpack instructions for masks that match their pattern.
8880   if (isShuffleEquivalent(Mask, 0, 0, 1, 1, 2, 2, 3, 3))
8881     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V, V);
8882   if (isShuffleEquivalent(Mask, 4, 4, 5, 5, 6, 6, 7, 7))
8883     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V, V);
8884
8885   // Try to use byte rotation instructions.
8886   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
8887           DL, MVT::v8i16, V, V, Mask, Subtarget, DAG))
8888     return Rotate;
8889
8890   // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
8891   // such inputs we can swap two of the dwords across the half mark and end up
8892   // with <=2 inputs to each half in each half. Once there, we can fall through
8893   // to the generic code below. For example:
8894   //
8895   // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
8896   // Mask:  [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
8897   //
8898   // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
8899   // and an existing 2-into-2 on the other half. In this case we may have to
8900   // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
8901   // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
8902   // Fortunately, we don't have to handle anything but a 2-into-2 pattern
8903   // because any other situation (including a 3-into-1 or 1-into-3 in the other
8904   // half than the one we target for fixing) will be fixed when we re-enter this
8905   // path. We will also combine away any sequence of PSHUFD instructions that
8906   // result into a single instruction. Here is an example of the tricky case:
8907   //
8908   // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
8909   // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
8910   //
8911   // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
8912   //
8913   // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
8914   // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
8915   //
8916   // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
8917   // Mask:  [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
8918   //
8919   // The result is fine to be handled by the generic logic.
8920   auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
8921                           ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
8922                           int AOffset, int BOffset) {
8923     assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
8924            "Must call this with A having 3 or 1 inputs from the A half.");
8925     assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
8926            "Must call this with B having 1 or 3 inputs from the B half.");
8927     assert(AToAInputs.size() + BToAInputs.size() == 4 &&
8928            "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
8929
8930     // Compute the index of dword with only one word among the three inputs in
8931     // a half by taking the sum of the half with three inputs and subtracting
8932     // the sum of the actual three inputs. The difference is the remaining
8933     // slot.
8934     int ADWord, BDWord;
8935     int &TripleDWord = AToAInputs.size() == 3 ? ADWord : BDWord;
8936     int &OneInputDWord = AToAInputs.size() == 3 ? BDWord : ADWord;
8937     int TripleInputOffset = AToAInputs.size() == 3 ? AOffset : BOffset;
8938     ArrayRef<int> TripleInputs = AToAInputs.size() == 3 ? AToAInputs : BToAInputs;
8939     int OneInput = AToAInputs.size() == 3 ? BToAInputs[0] : AToAInputs[0];
8940     int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
8941     int TripleNonInputIdx =
8942         TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
8943     TripleDWord = TripleNonInputIdx / 2;
8944
8945     // We use xor with one to compute the adjacent DWord to whichever one the
8946     // OneInput is in.
8947     OneInputDWord = (OneInput / 2) ^ 1;
8948
8949     // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
8950     // and BToA inputs. If there is also such a problem with the BToB and AToB
8951     // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
8952     // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
8953     // is essential that we don't *create* a 3<-1 as then we might oscillate.
8954     if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
8955       // Compute how many inputs will be flipped by swapping these DWords. We
8956       // need
8957       // to balance this to ensure we don't form a 3-1 shuffle in the other
8958       // half.
8959       int NumFlippedAToBInputs =
8960           std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
8961           std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
8962       int NumFlippedBToBInputs =
8963           std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
8964           std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
8965       if ((NumFlippedAToBInputs == 1 &&
8966            (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
8967           (NumFlippedBToBInputs == 1 &&
8968            (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
8969         // We choose whether to fix the A half or B half based on whether that
8970         // half has zero flipped inputs. At zero, we may not be able to fix it
8971         // with that half. We also bias towards fixing the B half because that
8972         // will more commonly be the high half, and we have to bias one way.
8973         auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
8974                                                        ArrayRef<int> Inputs) {
8975           int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
8976           bool IsFixIdxInput = std::find(Inputs.begin(), Inputs.end(),
8977                                          PinnedIdx ^ 1) != Inputs.end();
8978           // Determine whether the free index is in the flipped dword or the
8979           // unflipped dword based on where the pinned index is. We use this bit
8980           // in an xor to conditionally select the adjacent dword.
8981           int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
8982           bool IsFixFreeIdxInput = std::find(Inputs.begin(), Inputs.end(),
8983                                              FixFreeIdx) != Inputs.end();
8984           if (IsFixIdxInput == IsFixFreeIdxInput)
8985             FixFreeIdx += 1;
8986           IsFixFreeIdxInput = std::find(Inputs.begin(), Inputs.end(),
8987                                         FixFreeIdx) != Inputs.end();
8988           assert(IsFixIdxInput != IsFixFreeIdxInput &&
8989                  "We need to be changing the number of flipped inputs!");
8990           int PSHUFHalfMask[] = {0, 1, 2, 3};
8991           std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
8992           V = DAG.getNode(FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
8993                           MVT::v8i16, V,
8994                           getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DAG));
8995
8996           for (int &M : Mask)
8997             if (M != -1 && M == FixIdx)
8998               M = FixFreeIdx;
8999             else if (M != -1 && M == FixFreeIdx)
9000               M = FixIdx;
9001         };
9002         if (NumFlippedBToBInputs != 0) {
9003           int BPinnedIdx =
9004               BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
9005           FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
9006         } else {
9007           assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
9008           int APinnedIdx =
9009               AToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
9010           FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
9011         }
9012       }
9013     }
9014
9015     int PSHUFDMask[] = {0, 1, 2, 3};
9016     PSHUFDMask[ADWord] = BDWord;
9017     PSHUFDMask[BDWord] = ADWord;
9018     V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
9019                     DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
9020                                 DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V),
9021                                 getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
9022
9023     // Adjust the mask to match the new locations of A and B.
9024     for (int &M : Mask)
9025       if (M != -1 && M/2 == ADWord)
9026         M = 2 * BDWord + M % 2;
9027       else if (M != -1 && M/2 == BDWord)
9028         M = 2 * ADWord + M % 2;
9029
9030     // Recurse back into this routine to re-compute state now that this isn't
9031     // a 3 and 1 problem.
9032     return DAG.getVectorShuffle(MVT::v8i16, DL, V, DAG.getUNDEF(MVT::v8i16),
9033                                 Mask);
9034   };
9035   if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
9036     return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
9037   else if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
9038     return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
9039
9040   // At this point there are at most two inputs to the low and high halves from
9041   // each half. That means the inputs can always be grouped into dwords and
9042   // those dwords can then be moved to the correct half with a dword shuffle.
9043   // We use at most one low and one high word shuffle to collect these paired
9044   // inputs into dwords, and finally a dword shuffle to place them.
9045   int PSHUFLMask[4] = {-1, -1, -1, -1};
9046   int PSHUFHMask[4] = {-1, -1, -1, -1};
9047   int PSHUFDMask[4] = {-1, -1, -1, -1};
9048
9049   // First fix the masks for all the inputs that are staying in their
9050   // original halves. This will then dictate the targets of the cross-half
9051   // shuffles.
9052   auto fixInPlaceInputs =
9053       [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
9054                     MutableArrayRef<int> SourceHalfMask,
9055                     MutableArrayRef<int> HalfMask, int HalfOffset) {
9056     if (InPlaceInputs.empty())
9057       return;
9058     if (InPlaceInputs.size() == 1) {
9059       SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
9060           InPlaceInputs[0] - HalfOffset;
9061       PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
9062       return;
9063     }
9064     if (IncomingInputs.empty()) {
9065       // Just fix all of the in place inputs.
9066       for (int Input : InPlaceInputs) {
9067         SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
9068         PSHUFDMask[Input / 2] = Input / 2;
9069       }
9070       return;
9071     }
9072
9073     assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
9074     SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
9075         InPlaceInputs[0] - HalfOffset;
9076     // Put the second input next to the first so that they are packed into
9077     // a dword. We find the adjacent index by toggling the low bit.
9078     int AdjIndex = InPlaceInputs[0] ^ 1;
9079     SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
9080     std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
9081     PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
9082   };
9083   fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
9084   fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
9085
9086   // Now gather the cross-half inputs and place them into a free dword of
9087   // their target half.
9088   // FIXME: This operation could almost certainly be simplified dramatically to
9089   // look more like the 3-1 fixing operation.
9090   auto moveInputsToRightHalf = [&PSHUFDMask](
9091       MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
9092       MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
9093       MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
9094       int DestOffset) {
9095     auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
9096       return SourceHalfMask[Word] != -1 && SourceHalfMask[Word] != Word;
9097     };
9098     auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
9099                                                int Word) {
9100       int LowWord = Word & ~1;
9101       int HighWord = Word | 1;
9102       return isWordClobbered(SourceHalfMask, LowWord) ||
9103              isWordClobbered(SourceHalfMask, HighWord);
9104     };
9105
9106     if (IncomingInputs.empty())
9107       return;
9108
9109     if (ExistingInputs.empty()) {
9110       // Map any dwords with inputs from them into the right half.
9111       for (int Input : IncomingInputs) {
9112         // If the source half mask maps over the inputs, turn those into
9113         // swaps and use the swapped lane.
9114         if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
9115           if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] == -1) {
9116             SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
9117                 Input - SourceOffset;
9118             // We have to swap the uses in our half mask in one sweep.
9119             for (int &M : HalfMask)
9120               if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
9121                 M = Input;
9122               else if (M == Input)
9123                 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
9124           } else {
9125             assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
9126                        Input - SourceOffset &&
9127                    "Previous placement doesn't match!");
9128           }
9129           // Note that this correctly re-maps both when we do a swap and when
9130           // we observe the other side of the swap above. We rely on that to
9131           // avoid swapping the members of the input list directly.
9132           Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
9133         }
9134
9135         // Map the input's dword into the correct half.
9136         if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] == -1)
9137           PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
9138         else
9139           assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
9140                      Input / 2 &&
9141                  "Previous placement doesn't match!");
9142       }
9143
9144       // And just directly shift any other-half mask elements to be same-half
9145       // as we will have mirrored the dword containing the element into the
9146       // same position within that half.
9147       for (int &M : HalfMask)
9148         if (M >= SourceOffset && M < SourceOffset + 4) {
9149           M = M - SourceOffset + DestOffset;
9150           assert(M >= 0 && "This should never wrap below zero!");
9151         }
9152       return;
9153     }
9154
9155     // Ensure we have the input in a viable dword of its current half. This
9156     // is particularly tricky because the original position may be clobbered
9157     // by inputs being moved and *staying* in that half.
9158     if (IncomingInputs.size() == 1) {
9159       if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
9160         int InputFixed = std::find(std::begin(SourceHalfMask),
9161                                    std::end(SourceHalfMask), -1) -
9162                          std::begin(SourceHalfMask) + SourceOffset;
9163         SourceHalfMask[InputFixed - SourceOffset] =
9164             IncomingInputs[0] - SourceOffset;
9165         std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
9166                      InputFixed);
9167         IncomingInputs[0] = InputFixed;
9168       }
9169     } else if (IncomingInputs.size() == 2) {
9170       if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
9171           isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
9172         // We have two non-adjacent or clobbered inputs we need to extract from
9173         // the source half. To do this, we need to map them into some adjacent
9174         // dword slot in the source mask.
9175         int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
9176                               IncomingInputs[1] - SourceOffset};
9177
9178         // If there is a free slot in the source half mask adjacent to one of
9179         // the inputs, place the other input in it. We use (Index XOR 1) to
9180         // compute an adjacent index.
9181         if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
9182             SourceHalfMask[InputsFixed[0] ^ 1] == -1) {
9183           SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
9184           SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
9185           InputsFixed[1] = InputsFixed[0] ^ 1;
9186         } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
9187                    SourceHalfMask[InputsFixed[1] ^ 1] == -1) {
9188           SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
9189           SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
9190           InputsFixed[0] = InputsFixed[1] ^ 1;
9191         } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] == -1 &&
9192                    SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] == -1) {
9193           // The two inputs are in the same DWord but it is clobbered and the
9194           // adjacent DWord isn't used at all. Move both inputs to the free
9195           // slot.
9196           SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
9197           SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
9198           InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
9199           InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
9200         } else {
9201           // The only way we hit this point is if there is no clobbering
9202           // (because there are no off-half inputs to this half) and there is no
9203           // free slot adjacent to one of the inputs. In this case, we have to
9204           // swap an input with a non-input.
9205           for (int i = 0; i < 4; ++i)
9206             assert((SourceHalfMask[i] == -1 || SourceHalfMask[i] == i) &&
9207                    "We can't handle any clobbers here!");
9208           assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
9209                  "Cannot have adjacent inputs here!");
9210
9211           SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
9212           SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
9213
9214           // We also have to update the final source mask in this case because
9215           // it may need to undo the above swap.
9216           for (int &M : FinalSourceHalfMask)
9217             if (M == (InputsFixed[0] ^ 1) + SourceOffset)
9218               M = InputsFixed[1] + SourceOffset;
9219             else if (M == InputsFixed[1] + SourceOffset)
9220               M = (InputsFixed[0] ^ 1) + SourceOffset;
9221
9222           InputsFixed[1] = InputsFixed[0] ^ 1;
9223         }
9224
9225         // Point everything at the fixed inputs.
9226         for (int &M : HalfMask)
9227           if (M == IncomingInputs[0])
9228             M = InputsFixed[0] + SourceOffset;
9229           else if (M == IncomingInputs[1])
9230             M = InputsFixed[1] + SourceOffset;
9231
9232         IncomingInputs[0] = InputsFixed[0] + SourceOffset;
9233         IncomingInputs[1] = InputsFixed[1] + SourceOffset;
9234       }
9235     } else {
9236       llvm_unreachable("Unhandled input size!");
9237     }
9238
9239     // Now hoist the DWord down to the right half.
9240     int FreeDWord = (PSHUFDMask[DestOffset / 2] == -1 ? 0 : 1) + DestOffset / 2;
9241     assert(PSHUFDMask[FreeDWord] == -1 && "DWord not free");
9242     PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
9243     for (int &M : HalfMask)
9244       for (int Input : IncomingInputs)
9245         if (M == Input)
9246           M = FreeDWord * 2 + Input % 2;
9247   };
9248   moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
9249                         /*SourceOffset*/ 4, /*DestOffset*/ 0);
9250   moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
9251                         /*SourceOffset*/ 0, /*DestOffset*/ 4);
9252
9253   // Now enact all the shuffles we've computed to move the inputs into their
9254   // target half.
9255   if (!isNoopShuffleMask(PSHUFLMask))
9256     V = DAG.getNode(X86ISD::PSHUFLW, DL, MVT::v8i16, V,
9257                     getV4X86ShuffleImm8ForMask(PSHUFLMask, DAG));
9258   if (!isNoopShuffleMask(PSHUFHMask))
9259     V = DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16, V,
9260                     getV4X86ShuffleImm8ForMask(PSHUFHMask, DAG));
9261   if (!isNoopShuffleMask(PSHUFDMask))
9262     V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
9263                     DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
9264                                 DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V),
9265                                 getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
9266
9267   // At this point, each half should contain all its inputs, and we can then
9268   // just shuffle them into their final position.
9269   assert(std::count_if(LoMask.begin(), LoMask.end(),
9270                        [](int M) { return M >= 4; }) == 0 &&
9271          "Failed to lift all the high half inputs to the low mask!");
9272   assert(std::count_if(HiMask.begin(), HiMask.end(),
9273                        [](int M) { return M >= 0 && M < 4; }) == 0 &&
9274          "Failed to lift all the low half inputs to the high mask!");
9275
9276   // Do a half shuffle for the low mask.
9277   if (!isNoopShuffleMask(LoMask))
9278     V = DAG.getNode(X86ISD::PSHUFLW, DL, MVT::v8i16, V,
9279                     getV4X86ShuffleImm8ForMask(LoMask, DAG));
9280
9281   // Do a half shuffle with the high mask after shifting its values down.
9282   for (int &M : HiMask)
9283     if (M >= 0)
9284       M -= 4;
9285   if (!isNoopShuffleMask(HiMask))
9286     V = DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16, V,
9287                     getV4X86ShuffleImm8ForMask(HiMask, DAG));
9288
9289   return V;
9290 }
9291
9292 /// \brief Detect whether the mask pattern should be lowered through
9293 /// interleaving.
9294 ///
9295 /// This essentially tests whether viewing the mask as an interleaving of two
9296 /// sub-sequences reduces the cross-input traffic of a blend operation. If so,
9297 /// lowering it through interleaving is a significantly better strategy.
9298 static bool shouldLowerAsInterleaving(ArrayRef<int> Mask) {
9299   int NumEvenInputs[2] = {0, 0};
9300   int NumOddInputs[2] = {0, 0};
9301   int NumLoInputs[2] = {0, 0};
9302   int NumHiInputs[2] = {0, 0};
9303   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
9304     if (Mask[i] < 0)
9305       continue;
9306
9307     int InputIdx = Mask[i] >= Size;
9308
9309     if (i < Size / 2)
9310       ++NumLoInputs[InputIdx];
9311     else
9312       ++NumHiInputs[InputIdx];
9313
9314     if ((i % 2) == 0)
9315       ++NumEvenInputs[InputIdx];
9316     else
9317       ++NumOddInputs[InputIdx];
9318   }
9319
9320   // The minimum number of cross-input results for both the interleaved and
9321   // split cases. If interleaving results in fewer cross-input results, return
9322   // true.
9323   int InterleavedCrosses = std::min(NumEvenInputs[1] + NumOddInputs[0],
9324                                     NumEvenInputs[0] + NumOddInputs[1]);
9325   int SplitCrosses = std::min(NumLoInputs[1] + NumHiInputs[0],
9326                               NumLoInputs[0] + NumHiInputs[1]);
9327   return InterleavedCrosses < SplitCrosses;
9328 }
9329
9330 /// \brief Blend two v8i16 vectors using a naive unpack strategy.
9331 ///
9332 /// This strategy only works when the inputs from each vector fit into a single
9333 /// half of that vector, and generally there are not so many inputs as to leave
9334 /// the in-place shuffles required highly constrained (and thus expensive). It
9335 /// shifts all the inputs into a single side of both input vectors and then
9336 /// uses an unpack to interleave these inputs in a single vector. At that
9337 /// point, we will fall back on the generic single input shuffle lowering.
9338 static SDValue lowerV8I16BasicBlendVectorShuffle(SDLoc DL, SDValue V1,
9339                                                  SDValue V2,
9340                                                  MutableArrayRef<int> Mask,
9341                                                  const X86Subtarget *Subtarget,
9342                                                  SelectionDAG &DAG) {
9343   assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad input type!");
9344   assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad input type!");
9345   SmallVector<int, 3> LoV1Inputs, HiV1Inputs, LoV2Inputs, HiV2Inputs;
9346   for (int i = 0; i < 8; ++i)
9347     if (Mask[i] >= 0 && Mask[i] < 4)
9348       LoV1Inputs.push_back(i);
9349     else if (Mask[i] >= 4 && Mask[i] < 8)
9350       HiV1Inputs.push_back(i);
9351     else if (Mask[i] >= 8 && Mask[i] < 12)
9352       LoV2Inputs.push_back(i);
9353     else if (Mask[i] >= 12)
9354       HiV2Inputs.push_back(i);
9355
9356   int NumV1Inputs = LoV1Inputs.size() + HiV1Inputs.size();
9357   int NumV2Inputs = LoV2Inputs.size() + HiV2Inputs.size();
9358   (void)NumV1Inputs;
9359   (void)NumV2Inputs;
9360   assert(NumV1Inputs > 0 && NumV1Inputs <= 3 && "At most 3 inputs supported");
9361   assert(NumV2Inputs > 0 && NumV2Inputs <= 3 && "At most 3 inputs supported");
9362   assert(NumV1Inputs + NumV2Inputs <= 4 && "At most 4 combined inputs");
9363
9364   bool MergeFromLo = LoV1Inputs.size() + LoV2Inputs.size() >=
9365                      HiV1Inputs.size() + HiV2Inputs.size();
9366
9367   auto moveInputsToHalf = [&](SDValue V, ArrayRef<int> LoInputs,
9368                               ArrayRef<int> HiInputs, bool MoveToLo,
9369                               int MaskOffset) {
9370     ArrayRef<int> GoodInputs = MoveToLo ? LoInputs : HiInputs;
9371     ArrayRef<int> BadInputs = MoveToLo ? HiInputs : LoInputs;
9372     if (BadInputs.empty())
9373       return V;
9374
9375     int MoveMask[] = {-1, -1, -1, -1, -1, -1, -1, -1};
9376     int MoveOffset = MoveToLo ? 0 : 4;
9377
9378     if (GoodInputs.empty()) {
9379       for (int BadInput : BadInputs) {
9380         MoveMask[Mask[BadInput] % 4 + MoveOffset] = Mask[BadInput] - MaskOffset;
9381         Mask[BadInput] = Mask[BadInput] % 4 + MoveOffset + MaskOffset;
9382       }
9383     } else {
9384       if (GoodInputs.size() == 2) {
9385         // If the low inputs are spread across two dwords, pack them into
9386         // a single dword.
9387         MoveMask[MoveOffset] = Mask[GoodInputs[0]] - MaskOffset;
9388         MoveMask[MoveOffset + 1] = Mask[GoodInputs[1]] - MaskOffset;
9389         Mask[GoodInputs[0]] = MoveOffset + MaskOffset;
9390         Mask[GoodInputs[1]] = MoveOffset + 1 + MaskOffset;
9391       } else {
9392         // Otherwise pin the good inputs.
9393         for (int GoodInput : GoodInputs)
9394           MoveMask[Mask[GoodInput] - MaskOffset] = Mask[GoodInput] - MaskOffset;
9395       }
9396
9397       if (BadInputs.size() == 2) {
9398         // If we have two bad inputs then there may be either one or two good
9399         // inputs fixed in place. Find a fixed input, and then find the *other*
9400         // two adjacent indices by using modular arithmetic.
9401         int GoodMaskIdx =
9402             std::find_if(std::begin(MoveMask) + MoveOffset, std::end(MoveMask),
9403                          [](int M) { return M >= 0; }) -
9404             std::begin(MoveMask);
9405         int MoveMaskIdx =
9406             ((((GoodMaskIdx - MoveOffset) & ~1) + 2) % 4) + MoveOffset;
9407         assert(MoveMask[MoveMaskIdx] == -1 && "Expected empty slot");
9408         assert(MoveMask[MoveMaskIdx + 1] == -1 && "Expected empty slot");
9409         MoveMask[MoveMaskIdx] = Mask[BadInputs[0]] - MaskOffset;
9410         MoveMask[MoveMaskIdx + 1] = Mask[BadInputs[1]] - MaskOffset;
9411         Mask[BadInputs[0]] = MoveMaskIdx + MaskOffset;
9412         Mask[BadInputs[1]] = MoveMaskIdx + 1 + MaskOffset;
9413       } else {
9414         assert(BadInputs.size() == 1 && "All sizes handled");
9415         int MoveMaskIdx = std::find(std::begin(MoveMask) + MoveOffset,
9416                                     std::end(MoveMask), -1) -
9417                           std::begin(MoveMask);
9418         MoveMask[MoveMaskIdx] = Mask[BadInputs[0]] - MaskOffset;
9419         Mask[BadInputs[0]] = MoveMaskIdx + MaskOffset;
9420       }
9421     }
9422
9423     return DAG.getVectorShuffle(MVT::v8i16, DL, V, DAG.getUNDEF(MVT::v8i16),
9424                                 MoveMask);
9425   };
9426   V1 = moveInputsToHalf(V1, LoV1Inputs, HiV1Inputs, MergeFromLo,
9427                         /*MaskOffset*/ 0);
9428   V2 = moveInputsToHalf(V2, LoV2Inputs, HiV2Inputs, MergeFromLo,
9429                         /*MaskOffset*/ 8);
9430
9431   // FIXME: Select an interleaving of the merge of V1 and V2 that minimizes
9432   // cross-half traffic in the final shuffle.
9433
9434   // Munge the mask to be a single-input mask after the unpack merges the
9435   // results.
9436   for (int &M : Mask)
9437     if (M != -1)
9438       M = 2 * (M % 4) + (M / 8);
9439
9440   return DAG.getVectorShuffle(
9441       MVT::v8i16, DL, DAG.getNode(MergeFromLo ? X86ISD::UNPCKL : X86ISD::UNPCKH,
9442                                   DL, MVT::v8i16, V1, V2),
9443       DAG.getUNDEF(MVT::v8i16), Mask);
9444 }
9445
9446 /// \brief Generic lowering of 8-lane i16 shuffles.
9447 ///
9448 /// This handles both single-input shuffles and combined shuffle/blends with
9449 /// two inputs. The single input shuffles are immediately delegated to
9450 /// a dedicated lowering routine.
9451 ///
9452 /// The blends are lowered in one of three fundamental ways. If there are few
9453 /// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
9454 /// of the input is significantly cheaper when lowered as an interleaving of
9455 /// the two inputs, try to interleave them. Otherwise, blend the low and high
9456 /// halves of the inputs separately (making them have relatively few inputs)
9457 /// and then concatenate them.
9458 static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
9459                                        const X86Subtarget *Subtarget,
9460                                        SelectionDAG &DAG) {
9461   SDLoc DL(Op);
9462   assert(Op.getSimpleValueType() == MVT::v8i16 && "Bad shuffle type!");
9463   assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
9464   assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
9465   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
9466   ArrayRef<int> OrigMask = SVOp->getMask();
9467   int MaskStorage[8] = {OrigMask[0], OrigMask[1], OrigMask[2], OrigMask[3],
9468                         OrigMask[4], OrigMask[5], OrigMask[6], OrigMask[7]};
9469   MutableArrayRef<int> Mask(MaskStorage);
9470
9471   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
9472
9473   // Whenever we can lower this as a zext, that instruction is strictly faster
9474   // than any alternative.
9475   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
9476           DL, MVT::v8i16, V1, V2, OrigMask, Subtarget, DAG))
9477     return ZExt;
9478
9479   auto isV1 = [](int M) { return M >= 0 && M < 8; };
9480   auto isV2 = [](int M) { return M >= 8; };
9481
9482   int NumV1Inputs = std::count_if(Mask.begin(), Mask.end(), isV1);
9483   int NumV2Inputs = std::count_if(Mask.begin(), Mask.end(), isV2);
9484
9485   if (NumV2Inputs == 0)
9486     return lowerV8I16SingleInputVectorShuffle(DL, V1, Mask, Subtarget, DAG);
9487
9488   assert(NumV1Inputs > 0 && "All single-input shuffles should be canonicalized "
9489                             "to be V1-input shuffles.");
9490
9491   // Try to use bit shift instructions.
9492   if (SDValue Shift = lowerVectorShuffleAsBitShift(
9493           DL, MVT::v8i16, V1, V2, Mask, DAG))
9494     return Shift;
9495
9496   // Try to use byte shift instructions.
9497   if (SDValue Shift = lowerVectorShuffleAsByteShift(
9498           DL, MVT::v8i16, V1, V2, Mask, DAG))
9499     return Shift;
9500
9501   // There are special ways we can lower some single-element blends.
9502   if (NumV2Inputs == 1)
9503     if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v8i16, DL, V1, V2,
9504                                                          Mask, Subtarget, DAG))
9505       return V;
9506
9507   if (Subtarget->hasSSE41())
9508     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
9509                                                   Subtarget, DAG))
9510       return Blend;
9511
9512   if (SDValue Masked =
9513           lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask, DAG))
9514     return Masked;
9515
9516   // Use dedicated unpack instructions for masks that match their pattern.
9517   if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 2, 10, 3, 11))
9518     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V1, V2);
9519   if (isShuffleEquivalent(Mask, 4, 12, 5, 13, 6, 14, 7, 15))
9520     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V1, V2);
9521
9522   // Try to use byte rotation instructions.
9523   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
9524           DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
9525     return Rotate;
9526
9527   if (NumV1Inputs + NumV2Inputs <= 4)
9528     return lowerV8I16BasicBlendVectorShuffle(DL, V1, V2, Mask, Subtarget, DAG);
9529
9530   // Check whether an interleaving lowering is likely to be more efficient.
9531   // This isn't perfect but it is a strong heuristic that tends to work well on
9532   // the kinds of shuffles that show up in practice.
9533   //
9534   // FIXME: Handle 1x, 2x, and 4x interleaving.
9535   if (shouldLowerAsInterleaving(Mask)) {
9536     // FIXME: Figure out whether we should pack these into the low or high
9537     // halves.
9538
9539     int EMask[8], OMask[8];
9540     for (int i = 0; i < 4; ++i) {
9541       EMask[i] = Mask[2*i];
9542       OMask[i] = Mask[2*i + 1];
9543       EMask[i + 4] = -1;
9544       OMask[i + 4] = -1;
9545     }
9546
9547     SDValue Evens = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, EMask);
9548     SDValue Odds = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, OMask);
9549
9550     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, Evens, Odds);
9551   }
9552
9553   int LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9554   int HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9555
9556   for (int i = 0; i < 4; ++i) {
9557     LoBlendMask[i] = Mask[i];
9558     HiBlendMask[i] = Mask[i + 4];
9559   }
9560
9561   SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, LoBlendMask);
9562   SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, HiBlendMask);
9563   LoV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, LoV);
9564   HiV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, HiV);
9565
9566   return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
9567                      DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, LoV, HiV));
9568 }
9569
9570 /// \brief Check whether a compaction lowering can be done by dropping even
9571 /// elements and compute how many times even elements must be dropped.
9572 ///
9573 /// This handles shuffles which take every Nth element where N is a power of
9574 /// two. Example shuffle masks:
9575 ///
9576 ///  N = 1:  0,  2,  4,  6,  8, 10, 12, 14,  0,  2,  4,  6,  8, 10, 12, 14
9577 ///  N = 1:  0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
9578 ///  N = 2:  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12
9579 ///  N = 2:  0,  4,  8, 12, 16, 20, 24, 28,  0,  4,  8, 12, 16, 20, 24, 28
9580 ///  N = 3:  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8
9581 ///  N = 3:  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24
9582 ///
9583 /// Any of these lanes can of course be undef.
9584 ///
9585 /// This routine only supports N <= 3.
9586 /// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
9587 /// for larger N.
9588 ///
9589 /// \returns N above, or the number of times even elements must be dropped if
9590 /// there is such a number. Otherwise returns zero.
9591 static int canLowerByDroppingEvenElements(ArrayRef<int> Mask) {
9592   // Figure out whether we're looping over two inputs or just one.
9593   bool IsSingleInput = isSingleInputShuffleMask(Mask);
9594
9595   // The modulus for the shuffle vector entries is based on whether this is
9596   // a single input or not.
9597   int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
9598   assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
9599          "We should only be called with masks with a power-of-2 size!");
9600
9601   uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
9602
9603   // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
9604   // and 2^3 simultaneously. This is because we may have ambiguity with
9605   // partially undef inputs.
9606   bool ViableForN[3] = {true, true, true};
9607
9608   for (int i = 0, e = Mask.size(); i < e; ++i) {
9609     // Ignore undef lanes, we'll optimistically collapse them to the pattern we
9610     // want.
9611     if (Mask[i] == -1)
9612       continue;
9613
9614     bool IsAnyViable = false;
9615     for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
9616       if (ViableForN[j]) {
9617         uint64_t N = j + 1;
9618
9619         // The shuffle mask must be equal to (i * 2^N) % M.
9620         if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
9621           IsAnyViable = true;
9622         else
9623           ViableForN[j] = false;
9624       }
9625     // Early exit if we exhaust the possible powers of two.
9626     if (!IsAnyViable)
9627       break;
9628   }
9629
9630   for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
9631     if (ViableForN[j])
9632       return j + 1;
9633
9634   // Return 0 as there is no viable power of two.
9635   return 0;
9636 }
9637
9638 /// \brief Generic lowering of v16i8 shuffles.
9639 ///
9640 /// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
9641 /// detect any complexity reducing interleaving. If that doesn't help, it uses
9642 /// UNPCK to spread the i8 elements across two i16-element vectors, and uses
9643 /// the existing lowering for v8i16 blends on each half, finally PACK-ing them
9644 /// back together.
9645 static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
9646                                        const X86Subtarget *Subtarget,
9647                                        SelectionDAG &DAG) {
9648   SDLoc DL(Op);
9649   assert(Op.getSimpleValueType() == MVT::v16i8 && "Bad shuffle type!");
9650   assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
9651   assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
9652   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
9653   ArrayRef<int> OrigMask = SVOp->getMask();
9654   assert(OrigMask.size() == 16 && "Unexpected mask size for v16 shuffle!");
9655
9656   // Try to use bit shift instructions.
9657   if (SDValue Shift = lowerVectorShuffleAsBitShift(
9658           DL, MVT::v16i8, V1, V2, OrigMask, DAG))
9659     return Shift;
9660
9661   // Try to use byte shift instructions.
9662   if (SDValue Shift = lowerVectorShuffleAsByteShift(
9663           DL, MVT::v16i8, V1, V2, OrigMask, DAG))
9664     return Shift;
9665
9666   // Try to use byte rotation instructions.
9667   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
9668           DL, MVT::v16i8, V1, V2, OrigMask, Subtarget, DAG))
9669     return Rotate;
9670
9671   // Try to use a zext lowering.
9672   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
9673           DL, MVT::v16i8, V1, V2, OrigMask, Subtarget, DAG))
9674     return ZExt;
9675
9676   int MaskStorage[16] = {
9677       OrigMask[0],  OrigMask[1],  OrigMask[2],  OrigMask[3],
9678       OrigMask[4],  OrigMask[5],  OrigMask[6],  OrigMask[7],
9679       OrigMask[8],  OrigMask[9],  OrigMask[10], OrigMask[11],
9680       OrigMask[12], OrigMask[13], OrigMask[14], OrigMask[15]};
9681   MutableArrayRef<int> Mask(MaskStorage);
9682   MutableArrayRef<int> LoMask = Mask.slice(0, 8);
9683   MutableArrayRef<int> HiMask = Mask.slice(8, 8);
9684
9685   int NumV2Elements =
9686       std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 16; });
9687
9688   // For single-input shuffles, there are some nicer lowering tricks we can use.
9689   if (NumV2Elements == 0) {
9690     // Check for being able to broadcast a single element.
9691     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v16i8, DL, V1,
9692                                                           Mask, Subtarget, DAG))
9693       return Broadcast;
9694
9695     // Check whether we can widen this to an i16 shuffle by duplicating bytes.
9696     // Notably, this handles splat and partial-splat shuffles more efficiently.
9697     // However, it only makes sense if the pre-duplication shuffle simplifies
9698     // things significantly. Currently, this means we need to be able to
9699     // express the pre-duplication shuffle as an i16 shuffle.
9700     //
9701     // FIXME: We should check for other patterns which can be widened into an
9702     // i16 shuffle as well.
9703     auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
9704       for (int i = 0; i < 16; i += 2)
9705         if (Mask[i] != -1 && Mask[i + 1] != -1 && Mask[i] != Mask[i + 1])
9706           return false;
9707
9708       return true;
9709     };
9710     auto tryToWidenViaDuplication = [&]() -> SDValue {
9711       if (!canWidenViaDuplication(Mask))
9712         return SDValue();
9713       SmallVector<int, 4> LoInputs;
9714       std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(LoInputs),
9715                    [](int M) { return M >= 0 && M < 8; });
9716       std::sort(LoInputs.begin(), LoInputs.end());
9717       LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
9718                      LoInputs.end());
9719       SmallVector<int, 4> HiInputs;
9720       std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(HiInputs),
9721                    [](int M) { return M >= 8; });
9722       std::sort(HiInputs.begin(), HiInputs.end());
9723       HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
9724                      HiInputs.end());
9725
9726       bool TargetLo = LoInputs.size() >= HiInputs.size();
9727       ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
9728       ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
9729
9730       int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
9731       SmallDenseMap<int, int, 8> LaneMap;
9732       for (int I : InPlaceInputs) {
9733         PreDupI16Shuffle[I/2] = I/2;
9734         LaneMap[I] = I;
9735       }
9736       int j = TargetLo ? 0 : 4, je = j + 4;
9737       for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
9738         // Check if j is already a shuffle of this input. This happens when
9739         // there are two adjacent bytes after we move the low one.
9740         if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
9741           // If we haven't yet mapped the input, search for a slot into which
9742           // we can map it.
9743           while (j < je && PreDupI16Shuffle[j] != -1)
9744             ++j;
9745
9746           if (j == je)
9747             // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
9748             return SDValue();
9749
9750           // Map this input with the i16 shuffle.
9751           PreDupI16Shuffle[j] = MovingInputs[i] / 2;
9752         }
9753
9754         // Update the lane map based on the mapping we ended up with.
9755         LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
9756       }
9757       V1 = DAG.getNode(
9758           ISD::BITCAST, DL, MVT::v16i8,
9759           DAG.getVectorShuffle(MVT::v8i16, DL,
9760                                DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1),
9761                                DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
9762
9763       // Unpack the bytes to form the i16s that will be shuffled into place.
9764       V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
9765                        MVT::v16i8, V1, V1);
9766
9767       int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9768       for (int i = 0; i < 16; ++i)
9769         if (Mask[i] != -1) {
9770           int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
9771           assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
9772           if (PostDupI16Shuffle[i / 2] == -1)
9773             PostDupI16Shuffle[i / 2] = MappedMask;
9774           else
9775             assert(PostDupI16Shuffle[i / 2] == MappedMask &&
9776                    "Conflicting entrties in the original shuffle!");
9777         }
9778       return DAG.getNode(
9779           ISD::BITCAST, DL, MVT::v16i8,
9780           DAG.getVectorShuffle(MVT::v8i16, DL,
9781                                DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1),
9782                                DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
9783     };
9784     if (SDValue V = tryToWidenViaDuplication())
9785       return V;
9786   }
9787
9788   // Check whether an interleaving lowering is likely to be more efficient.
9789   // This isn't perfect but it is a strong heuristic that tends to work well on
9790   // the kinds of shuffles that show up in practice.
9791   //
9792   // FIXME: We need to handle other interleaving widths (i16, i32, ...).
9793   if (shouldLowerAsInterleaving(Mask)) {
9794     int NumLoHalf = std::count_if(Mask.begin(), Mask.end(), [](int M) {
9795       return (M >= 0 && M < 8) || (M >= 16 && M < 24);
9796     });
9797     int NumHiHalf = std::count_if(Mask.begin(), Mask.end(), [](int M) {
9798       return (M >= 8 && M < 16) || M >= 24;
9799     });
9800     int EMask[16] = {-1, -1, -1, -1, -1, -1, -1, -1,
9801                      -1, -1, -1, -1, -1, -1, -1, -1};
9802     int OMask[16] = {-1, -1, -1, -1, -1, -1, -1, -1,
9803                      -1, -1, -1, -1, -1, -1, -1, -1};
9804     bool UnpackLo = NumLoHalf >= NumHiHalf;
9805     MutableArrayRef<int> TargetEMask(UnpackLo ? EMask : EMask + 8, 8);
9806     MutableArrayRef<int> TargetOMask(UnpackLo ? OMask : OMask + 8, 8);
9807     for (int i = 0; i < 8; ++i) {
9808       TargetEMask[i] = Mask[2 * i];
9809       TargetOMask[i] = Mask[2 * i + 1];
9810     }
9811
9812     SDValue Evens = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, EMask);
9813     SDValue Odds = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, OMask);
9814
9815     return DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
9816                        MVT::v16i8, Evens, Odds);
9817   }
9818
9819   // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
9820   // with PSHUFB. It is important to do this before we attempt to generate any
9821   // blends but after all of the single-input lowerings. If the single input
9822   // lowerings can find an instruction sequence that is faster than a PSHUFB, we
9823   // want to preserve that and we can DAG combine any longer sequences into
9824   // a PSHUFB in the end. But once we start blending from multiple inputs,
9825   // the complexity of DAG combining bad patterns back into PSHUFB is too high,
9826   // and there are *very* few patterns that would actually be faster than the
9827   // PSHUFB approach because of its ability to zero lanes.
9828   //
9829   // FIXME: The only exceptions to the above are blends which are exact
9830   // interleavings with direct instructions supporting them. We currently don't
9831   // handle those well here.
9832   if (Subtarget->hasSSSE3()) {
9833     SDValue V1Mask[16];
9834     SDValue V2Mask[16];
9835     bool V1InUse = false;
9836     bool V2InUse = false;
9837     SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
9838
9839     for (int i = 0; i < 16; ++i) {
9840       if (Mask[i] == -1) {
9841         V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8);
9842       } else {
9843         const int ZeroMask = 0x80;
9844         int V1Idx = (Mask[i] < 16 ? Mask[i] : ZeroMask);
9845         int V2Idx = (Mask[i] < 16 ? ZeroMask : Mask[i] - 16);
9846         if (Zeroable[i])
9847           V1Idx = V2Idx = ZeroMask;
9848         V1Mask[i] = DAG.getConstant(V1Idx, MVT::i8);
9849         V2Mask[i] = DAG.getConstant(V2Idx, MVT::i8);
9850         V1InUse |= (ZeroMask != V1Idx);
9851         V2InUse |= (ZeroMask != V2Idx);
9852       }
9853     }
9854
9855     if (V1InUse)
9856       V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, V1,
9857                        DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V1Mask));
9858     if (V2InUse)
9859       V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, V2,
9860                        DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V2Mask));
9861
9862     // If we need shuffled inputs from both, blend the two.
9863     if (V1InUse && V2InUse)
9864       return DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
9865     if (V1InUse)
9866       return V1; // Single inputs are easy.
9867     if (V2InUse)
9868       return V2; // Single inputs are easy.
9869     // Shuffling to a zeroable vector.
9870     return getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
9871   }
9872
9873   // There are special ways we can lower some single-element blends.
9874   if (NumV2Elements == 1)
9875     if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v16i8, DL, V1, V2,
9876                                                          Mask, Subtarget, DAG))
9877       return V;
9878
9879   // Check whether a compaction lowering can be done. This handles shuffles
9880   // which take every Nth element for some even N. See the helper function for
9881   // details.
9882   //
9883   // We special case these as they can be particularly efficiently handled with
9884   // the PACKUSB instruction on x86 and they show up in common patterns of
9885   // rearranging bytes to truncate wide elements.
9886   if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask)) {
9887     // NumEvenDrops is the power of two stride of the elements. Another way of
9888     // thinking about it is that we need to drop the even elements this many
9889     // times to get the original input.
9890     bool IsSingleInput = isSingleInputShuffleMask(Mask);
9891
9892     // First we need to zero all the dropped bytes.
9893     assert(NumEvenDrops <= 3 &&
9894            "No support for dropping even elements more than 3 times.");
9895     // We use the mask type to pick which bytes are preserved based on how many
9896     // elements are dropped.
9897     MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 };
9898     SDValue ByteClearMask =
9899         DAG.getNode(ISD::BITCAST, DL, MVT::v16i8,
9900                     DAG.getConstant(0xFF, MaskVTs[NumEvenDrops - 1]));
9901     V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);
9902     if (!IsSingleInput)
9903       V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);
9904
9905     // Now pack things back together.
9906     V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1);
9907     V2 = IsSingleInput ? V1 : DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V2);
9908     SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2);
9909     for (int i = 1; i < NumEvenDrops; ++i) {
9910       Result = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, Result);
9911       Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
9912     }
9913
9914     return Result;
9915   }
9916
9917   int V1LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9918   int V1HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9919   int V2LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9920   int V2HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9921
9922   auto buildBlendMasks = [](MutableArrayRef<int> HalfMask,
9923                             MutableArrayRef<int> V1HalfBlendMask,
9924                             MutableArrayRef<int> V2HalfBlendMask) {
9925     for (int i = 0; i < 8; ++i)
9926       if (HalfMask[i] >= 0 && HalfMask[i] < 16) {
9927         V1HalfBlendMask[i] = HalfMask[i];
9928         HalfMask[i] = i;
9929       } else if (HalfMask[i] >= 16) {
9930         V2HalfBlendMask[i] = HalfMask[i] - 16;
9931         HalfMask[i] = i + 8;
9932       }
9933   };
9934   buildBlendMasks(LoMask, V1LoBlendMask, V2LoBlendMask);
9935   buildBlendMasks(HiMask, V1HiBlendMask, V2HiBlendMask);
9936
9937   SDValue Zero = getZeroVector(MVT::v8i16, Subtarget, DAG, DL);
9938
9939   auto buildLoAndHiV8s = [&](SDValue V, MutableArrayRef<int> LoBlendMask,
9940                              MutableArrayRef<int> HiBlendMask) {
9941     SDValue V1, V2;
9942     // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
9943     // them out and avoid using UNPCK{L,H} to extract the elements of V as
9944     // i16s.
9945     if (std::none_of(LoBlendMask.begin(), LoBlendMask.end(),
9946                      [](int M) { return M >= 0 && M % 2 == 1; }) &&
9947         std::none_of(HiBlendMask.begin(), HiBlendMask.end(),
9948                      [](int M) { return M >= 0 && M % 2 == 1; })) {
9949       // Use a mask to drop the high bytes.
9950       V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V);
9951       V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, V1,
9952                        DAG.getConstant(0x00FF, MVT::v8i16));
9953
9954       // This will be a single vector shuffle instead of a blend so nuke V2.
9955       V2 = DAG.getUNDEF(MVT::v8i16);
9956
9957       // Squash the masks to point directly into V1.
9958       for (int &M : LoBlendMask)
9959         if (M >= 0)
9960           M /= 2;
9961       for (int &M : HiBlendMask)
9962         if (M >= 0)
9963           M /= 2;
9964     } else {
9965       // Otherwise just unpack the low half of V into V1 and the high half into
9966       // V2 so that we can blend them as i16s.
9967       V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
9968                        DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
9969       V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
9970                        DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
9971     }
9972
9973     SDValue BlendedLo = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, LoBlendMask);
9974     SDValue BlendedHi = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, HiBlendMask);
9975     return std::make_pair(BlendedLo, BlendedHi);
9976   };
9977   SDValue V1Lo, V1Hi, V2Lo, V2Hi;
9978   std::tie(V1Lo, V1Hi) = buildLoAndHiV8s(V1, V1LoBlendMask, V1HiBlendMask);
9979   std::tie(V2Lo, V2Hi) = buildLoAndHiV8s(V2, V2LoBlendMask, V2HiBlendMask);
9980
9981   SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, V1Lo, V2Lo, LoMask);
9982   SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, V1Hi, V2Hi, HiMask);
9983
9984   return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
9985 }
9986
9987 /// \brief Dispatching routine to lower various 128-bit x86 vector shuffles.
9988 ///
9989 /// This routine breaks down the specific type of 128-bit shuffle and
9990 /// dispatches to the lowering routines accordingly.
9991 static SDValue lower128BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
9992                                         MVT VT, const X86Subtarget *Subtarget,
9993                                         SelectionDAG &DAG) {
9994   switch (VT.SimpleTy) {
9995   case MVT::v2i64:
9996     return lowerV2I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
9997   case MVT::v2f64:
9998     return lowerV2F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
9999   case MVT::v4i32:
10000     return lowerV4I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
10001   case MVT::v4f32:
10002     return lowerV4F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
10003   case MVT::v8i16:
10004     return lowerV8I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
10005   case MVT::v16i8:
10006     return lowerV16I8VectorShuffle(Op, V1, V2, Subtarget, DAG);
10007
10008   default:
10009     llvm_unreachable("Unimplemented!");
10010   }
10011 }
10012
10013 /// \brief Helper function to test whether a shuffle mask could be
10014 /// simplified by widening the elements being shuffled.
10015 ///
10016 /// Appends the mask for wider elements in WidenedMask if valid. Otherwise
10017 /// leaves it in an unspecified state.
10018 ///
10019 /// NOTE: This must handle normal vector shuffle masks and *target* vector
10020 /// shuffle masks. The latter have the special property of a '-2' representing
10021 /// a zero-ed lane of a vector.
10022 static bool canWidenShuffleElements(ArrayRef<int> Mask,
10023                                     SmallVectorImpl<int> &WidenedMask) {
10024   for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
10025     // If both elements are undef, its trivial.
10026     if (Mask[i] == SM_SentinelUndef && Mask[i + 1] == SM_SentinelUndef) {
10027       WidenedMask.push_back(SM_SentinelUndef);
10028       continue;
10029     }
10030
10031     // Check for an undef mask and a mask value properly aligned to fit with
10032     // a pair of values. If we find such a case, use the non-undef mask's value.
10033     if (Mask[i] == SM_SentinelUndef && Mask[i + 1] >= 0 && Mask[i + 1] % 2 == 1) {
10034       WidenedMask.push_back(Mask[i + 1] / 2);
10035       continue;
10036     }
10037     if (Mask[i + 1] == SM_SentinelUndef && Mask[i] >= 0 && Mask[i] % 2 == 0) {
10038       WidenedMask.push_back(Mask[i] / 2);
10039       continue;
10040     }
10041
10042     // When zeroing, we need to spread the zeroing across both lanes to widen.
10043     if (Mask[i] == SM_SentinelZero || Mask[i + 1] == SM_SentinelZero) {
10044       if ((Mask[i] == SM_SentinelZero || Mask[i] == SM_SentinelUndef) &&
10045           (Mask[i + 1] == SM_SentinelZero || Mask[i + 1] == SM_SentinelUndef)) {
10046         WidenedMask.push_back(SM_SentinelZero);
10047         continue;
10048       }
10049       return false;
10050     }
10051
10052     // Finally check if the two mask values are adjacent and aligned with
10053     // a pair.
10054     if (Mask[i] != SM_SentinelUndef && Mask[i] % 2 == 0 && Mask[i] + 1 == Mask[i + 1]) {
10055       WidenedMask.push_back(Mask[i] / 2);
10056       continue;
10057     }
10058
10059     // Otherwise we can't safely widen the elements used in this shuffle.
10060     return false;
10061   }
10062   assert(WidenedMask.size() == Mask.size() / 2 &&
10063          "Incorrect size of mask after widening the elements!");
10064
10065   return true;
10066 }
10067
10068 /// \brief Generic routine to split vector shuffle into half-sized shuffles.
10069 ///
10070 /// This routine just extracts two subvectors, shuffles them independently, and
10071 /// then concatenates them back together. This should work effectively with all
10072 /// AVX vector shuffle types.
10073 static SDValue splitAndLowerVectorShuffle(SDLoc DL, MVT VT, SDValue V1,
10074                                           SDValue V2, ArrayRef<int> Mask,
10075                                           SelectionDAG &DAG) {
10076   assert(VT.getSizeInBits() >= 256 &&
10077          "Only for 256-bit or wider vector shuffles!");
10078   assert(V1.getSimpleValueType() == VT && "Bad operand type!");
10079   assert(V2.getSimpleValueType() == VT && "Bad operand type!");
10080
10081   ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
10082   ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
10083
10084   int NumElements = VT.getVectorNumElements();
10085   int SplitNumElements = NumElements / 2;
10086   MVT ScalarVT = VT.getScalarType();
10087   MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2);
10088
10089   SDValue LoV1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V1,
10090                              DAG.getIntPtrConstant(0));
10091   SDValue HiV1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V1,
10092                              DAG.getIntPtrConstant(SplitNumElements));
10093   SDValue LoV2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V2,
10094                              DAG.getIntPtrConstant(0));
10095   SDValue HiV2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V2,
10096                              DAG.getIntPtrConstant(SplitNumElements));
10097
10098   // Now create two 4-way blends of these half-width vectors.
10099   auto HalfBlend = [&](ArrayRef<int> HalfMask) {
10100     bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
10101     SmallVector<int, 32> V1BlendMask, V2BlendMask, BlendMask;
10102     for (int i = 0; i < SplitNumElements; ++i) {
10103       int M = HalfMask[i];
10104       if (M >= NumElements) {
10105         if (M >= NumElements + SplitNumElements)
10106           UseHiV2 = true;
10107         else
10108           UseLoV2 = true;
10109         V2BlendMask.push_back(M - NumElements);
10110         V1BlendMask.push_back(-1);
10111         BlendMask.push_back(SplitNumElements + i);
10112       } else if (M >= 0) {
10113         if (M >= SplitNumElements)
10114           UseHiV1 = true;
10115         else
10116           UseLoV1 = true;
10117         V2BlendMask.push_back(-1);
10118         V1BlendMask.push_back(M);
10119         BlendMask.push_back(i);
10120       } else {
10121         V2BlendMask.push_back(-1);
10122         V1BlendMask.push_back(-1);
10123         BlendMask.push_back(-1);
10124       }
10125     }
10126
10127     // Because the lowering happens after all combining takes place, we need to
10128     // manually combine these blend masks as much as possible so that we create
10129     // a minimal number of high-level vector shuffle nodes.
10130
10131     // First try just blending the halves of V1 or V2.
10132     if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
10133       return DAG.getUNDEF(SplitVT);
10134     if (!UseLoV2 && !UseHiV2)
10135       return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
10136     if (!UseLoV1 && !UseHiV1)
10137       return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
10138
10139     SDValue V1Blend, V2Blend;
10140     if (UseLoV1 && UseHiV1) {
10141       V1Blend =
10142         DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
10143     } else {
10144       // We only use half of V1 so map the usage down into the final blend mask.
10145       V1Blend = UseLoV1 ? LoV1 : HiV1;
10146       for (int i = 0; i < SplitNumElements; ++i)
10147         if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
10148           BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
10149     }
10150     if (UseLoV2 && UseHiV2) {
10151       V2Blend =
10152         DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
10153     } else {
10154       // We only use half of V2 so map the usage down into the final blend mask.
10155       V2Blend = UseLoV2 ? LoV2 : HiV2;
10156       for (int i = 0; i < SplitNumElements; ++i)
10157         if (BlendMask[i] >= SplitNumElements)
10158           BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
10159     }
10160     return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
10161   };
10162   SDValue Lo = HalfBlend(LoMask);
10163   SDValue Hi = HalfBlend(HiMask);
10164   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
10165 }
10166
10167 /// \brief Either split a vector in halves or decompose the shuffles and the
10168 /// blend.
10169 ///
10170 /// This is provided as a good fallback for many lowerings of non-single-input
10171 /// shuffles with more than one 128-bit lane. In those cases, we want to select
10172 /// between splitting the shuffle into 128-bit components and stitching those
10173 /// back together vs. extracting the single-input shuffles and blending those
10174 /// results.
10175 static SDValue lowerVectorShuffleAsSplitOrBlend(SDLoc DL, MVT VT, SDValue V1,
10176                                                 SDValue V2, ArrayRef<int> Mask,
10177                                                 SelectionDAG &DAG) {
10178   assert(!isSingleInputShuffleMask(Mask) && "This routine must not be used to "
10179                                             "lower single-input shuffles as it "
10180                                             "could then recurse on itself.");
10181   int Size = Mask.size();
10182
10183   // If this can be modeled as a broadcast of two elements followed by a blend,
10184   // prefer that lowering. This is especially important because broadcasts can
10185   // often fold with memory operands.
10186   auto DoBothBroadcast = [&] {
10187     int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
10188     for (int M : Mask)
10189       if (M >= Size) {
10190         if (V2BroadcastIdx == -1)
10191           V2BroadcastIdx = M - Size;
10192         else if (M - Size != V2BroadcastIdx)
10193           return false;
10194       } else if (M >= 0) {
10195         if (V1BroadcastIdx == -1)
10196           V1BroadcastIdx = M;
10197         else if (M != V1BroadcastIdx)
10198           return false;
10199       }
10200     return true;
10201   };
10202   if (DoBothBroadcast())
10203     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
10204                                                       DAG);
10205
10206   // If the inputs all stem from a single 128-bit lane of each input, then we
10207   // split them rather than blending because the split will decompose to
10208   // unusually few instructions.
10209   int LaneCount = VT.getSizeInBits() / 128;
10210   int LaneSize = Size / LaneCount;
10211   SmallBitVector LaneInputs[2];
10212   LaneInputs[0].resize(LaneCount, false);
10213   LaneInputs[1].resize(LaneCount, false);
10214   for (int i = 0; i < Size; ++i)
10215     if (Mask[i] >= 0)
10216       LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
10217   if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
10218     return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
10219
10220   // Otherwise, just fall back to decomposed shuffles and a blend. This requires
10221   // that the decomposed single-input shuffles don't end up here.
10222   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
10223 }
10224
10225 /// \brief Lower a vector shuffle crossing multiple 128-bit lanes as
10226 /// a permutation and blend of those lanes.
10227 ///
10228 /// This essentially blends the out-of-lane inputs to each lane into the lane
10229 /// from a permuted copy of the vector. This lowering strategy results in four
10230 /// instructions in the worst case for a single-input cross lane shuffle which
10231 /// is lower than any other fully general cross-lane shuffle strategy I'm aware
10232 /// of. Special cases for each particular shuffle pattern should be handled
10233 /// prior to trying this lowering.
10234 static SDValue lowerVectorShuffleAsLanePermuteAndBlend(SDLoc DL, MVT VT,
10235                                                        SDValue V1, SDValue V2,
10236                                                        ArrayRef<int> Mask,
10237                                                        SelectionDAG &DAG) {
10238   // FIXME: This should probably be generalized for 512-bit vectors as well.
10239   assert(VT.getSizeInBits() == 256 && "Only for 256-bit vector shuffles!");
10240   int LaneSize = Mask.size() / 2;
10241
10242   // If there are only inputs from one 128-bit lane, splitting will in fact be
10243   // less expensive. The flags track wether the given lane contains an element
10244   // that crosses to another lane.
10245   bool LaneCrossing[2] = {false, false};
10246   for (int i = 0, Size = Mask.size(); i < Size; ++i)
10247     if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
10248       LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
10249   if (!LaneCrossing[0] || !LaneCrossing[1])
10250     return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
10251
10252   if (isSingleInputShuffleMask(Mask)) {
10253     SmallVector<int, 32> FlippedBlendMask;
10254     for (int i = 0, Size = Mask.size(); i < Size; ++i)
10255       FlippedBlendMask.push_back(
10256           Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize)
10257                                   ? Mask[i]
10258                                   : Mask[i] % LaneSize +
10259                                         (i / LaneSize) * LaneSize + Size));
10260
10261     // Flip the vector, and blend the results which should now be in-lane. The
10262     // VPERM2X128 mask uses the low 2 bits for the low source and bits 4 and
10263     // 5 for the high source. The value 3 selects the high half of source 2 and
10264     // the value 2 selects the low half of source 2. We only use source 2 to
10265     // allow folding it into a memory operand.
10266     unsigned PERMMask = 3 | 2 << 4;
10267     SDValue Flipped = DAG.getNode(X86ISD::VPERM2X128, DL, VT, DAG.getUNDEF(VT),
10268                                   V1, DAG.getConstant(PERMMask, MVT::i8));
10269     return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask);
10270   }
10271
10272   // This now reduces to two single-input shuffles of V1 and V2 which at worst
10273   // will be handled by the above logic and a blend of the results, much like
10274   // other patterns in AVX.
10275   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
10276 }
10277
10278 /// \brief Handle lowering 2-lane 128-bit shuffles.
10279 static SDValue lowerV2X128VectorShuffle(SDLoc DL, MVT VT, SDValue V1,
10280                                         SDValue V2, ArrayRef<int> Mask,
10281                                         const X86Subtarget *Subtarget,
10282                                         SelectionDAG &DAG) {
10283   // Blends are faster and handle all the non-lane-crossing cases.
10284   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
10285                                                 Subtarget, DAG))
10286     return Blend;
10287
10288   MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
10289                                VT.getVectorNumElements() / 2);
10290   // Check for patterns which can be matched with a single insert of a 128-bit
10291   // subvector.
10292   if (isShuffleEquivalent(Mask, 0, 1, 0, 1) ||
10293       isShuffleEquivalent(Mask, 0, 1, 4, 5)) {
10294     SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
10295                               DAG.getIntPtrConstant(0));
10296     SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
10297                               Mask[2] < 4 ? V1 : V2, DAG.getIntPtrConstant(0));
10298     return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
10299   }
10300   if (isShuffleEquivalent(Mask, 0, 1, 6, 7)) {
10301     SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
10302                               DAG.getIntPtrConstant(0));
10303     SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
10304                               DAG.getIntPtrConstant(2));
10305     return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
10306   }
10307
10308   // Otherwise form a 128-bit permutation.
10309   // FIXME: Detect zero-vector inputs and use the VPERM2X128 to zero that half.
10310   unsigned PermMask = Mask[0] / 2 | (Mask[2] / 2) << 4;
10311   return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
10312                      DAG.getConstant(PermMask, MVT::i8));
10313 }
10314
10315 /// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then
10316 /// shuffling each lane.
10317 ///
10318 /// This will only succeed when the result of fixing the 128-bit lanes results
10319 /// in a single-input non-lane-crossing shuffle with a repeating shuffle mask in
10320 /// each 128-bit lanes. This handles many cases where we can quickly blend away
10321 /// the lane crosses early and then use simpler shuffles within each lane.
10322 ///
10323 /// FIXME: It might be worthwhile at some point to support this without
10324 /// requiring the 128-bit lane-relative shuffles to be repeating, but currently
10325 /// in x86 only floating point has interesting non-repeating shuffles, and even
10326 /// those are still *marginally* more expensive.
10327 static SDValue lowerVectorShuffleByMerging128BitLanes(
10328     SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
10329     const X86Subtarget *Subtarget, SelectionDAG &DAG) {
10330   assert(!isSingleInputShuffleMask(Mask) &&
10331          "This is only useful with multiple inputs.");
10332
10333   int Size = Mask.size();
10334   int LaneSize = 128 / VT.getScalarSizeInBits();
10335   int NumLanes = Size / LaneSize;
10336   assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles.");
10337
10338   // See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also
10339   // check whether the in-128-bit lane shuffles share a repeating pattern.
10340   SmallVector<int, 4> Lanes;
10341   Lanes.resize(NumLanes, -1);
10342   SmallVector<int, 4> InLaneMask;
10343   InLaneMask.resize(LaneSize, -1);
10344   for (int i = 0; i < Size; ++i) {
10345     if (Mask[i] < 0)
10346       continue;
10347
10348     int j = i / LaneSize;
10349
10350     if (Lanes[j] < 0) {
10351       // First entry we've seen for this lane.
10352       Lanes[j] = Mask[i] / LaneSize;
10353     } else if (Lanes[j] != Mask[i] / LaneSize) {
10354       // This doesn't match the lane selected previously!
10355       return SDValue();
10356     }
10357
10358     // Check that within each lane we have a consistent shuffle mask.
10359     int k = i % LaneSize;
10360     if (InLaneMask[k] < 0) {
10361       InLaneMask[k] = Mask[i] % LaneSize;
10362     } else if (InLaneMask[k] != Mask[i] % LaneSize) {
10363       // This doesn't fit a repeating in-lane mask.
10364       return SDValue();
10365     }
10366   }
10367
10368   // First shuffle the lanes into place.
10369   MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64,
10370                                 VT.getSizeInBits() / 64);
10371   SmallVector<int, 8> LaneMask;
10372   LaneMask.resize(NumLanes * 2, -1);
10373   for (int i = 0; i < NumLanes; ++i)
10374     if (Lanes[i] >= 0) {
10375       LaneMask[2 * i + 0] = 2*Lanes[i] + 0;
10376       LaneMask[2 * i + 1] = 2*Lanes[i] + 1;
10377     }
10378
10379   V1 = DAG.getNode(ISD::BITCAST, DL, LaneVT, V1);
10380   V2 = DAG.getNode(ISD::BITCAST, DL, LaneVT, V2);
10381   SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask);
10382
10383   // Cast it back to the type we actually want.
10384   LaneShuffle = DAG.getNode(ISD::BITCAST, DL, VT, LaneShuffle);
10385
10386   // Now do a simple shuffle that isn't lane crossing.
10387   SmallVector<int, 8> NewMask;
10388   NewMask.resize(Size, -1);
10389   for (int i = 0; i < Size; ++i)
10390     if (Mask[i] >= 0)
10391       NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize;
10392   assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) &&
10393          "Must not introduce lane crosses at this point!");
10394
10395   return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask);
10396 }
10397
10398 /// \brief Test whether the specified input (0 or 1) is in-place blended by the
10399 /// given mask.
10400 ///
10401 /// This returns true if the elements from a particular input are already in the
10402 /// slot required by the given mask and require no permutation.
10403 static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
10404   assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
10405   int Size = Mask.size();
10406   for (int i = 0; i < Size; ++i)
10407     if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
10408       return false;
10409
10410   return true;
10411 }
10412
10413 /// \brief Handle lowering of 4-lane 64-bit floating point shuffles.
10414 ///
10415 /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
10416 /// isn't available.
10417 static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10418                                        const X86Subtarget *Subtarget,
10419                                        SelectionDAG &DAG) {
10420   SDLoc DL(Op);
10421   assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
10422   assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
10423   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10424   ArrayRef<int> Mask = SVOp->getMask();
10425   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
10426
10427   SmallVector<int, 4> WidenedMask;
10428   if (canWidenShuffleElements(Mask, WidenedMask))
10429     return lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask, Subtarget,
10430                                     DAG);
10431
10432   if (isSingleInputShuffleMask(Mask)) {
10433     // Check for being able to broadcast a single element.
10434     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4f64, DL, V1,
10435                                                           Mask, Subtarget, DAG))
10436       return Broadcast;
10437
10438     // Use low duplicate instructions for masks that match their pattern.
10439     if (isShuffleEquivalent(Mask, 0, 0, 2, 2))
10440       return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
10441
10442     if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
10443       // Non-half-crossing single input shuffles can be lowerid with an
10444       // interleaved permutation.
10445       unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
10446                               ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
10447       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
10448                          DAG.getConstant(VPERMILPMask, MVT::i8));
10449     }
10450
10451     // With AVX2 we have direct support for this permutation.
10452     if (Subtarget->hasAVX2())
10453       return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
10454                          getV4X86ShuffleImm8ForMask(Mask, DAG));
10455
10456     // Otherwise, fall back.
10457     return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask,
10458                                                    DAG);
10459   }
10460
10461   // X86 has dedicated unpack instructions that can handle specific blend
10462   // operations: UNPCKH and UNPCKL.
10463   if (isShuffleEquivalent(Mask, 0, 4, 2, 6))
10464     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f64, V1, V2);
10465   if (isShuffleEquivalent(Mask, 1, 5, 3, 7))
10466     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f64, V1, V2);
10467
10468   // If we have a single input to the zero element, insert that into V1 if we
10469   // can do so cheaply.
10470   int NumV2Elements =
10471       std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
10472   if (NumV2Elements == 1 && Mask[0] >= 4)
10473     if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
10474             MVT::v4f64, DL, V1, V2, Mask, Subtarget, DAG))
10475       return Insertion;
10476
10477   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
10478                                                 Subtarget, DAG))
10479     return Blend;
10480
10481   // Check if the blend happens to exactly fit that of SHUFPD.
10482   if ((Mask[0] == -1 || Mask[0] < 2) &&
10483       (Mask[1] == -1 || (Mask[1] >= 4 && Mask[1] < 6)) &&
10484       (Mask[2] == -1 || (Mask[2] >= 2 && Mask[2] < 4)) &&
10485       (Mask[3] == -1 || Mask[3] >= 6)) {
10486     unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 5) << 1) |
10487                           ((Mask[2] == 3) << 2) | ((Mask[3] == 7) << 3);
10488     return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f64, V1, V2,
10489                        DAG.getConstant(SHUFPDMask, MVT::i8));
10490   }
10491   if ((Mask[0] == -1 || (Mask[0] >= 4 && Mask[0] < 6)) &&
10492       (Mask[1] == -1 || Mask[1] < 2) &&
10493       (Mask[2] == -1 || Mask[2] >= 6) &&
10494       (Mask[3] == -1 || (Mask[3] >= 2 && Mask[3] < 4))) {
10495     unsigned SHUFPDMask = (Mask[0] == 5) | ((Mask[1] == 1) << 1) |
10496                           ((Mask[2] == 7) << 2) | ((Mask[3] == 3) << 3);
10497     return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f64, V2, V1,
10498                        DAG.getConstant(SHUFPDMask, MVT::i8));
10499   }
10500
10501   // Try to simplify this by merging 128-bit lanes to enable a lane-based
10502   // shuffle. However, if we have AVX2 and either inputs are already in place,
10503   // we will be able to shuffle even across lanes the other input in a single
10504   // instruction so skip this pattern.
10505   if (!(Subtarget->hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
10506                                  isShuffleMaskInputInPlace(1, Mask))))
10507     if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
10508             DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
10509       return Result;
10510
10511   // If we have AVX2 then we always want to lower with a blend because an v4 we
10512   // can fully permute the elements.
10513   if (Subtarget->hasAVX2())
10514     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,
10515                                                       Mask, DAG);
10516
10517   // Otherwise fall back on generic lowering.
10518   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG);
10519 }
10520
10521 /// \brief Lower a 4-lane 64-bit integer (v4i64) shuffle of V1 and V2.
10522 ///
10523 /// This routine is only called when we have AVX2 (asserted below) and thus a
10524 /// reasonable instruction set for v4i64 shuffling.
10525 static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10526                                        const X86Subtarget *Subtarget,
10527                                        SelectionDAG &DAG) {
10528   SDLoc DL(Op);
10529   assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
10530   assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
10531   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10532   ArrayRef<int> Mask = SVOp->getMask();
10533   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
10534   assert(Subtarget->hasAVX2() && "We can only lower v4i64 with AVX2!");
10535   // If the mask can be widened, defer to lowerV2X128VectorShuffle.
10536   SmallVector<int, 4> WidenedMask;
10537   if (canWidenShuffleElements(Mask, WidenedMask))
10538     return lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask, Subtarget,
10539                                     DAG);
10540
10541   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
10542                                                 Subtarget, DAG))
10543     return Blend;
10544
10545   // Check for being able to broadcast a single element.
10546   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4i64, DL, V1,
10547                                                         Mask, Subtarget, DAG))
10548     return Broadcast;
10549
10550   // When the shuffle repeats in both 128-bit lanes, a single-input mask is
10551   // lowered as PSHUFD on v8i32, with each i64 index split into two i32 ones.
10552   SmallVector<int, 2> RepeatedMask;
10553   if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
10554     if (isSingleInputShuffleMask(Mask)) {
10555       int PSHUFDMask[] = {-1, -1, -1, -1};
10556       for (int i = 0; i < 2; ++i)
10557         if (RepeatedMask[i] >= 0) {
10558           PSHUFDMask[2 * i] = 2 * RepeatedMask[i];
10559           PSHUFDMask[2 * i + 1] = 2 * RepeatedMask[i] + 1;
10560         }
10561       return DAG.getNode(
10562           ISD::BITCAST, DL, MVT::v4i64,
10563           DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
10564                       DAG.getNode(ISD::BITCAST, DL, MVT::v8i32, V1),
10565                       getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
10566     }
10567
10568     // Use dedicated unpack instructions for masks that match their pattern.
10569     if (isShuffleEquivalent(Mask, 0, 4, 2, 6))
10570       return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V1, V2);
10571     if (isShuffleEquivalent(Mask, 1, 5, 3, 7))
10572       return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V1, V2);
10573   }
10574
10575   // AVX2 provides a direct instruction for permuting a single input across
10576   // lanes (VPERMI with an immediate built from the mask).
10577   if (isSingleInputShuffleMask(Mask))
10578     return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
10579                        getV4X86ShuffleImm8ForMask(Mask, DAG));
10580
10581   // Try to simplify this by merging 128-bit lanes to enable a lane-based
10582   // shuffle. However, when either input is already in place, AVX2 (always
10583   // available here, see the assert above) can shuffle the other input, even
10584   // across lanes, in a single instruction, so skip this pattern in that case.
10585   if (!(Subtarget->hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
10586                                  isShuffleMaskInputInPlace(1, Mask))))
10587     if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
10588             DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
10589       return Result;
10590
10591   // Otherwise fall back on generic decomposed shuffle + blend lowering.
10592   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2,
10593                                                     Mask, DAG);
10594 }
10595
10596 /// \brief Lower an 8-lane 32-bit floating point (v8f32) shuffle of V1 and V2.
10597 ///
10598 /// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
10599 /// isn't available (lower256BitVectorShuffle bitcasts them to v8f32).
10600 static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10601                                        const X86Subtarget *Subtarget,
10602                                        SelectionDAG &DAG) {
10603   SDLoc DL(Op);
10604   assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
10605   assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
10606   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10607   ArrayRef<int> Mask = SVOp->getMask();
10608   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
10609
10610   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
10611                                                 Subtarget, DAG))
10612     return Blend;
10613
10614   // Check for being able to broadcast a single element.
10615   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8f32, DL, V1,
10616                                                         Mask, Subtarget, DAG))
10617     return Broadcast;
10618
10619   // If the shuffle mask is repeated in each 128-bit lane, we have many more
10620   // options to efficiently lower the shuffle.
10621   SmallVector<int, 4> RepeatedMask;
10622   if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
10623     assert(RepeatedMask.size() == 4 &&
10624            "Repeated masks must be half the mask width!");
10625
10626     // Use even/odd duplicate instructions for masks that match their pattern.
10627     if (isShuffleEquivalent(Mask, 0, 0, 2, 2, 4, 4, 6, 6))
10628       return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
10629     if (isShuffleEquivalent(Mask, 1, 1, 3, 3, 5, 5, 7, 7))
10630       return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
10631     // Any other single-input repeated mask is an immediate-controlled VPERMILPI.
10632     if (isSingleInputShuffleMask(Mask))
10633       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
10634                          getV4X86ShuffleImm8ForMask(RepeatedMask, DAG));
10635
10636     // Use dedicated unpack instructions for masks that match their pattern.
10637     if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 4, 12, 5, 13))
10638       return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f32, V1, V2);
10639     if (isShuffleEquivalent(Mask, 2, 10, 3, 11, 6, 14, 7, 15))
10640       return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f32, V1, V2);
10641
10642     // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
10643     // have already handled any direct blends. We also squash the repeated mask
10644     // into a simulated v4f32 mask by rebasing V2 indices (>= 8) to start at 4.
10645     for (int i = 0; i < 4; ++i)
10646       if (RepeatedMask[i] >= 8)
10647         RepeatedMask[i] -= 4;
10648     return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
10649   }
10650
10651   // Single-input shuffles whose pattern differs between the two 128-bit lanes
10652   // use a variable mask: VPERMILPV if in-lane, else VPERMV when AVX2 exists.
10653   if (isSingleInputShuffleMask(Mask)) {
10654     SDValue VPermMask[8];
10655     for (int i = 0; i < 8; ++i)
10656       VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32)
10657                                  : DAG.getConstant(Mask[i], MVT::i32);
10658     if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))
10659       return DAG.getNode(
10660           X86ISD::VPERMILPV, DL, MVT::v8f32, V1,
10661           DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask));
10662
10663     if (Subtarget->hasAVX2())
10664       return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32,
10665                          DAG.getNode(ISD::BITCAST, DL, MVT::v8f32,
10666                                      DAG.getNode(ISD::BUILD_VECTOR, DL,
10667                                                  MVT::v8i32, VPermMask)),
10668                          V1);
10669
10670     // Otherwise (lane-crossing, no AVX2), use the lane-permute-and-blend path.
10671     return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
10672                                                    DAG);
10673   }
10674
10675   // Try to simplify this by merging 128-bit lanes to enable a lane-based
10676   // shuffle.
10677   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
10678           DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
10679     return Result;
10680
10681   // If we have AVX2 then we always want to lower with a blend because at v8 we
10682   // can fully permute the elements.
10683   if (Subtarget->hasAVX2())
10684     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
10685                                                       Mask, DAG);
10686
10687   // Otherwise fall back on generic split-or-blend lowering.
10688   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG);
10689 }
10690
10691 /// \brief Lower an 8-lane 32-bit integer (v8i32) shuffle of V1 and V2.
10692 ///
10693 /// This routine is only called when we have AVX2 (asserted below) and thus a
10694 /// reasonable instruction set for v8i32 shuffling.
10695 static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10696                                        const X86Subtarget *Subtarget,
10697                                        SelectionDAG &DAG) {
10698   SDLoc DL(Op);
10699   assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
10700   assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
10701   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10702   ArrayRef<int> Mask = SVOp->getMask();
10703   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
10704   assert(Subtarget->hasAVX2() && "We can only lower v8i32 with AVX2!");
10705
10706   // Whenever we can lower this as a zext, that instruction is strictly faster
10707   // than any alternative. It also allows us to fold memory operands into the
10708   // shuffle in many cases.
10709   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2,
10710                                                          Mask, Subtarget, DAG))
10711     return ZExt;
10712
10713   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
10714                                                 Subtarget, DAG))
10715     return Blend;
10716
10717   // Check for being able to broadcast a single element.
10718   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8i32, DL, V1,
10719                                                         Mask, Subtarget, DAG))
10720     return Broadcast;
10721
10722   // If the shuffle mask is repeated in each 128-bit lane we can use more
10723   // efficient instructions that mirror the shuffles across the two 128-bit
10724   // lanes: PSHUFD for a single input, or UNPCKL/UNPCKH for matching masks.
10725   SmallVector<int, 4> RepeatedMask;
10726   if (is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask)) {
10727     assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
10728     if (isSingleInputShuffleMask(Mask))
10729       return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
10730                          getV4X86ShuffleImm8ForMask(RepeatedMask, DAG));
10731
10732     // Use dedicated unpack instructions for masks that match their pattern.
10733     if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 4, 12, 5, 13))
10734       return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i32, V1, V2);
10735     if (isShuffleEquivalent(Mask, 2, 10, 3, 11, 6, 14, 7, 15))
10736       return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i32, V1, V2);
10737   }
10738
10739   // If the shuffle patterns aren't repeated but it is a single input, directly
10740   // generate a cross-lane VPERMD (X86ISD::VPERMV with a BUILD_VECTOR mask).
10741   if (isSingleInputShuffleMask(Mask)) {
10742     SDValue VPermMask[8];
10743     for (int i = 0; i < 8; ++i)
10744       VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32)
10745                                  : DAG.getConstant(Mask[i], MVT::i32);
10746     return DAG.getNode(
10747         X86ISD::VPERMV, DL, MVT::v8i32,
10748         DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask), V1);
10749   }
10750
10751   // Try to use bit shift instructions.
10752   if (SDValue Shift = lowerVectorShuffleAsBitShift(
10753           DL, MVT::v8i32, V1, V2, Mask, DAG))
10754     return Shift;
10755
10756   // Try to simplify this by merging 128-bit lanes to enable a lane-based
10757   // shuffle.
10758   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
10759           DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
10760     return Result;
10761
10762   // Otherwise fall back on generic decomposed shuffle + blend lowering.
10763   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2,
10764                                                     Mask, DAG);
10765 }
10766
10767 /// \brief Lower a 16-lane 16-bit integer (v16i16) shuffle of V1 and V2.
10768 ///
10769 /// This routine is only called when we have AVX2 (asserted below) and thus a
10770 /// reasonable instruction set for v16i16 shuffling.
10771 static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10772                                         const X86Subtarget *Subtarget,
10773                                         SelectionDAG &DAG) {
10774   SDLoc DL(Op);
10775   assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
10776   assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
10777   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10778   ArrayRef<int> Mask = SVOp->getMask();
10779   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
10780   assert(Subtarget->hasAVX2() && "We can only lower v16i16 with AVX2!");
10781
10782   // Whenever we can lower this as a zext, that instruction is strictly faster
10783   // than any alternative. It also allows us to fold memory operands into the
10784   // shuffle in many cases.
10785   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v16i16, V1, V2,
10786                                                          Mask, Subtarget, DAG))
10787     return ZExt;
10788
10789   // Check for being able to broadcast a single element.
10790   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v16i16, DL, V1,
10791                                                         Mask, Subtarget, DAG))
10792     return Broadcast;
10793
10794   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
10795                                                 Subtarget, DAG))
10796     return Blend;
10797
10798   // Use dedicated unpack instructions for masks that match their pattern.
10799   if (isShuffleEquivalent(Mask,
10800                           // First 128-bit lane:
10801                           0, 16, 1, 17, 2, 18, 3, 19,
10802                           // Second 128-bit lane:
10803                           8, 24, 9, 25, 10, 26, 11, 27))
10804     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i16, V1, V2);
10805   if (isShuffleEquivalent(Mask,
10806                           // First 128-bit lane:
10807                           4, 20, 5, 21, 6, 22, 7, 23,
10808                           // Second 128-bit lane:
10809                           12, 28, 13, 29, 14, 30, 15, 31))
10810     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i16, V1, V2);
10811
10812   if (isSingleInputShuffleMask(Mask)) {
10813     // There are no generalized cross-lane shuffle operations available on i16
10814     // element types, so lane-crossing masks take the permute-and-blend path.
10815     if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
10816       return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
10817                                                      Mask, DAG);
10818     // In-lane case: expand each i16 index M into PSHUFB byte indices 2M, 2M+1.
10819     SDValue PSHUFBMask[32];
10820     for (int i = 0; i < 16; ++i) {
10821       if (Mask[i] == -1) {
10822         PSHUFBMask[2 * i] = PSHUFBMask[2 * i + 1] = DAG.getUNDEF(MVT::i8);
10823         continue;
10824       }
10825       // Rebase indices in the high lane (i >= 8) so M is lane-relative.
10826       int M = i < 8 ? Mask[i] : Mask[i] - 8;
10827       assert(M >= 0 && M < 8 && "Invalid single-input mask!");
10828       PSHUFBMask[2 * i] = DAG.getConstant(2 * M, MVT::i8);
10829       PSHUFBMask[2 * i + 1] = DAG.getConstant(2 * M + 1, MVT::i8);
10830     }
10831     return DAG.getNode(
10832         ISD::BITCAST, DL, MVT::v16i16,
10833         DAG.getNode(
10834             X86ISD::PSHUFB, DL, MVT::v32i8,
10835             DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V1),
10836             DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask)));
10837   }
10838
10839   // Try to use bit shift instructions.
10840   if (SDValue Shift = lowerVectorShuffleAsBitShift(
10841           DL, MVT::v16i16, V1, V2, Mask, DAG))
10842     return Shift;
10843
10844   // Try to simplify this by merging 128-bit lanes to enable a lane-based
10845   // shuffle.
10846   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
10847           DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
10848     return Result;
10849
10850   // Otherwise fall back on generic split-or-blend lowering.
10851   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG);
10852 }
10853
10854 /// \brief Lower a 32-lane 8-bit integer (v32i8) shuffle of V1 and V2.
10855 ///
10856 /// This routine is only called when we have AVX2 (asserted below) and thus a
10857 /// reasonable instruction set for v32i8 shuffling.
10858 static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10859                                        const X86Subtarget *Subtarget,
10860                                        SelectionDAG &DAG) {
10861   SDLoc DL(Op);
10862   assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
10863   assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
10864   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10865   ArrayRef<int> Mask = SVOp->getMask();
10866   assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
10867   assert(Subtarget->hasAVX2() && "We can only lower v32i8 with AVX2!");
10868
10869   // Whenever we can lower this as a zext, that instruction is strictly faster
10870   // than any alternative. It also allows us to fold memory operands into the
10871   // shuffle in many cases.
10872   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2,
10873                                                          Mask, Subtarget, DAG))
10874     return ZExt;
10875
10876   // Check for being able to broadcast a single element.
10877   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v32i8, DL, V1,
10878                                                         Mask, Subtarget, DAG))
10879     return Broadcast;
10880
10881   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
10882                                                 Subtarget, DAG))
10883     return Blend;
10884
10885   // Use dedicated unpack instructions for masks that match their pattern.
10886   // Note that these are repeated 128-bit lane unpacks, not unpacks across the
10887   // whole 256-bit vector.
10888   if (isShuffleEquivalent(
10889           Mask,
10890           // First 128-bit lane:
10891           0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39,
10892           // Second 128-bit lane:
10893           16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55))
10894     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v32i8, V1, V2);
10895   if (isShuffleEquivalent(
10896           Mask,
10897           // First 128-bit lane:
10898           8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47,
10899           // Second 128-bit lane:
10900           24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63))
10901     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v32i8, V1, V2);
10902
10903   if (isSingleInputShuffleMask(Mask)) {
10904     // There are no generalized cross-lane shuffle operations available on i8
10905     // element types, so lane-crossing masks take the permute-and-blend path.
10906     if (is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
10907       return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2,
10908                                                      Mask, DAG);
10909     // In-lane case: build a PSHUFB mask with indices >= 16 rebased by 16.
10910     SDValue PSHUFBMask[32];
10911     for (int i = 0; i < 32; ++i)
10912       PSHUFBMask[i] =
10913           Mask[i] < 0
10914               ? DAG.getUNDEF(MVT::i8)
10915               : DAG.getConstant(Mask[i] < 16 ? Mask[i] : Mask[i] - 16, MVT::i8);
10916
10917     return DAG.getNode(
10918         X86ISD::PSHUFB, DL, MVT::v32i8, V1,
10919         DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask));
10920   }
10921
10922   // Try to use bit shift instructions.
10923   if (SDValue Shift = lowerVectorShuffleAsBitShift(
10924           DL, MVT::v32i8, V1, V2, Mask, DAG))
10925     return Shift;
10926
10927   // Try to simplify this by merging 128-bit lanes to enable a lane-based
10928   // shuffle.
10929   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
10930           DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
10931     return Result;
10932
10933   // Otherwise fall back on generic split-or-blend lowering.
10934   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG);
10935 }
10936
10937 /// \brief High-level routine to lower various 256-bit x86 vector shuffles.
10938 ///
10939 /// This routine either dispatches on the specific type of a 256-bit x86 vector
10940 /// shuffle or, for integer types without AVX2, splits it into two 128-bit
10941 /// shuffles or re-issues it as a floating point shuffle of the same shape.
10942 static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10943                                         MVT VT, const X86Subtarget *Subtarget,
10944                                         SelectionDAG &DAG) {
10945   SDLoc DL(Op);
10946   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10947   ArrayRef<int> Mask = SVOp->getMask();
10948
10949   // There is a really nice hard cut-over between AVX1 and AVX2 that means we can
10950   // check for those subtargets here and avoid much of the subtarget querying in
10951   // the per-vector-type lowering routines. With AVX1 we have essentially *zero*
10952   // ability to manipulate a 256-bit vector with integer types. Since we'll use
10953   // floating point types there eventually, just immediately bitcast everything
10954   // to the matching FP vector type and operate entirely in that domain.
10955   if (VT.isInteger() && !Subtarget->hasAVX2()) {
10956     int ElementBits = VT.getScalarSizeInBits();
10957     if (ElementBits < 32)
10958       // No floating point type available, decompose into 128-bit vectors.
10959       return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
10960     // Same element count and element width, but a floating point element type.
10961     MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
10962                                 VT.getVectorNumElements());
10963     V1 = DAG.getNode(ISD::BITCAST, DL, FpVT, V1);
10964     V2 = DAG.getNode(ISD::BITCAST, DL, FpVT, V2);
10965     return DAG.getNode(ISD::BITCAST, DL, VT,
10966                        DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
10967   }
10968   // Dispatch to the lowering routine for the exact 256-bit vector type.
10969   switch (VT.SimpleTy) {
10970   case MVT::v4f64:
10971     return lowerV4F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
10972   case MVT::v4i64:
10973     return lowerV4I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
10974   case MVT::v8f32:
10975     return lowerV8F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
10976   case MVT::v8i32:
10977     return lowerV8I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
10978   case MVT::v16i16:
10979     return lowerV16I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
10980   case MVT::v32i8:
10981     return lowerV32I8VectorShuffle(Op, V1, V2, Subtarget, DAG);
10982
10983   default:
10984     llvm_unreachable("Not a valid 256-bit x86 vector type!");
10985   }
10986 }
10987
10988 /// \brief Lower v8f64 shuffles: UNPCKL/UNPCKH when matched, otherwise split.
10989 static SDValue lowerV8F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10990                                        const X86Subtarget *Subtarget,
10991                                        SelectionDAG &DAG) {
10992   SDLoc DL(Op);
10993   assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
10994   assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
10995   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10996   ArrayRef<int> Mask = SVOp->getMask();
10997   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
10998
10999   // X86 has dedicated unpack instructions that can handle specific blend
11000   // operations: UNPCKH and UNPCKL.
11001   if (isShuffleEquivalent(Mask, 0, 8, 2, 10, 4, 12, 6, 14))
11002     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f64, V1, V2);
11003   if (isShuffleEquivalent(Mask, 1, 9, 3, 11, 5, 13, 7, 15))
11004     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f64, V1, V2);
11005
11006   // FIXME: Implement direct support for this type! Until then, split it.
11007   return splitAndLowerVectorShuffle(DL, MVT::v8f64, V1, V2, Mask, DAG);
11008 }
11009
11010 /// \brief Lower v16f32 shuffles: UNPCKL/UNPCKH when matched, otherwise split.
11011 static SDValue lowerV16F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
11012                                        const X86Subtarget *Subtarget,
11013                                        SelectionDAG &DAG) {
11014   SDLoc DL(Op);
11015   assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
11016   assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
11017   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
11018   ArrayRef<int> Mask = SVOp->getMask();
11019   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
11020
11021   // Use dedicated unpack instructions for masks that match their pattern.
11022   if (isShuffleEquivalent(Mask,
11023                           0, 16, 1, 17, 4, 20, 5, 21,      // 128-bit lanes 0, 1
11024                           8, 24, 9, 25, 12, 28, 13, 29))   // 128-bit lanes 2, 3
11025     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16f32, V1, V2);
11026   if (isShuffleEquivalent(Mask,
11027                           2, 18, 3, 19, 6, 22, 7, 23,      // 128-bit lanes 0, 1
11028                           10, 26, 11, 27, 14, 30, 15, 31)) // 128-bit lanes 2, 3
11029     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16f32, V1, V2);
11030
11031   // FIXME: Implement direct support for this type! Until then, split it.
11032   return splitAndLowerVectorShuffle(DL, MVT::v16f32, V1, V2, Mask, DAG);
11033 }
11034
11035 /// \brief Lower v8i64 shuffles: UNPCKL/UNPCKH when matched, otherwise split.
11036 static SDValue lowerV8I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
11037                                        const X86Subtarget *Subtarget,
11038                                        SelectionDAG &DAG) {
11039   SDLoc DL(Op);
11040   assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
11041   assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
11042   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
11043   ArrayRef<int> Mask = SVOp->getMask();
11044   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
11045
11046   // X86 has dedicated unpack instructions that can handle specific blend
11047   // operations: UNPCKH and UNPCKL.
11048   if (isShuffleEquivalent(Mask, 0, 8, 2, 10, 4, 12, 6, 14))
11049     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i64, V1, V2);
11050   if (isShuffleEquivalent(Mask, 1, 9, 3, 11, 5, 13, 7, 15))
11051     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i64, V1, V2);
11052
11053   // FIXME: Implement direct support for this type! Until then, split it.
11054   return splitAndLowerVectorShuffle(DL, MVT::v8i64, V1, V2, Mask, DAG);
11055 }
11056
11057 /// \brief Lower v16i32 shuffles: UNPCKL/UNPCKH when matched, otherwise split.
11058 static SDValue lowerV16I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
11059                                        const X86Subtarget *Subtarget,
11060                                        SelectionDAG &DAG) {
11061   SDLoc DL(Op);
11062   assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
11063   assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
11064   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
11065   ArrayRef<int> Mask = SVOp->getMask();
11066   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
11067
11068   // Use dedicated unpack instructions for masks that match their pattern.
11069   if (isShuffleEquivalent(Mask,
11070                           0, 16, 1, 17, 4, 20, 5, 21,      // 128-bit lanes 0, 1
11071                           8, 24, 9, 25, 12, 28, 13, 29))   // 128-bit lanes 2, 3
11072     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i32, V1, V2);
11073   if (isShuffleEquivalent(Mask,
11074                           2, 18, 3, 19, 6, 22, 7, 23,      // 128-bit lanes 0, 1
11075                           10, 26, 11, 27, 14, 30, 15, 31)) // 128-bit lanes 2, 3
11076     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i32, V1, V2);
11077
11078   // FIXME: Implement direct support for this type! Until then, split it.
11079   return splitAndLowerVectorShuffle(DL, MVT::v16i32, V1, V2, Mask, DAG);
11080 }
11081
11082 /// \brief Lower v32i16 shuffles (BWI is asserted); currently always splits.
11083 static SDValue lowerV32I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
11084                                         const X86Subtarget *Subtarget,
11085                                         SelectionDAG &DAG) {
11086   SDLoc DL(Op);
11087   assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
11088   assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
11089   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
11090   ArrayRef<int> Mask = SVOp->getMask();
11091   assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
11092   assert(Subtarget->hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
11093
11094   // FIXME: Implement direct support for this type! Until then, split it.
11095   return splitAndLowerVectorShuffle(DL, MVT::v32i16, V1, V2, Mask, DAG);
11096 }
11097
11098 /// \brief Lower v64i8 shuffles (BWI is asserted); currently always splits.
11099 static SDValue lowerV64I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
11100                                        const X86Subtarget *Subtarget,
11101                                        SelectionDAG &DAG) {
11102   SDLoc DL(Op);
11103   assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
11104   assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
11105   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
11106   ArrayRef<int> Mask = SVOp->getMask();
11107   assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
11108   assert(Subtarget->hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
11109
11110   // FIXME: Implement direct support for this type! Until then, split it.
11111   return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
11112 }
11113
11114 /// \brief High-level routine to lower various 512-bit x86 vector shuffles.
11115 ///
11116 /// This routine either breaks down the specific type of a 512-bit x86 vector
11117 /// shuffle or splits it into two 256-bit shuffles and fuses the results back
11118 /// together based on the available instructions.
11119 static SDValue lower512BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
11120                                         MVT VT, const X86Subtarget *Subtarget,
11121                                         SelectionDAG &DAG) {
11122   SDLoc DL(Op);
11123   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
11124   ArrayRef<int> Mask = SVOp->getMask();
11125   assert(Subtarget->hasAVX512() &&
11126          "Cannot lower 512-bit vectors w/ basic ISA!");
11127
11128   // Check for being able to broadcast a single element.
11129   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(VT.SimpleTy, DL, V1,
11130                                                         Mask, Subtarget, DAG))
11131     return Broadcast;
11132
11133   // Dispatch to each element type for lowering. If we don't have supprot for
11134   // specific element type shuffles at 512 bits, immediately split them and
11135   // lower them. Each lowering routine of a given type is allowed to assume that
11136   // the requisite ISA extensions for that element type are available.
11137   switch (VT.SimpleTy) {
11138   case MVT::v8f64:
11139     return lowerV8F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
11140   case MVT::v16f32:
11141     return lowerV16F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
11142   case MVT::v8i64:
11143     return lowerV8I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
11144   case MVT::v16i32:
11145     return lowerV16I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
11146   case MVT::v32i16:
11147     if (Subtarget->hasBWI())
11148       return lowerV32I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
11149     break;
11150   case MVT::v64i8:
11151     if (Subtarget->hasBWI())
11152       return lowerV64I8VectorShuffle(Op, V1, V2, Subtarget, DAG);
11153     break;
11154
11155   default:
11156     llvm_unreachable("Not a valid 512-bit x86 vector type!");
11157   }
11158
11159   // Otherwise fall back on splitting.
11160   return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
11161 }
11162
11163 /// \brief Top-level lowering for x86 vector shuffles.
11164 ///
11165 /// This handles decomposition, canonicalization, and lowering of all x86
11166 /// vector shuffles. Most of the specific lowering strategies are encapsulated
11167 /// above in helper routines. The canonicalization attempts to widen shuffles
11168 /// to involve fewer lanes of wider elements, consolidate symmetric patterns
11169 /// s.t. only one of the two inputs needs to be tested, etc.
11170 static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
11171                                   SelectionDAG &DAG) {
11172   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
11173   ArrayRef<int> Mask = SVOp->getMask();
11174   SDValue V1 = Op.getOperand(0);
11175   SDValue V2 = Op.getOperand(1);
11176   MVT VT = Op.getSimpleValueType();
11177   int NumElements = VT.getVectorNumElements();
11178   SDLoc dl(Op);
11179
11180   assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles");
11181
11182   bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
11183   bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
11184   if (V1IsUndef && V2IsUndef)
11185     return DAG.getUNDEF(VT);
11186
11187   // When we create a shuffle node we put the UNDEF node to second operand,
11188   // but in some cases the first operand may be transformed to UNDEF.
11189   // In this case we should just commute the node.
11190   if (V1IsUndef)
11191     return DAG.getCommutedVectorShuffle(*SVOp);
11192
11193   // Check for non-undef masks pointing at an undef vector and make the masks
11194   // undef as well. This makes it easier to match the shuffle based solely on
11195   // the mask.
11196   if (V2IsUndef)
11197     for (int M : Mask)
11198       if (M >= NumElements) {
11199         SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
11200         for (int &M : NewMask)
11201           if (M >= NumElements)
11202             M = -1;
11203         return DAG.getVectorShuffle(VT, dl, V1, V2, NewMask);
11204       }
11205
11206   // Try to collapse shuffles into using a vector type with fewer elements but
11207   // wider element types. We cap this to not form integers or floating point
11208   // elements wider than 64 bits, but it might be interesting to form i128
11209   // integers to handle flipping the low and high halves of AVX 256-bit vectors.
11210   SmallVector<int, 16> WidenedMask;
11211   if (VT.getScalarSizeInBits() < 64 &&
11212       canWidenShuffleElements(Mask, WidenedMask)) {
11213     MVT NewEltVT = VT.isFloatingPoint()
11214                        ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
11215                        : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
11216     MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
11217     // Make sure that the new vector type is legal. For example, v2f64 isn't
11218     // legal on SSE1.
11219     if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
11220       V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, V1);
11221       V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, V2);
11222       return DAG.getNode(ISD::BITCAST, dl, VT,
11223                          DAG.getVectorShuffle(NewVT, dl, V1, V2, WidenedMask));
11224     }
11225   }
11226
11227   int NumV1Elements = 0, NumUndefElements = 0, NumV2Elements = 0;
11228   for (int M : SVOp->getMask())
11229     if (M < 0)
11230       ++NumUndefElements;
11231     else if (M < NumElements)
11232       ++NumV1Elements;
11233     else
11234       ++NumV2Elements;
11235
11236   // Commute the shuffle as needed such that more elements come from V1 than
11237   // V2. This allows us to match the shuffle pattern strictly on how many
11238   // elements come from V1 without handling the symmetric cases.
11239   if (NumV2Elements > NumV1Elements)
11240     return DAG.getCommutedVectorShuffle(*SVOp);
11241
11242   // When the number of V1 and V2 elements are the same, try to minimize the
11243   // number of uses of V2 in the low half of the vector. When that is tied,
11244   // ensure that the sum of indices for V1 is equal to or lower than the sum
11245   // indices for V2. When those are equal, try to ensure that the number of odd
11246   // indices for V1 is lower than the number of odd indices for V2.
11247   if (NumV1Elements == NumV2Elements) {
11248     int LowV1Elements = 0, LowV2Elements = 0;
11249     for (int M : SVOp->getMask().slice(0, NumElements / 2))
11250       if (M >= NumElements)
11251         ++LowV2Elements;
11252       else if (M >= 0)
11253         ++LowV1Elements;
11254     if (LowV2Elements > LowV1Elements) {
11255       return DAG.getCommutedVectorShuffle(*SVOp);
11256     } else if (LowV2Elements == LowV1Elements) {
11257       int SumV1Indices = 0, SumV2Indices = 0;
11258       for (int i = 0, Size = SVOp->getMask().size(); i < Size; ++i)
11259         if (SVOp->getMask()[i] >= NumElements)
11260           SumV2Indices += i;
11261         else if (SVOp->getMask()[i] >= 0)
11262           SumV1Indices += i;
11263       if (SumV2Indices < SumV1Indices) {
11264         return DAG.getCommutedVectorShuffle(*SVOp);
11265       } else if (SumV2Indices == SumV1Indices) {
11266         int NumV1OddIndices = 0, NumV2OddIndices = 0;
11267         for (int i = 0, Size = SVOp->getMask().size(); i < Size; ++i)
11268           if (SVOp->getMask()[i] >= NumElements)
11269             NumV2OddIndices += i % 2;
11270           else if (SVOp->getMask()[i] >= 0)
11271             NumV1OddIndices += i % 2;
11272         if (NumV2OddIndices < NumV1OddIndices)
11273           return DAG.getCommutedVectorShuffle(*SVOp);
11274       }
11275     }
11276   }
11277
11278   // For each vector width, delegate to a specialized lowering routine.
11279   if (VT.getSizeInBits() == 128)
11280     return lower128BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
11281
11282   if (VT.getSizeInBits() == 256)
11283     return lower256BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
11284
11285   // Force AVX-512 vectors to be scalarized for now.
11286   // FIXME: Implement AVX-512 support!
11287   if (VT.getSizeInBits() == 512)
11288     return lower512BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
11289
11290   llvm_unreachable("Unimplemented!");
11291 }
11292
11293
11294 //===----------------------------------------------------------------------===//
11295 // Legacy vector shuffle lowering
11296 //
11297 // This code is the legacy code handling vector shuffles until the above
11298 // replaces its functionality and performance.
11299 //===----------------------------------------------------------------------===//
11300
11301 static bool isBlendMask(ArrayRef<int> MaskVals, MVT VT, bool hasSSE41,
11302                         bool hasInt256, unsigned *MaskOut = nullptr) {
11303   MVT EltVT = VT.getVectorElementType();
11304
11305   // There is no blend with immediate in AVX-512.
11306   if (VT.is512BitVector())
11307     return false;
11308
11309   if (!hasSSE41 || EltVT == MVT::i8)
11310     return false;
11311   if (!hasInt256 && VT == MVT::v16i16)
11312     return false;
11313
11314   unsigned MaskValue = 0;
11315   unsigned NumElems = VT.getVectorNumElements();
11316   // There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
11317   unsigned NumLanes = (NumElems - 1) / 8 + 1;
11318   unsigned NumElemsInLane = NumElems / NumLanes;
11319
11320   // Blend for v16i16 should be symmetric for both lanes.
11321   for (unsigned i = 0; i < NumElemsInLane; ++i) {
11322
11323     int SndLaneEltIdx = (NumLanes == 2) ? MaskVals[i + NumElemsInLane] : -1;
11324     int EltIdx = MaskVals[i];
11325
11326     if ((EltIdx < 0 || EltIdx == (int)i) &&
11327         (SndLaneEltIdx < 0 || SndLaneEltIdx == (int)(i + NumElemsInLane)))
11328       continue;
11329
11330     if (((unsigned)EltIdx == (i + NumElems)) &&
11331         (SndLaneEltIdx < 0 ||
11332          (unsigned)SndLaneEltIdx == i + NumElems + NumElemsInLane))
11333       MaskValue |= (1 << i);
11334     else
11335       return false;
11336   }
11337
11338   if (MaskOut)
11339     *MaskOut = MaskValue;
11340   return true;
11341 }
11342
11343 // Try to lower a shuffle node into a simple blend instruction.
11344 // This function assumes isBlendMask returns true for this
11345 // SuffleVectorSDNode
11346 static SDValue LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp,
11347                                           unsigned MaskValue,
11348                                           const X86Subtarget *Subtarget,
11349                                           SelectionDAG &DAG) {
11350   MVT VT = SVOp->getSimpleValueType(0);
11351   MVT EltVT = VT.getVectorElementType();
11352   assert(isBlendMask(SVOp->getMask(), VT, Subtarget->hasSSE41(),
11353                      Subtarget->hasInt256() && "Trying to lower a "
11354                                                "VECTOR_SHUFFLE to a Blend but "
11355                                                "with the wrong mask"));
11356   SDValue V1 = SVOp->getOperand(0);
11357   SDValue V2 = SVOp->getOperand(1);
11358   SDLoc dl(SVOp);
11359   unsigned NumElems = VT.getVectorNumElements();
11360
11361   // Convert i32 vectors to floating point if it is not AVX2.
11362   // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors.
11363   MVT BlendVT = VT;
11364   if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) {
11365     BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()),
11366                                NumElems);
11367     V1 = DAG.getNode(ISD::BITCAST, dl, VT, V1);
11368     V2 = DAG.getNode(ISD::BITCAST, dl, VT, V2);
11369   }
11370
11371   SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, V1, V2,
11372                             DAG.getConstant(MaskValue, MVT::i32));
11373   return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
11374 }
11375
11376 /// In vector type \p VT, return true if the element at index \p InputIdx
11377 /// falls on a different 128-bit lane than \p OutputIdx.
11378 static bool ShuffleCrosses128bitLane(MVT VT, unsigned InputIdx,
11379                                      unsigned OutputIdx) {
11380   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
11381   return InputIdx * EltSize / 128 != OutputIdx * EltSize / 128;
11382 }
11383
11384 /// Generate a PSHUFB if possible.  Selects elements from \p V1 according to
11385 /// \p MaskVals.  MaskVals[OutputIdx] = InputIdx specifies that we want to
11386 /// shuffle the element at InputIdx in V1 to OutputIdx in the result.  If \p
11387 /// MaskVals refers to elements outside of \p V1 or is undef (-1), insert a
11388 /// zero.
11389 static SDValue getPSHUFB(ArrayRef<int> MaskVals, SDValue V1, SDLoc &dl,
11390                          SelectionDAG &DAG) {
11391   MVT VT = V1.getSimpleValueType();
11392   assert(VT.is128BitVector() || VT.is256BitVector());
11393
11394   MVT EltVT = VT.getVectorElementType();
11395   unsigned EltSizeInBytes = EltVT.getSizeInBits() / 8;
11396   unsigned NumElts = VT.getVectorNumElements();
11397
11398   SmallVector<SDValue, 32> PshufbMask;
11399   for (unsigned OutputIdx = 0; OutputIdx < NumElts; ++OutputIdx) {
11400     int InputIdx = MaskVals[OutputIdx];
11401     unsigned InputByteIdx;
11402
11403     if (InputIdx < 0 || NumElts <= (unsigned)InputIdx)
11404       InputByteIdx = 0x80;
11405     else {
11406       // Cross lane is not allowed.
11407       if (ShuffleCrosses128bitLane(VT, InputIdx, OutputIdx))
11408         return SDValue();
11409       InputByteIdx = InputIdx * EltSizeInBytes;
11410       // Index is an byte offset within the 128-bit lane.
11411       InputByteIdx &= 0xf;
11412     }
11413
11414     for (unsigned j = 0; j < EltSizeInBytes; ++j) {
11415       PshufbMask.push_back(DAG.getConstant(InputByteIdx, MVT::i8));
11416       if (InputByteIdx != 0x80)
11417         ++InputByteIdx;
11418     }
11419   }
11420
11421   MVT ShufVT = MVT::getVectorVT(MVT::i8, PshufbMask.size());
11422   if (ShufVT != VT)
11423     V1 = DAG.getNode(ISD::BITCAST, dl, ShufVT, V1);
11424   return DAG.getNode(X86ISD::PSHUFB, dl, ShufVT, V1,
11425                      DAG.getNode(ISD::BUILD_VECTOR, dl, ShufVT, PshufbMask));
11426 }
11427
11428 // v8i16 shuffles - Prefer shuffles in the following order:
11429 // 1. [all]   pshuflw, pshufhw, optional move
11430 // 2. [ssse3] 1 x pshufb
11431 // 3. [ssse3] 2 x pshufb + 1 x por
11432 // 4. [all]   mov + pshuflw + pshufhw + N x (pextrw + pinsrw)
      //
      // Op must be a ShuffleVectorSDNode (it is cast<> unconditionally). Its two
      // operands are the shuffle inputs V1 and V2; mask values 0-7 select words of
      // V1 and 8-15 select words of V2. Subtarget is only queried for hasSSSE3()
      // and hasSSE2(). All new nodes are created in DAG.
      //
      // "Quad" below means a 64-bit half of an input, i.e. four words: quads 0 and
      // 1 are the low/high halves of V1, quads 2 and 3 those of V2.
11433 static SDValue
11434 LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget,
11435                          SelectionDAG &DAG) {
11436   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
11437   SDValue V1 = SVOp->getOperand(0);
11438   SDValue V2 = SVOp->getOperand(1);
11439   SDLoc dl(SVOp);
        // Working copy of the mask; it is rewritten below if the inputs get
        // pre-shuffled into a single vector.
11440   SmallVector<int, 8> MaskVals;
11441
11442   // Determine if more than 1 of the words in each of the low and high quadwords
11443   // of the result come from the same quadword of one of the two inputs.  Undef
11444   // mask values count as coming from any quadword, for better codegen.
11445   //
11446   // Lo/HiQuad[i] = j indicates how many words from the ith quad of the input
11447   // feeds this quad.  For i, 0 and 1 refer to V1, 2 and 3 refer to V2.
11448   unsigned LoQuad[] = { 0, 0, 0, 0 };
11449   unsigned HiQuad[] = { 0, 0, 0, 0 };
11450   // Indices of quads used.
11451   std::bitset<4> InputQuads;
11452   for (unsigned i = 0; i < 8; ++i) {
11453     unsigned *Quad = i < 4 ? LoQuad : HiQuad;
11454     int EltIdx = SVOp->getMaskElt(i);
11455     MaskVals.push_back(EltIdx);
11456     if (EltIdx < 0) {
            // Undef: credit every quad, but do not mark any quad as used.
11457       ++Quad[0];
11458       ++Quad[1];
11459       ++Quad[2];
11460       ++Quad[3];
11461       continue;
11462     }
11463     ++Quad[EltIdx / 4];
11464     InputQuads.set(EltIdx / 4);
11465   }
11466
        // Pick the input quad that feeds the most words of the low result quad.
        // MaxQuad starts at 1 and the comparison is strict, so a quad must supply
        // at least two words to be chosen, and ties keep the lowest-numbered quad.
11467   int BestLoQuad = -1;
11468   unsigned MaxQuad = 1;
11469   for (unsigned i = 0; i < 4; ++i) {
11470     if (LoQuad[i] > MaxQuad) {
11471       BestLoQuad = i;
11472       MaxQuad = LoQuad[i];
11473     }
11474   }
11475
        // Same selection for the high result quad.
11476   int BestHiQuad = -1;
11477   MaxQuad = 1;
11478   for (unsigned i = 0; i < 4; ++i) {
11479     if (HiQuad[i] > MaxQuad) {
11480       BestHiQuad = i;
11481       MaxQuad = HiQuad[i];
11482     }
11483   }
11484
11485   // For SSSE3, If all 8 words of the result come from only 1 quadword of each
11486   // of the two input vectors, shuffle them into one input vector so only a
11487   // single pshufb instruction is necessary. If there are more than 2 input
11488   // quads, disable the next transformation since it does not help SSSE3.
11489   bool V1Used = InputQuads[0] || InputQuads[1];
11490   bool V2Used = InputQuads[2] || InputQuads[3];
11491   if (Subtarget->hasSSSE3()) {
11492     if (InputQuads.count() == 2 && V1Used && V2Used) {
11493       BestLoQuad = InputQuads[0] ? 0 : 1;
11494       BestHiQuad = InputQuads[2] ? 2 : 3;
11495     }
11496     if (InputQuads.count() > 2) {
11497       BestLoQuad = -1;
11498       BestHiQuad = -1;
11499     }
11500   }
11501
11502   // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update
11503   // the shuffle mask.  If a quad is scored as -1, that means that it contains
11504   // words from all 4 input quadwords.
11505   SDValue NewV;
11506   if (BestLoQuad >= 0 || BestHiQuad >= 0) {
          // v2i64 shuffle of (V1, V2): indices 0-1 are V1's quads and 2-3 are V2's,
          // matching the quad numbering used above. A side scored -1 keeps V1's
          // quad at the same position (0 for low, 1 for high).
11507     int MaskV[] = {
11508       BestLoQuad < 0 ? 0 : BestLoQuad,
11509       BestHiQuad < 0 ? 1 : BestHiQuad
11510     };
11511     NewV = DAG.getVectorShuffle(MVT::v2i64, dl,
11512                   DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1),
11513                   DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2), &MaskV[0]);
11514     NewV = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, NewV);
11515
11516     // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the
11517     // source words for the shuffle, to aid later transformations.
          // InOrder[h] stays true only if every word of result half h already sits
          // at its own index in the original mask (an undef word counts as out of
          // order here). The loop stops at the first word that is not in NewV, so
          // InOrder is only fully computed when AllWordsInNewV ends up true; it is
          // only consulted below together with pshuflw/pshufhw, which are false
          // otherwise.
11518     bool AllWordsInNewV = true;
11519     bool InOrder[2] = { true, true };
11520     for (unsigned i = 0; i != 8; ++i) {
11521       int idx = MaskVals[i];
11522       if (idx != (int)i)
11523         InOrder[i/4] = false;
11524       if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad)
11525         continue;
11526       AllWordsInNewV = false;
11527       break;
11528     }
11529
          // pshuflw / pshufhw stay true when no remapped word in the high / low
          // half (respectively) is out of place, i.e. that half needs no shuffling.
11530     bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV;
11531     if (AllWordsInNewV) {
11532       for (int i = 0; i != 8; ++i) {
11533         int idx = MaskVals[i];
11534         if (idx < 0)
11535           continue;
              // Remap the word to its position inside NewV: words from BestLoQuad
              // live in 0-3, all others (from BestHiQuad) in 4-7.
11536         idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4;
11537         if ((idx != i) && idx < 4)
11538           pshufhw = false;
11539         if ((idx != i) && idx > 3)
11540           pshuflw = false;
11541       }
            // From here on the shuffle is a single-input shuffle of NewV.
11542       V1 = NewV;
11543       V2Used = false;
11544       BestLoQuad = 0;
11545       BestHiQuad = 1;
11546     }
11547
11548     // If we've eliminated the use of V2, and the new mask is a pshuflw or
11549     // pshufhw, that's as cheap as it gets.  Return the new shuffle.
11550     if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) {
11551       unsigned Opc = pshufhw ? X86ISD::PSHUFHW : X86ISD::PSHUFLW;
11552       unsigned TargetMask = 0;
11553       NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV,
11554                                   DAG.getUNDEF(MVT::v8i16), &MaskVals[0]);
            // NOTE(review): this SVOp shadows the function-level SVOp; the cast<>
            // assumes getVectorShuffle returned a shuffle node here.
11555       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
11556       TargetMask = pshufhw ? getShufflePSHUFHWImmediate(SVOp):
11557                              getShufflePSHUFLWImmediate(SVOp);
11558       V1 = NewV.getOperand(0);
11559       return getTargetShuffleNode(Opc, dl, MVT::v8i16, V1, TargetMask, DAG);
11560     }
11561   }
11562
11563   // Promote splats to a larger type which usually leads to more efficient code.
11564   // FIXME: Is this true if pshufb is available?
        // Note this tests the original shuffle node, not the rewritten MaskVals.
11565   if (SVOp->isSplat())
11566     return PromoteSplat(SVOp, DAG);
11567
11568   // If we have SSSE3, and all words of the result are from 1 input vector,
11569   // case 2 is generated, otherwise case 3 is generated.  If no SSSE3
11570   // is present, fall back to case 4.
11571   if (Subtarget->hasSSSE3()) {
          // NOTE(review): pshufbMask is never used in this function; the byte
          // masks are built inside getPSHUFB.
11572     SmallVector<SDValue,16> pshufbMask;
11573
11574     // If we have elements from both input vectors, set the high bit of the
11575     // shuffle mask element to zero out elements that come from V2 in the V1
11576     // mask, and elements that come from V1 in the V2 mask, so that the two
11577     // results can be OR'd together.
11578     bool TwoInputs = V1Used && V2Used;
11579     V1 = getPSHUFB(MaskVals, V1, dl, DAG);
11580     if (!TwoInputs)
11581       return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
11582
11583     // Calculate the shuffle mask for the second input, shuffle it, and
11584     // OR it with the first shuffled input.
11585     CommuteVectorShuffleMask(MaskVals, 8);
11586     V2 = getPSHUFB(MaskVals, V2, dl, DAG);
11587     V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
11588     return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
11589   }
11590
11591   // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order,
11592   // and update MaskVals with new element order.
        // InOrder bit i records that result word i needs no pextrw/pinsrw fix-up.
        // NewV is valid inside the two blocks below: it was assigned above whenever
        // BestLoQuad >= 0 or BestHiQuad >= 0.
11593   std::bitset<8> InOrder;
11594   if (BestLoQuad >= 0) {
          // Low half: -1 (undef) unless the word comes from BestLoQuad; the high
          // half is passed through unchanged.
11595     int MaskV[] = { -1, -1, -1, -1, 4, 5, 6, 7 };
11596     for (int i = 0; i != 4; ++i) {
11597       int idx = MaskVals[i];
11598       if (idx < 0) {
11599         InOrder.set(i);
11600       } else if ((idx / 4) == BestLoQuad) {
11601         MaskV[i] = idx & 3;
11602         InOrder.set(i);
11603       }
11604     }
11605     NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
11606                                 &MaskV[0]);
11607
          // Convert to the target node only if getVectorShuffle actually returned
          // a VECTOR_SHUFFLE (the opcode check guards the cast<> below).
11608     if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSE2()) {
11609       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
11610       NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16,
11611                                   NewV.getOperand(0),
11612                                   getShufflePSHUFLWImmediate(SVOp), DAG);
11613     }
11614   }
11615
11616   // If BestHi >= 0, generate a pshufhw to put the high elements in order,
11617   // and update MaskVals with the new element order.
11618   if (BestHiQuad >= 0) {
          // Mirror image of the low-half case: the low half is passed through.
11619     int MaskV[] = { 0, 1, 2, 3, -1, -1, -1, -1 };
11620     for (unsigned i = 4; i != 8; ++i) {
11621       int idx = MaskVals[i];
11622       if (idx < 0) {
11623         InOrder.set(i);
11624       } else if ((idx / 4) == BestHiQuad) {
11625         MaskV[i] = (idx & 3) + 4;
11626         InOrder.set(i);
11627       }
11628     }
11629     NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
11630                                 &MaskV[0]);
11631
11632     if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSE2()) {
11633       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
11634       NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16,
11635                                   NewV.getOperand(0),
11636                                   getShufflePSHUFHWImmediate(SVOp), DAG);
11637     }
11638   }
11639
11640   // In case BestHi & BestLo were both -1, which means each quadword has a word
11641   // from each of the four input quadwords, calculate the InOrder bitvector now
11642   // before falling through to the insert/extract cleanup.
11643   if (BestLoQuad == -1 && BestHiQuad == -1) {
          // Start from V1 itself: words already at their own index (or undef) stay.
11644     NewV = V1;
11645     for (int i = 0; i != 8; ++i)
11646       if (MaskVals[i] < 0 || MaskVals[i] == i)
11647         InOrder.set(i);
11648   }
11649
11650   // The other elements are put in the right place using pextrw and pinsrw.
11651   for (unsigned i = 0; i != 8; ++i) {
11652     if (InOrder[i])
11653       continue;
11654     int EltIdx = MaskVals[i];
11655     if (EltIdx < 0)
11656       continue;
          // Mask values 0-7 index V1, 8-15 index V2 (rebased to 0-7).
11657     SDValue ExtOp = (EltIdx < 8) ?
11658       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1,
11659                   DAG.getIntPtrConstant(EltIdx)) :
11660       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2,
11661                   DAG.getIntPtrConstant(EltIdx - 8));
11662     NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp,
11663                        DAG.getIntPtrConstant(i));
11664   }
11665   return NewV;
11666 }
11667
11668 /// \brief v16i16 shuffles
11669 ///
11670 /// FIXME: We only support generation of a single pshufb currently.  We can
11671 /// generalize the other applicable cases from LowerVECTOR_SHUFFLEv8i16 as
11672 /// well (e.g 2 x pshufb + 1 x por).
11673 static SDValue
11674 LowerVECTOR_SHUFFLEv16i16(SDValue Op, SelectionDAG &DAG) {
11675   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
11676   SDValue V1 = SVOp->getOperand(0);
11677   SDValue V2 = SVOp->getOperand(1);
11678   SDLoc dl(SVOp);
11679
11680   if (V2.getOpcode() != ISD::UNDEF)
11681     return SDValue();
11682
11683   SmallVector<int, 16> MaskVals(SVOp->getMask().begin(), SVOp->getMask().end());
11684   return getPSHUFB(MaskVals, V1, dl, DAG);
11685 }
11686
11687 // v16i8 shuffles - Prefer shuffles in the following order:
11688 // 1. [ssse3] 1 x pshufb
11689 // 2. [ssse3] 2 x pshufb + 1 x por
11690 // 3. [all]   v8i16 shuffle + N x pextrw + rotate + pinsrw
11691 static SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
11692                                         const X86Subtarget* Subtarget,
11693                                         SelectionDAG &DAG) {
11694   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
11695   SDValue V1 = SVOp->getOperand(0);
11696   SDValue V2 = SVOp->getOperand(1);
11697   SDLoc dl(SVOp);
11698   ArrayRef<int> MaskVals = SVOp->getMask();
11699
11700   // Promote splats to a larger type which usually leads to more efficient code.
11701   // FIXME: Is this true if pshufb is available?
11702   if (SVOp->isSplat())
11703     return PromoteSplat(SVOp, DAG);
11704
11705   // If we have SSSE3, case 1 is generated when all result bytes come from
11706   // one of  the inputs.  Otherwise, case 2 is generated.  If no SSSE3 is
11707   // present, fall back to case 3.
11708
11709   // If SSSE3, use 1 pshufb instruction per vector with elements in the result.
11710   if (Subtarget->hasSSSE3()) {
11711     SmallVector<SDValue,16> pshufbMask;
11712
11713     // If all result elements are from one input vector, then only translate
11714     // undef mask values to 0x80 (zero out result) in the pshufb mask.
11715     //
11716     // Otherwise, we have elements from both input vectors, and must zero out
11717     // elements that come from V2 in the first mask, and V1 in the second mask
11718     // so that we can OR them together.
11719     for (unsigned i = 0; i != 16; ++i) {
11720       int EltIdx = MaskVals[i];
11721       if (EltIdx < 0 || EltIdx >= 16)
11722         EltIdx = 0x80;
11723       pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
11724     }
11725     V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
11726                      DAG.getNode(ISD::BUILD_VECTOR, dl,
11727                                  MVT::v16i8, pshufbMask));
11728
11729     // As PSHUFB will zero elements with negative indices, it's safe to ignore
11730     // the 2nd operand if it's undefined or zero.
11731     if (V2.getOpcode() == ISD::UNDEF ||
11732         ISD::isBuildVectorAllZeros(V2.getNode()))
11733       return V1;
11734
11735     // Calculate the shuffle mask for the second input, shuffle it, and
11736     // OR it with the first shuffled input.
11737     pshufbMask.clear();
11738     for (unsigned i = 0; i != 16; ++i) {
11739       int EltIdx = MaskVals[i];
11740       EltIdx = (EltIdx < 16) ? 0x80 : EltIdx - 16;
11741       pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
11742     }
11743     V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
11744                      DAG.getNode(ISD::BUILD_VECTOR, dl,
11745                                  MVT::v16i8, pshufbMask));
11746     return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
11747   }
11748
11749   // No SSSE3 - Calculate in place words and then fix all out of place words
11750   // With 0-16 extracts & inserts.  Worst case is 16 bytes out of order from
11751   // the 16 different words that comprise the two doublequadword input vectors.
11752   V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
11753   V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2);
11754   SDValue NewV = V1;
11755   for (int i = 0; i != 8; ++i) {
11756     int Elt0 = MaskVals[i*2];
11757     int Elt1 = MaskVals[i*2+1];
11758
11759     // This word of the result is all undef, skip it.
11760     if (Elt0 < 0 && Elt1 < 0)
11761       continue;
11762
11763     // This word of the result is already in the correct place, skip it.
11764     if ((Elt0 == i*2) && (Elt1 == i*2+1))
11765       continue;
11766
11767     SDValue Elt0Src = Elt0 < 16 ? V1 : V2;
11768     SDValue Elt1Src = Elt1 < 16 ? V1 : V2;
11769     SDValue InsElt;
11770
11771     // If Elt0 and Elt1 are defined, are consecutive, and can be load
11772     // using a single extract together, load it and store it.
11773     if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) {
11774       InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
11775                            DAG.getIntPtrConstant(Elt1 / 2));
11776       NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
11777                         DAG.getIntPtrConstant(i));
11778       continue;
11779     }
11780
11781     // If Elt1 is defined, extract it from the appropriate source.  If the
11782     // source byte is not also odd, shift the extracted word left 8 bits
11783     // otherwise clear the bottom 8 bits if we need to do an or.
11784     if (Elt1 >= 0) {
11785       InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
11786                            DAG.getIntPtrConstant(Elt1 / 2));
11787       if ((Elt1 & 1) == 0)
11788         InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt,
11789                              DAG.getConstant(8,
11790                                   TLI.getShiftAmountTy(InsElt.getValueType())));
11791       else if (Elt0 >= 0)
11792         InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt,
11793                              DAG.getConstant(0xFF00, MVT::i16));
11794     }
11795     // If Elt0 is defined, extract it from the appropriate source.  If the
11796     // source byte is not also even, shift the extracted word right 8 bits. If
11797     // Elt1 was also defined, OR the extracted values together before
11798     // inserting them in the result.
11799     if (Elt0 >= 0) {
11800       SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
11801                                     Elt0Src, DAG.getIntPtrConstant(Elt0 / 2));
11802       if ((Elt0 & 1) != 0)
11803         InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0,
11804                               DAG.getConstant(8,
11805                                  TLI.getShiftAmountTy(InsElt0.getValueType())));
11806       else if (Elt1 >= 0)
11807         InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0,
11808                              DAG.getConstant(0x00FF, MVT::i16));
11809       InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0)
11810                          : InsElt0;
11811     }
11812     NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
11813                        DAG.getIntPtrConstant(i));
11814   }
11815   return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, NewV);
11816 }
11817
11818 // v32i8 shuffles - Translate to VPSHUFB if possible.
11819 static
11820 SDValue LowerVECTOR_SHUFFLEv32i8(ShuffleVectorSDNode *SVOp,
11821                                  const X86Subtarget *Subtarget,
11822                                  SelectionDAG &DAG) {
11823   MVT VT = SVOp->getSimpleValueType(0);
11824   SDValue V1 = SVOp->getOperand(0);
11825   SDValue V2 = SVOp->getOperand(1);
11826   SDLoc dl(SVOp);
11827   SmallVector<int, 32> MaskVals(SVOp->getMask().begin(), SVOp->getMask().end());
11828
11829   bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
11830   bool V1IsAllZero = ISD::isBuildVectorAllZeros(V1.getNode());
11831   bool V2IsAllZero = ISD::isBuildVectorAllZeros(V2.getNode());
11832
11833   // VPSHUFB may be generated if
11834   // (1) one of input vector is undefined or zeroinitializer.
11835   // The mask value 0x80 puts 0 in the corresponding slot of the vector.
11836   // And (2) the mask indexes don't cross the 128-bit lane.
11837   if (VT != MVT::v32i8 || !Subtarget->hasInt256() ||
11838       (!V2IsUndef && !V2IsAllZero && !V1IsAllZero))
11839     return SDValue();
11840
11841   if (V1IsAllZero && !V2IsAllZero) {
11842     CommuteVectorShuffleMask(MaskVals, 32);
11843     V1 = V2;
11844   }
11845   return getPSHUFB(MaskVals, V1, dl, DAG);
11846 }
11847
11848 /// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide
11849 /// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be
11850 /// done when every pair / quad of shuffle mask elements point to elements in
11851 /// the right sequence. e.g.
11852 /// vector_shuffle X, Y, <2, 3, | 10, 11, | 0, 1, | 14, 15>
11853 static
11854 SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
11855                                  SelectionDAG &DAG) {
11856   MVT VT = SVOp->getSimpleValueType(0);
11857   SDLoc dl(SVOp);
11858   unsigned NumElems = VT.getVectorNumElements();
11859   MVT NewVT;
11860   unsigned Scale;
11861   switch (VT.SimpleTy) {
11862   default: llvm_unreachable("Unexpected!");
11863   case MVT::v2i64:
11864   case MVT::v2f64:
11865            return SDValue(SVOp, 0);
11866   case MVT::v4f32:  NewVT = MVT::v2f64; Scale = 2; break;
11867   case MVT::v4i32:  NewVT = MVT::v2i64; Scale = 2; break;
11868   case MVT::v8i16:  NewVT = MVT::v4i32; Scale = 2; break;
11869   case MVT::v16i8:  NewVT = MVT::v4i32; Scale = 4; break;
11870   case MVT::v16i16: NewVT = MVT::v8i32; Scale = 2; break;
11871   case MVT::v32i8:  NewVT = MVT::v8i32; Scale = 4; break;
11872   }
11873
11874   SmallVector<int, 8> MaskVec;
11875   for (unsigned i = 0; i != NumElems; i += Scale) {
11876     int StartIdx = -1;
11877     for (unsigned j = 0; j != Scale; ++j) {
11878       int EltIdx = SVOp->getMaskElt(i+j);
11879       if (EltIdx < 0)
11880         continue;
11881       if (StartIdx < 0)
11882         StartIdx = (EltIdx / Scale);
11883       if (EltIdx != (int)(StartIdx*Scale + j))
11884         return SDValue();
11885     }
11886     MaskVec.push_back(StartIdx);
11887   }
11888
11889   SDValue V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(0));
11890   SDValue V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(1));
11891   return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]);
11892 }
11893
11894 /// getVZextMovL - Return a zero-extending vector move low node.
11895 ///
11896 static SDValue getVZextMovL(MVT VT, MVT OpVT,
11897                             SDValue SrcOp, SelectionDAG &DAG,
11898                             const X86Subtarget *Subtarget, SDLoc dl) {
11899   if (VT == MVT::v2f64 || VT == MVT::v4f32) {
11900     LoadSDNode *LD = nullptr;
11901     if (!isScalarLoadToVector(SrcOp.getNode(), &LD))
11902       LD = dyn_cast<LoadSDNode>(SrcOp);
11903     if (!LD) {
11904       // movssrr and movsdrr do not clear top bits. Try to use movd, movq
11905       // instead.
11906       MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32;
11907       if ((ExtVT != MVT::i64 || Subtarget->is64Bit()) &&
11908           SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR &&
11909           SrcOp.getOperand(0).getOpcode() == ISD::BITCAST &&
11910           SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) {
11911         // PR2108
11912         OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32;
11913         return DAG.getNode(ISD::BITCAST, dl, VT,
11914                            DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
11915                                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
11916                                                    OpVT,
11917                                                    SrcOp.getOperand(0)
11918                                                           .getOperand(0))));
11919       }
11920     }
11921   }
11922
11923   return DAG.getNode(ISD::BITCAST, dl, VT,
11924                      DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
11925                                  DAG.getNode(ISD::BITCAST, dl,
11926                                              OpVT, SrcOp)));
11927 }
11928
/// LowerVECTOR_SHUFFLE_256 - Handle all 256-bit wide vector shuffles
/// which could not be matched by any known target specific shuffle.
///
/// Compact8x32ShuffleNode is tried first. Otherwise the result is computed
/// one half ("lane") at a time: each half becomes either a half-width
/// shuffle of at most two extracted input halves, a BUILD_VECTOR of
/// individually extracted elements, or UNDEF. The two halves are then
/// joined with CONCAT_VECTORS.
static SDValue
LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {

  // Give the dedicated compaction lowering the first chance.
  SDValue NewOp = Compact8x32ShuffleNode(SVOp, DAG);
  if (NewOp.getNode())
    return NewOp;

  MVT VT = SVOp->getSimpleValueType(0);

  unsigned NumElems = VT.getVectorNumElements();
  unsigned NumLaneElems = NumElems / 2;

  SDLoc dl(SVOp);
  MVT EltVT = VT.getVectorElementType();
  // Type of one half of the result (same element type, half the elements).
  MVT NVT = MVT::getVectorVT(EltVT, NumLaneElems);
  SDValue Output[2];

  // Mask for the half-width shuffle of the current half; reset at the end of
  // every iteration.
  SmallVector<int, 16> Mask;
  for (unsigned l = 0; l < 2; ++l) {
    // Build a shuffle mask for the output, discovering on the fly which
    // input vectors to use as shuffle operands (recorded in InputUsed).
    // If building a suitable shuffle vector proves too hard, then bail
    // out with UseBuildVector set.
    bool UseBuildVector = false;
    int InputUsed[2] = { -1, -1 }; // Not yet discovered.
    unsigned LaneStart = l * NumLaneElems;
    for (unsigned i = 0; i != NumLaneElems; ++i) {
      // The mask element.  This indexes into the input.
      int Idx = SVOp->getMaskElt(i+LaneStart);
      if (Idx < 0) {
        // the mask element does not index into any input vector.
        Mask.push_back(-1);
        continue;
      }

      // The input vector this mask element indexes into. Inputs are counted
      // in units of half vectors here: Input / 2 selects the shuffle operand
      // and Input % 2 selects its low or high half (see the extraction
      // below).
      int Input = Idx / NumLaneElems;

      // Turn the index into an offset from the start of the input vector.
      Idx -= Input * NumLaneElems;

      // Find or create a shuffle vector operand to hold this input.
      unsigned OpNo;
      for (OpNo = 0; OpNo < array_lengthof(InputUsed); ++OpNo) {
        if (InputUsed[OpNo] == Input)
          // This input vector is already an operand.
          break;
        if (InputUsed[OpNo] < 0) {
          // Create a new operand for this input vector.
          InputUsed[OpNo] = Input;
          break;
        }
      }

      if (OpNo >= array_lengthof(InputUsed)) {
        // More than two input vectors used!  Give up on trying to create a
        // shuffle vector.  Insert all elements into a BUILD_VECTOR instead.
        UseBuildVector = true;
        break;
      }

      // Add the mask index for the new shuffle vector.
      Mask.push_back(Idx + OpNo * NumLaneElems);
    }

    if (UseBuildVector) {
      // Fallback: extract every element of this half individually. Unlike
      // the loop above, indices are decomposed relative to the full-width
      // operands (NumElems), because elements are extracted directly from
      // the original 256-bit inputs.
      SmallVector<SDValue, 16> SVOps;
      for (unsigned i = 0; i != NumLaneElems; ++i) {
        // The mask element.  This indexes into the input.
        int Idx = SVOp->getMaskElt(i+LaneStart);
        if (Idx < 0) {
          SVOps.push_back(DAG.getUNDEF(EltVT));
          continue;
        }

        // The input vector this mask element indexes into.
        int Input = Idx / NumElems;

        // Turn the index into an offset from the start of the input vector.
        Idx -= Input * NumElems;

        // Extract the vector element by hand.
        SVOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
                                    SVOp->getOperand(Input),
                                    DAG.getIntPtrConstant(Idx)));
      }

      // Construct the output using a BUILD_VECTOR.
      Output[l] = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, SVOps);
    } else if (InputUsed[0] < 0) {
      // No input vectors were used! The result is undefined.
      Output[l] = DAG.getUNDEF(NVT);
    } else {
      // InputUsed[k] / 2 is the shuffle operand and InputUsed[k] % 2 the
      // half of it; the second argument is the element index at which the
      // extracted half starts.
      SDValue Op0 = Extract128BitVector(SVOp->getOperand(InputUsed[0] / 2),
                                        (InputUsed[0] % 2) * NumLaneElems,
                                        DAG, dl);
      // If only one input was used, use an undefined vector for the other.
      SDValue Op1 = (InputUsed[1] < 0) ? DAG.getUNDEF(NVT) :
        Extract128BitVector(SVOp->getOperand(InputUsed[1] / 2),
                            (InputUsed[1] % 2) * NumLaneElems, DAG, dl);
      // At least one input vector was used. Create a new shuffle vector.
      Output[l] = DAG.getVectorShuffle(NVT, dl, Op0, Op1, &Mask[0]);
    }

    // Start the next half with an empty mask (this also discards a partial
    // mask left behind when the BUILD_VECTOR fallback was taken).
    Mask.clear();
  }

  // Concatenate the result back
  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Output[0], Output[1]);
}
12041
/// LowerVECTOR_SHUFFLE_128v4 - Handle all 128-bit wide vectors with
/// 4 elements, and match them with several different shuffle types.
///
/// The shuffle is rewritten as a short sequence of generic shuffles. Three
/// strategies are used, chosen by how many mask elements come from the
/// first operand (NumLo, mask index < 4) and from the second (NumHi):
///   * at most two from each operand: gather, then reorder;
///   * exactly three from one operand: build an intermediate vector holding
///     the odd element, then combine it with the majority operand;
///   * anything else: shuffle the low and high result halves separately and
///     combine them with a third shuffle.
static SDValue
LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
  SDValue V1 = SVOp->getOperand(0);
  SDValue V2 = SVOp->getOperand(1);
  SDLoc dl(SVOp);
  MVT VT = SVOp->getSimpleValueType(0);

  assert(VT.is128BitVector() && "Unsupported vector size");

  // Locs[i] remembers where result element i was placed by an intermediate
  // shuffle, as a (group, position) pair; (-1, -1) marks an undefined
  // element.
  std::pair<int, int> Locs[4];
  int Mask1[] = { -1, -1, -1, -1 };
  SmallVector<int, 8> PermMask(SVOp->getMask().begin(), SVOp->getMask().end());

  // Count the elements taken from each operand. While counting, build Mask1
  // so that elements from V1 are packed from slot 0 upwards and elements
  // from V2 from slot 2 upwards.
  unsigned NumHi = 0;
  unsigned NumLo = 0;
  for (unsigned i = 0; i != 4; ++i) {
    int Idx = PermMask[i];
    if (Idx < 0) {
      Locs[i] = std::make_pair(-1, -1);
    } else {
      assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!");
      if (Idx < 4) {
        Locs[i] = std::make_pair(0, NumLo);
        Mask1[NumLo] = Idx;
        NumLo++;
      } else {
        Locs[i] = std::make_pair(1, NumHi);
        // Only the first two V2 elements fit into Mask1; further ones are
        // just counted.
        if (2+NumHi < 4)
          Mask1[2+NumHi] = Idx;
        NumHi++;
      }
    }
  }

  if (NumLo <= 2 && NumHi <= 2) {
    // If no more than two elements come from either vector. This can be
    // implemented with two shuffles. First shuffle gather the elements.
    // The second shuffle, which takes the first shuffle as both of its
    // vector operands, put the elements into the right order.
    V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);

    int Mask2[] = { -1, -1, -1, -1 };

    for (unsigned i = 0; i != 4; ++i)
      if (Locs[i].first != -1) {
        // Result elements 0-1 read the first operand of the second shuffle
        // (indices 0-3), elements 2-3 read the second one (indices 4-7);
        // both operands are the gathered vector. Within it, V1 elements
        // occupy slots 0-1 and V2 elements slots 2-3.
        unsigned Idx = (i < 2) ? 0 : 4;
        Idx += Locs[i].first * 2 + Locs[i].second;
        Mask2[i] = Idx;
      }

    return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]);
  }

  if (NumLo == 3 || NumHi == 3) {
    // Otherwise, we must have three elements from one vector, call it X, and
    // one element from the other, call it Y.  First, use a shufps to build an
    // intermediate vector with the one element from Y and the element from X
    // that will be in the same half in the final destination (the indexes don't
    // matter). Then, use a shufps to build the final vector, taking the half
    // containing the element from Y from the intermediate, and the other half
    // from X.
    if (NumHi == 3) {
      // Normalize it so the 3 elements come from V1.
      CommuteVectorShuffleMask(PermMask, 4);
      std::swap(V1, V2);
    }

    // Find the element from V2.
    unsigned HiIndex;
    for (HiIndex = 0; HiIndex < 3; ++HiIndex) {
      int Val = PermMask[HiIndex];
      if (Val < 0)
        continue;
      if (Val >= 4)
        break;
    }

    // Intermediate vector: slot 0 holds the element from V2, slot 2 holds
    // its neighbour in the same result half (HiIndex ^ 1).
    Mask1[0] = PermMask[HiIndex];
    Mask1[1] = -1;
    Mask1[2] = PermMask[HiIndex^1];
    Mask1[3] = -1;
    V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);

    if (HiIndex >= 2) {
      // The V2 element belongs to the high half of the result: take the low
      // half straight from V1 and the high half from the intermediate
      // (indices 4 and 6 are its slots 0 and 2), ordered by the parity of
      // HiIndex.
      Mask1[0] = PermMask[0];
      Mask1[1] = PermMask[1];
      Mask1[2] = HiIndex & 1 ? 6 : 4;
      Mask1[3] = HiIndex & 1 ? 4 : 6;
      return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
    }

    // The V2 element belongs to the low half of the result: the intermediate
    // becomes the first operand (slots 0 and 2), and the high half is read
    // from V1, which is now the second operand, hence the +4 adjustment.
    Mask1[0] = HiIndex & 1 ? 2 : 0;
    Mask1[1] = HiIndex & 1 ? 0 : 2;
    Mask1[2] = PermMask[2];
    Mask1[3] = PermMask[3];
    if (Mask1[2] >= 0)
      Mask1[2] += 4;
    if (Mask1[3] >= 0)
      Mask1[3] += 4;
    return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]);
  }

  // Break it into (shuffle shuffle_hi, shuffle_lo).
  int LoMask[] = { -1, -1, -1, -1 };
  int HiMask[] = { -1, -1, -1, -1 };

  // Result elements 0-1 are described in LoMask, elements 2-3 in HiMask. In
  // either mask, elements from V1 fill slots 0-1 and elements from V2 fill
  // slots 2-3. Locs[i] records (0 = LoShuffle / 1 = HiShuffle, slot).
  int *MaskPtr = LoMask;
  unsigned MaskIdx = 0;
  unsigned LoIdx = 0;
  unsigned HiIdx = 2;
  for (unsigned i = 0; i != 4; ++i) {
    if (i == 2) {
      // Switch to the mask for the high half and restart the slot counters.
      MaskPtr = HiMask;
      MaskIdx = 1;
      LoIdx = 0;
      HiIdx = 2;
    }
    int Idx = PermMask[i];
    if (Idx < 0) {
      Locs[i] = std::make_pair(-1, -1);
    } else if (Idx < 4) {
      Locs[i] = std::make_pair(MaskIdx, LoIdx);
      MaskPtr[LoIdx] = Idx;
      LoIdx++;
    } else {
      Locs[i] = std::make_pair(MaskIdx, HiIdx);
      MaskPtr[HiIdx] = Idx;
      HiIdx++;
    }
  }

  SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]);
  SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]);
  // Final shuffle: pick every defined result element from the slot recorded
  // in Locs (indices 0-3 address LoShuffle, 4-7 address HiShuffle).
  int MaskOps[] = { -1, -1, -1, -1 };
  for (unsigned i = 0; i != 4; ++i)
    if (Locs[i].first != -1)
      MaskOps[i] = Locs[i].first * 4 + Locs[i].second;
  return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]);
}
12183
12184 static bool MayFoldVectorLoad(SDValue V) {
12185   while (V.hasOneUse() && V.getOpcode() == ISD::BITCAST)
12186     V = V.getOperand(0);
12187
12188   if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR)
12189     V = V.getOperand(0);
12190   if (V.hasOneUse() && V.getOpcode() == ISD::BUILD_VECTOR &&
12191       V.getNumOperands() == 2 && V.getOperand(1).getOpcode() == ISD::UNDEF)
12192     // BUILD_VECTOR (load), undef
12193     V = V.getOperand(0);
12194
12195   return MayFoldLoad(V);
12196 }
12197
12198 static
12199 SDValue getMOVDDup(SDValue &Op, SDLoc &dl, SDValue V1, SelectionDAG &DAG) {
12200   MVT VT = Op.getSimpleValueType();
12201
12202   // Canonicalize to v2f64.
12203   V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1);
12204   return DAG.getNode(ISD::BITCAST, dl, VT,
12205                      getTargetShuffleNode(X86ISD::MOVDDUP, dl, MVT::v2f64,
12206                                           V1, DAG));
12207 }
12208
12209 static
12210 SDValue getMOVLowToHigh(SDValue &Op, SDLoc &dl, SelectionDAG &DAG,
12211                         bool HasSSE2) {
12212   SDValue V1 = Op.getOperand(0);
12213   SDValue V2 = Op.getOperand(1);
12214   MVT VT = Op.getSimpleValueType();
12215
12216   assert(VT != MVT::v2i64 && "unsupported shuffle type");
12217
12218   if (HasSSE2 && VT == MVT::v2f64)
12219     return getTargetShuffleNode(X86ISD::MOVLHPD, dl, VT, V1, V2, DAG);
12220
12221   // v4f32 or v4i32: canonicalize to v4f32 (which is legal for SSE1)
12222   return DAG.getNode(ISD::BITCAST, dl, VT,
12223                      getTargetShuffleNode(X86ISD::MOVLHPS, dl, MVT::v4f32,
12224                            DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V1),
12225                            DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V2), DAG));
12226 }
12227
12228 static
12229 SDValue getMOVHighToLow(SDValue &Op, SDLoc &dl, SelectionDAG &DAG) {
12230   SDValue V1 = Op.getOperand(0);
12231   SDValue V2 = Op.getOperand(1);
12232   MVT VT = Op.getSimpleValueType();
12233
12234   assert((VT == MVT::v4i32 || VT == MVT::v4f32) &&
12235          "unsupported shuffle type");
12236
12237   if (V2.getOpcode() == ISD::UNDEF)
12238     V2 = V1;
12239
12240   // v4i32 or v4f32
12241   return getTargetShuffleNode(X86ISD::MOVHLPS, dl, VT, V1, V2, DAG);
12242 }
12243
12244 static
12245 SDValue getMOVLP(SDValue &Op, SDLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
12246   SDValue V1 = Op.getOperand(0);
12247   SDValue V2 = Op.getOperand(1);
12248   MVT VT = Op.getSimpleValueType();
12249   unsigned NumElems = VT.getVectorNumElements();
12250
12251   // Use MOVLPS and MOVLPD in case V1 or V2 are loads. During isel, the second
12252   // operand of these instructions is only memory, so check if there's a
12253   // potencial load folding here, otherwise use SHUFPS or MOVSD to match the
12254   // same masks.
12255   bool CanFoldLoad = false;
12256
12257   // Trivial case, when V2 comes from a load.
12258   if (MayFoldVectorLoad(V2))
12259     CanFoldLoad = true;
12260
12261   // When V1 is a load, it can be folded later into a store in isel, example:
12262   //  (store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), addr:$src1)
12263   //    turns into:
12264   //  (MOVLPSmr addr:$src1, VR128:$src2)
12265   // So, recognize this potential and also use MOVLPS or MOVLPD
12266   else if (MayFoldVectorLoad(V1) && MayFoldIntoStore(Op))
12267     CanFoldLoad = true;
12268
12269   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
12270   if (CanFoldLoad) {
12271     if (HasSSE2 && NumElems == 2)
12272       return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG);
12273
12274     if (NumElems == 4)
12275       // If we don't care about the second element, proceed to use movss.
12276       if (SVOp->getMaskElt(1) != -1)
12277         return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG);
12278   }
12279
12280   // movl and movlp will both match v2i64, but v2i64 is never matched by
12281   // movl earlier because we make it strict to avoid messing with the movlp load
12282   // folding logic (see the code above getMOVLP call). Match it here then,
12283   // this is horrible, but will stay like this until we move all shuffle
12284   // matching to x86 specific nodes. Note that for the 1st condition all
12285   // types are matched with movsd.
12286   if (HasSSE2) {
12287     // FIXME: isMOVLMask should be checked and matched before getMOVLP,
12288     // as to remove this logic from here, as much as possible
12289     if (NumElems == 2 || !isMOVLMask(SVOp->getMask(), VT))
12290       return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
12291     return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
12292   }
12293
12294   assert(VT != MVT::v4i32 && "unsupported shuffle type");
12295
12296   // Invert the operand order and use SHUFPS to match it.
12297   return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V2, V1,
12298                               getShuffleSHUFImmediate(SVOp), DAG);
12299 }
12300
12301 static SDValue NarrowVectorLoadToElement(LoadSDNode *Load, unsigned Index,
12302                                          SelectionDAG &DAG) {
12303   SDLoc dl(Load);
12304   MVT VT = Load->getSimpleValueType(0);
12305   MVT EVT = VT.getVectorElementType();
12306   SDValue Addr = Load->getOperand(1);
12307   SDValue NewAddr = DAG.getNode(
12308       ISD::ADD, dl, Addr.getSimpleValueType(), Addr,
12309       DAG.getConstant(Index * EVT.getStoreSize(), Addr.getSimpleValueType()));
12310
12311   SDValue NewLoad =
12312       DAG.getLoad(EVT, dl, Load->getChain(), NewAddr,
12313                   DAG.getMachineFunction().getMachineMemOperand(
12314                       Load->getMemOperand(), 0, EVT.getStoreSize()));
12315   return NewLoad;
12316 }
12317
// It is only safe to call this function if isINSERTPSMask is true for
// this shufflevector mask.
//
// Lowers a v4f32 / v4i32 shuffle that moves a single element of one operand
// ("From") into the other operand ("To"). The result is an X86ISD::INSERTPS
// node, or an INSERT_VECTOR_ELT when an i32 element is taken from a foldable
// load.
static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl,
                           SelectionDAG &DAG) {
  // Generate an insertps instruction when inserting an f32 from memory onto a
  // v4f32 or when copying a member from one v4f32 to another.
  // We also use it for transferring i32 from one register to another,
  // since it simply copies the same bits.
  // If we're transferring an i32 from memory to a specific element in a
  // register, we output a generic DAG that will match the PINSRD
  // instruction.
  MVT VT = SVOp->getSimpleValueType(0);
  MVT EVT = VT.getVectorElementType();
  SDValue V1 = SVOp->getOperand(0);
  SDValue V2 = SVOp->getOperand(1);
  auto Mask = SVOp->getMask();
  assert((VT == MVT::v4f32 || VT == MVT::v4i32) &&
         "unsupported vector type for insertps/pinsrd");

  // Mask values 0-3 select from V1, values 4 and above select from V2;
  // negative (undefined) entries match neither predicate.
  auto FromV1Predicate = [](const int &i) { return i < 4 && i > -1; };
  auto FromV2Predicate = [](const int &i) { return i >= 4; };
  int FromV1 = std::count_if(Mask.begin(), Mask.end(), FromV1Predicate);

  // From is the vector supplying the single inserted element, To is the
  // vector receiving it, and DestIndex is the result position of that
  // element.
  SDValue From;
  SDValue To;
  unsigned DestIndex;
  if (FromV1 == 1) {
    From = V1;
    To = V2;
    DestIndex = std::find_if(Mask.begin(), Mask.end(), FromV1Predicate) -
                Mask.begin();

    // If we have 1 element from each vector, we have to check if we're
    // changing V1's element's place. If so, we're done. Otherwise, we
    // should assume we're changing V2's element's place and behave
    // accordingly.
    int FromV2 = std::count_if(Mask.begin(), Mask.end(), FromV2Predicate);
    assert(DestIndex <= INT32_MAX && "truncated destination index");
    if (FromV1 == FromV2 &&
        static_cast<int>(DestIndex) == Mask[DestIndex] % 4) {
      // V1's element stays at its own position, so the element being moved
      // is the one coming from V2.
      From = V2;
      To = V1;
      DestIndex =
          std::find_if(Mask.begin(), Mask.end(), FromV2Predicate) - Mask.begin();
    }
  } else {
    assert(std::count_if(Mask.begin(), Mask.end(), FromV2Predicate) == 1 &&
           "More than one element from V1 and from V2, or no elements from one "
           "of the vectors. This case should not have returned true from "
           "isINSERTPSMask");
    From = V2;
    To = V1;
    DestIndex =
        std::find_if(Mask.begin(), Mask.end(), FromV2Predicate) - Mask.begin();
  }

  // Get an index into the source vector in the range [0,4) (the mask is
  // in the range [0,8) because it can address V1 and V2)
  unsigned SrcIndex = Mask[DestIndex] % 4;
  if (MayFoldLoad(From)) {
    // Trivial case, when From comes from a load and is only used by the
    // shuffle. Make it use insertps from the vector that we need from that
    // load.
    SDValue NewLoad =
        NarrowVectorLoadToElement(cast<LoadSDNode>(From), SrcIndex, DAG);
    if (!NewLoad.getNode())
      return SDValue();

    if (EVT == MVT::f32) {
      // Create this as a scalar to vector to match the instruction pattern.
      SDValue LoadScalarToVector =
          DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, NewLoad);
      // Only the destination index is encoded (at bit 4); the source index
      // field is left at zero because the scalar was already narrowed to the
      // requested element.
      SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4);
      return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, LoadScalarToVector,
                         InsertpsMask);
    } else { // EVT == MVT::i32
      // If we're getting an i32 from memory, use an INSERT_VECTOR_ELT
      // instruction, to match the PINSRD instruction, which loads an i32 to a
      // certain vector element.
      return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, To, NewLoad,
                         DAG.getConstant(DestIndex, MVT::i32));
    }
  }

  // Vector-element-to-vector
  // The immediate carries the destination index at bit 4 and the source
  // index at bit 6.
  SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4 | SrcIndex << 6);
  return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, From, InsertpsMask);
}
12406
12407 // Reduce a vector shuffle to zext.
12408 static SDValue LowerVectorIntExtend(SDValue Op, const X86Subtarget *Subtarget,
12409                                     SelectionDAG &DAG) {
12410   // PMOVZX is only available from SSE41.
12411   if (!Subtarget->hasSSE41())
12412     return SDValue();
12413
12414   MVT VT = Op.getSimpleValueType();
12415
12416   // Only AVX2 support 256-bit vector integer extending.
12417   if (!Subtarget->hasInt256() && VT.is256BitVector())
12418     return SDValue();
12419
12420   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
12421   SDLoc DL(Op);
12422   SDValue V1 = Op.getOperand(0);
12423   SDValue V2 = Op.getOperand(1);
12424   unsigned NumElems = VT.getVectorNumElements();
12425
12426   // Extending is an unary operation and the element type of the source vector
12427   // won't be equal to or larger than i64.
12428   if (V2.getOpcode() != ISD::UNDEF || !VT.isInteger() ||
12429       VT.getVectorElementType() == MVT::i64)
12430     return SDValue();
12431
12432   // Find the expansion ratio, e.g. expanding from i8 to i32 has a ratio of 4.
12433   unsigned Shift = 1; // Start from 2, i.e. 1 << 1.
12434   while ((1U << Shift) < NumElems) {
12435     if (SVOp->getMaskElt(1U << Shift) == 1)
12436       break;
12437     Shift += 1;
12438     // The maximal ratio is 8, i.e. from i8 to i64.
12439     if (Shift > 3)
12440       return SDValue();
12441   }
12442
12443   // Check the shuffle mask.
12444   unsigned Mask = (1U << Shift) - 1;
12445   for (unsigned i = 0; i != NumElems; ++i) {
12446     int EltIdx = SVOp->getMaskElt(i);
12447     if ((i & Mask) != 0 && EltIdx != -1)
12448       return SDValue();
12449     if ((i & Mask) == 0 && (unsigned)EltIdx != (i >> Shift))
12450       return SDValue();
12451   }
12452
12453   unsigned NBits = VT.getVectorElementType().getSizeInBits() << Shift;
12454   MVT NeVT = MVT::getIntegerVT(NBits);
12455   MVT NVT = MVT::getVectorVT(NeVT, NumElems >> Shift);
12456
12457   if (!DAG.getTargetLoweringInfo().isTypeLegal(NVT))
12458     return SDValue();
12459
12460   return DAG.getNode(ISD::BITCAST, DL, VT,
12461                      DAG.getNode(X86ISD::VZEXT, DL, NVT, V1));
12462 }
12463
12464 static SDValue NormalizeVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
12465                                       SelectionDAG &DAG) {
12466   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
12467   MVT VT = Op.getSimpleValueType();
12468   SDLoc dl(Op);
12469   SDValue V1 = Op.getOperand(0);
12470   SDValue V2 = Op.getOperand(1);
12471
12472   if (isZeroShuffle(SVOp))
12473     return getZeroVector(VT, Subtarget, DAG, dl);
12474
12475   // Handle splat operations
12476   if (SVOp->isSplat()) {
12477     // Use vbroadcast whenever the splat comes from a foldable load
12478     SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG);
12479     if (Broadcast.getNode())
12480       return Broadcast;
12481   }
12482
12483   // Check integer expanding shuffles.
12484   SDValue NewOp = LowerVectorIntExtend(Op, Subtarget, DAG);
12485   if (NewOp.getNode())
12486     return NewOp;
12487
12488   // If the shuffle can be profitably rewritten as a narrower shuffle, then
12489   // do it!
12490   if (VT == MVT::v8i16 || VT == MVT::v16i8 || VT == MVT::v16i16 ||
12491       VT == MVT::v32i8) {
12492     SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
12493     if (NewOp.getNode())
12494       return DAG.getNode(ISD::BITCAST, dl, VT, NewOp);
12495   } else if (VT.is128BitVector() && Subtarget->hasSSE2()) {
12496     // FIXME: Figure out a cleaner way to do this.
12497     if (ISD::isBuildVectorAllZeros(V2.getNode())) {
12498       SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
12499       if (NewOp.getNode()) {
12500         MVT NewVT = NewOp.getSimpleValueType();
12501         if (isCommutedMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(),
12502                                NewVT, true, false))
12503           return getVZextMovL(VT, NewVT, NewOp.getOperand(0), DAG, Subtarget,
12504                               dl);
12505       }
12506     } else if (ISD::isBuildVectorAllZeros(V1.getNode())) {
12507       SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
12508       if (NewOp.getNode()) {
12509         MVT NewVT = NewOp.getSimpleValueType();
12510         if (isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), NewVT))
12511           return getVZextMovL(VT, NewVT, NewOp.getOperand(1), DAG, Subtarget,
12512                               dl);
12513       }
12514     }
12515   }
12516   return SDValue();
12517 }
12518
12519 SDValue
12520 X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
12521   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
12522   SDValue V1 = Op.getOperand(0);
12523   SDValue V2 = Op.getOperand(1);
12524   MVT VT = Op.getSimpleValueType();
12525   SDLoc dl(Op);
12526   unsigned NumElems = VT.getVectorNumElements();
12527   bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
12528   bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
12529   bool V1IsSplat = false;
12530   bool V2IsSplat = false;
12531   bool HasSSE2 = Subtarget->hasSSE2();
12532   bool HasFp256    = Subtarget->hasFp256();
12533   bool HasInt256   = Subtarget->hasInt256();
12534   MachineFunction &MF = DAG.getMachineFunction();
12535   bool OptForSize = MF.getFunction()->getAttributes().
12536     hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
12537
12538   // Check if we should use the experimental vector shuffle lowering. If so,
12539   // delegate completely to that code path.
12540   if (ExperimentalVectorShuffleLowering)
12541     return lowerVectorShuffle(Op, Subtarget, DAG);
12542
12543   assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles");
12544
12545   if (V1IsUndef && V2IsUndef)
12546     return DAG.getUNDEF(VT);
12547
12548   // When we create a shuffle node we put the UNDEF node to second operand,
12549   // but in some cases the first operand may be transformed to UNDEF.
12550   // In this case we should just commute the node.
12551   if (V1IsUndef)
12552     return DAG.getCommutedVectorShuffle(*SVOp);
12553
12554   // Vector shuffle lowering takes 3 steps:
12555   //
12556   // 1) Normalize the input vectors. Here splats, zeroed vectors, profitable
12557   //    narrowing and commutation of operands should be handled.
12558   // 2) Matching of shuffles with known shuffle masks to x86 target specific
12559   //    shuffle nodes.
12560   // 3) Rewriting of unmatched masks into new generic shuffle operations,
12561   //    so the shuffle can be broken into other shuffles and the legalizer can
12562   //    try the lowering again.
12563   //
12564   // The general idea is that no vector_shuffle operation should be left to
12565   // be matched during isel, all of them must be converted to a target specific
12566   // node here.
12567
12568   // Normalize the input vectors. Here splats, zeroed vectors, profitable
12569   // narrowing and commutation of operands should be handled. The actual code
12570   // doesn't include all of those, work in progress...
12571   SDValue NewOp = NormalizeVectorShuffle(Op, Subtarget, DAG);
12572   if (NewOp.getNode())
12573     return NewOp;
12574
12575   SmallVector<int, 8> M(SVOp->getMask().begin(), SVOp->getMask().end());
12576
12577   // NOTE: isPSHUFDMask can also match both masks below (unpckl_undef and
12578   // unpckh_undef). Only use pshufd if speed is more important than size.
12579   if (OptForSize && isUNPCKL_v_undef_Mask(M, VT, HasInt256))
12580     return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
12581   if (OptForSize && isUNPCKH_v_undef_Mask(M, VT, HasInt256))
12582     return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
12583
12584   if (isMOVDDUPMask(M, VT) && Subtarget->hasSSE3() &&
12585       V2IsUndef && MayFoldVectorLoad(V1))
12586     return getMOVDDup(Op, dl, V1, DAG);
12587
12588   if (isMOVHLPS_v_undef_Mask(M, VT))
12589     return getMOVHighToLow(Op, dl, DAG);
12590
12591   // Use to match splats
12592   if (HasSSE2 && isUNPCKHMask(M, VT, HasInt256) && V2IsUndef &&
12593       (VT == MVT::v2f64 || VT == MVT::v2i64))
12594     return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
12595
12596   if (isPSHUFDMask(M, VT)) {
12597     // The actual implementation will match the mask in the if above and then
12598     // during isel it can match several different instructions, not only pshufd
12599     // as its name says, sad but true, emulate the behavior for now...
12600     if (isMOVDDUPMask(M, VT) && ((VT == MVT::v4f32 || VT == MVT::v2i64)))
12601       return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V1, DAG);
12602
12603     unsigned TargetMask = getShuffleSHUFImmediate(SVOp);
12604
12605     if (HasSSE2 && (VT == MVT::v4f32 || VT == MVT::v4i32))
12606       return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG);
12607
12608     if (HasFp256 && (VT == MVT::v4f32 || VT == MVT::v2f64))
12609       return getTargetShuffleNode(X86ISD::VPERMILPI, dl, VT, V1, TargetMask,
12610                                   DAG);
12611
12612     return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V1,
12613                                 TargetMask, DAG);
12614   }
12615
12616   if (isPALIGNRMask(M, VT, Subtarget))
12617     return getTargetShuffleNode(X86ISD::PALIGNR, dl, VT, V1, V2,
12618                                 getShufflePALIGNRImmediate(SVOp),
12619                                 DAG);
12620
12621   if (isVALIGNMask(M, VT, Subtarget))
12622     return getTargetShuffleNode(X86ISD::VALIGN, dl, VT, V1, V2,
12623                                 getShuffleVALIGNImmediate(SVOp),
12624                                 DAG);
12625
12626   // Check if this can be converted into a logical shift.
12627   bool isLeft = false;
12628   unsigned ShAmt = 0;
12629   SDValue ShVal;
12630   bool isShift = HasSSE2 && isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt);
12631   if (isShift && ShVal.hasOneUse()) {
12632     // If the shifted value has multiple uses, it may be cheaper to use
12633     // v_set0 + movlhps or movhlps, etc.
12634     MVT EltVT = VT.getVectorElementType();
12635     ShAmt *= EltVT.getSizeInBits();
12636     return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
12637   }
12638
12639   if (isMOVLMask(M, VT)) {
12640     if (ISD::isBuildVectorAllZeros(V1.getNode()))
12641       return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl);
12642     if (!isMOVLPMask(M, VT)) {
12643       if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64))
12644         return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
12645
12646       if (VT == MVT::v4i32 || VT == MVT::v4f32)
12647         return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
12648     }
12649   }
12650
12651   // FIXME: fold these into legal mask.
12652   if (isMOVLHPSMask(M, VT) && !isUNPCKLMask(M, VT, HasInt256))
12653     return getMOVLowToHigh(Op, dl, DAG, HasSSE2);
12654
12655   if (isMOVHLPSMask(M, VT))
12656     return getMOVHighToLow(Op, dl, DAG);
12657
12658   if (V2IsUndef && isMOVSHDUPMask(M, VT, Subtarget))
12659     return getTargetShuffleNode(X86ISD::MOVSHDUP, dl, VT, V1, DAG);
12660
12661   if (V2IsUndef && isMOVSLDUPMask(M, VT, Subtarget))
12662     return getTargetShuffleNode(X86ISD::MOVSLDUP, dl, VT, V1, DAG);
12663
12664   if (isMOVLPMask(M, VT))
12665     return getMOVLP(Op, dl, DAG, HasSSE2);
12666
12667   if (ShouldXformToMOVHLPS(M, VT) ||
12668       ShouldXformToMOVLP(V1.getNode(), V2.getNode(), M, VT))
12669     return DAG.getCommutedVectorShuffle(*SVOp);
12670
12671   if (isShift) {
12672     // No better options. Use a vshldq / vsrldq.
12673     MVT EltVT = VT.getVectorElementType();
12674     ShAmt *= EltVT.getSizeInBits();
12675     return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
12676   }
12677
12678   bool Commuted = false;
12679   // FIXME: This should also accept a bitcast of a splat?  Be careful, not
12680   // 1,1,1,1 -> v8i16 though.
12681   BitVector UndefElements;
12682   if (auto *BVOp = dyn_cast<BuildVectorSDNode>(V1.getNode()))
12683     if (BVOp->getConstantSplatNode(&UndefElements) && UndefElements.none())
12684       V1IsSplat = true;
12685   if (auto *BVOp = dyn_cast<BuildVectorSDNode>(V2.getNode()))
12686     if (BVOp->getConstantSplatNode(&UndefElements) && UndefElements.none())
12687       V2IsSplat = true;
12688
12689   // Canonicalize the splat or undef, if present, to be on the RHS.
12690   if (!V2IsUndef && V1IsSplat && !V2IsSplat) {
12691     CommuteVectorShuffleMask(M, NumElems);
12692     std::swap(V1, V2);
12693     std::swap(V1IsSplat, V2IsSplat);
12694     Commuted = true;
12695   }
12696
12697   if (isCommutedMOVLMask(M, VT, V2IsSplat, V2IsUndef)) {
12698     // Shuffling low element of v1 into undef, just return v1.
12699     if (V2IsUndef)
12700       return V1;
12701     // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which
12702     // the instruction selector will not match, so get a canonical MOVL with
12703     // swapped operands to undo the commute.
12704     return getMOVL(DAG, dl, VT, V2, V1);
12705   }
12706
12707   if (isUNPCKLMask(M, VT, HasInt256))
12708     return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
12709
12710   if (isUNPCKHMask(M, VT, HasInt256))
12711     return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
12712
12713   if (V2IsSplat) {
12714     // Normalize mask so all entries that point to V2 points to its first
12715     // element then try to match unpck{h|l} again. If match, return a
12716     // new vector_shuffle with the corrected mask.p
12717     SmallVector<int, 8> NewMask(M.begin(), M.end());
12718     NormalizeMask(NewMask, NumElems);
12719     if (isUNPCKLMask(NewMask, VT, HasInt256, true))
12720       return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
12721     if (isUNPCKHMask(NewMask, VT, HasInt256, true))
12722       return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
12723   }
12724
12725   if (Commuted) {
12726     // Commute is back and try unpck* again.
12727     // FIXME: this seems wrong.
12728     CommuteVectorShuffleMask(M, NumElems);
12729     std::swap(V1, V2);
12730     std::swap(V1IsSplat, V2IsSplat);
12731
12732     if (isUNPCKLMask(M, VT, HasInt256))
12733       return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
12734
12735     if (isUNPCKHMask(M, VT, HasInt256))
12736       return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
12737   }
12738
12739   // Normalize the node to match x86 shuffle ops if needed
12740   if (!V2IsUndef && (isSHUFPMask(M, VT, /* Commuted */ true)))
12741     return DAG.getCommutedVectorShuffle(*SVOp);
12742
12743   // The checks below are all present in isShuffleMaskLegal, but they are
12744   // inlined here right now to enable us to directly emit target specific
12745   // nodes, and remove one by one until they don't return Op anymore.
12746
12747   if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) &&
12748       SVOp->getSplatIndex() == 0 && V2IsUndef) {
12749     if (VT == MVT::v2f64 || VT == MVT::v2i64)
12750       return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
12751   }
12752
12753   if (isPSHUFHWMask(M, VT, HasInt256))
12754     return getTargetShuffleNode(X86ISD::PSHUFHW, dl, VT, V1,
12755                                 getShufflePSHUFHWImmediate(SVOp),
12756                                 DAG);
12757
12758   if (isPSHUFLWMask(M, VT, HasInt256))
12759     return getTargetShuffleNode(X86ISD::PSHUFLW, dl, VT, V1,
12760                                 getShufflePSHUFLWImmediate(SVOp),
12761                                 DAG);
12762
12763   unsigned MaskValue;
12764   if (isBlendMask(M, VT, Subtarget->hasSSE41(), HasInt256, &MaskValue))
12765     return LowerVECTOR_SHUFFLEtoBlend(SVOp, MaskValue, Subtarget, DAG);
12766
12767   if (isSHUFPMask(M, VT))
12768     return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V2,
12769                                 getShuffleSHUFImmediate(SVOp), DAG);
12770
12771   if (isUNPCKL_v_undef_Mask(M, VT, HasInt256))
12772     return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
12773   if (isUNPCKH_v_undef_Mask(M, VT, HasInt256))
12774     return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
12775
12776   //===--------------------------------------------------------------------===//
12777   // Generate target specific nodes for 128 or 256-bit shuffles only
12778   // supported in the AVX instruction set.
12779   //
12780
12781   // Handle VMOVDDUPY permutations
12782   if (V2IsUndef && isMOVDDUPYMask(M, VT, HasFp256))
12783     return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG);
12784
12785   // Handle VPERMILPS/D* permutations
12786   if (isVPERMILPMask(M, VT)) {
12787     if ((HasInt256 && VT == MVT::v8i32) || VT == MVT::v16i32)
12788       return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1,
12789                                   getShuffleSHUFImmediate(SVOp), DAG);
12790     return getTargetShuffleNode(X86ISD::VPERMILPI, dl, VT, V1,
12791                                 getShuffleSHUFImmediate(SVOp), DAG);
12792   }
12793
12794   unsigned Idx;
12795   if (VT.is512BitVector() && isINSERT64x4Mask(M, VT, &Idx))
12796     return Insert256BitVector(V1, Extract256BitVector(V2, 0, DAG, dl),
12797                               Idx*(NumElems/2), DAG, dl);
12798
12799   // Handle VPERM2F128/VPERM2I128 permutations
12800   if (isVPERM2X128Mask(M, VT, HasFp256))
12801     return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1,
12802                                 V2, getShuffleVPERM2X128Immediate(SVOp), DAG);
12803
12804   if (Subtarget->hasSSE41() && isINSERTPSMask(M, VT))
12805     return getINSERTPS(SVOp, dl, DAG);
12806
12807   unsigned Imm8;
12808   if (V2IsUndef && HasInt256 && isPermImmMask(M, VT, Imm8))
12809     return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1, Imm8, DAG);
12810
12811   if ((V2IsUndef && HasInt256 && VT.is256BitVector() && NumElems == 8) ||
12812       VT.is512BitVector()) {
12813     MVT MaskEltVT = MVT::getIntegerVT(VT.getVectorElementType().getSizeInBits());
12814     MVT MaskVectorVT = MVT::getVectorVT(MaskEltVT, NumElems);
12815     SmallVector<SDValue, 16> permclMask;
12816     for (unsigned i = 0; i != NumElems; ++i) {
12817       permclMask.push_back(DAG.getConstant((M[i]>=0) ? M[i] : 0, MaskEltVT));
12818     }
12819
12820     SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVectorVT, permclMask);
12821     if (V2IsUndef)
12822       // Bitcast is for VPERMPS since mask is v8i32 but node takes v8f32
12823       return DAG.getNode(X86ISD::VPERMV, dl, VT,
12824                           DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1);
12825     return DAG.getNode(X86ISD::VPERMV3, dl, VT, V1,
12826                        DAG.getNode(ISD::BITCAST, dl, VT, Mask), V2);
12827   }
12828
12829   //===--------------------------------------------------------------------===//
12830   // Since no target specific shuffle was selected for this generic one,
12831   // lower it into other known shuffles. FIXME: this isn't true yet, but
12832   // this is the plan.
12833   //
12834
12835   // Handle v8i16 specifically since SSE can do byte extraction and insertion.
12836   if (VT == MVT::v8i16) {
12837     SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(Op, Subtarget, DAG);
12838     if (NewOp.getNode())
12839       return NewOp;
12840   }
12841
12842   if (VT == MVT::v16i16 && HasInt256) {
12843     SDValue NewOp = LowerVECTOR_SHUFFLEv16i16(Op, DAG);
12844     if (NewOp.getNode())
12845       return NewOp;
12846   }
12847
12848   if (VT == MVT::v16i8) {
12849     SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, Subtarget, DAG);
12850     if (NewOp.getNode())
12851       return NewOp;
12852   }
12853
12854   if (VT == MVT::v32i8) {
12855     SDValue NewOp = LowerVECTOR_SHUFFLEv32i8(SVOp, Subtarget, DAG);
12856     if (NewOp.getNode())
12857       return NewOp;
12858   }
12859
12860   // Handle all 128-bit wide vectors with 4 elements, and match them with
12861   // several different shuffle types.
12862   if (NumElems == 4 && VT.is128BitVector())
12863     return LowerVECTOR_SHUFFLE_128v4(SVOp, DAG);
12864
12865   // Handle general 256-bit shuffles
12866   if (VT.is256BitVector())
12867     return LowerVECTOR_SHUFFLE_256(SVOp, DAG);
12868
12869   return SDValue();
12870 }
12871
12872 // This function assumes its argument is a BUILD_VECTOR of constants or
12873 // undef SDNodes. i.e: ISD::isBuildVectorOfConstantSDNodes(BuildVector) is
12874 // true.
12875 static bool BUILD_VECTORtoBlendMask(BuildVectorSDNode *BuildVector,
12876                                     unsigned &MaskValue) {
12877   MaskValue = 0;
12878   unsigned NumElems = BuildVector->getNumOperands();
12879   // There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
12880   unsigned NumLanes = (NumElems - 1) / 8 + 1;
12881   unsigned NumElemsInLane = NumElems / NumLanes;
12882
12883   // Blend for v16i16 should be symetric for the both lanes.
12884   for (unsigned i = 0; i < NumElemsInLane; ++i) {
12885     SDValue EltCond = BuildVector->getOperand(i);
12886     SDValue SndLaneEltCond =
12887         (NumLanes == 2) ? BuildVector->getOperand(i + NumElemsInLane) : EltCond;
12888
12889     int Lane1Cond = -1, Lane2Cond = -1;
12890     if (isa<ConstantSDNode>(EltCond))
12891       Lane1Cond = !isZero(EltCond);
12892     if (isa<ConstantSDNode>(SndLaneEltCond))
12893       Lane2Cond = !isZero(SndLaneEltCond);
12894
12895     if (Lane1Cond == Lane2Cond || Lane2Cond < 0)
12896       // Lane1Cond != 0, means we want the first argument.
12897       // Lane1Cond == 0, means we want the second argument.
12898       // The encoding of this argument is 0 for the first argument, 1
12899       // for the second. Therefore, invert the condition.
12900       MaskValue |= !Lane1Cond << i;
12901     else if (Lane1Cond < 0)
12902       MaskValue |= !Lane2Cond << i;
12903     else
12904       return false;
12905   }
12906   return true;
12907 }
12908
12909 /// \brief Try to lower a VSELECT instruction to an immediate-controlled blend
12910 /// instruction.
12911 static SDValue lowerVSELECTtoBLENDI(SDValue Op, const X86Subtarget *Subtarget,
12912                                     SelectionDAG &DAG) {
12913   SDValue Cond = Op.getOperand(0);
12914   SDValue LHS = Op.getOperand(1);
12915   SDValue RHS = Op.getOperand(2);
12916   SDLoc dl(Op);
12917   MVT VT = Op.getSimpleValueType();
12918   MVT EltVT = VT.getVectorElementType();
12919   unsigned NumElems = VT.getVectorNumElements();
12920
12921   // There is no blend with immediate in AVX-512.
12922   if (VT.is512BitVector())
12923     return SDValue();
12924
12925   if (!Subtarget->hasSSE41() || EltVT == MVT::i8)
12926     return SDValue();
12927   if (!Subtarget->hasInt256() && VT == MVT::v16i16)
12928     return SDValue();
12929
12930   if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
12931     return SDValue();
12932
12933   // Check the mask for BLEND and build the value.
12934   unsigned MaskValue = 0;
12935   if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue))
12936     return SDValue();
12937
12938   // Convert i32 vectors to floating point if it is not AVX2.
12939   // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors.
12940   MVT BlendVT = VT;
12941   if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) {
12942     BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()),
12943                                NumElems);
12944     LHS = DAG.getNode(ISD::BITCAST, dl, VT, LHS);
12945     RHS = DAG.getNode(ISD::BITCAST, dl, VT, RHS);
12946   }
12947
12948   SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, LHS, RHS,
12949                             DAG.getConstant(MaskValue, MVT::i32));
12950   return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
12951 }
12952
12953 SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
12954   // A vselect where all conditions and data are constants can be optimized into
12955   // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
12956   if (ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(0).getNode()) &&
12957       ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(1).getNode()) &&
12958       ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode()))
12959     return SDValue();
12960
12961   SDValue BlendOp = lowerVSELECTtoBLENDI(Op, Subtarget, DAG);
12962   if (BlendOp.getNode())
12963     return BlendOp;
12964
12965   // Some types for vselect were previously set to Expand, not Legal or
12966   // Custom. Return an empty SDValue so we fall-through to Expand, after
12967   // the Custom lowering phase.
12968   MVT VT = Op.getSimpleValueType();
12969   switch (VT.SimpleTy) {
12970   default:
12971     break;
12972   case MVT::v8i16:
12973   case MVT::v16i16:
12974     if (Subtarget->hasBWI() && Subtarget->hasVLX())
12975       break;
12976     return SDValue();
12977   }
12978
12979   // We couldn't create a "Blend with immediate" node.
12980   // This node should still be legal, but we'll have to emit a blendv*
12981   // instruction.
12982   return Op;
12983 }
12984
12985 static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
12986   MVT VT = Op.getSimpleValueType();
12987   SDLoc dl(Op);
12988
12989   if (!Op.getOperand(0).getSimpleValueType().is128BitVector())
12990     return SDValue();
12991
12992   if (VT.getSizeInBits() == 8) {
12993     SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
12994                                   Op.getOperand(0), Op.getOperand(1));
12995     SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
12996                                   DAG.getValueType(VT));
12997     return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
12998   }
12999
13000   if (VT.getSizeInBits() == 16) {
13001     unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
13002     // If Idx is 0, it's cheaper to do a move instead of a pextrw.
13003     if (Idx == 0)
13004       return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
13005                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
13006                                      DAG.getNode(ISD::BITCAST, dl,
13007                                                  MVT::v4i32,
13008                                                  Op.getOperand(0)),
13009                                      Op.getOperand(1)));
13010     SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
13011                                   Op.getOperand(0), Op.getOperand(1));
13012     SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
13013                                   DAG.getValueType(VT));
13014     return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
13015   }
13016
13017   if (VT == MVT::f32) {
13018     // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
13019     // the result back to FR32 register. It's only worth matching if the
13020     // result has a single use which is a store or a bitcast to i32.  And in
13021     // the case of a store, it's not worth it if the index is a constant 0,
13022     // because a MOVSSmr can be used instead, which is smaller and faster.
13023     if (!Op.hasOneUse())
13024       return SDValue();
13025     SDNode *User = *Op.getNode()->use_begin();
13026     if ((User->getOpcode() != ISD::STORE ||
13027          (isa<ConstantSDNode>(Op.getOperand(1)) &&
13028           cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) &&
13029         (User->getOpcode() != ISD::BITCAST ||
13030          User->getValueType(0) != MVT::i32))
13031       return SDValue();
13032     SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
13033                                   DAG.getNode(ISD::BITCAST, dl, MVT::v4i32,
13034                                               Op.getOperand(0)),
13035                                               Op.getOperand(1));
13036     return DAG.getNode(ISD::BITCAST, dl, MVT::f32, Extract);
13037   }
13038
13039   if (VT == MVT::i32 || VT == MVT::i64) {
13040     // ExtractPS/pextrq works with constant index.
13041     if (isa<ConstantSDNode>(Op.getOperand(1)))
13042       return Op;
13043   }
13044   return SDValue();
13045 }
13046
13047 /// Extract one bit from mask vector, like v16i1 or v8i1.
13048 /// AVX-512 feature.
13049 SDValue
13050 X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const {
13051   SDValue Vec = Op.getOperand(0);
13052   SDLoc dl(Vec);
13053   MVT VecVT = Vec.getSimpleValueType();
13054   SDValue Idx = Op.getOperand(1);
13055   MVT EltVT = Op.getSimpleValueType();
13056
13057   assert((EltVT == MVT::i1) && "Unexpected operands in ExtractBitFromMaskVector");
13058   assert((VecVT.getVectorNumElements() <= 16 || Subtarget->hasBWI()) &&
13059          "Unexpected vector type in ExtractBitFromMaskVector");
13060
13061   // variable index can't be handled in mask registers,
13062   // extend vector to VR512
13063   if (!isa<ConstantSDNode>(Idx)) {
13064     MVT ExtVT = (VecVT == MVT::v8i1 ?  MVT::v8i64 : MVT::v16i32);
13065     SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Vec);
13066     SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
13067                               ExtVT.getVectorElementType(), Ext, Idx);
13068     return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
13069   }
13070
13071   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
13072   const TargetRegisterClass* rc = getRegClassFor(VecVT);
13073   if (!Subtarget->hasDQI() && (VecVT.getVectorNumElements() <= 8))
13074     rc = getRegClassFor(MVT::v16i1);
13075   unsigned MaxSift = rc->getSize()*8 - 1;
13076   Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
13077                     DAG.getConstant(MaxSift - IdxVal, MVT::i8));
13078   Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec,
13079                     DAG.getConstant(MaxSift, MVT::i8));
13080   return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i1, Vec,
13081                        DAG.getIntPtrConstant(0));
13082 }
13083
13084 SDValue
13085 X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
13086                                            SelectionDAG &DAG) const {
13087   SDLoc dl(Op);
13088   SDValue Vec = Op.getOperand(0);
13089   MVT VecVT = Vec.getSimpleValueType();
13090   SDValue Idx = Op.getOperand(1);
13091
13092   if (Op.getSimpleValueType() == MVT::i1)
13093     return ExtractBitFromMaskVector(Op, DAG);
13094
13095   if (!isa<ConstantSDNode>(Idx)) {
13096     if (VecVT.is512BitVector() ||
13097         (VecVT.is256BitVector() && Subtarget->hasInt256() &&
13098          VecVT.getVectorElementType().getSizeInBits() == 32)) {
13099
13100       MVT MaskEltVT =
13101         MVT::getIntegerVT(VecVT.getVectorElementType().getSizeInBits());
13102       MVT MaskVT = MVT::getVectorVT(MaskEltVT, VecVT.getSizeInBits() /
13103                                     MaskEltVT.getSizeInBits());
13104
13105       Idx = DAG.getZExtOrTrunc(Idx, dl, MaskEltVT);
13106       SDValue Mask = DAG.getNode(X86ISD::VINSERT, dl, MaskVT,
13107                                 getZeroVector(MaskVT, Subtarget, DAG, dl),
13108                                 Idx, DAG.getConstant(0, getPointerTy()));
13109       SDValue Perm = DAG.getNode(X86ISD::VPERMV, dl, VecVT, Mask, Vec);
13110       return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(),
13111                         Perm, DAG.getConstant(0, getPointerTy()));
13112     }
13113     return SDValue();
13114   }
13115
13116   // If this is a 256-bit vector result, first extract the 128-bit vector and
13117   // then extract the element from the 128-bit vector.
13118   if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
13119
13120     unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
13121     // Get the 128-bit vector.
13122     Vec = Extract128BitVector(Vec, IdxVal, DAG, dl);
13123     MVT EltVT = VecVT.getVectorElementType();
13124
13125     unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
13126
13127     //if (IdxVal >= NumElems/2)
13128     //  IdxVal -= NumElems/2;
13129     IdxVal -= (IdxVal/ElemsPerChunk)*ElemsPerChunk;
13130     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
13131                        DAG.getConstant(IdxVal, MVT::i32));
13132   }
13133
13134   assert(VecVT.is128BitVector() && "Unexpected vector length");
13135
13136   if (Subtarget->hasSSE41()) {
13137     SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG);
13138     if (Res.getNode())
13139       return Res;
13140   }
13141
13142   MVT VT = Op.getSimpleValueType();
13143   // TODO: handle v16i8.
13144   if (VT.getSizeInBits() == 16) {
13145     SDValue Vec = Op.getOperand(0);
13146     unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
13147     if (Idx == 0)
13148       return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
13149                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
13150                                      DAG.getNode(ISD::BITCAST, dl,
13151                                                  MVT::v4i32, Vec),
13152                                      Op.getOperand(1)));
13153     // Transform it so it match pextrw which produces a 32-bit result.
13154     MVT EltVT = MVT::i32;
13155     SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT,
13156                                   Op.getOperand(0), Op.getOperand(1));
13157     SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract,
13158                                   DAG.getValueType(VT));
13159     return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
13160   }
13161
13162   if (VT.getSizeInBits() == 32) {
13163     unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
13164     if (Idx == 0)
13165       return Op;
13166
13167     // SHUFPS the element to the lowest double word, then movss.
13168     int Mask[4] = { static_cast<int>(Idx), -1, -1, -1 };
13169     MVT VVT = Op.getOperand(0).getSimpleValueType();
13170     SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
13171                                        DAG.getUNDEF(VVT), Mask);
13172     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
13173                        DAG.getIntPtrConstant(0));
13174   }
13175
13176   if (VT.getSizeInBits() == 64) {
13177     // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
13178     // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
13179     //        to match extract_elt for f64.
13180     unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
13181     if (Idx == 0)
13182       return Op;
13183
13184     // UNPCKHPD the element to the lowest double word, then movsd.
13185     // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
13186     // to a f64mem, the whole operation is folded into a single MOVHPDmr.
13187     int Mask[2] = { 1, -1 };
13188     MVT VVT = Op.getOperand(0).getSimpleValueType();
13189     SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
13190                                        DAG.getUNDEF(VVT), Mask);
13191     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
13192                        DAG.getIntPtrConstant(0));
13193   }
13194
13195   return SDValue();
13196 }
13197
13198 /// Insert one bit to mask vector, like v16i1 or v8i1.
13199 /// AVX-512 feature.
13200 SDValue
13201 X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
13202   SDLoc dl(Op);
13203   SDValue Vec = Op.getOperand(0);
13204   SDValue Elt = Op.getOperand(1);
13205   SDValue Idx = Op.getOperand(2);
13206   MVT VecVT = Vec.getSimpleValueType();
13207
13208   if (!isa<ConstantSDNode>(Idx)) {
13209     // Non constant index. Extend source and destination,
13210     // insert element and then truncate the result.
13211     MVT ExtVecVT = (VecVT == MVT::v8i1 ?  MVT::v8i64 : MVT::v16i32);
13212     MVT ExtEltVT = (VecVT == MVT::v8i1 ?  MVT::i64 : MVT::i32);
13213     SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
13214       DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec),
13215       DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx);
13216     return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
13217   }
13218
13219   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
13220   SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt);
13221   if (Vec.getOpcode() == ISD::UNDEF)
13222     return DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
13223                        DAG.getConstant(IdxVal, MVT::i8));
13224   const TargetRegisterClass* rc = getRegClassFor(VecVT);
13225   unsigned MaxSift = rc->getSize()*8 - 1;
13226   EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
13227                     DAG.getConstant(MaxSift, MVT::i8));
13228   EltInVec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, EltInVec,
13229                     DAG.getConstant(MaxSift - IdxVal, MVT::i8));
13230   return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
13231 }
13232
13233 SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
13234                                                   SelectionDAG &DAG) const {
13235   MVT VT = Op.getSimpleValueType();
13236   MVT EltVT = VT.getVectorElementType();
13237
13238   if (EltVT == MVT::i1)
13239     return InsertBitToMaskVector(Op, DAG);
13240
13241   SDLoc dl(Op);
13242   SDValue N0 = Op.getOperand(0);
13243   SDValue N1 = Op.getOperand(1);
13244   SDValue N2 = Op.getOperand(2);
13245   if (!isa<ConstantSDNode>(N2))
13246     return SDValue();
13247   auto *N2C = cast<ConstantSDNode>(N2);
13248   unsigned IdxVal = N2C->getZExtValue();
13249
13250   // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
13251   // into that, and then insert the subvector back into the result.
13252   if (VT.is256BitVector() || VT.is512BitVector()) {
13253     // Get the desired 128-bit vector half.
13254     SDValue V = Extract128BitVector(N0, IdxVal, DAG, dl);
13255
13256     // Insert the element into the desired half.
13257     unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
13258     unsigned IdxIn128 = IdxVal - (IdxVal / NumEltsIn128) * NumEltsIn128;
13259
13260     V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
13261                     DAG.getConstant(IdxIn128, MVT::i32));
13262
13263     // Insert the changed part back to the 256-bit vector
13264     return Insert128BitVector(N0, V, IdxVal, DAG, dl);
13265   }
13266   assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
13267
13268   if (Subtarget->hasSSE41()) {
13269     if (EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) {
13270       unsigned Opc;
13271       if (VT == MVT::v8i16) {
13272         Opc = X86ISD::PINSRW;
13273       } else {
13274         assert(VT == MVT::v16i8);
13275         Opc = X86ISD::PINSRB;
13276       }
13277
13278       // Transform it so it match pinsr{b,w} which expects a GR32 as its second
13279       // argument.
13280       if (N1.getValueType() != MVT::i32)
13281         N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
13282       if (N2.getValueType() != MVT::i32)
13283         N2 = DAG.getIntPtrConstant(IdxVal);
13284       return DAG.getNode(Opc, dl, VT, N0, N1, N2);
13285     }
13286
13287     if (EltVT == MVT::f32) {
13288       // Bits [7:6] of the constant are the source select.  This will always be
13289       //  zero here.  The DAG Combiner may combine an extract_elt index into
13290       //  these
13291       //  bits.  For example (insert (extract, 3), 2) could be matched by
13292       //  putting
13293       //  the '3' into bits [7:6] of X86ISD::INSERTPS.
13294       // Bits [5:4] of the constant are the destination select.  This is the
13295       //  value of the incoming immediate.
13296       // Bits [3:0] of the constant are the zero mask.  The DAG Combiner may
13297       //   combine either bitwise AND or insert of float 0.0 to set these bits.
13298       N2 = DAG.getIntPtrConstant(IdxVal << 4);
13299       // Create this as a scalar to vector..
13300       N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
13301       return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
13302     }
13303
13304     if (EltVT == MVT::i32 || EltVT == MVT::i64) {
13305       // PINSR* works with constant index.
13306       return Op;
13307     }
13308   }
13309
13310   if (EltVT == MVT::i8)
13311     return SDValue();
13312
13313   if (EltVT.getSizeInBits() == 16) {
13314     // Transform it so it match pinsrw which expects a 16-bit value in a GR32
13315     // as its second argument.
13316     if (N1.getValueType() != MVT::i32)
13317       N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
13318     if (N2.getValueType() != MVT::i32)
13319       N2 = DAG.getIntPtrConstant(IdxVal);
13320     return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2);
13321   }
13322   return SDValue();
13323 }
13324
13325 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
13326   SDLoc dl(Op);
13327   MVT OpVT = Op.getSimpleValueType();
13328
13329   // If this is a 256-bit vector result, first insert into a 128-bit
13330   // vector and then insert into the 256-bit vector.
13331   if (!OpVT.is128BitVector()) {
13332     // Insert into a 128-bit vector.
13333     unsigned SizeFactor = OpVT.getSizeInBits()/128;
13334     MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
13335                                  OpVT.getVectorNumElements() / SizeFactor);
13336
13337     Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
13338
13339     // Insert the 128-bit vector.
13340     return Insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
13341   }
13342
13343   if (OpVT == MVT::v1i64 &&
13344       Op.getOperand(0).getValueType() == MVT::i64)
13345     return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0));
13346
13347   SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
13348   assert(OpVT.is128BitVector() && "Expected an SSE type!");
13349   return DAG.getNode(ISD::BITCAST, dl, OpVT,
13350                      DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,AnyExt));
13351 }
13352
13353 // Lower a node with an EXTRACT_SUBVECTOR opcode.  This may result in
13354 // a simple subregister reference or explicit instructions to grab
13355 // upper bits of a vector.
13356 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
13357                                       SelectionDAG &DAG) {
13358   SDLoc dl(Op);
13359   SDValue In =  Op.getOperand(0);
13360   SDValue Idx = Op.getOperand(1);
13361   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
13362   MVT ResVT   = Op.getSimpleValueType();
13363   MVT InVT    = In.getSimpleValueType();
13364
13365   if (Subtarget->hasFp256()) {
13366     if (ResVT.is128BitVector() &&
13367         (InVT.is256BitVector() || InVT.is512BitVector()) &&
13368         isa<ConstantSDNode>(Idx)) {
13369       return Extract128BitVector(In, IdxVal, DAG, dl);
13370     }
13371     if (ResVT.is256BitVector() && InVT.is512BitVector() &&
13372         isa<ConstantSDNode>(Idx)) {
13373       return Extract256BitVector(In, IdxVal, DAG, dl);
13374     }
13375   }
13376   return SDValue();
13377 }
13378
13379 // Lower a node with an INSERT_SUBVECTOR opcode.  This may result in a
13380 // simple superregister reference or explicit instructions to insert
13381 // the upper bits of a vector.
13382 static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
13383                                      SelectionDAG &DAG) {
13384   if (!Subtarget->hasAVX())
13385     return SDValue();
13386
13387   SDLoc dl(Op);
13388   SDValue Vec = Op.getOperand(0);
13389   SDValue SubVec = Op.getOperand(1);
13390   SDValue Idx = Op.getOperand(2);
13391
13392   if (!isa<ConstantSDNode>(Idx))
13393     return SDValue();
13394
13395   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
13396   MVT OpVT = Op.getSimpleValueType();
13397   MVT SubVecVT = SubVec.getSimpleValueType();
13398
13399   // Fold two 16-byte subvector loads into one 32-byte load:
13400   // (insert_subvector (insert_subvector undef, (load addr), 0),
13401   //                   (load addr + 16), Elts/2)
13402   // --> load32 addr
13403   if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
13404       Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
13405       OpVT.is256BitVector() && SubVecVT.is128BitVector() &&
13406       !Subtarget->isUnalignedMem32Slow()) {
13407     SDValue SubVec2 = Vec.getOperand(1);
13408     if (auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2))) {
13409       if (Idx2->getZExtValue() == 0) {
13410         SDValue Ops[] = { SubVec2, SubVec };
13411         SDValue LD = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false);
13412         if (LD.getNode())
13413           return LD;
13414       }
13415     }
13416   }
13417
13418   if ((OpVT.is256BitVector() || OpVT.is512BitVector()) &&
13419       SubVecVT.is128BitVector())
13420     return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
13421
13422   if (OpVT.is512BitVector() && SubVecVT.is256BitVector())
13423     return Insert256BitVector(Vec, SubVec, IdxVal, DAG, dl);
13424
13425   return SDValue();
13426 }
13427
13428 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
13429 // their target countpart wrapped in the X86ISD::Wrapper node. Suppose N is
13430 // one of the above mentioned nodes. It has to be wrapped because otherwise
13431 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
13432 // be used to form addressing mode. These wrapped nodes will be selected
13433 // into MOV32ri.
13434 SDValue
13435 X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
13436   ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
13437
13438   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
13439   // global base reg.
13440   unsigned char OpFlag = 0;
13441   unsigned WrapperKind = X86ISD::Wrapper;
13442   CodeModel::Model M = DAG.getTarget().getCodeModel();
13443
13444   if (Subtarget->isPICStyleRIPRel() &&
13445       (M == CodeModel::Small || M == CodeModel::Kernel))
13446     WrapperKind = X86ISD::WrapperRIP;
13447   else if (Subtarget->isPICStyleGOT())
13448     OpFlag = X86II::MO_GOTOFF;
13449   else if (Subtarget->isPICStyleStubPIC())
13450     OpFlag = X86II::MO_PIC_BASE_OFFSET;
13451
13452   SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(),
13453                                              CP->getAlignment(),
13454                                              CP->getOffset(), OpFlag);
13455   SDLoc DL(CP);
13456   Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
13457   // With PIC, the address is actually $g + Offset.
13458   if (OpFlag) {
13459     Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
13460                          DAG.getNode(X86ISD::GlobalBaseReg,
13461                                      SDLoc(), getPointerTy()),
13462                          Result);
13463   }
13464
13465   return Result;
13466 }
13467
13468 SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
13469   JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
13470
13471   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
13472   // global base reg.
13473   unsigned char OpFlag = 0;
13474   unsigned WrapperKind = X86ISD::Wrapper;
13475   CodeModel::Model M = DAG.getTarget().getCodeModel();
13476
13477   if (Subtarget->isPICStyleRIPRel() &&
13478       (M == CodeModel::Small || M == CodeModel::Kernel))
13479     WrapperKind = X86ISD::WrapperRIP;
13480   else if (Subtarget->isPICStyleGOT())
13481     OpFlag = X86II::MO_GOTOFF;
13482   else if (Subtarget->isPICStyleStubPIC())
13483     OpFlag = X86II::MO_PIC_BASE_OFFSET;
13484
13485   SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(),
13486                                           OpFlag);
13487   SDLoc DL(JT);
13488   Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
13489
13490   // With PIC, the address is actually $g + Offset.
13491   if (OpFlag)
13492     Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
13493                          DAG.getNode(X86ISD::GlobalBaseReg,
13494                                      SDLoc(), getPointerTy()),
13495                          Result);
13496
13497   return Result;
13498 }
13499
13500 SDValue
13501 X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
13502   const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
13503
13504   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
13505   // global base reg.
13506   unsigned char OpFlag = 0;
13507   unsigned WrapperKind = X86ISD::Wrapper;
13508   CodeModel::Model M = DAG.getTarget().getCodeModel();
13509
13510   if (Subtarget->isPICStyleRIPRel() &&
13511       (M == CodeModel::Small || M == CodeModel::Kernel)) {
13512     if (Subtarget->isTargetDarwin() || Subtarget->isTargetELF())
13513       OpFlag = X86II::MO_GOTPCREL;
13514     WrapperKind = X86ISD::WrapperRIP;
13515   } else if (Subtarget->isPICStyleGOT()) {
13516     OpFlag = X86II::MO_GOT;
13517   } else if (Subtarget->isPICStyleStubPIC()) {
13518     OpFlag = X86II::MO_DARWIN_NONLAZY_PIC_BASE;
13519   } else if (Subtarget->isPICStyleStubNoDynamic()) {
13520     OpFlag = X86II::MO_DARWIN_NONLAZY;
13521   }
13522
13523   SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag);
13524
13525   SDLoc DL(Op);
13526   Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
13527
13528   // With PIC, the address is actually $g + Offset.
13529   if (DAG.getTarget().getRelocationModel() == Reloc::PIC_ &&
13530       !Subtarget->is64Bit()) {
13531     Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
13532                          DAG.getNode(X86ISD::GlobalBaseReg,
13533                                      SDLoc(), getPointerTy()),
13534                          Result);
13535   }
13536
13537   // For symbols that require a load from a stub to get the address, emit the
13538   // load.
13539   if (isGlobalStubReference(OpFlag))
13540     Result = DAG.getLoad(getPointerTy(), DL, DAG.getEntryNode(), Result,
13541                          MachinePointerInfo::getGOT(), false, false, false, 0);
13542
13543   return Result;
13544 }
13545
13546 SDValue
13547 X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
13548   // Create the TargetBlockAddressAddress node.
13549   unsigned char OpFlags =
13550     Subtarget->ClassifyBlockAddressReference();
13551   CodeModel::Model M = DAG.getTarget().getCodeModel();
13552   const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
13553   int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
13554   SDLoc dl(Op);
13555   SDValue Result = DAG.getTargetBlockAddress(BA, getPointerTy(), Offset,
13556                                              OpFlags);
13557
13558   if (Subtarget->isPICStyleRIPRel() &&
13559       (M == CodeModel::Small || M == CodeModel::Kernel))
13560     Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
13561   else
13562     Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
13563
13564   // With PIC, the address is actually $g + Offset.
13565   if (isGlobalRelativeToPICBase(OpFlags)) {
13566     Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
13567                          DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
13568                          Result);
13569   }
13570
13571   return Result;
13572 }
13573
13574 SDValue
13575 X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, SDLoc dl,
13576                                       int64_t Offset, SelectionDAG &DAG) const {
13577   // Create the TargetGlobalAddress node, folding in the constant
13578   // offset if it is legal.
13579   unsigned char OpFlags =
13580       Subtarget->ClassifyGlobalReference(GV, DAG.getTarget());
13581   CodeModel::Model M = DAG.getTarget().getCodeModel();
13582   SDValue Result;
13583   if (OpFlags == X86II::MO_NO_FLAG &&
13584       X86::isOffsetSuitableForCodeModel(Offset, M)) {
13585     // A direct static reference to a global.
13586     Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset);
13587     Offset = 0;
13588   } else {
13589     Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags);
13590   }
13591
13592   if (Subtarget->isPICStyleRIPRel() &&
13593       (M == CodeModel::Small || M == CodeModel::Kernel))
13594     Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
13595   else
13596     Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
13597
13598   // With PIC, the address is actually $g + Offset.
13599   if (isGlobalRelativeToPICBase(OpFlags)) {
13600     Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
13601                          DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
13602                          Result);
13603   }
13604
13605   // For globals that require a load from a stub to get the address, emit the
13606   // load.
13607   if (isGlobalStubReference(OpFlags))
13608     Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result,
13609                          MachinePointerInfo::getGOT(), false, false, false, 0);
13610
13611   // If there was a non-zero offset that we didn't fold, create an explicit
13612   // addition for it.
13613   if (Offset != 0)
13614     Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result,
13615                          DAG.getConstant(Offset, getPointerTy()));
13616
13617   return Result;
13618 }
13619
13620 SDValue
13621 X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
13622   const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
13623   int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
13624   return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG);
13625 }
13626
13627 static SDValue
13628 GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
13629            SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
13630            unsigned char OperandFlags, bool LocalDynamic = false) {
13631   MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
13632   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
13633   SDLoc dl(GA);
13634   SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
13635                                            GA->getValueType(0),
13636                                            GA->getOffset(),
13637                                            OperandFlags);
13638
13639   X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
13640                                            : X86ISD::TLSADDR;
13641
13642   if (InFlag) {
13643     SDValue Ops[] = { Chain,  TGA, *InFlag };
13644     Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
13645   } else {
13646     SDValue Ops[]  = { Chain, TGA };
13647     Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
13648   }
13649
13650   // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
13651   MFI->setAdjustsStack(true);
13652   MFI->setHasCalls(true);
13653
13654   SDValue Flag = Chain.getValue(1);
13655   return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
13656 }
13657
13658 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
13659 static SDValue
13660 LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
13661                                 const EVT PtrVT) {
13662   SDValue InFlag;
13663   SDLoc dl(GA);  // ? function entry point might be better
13664   SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
13665                                    DAG.getNode(X86ISD::GlobalBaseReg,
13666                                                SDLoc(), PtrVT), InFlag);
13667   InFlag = Chain.getValue(1);
13668
13669   return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
13670 }
13671
13672 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
13673 static SDValue
13674 LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
13675                                 const EVT PtrVT) {
13676   return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
13677                     X86::RAX, X86II::MO_TLSGD);
13678 }
13679
13680 static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
13681                                            SelectionDAG &DAG,
13682                                            const EVT PtrVT,
13683                                            bool is64Bit) {
13684   SDLoc dl(GA);
13685
13686   // Get the start address of the TLS block for this module.
13687   X86MachineFunctionInfo* MFI = DAG.getMachineFunction()
13688       .getInfo<X86MachineFunctionInfo>();
13689   MFI->incNumLocalDynamicTLSAccesses();
13690
13691   SDValue Base;
13692   if (is64Bit) {
13693     Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
13694                       X86II::MO_TLSLD, /*LocalDynamic=*/true);
13695   } else {
13696     SDValue InFlag;
13697     SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
13698         DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
13699     InFlag = Chain.getValue(1);
13700     Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
13701                       X86II::MO_TLSLDM, /*LocalDynamic=*/true);
13702   }
13703
13704   // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
13705   // of Base.
13706
13707   // Build x@dtpoff.
13708   unsigned char OperandFlags = X86II::MO_DTPOFF;
13709   unsigned WrapperKind = X86ISD::Wrapper;
13710   SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
13711                                            GA->getValueType(0),
13712                                            GA->getOffset(), OperandFlags);
13713   SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
13714
13715   // Add x@dtpoff with the base.
13716   return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
13717 }
13718
13719 // Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
13720 static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
13721                                    const EVT PtrVT, TLSModel::Model model,
13722                                    bool is64Bit, bool isPIC) {
13723   SDLoc dl(GA);
13724
13725   // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
13726   Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
13727                                                          is64Bit ? 257 : 256));
13728
13729   SDValue ThreadPointer =
13730       DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0),
13731                   MachinePointerInfo(Ptr), false, false, false, 0);
13732
13733   unsigned char OperandFlags = 0;
13734   // Most TLS accesses are not RIP relative, even on x86-64.  One exception is
13735   // initialexec.
13736   unsigned WrapperKind = X86ISD::Wrapper;
13737   if (model == TLSModel::LocalExec) {
13738     OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
13739   } else if (model == TLSModel::InitialExec) {
13740     if (is64Bit) {
13741       OperandFlags = X86II::MO_GOTTPOFF;
13742       WrapperKind = X86ISD::WrapperRIP;
13743     } else {
13744       OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
13745     }
13746   } else {
13747     llvm_unreachable("Unexpected model");
13748   }
13749
13750   // emit "addl x@ntpoff,%eax" (local exec)
13751   // or "addl x@indntpoff,%eax" (initial exec)
13752   // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
13753   SDValue TGA =
13754       DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
13755                                  GA->getOffset(), OperandFlags);
13756   SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
13757
13758   if (model == TLSModel::InitialExec) {
13759     if (isPIC && !is64Bit) {
13760       Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
13761                            DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
13762                            Offset);
13763     }
13764
13765     Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
13766                          MachinePointerInfo::getGOT(), false, false, false, 0);
13767   }
13768
13769   // The address of the thread local variable is the add of the thread
13770   // pointer with the offset of the variable.
13771   return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
13772 }
13773
13774 SDValue
13775 X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
13776
13777   GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
13778   const GlobalValue *GV = GA->getGlobal();
13779
13780   if (Subtarget->isTargetELF()) {
13781     TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
13782
13783     switch (model) {
13784       case TLSModel::GeneralDynamic:
13785         if (Subtarget->is64Bit())
13786           return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy());
13787         return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy());
13788       case TLSModel::LocalDynamic:
13789         return LowerToTLSLocalDynamicModel(GA, DAG, getPointerTy(),
13790                                            Subtarget->is64Bit());
13791       case TLSModel::InitialExec:
13792       case TLSModel::LocalExec:
13793         return LowerToTLSExecModel(
13794             GA, DAG, getPointerTy(), model, Subtarget->is64Bit(),
13795             DAG.getTarget().getRelocationModel() == Reloc::PIC_);
13796     }
13797     llvm_unreachable("Unknown TLS model.");
13798   }
13799
13800   if (Subtarget->isTargetDarwin()) {
13801     // Darwin only has one model of TLS.  Lower to that.
13802     unsigned char OpFlag = 0;
13803     unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ?
13804                            X86ISD::WrapperRIP : X86ISD::Wrapper;
13805
13806     // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
13807     // global base reg.
13808     bool PIC32 = (DAG.getTarget().getRelocationModel() == Reloc::PIC_) &&
13809                  !Subtarget->is64Bit();
13810     if (PIC32)
13811       OpFlag = X86II::MO_TLVP_PIC_BASE;
13812     else
13813       OpFlag = X86II::MO_TLVP;
13814     SDLoc DL(Op);
13815     SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
13816                                                 GA->getValueType(0),
13817                                                 GA->getOffset(), OpFlag);
13818     SDValue Offset = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
13819
13820     // With PIC32, the address is actually $g + Offset.
13821     if (PIC32)
13822       Offset = DAG.getNode(ISD::ADD, DL, getPointerTy(),
13823                            DAG.getNode(X86ISD::GlobalBaseReg,
13824                                        SDLoc(), getPointerTy()),
13825                            Offset);
13826
13827     // Lowering the machine isd will make sure everything is in the right
13828     // location.
13829     SDValue Chain = DAG.getEntryNode();
13830     SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
13831     SDValue Args[] = { Chain, Offset };
13832     Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
13833
13834     // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
13835     MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
13836     MFI->setAdjustsStack(true);
13837
13838     // And our return value (tls address) is in the standard call return value
13839     // location.
13840     unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
13841     return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy(),
13842                               Chain.getValue(1));
13843   }
13844
13845   if (Subtarget->isTargetKnownWindowsMSVC() ||
13846       Subtarget->isTargetWindowsGNU()) {
13847     // Just use the implicit TLS architecture
13848     // Need to generate someting similar to:
13849     //   mov     rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
13850     //                                  ; from TEB
13851     //   mov     ecx, dword [rel _tls_index]: Load index (from C runtime)
13852     //   mov     rcx, qword [rdx+rcx*8]
13853     //   mov     eax, .tls$:tlsvar
13854     //   [rax+rcx] contains the address
13855     // Windows 64bit: gs:0x58
13856     // Windows 32bit: fs:__tls_array
13857
13858     SDLoc dl(GA);
13859     SDValue Chain = DAG.getEntryNode();
13860
13861     // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
13862     // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
13863     // use its literal value of 0x2C.
13864     Value *Ptr = Constant::getNullValue(Subtarget->is64Bit()
13865                                         ? Type::getInt8PtrTy(*DAG.getContext(),
13866                                                              256)
13867                                         : Type::getInt32PtrTy(*DAG.getContext(),
13868                                                               257));
13869
13870     SDValue TlsArray =
13871         Subtarget->is64Bit()
13872             ? DAG.getIntPtrConstant(0x58)
13873             : (Subtarget->isTargetWindowsGNU()
13874                    ? DAG.getIntPtrConstant(0x2C)
13875                    : DAG.getExternalSymbol("_tls_array", getPointerTy()));
13876
13877     SDValue ThreadPointer =
13878         DAG.getLoad(getPointerTy(), dl, Chain, TlsArray,
13879                     MachinePointerInfo(Ptr), false, false, false, 0);
13880
13881     // Load the _tls_index variable
13882     SDValue IDX = DAG.getExternalSymbol("_tls_index", getPointerTy());
13883     if (Subtarget->is64Bit())
13884       IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, getPointerTy(), Chain,
13885                            IDX, MachinePointerInfo(), MVT::i32,
13886                            false, false, false, 0);
13887     else
13888       IDX = DAG.getLoad(getPointerTy(), dl, Chain, IDX, MachinePointerInfo(),
13889                         false, false, false, 0);
13890
13891     SDValue Scale = DAG.getConstant(Log2_64_Ceil(TD->getPointerSize()),
13892                                     getPointerTy());
13893     IDX = DAG.getNode(ISD::SHL, dl, getPointerTy(), IDX, Scale);
13894
13895     SDValue res = DAG.getNode(ISD::ADD, dl, getPointerTy(), ThreadPointer, IDX);
13896     res = DAG.getLoad(getPointerTy(), dl, Chain, res, MachinePointerInfo(),
13897                       false, false, false, 0);
13898
13899     // Get the offset of start of .tls section
13900     SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
13901                                              GA->getValueType(0),
13902                                              GA->getOffset(), X86II::MO_SECREL);
13903     SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), TGA);
13904
13905     // The address of the thread local variable is the add of the thread
13906     // pointer with the offset of the variable.
13907     return DAG.getNode(ISD::ADD, dl, getPointerTy(), res, Offset);
13908   }
13909
13910   llvm_unreachable("TLS not implemented for this target.");
13911 }
13912
13913 /// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values
13914 /// and take a 2 x i32 value to shift plus a shift amount.
13915 static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
13916   assert(Op.getNumOperands() == 3 && "Not a double-shift!");
13917   MVT VT = Op.getSimpleValueType();
13918   unsigned VTBits = VT.getSizeInBits();
13919   SDLoc dl(Op);
13920   bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
13921   SDValue ShOpLo = Op.getOperand(0);
13922   SDValue ShOpHi = Op.getOperand(1);
13923   SDValue ShAmt  = Op.getOperand(2);
13924   // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the
13925   // generic ISD nodes haven't. Insert an AND to be safe, it's optimized away
13926   // during isel.
13927   SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
13928                                   DAG.getConstant(VTBits - 1, MVT::i8));
13929   SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
13930                                      DAG.getConstant(VTBits - 1, MVT::i8))
13931                        : DAG.getConstant(0, VT);
13932
13933   SDValue Tmp2, Tmp3;
13934   if (Op.getOpcode() == ISD::SHL_PARTS) {
13935     Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
13936     Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);
13937   } else {
13938     Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
13939     Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
13940   }
13941
13942   // If the shift amount is larger or equal than the width of a part we can't
13943   // rely on the results of shld/shrd. Insert a test and select the appropriate
13944   // values for large shift amounts.
13945   SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
13946                                 DAG.getConstant(VTBits, MVT::i8));
13947   SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
13948                              AndNode, DAG.getConstant(0, MVT::i8));
13949
13950   SDValue Hi, Lo;
13951   SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8);
13952   SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
13953   SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };
13954
13955   if (Op.getOpcode() == ISD::SHL_PARTS) {
13956     Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
13957     Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
13958   } else {
13959     Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
13960     Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
13961   }
13962
13963   SDValue Ops[2] = { Lo, Hi };
13964   return DAG.getMergeValues(Ops, dl);
13965 }
13966
13967 SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
13968                                            SelectionDAG &DAG) const {
13969   MVT SrcVT = Op.getOperand(0).getSimpleValueType();
13970   SDLoc dl(Op);
13971
13972   if (SrcVT.isVector()) {
13973     if (SrcVT.getVectorElementType() == MVT::i1) {
13974       MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
13975       return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
13976                          DAG.getNode(ISD::SIGN_EXTEND, dl, IntegerVT,
13977                                      Op.getOperand(0)));
13978     }
13979     return SDValue();
13980   }
13981
13982   assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
13983          "Unknown SINT_TO_FP to lower!");
13984
13985   // These are really Legal; return the operand so the caller accepts it as
13986   // Legal.
13987   if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
13988     return Op;
13989   if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
13990       Subtarget->is64Bit()) {
13991     return Op;
13992   }
13993
13994   unsigned Size = SrcVT.getSizeInBits()/8;
13995   MachineFunction &MF = DAG.getMachineFunction();
13996   int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false);
13997   SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
13998   SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
13999                                StackSlot,
14000                                MachinePointerInfo::getFixedStack(SSFI),
14001                                false, false, 0);
14002   return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
14003 }
14004
14005 SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
14006                                      SDValue StackSlot,
14007                                      SelectionDAG &DAG) const {
14008   // Build the FILD
14009   SDLoc DL(Op);
14010   SDVTList Tys;
14011   bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
14012   if (useSSE)
14013     Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
14014   else
14015     Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
14016
14017   unsigned ByteSize = SrcVT.getSizeInBits()/8;
14018
14019   FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
14020   MachineMemOperand *MMO;
14021   if (FI) {
14022     int SSFI = FI->getIndex();
14023     MMO =
14024       DAG.getMachineFunction()
14025       .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
14026                             MachineMemOperand::MOLoad, ByteSize, ByteSize);
14027   } else {
14028     MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
14029     StackSlot = StackSlot.getOperand(1);
14030   }
14031   SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
14032   SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
14033                                            X86ISD::FILD, DL,
14034                                            Tys, Ops, SrcVT, MMO);
14035
14036   if (useSSE) {
14037     Chain = Result.getValue(1);
14038     SDValue InFlag = Result.getValue(2);
14039
14040     // FIXME: Currently the FST is flagged to the FILD_FLAG. This
14041     // shouldn't be necessary except that RFP cannot be live across
14042     // multiple blocks. When stackifier is fixed, they can be uncoupled.
14043     MachineFunction &MF = DAG.getMachineFunction();
14044     unsigned SSFISize = Op.getValueType().getSizeInBits()/8;
14045     int SSFI = MF.getFrameInfo()->CreateStackObject(SSFISize, SSFISize, false);
14046     SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
14047     Tys = DAG.getVTList(MVT::Other);
14048     SDValue Ops[] = {
14049       Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
14050     };
14051     MachineMemOperand *MMO =
14052       DAG.getMachineFunction()
14053       .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
14054                             MachineMemOperand::MOStore, SSFISize, SSFISize);
14055
14056     Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
14057                                     Ops, Op.getValueType(), MMO);
14058     Result = DAG.getLoad(Op.getValueType(), DL, Chain, StackSlot,
14059                          MachinePointerInfo::getFixedStack(SSFI),
14060                          false, false, false, 0);
14061   }
14062
14063   return Result;
14064 }
14065
14066 // LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion.
14067 SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
14068                                                SelectionDAG &DAG) const {
14069   // This algorithm is not obvious. Here it is what we're trying to output:
14070   /*
14071      movq       %rax,  %xmm0
14072      punpckldq  (c0),  %xmm0  // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
14073      subpd      (c1),  %xmm0  // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
14074      #ifdef __SSE3__
14075        haddpd   %xmm0, %xmm0
14076      #else
14077        pshufd   $0x4e, %xmm0, %xmm1
14078        addpd    %xmm1, %xmm0
14079      #endif
14080   */
14081
14082   SDLoc dl(Op);
14083   LLVMContext *Context = DAG.getContext();
14084
14085   // Build some magic constants.
14086   static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
14087   Constant *C0 = ConstantDataVector::get(*Context, CV0);
14088   SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16);
14089
14090   SmallVector<Constant*,2> CV1;
14091   CV1.push_back(
14092     ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
14093                                       APInt(64, 0x4330000000000000ULL))));
14094   CV1.push_back(
14095     ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
14096                                       APInt(64, 0x4530000000000000ULL))));
14097   Constant *C1 = ConstantVector::get(CV1);
14098   SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16);
14099
14100   // Load the 64-bit value into an XMM register.
14101   SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
14102                             Op.getOperand(0));
14103   SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
14104                               MachinePointerInfo::getConstantPool(),
14105                               false, false, false, 16);
14106   SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32,
14107                               DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, XR1),
14108                               CLod0);
14109
14110   SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
14111                               MachinePointerInfo::getConstantPool(),
14112                               false, false, false, 16);
14113   SDValue XR2F = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Unpck1);
14114   SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
14115   SDValue Result;
14116
14117   if (Subtarget->hasSSE3()) {
14118     // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
14119     Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
14120   } else {
14121     SDValue S2F = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Sub);
14122     SDValue Shuffle = getTargetShuffleNode(X86ISD::PSHUFD, dl, MVT::v4i32,
14123                                            S2F, 0x4E, DAG);
14124     Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
14125                          DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Shuffle),
14126                          Sub);
14127   }
14128
14129   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
14130                      DAG.getIntPtrConstant(0));
14131 }
14132
14133 // LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion.
14134 SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
14135                                                SelectionDAG &DAG) const {
14136   SDLoc dl(Op);
14137   // FP constant to bias correct the final result.
14138   SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
14139                                    MVT::f64);
14140
14141   // Load the 32-bit value into an XMM register.
14142   SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
14143                              Op.getOperand(0));
14144
14145   // Zero out the upper parts of the register.
14146   Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
14147
14148   Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
14149                      DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Load),
14150                      DAG.getIntPtrConstant(0));
14151
14152   // Or the load with the bias.
14153   SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64,
14154                            DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
14155                                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
14156                                                    MVT::v2f64, Load)),
14157                            DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
14158                                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
14159                                                    MVT::v2f64, Bias)));
14160   Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
14161                    DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or),
14162                    DAG.getIntPtrConstant(0));
14163
14164   // Subtract the bias.
14165   SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
14166
14167   // Handle final rounding.
14168   EVT DestVT = Op.getValueType();
14169
14170   if (DestVT.bitsLT(MVT::f64))
14171     return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
14172                        DAG.getIntPtrConstant(0));
14173   if (DestVT.bitsGT(MVT::f64))
14174     return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
14175
14176   // Handle final rounding.
14177   return Sub;
14178 }
14179
14180 static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
14181                                      const X86Subtarget &Subtarget) {
14182   // The algorithm is the following:
14183   // #ifdef __SSE4_1__
14184   //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
14185   //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
14186   //                                 (uint4) 0x53000000, 0xaa);
14187   // #else
14188   //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
14189   //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
14190   // #endif
14191   //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
14192   //     return (float4) lo + fhi;
14193
14194   SDLoc DL(Op);
14195   SDValue V = Op->getOperand(0);
14196   EVT VecIntVT = V.getValueType();
14197   bool Is128 = VecIntVT == MVT::v4i32;
14198   EVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
14199   // If we convert to something else than the supported type, e.g., to v4f64,
14200   // abort early.
14201   if (VecFloatVT != Op->getValueType(0))
14202     return SDValue();
14203
14204   unsigned NumElts = VecIntVT.getVectorNumElements();
14205   assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
14206          "Unsupported custom type");
14207   assert(NumElts <= 8 && "The size of the constant array must be fixed");
14208
14209   // In the #idef/#else code, we have in common:
14210   // - The vector of constants:
14211   // -- 0x4b000000
14212   // -- 0x53000000
14213   // - A shift:
14214   // -- v >> 16
14215
14216   // Create the splat vector for 0x4b000000.
14217   SDValue CstLow = DAG.getConstant(0x4b000000, MVT::i32);
14218   SDValue CstLowArray[] = {CstLow, CstLow, CstLow, CstLow,
14219                            CstLow, CstLow, CstLow, CstLow};
14220   SDValue VecCstLow = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT,
14221                                   makeArrayRef(&CstLowArray[0], NumElts));
14222   // Create the splat vector for 0x53000000.
14223   SDValue CstHigh = DAG.getConstant(0x53000000, MVT::i32);
14224   SDValue CstHighArray[] = {CstHigh, CstHigh, CstHigh, CstHigh,
14225                             CstHigh, CstHigh, CstHigh, CstHigh};
14226   SDValue VecCstHigh = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT,
14227                                    makeArrayRef(&CstHighArray[0], NumElts));
14228
14229   // Create the right shift.
14230   SDValue CstShift = DAG.getConstant(16, MVT::i32);
14231   SDValue CstShiftArray[] = {CstShift, CstShift, CstShift, CstShift,
14232                              CstShift, CstShift, CstShift, CstShift};
14233   SDValue VecCstShift = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT,
14234                                     makeArrayRef(&CstShiftArray[0], NumElts));
14235   SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
14236
14237   SDValue Low, High;
14238   if (Subtarget.hasSSE41()) {
14239     EVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
14240     //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
14241     SDValue VecCstLowBitcast =
14242         DAG.getNode(ISD::BITCAST, DL, VecI16VT, VecCstLow);
14243     SDValue VecBitcast = DAG.getNode(ISD::BITCAST, DL, VecI16VT, V);
14244     // Low will be bitcasted right away, so do not bother bitcasting back to its
14245     // original type.
14246     Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
14247                       VecCstLowBitcast, DAG.getConstant(0xaa, MVT::i32));
14248     //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
14249     //                                 (uint4) 0x53000000, 0xaa);
14250     SDValue VecCstHighBitcast =
14251         DAG.getNode(ISD::BITCAST, DL, VecI16VT, VecCstHigh);
14252     SDValue VecShiftBitcast =
14253         DAG.getNode(ISD::BITCAST, DL, VecI16VT, HighShift);
14254     // High will be bitcasted right away, so do not bother bitcasting back to
14255     // its original type.
14256     High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
14257                        VecCstHighBitcast, DAG.getConstant(0xaa, MVT::i32));
14258   } else {
14259     SDValue CstMask = DAG.getConstant(0xffff, MVT::i32);
14260     SDValue VecCstMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT, CstMask,
14261                                      CstMask, CstMask, CstMask);
14262     //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
14263     SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
14264     Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
14265
14266     //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
14267     High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
14268   }
14269
14270   // Create the vector constant for -(0x1.0p39f + 0x1.0p23f).
14271   SDValue CstFAdd = DAG.getConstantFP(
14272       APFloat(APFloat::IEEEsingle, APInt(32, 0xD3000080)), MVT::f32);
14273   SDValue CstFAddArray[] = {CstFAdd, CstFAdd, CstFAdd, CstFAdd,
14274                             CstFAdd, CstFAdd, CstFAdd, CstFAdd};
14275   SDValue VecCstFAdd = DAG.getNode(ISD::BUILD_VECTOR, DL, VecFloatVT,
14276                                    makeArrayRef(&CstFAddArray[0], NumElts));
14277
14278   //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
14279   SDValue HighBitcast = DAG.getNode(ISD::BITCAST, DL, VecFloatVT, High);
14280   SDValue FHigh =
14281       DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd);
14282   //     return (float4) lo + fhi;
14283   SDValue LowBitcast = DAG.getNode(ISD::BITCAST, DL, VecFloatVT, Low);
14284   return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
14285 }
14286
14287 SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
14288                                                SelectionDAG &DAG) const {
14289   SDValue N0 = Op.getOperand(0);
14290   MVT SVT = N0.getSimpleValueType();
14291   SDLoc dl(Op);
14292
14293   switch (SVT.SimpleTy) {
14294   default:
14295     llvm_unreachable("Custom UINT_TO_FP is not supported!");
14296   case MVT::v4i8:
14297   case MVT::v4i16:
14298   case MVT::v8i8:
14299   case MVT::v8i16: {
14300     MVT NVT = MVT::getVectorVT(MVT::i32, SVT.getVectorNumElements());
14301     return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
14302                        DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0));
14303   }
14304   case MVT::v4i32:
14305   case MVT::v8i32:
14306     return lowerUINT_TO_FP_vXi32(Op, DAG, *Subtarget);
14307   }
14308   llvm_unreachable(nullptr);
14309 }
14310
14311 SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
14312                                            SelectionDAG &DAG) const {
14313   SDValue N0 = Op.getOperand(0);
14314   SDLoc dl(Op);
14315
14316   if (Op.getValueType().isVector())
14317     return lowerUINT_TO_FP_vec(Op, DAG);
14318
14319   // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
14320   // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
14321   // the optimization here.
14322   if (DAG.SignBitIsZero(N0))
14323     return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);
14324
14325   MVT SrcVT = N0.getSimpleValueType();
14326   MVT DstVT = Op.getSimpleValueType();
14327   if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
14328     return LowerUINT_TO_FP_i64(Op, DAG);
14329   if (SrcVT == MVT::i32 && X86ScalarSSEf64)
14330     return LowerUINT_TO_FP_i32(Op, DAG);
14331   if (Subtarget->is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
14332     return SDValue();
14333
14334   // Make a 64-bit buffer, and use it to build an FILD.
14335   SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
14336   if (SrcVT == MVT::i32) {
14337     SDValue WordOff = DAG.getConstant(4, getPointerTy());
14338     SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl,
14339                                      getPointerTy(), StackSlot, WordOff);
14340     SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
14341                                   StackSlot, MachinePointerInfo(),
14342                                   false, false, 0);
14343     SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32),
14344                                   OffsetSlot, MachinePointerInfo(),
14345                                   false, false, 0);
14346     SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
14347     return Fild;
14348   }
14349
14350   assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
14351   SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
14352                                StackSlot, MachinePointerInfo(),
14353                                false, false, 0);
14354   // For i64 source, we need to add the appropriate power of 2 if the input
14355   // was negative.  This is the same as the optimization in
14356   // DAGTypeLegalizer::ExpandIntOp_UNIT_TO_FP, and for it to be safe here,
14357   // we must be careful to do the computation in x87 extended precision, not
14358   // in SSE. (The generic code can't know it's OK to do this, or how to.)
14359   int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
14360   MachineMemOperand *MMO =
14361     DAG.getMachineFunction()
14362     .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
14363                           MachineMemOperand::MOLoad, 8, 8);
14364
14365   SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
14366   SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
14367   SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
14368                                          MVT::i64, MMO);
14369
14370   APInt FF(32, 0x5F800000ULL);
14371
14372   // Check whether the sign bit is set.
14373   SDValue SignSet = DAG.getSetCC(dl,
14374                                  getSetCCResultType(*DAG.getContext(), MVT::i64),
14375                                  Op.getOperand(0), DAG.getConstant(0, MVT::i64),
14376                                  ISD::SETLT);
14377
14378   // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
14379   SDValue FudgePtr = DAG.getConstantPool(
14380                              ConstantInt::get(*DAG.getContext(), FF.zext(64)),
14381                                          getPointerTy());
14382
14383   // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
14384   SDValue Zero = DAG.getIntPtrConstant(0);
14385   SDValue Four = DAG.getIntPtrConstant(4);
14386   SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet,
14387                                Zero, Four);
14388   FudgePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(), FudgePtr, Offset);
14389
14390   // Load the value out, extending it from f32 to f80.
14391   // FIXME: Avoid the extend by constructing the right constant pool?
14392   SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(),
14393                                  FudgePtr, MachinePointerInfo::getConstantPool(),
14394                                  MVT::f32, false, false, false, 4);
14395   // Extend everything to 80 bits to force it to be done on x87.
14396   SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
14397   return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0));
14398 }
14399
14400 std::pair<SDValue,SDValue>
14401 X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
14402                                     bool IsSigned, bool IsReplace) const {
14403   SDLoc DL(Op);
14404
14405   EVT DstTy = Op.getValueType();
14406
14407   if (!IsSigned && !isIntegerTypeFTOL(DstTy)) {
14408     assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
14409     DstTy = MVT::i64;
14410   }
14411
14412   assert(DstTy.getSimpleVT() <= MVT::i64 &&
14413          DstTy.getSimpleVT() >= MVT::i16 &&
14414          "Unknown FP_TO_INT to lower!");
14415
14416   // These are really Legal.
14417   if (DstTy == MVT::i32 &&
14418       isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
14419     return std::make_pair(SDValue(), SDValue());
14420   if (Subtarget->is64Bit() &&
14421       DstTy == MVT::i64 &&
14422       isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
14423     return std::make_pair(SDValue(), SDValue());
14424
14425   // We lower FP->int64 either into FISTP64 followed by a load from a temporary
14426   // stack slot, or into the FTOL runtime function.
14427   MachineFunction &MF = DAG.getMachineFunction();
14428   unsigned MemSize = DstTy.getSizeInBits()/8;
14429   int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
14430   SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
14431
14432   unsigned Opc;
14433   if (!IsSigned && isIntegerTypeFTOL(DstTy))
14434     Opc = X86ISD::WIN_FTOL;
14435   else
14436     switch (DstTy.getSimpleVT().SimpleTy) {
14437     default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
14438     case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
14439     case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
14440     case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
14441     }
14442
14443   SDValue Chain = DAG.getEntryNode();
14444   SDValue Value = Op.getOperand(0);
14445   EVT TheVT = Op.getOperand(0).getValueType();
14446   // FIXME This causes a redundant load/store if the SSE-class value is already
14447   // in memory, such as if it is on the callstack.
14448   if (isScalarFPTypeInSSEReg(TheVT)) {
14449     assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
14450     Chain = DAG.getStore(Chain, DL, Value, StackSlot,
14451                          MachinePointerInfo::getFixedStack(SSFI),
14452                          false, false, 0);
14453     SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
14454     SDValue Ops[] = {
14455       Chain, StackSlot, DAG.getValueType(TheVT)
14456     };
14457
14458     MachineMemOperand *MMO =
14459       MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
14460                               MachineMemOperand::MOLoad, MemSize, MemSize);
14461     Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO);
14462     Chain = Value.getValue(1);
14463     SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
14464     StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
14465   }
14466
14467   MachineMemOperand *MMO =
14468     MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
14469                             MachineMemOperand::MOStore, MemSize, MemSize);
14470
14471   if (Opc != X86ISD::WIN_FTOL) {
14472     // Build the FP_TO_INT*_IN_MEM
14473     SDValue Ops[] = { Chain, Value, StackSlot };
14474     SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
14475                                            Ops, DstTy, MMO);
14476     return std::make_pair(FIST, StackSlot);
14477   } else {
14478     SDValue ftol = DAG.getNode(X86ISD::WIN_FTOL, DL,
14479       DAG.getVTList(MVT::Other, MVT::Glue),
14480       Chain, Value);
14481     SDValue eax = DAG.getCopyFromReg(ftol, DL, X86::EAX,
14482       MVT::i32, ftol.getValue(1));
14483     SDValue edx = DAG.getCopyFromReg(eax.getValue(1), DL, X86::EDX,
14484       MVT::i32, eax.getValue(2));
14485     SDValue Ops[] = { eax, edx };
14486     SDValue pair = IsReplace
14487       ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops)
14488       : DAG.getMergeValues(Ops, DL);
14489     return std::make_pair(pair, SDValue());
14490   }
14491 }
14492
14493 static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
14494                               const X86Subtarget *Subtarget) {
14495   MVT VT = Op->getSimpleValueType(0);
14496   SDValue In = Op->getOperand(0);
14497   MVT InVT = In.getSimpleValueType();
14498   SDLoc dl(Op);
14499
14500   // Optimize vectors in AVX mode:
14501   //
14502   //   v8i16 -> v8i32
14503   //   Use vpunpcklwd for 4 lower elements  v8i16 -> v4i32.
14504   //   Use vpunpckhwd for 4 upper elements  v8i16 -> v4i32.
14505   //   Concat upper and lower parts.
14506   //
14507   //   v4i32 -> v4i64
14508   //   Use vpunpckldq for 4 lower elements  v4i32 -> v2i64.
14509   //   Use vpunpckhdq for 4 upper elements  v4i32 -> v2i64.
14510   //   Concat upper and lower parts.
14511   //
14512
14513   if (((VT != MVT::v16i16) || (InVT != MVT::v16i8)) &&
14514       ((VT != MVT::v8i32) || (InVT != MVT::v8i16)) &&
14515       ((VT != MVT::v4i64) || (InVT != MVT::v4i32)))
14516     return SDValue();
14517
14518   if (Subtarget->hasInt256())
14519     return DAG.getNode(X86ISD::VZEXT, dl, VT, In);
14520
14521   SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl);
14522   SDValue Undef = DAG.getUNDEF(InVT);
14523   bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND;
14524   SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
14525   SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
14526
14527   MVT HVT = MVT::getVectorVT(VT.getVectorElementType(),
14528                              VT.getVectorNumElements()/2);
14529
14530   OpLo = DAG.getNode(ISD::BITCAST, dl, HVT, OpLo);
14531   OpHi = DAG.getNode(ISD::BITCAST, dl, HVT, OpHi);
14532
14533   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
14534 }
14535
14536 static  SDValue LowerZERO_EXTEND_AVX512(SDValue Op,
14537                                         SelectionDAG &DAG) {
14538   MVT VT = Op->getSimpleValueType(0);
14539   SDValue In = Op->getOperand(0);
14540   MVT InVT = In.getSimpleValueType();
14541   SDLoc DL(Op);
14542   unsigned int NumElts = VT.getVectorNumElements();
14543   if (NumElts != 8 && NumElts != 16)
14544     return SDValue();
14545
14546   if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1)
14547     return DAG.getNode(X86ISD::VZEXT, DL, VT, In);
14548
14549   EVT ExtVT = (NumElts == 8)? MVT::v8i64 : MVT::v16i32;
14550   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14551   // Now we have only mask extension
14552   assert(InVT.getVectorElementType() == MVT::i1);
14553   SDValue Cst = DAG.getTargetConstant(1, ExtVT.getScalarType());
14554   const Constant *C = (dyn_cast<ConstantSDNode>(Cst))->getConstantIntValue();
14555   SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
14556   unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
14557   SDValue Ld = DAG.getLoad(Cst.getValueType(), DL, DAG.getEntryNode(), CP,
14558                            MachinePointerInfo::getConstantPool(),
14559                            false, false, false, Alignment);
14560
14561   SDValue Brcst = DAG.getNode(X86ISD::VBROADCASTM, DL, ExtVT, In, Ld);
14562   if (VT.is512BitVector())
14563     return Brcst;
14564   return DAG.getNode(X86ISD::VTRUNC, DL, VT, Brcst);
14565 }
14566
14567 static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
14568                                SelectionDAG &DAG) {
14569   if (Subtarget->hasFp256()) {
14570     SDValue Res = LowerAVXExtend(Op, DAG, Subtarget);
14571     if (Res.getNode())
14572       return Res;
14573   }
14574
14575   return SDValue();
14576 }
14577
14578 static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
14579                                 SelectionDAG &DAG) {
14580   SDLoc DL(Op);
14581   MVT VT = Op.getSimpleValueType();
14582   SDValue In = Op.getOperand(0);
14583   MVT SVT = In.getSimpleValueType();
14584
14585   if (VT.is512BitVector() || SVT.getVectorElementType() == MVT::i1)
14586     return LowerZERO_EXTEND_AVX512(Op, DAG);
14587
14588   if (Subtarget->hasFp256()) {
14589     SDValue Res = LowerAVXExtend(Op, DAG, Subtarget);
14590     if (Res.getNode())
14591       return Res;
14592   }
14593
14594   assert(!VT.is256BitVector() || !SVT.is128BitVector() ||
14595          VT.getVectorNumElements() != SVT.getVectorNumElements());
14596   return SDValue();
14597 }
14598
14599 SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
14600   SDLoc DL(Op);
14601   MVT VT = Op.getSimpleValueType();
14602   SDValue In = Op.getOperand(0);
14603   MVT InVT = In.getSimpleValueType();
14604
14605   if (VT == MVT::i1) {
14606     assert((InVT.isInteger() && (InVT.getSizeInBits() <= 64)) &&
14607            "Invalid scalar TRUNCATE operation");
14608     if (InVT.getSizeInBits() >= 32)
14609       return SDValue();
14610     In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In);
14611     return DAG.getNode(ISD::TRUNCATE, DL, VT, In);
14612   }
14613   assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
14614          "Invalid TRUNCATE operation");
14615
14616   if (InVT.is512BitVector() || VT.getVectorElementType() == MVT::i1) {
14617     if (VT.getVectorElementType().getSizeInBits() >=8)
14618       return DAG.getNode(X86ISD::VTRUNC, DL, VT, In);
14619
14620     assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type");
14621     unsigned NumElts = InVT.getVectorNumElements();
14622     assert ((NumElts == 8 || NumElts == 16) && "Unexpected vector type");
14623     if (InVT.getSizeInBits() < 512) {
14624       MVT ExtVT = (NumElts == 16)? MVT::v16i32 : MVT::v8i64;
14625       In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
14626       InVT = ExtVT;
14627     }
14628
14629     SDValue Cst = DAG.getTargetConstant(1, InVT.getVectorElementType());
14630     const Constant *C = (dyn_cast<ConstantSDNode>(Cst))->getConstantIntValue();
14631     SDValue CP = DAG.getConstantPool(C, getPointerTy());
14632     unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
14633     SDValue Ld = DAG.getLoad(Cst.getValueType(), DL, DAG.getEntryNode(), CP,
14634                            MachinePointerInfo::getConstantPool(),
14635                            false, false, false, Alignment);
14636     SDValue OneV = DAG.getNode(X86ISD::VBROADCAST, DL, InVT, Ld);
14637     SDValue And = DAG.getNode(ISD::AND, DL, InVT, OneV, In);
14638     return DAG.getNode(X86ISD::TESTM, DL, VT, And, And);
14639   }
14640
14641   if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
14642     // On AVX2, v4i64 -> v4i32 becomes VPERMD.
14643     if (Subtarget->hasInt256()) {
14644       static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
14645       In = DAG.getNode(ISD::BITCAST, DL, MVT::v8i32, In);
14646       In = DAG.getVectorShuffle(MVT::v8i32, DL, In, DAG.getUNDEF(MVT::v8i32),
14647                                 ShufMask);
14648       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
14649                          DAG.getIntPtrConstant(0));
14650     }
14651
14652     SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
14653                                DAG.getIntPtrConstant(0));
14654     SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
14655                                DAG.getIntPtrConstant(2));
14656     OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpLo);
14657     OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpHi);
14658     static const int ShufMask[] = {0, 2, 4, 6};
14659     return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
14660   }
14661
14662   if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
14663     // On AVX2, v8i32 -> v8i16 becomed PSHUFB.
14664     if (Subtarget->hasInt256()) {
14665       In = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, In);
14666
14667       SmallVector<SDValue,32> pshufbMask;
14668       for (unsigned i = 0; i < 2; ++i) {
14669         pshufbMask.push_back(DAG.getConstant(0x0, MVT::i8));
14670         pshufbMask.push_back(DAG.getConstant(0x1, MVT::i8));
14671         pshufbMask.push_back(DAG.getConstant(0x4, MVT::i8));
14672         pshufbMask.push_back(DAG.getConstant(0x5, MVT::i8));
14673         pshufbMask.push_back(DAG.getConstant(0x8, MVT::i8));
14674         pshufbMask.push_back(DAG.getConstant(0x9, MVT::i8));
14675         pshufbMask.push_back(DAG.getConstant(0xc, MVT::i8));
14676         pshufbMask.push_back(DAG.getConstant(0xd, MVT::i8));
14677         for (unsigned j = 0; j < 8; ++j)
14678           pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
14679       }
14680       SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, pshufbMask);
14681       In = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, In, BV);
14682       In = DAG.getNode(ISD::BITCAST, DL, MVT::v4i64, In);
14683
14684       static const int ShufMask[] = {0,  2,  -1,  -1};
14685       In = DAG.getVectorShuffle(MVT::v4i64, DL,  In, DAG.getUNDEF(MVT::v4i64),
14686                                 &ShufMask[0]);
14687       In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
14688                        DAG.getIntPtrConstant(0));
14689       return DAG.getNode(ISD::BITCAST, DL, VT, In);
14690     }
14691
14692     SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
14693                                DAG.getIntPtrConstant(0));
14694
14695     SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
14696                                DAG.getIntPtrConstant(4));
14697
14698     OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, OpLo);
14699     OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, OpHi);
14700
14701     // The PSHUFB mask:
14702     static const int ShufMask1[] = {0,  1,  4,  5,  8,  9, 12, 13,
14703                                    -1, -1, -1, -1, -1, -1, -1, -1};
14704
14705     SDValue Undef = DAG.getUNDEF(MVT::v16i8);
14706     OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, Undef, ShufMask1);
14707     OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, Undef, ShufMask1);
14708
14709     OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpLo);
14710     OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpHi);
14711
14712     // The MOVLHPS Mask:
14713     static const int ShufMask2[] = {0, 1, 4, 5};
14714     SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
14715     return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, res);
14716   }
14717
14718   // Handle truncation of V256 to V128 using shuffles.
14719   if (!VT.is128BitVector() || !InVT.is256BitVector())
14720     return SDValue();
14721
14722   assert(Subtarget->hasFp256() && "256-bit vector without AVX!");
14723
14724   unsigned NumElems = VT.getVectorNumElements();
14725   MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2);
14726
14727   SmallVector<int, 16> MaskVec(NumElems * 2, -1);
14728   // Prepare truncation shuffle mask
14729   for (unsigned i = 0; i != NumElems; ++i)
14730     MaskVec[i] = i * 2;
14731   SDValue V = DAG.getVectorShuffle(NVT, DL,
14732                                    DAG.getNode(ISD::BITCAST, DL, NVT, In),
14733                                    DAG.getUNDEF(NVT), &MaskVec[0]);
14734   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
14735                      DAG.getIntPtrConstant(0));
14736 }
14737
14738 SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op,
14739                                            SelectionDAG &DAG) const {
14740   assert(!Op.getSimpleValueType().isVector());
14741
14742   std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
14743     /*IsSigned=*/ true, /*IsReplace=*/ false);
14744   SDValue FIST = Vals.first, StackSlot = Vals.second;
14745   // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
14746   if (!FIST.getNode()) return Op;
14747
14748   if (StackSlot.getNode())
14749     // Load the result.
14750     return DAG.getLoad(Op.getValueType(), SDLoc(Op),
14751                        FIST, StackSlot, MachinePointerInfo(),
14752                        false, false, false, 0);
14753
14754   // The node is the result.
14755   return FIST;
14756 }
14757
14758 SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op,
14759                                            SelectionDAG &DAG) const {
14760   std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
14761     /*IsSigned=*/ false, /*IsReplace=*/ false);
14762   SDValue FIST = Vals.first, StackSlot = Vals.second;
14763   assert(FIST.getNode() && "Unexpected failure");
14764
14765   if (StackSlot.getNode())
14766     // Load the result.
14767     return DAG.getLoad(Op.getValueType(), SDLoc(Op),
14768                        FIST, StackSlot, MachinePointerInfo(),
14769                        false, false, false, 0);
14770
14771   // The node is the result.
14772   return FIST;
14773 }
14774
14775 static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
14776   SDLoc DL(Op);
14777   MVT VT = Op.getSimpleValueType();
14778   SDValue In = Op.getOperand(0);
14779   MVT SVT = In.getSimpleValueType();
14780
14781   assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
14782
14783   return DAG.getNode(X86ISD::VFPEXT, DL, VT,
14784                      DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
14785                                  In, DAG.getUNDEF(SVT)));
14786 }
14787
14788 /// The only differences between FABS and FNEG are the mask and the logic op.
14789 /// FNEG also has a folding opportunity for FNEG(FABS(x)).
14790 static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
14791   assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
14792          "Wrong opcode for lowering FABS or FNEG.");
14793
14794   bool IsFABS = (Op.getOpcode() == ISD::FABS);
14795
14796   // If this is a FABS and it has an FNEG user, bail out to fold the combination
14797   // into an FNABS. We'll lower the FABS after that if it is still in use.
14798   if (IsFABS)
14799     for (SDNode *User : Op->uses())
14800       if (User->getOpcode() == ISD::FNEG)
14801         return Op;
14802
14803   SDValue Op0 = Op.getOperand(0);
14804   bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
14805
14806   SDLoc dl(Op);
14807   MVT VT = Op.getSimpleValueType();
14808   // Assume scalar op for initialization; update for vector if needed.
14809   // Note that there are no scalar bitwise logical SSE/AVX instructions, so we
14810   // generate a 16-byte vector constant and logic op even for the scalar case.
14811   // Using a 16-byte mask allows folding the load of the mask with
14812   // the logic op, so it can save (~4 bytes) on code size.
14813   MVT EltVT = VT;
14814   unsigned NumElts = VT == MVT::f64 ? 2 : 4;
14815   // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
14816   // decide if we should generate a 16-byte constant mask when we only need 4 or
14817   // 8 bytes for the scalar case.
14818   if (VT.isVector()) {
14819     EltVT = VT.getVectorElementType();
14820     NumElts = VT.getVectorNumElements();
14821   }
14822
14823   unsigned EltBits = EltVT.getSizeInBits();
14824   LLVMContext *Context = DAG.getContext();
14825   // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
14826   APInt MaskElt =
14827     IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignBit(EltBits);
14828   Constant *C = ConstantInt::get(*Context, MaskElt);
14829   C = ConstantVector::getSplat(NumElts, C);
14830   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14831   SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy());
14832   unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
14833   SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
14834                              MachinePointerInfo::getConstantPool(),
14835                              false, false, false, Alignment);
14836
14837   if (VT.isVector()) {
14838     // For a vector, cast operands to a vector type, perform the logic op,
14839     // and cast the result back to the original value type.
14840     MVT VecVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
14841     SDValue MaskCasted = DAG.getNode(ISD::BITCAST, dl, VecVT, Mask);
14842     SDValue Operand = IsFNABS ?
14843       DAG.getNode(ISD::BITCAST, dl, VecVT, Op0.getOperand(0)) :
14844       DAG.getNode(ISD::BITCAST, dl, VecVT, Op0);
14845     unsigned BitOp = IsFABS ? ISD::AND : IsFNABS ? ISD::OR : ISD::XOR;
14846     return DAG.getNode(ISD::BITCAST, dl, VT,
14847                        DAG.getNode(BitOp, dl, VecVT, Operand, MaskCasted));
14848   }
14849
14850   // If not vector, then scalar.
14851   unsigned BitOp = IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
14852   SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
14853   return DAG.getNode(BitOp, dl, VT, Operand, Mask);
14854 }
14855
14856 static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
14857   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14858   LLVMContext *Context = DAG.getContext();
14859   SDValue Op0 = Op.getOperand(0);
14860   SDValue Op1 = Op.getOperand(1);
14861   SDLoc dl(Op);
14862   MVT VT = Op.getSimpleValueType();
14863   MVT SrcVT = Op1.getSimpleValueType();
14864
14865   // If second operand is smaller, extend it first.
14866   if (SrcVT.bitsLT(VT)) {
14867     Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1);
14868     SrcVT = VT;
14869   }
14870   // And if it is bigger, shrink it first.
14871   if (SrcVT.bitsGT(VT)) {
14872     Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1));
14873     SrcVT = VT;
14874   }
14875
14876   // At this point the operands and the result should have the same
14877   // type, and that won't be f80 since that is not custom lowered.
14878
14879   const fltSemantics &Sem =
14880       VT == MVT::f64 ? APFloat::IEEEdouble : APFloat::IEEEsingle;
14881   const unsigned SizeInBits = VT.getSizeInBits();
14882
14883   SmallVector<Constant *, 4> CV(
14884       VT == MVT::f64 ? 2 : 4,
14885       ConstantFP::get(*Context, APFloat(Sem, APInt(SizeInBits, 0))));
14886
14887   // First, clear all bits but the sign bit from the second operand (sign).
14888   CV[0] = ConstantFP::get(*Context,
14889                           APFloat(Sem, APInt::getHighBitsSet(SizeInBits, 1)));
14890   Constant *C = ConstantVector::get(CV);
14891   SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(), 16);
14892   SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx,
14893                               MachinePointerInfo::getConstantPool(),
14894                               false, false, false, 16);
14895   SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1);
14896
14897   // Next, clear the sign bit from the first operand (magnitude).
14898   // If it's a constant, we can clear it here.
14899   if (ConstantFPSDNode *Op0CN = dyn_cast<ConstantFPSDNode>(Op0)) {
14900     APFloat APF = Op0CN->getValueAPF();
14901     // If the magnitude is a positive zero, the sign bit alone is enough.
14902     if (APF.isPosZero())
14903       return SignBit;
14904     APF.clearSign();
14905     CV[0] = ConstantFP::get(*Context, APF);
14906   } else {
14907     CV[0] = ConstantFP::get(
14908         *Context,
14909         APFloat(Sem, APInt::getLowBitsSet(SizeInBits, SizeInBits - 1)));
14910   }
14911   C = ConstantVector::get(CV);
14912   CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(), 16);
14913   SDValue Val = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
14914                             MachinePointerInfo::getConstantPool(),
14915                             false, false, false, 16);
14916   // If the magnitude operand wasn't a constant, we need to AND out the sign.
14917   if (!isa<ConstantFPSDNode>(Op0))
14918     Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Val);
14919
14920   // OR the magnitude value with the sign bit.
14921   return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit);
14922 }
14923
14924 static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
14925   SDValue N0 = Op.getOperand(0);
14926   SDLoc dl(Op);
14927   MVT VT = Op.getSimpleValueType();
14928
14929   // Lower ISD::FGETSIGN to (AND (X86ISD::FGETSIGNx86 ...) 1).
14930   SDValue xFGETSIGN = DAG.getNode(X86ISD::FGETSIGNx86, dl, VT, N0,
14931                                   DAG.getConstant(1, VT));
14932   return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, VT));
14933 }
14934
14935 // Check whether an OR'd tree is PTEST-able.
14936 static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget *Subtarget,
14937                                       SelectionDAG &DAG) {
14938   assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
14939
14940   if (!Subtarget->hasSSE41())
14941     return SDValue();
14942
14943   if (!Op->hasOneUse())
14944     return SDValue();
14945
14946   SDNode *N = Op.getNode();
14947   SDLoc DL(N);
14948
14949   SmallVector<SDValue, 8> Opnds;
14950   DenseMap<SDValue, unsigned> VecInMap;
14951   SmallVector<SDValue, 8> VecIns;
14952   EVT VT = MVT::Other;
14953
14954   // Recognize a special case where a vector is casted into wide integer to
14955   // test all 0s.
14956   Opnds.push_back(N->getOperand(0));
14957   Opnds.push_back(N->getOperand(1));
14958
14959   for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
14960     SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
14961     // BFS traverse all OR'd operands.
14962     if (I->getOpcode() == ISD::OR) {
14963       Opnds.push_back(I->getOperand(0));
14964       Opnds.push_back(I->getOperand(1));
14965       // Re-evaluate the number of nodes to be traversed.
14966       e += 2; // 2 more nodes (LHS and RHS) are pushed.
14967       continue;
14968     }
14969
14970     // Quit if a non-EXTRACT_VECTOR_ELT
14971     if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
14972       return SDValue();
14973
14974     // Quit if without a constant index.
14975     SDValue Idx = I->getOperand(1);
14976     if (!isa<ConstantSDNode>(Idx))
14977       return SDValue();
14978
14979     SDValue ExtractedFromVec = I->getOperand(0);
14980     DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);
14981     if (M == VecInMap.end()) {
14982       VT = ExtractedFromVec.getValueType();
14983       // Quit if not 128/256-bit vector.
14984       if (!VT.is128BitVector() && !VT.is256BitVector())
14985         return SDValue();
14986       // Quit if not the same type.
14987       if (VecInMap.begin() != VecInMap.end() &&
14988           VT != VecInMap.begin()->first.getValueType())
14989         return SDValue();
14990       M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
14991       VecIns.push_back(ExtractedFromVec);
14992     }
14993     M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
14994   }
14995
14996   assert((VT.is128BitVector() || VT.is256BitVector()) &&
14997          "Not extracted from 128-/256-bit vector.");
14998
14999   unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;
15000
15001   for (DenseMap<SDValue, unsigned>::const_iterator
15002         I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
15003     // Quit if not all elements are used.
15004     if (I->second != FullMask)
15005       return SDValue();
15006   }
15007
15008   EVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
15009
15010   // Cast all vectors into TestVT for PTEST.
15011   for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
15012     VecIns[i] = DAG.getNode(ISD::BITCAST, DL, TestVT, VecIns[i]);
15013
15014   // If more than one full vectors are evaluated, OR them first before PTEST.
15015   for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
15016     // Each iteration will OR 2 nodes and append the result until there is only
15017     // 1 node left, i.e. the final OR'd value of all vectors.
15018     SDValue LHS = VecIns[Slot];
15019     SDValue RHS = VecIns[Slot + 1];
15020     VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
15021   }
15022
15023   return DAG.getNode(X86ISD::PTEST, DL, MVT::i32,
15024                      VecIns.back(), VecIns.back());
15025 }
15026
15027 /// \brief return true if \c Op has a use that doesn't just read flags.
15028 static bool hasNonFlagsUse(SDValue Op) {
15029   for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
15030        ++UI) {
15031     SDNode *User = *UI;
15032     unsigned UOpNo = UI.getOperandNo();
15033     if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
15034       // Look pass truncate.
15035       UOpNo = User->use_begin().getOperandNo();
15036       User = *User->use_begin();
15037     }
15038
15039     if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
15040         !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
15041       return true;
15042   }
15043   return false;
15044 }
15045
15046 /// Emit nodes that will be selected as "test Op0,Op0", or something
15047 /// equivalent.
15048 SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl,
15049                                     SelectionDAG &DAG) const {
15050   if (Op.getValueType() == MVT::i1) {
15051     SDValue ExtOp = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, Op);
15052     return DAG.getNode(X86ISD::CMP, dl, MVT::i32, ExtOp,
15053                        DAG.getConstant(0, MVT::i8));
15054   }
15055   // CF and OF aren't always set the way we want. Determine which
15056   // of these we need.
15057   bool NeedCF = false;
15058   bool NeedOF = false;
15059   switch (X86CC) {
15060   default: break;
15061   case X86::COND_A: case X86::COND_AE:
15062   case X86::COND_B: case X86::COND_BE:
15063     NeedCF = true;
15064     break;
15065   case X86::COND_G: case X86::COND_GE:
15066   case X86::COND_L: case X86::COND_LE:
15067   case X86::COND_O: case X86::COND_NO: {
15068     // Check if we really need to set the
15069     // Overflow flag. If NoSignedWrap is present
15070     // that is not actually needed.
15071     switch (Op->getOpcode()) {
15072     case ISD::ADD:
15073     case ISD::SUB:
15074     case ISD::MUL:
15075     case ISD::SHL: {
15076       const BinaryWithFlagsSDNode *BinNode =
15077           cast<BinaryWithFlagsSDNode>(Op.getNode());
15078       if (BinNode->hasNoSignedWrap())
15079         break;
15080     }
15081     default:
15082       NeedOF = true;
15083       break;
15084     }
15085     break;
15086   }
15087   }
15088   // See if we can use the EFLAGS value from the operand instead of
15089   // doing a separate TEST. TEST always sets OF and CF to 0, so unless
15090   // we prove that the arithmetic won't overflow, we can't use OF or CF.
15091   if (Op.getResNo() != 0 || NeedOF || NeedCF) {
15092     // Emit a CMP with 0, which is the TEST pattern.
15093     //if (Op.getValueType() == MVT::i1)
15094     //  return DAG.getNode(X86ISD::CMP, dl, MVT::i1, Op,
15095     //                     DAG.getConstant(0, MVT::i1));
15096     return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
15097                        DAG.getConstant(0, Op.getValueType()));
15098   }
15099   unsigned Opcode = 0;
15100   unsigned NumOperands = 0;
15101
15102   // Truncate operations may prevent the merge of the SETCC instruction
15103   // and the arithmetic instruction before it. Attempt to truncate the operands
15104   // of the arithmetic instruction and use a reduced bit-width instruction.
15105   bool NeedTruncation = false;
15106   SDValue ArithOp = Op;
15107   if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
15108     SDValue Arith = Op->getOperand(0);
15109     // Both the trunc and the arithmetic op need to have one user each.
15110     if (Arith->hasOneUse())
15111       switch (Arith.getOpcode()) {
15112         default: break;
15113         case ISD::ADD:
15114         case ISD::SUB:
15115         case ISD::AND:
15116         case ISD::OR:
15117         case ISD::XOR: {
15118           NeedTruncation = true;
15119           ArithOp = Arith;
15120         }
15121       }
15122   }
15123
15124   // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
15125   // which may be the result of a CAST.  We use the variable 'Op', which is the
15126   // non-casted variable when we check for possible users.
15127   switch (ArithOp.getOpcode()) {
15128   case ISD::ADD:
15129     // Due to an isel shortcoming, be conservative if this add is likely to be
15130     // selected as part of a load-modify-store instruction. When the root node
15131     // in a match is a store, isel doesn't know how to remap non-chain non-flag
15132     // uses of other nodes in the match, such as the ADD in this case. This
15133     // leads to the ADD being left around and reselected, with the result being
15134     // two adds in the output.  Alas, even if none our users are stores, that
15135     // doesn't prove we're O.K.  Ergo, if we have any parents that aren't
15136     // CopyToReg or SETCC, eschew INC/DEC.  A better fix seems to require
15137     // climbing the DAG back to the root, and it doesn't seem to be worth the
15138     // effort.
15139     for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
15140          UE = Op.getNode()->use_end(); UI != UE; ++UI)
15141       if (UI->getOpcode() != ISD::CopyToReg &&
15142           UI->getOpcode() != ISD::SETCC &&
15143           UI->getOpcode() != ISD::STORE)
15144         goto default_case;
15145
15146     if (ConstantSDNode *C =
15147         dyn_cast<ConstantSDNode>(ArithOp.getNode()->getOperand(1))) {
15148       // An add of one will be selected as an INC.
15149       if (C->getAPIntValue() == 1 && !Subtarget->slowIncDec()) {
15150         Opcode = X86ISD::INC;
15151         NumOperands = 1;
15152         break;
15153       }
15154
15155       // An add of negative one (subtract of one) will be selected as a DEC.
15156       if (C->getAPIntValue().isAllOnesValue() && !Subtarget->slowIncDec()) {
15157         Opcode = X86ISD::DEC;
15158         NumOperands = 1;
15159         break;
15160       }
15161     }
15162
15163     // Otherwise use a regular EFLAGS-setting add.
15164     Opcode = X86ISD::ADD;
15165     NumOperands = 2;
15166     break;
15167   case ISD::SHL:
15168   case ISD::SRL:
15169     // If we have a constant logical shift that's only used in a comparison
15170     // against zero turn it into an equivalent AND. This allows turning it into
15171     // a TEST instruction later.
15172     if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) && Op->hasOneUse() &&
15173         isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) {
15174       EVT VT = Op.getValueType();
15175       unsigned BitWidth = VT.getSizeInBits();
15176       unsigned ShAmt = Op->getConstantOperandVal(1);
15177       if (ShAmt >= BitWidth) // Avoid undefined shifts.
15178         break;
15179       APInt Mask = ArithOp.getOpcode() == ISD::SRL
15180                        ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt)
15181                        : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);
15182       if (!Mask.isSignedIntN(32)) // Avoid large immediates.
15183         break;
15184       SDValue New = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0),
15185                                 DAG.getConstant(Mask, VT));
15186       DAG.ReplaceAllUsesWith(Op, New);
15187       Op = New;
15188     }
15189     break;
15190
15191   case ISD::AND:
15192     // If the primary and result isn't used, don't bother using X86ISD::AND,
15193     // because a TEST instruction will be better.
15194     if (!hasNonFlagsUse(Op))
15195       break;
15196     // FALL THROUGH
15197   case ISD::SUB:
15198   case ISD::OR:
15199   case ISD::XOR:
15200     // Due to the ISEL shortcoming noted above, be conservative if this op is
15201     // likely to be selected as part of a load-modify-store instruction.
15202     for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
15203            UE = Op.getNode()->use_end(); UI != UE; ++UI)
15204       if (UI->getOpcode() == ISD::STORE)
15205         goto default_case;
15206
15207     // Otherwise use a regular EFLAGS-setting instruction.
15208     switch (ArithOp.getOpcode()) {
15209     default: llvm_unreachable("unexpected operator!");
15210     case ISD::SUB: Opcode = X86ISD::SUB; break;
15211     case ISD::XOR: Opcode = X86ISD::XOR; break;
15212     case ISD::AND: Opcode = X86ISD::AND; break;
15213     case ISD::OR: {
15214       if (!NeedTruncation && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
15215         SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG);
15216         if (EFLAGS.getNode())
15217           return EFLAGS;
15218       }
15219       Opcode = X86ISD::OR;
15220       break;
15221     }
15222     }
15223
15224     NumOperands = 2;
15225     break;
15226   case X86ISD::ADD:
15227   case X86ISD::SUB:
15228   case X86ISD::INC:
15229   case X86ISD::DEC:
15230   case X86ISD::OR:
15231   case X86ISD::XOR:
15232   case X86ISD::AND:
15233     return SDValue(Op.getNode(), 1);
15234   default:
15235   default_case:
15236     break;
15237   }
15238
15239   // If we found that truncation is beneficial, perform the truncation and
15240   // update 'Op'.
15241   if (NeedTruncation) {
15242     EVT VT = Op.getValueType();
15243     SDValue WideVal = Op->getOperand(0);
15244     EVT WideVT = WideVal.getValueType();
15245     unsigned ConvertedOp = 0;
15246     // Use a target machine opcode to prevent further DAGCombine
15247     // optimizations that may separate the arithmetic operations
15248     // from the setcc node.
15249     switch (WideVal.getOpcode()) {
15250       default: break;
15251       case ISD::ADD: ConvertedOp = X86ISD::ADD; break;
15252       case ISD::SUB: ConvertedOp = X86ISD::SUB; break;
15253       case ISD::AND: ConvertedOp = X86ISD::AND; break;
15254       case ISD::OR:  ConvertedOp = X86ISD::OR;  break;
15255       case ISD::XOR: ConvertedOp = X86ISD::XOR; break;
15256     }
15257
15258     if (ConvertedOp) {
15259       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15260       if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
15261         SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
15262         SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
15263         Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1);
15264       }
15265     }
15266   }
15267
15268   if (Opcode == 0)
15269     // Emit a CMP with 0, which is the TEST pattern.
15270     return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
15271                        DAG.getConstant(0, Op.getValueType()));
15272
15273   SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
15274   SmallVector<SDValue, 4> Ops;
15275   for (unsigned i = 0; i != NumOperands; ++i)
15276     Ops.push_back(Op.getOperand(i));
15277
15278   SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
15279   DAG.ReplaceAllUsesWith(Op, New);
15280   return SDValue(New.getNode(), 1);
15281 }
15282
15283 /// Emit nodes that will be selected as "cmp Op0,Op1", or something
15284 /// equivalent.
15285 SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
15286                                    SDLoc dl, SelectionDAG &DAG) const {
15287   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1)) {
15288     if (C->getAPIntValue() == 0)
15289       return EmitTest(Op0, X86CC, dl, DAG);
15290
15291      if (Op0.getValueType() == MVT::i1)
15292        llvm_unreachable("Unexpected comparison operation for MVT::i1 operands");
15293   }
15294
15295   if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
15296        Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
15297     // Do the comparison at i32 if it's smaller, besides the Atom case.
15298     // This avoids subregister aliasing issues. Keep the smaller reference
15299     // if we're optimizing for size, however, as that'll allow better folding
15300     // of memory operations.
15301     if (Op0.getValueType() != MVT::i32 && Op0.getValueType() != MVT::i64 &&
15302         !DAG.getMachineFunction().getFunction()->getAttributes().hasAttribute(
15303              AttributeSet::FunctionIndex, Attribute::MinSize) &&
15304         !Subtarget->isAtom()) {
15305       unsigned ExtendOp =
15306           isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
15307       Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0);
15308       Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1);
15309     }
15310     // Use SUB instead of CMP to enable CSE between SUB and CMP.
15311     SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
15312     SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs,
15313                               Op0, Op1);
15314     return SDValue(Sub.getNode(), 1);
15315   }
15316   return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
15317 }
15318
15319 /// Convert a comparison if required by the subtarget.
15320 SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
15321                                                  SelectionDAG &DAG) const {
15322   // If the subtarget does not support the FUCOMI instruction, floating-point
15323   // comparisons have to be converted.
15324   if (Subtarget->hasCMov() ||
15325       Cmp.getOpcode() != X86ISD::CMP ||
15326       !Cmp.getOperand(0).getValueType().isFloatingPoint() ||
15327       !Cmp.getOperand(1).getValueType().isFloatingPoint())
15328     return Cmp;
15329
15330   // The instruction selector will select an FUCOM instruction instead of
15331   // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
15332   // build an SDNode sequence that transfers the result from FPSW into EFLAGS:
15333   // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
15334   SDLoc dl(Cmp);
15335   SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
15336   SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
15337   SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
15338                             DAG.getConstant(8, MVT::i8));
15339   SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
15340   return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
15341 }
15342
15343 /// The minimum architected relative accuracy is 2^-12. We need one
15344 /// Newton-Raphson step to have a good float result (24 bits of precision).
15345 SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op,
15346                                             DAGCombinerInfo &DCI,
15347                                             unsigned &RefinementSteps,
15348                                             bool &UseOneConstNR) const {
15349   // FIXME: We should use instruction latency models to calculate the cost of
15350   // each potential sequence, but this is very hard to do reliably because
15351   // at least Intel's Core* chips have variable timing based on the number of
15352   // significant digits in the divisor and/or sqrt operand.
15353   if (!Subtarget->useSqrtEst())
15354     return SDValue();
15355
15356   EVT VT = Op.getValueType();
15357
15358   // SSE1 has rsqrtss and rsqrtps.
15359   // TODO: Add support for AVX512 (v16f32).
15360   // It is likely not profitable to do this for f64 because a double-precision
15361   // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
15362   // instructions: convert to single, rsqrtss, convert back to double, refine
15363   // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
15364   // along with FMA, this could be a throughput win.
15365   if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
15366       (Subtarget->hasAVX() && VT == MVT::v8f32)) {
15367     RefinementSteps = 1;
15368     UseOneConstNR = false;
15369     return DCI.DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
15370   }
15371   return SDValue();
15372 }
15373
15374 /// The minimum architected relative accuracy is 2^-12. We need one
15375 /// Newton-Raphson step to have a good float result (24 bits of precision).
15376 SDValue X86TargetLowering::getRecipEstimate(SDValue Op,
15377                                             DAGCombinerInfo &DCI,
15378                                             unsigned &RefinementSteps) const {
15379   // FIXME: We should use instruction latency models to calculate the cost of
15380   // each potential sequence, but this is very hard to do reliably because
15381   // at least Intel's Core* chips have variable timing based on the number of
15382   // significant digits in the divisor.
15383   if (!Subtarget->useReciprocalEst())
15384     return SDValue();
15385
15386   EVT VT = Op.getValueType();
15387
15388   // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
15389   // TODO: Add support for AVX512 (v16f32).
15390   // It is likely not profitable to do this for f64 because a double-precision
15391   // reciprocal estimate with refinement on x86 prior to FMA requires
15392   // 15 instructions: convert to single, rcpss, convert back to double, refine
15393   // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
15394   // along with FMA, this could be a throughput win.
15395   if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
15396       (Subtarget->hasAVX() && VT == MVT::v8f32)) {
15397     RefinementSteps = ReciprocalEstimateRefinementSteps;
15398     return DCI.DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
15399   }
15400   return SDValue();
15401 }
15402
15403 static bool isAllOnes(SDValue V) {
15404   ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
15405   return C && C->isAllOnesValue();
15406 }
15407
15408 /// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node
15409 /// if it's possible.
15410 SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
15411                                      SDLoc dl, SelectionDAG &DAG) const {
15412   SDValue Op0 = And.getOperand(0);
15413   SDValue Op1 = And.getOperand(1);
15414   if (Op0.getOpcode() == ISD::TRUNCATE)
15415     Op0 = Op0.getOperand(0);
15416   if (Op1.getOpcode() == ISD::TRUNCATE)
15417     Op1 = Op1.getOperand(0);
15418
15419   SDValue LHS, RHS;
15420   if (Op1.getOpcode() == ISD::SHL)
15421     std::swap(Op0, Op1);
15422   if (Op0.getOpcode() == ISD::SHL) {
15423     if (ConstantSDNode *And00C = dyn_cast<ConstantSDNode>(Op0.getOperand(0)))
15424       if (And00C->getZExtValue() == 1) {
15425         // If we looked past a truncate, check that it's only truncating away
15426         // known zeros.
15427         unsigned BitWidth = Op0.getValueSizeInBits();
15428         unsigned AndBitWidth = And.getValueSizeInBits();
15429         if (BitWidth > AndBitWidth) {
15430           APInt Zeros, Ones;
15431           DAG.computeKnownBits(Op0, Zeros, Ones);
15432           if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth)
15433             return SDValue();
15434         }
15435         LHS = Op1;
15436         RHS = Op0.getOperand(1);
15437       }
15438   } else if (Op1.getOpcode() == ISD::Constant) {
15439     ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
15440     uint64_t AndRHSVal = AndRHS->getZExtValue();
15441     SDValue AndLHS = Op0;
15442
15443     if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
15444       LHS = AndLHS.getOperand(0);
15445       RHS = AndLHS.getOperand(1);
15446     }
15447
15448     // Use BT if the immediate can't be encoded in a TEST instruction.
15449     if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) {
15450       LHS = AndLHS;
15451       RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), LHS.getValueType());
15452     }
15453   }
15454
15455   if (LHS.getNode()) {
15456     // If LHS is i8, promote it to i32 with any_extend.  There is no i8 BT
15457     // instruction.  Since the shift amount is in-range-or-undefined, we know
15458     // that doing a bittest on the i32 value is ok.  We extend to i32 because
15459     // the encoding for the i16 version is larger than the i32 version.
15460     // Also promote i16 to i32 for performance / code size reason.
15461     if (LHS.getValueType() == MVT::i8 ||
15462         LHS.getValueType() == MVT::i16)
15463       LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);
15464
15465     // If the operand types disagree, extend the shift amount to match.  Since
15466     // BT ignores high bits (like shifts) we can use anyextend.
15467     if (LHS.getValueType() != RHS.getValueType())
15468       RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS);
15469
15470     SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS);
15471     X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
15472     return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
15473                        DAG.getConstant(Cond, MVT::i8), BT);
15474   }
15475
15476   return SDValue();
15477 }
15478
15479 /// \brief - Turns an ISD::CondCode into a value suitable for SSE floating point
15480 /// mask CMPs.
15481 static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
15482                               SDValue &Op1) {
15483   unsigned SSECC;
15484   bool Swap = false;
15485
15486   // SSE Condition code mapping:
15487   //  0 - EQ
15488   //  1 - LT
15489   //  2 - LE
15490   //  3 - UNORD
15491   //  4 - NEQ
15492   //  5 - NLT
15493   //  6 - NLE
15494   //  7 - ORD
15495   switch (SetCCOpcode) {
15496   default: llvm_unreachable("Unexpected SETCC condition");
15497   case ISD::SETOEQ:
15498   case ISD::SETEQ:  SSECC = 0; break;
15499   case ISD::SETOGT:
15500   case ISD::SETGT:  Swap = true; // Fallthrough
15501   case ISD::SETLT:
15502   case ISD::SETOLT: SSECC = 1; break;
15503   case ISD::SETOGE:
15504   case ISD::SETGE:  Swap = true; // Fallthrough
15505   case ISD::SETLE:
15506   case ISD::SETOLE: SSECC = 2; break;
15507   case ISD::SETUO:  SSECC = 3; break;
15508   case ISD::SETUNE:
15509   case ISD::SETNE:  SSECC = 4; break;
15510   case ISD::SETULE: Swap = true; // Fallthrough
15511   case ISD::SETUGE: SSECC = 5; break;
15512   case ISD::SETULT: Swap = true; // Fallthrough
15513   case ISD::SETUGT: SSECC = 6; break;
15514   case ISD::SETO:   SSECC = 7; break;
15515   case ISD::SETUEQ:
15516   case ISD::SETONE: SSECC = 8; break;
15517   }
15518   if (Swap)
15519     std::swap(Op0, Op1);
15520
15521   return SSECC;
15522 }
15523
15524 // Lower256IntVSETCC - Break a VSETCC 256-bit integer VSETCC into two new 128
15525 // ones, and then concatenate the result back.
15526 static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
15527   MVT VT = Op.getSimpleValueType();
15528
15529   assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
15530          "Unsupported value type for operation");
15531
15532   unsigned NumElems = VT.getVectorNumElements();
15533   SDLoc dl(Op);
15534   SDValue CC = Op.getOperand(2);
15535
15536   // Extract the LHS vectors
15537   SDValue LHS = Op.getOperand(0);
15538   SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
15539   SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
15540
15541   // Extract the RHS vectors
15542   SDValue RHS = Op.getOperand(1);
15543   SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl);
15544   SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl);
15545
15546   // Issue the operation on the smaller types and concatenate the result back
15547   MVT EltVT = VT.getVectorElementType();
15548   MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
15549   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
15550                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
15551                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
15552 }
15553
15554 static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG,
15555                                      const X86Subtarget *Subtarget) {
15556   SDValue Op0 = Op.getOperand(0);
15557   SDValue Op1 = Op.getOperand(1);
15558   SDValue CC = Op.getOperand(2);
15559   MVT VT = Op.getSimpleValueType();
15560   SDLoc dl(Op);
15561
15562   assert(Op0.getValueType().getVectorElementType().getSizeInBits() >= 8 &&
15563          Op.getValueType().getScalarType() == MVT::i1 &&
15564          "Cannot set masked compare for this operation");
15565
15566   ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
15567   unsigned  Opc = 0;
15568   bool Unsigned = false;
15569   bool Swap = false;
15570   unsigned SSECC;
15571   switch (SetCCOpcode) {
15572   default: llvm_unreachable("Unexpected SETCC condition");
15573   case ISD::SETNE:  SSECC = 4; break;
15574   case ISD::SETEQ:  Opc = X86ISD::PCMPEQM; break;
15575   case ISD::SETUGT: SSECC = 6; Unsigned = true; break;
15576   case ISD::SETLT:  Swap = true; //fall-through
15577   case ISD::SETGT:  Opc = X86ISD::PCMPGTM; break;
15578   case ISD::SETULT: SSECC = 1; Unsigned = true; break;
15579   case ISD::SETUGE: SSECC = 5; Unsigned = true; break; //NLT
15580   case ISD::SETGE:  Swap = true; SSECC = 2; break; // LE + swap
15581   case ISD::SETULE: Unsigned = true; //fall-through
15582   case ISD::SETLE:  SSECC = 2; break;
15583   }
15584
15585   if (Swap)
15586     std::swap(Op0, Op1);
15587   if (Opc)
15588     return DAG.getNode(Opc, dl, VT, Op0, Op1);
15589   Opc = Unsigned ? X86ISD::CMPMU: X86ISD::CMPM;
15590   return DAG.getNode(Opc, dl, VT, Op0, Op1,
15591                      DAG.getConstant(SSECC, MVT::i8));
15592 }
15593
15594 /// \brief Try to turn a VSETULT into a VSETULE by modifying its second
15595 /// operand \p Op1.  If non-trivial (for example because it's not constant)
15596 /// return an empty value.
15597 static SDValue ChangeVSETULTtoVSETULE(SDLoc dl, SDValue Op1, SelectionDAG &DAG)
15598 {
15599   BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1.getNode());
15600   if (!BV)
15601     return SDValue();
15602
15603   MVT VT = Op1.getSimpleValueType();
15604   MVT EVT = VT.getVectorElementType();
15605   unsigned n = VT.getVectorNumElements();
15606   SmallVector<SDValue, 8> ULTOp1;
15607
15608   for (unsigned i = 0; i < n; ++i) {
15609     ConstantSDNode *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
15610     if (!Elt || Elt->isOpaque() || Elt->getValueType(0) != EVT)
15611       return SDValue();
15612
15613     // Avoid underflow.
15614     APInt Val = Elt->getAPIntValue();
15615     if (Val == 0)
15616       return SDValue();
15617
15618     ULTOp1.push_back(DAG.getConstant(Val - 1, EVT));
15619   }
15620
15621   return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, ULTOp1);
15622 }
15623
/// Lower a vector ISD::SETCC. Operands 0 and 1 of \p Op are the values being
/// compared and operand 2 carries the condition code.
///
/// Floating-point compares become X86ISD::CMPP (or X86ISD::CMPM when AVX-512
/// is available and the result has i1 elements). Integer compares are built
/// from PCMPEQ/PCMPGT, optionally combined with operand swapping, sign-bit
/// flipping, result inversion, UMIN/UMAX or SUBUS, depending on the condition
/// code and on what \p Subtarget provides.
static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
                           SelectionDAG &DAG) {
  SDValue Op0 = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  SDValue CC = Op.getOperand(2);
  MVT VT = Op.getSimpleValueType();
  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
  bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
  SDLoc dl(Op);

  if (isFP) {
#ifndef NDEBUG
    MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
    assert(EltVT == MVT::f32 || EltVT == MVT::f64);
#endif

    // Note: translateX86FSETCC may swap Op0 and Op1 in place.
    unsigned SSECC = translateX86FSETCC(SetCCOpcode, Op0, Op1);
    unsigned Opc = X86ISD::CMPP;
    if (Subtarget->hasAVX512() && VT.getVectorElementType() == MVT::i1) {
      assert(VT.getVectorNumElements() <= 16);
      Opc = X86ISD::CMPM;
    }
    // In the two special cases we can't handle, emit two comparisons.
    if (SSECC == 8) {
      unsigned CC0, CC1;
      unsigned CombineOpc;
      if (SetCCOpcode == ISD::SETUEQ) {
        // UEQ == UNORD (3) | EQ (0).
        CC0 = 3; CC1 = 0; CombineOpc = ISD::OR;
      } else {
        assert(SetCCOpcode == ISD::SETONE);
        // ONE == ORD (7) & NEQ (4).
        CC0 = 7; CC1 = 4; CombineOpc = ISD::AND;
      }

      SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
                                 DAG.getConstant(CC0, MVT::i8));
      SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
                                 DAG.getConstant(CC1, MVT::i8));
      return DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
    }
    // Handle all other FP comparisons here.
    return DAG.getNode(Opc, dl, VT, Op0, Op1,
                       DAG.getConstant(SSECC, MVT::i8));
  }

  // Break 256-bit integer vector compare into smaller ones.
  if (VT.is256BitVector() && !Subtarget->hasInt256())
    return Lower256IntVSETCC(Op, DAG);

  // MaskResult: the compare produces a vector of i1 elements.
  bool MaskResult = (VT.getVectorElementType() == MVT::i1);
  EVT OpVT = Op1.getValueType();
  if (Subtarget->hasAVX512()) {
    if (Op1.getValueType().is512BitVector() ||
        (Subtarget->hasBWI() && Subtarget->hasVLX()) ||
        (MaskResult && OpVT.getVectorElementType().getSizeInBits() >= 32))
      return LowerIntVSETCC_AVX512(Op, DAG, Subtarget);

    // In AVX-512 architecture setcc returns mask with i1 elements,
    // But there is no compare instruction for i8 and i16 elements in KNL.
    // We are not talking about 512-bit operands in this case, these
    // types are illegal.
    // The compare is therefore redone in the operand type and the result is
    // truncated to the mask type.
    if (MaskResult &&
        (OpVT.getVectorElementType().getSizeInBits() < 32 &&
         OpVT.getVectorElementType().getSizeInBits() >= 8))
      return DAG.getNode(ISD::TRUNCATE, dl, VT,
                         DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC));
  }

  // We are handling one of the integer comparisons here.  Since SSE only has
  // GT and EQ comparisons for integer, swapping operands and multiple
  // operations may be required for some comparisons.
  unsigned Opc;
  bool Swap = false, Invert = false, FlipSigns = false, MinMax = false;
  bool Subus = false;

  // Each condition is expressed as PCMPEQ or PCMPGT plus a combination of:
  //   Swap      - exchange the operands first,
  //   FlipSigns - xor both operands with the sign bit (unsigned compares),
  //   Invert    - logically negate the result.
  switch (SetCCOpcode) {
  default: llvm_unreachable("Unexpected SETCC condition");
  case ISD::SETNE:  Invert = true; // Fallthrough
  case ISD::SETEQ:  Opc = X86ISD::PCMPEQ; break;
  case ISD::SETLT:  Swap = true; // Fallthrough
  case ISD::SETGT:  Opc = X86ISD::PCMPGT; break;
  case ISD::SETGE:  Swap = true; // Fallthrough
  case ISD::SETLE:  Opc = X86ISD::PCMPGT;
                    Invert = true; break;
  case ISD::SETULT: Swap = true; // Fallthrough
  case ISD::SETUGT: Opc = X86ISD::PCMPGT;
                    FlipSigns = true; break;
  case ISD::SETUGE: Swap = true; // Fallthrough
  case ISD::SETULE: Opc = X86ISD::PCMPGT;
                    FlipSigns = true; Invert = true; break;
  }

  // Special case: Use min/max operations for SETULE/SETUGE
  MVT VET = VT.getVectorElementType();
  bool hasMinMax =
       (Subtarget->hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32))
    || (Subtarget->hasSSE2()  && (VET == MVT::i8));

  if (hasMinMax) {
    switch (SetCCOpcode) {
    default: break;
    case ISD::SETULE: Opc = X86ISD::UMIN; MinMax = true; break;
    case ISD::SETUGE: Opc = X86ISD::UMAX; MinMax = true; break;
    }

    // The min/max form compares Op0 against the min/max result (see the end
    // of this function), so the flags chosen above no longer apply.
    if (MinMax) { Swap = false; Invert = false; FlipSigns = false; }
  }

  bool hasSubus = Subtarget->hasSSE2() && (VET == MVT::i8 || VET == MVT::i16);
  if (!MinMax && hasSubus) {
    // As another special case, use PSUBUS[BW] when it's profitable. E.g. for
    // Op0 u<= Op1:
    //   t = psubus Op0, Op1
    //   pcmpeq t, <0..0>
    switch (SetCCOpcode) {
    default: break;
    case ISD::SETULT: {
      // If the comparison is against a constant we can turn this into a
      // setule.  With psubus, setule does not require a swap.  This is
      // beneficial because the constant in the register is no longer
      // destructed as the destination so it can be hoisted out of a loop.
      // Only do this pre-AVX since vpcmp* is no longer destructive.
      if (Subtarget->hasAVX())
        break;
      SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG);
      if (ULEOp1.getNode()) {
        Op1 = ULEOp1;
        Subus = true; Invert = false; Swap = false;
      }
      break;
    }
    // Psubus is better than flip-sign because it requires no inversion.
    case ISD::SETUGE: Subus = true; Invert = false; Swap = true;  break;
    case ISD::SETULE: Subus = true; Invert = false; Swap = false; break;
    }

    if (Subus) {
      Opc = X86ISD::SUBUS;
      FlipSigns = false;
    }
  }

  if (Swap)
    std::swap(Op0, Op1);

  // Check that the operation in question is available (most are plain SSE2,
  // but PCMPGTQ and PCMPEQQ have different requirements).
  if (VT == MVT::v2i64) {
    if (Opc == X86ISD::PCMPGT && !Subtarget->hasSSE42()) {
      assert(Subtarget->hasSSE2() && "Don't know how to lower!");

      // First cast everything to the right type.
      Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0);
      Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1);

      // Since SSE has no unsigned integer comparisons, we need to flip the sign
      // bits of the inputs before performing those operations. The lower
      // compare is always unsigned.
      SDValue SB;
      if (FlipSigns) {
        // Unsigned 64-bit compare: flip the sign bit of every 32-bit lane.
        SB = DAG.getConstant(0x80000000U, MVT::v4i32);
      } else {
        // Signed 64-bit compare: flip only lanes 0 and 2.
        SDValue Sign = DAG.getConstant(0x80000000U, MVT::i32);
        SDValue Zero = DAG.getConstant(0x00000000U, MVT::i32);
        SB = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
                         Sign, Zero, Sign, Zero);
      }
      Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB);
      Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB);

      // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
      SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
      SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);

      // Create masks for only the low parts/high parts of the 64 bit integers.
      static const int MaskHi[] = { 1, 1, 3, 3 };
      static const int MaskLo[] = { 0, 0, 2, 2 };
      SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
      SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
      SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);

      SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
      Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);

      if (Invert)
        Result = DAG.getNOT(dl, Result, MVT::v4i32);

      return DAG.getNode(ISD::BITCAST, dl, VT, Result);
    }

    if (Opc == X86ISD::PCMPEQ && !Subtarget->hasSSE41()) {
      // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
      // pcmpeqd + pshufd + pand.
      assert(Subtarget->hasSSE2() && !FlipSigns && "Don't know how to lower!");

      // First cast everything to the right type.
      Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0);
      Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1);

      // Do the compare.
      SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);

      // Make sure the lower and upper halves are both all-ones.
      // The shuffle swaps the two 32-bit lanes inside each 64-bit element.
      static const int Mask[] = { 1, 0, 3, 2 };
      SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
      Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);

      if (Invert)
        Result = DAG.getNOT(dl, Result, MVT::v4i32);

      return DAG.getNode(ISD::BITCAST, dl, VT, Result);
    }
  }

  // Since SSE has no unsigned integer comparisons, we need to flip the sign
  // bits of the inputs before performing those operations.
  if (FlipSigns) {
    EVT EltVT = VT.getVectorElementType();
    SDValue SB = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), VT);
    Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SB);
    Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SB);
  }

  SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);

  // If the logical-not of the result is required, perform that now.
  if (Invert)
    Result = DAG.getNOT(dl, Result, VT);

  // Op0 u<= Op1 iff umin(Op0, Op1) == Op0; Op0 u>= Op1 iff
  // umax(Op0, Op1) == Op0.
  if (MinMax)
    Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);

  // The saturating subtraction result is compared against the zero vector.
  if (Subus)
    Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
                         getZeroVector(VT, Subtarget, DAG, dl));

  return Result;
}
15861
15862 SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
15863
15864   MVT VT = Op.getSimpleValueType();
15865
15866   if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
15867
15868   assert(((!Subtarget->hasAVX512() && VT == MVT::i8) || (VT == MVT::i1))
15869          && "SetCC type must be 8-bit or 1-bit integer");
15870   SDValue Op0 = Op.getOperand(0);
15871   SDValue Op1 = Op.getOperand(1);
15872   SDLoc dl(Op);
15873   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
15874
15875   // Optimize to BT if possible.
15876   // Lower (X & (1 << N)) == 0 to BT(X, N).
15877   // Lower ((X >>u N) & 1) != 0 to BT(X, N).
15878   // Lower ((X >>s N) & 1) != 0 to BT(X, N).
15879   if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() &&
15880       Op1.getOpcode() == ISD::Constant &&
15881       cast<ConstantSDNode>(Op1)->isNullValue() &&
15882       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
15883     SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG);
15884     if (NewSetCC.getNode()) {
15885       if (VT == MVT::i1)
15886         return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewSetCC);
15887       return NewSetCC;
15888     }
15889   }
15890
15891   // Look for X == 0, X == 1, X != 0, or X != 1.  We can simplify some forms of
15892   // these.
15893   if (Op1.getOpcode() == ISD::Constant &&
15894       (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 ||
15895        cast<ConstantSDNode>(Op1)->isNullValue()) &&
15896       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
15897
15898     // If the input is a setcc, then reuse the input setcc or use a new one with
15899     // the inverted condition.
15900     if (Op0.getOpcode() == X86ISD::SETCC) {
15901       X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
15902       bool Invert = (CC == ISD::SETNE) ^
15903         cast<ConstantSDNode>(Op1)->isNullValue();
15904       if (!Invert)
15905         return Op0;
15906
15907       CCode = X86::GetOppositeBranchCondition(CCode);
15908       SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
15909                                   DAG.getConstant(CCode, MVT::i8),
15910                                   Op0.getOperand(1));
15911       if (VT == MVT::i1)
15912         return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
15913       return SetCC;
15914     }
15915   }
15916   if ((Op0.getValueType() == MVT::i1) && (Op1.getOpcode() == ISD::Constant) &&
15917       (cast<ConstantSDNode>(Op1)->getZExtValue() == 1) &&
15918       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
15919
15920     ISD::CondCode NewCC = ISD::getSetCCInverse(CC, true);
15921     return DAG.getSetCC(dl, VT, Op0, DAG.getConstant(0, MVT::i1), NewCC);
15922   }
15923
15924   bool isFP = Op1.getSimpleValueType().isFloatingPoint();
15925   unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG);
15926   if (X86CC == X86::COND_INVALID)
15927     return SDValue();
15928
15929   SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG);
15930   EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
15931   SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
15932                               DAG.getConstant(X86CC, MVT::i8), EFLAGS);
15933   if (VT == MVT::i1)
15934     return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
15935   return SetCC;
15936 }
15937
15938 // isX86LogicalCmp - Return true if opcode is a X86 logical comparison.
15939 static bool isX86LogicalCmp(SDValue Op) {
15940   unsigned Opc = Op.getNode()->getOpcode();
15941   if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
15942       Opc == X86ISD::SAHF)
15943     return true;
15944   if (Op.getResNo() == 1 &&
15945       (Opc == X86ISD::ADD ||
15946        Opc == X86ISD::SUB ||
15947        Opc == X86ISD::ADC ||
15948        Opc == X86ISD::SBB ||
15949        Opc == X86ISD::SMUL ||
15950        Opc == X86ISD::UMUL ||
15951        Opc == X86ISD::INC ||
15952        Opc == X86ISD::DEC ||
15953        Opc == X86ISD::OR ||
15954        Opc == X86ISD::XOR ||
15955        Opc == X86ISD::AND))
15956     return true;
15957
15958   if (Op.getResNo() == 2 && Opc == X86ISD::UMUL)
15959     return true;
15960
15961   return false;
15962 }
15963
15964 static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
15965   if (V.getOpcode() != ISD::TRUNCATE)
15966     return false;
15967
15968   SDValue VOp0 = V.getOperand(0);
15969   unsigned InBits = VOp0.getValueSizeInBits();
15970   unsigned Bits = V.getValueSizeInBits();
15971   return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
15972 }
15973
15974 SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
15975   bool addTest = true;
15976   SDValue Cond  = Op.getOperand(0);
15977   SDValue Op1 = Op.getOperand(1);
15978   SDValue Op2 = Op.getOperand(2);
15979   SDLoc DL(Op);
15980   EVT VT = Op1.getValueType();
15981   SDValue CC;
15982
15983   // Lower fp selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
15984   // are available. Otherwise fp cmovs get lowered into a less efficient branch
15985   // sequence later on.
15986   if (Cond.getOpcode() == ISD::SETCC &&
15987       ((Subtarget->hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) ||
15988        (Subtarget->hasSSE1() && VT == MVT::f32)) &&
15989       VT == Cond.getOperand(0).getValueType() && Cond->hasOneUse()) {
15990     SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
15991     int SSECC = translateX86FSETCC(
15992         cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);
15993
15994     if (SSECC != 8) {
15995       if (Subtarget->hasAVX512()) {
15996         SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CondOp0, CondOp1,
15997                                   DAG.getConstant(SSECC, MVT::i8));
15998         return DAG.getNode(X86ISD::SELECT, DL, VT, Cmp, Op1, Op2);
15999       }
16000       SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
16001                                 DAG.getConstant(SSECC, MVT::i8));
16002       SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
16003       SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
16004       return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
16005     }
16006   }
16007
16008   if (Cond.getOpcode() == ISD::SETCC) {
16009     SDValue NewCond = LowerSETCC(Cond, DAG);
16010     if (NewCond.getNode())
16011       Cond = NewCond;
16012   }
16013
16014   // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
16015   // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
16016   // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
16017   // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
16018   if (Cond.getOpcode() == X86ISD::SETCC &&
16019       Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
16020       isZero(Cond.getOperand(1).getOperand(1))) {
16021     SDValue Cmp = Cond.getOperand(1);
16022
16023     unsigned CondCode =cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
16024
16025     if ((isAllOnes(Op1) || isAllOnes(Op2)) &&
16026         (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
16027       SDValue Y = isAllOnes(Op2) ? Op1 : Op2;
16028
16029       SDValue CmpOp0 = Cmp.getOperand(0);
16030       // Apply further optimizations for special cases
16031       // (select (x != 0), -1, 0) -> neg & sbb
16032       // (select (x == 0), 0, -1) -> neg & sbb
16033       if (ConstantSDNode *YC = dyn_cast<ConstantSDNode>(Y))
16034         if (YC->isNullValue() &&
16035             (isAllOnes(Op1) == (CondCode == X86::COND_NE))) {
16036           SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
16037           SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs,
16038                                     DAG.getConstant(0, CmpOp0.getValueType()),
16039                                     CmpOp0);
16040           SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
16041                                     DAG.getConstant(X86::COND_B, MVT::i8),
16042                                     SDValue(Neg.getNode(), 1));
16043           return Res;
16044         }
16045
16046       Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
16047                         CmpOp0, DAG.getConstant(1, CmpOp0.getValueType()));
16048       Cmp = ConvertCmpIfNecessary(Cmp, DAG);
16049
16050       SDValue Res =   // Res = 0 or -1.
16051         DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
16052                     DAG.getConstant(X86::COND_B, MVT::i8), Cmp);
16053
16054       if (isAllOnes(Op1) != (CondCode == X86::COND_E))
16055         Res = DAG.getNOT(DL, Res, Res.getValueType());
16056
16057       ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2);
16058       if (!N2C || !N2C->isNullValue())
16059         Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
16060       return Res;
16061     }
16062   }
16063
16064   // Look past (and (setcc_carry (cmp ...)), 1).
16065   if (Cond.getOpcode() == ISD::AND &&
16066       Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
16067     ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
16068     if (C && C->getAPIntValue() == 1)
16069       Cond = Cond.getOperand(0);
16070   }
16071
16072   // If condition flag is set by a X86ISD::CMP, then use it as the condition
16073   // setting operand in place of the X86ISD::SETCC.
16074   unsigned CondOpcode = Cond.getOpcode();
16075   if (CondOpcode == X86ISD::SETCC ||
16076       CondOpcode == X86ISD::SETCC_CARRY) {
16077     CC = Cond.getOperand(0);
16078
16079     SDValue Cmp = Cond.getOperand(1);
16080     unsigned Opc = Cmp.getOpcode();
16081     MVT VT = Op.getSimpleValueType();
16082
16083     bool IllegalFPCMov = false;
16084     if (VT.isFloatingPoint() && !VT.isVector() &&
16085         !isScalarFPTypeInSSEReg(VT))  // FPStack?
16086       IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
16087
16088     if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
16089         Opc == X86ISD::BT) { // FIXME
16090       Cond = Cmp;
16091       addTest = false;
16092     }
16093   } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
16094              CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
16095              ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
16096               Cond.getOperand(0).getValueType() != MVT::i8)) {
16097     SDValue LHS = Cond.getOperand(0);
16098     SDValue RHS = Cond.getOperand(1);
16099     unsigned X86Opcode;
16100     unsigned X86Cond;
16101     SDVTList VTs;
16102     switch (CondOpcode) {
16103     case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
16104     case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
16105     case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
16106     case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
16107     case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
16108     case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
16109     default: llvm_unreachable("unexpected overflowing operator");
16110     }
16111     if (CondOpcode == ISD::UMULO)
16112       VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
16113                           MVT::i32);
16114     else
16115       VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
16116
16117     SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS);
16118
16119     if (CondOpcode == ISD::UMULO)
16120       Cond = X86Op.getValue(2);
16121     else
16122       Cond = X86Op.getValue(1);
16123
16124     CC = DAG.getConstant(X86Cond, MVT::i8);
16125     addTest = false;
16126   }
16127
16128   if (addTest) {
16129     // Look past the truncate if the high bits are known zero.
16130     if (isTruncWithZeroHighBitsInput(Cond, DAG))
16131         Cond = Cond.getOperand(0);
16132
16133     // We know the result of AND is compared against zero. Try to match
16134     // it to BT.
16135     if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
16136       SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG);
16137       if (NewSetCC.getNode()) {
16138         CC = NewSetCC.getOperand(0);
16139         Cond = NewSetCC.getOperand(1);
16140         addTest = false;
16141       }
16142     }
16143   }
16144
16145   if (addTest) {
16146     CC = DAG.getConstant(X86::COND_NE, MVT::i8);
16147     Cond = EmitTest(Cond, X86::COND_NE, DL, DAG);
16148   }
16149
16150   // a <  b ? -1 :  0 -> RES = ~setcc_carry
16151   // a <  b ?  0 : -1 -> RES = setcc_carry
16152   // a >= b ? -1 :  0 -> RES = setcc_carry
16153   // a >= b ?  0 : -1 -> RES = ~setcc_carry
16154   if (Cond.getOpcode() == X86ISD::SUB) {
16155     Cond = ConvertCmpIfNecessary(Cond, DAG);
16156     unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
16157
16158     if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
16159         (isAllOnes(Op1) || isAllOnes(Op2)) && (isZero(Op1) || isZero(Op2))) {
16160       SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
16161                                 DAG.getConstant(X86::COND_B, MVT::i8), Cond);
16162       if (isAllOnes(Op1) != (CondCode == X86::COND_B))
16163         return DAG.getNOT(DL, Res, Res.getValueType());
16164       return Res;
16165     }
16166   }
16167
16168   // X86 doesn't have an i8 cmov. If both operands are the result of a truncate
16169   // widen the cmov and push the truncate through. This avoids introducing a new
16170   // branch during isel and doesn't add any extensions.
16171   if (Op.getValueType() == MVT::i8 &&
16172       Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
16173     SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
16174     if (T1.getValueType() == T2.getValueType() &&
16175         // Blacklist CopyFromReg to avoid partial register stalls.
16176         T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
16177       SDVTList VTs = DAG.getVTList(T1.getValueType(), MVT::Glue);
16178       SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VTs, T2, T1, CC, Cond);
16179       return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
16180     }
16181   }
16182
16183   // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
16184   // condition is true.
16185   SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
16186   SDValue Ops[] = { Op2, Op1, CC, Cond };
16187   return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops);
16188 }
16189
// LowerSIGN_EXTEND_AVX512 - Lower a vector sign-extend for the cases that
// LowerSIGN_EXTEND forwards here: a 512-bit result, or a source vector with i1
// elements.
//
// Op is the extend node; operand 0 is the vector being extended. Returns the
// replacement node, or an empty SDValue when the direct i1 VSEXT form does not
// apply and the result has neither 8 nor 16 elements.
16190 static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, const X86Subtarget *Subtarget,
16191                                        SelectionDAG &DAG) {
16192   MVT VT = Op->getSimpleValueType(0);
16193   SDValue In = Op->getOperand(0);
16194   MVT InVT = In.getSimpleValueType();
16195   MVT VTElt = VT.getVectorElementType();
16196   MVT InVTElt = InVT.getVectorElementType();
16197   SDLoc dl(Op);
16198
16199   // SKX processor: an i1-element source is extended with a single VSEXT when
        // the subtarget has BWI (result elements of at most 16 bits) or DQI (result
        // elements of at least 32 bits). Results of at most 256 bits additionally
        // require VLX; 512-bit results do not.
16200   if ((InVTElt == MVT::i1) &&
16201       (((Subtarget->hasBWI() && Subtarget->hasVLX() &&
16202         VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() <= 16)) ||
16203
16204        ((Subtarget->hasBWI() && VT.is512BitVector() &&
16205         VTElt.getSizeInBits() <= 16)) ||
16206
16207        ((Subtarget->hasDQI() && Subtarget->hasVLX() &&
16208         VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() >= 32)) ||
16209
16210        ((Subtarget->hasDQI() && VT.is512BitVector() &&
16211         VTElt.getSizeInBits() >= 32))))
16212     return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
16213
16214   unsigned int NumElts = VT.getVectorNumElements();
16215
        // Everything below only handles 8- and 16-element results; give up
        // otherwise by returning an empty SDValue.
16216   if (NumElts != 8 && NumElts != 16)
16217     return SDValue();
16218
        // 512-bit result from a non-i1 source: emit VSEXT. If the input is itself
        // a VSEXT/VZEXT, skip it and apply that same opcode to its operand,
        // extending straight to VT in one node.
16219   if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1) {
16220     if (In.getOpcode() == X86ISD::VSEXT || In.getOpcode() == X86ISD::VZEXT)
16221       return DAG.getNode(In.getOpcode(), dl, VT, In.getOperand(0));
16222     return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
16223   }
16224
16225   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16226   assert (InVT.getVectorElementType() == MVT::i1 && "Unexpected vector type");
16227
        // i1 source without the feature combination checked above: work in a
        // 512-bit type with the same element count (v8i64 or v16i32).
16228   MVT ExtVT = (NumElts == 8) ? MVT::v8i64 : MVT::v16i32;
        // All-ones scalar constant of ExtVT's element width.
16229   Constant *C = ConstantInt::get(*DAG.getContext(),
16230     APInt::getAllOnesValue(ExtVT.getScalarType().getSizeInBits()));
16231
        // Place the constant in the constant pool and load it back as a scalar,
        // using the alignment recorded on the constant-pool node.
16232   SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
16233   unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
16234   SDValue Ld = DAG.getLoad(ExtVT.getScalarType(), dl, DAG.getEntryNode(), CP,
16235                           MachinePointerInfo::getConstantPool(),
16236                           false, false, false, Alignment);
        // VBROADCASTM takes the i1 vector and the loaded all-ones scalar and
        // produces an ExtVT value.
        // NOTE(review): presumably each lane becomes all-ones or zero according to
        // the corresponding i1 element -- confirm against the node's definition.
16237   SDValue Brcst = DAG.getNode(X86ISD::VBROADCASTM, dl, ExtVT, In, Ld);
16238   if (VT.is512BitVector())
16239     return Brcst;
        // Narrower result requested: truncate the 512-bit value down to VT.
16240   return DAG.getNode(X86ISD::VTRUNC, dl, VT, Brcst);
16241 }
16242
// LowerSIGN_EXTEND - Custom lowering for a vector sign-extend.
//
// 512-bit results and i1-element sources are forwarded to
// LowerSIGN_EXTEND_AVX512. Otherwise only v4i32->v4i64, v8i16->v8i32 and
// v16i8->v16i16 are handled; any other type pair returns an empty SDValue.
// With hasInt256() a single VSEXT is emitted; without it the input is split
// into halves that are extended separately and concatenated.
16243 static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
16244                                 SelectionDAG &DAG) {
16245   MVT VT = Op->getSimpleValueType(0);
16246   SDValue In = Op->getOperand(0);
16247   MVT InVT = In.getSimpleValueType();
16248   SDLoc dl(Op);
16249
16250   if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
16251     return LowerSIGN_EXTEND_AVX512(Op, Subtarget, DAG);
16252
        // Bail out unless (VT, InVT) is one of the three supported pairs.
16253   if ((VT != MVT::v4i64 || InVT != MVT::v4i32) &&
16254       (VT != MVT::v8i32 || InVT != MVT::v8i16) &&
16255       (VT != MVT::v16i16 || InVT != MVT::v16i8))
16256     return SDValue();
16257
16258   if (Subtarget->hasInt256())
16259     return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
16260
16261   // Optimize vectors in AVX mode (no 256-bit integer support):
16262   // Sign extend  v8i16 to v8i32,
16263   //              v4i32 to v4i64 and v16i8 to v16i16.
16264   //
16265   // Divide the input vector into two parts;
16266   // for v4i32 the shuffle masks will be { 0, 1, -1, -1} and {2, 3, -1, -1}.
16267   // Use vpmovsx (X86ISD::VSEXT) to extend v4i32 -> v2i64; v8i16 -> v4i32,
16268   // then concat the two halves back to the original VT.
16269
16270   unsigned NumElems = InVT.getVectorNumElements();
16271   SDValue Undef = DAG.getUNDEF(InVT);
16272
        // Low half: elements [0, NumElems/2) stay in place, the rest are undef.
16273   SmallVector<int,8> ShufMask1(NumElems, -1);
16274   for (unsigned i = 0; i != NumElems/2; ++i)
16275     ShufMask1[i] = i;
16276
16277   SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask1[0]);
16278
        // High half: elements [NumElems/2, NumElems) are moved down to the low
        // positions, the rest are undef.
16279   SmallVector<int,8> ShufMask2(NumElems, -1);
16280   for (unsigned i = 0; i != NumElems/2; ++i)
16281     ShufMask2[i] = i + NumElems/2;
16282
16283   SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask2[0]);
16284
        // Result element type with half as many elements as VT.
16285   MVT HalfVT = MVT::getVectorVT(VT.getScalarType(),
16286                                 VT.getVectorNumElements()/2);
16287
16288   OpLo = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpLo);
16289   OpHi = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpHi);
16290
16291   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
16292 }
16293
16294 // Lower vector extended loads using a shuffle. If SSSE3 is not available we
16295 // may emit an illegal shuffle but the expansion is still better than scalar
16296 // code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise
16297 // we'll emit a shuffle and an arithmetic shift.
16298 // FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
16299 // TODO: It is possible to support ZExt by zeroing the undef values during
16300 // the shuffle phase or after the shuffle.
//
// Op must be a LoadSDNode producing an integer vector; only EXTLOAD and
// SEXTLOAD are accepted (asserted). Returns the extended value. On every path
// the users of the original load's chain result (value 1) are redirected to
// the chain of the replacement load(s).
16301 static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget,
16302                                  SelectionDAG &DAG) {
16303   MVT RegVT = Op.getSimpleValueType();
16304   assert(RegVT.isVector() && "We only custom lower vector sext loads.");
16305   assert(RegVT.isInteger() &&
16306          "We only custom lower integer vector sext loads.");
16307
16308   // Nothing useful we can do without SSE2 shuffles.
16309   assert(Subtarget->hasSSE2() && "We only custom lower sext loads with SSE2.");
16310
16311   LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
16312   SDLoc dl(Ld);
16313   EVT MemVT = Ld->getMemoryVT();
16314   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16315   unsigned RegSz = RegVT.getSizeInBits();
16316
16317   ISD::LoadExtType Ext = Ld->getExtensionType();
16318
16319   assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)
16320          && "Only anyext and sext are currently implemented.");
16321   assert(MemVT != RegVT && "Cannot extend to the same type");
16322   assert(MemVT.isVector() && "Must load a vector from memory");
16323
16324   unsigned NumElems = RegVT.getVectorNumElements();
16325   unsigned MemSz = MemVT.getSizeInBits();
16326   assert(RegSz > MemSz && "Register size must be greater than the mem size");
16327
16328   if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget->hasInt256()) {
16329     // The only way in which we have a legal 256-bit vector result but not the
16330     // integer 256-bit operations needed to directly lower a sextload is if we
16331     // have AVX1 but not AVX2. In that case, we can always emit a sextload to
16332     // a 128-bit vector and a normal sign_extend to 256-bits that should get
16333     // correctly legalized. We do this late to allow the canonical form of
16334     // sextload to persist throughout the rest of the DAG combiner -- it wants
16335     // to fold together any extensions it can, and so will fuse a sign_extend
16336     // of an sextload into a sextload targeting a wider value.
16337     SDValue Load;
16338     if (MemSz == 128) {
16339       // Just switch this to a normal load.
16340       assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, "
16341                                        "it must be a legal 128-bit vector "
16342                                        "type!");
16343       Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(),
16344                   Ld->getPointerInfo(), Ld->isVolatile(), Ld->isNonTemporal(),
16345                   Ld->isInvariant(), Ld->getAlignment());
16346     } else {
16347       assert(MemSz < 128 &&
16348              "Can't extend a type wider than 128 bits to a 256 bit vector!");
16349       // Do an sext load to a 128-bit vector type. We want to use the same
16350       // number of elements, but elements half as wide. This will end up being
16351       // recursively lowered by this routine, but will succeed as we definitely
16352       // have all the necessary features if we're using AVX1.
16353       EVT HalfEltVT =
16354           EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2);
16355       EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems);
16356       Load =
16357           DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(),
16358                          Ld->getPointerInfo(), MemVT, Ld->isVolatile(),
16359                          Ld->isNonTemporal(), Ld->isInvariant(),
16360                          Ld->getAlignment());
16361     }
16362
16363     // Replace chain users with the new chain.
16364     assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
16365     DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
16366
16367     // Finally, do a normal sign-extend to the desired register.
16368     return DAG.getSExtOrTrunc(Load, dl, RegVT);
16369   }
16370
16371   // All sizes must be a power of two.
16372   assert(isPowerOf2_32(RegSz * MemSz * NumElems) &&
16373          "Non-power-of-two elements are not custom lowered!");
16374
16375   // Attempt to load the original value using scalar loads.
16376   // Find the largest scalar type that divides the total loaded size.
        // (The loop keeps the last legal integer type whose width divides MemSz;
        // i8 is the fallback when none qualifies.)
16377   MVT SclrLoadTy = MVT::i8;
16378   for (MVT Tp : MVT::integer_valuetypes()) {
16379     if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
16380       SclrLoadTy = Tp;
16381     }
16382   }
16383
16384   // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
        // That is: if the chosen integer type is narrower than 64 bits, at least
        // 64 bits are loaded and f64 is legal, use f64 as the scalar load unit.
16385   if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
16386       (64 <= MemSz))
16387     SclrLoadTy = MVT::f64;
16388
16389   // Calculate the number of scalar loads that we need to perform
16390   // in order to load our vector from memory.
16391   unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
16392
16393   assert((Ext != ISD::SEXTLOAD || NumLoads == 1) &&
16394          "Can only lower sext loads with a single scalar load!");
16395
        // Width of the intermediate vectors built from the scalar loads. A 256-bit
        // sext result is assembled at half that width; the other 256-bit sext
        // case (no hasInt256) already returned above.
16396   unsigned loadRegZize = RegSz;
16397   if (Ext == ISD::SEXTLOAD && RegSz == 256)
16398     loadRegZize /= 2;
16399
16400   // Represent our vector as a sequence of elements which are the
16401   // largest scalar that we can load.
16402   EVT LoadUnitVecVT = EVT::getVectorVT(
16403       *DAG.getContext(), SclrLoadTy, loadRegZize / SclrLoadTy.getSizeInBits());
16404
16405   // Represent the data using the same element type that is stored in
16406   // memory. In practice, we ''widen'' MemVT.
16407   EVT WideVecVT =
16408       EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
16409                        loadRegZize / MemVT.getScalarType().getSizeInBits());
16410
16411   assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
16412          "Invalid vector type");
16413
16414   // We can't shuffle using an illegal type.
16415   assert(TLI.isTypeLegal(WideVecVT) &&
16416          "We only lower types that form legal widened vector types");
16417
16418   SmallVector<SDValue, 8> Chains;
16419   SDValue Ptr = Ld->getBasePtr();
        // Byte stride between consecutive scalar loads.
16420   SDValue Increment =
16421       DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, TLI.getPointerTy());
16422   SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
16423
        // NOTE(review): every iteration passes the original load's pointer info
        // and alignment unchanged even though Ptr advances by Increment each
        // time. This looks imprecise for NumLoads > 1 -- confirm whether an
        // offset pointer info / reduced alignment is needed.
16424   for (unsigned i = 0; i < NumLoads; ++i) {
16425     // Perform a single load.
16426     SDValue ScalarLoad =
16427         DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
16428                     Ld->isVolatile(), Ld->isNonTemporal(), Ld->isInvariant(),
16429                     Ld->getAlignment());
16430     Chains.push_back(ScalarLoad.getValue(1));
16431     // Create the first element type using SCALAR_TO_VECTOR in order to avoid
16432     // another round of DAGCombining.
16433     if (i == 0)
16434       Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
16435     else
16436       Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
16437                         ScalarLoad, DAG.getIntPtrConstant(i));
16438
16439     Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
16440   }
16441
        // Merge the chains of all scalar loads; TF replaces the original load's
        // chain result on each return path below.
16442   SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
16443
16444   // Bitcast the loaded value to a vector of the original element type, in
16445   // the size of the target vector type.
16446   SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Res);
        // Number of memory-sized elements that fit in one register-sized element.
16447   unsigned SizeRatio = RegSz / MemSz;
16448
16449   if (Ext == ISD::SEXTLOAD) {
16450     // If we have SSE4.1, we can directly emit a VSEXT node.
16451     if (Subtarget->hasSSE41()) {
16452       SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec);
16453       DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
16454       return Sext;
16455     }
16456
16457     // Otherwise we'll shuffle the small elements in the high bits of the
16458     // larger type and perform an arithmetic shift. If the shift is not legal
16459     // it's better to scalarize.
16460     assert(TLI.isOperationLegalOrCustom(ISD::SRA, RegVT) &&
16461            "We can't implement a sext load without an arithmetic right shift!");
16462
16463     // Redistribute the loaded elements into the different locations.
          // Source element i goes to the last of the SizeRatio narrow slots that
          // make up wide element i; all other slots stay undef (-1).
16464     SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
16465     for (unsigned i = 0; i != NumElems; ++i)
16466       ShuffleVec[i * SizeRatio + SizeRatio - 1] = i;
16467
16468     SDValue Shuff = DAG.getVectorShuffle(
16469         WideVecVT, dl, SlicedVec, DAG.getUNDEF(WideVecVT), &ShuffleVec[0]);
16470
16471     Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
16472
16473     // Build the arithmetic shift.
          // Shift right by the difference between the register and memory element
          // widths.
16474     unsigned Amt = RegVT.getVectorElementType().getSizeInBits() -
16475                    MemVT.getVectorElementType().getSizeInBits();
16476     Shuff =
16477         DAG.getNode(ISD::SRA, dl, RegVT, Shuff, DAG.getConstant(Amt, RegVT));
16478
16479     DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
16480     return Shuff;
16481   }
16482
16483   // Redistribute the loaded elements into the different locations.
        // Any-extend: source element i goes to the first narrow slot of wide
        // element i; the remaining slots are left undef (-1).
16484   SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
16485   for (unsigned i = 0; i != NumElems; ++i)
16486     ShuffleVec[i * SizeRatio] = i;
16487
16488   SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
16489                                        DAG.getUNDEF(WideVecVT), &ShuffleVec[0]);
16490
16491   // Bitcast to the requested type.
16492   Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
16493   DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
16494   return Shuff;
16495 }
16496
16497 // isAndOrOfSetCCs - Return true if Op is an ISD::AND or ISD::OR whose two
16498 // operands are both single-use X86ISD::SETCC nodes. Opc always receives
16499 // Op's opcode, even when false is returned.
16500 static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
16501   Opc = Op.getOpcode();
16502   if (Opc == ISD::AND || Opc == ISD::OR) {
16503     SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);
16504     return LHS.getOpcode() == X86ISD::SETCC && LHS.hasOneUse() &&
16505            RHS.getOpcode() == X86ISD::SETCC && RHS.hasOneUse();
16506   }
16507   return false;
16508 }
16509
16510 // isXor1OfSetCC - Return true if Op is (xor (X86ISD::SETCC ...), 1), i.e. an
16511 // ISD::XOR of a single-use X86ISD::SETCC node with the constant 1.
16512 static bool isXor1OfSetCC(SDValue Op) {
16513   if (Op.getOpcode() == ISD::XOR) {
16514     SDValue SetCC = Op.getOperand(0);
16515     // The right-hand side must be the constant 1.
16516     if (ConstantSDNode *One = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
16517       if (One->getAPIntValue() == 1)
16518         return SetCC.getOpcode() == X86ISD::SETCC && SetCC.hasOneUse();
16519   }
16520   return false;
16521 }
16522
16523 SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
16524   bool addTest = true;
16525   SDValue Chain = Op.getOperand(0);
16526   SDValue Cond  = Op.getOperand(1);
16527   SDValue Dest  = Op.getOperand(2);
16528   SDLoc dl(Op);
16529   SDValue CC;
16530   bool Inverted = false;
16531
16532   if (Cond.getOpcode() == ISD::SETCC) {
16533     // Check for setcc([su]{add,sub,mul}o == 0).
16534     if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
16535         isa<ConstantSDNode>(Cond.getOperand(1)) &&
16536         cast<ConstantSDNode>(Cond.getOperand(1))->isNullValue() &&
16537         Cond.getOperand(0).getResNo() == 1 &&
16538         (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
16539          Cond.getOperand(0).getOpcode() == ISD::UADDO ||
16540          Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
16541          Cond.getOperand(0).getOpcode() == ISD::USUBO ||
16542          Cond.getOperand(0).getOpcode() == ISD::SMULO ||
16543          Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
16544       Inverted = true;
16545       Cond = Cond.getOperand(0);
16546     } else {
16547       SDValue NewCond = LowerSETCC(Cond, DAG);
16548       if (NewCond.getNode())
16549         Cond = NewCond;
16550     }
16551   }
16552 #if 0
16553   // FIXME: LowerXALUO doesn't handle these!!
16554   else if (Cond.getOpcode() == X86ISD::ADD  ||
16555            Cond.getOpcode() == X86ISD::SUB  ||
16556            Cond.getOpcode() == X86ISD::SMUL ||
16557            Cond.getOpcode() == X86ISD::UMUL)
16558     Cond = LowerXALUO(Cond, DAG);
16559 #endif
16560
16561   // Look past (and (setcc_carry (cmp ...)), 1).
16562   if (Cond.getOpcode() == ISD::AND &&
16563       Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
16564     ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
16565     if (C && C->getAPIntValue() == 1)
16566       Cond = Cond.getOperand(0);
16567   }
16568
16569   // If condition flag is set by a X86ISD::CMP, then use it as the condition
16570   // setting operand in place of the X86ISD::SETCC.
16571   unsigned CondOpcode = Cond.getOpcode();
16572   if (CondOpcode == X86ISD::SETCC ||
16573       CondOpcode == X86ISD::SETCC_CARRY) {
16574     CC = Cond.getOperand(0);
16575
16576     SDValue Cmp = Cond.getOperand(1);
16577     unsigned Opc = Cmp.getOpcode();
16578     // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
16579     if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
16580       Cond = Cmp;
16581       addTest = false;
16582     } else {
16583       switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
16584       default: break;
16585       case X86::COND_O:
16586       case X86::COND_B:
16587         // These can only come from an arithmetic instruction with overflow,
16588         // e.g. SADDO, UADDO.
16589         Cond = Cond.getNode()->getOperand(1);
16590         addTest = false;
16591         break;
16592       }
16593     }
16594   }
16595   CondOpcode = Cond.getOpcode();
16596   if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
16597       CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
16598       ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
16599        Cond.getOperand(0).getValueType() != MVT::i8)) {
16600     SDValue LHS = Cond.getOperand(0);
16601     SDValue RHS = Cond.getOperand(1);
16602     unsigned X86Opcode;
16603     unsigned X86Cond;
16604     SDVTList VTs;
16605     // Keep this in sync with LowerXALUO, otherwise we might create redundant
16606     // instructions that can't be removed afterwards (i.e. X86ISD::ADD and
16607     // X86ISD::INC).
16608     switch (CondOpcode) {
16609     case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
16610     case ISD::SADDO:
16611       if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
16612         if (C->isOne()) {
16613           X86Opcode = X86ISD::INC; X86Cond = X86::COND_O;
16614           break;
16615         }
16616       X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
16617     case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
16618     case ISD::SSUBO:
16619       if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
16620         if (C->isOne()) {
16621           X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O;
16622           break;
16623         }
16624       X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
16625     case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
16626     case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
16627     default: llvm_unreachable("unexpected overflowing operator");
16628     }
16629     if (Inverted)
16630       X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond);
16631     if (CondOpcode == ISD::UMULO)
16632       VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
16633                           MVT::i32);
16634     else
16635       VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
16636
16637     SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS);
16638
16639     if (CondOpcode == ISD::UMULO)
16640       Cond = X86Op.getValue(2);
16641     else
16642       Cond = X86Op.getValue(1);
16643
16644     CC = DAG.getConstant(X86Cond, MVT::i8);
16645     addTest = false;
16646   } else {
16647     unsigned CondOpc;
16648     if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
16649       SDValue Cmp = Cond.getOperand(0).getOperand(1);
16650       if (CondOpc == ISD::OR) {
16651         // Also, recognize the pattern generated by an FCMP_UNE. We can emit
16652         // two branches instead of an explicit OR instruction with a
16653         // separate test.
16654         if (Cmp == Cond.getOperand(1).getOperand(1) &&
16655             isX86LogicalCmp(Cmp)) {
16656           CC = Cond.getOperand(0).getOperand(0);
16657           Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16658                               Chain, Dest, CC, Cmp);
16659           CC = Cond.getOperand(1).getOperand(0);
16660           Cond = Cmp;
16661           addTest = false;
16662         }
16663       } else { // ISD::AND
16664         // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
16665         // two branches instead of an explicit AND instruction with a
16666         // separate test. However, we only do this if this block doesn't
16667         // have a fall-through edge, because this requires an explicit
16668         // jmp when the condition is false.
16669         if (Cmp == Cond.getOperand(1).getOperand(1) &&
16670             isX86LogicalCmp(Cmp) &&
16671             Op.getNode()->hasOneUse()) {
16672           X86::CondCode CCode =
16673             (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
16674           CCode = X86::GetOppositeBranchCondition(CCode);
16675           CC = DAG.getConstant(CCode, MVT::i8);
16676           SDNode *User = *Op.getNode()->use_begin();
16677           // Look for an unconditional branch following this conditional branch.
16678           // We need this because we need to reverse the successors in order
16679           // to implement FCMP_OEQ.
16680           if (User->getOpcode() == ISD::BR) {
16681             SDValue FalseBB = User->getOperand(1);
16682             SDNode *NewBR =
16683               DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
16684             assert(NewBR == User);
16685             (void)NewBR;
16686             Dest = FalseBB;
16687
16688             Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16689                                 Chain, Dest, CC, Cmp);
16690             X86::CondCode CCode =
16691               (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
16692             CCode = X86::GetOppositeBranchCondition(CCode);
16693             CC = DAG.getConstant(CCode, MVT::i8);
16694             Cond = Cmp;
16695             addTest = false;
16696           }
16697         }
16698       }
16699     } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
16700       // Recognize (xorb (setcc), 1) patterns. The xor inverts the condition.
16701       // It should be transformed by the DAG combiner except when the condition
16702       // is set by an arithmetic-with-overflow node.
16703       X86::CondCode CCode =
16704         (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
16705       CCode = X86::GetOppositeBranchCondition(CCode);
16706       CC = DAG.getConstant(CCode, MVT::i8);
16707       Cond = Cond.getOperand(0).getOperand(1);
16708       addTest = false;
16709     } else if (Cond.getOpcode() == ISD::SETCC &&
16710                cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
16711       // For FCMP_OEQ, we can emit
16712       // two branches instead of an explicit AND instruction with a
16713       // separate test. However, we only do this if this block doesn't
16714       // have a fall-through edge, because this requires an explicit
16715       // jmp when the condition is false.
16716       if (Op.getNode()->hasOneUse()) {
16717         SDNode *User = *Op.getNode()->use_begin();
16718         // Look for an unconditional branch following this conditional branch.
16719         // We need this because we need to reverse the successors in order
16720         // to implement FCMP_OEQ.
16721         if (User->getOpcode() == ISD::BR) {
16722           SDValue FalseBB = User->getOperand(1);
16723           SDNode *NewBR =
16724             DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
16725           assert(NewBR == User);
16726           (void)NewBR;
16727           Dest = FalseBB;
16728
16729           SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
16730                                     Cond.getOperand(0), Cond.getOperand(1));
16731           Cmp = ConvertCmpIfNecessary(Cmp, DAG);
16732           CC = DAG.getConstant(X86::COND_NE, MVT::i8);
16733           Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16734                               Chain, Dest, CC, Cmp);
16735           CC = DAG.getConstant(X86::COND_P, MVT::i8);
16736           Cond = Cmp;
16737           addTest = false;
16738         }
16739       }
16740     } else if (Cond.getOpcode() == ISD::SETCC &&
16741                cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
16742       // For FCMP_UNE, we can emit
16743       // two branches instead of an explicit OR instruction with a
16744       // separate test. However, we only do this if this block doesn't
16745       // have a fall-through edge, because this requires an explicit
16746       // jmp when the condition is false.
16747       if (Op.getNode()->hasOneUse()) {
16748         SDNode *User = *Op.getNode()->use_begin();
16749         // Look for an unconditional branch following this conditional branch.
16750         // We need this because we need to reverse the successors in order
16751         // to implement FCMP_UNE.
16752         if (User->getOpcode() == ISD::BR) {
16753           SDValue FalseBB = User->getOperand(1);
16754           SDNode *NewBR =
16755             DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
16756           assert(NewBR == User);
16757           (void)NewBR;
16758
16759           SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
16760                                     Cond.getOperand(0), Cond.getOperand(1));
16761           Cmp = ConvertCmpIfNecessary(Cmp, DAG);
16762           CC = DAG.getConstant(X86::COND_NE, MVT::i8);
16763           Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16764                               Chain, Dest, CC, Cmp);
16765           CC = DAG.getConstant(X86::COND_NP, MVT::i8);
16766           Cond = Cmp;
16767           addTest = false;
16768           Dest = FalseBB;
16769         }
16770       }
16771     }
16772   }
16773
16774   if (addTest) {
16775     // Look pass the truncate if the high bits are known zero.
16776     if (isTruncWithZeroHighBitsInput(Cond, DAG))
16777         Cond = Cond.getOperand(0);
16778
16779     // We know the result of AND is compared against zero. Try to match
16780     // it to BT.
16781     if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
16782       SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG);
16783       if (NewSetCC.getNode()) {
16784         CC = NewSetCC.getOperand(0);
16785         Cond = NewSetCC.getOperand(1);
16786         addTest = false;
16787       }
16788     }
16789   }
16790
16791   if (addTest) {
16792     X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
16793     CC = DAG.getConstant(X86Cond, MVT::i8);
16794     Cond = EmitTest(Cond, X86Cond, dl, DAG);
16795   }
16796   Cond = ConvertCmpIfNecessary(Cond, DAG);
16797   return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16798                      Chain, Dest, CC, Cond);
16799 }
16800
16801 // Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
16802 // Calls to _alloca are needed to probe the stack when allocating more than 4k
16803 // bytes in one go. Touching the stack at 4K increments is necessary to ensure
16804 // that the guard pages used by the OS virtual memory manager are allocated in
16805 // correct sequence.
16806 SDValue
16807 X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
16808                                            SelectionDAG &DAG) const {
16809   MachineFunction &MF = DAG.getMachineFunction();
16810   bool SplitStack = MF.shouldSplitStack();
16811   bool Lower = (Subtarget->isOSWindows() && !Subtarget->isTargetMachO()) ||
16812                SplitStack;
16813   SDLoc dl(Op);
16814
16815   if (!Lower) {
16816     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16817     SDNode* Node = Op.getNode();
16818
16819     unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
16820     assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
16821         " not tell us which reg is the stack pointer!");
16822     EVT VT = Node->getValueType(0);
16823     SDValue Tmp1 = SDValue(Node, 0);
16824     SDValue Tmp2 = SDValue(Node, 1);
16825     SDValue Tmp3 = Node->getOperand(2);
16826     SDValue Chain = Tmp1.getOperand(0);
16827
16828     // Chain the dynamic stack allocation so that it doesn't modify the stack
16829     // pointer when other instructions are using the stack.
16830     Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, true),
16831         SDLoc(Node));
16832
16833     SDValue Size = Tmp2.getOperand(1);
16834     SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
16835     Chain = SP.getValue(1);
16836     unsigned Align = cast<ConstantSDNode>(Tmp3)->getZExtValue();
16837     const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
16838     unsigned StackAlign = TFI.getStackAlignment();
16839     Tmp1 = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
16840     if (Align > StackAlign)
16841       Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1,
16842           DAG.getConstant(-(uint64_t)Align, VT));
16843     Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain
16844
16845     Tmp2 = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, true),
16846         DAG.getIntPtrConstant(0, true), SDValue(),
16847         SDLoc(Node));
16848
16849     SDValue Ops[2] = { Tmp1, Tmp2 };
16850     return DAG.getMergeValues(Ops, dl);
16851   }
16852
16853   // Get the inputs.
16854   SDValue Chain = Op.getOperand(0);
16855   SDValue Size  = Op.getOperand(1);
16856   unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
16857   EVT VT = Op.getNode()->getValueType(0);
16858
16859   bool Is64Bit = Subtarget->is64Bit();
16860   EVT SPTy = getPointerTy();
16861
16862   if (SplitStack) {
16863     MachineRegisterInfo &MRI = MF.getRegInfo();
16864
16865     if (Is64Bit) {
16866       // The 64 bit implementation of segmented stacks needs to clobber both r10
16867       // r11. This makes it impossible to use it along with nested parameters.
16868       const Function *F = MF.getFunction();
16869
16870       for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
16871            I != E; ++I)
16872         if (I->hasNestAttr())
16873           report_fatal_error("Cannot use segmented stacks with functions that "
16874                              "have nested arguments.");
16875     }
16876
16877     const TargetRegisterClass *AddrRegClass =
16878       getRegClassFor(getPointerTy());
16879     unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
16880     Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
16881     SDValue Value = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
16882                                 DAG.getRegister(Vreg, SPTy));
16883     SDValue Ops1[2] = { Value, Chain };
16884     return DAG.getMergeValues(Ops1, dl);
16885   } else {
16886     SDValue Flag;
16887     const unsigned Reg = (Subtarget->isTarget64BitLP64() ? X86::RAX : X86::EAX);
16888
16889     Chain = DAG.getCopyToReg(Chain, dl, Reg, Size, Flag);
16890     Flag = Chain.getValue(1);
16891     SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
16892
16893     Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag);
16894
16895     const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
16896     unsigned SPReg = RegInfo->getStackRegister();
16897     SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
16898     Chain = SP.getValue(1);
16899
16900     if (Align) {
16901       SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
16902                        DAG.getConstant(-(uint64_t)Align, VT));
16903       Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
16904     }
16905
16906     SDValue Ops1[2] = { SP, Chain };
16907     return DAG.getMergeValues(Ops1, dl);
16908   }
16909 }
16910
16911 SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
16912   MachineFunction &MF = DAG.getMachineFunction();
16913   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
16914
16915   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
16916   SDLoc DL(Op);
16917
16918   if (!Subtarget->is64Bit() || Subtarget->isTargetWin64()) {
16919     // vastart just stores the address of the VarArgsFrameIndex slot into the
16920     // memory location argument.
16921     SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
16922                                    getPointerTy());
16923     return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
16924                         MachinePointerInfo(SV), false, false, 0);
16925   }
16926
16927   // __va_list_tag:
16928   //   gp_offset         (0 - 6 * 8)
16929   //   fp_offset         (48 - 48 + 8 * 16)
16930   //   overflow_arg_area (point to parameters coming in memory).
16931   //   reg_save_area
16932   SmallVector<SDValue, 8> MemOps;
16933   SDValue FIN = Op.getOperand(1);
16934   // Store gp_offset
16935   SDValue Store = DAG.getStore(Op.getOperand(0), DL,
16936                                DAG.getConstant(FuncInfo->getVarArgsGPOffset(),
16937                                                MVT::i32),
16938                                FIN, MachinePointerInfo(SV), false, false, 0);
16939   MemOps.push_back(Store);
16940
16941   // Store fp_offset
16942   FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
16943                     FIN, DAG.getIntPtrConstant(4));
16944   Store = DAG.getStore(Op.getOperand(0), DL,
16945                        DAG.getConstant(FuncInfo->getVarArgsFPOffset(),
16946                                        MVT::i32),
16947                        FIN, MachinePointerInfo(SV, 4), false, false, 0);
16948   MemOps.push_back(Store);
16949
16950   // Store ptr to overflow_arg_area
16951   FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
16952                     FIN, DAG.getIntPtrConstant(4));
16953   SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
16954                                     getPointerTy());
16955   Store = DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN,
16956                        MachinePointerInfo(SV, 8),
16957                        false, false, 0);
16958   MemOps.push_back(Store);
16959
16960   // Store ptr to reg_save_area.
16961   FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
16962                     FIN, DAG.getIntPtrConstant(8));
16963   SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
16964                                     getPointerTy());
16965   Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN,
16966                        MachinePointerInfo(SV, 16), false, false, 0);
16967   MemOps.push_back(Store);
16968   return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
16969 }
16970
16971 SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
16972   assert(Subtarget->is64Bit() &&
16973          "LowerVAARG only handles 64-bit va_arg!");
16974   assert((Subtarget->isTargetLinux() ||
16975           Subtarget->isTargetDarwin()) &&
16976           "Unhandled target in LowerVAARG");
16977   assert(Op.getNode()->getNumOperands() == 4);
16978   SDValue Chain = Op.getOperand(0);
16979   SDValue SrcPtr = Op.getOperand(1);
16980   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
16981   unsigned Align = Op.getConstantOperandVal(3);
16982   SDLoc dl(Op);
16983
16984   EVT ArgVT = Op.getNode()->getValueType(0);
16985   Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
16986   uint32_t ArgSize = getDataLayout()->getTypeAllocSize(ArgTy);
16987   uint8_t ArgMode;
16988
16989   // Decide which area this value should be read from.
16990   // TODO: Implement the AMD64 ABI in its entirety. This simple
16991   // selection mechanism works only for the basic types.
16992   if (ArgVT == MVT::f80) {
16993     llvm_unreachable("va_arg for f80 not yet implemented");
16994   } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
16995     ArgMode = 2;  // Argument passed in XMM register. Use fp_offset.
16996   } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
16997     ArgMode = 1;  // Argument passed in GPR64 register(s). Use gp_offset.
16998   } else {
16999     llvm_unreachable("Unhandled argument type in LowerVAARG");
17000   }
17001
17002   if (ArgMode == 2) {
17003     // Sanity Check: Make sure using fp_offset makes sense.
17004     assert(!DAG.getTarget().Options.UseSoftFloat &&
17005            !(DAG.getMachineFunction()
17006                 .getFunction()->getAttributes()
17007                 .hasAttribute(AttributeSet::FunctionIndex,
17008                               Attribute::NoImplicitFloat)) &&
17009            Subtarget->hasSSE1());
17010   }
17011
17012   // Insert VAARG_64 node into the DAG
17013   // VAARG_64 returns two values: Variable Argument Address, Chain
17014   SmallVector<SDValue, 11> InstOps;
17015   InstOps.push_back(Chain);
17016   InstOps.push_back(SrcPtr);
17017   InstOps.push_back(DAG.getConstant(ArgSize, MVT::i32));
17018   InstOps.push_back(DAG.getConstant(ArgMode, MVT::i8));
17019   InstOps.push_back(DAG.getConstant(Align, MVT::i32));
17020   SDVTList VTs = DAG.getVTList(getPointerTy(), MVT::Other);
17021   SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl,
17022                                           VTs, InstOps, MVT::i64,
17023                                           MachinePointerInfo(SV),
17024                                           /*Align=*/0,
17025                                           /*Volatile=*/false,
17026                                           /*ReadMem=*/true,
17027                                           /*WriteMem=*/true);
17028   Chain = VAARG.getValue(1);
17029
17030   // Load the next argument and return it
17031   return DAG.getLoad(ArgVT, dl,
17032                      Chain,
17033                      VAARG,
17034                      MachinePointerInfo(),
17035                      false, false, false, 0);
17036 }
17037
17038 static SDValue LowerVACOPY(SDValue Op, const X86Subtarget *Subtarget,
17039                            SelectionDAG &DAG) {
17040   // X86-64 va_list is a struct { i32, i32, i8*, i8* }.
17041   assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!");
17042   SDValue Chain = Op.getOperand(0);
17043   SDValue DstPtr = Op.getOperand(1);
17044   SDValue SrcPtr = Op.getOperand(2);
17045   const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
17046   const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
17047   SDLoc DL(Op);
17048
17049   return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
17050                        DAG.getIntPtrConstant(24), 8, /*isVolatile*/false,
17051                        false,
17052                        MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
17053 }
17054
17055 // getTargetVShiftByConstNode - Handle vector element shifts where the shift
17056 // amount is a constant. Takes immediate version of shift as input.
17057 static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, MVT VT,
17058                                           SDValue SrcOp, uint64_t ShiftAmt,
17059                                           SelectionDAG &DAG) {
17060   MVT ElementType = VT.getVectorElementType();
17061
17062   // Fold this packed shift into its first operand if ShiftAmt is 0.
17063   if (ShiftAmt == 0)
17064     return SrcOp;
17065
17066   // Check for ShiftAmt >= element width
17067   if (ShiftAmt >= ElementType.getSizeInBits()) {
17068     if (Opc == X86ISD::VSRAI)
17069       ShiftAmt = ElementType.getSizeInBits() - 1;
17070     else
17071       return DAG.getConstant(0, VT);
17072   }
17073
17074   assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
17075          && "Unknown target vector shift-by-constant node");
17076
17077   // Fold this packed vector shift into a build vector if SrcOp is a
17078   // vector of Constants or UNDEFs, and SrcOp valuetype is the same as VT.
17079   if (VT == SrcOp.getSimpleValueType() &&
17080       ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
17081     SmallVector<SDValue, 8> Elts;
17082     unsigned NumElts = SrcOp->getNumOperands();
17083     ConstantSDNode *ND;
17084
17085     switch(Opc) {
17086     default: llvm_unreachable(nullptr);
17087     case X86ISD::VSHLI:
17088       for (unsigned i=0; i!=NumElts; ++i) {
17089         SDValue CurrentOp = SrcOp->getOperand(i);
17090         if (CurrentOp->getOpcode() == ISD::UNDEF) {
17091           Elts.push_back(CurrentOp);
17092           continue;
17093         }
17094         ND = cast<ConstantSDNode>(CurrentOp);
17095         const APInt &C = ND->getAPIntValue();
17096         Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), ElementType));
17097       }
17098       break;
17099     case X86ISD::VSRLI:
17100       for (unsigned i=0; i!=NumElts; ++i) {
17101         SDValue CurrentOp = SrcOp->getOperand(i);
17102         if (CurrentOp->getOpcode() == ISD::UNDEF) {
17103           Elts.push_back(CurrentOp);
17104           continue;
17105         }
17106         ND = cast<ConstantSDNode>(CurrentOp);
17107         const APInt &C = ND->getAPIntValue();
17108         Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), ElementType));
17109       }
17110       break;
17111     case X86ISD::VSRAI:
17112       for (unsigned i=0; i!=NumElts; ++i) {
17113         SDValue CurrentOp = SrcOp->getOperand(i);
17114         if (CurrentOp->getOpcode() == ISD::UNDEF) {
17115           Elts.push_back(CurrentOp);
17116           continue;
17117         }
17118         ND = cast<ConstantSDNode>(CurrentOp);
17119         const APInt &C = ND->getAPIntValue();
17120         Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), ElementType));
17121       }
17122       break;
17123     }
17124
17125     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Elts);
17126   }
17127
17128   return DAG.getNode(Opc, dl, VT, SrcOp, DAG.getConstant(ShiftAmt, MVT::i8));
17129 }
17130
17131 // getTargetVShiftNode - Handle vector element shifts where the shift amount
17132 // may or may not be a constant. Takes immediate version of shift as input.
17133 static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT,
17134                                    SDValue SrcOp, SDValue ShAmt,
17135                                    SelectionDAG &DAG) {
17136   MVT SVT = ShAmt.getSimpleValueType();
17137   assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");
17138
17139   // Catch shift-by-constant.
17140   if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
17141     return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
17142                                       CShAmt->getZExtValue(), DAG);
17143
17144   // Change opcode to non-immediate version
17145   switch (Opc) {
17146     default: llvm_unreachable("Unknown target vector shift node");
17147     case X86ISD::VSHLI: Opc = X86ISD::VSHL; break;
17148     case X86ISD::VSRLI: Opc = X86ISD::VSRL; break;
17149     case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;
17150   }
17151
17152   const X86Subtarget &Subtarget =
17153       static_cast<const X86Subtarget &>(DAG.getSubtarget());
17154   if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
17155       ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) {
17156     // Let the shuffle legalizer expand this shift amount node.
17157     SDValue Op0 = ShAmt.getOperand(0);
17158     Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(Op0), MVT::v8i16, Op0);
17159     ShAmt = getShuffleVectorZeroOrUndef(Op0, 0, true, &Subtarget, DAG);
17160   } else {
17161     // Need to build a vector containing shift amount.
17162     // SSE/AVX packed shifts only use the lower 64-bit of the shift count.
17163     SmallVector<SDValue, 4> ShOps;
17164     ShOps.push_back(ShAmt);
17165     if (SVT == MVT::i32) {
17166       ShOps.push_back(DAG.getConstant(0, SVT));
17167       ShOps.push_back(DAG.getUNDEF(SVT));
17168     }
17169     ShOps.push_back(DAG.getUNDEF(SVT));
17170
17171     MVT BVT = SVT == MVT::i32 ? MVT::v4i32 : MVT::v2i64;
17172     ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, BVT, ShOps);
17173   }
17174
17175   // The return type has to be a 128-bit type with the same element
17176   // type as the input type.
17177   MVT EltVT = VT.getVectorElementType();
17178   EVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());
17179
17180   ShAmt = DAG.getNode(ISD::BITCAST, dl, ShVT, ShAmt);
17181   return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
17182 }
17183
17184 /// \brief Return (and \p Op, \p Mask) for compare instructions or
17185 /// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
17186 /// necessary casting for \p Mask when lowering masking intrinsics.
17187 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
17188                                     SDValue PreservedSrc,
17189                                     const X86Subtarget *Subtarget,
17190                                     SelectionDAG &DAG) {
17191     EVT VT = Op.getValueType();
17192     EVT MaskVT = EVT::getVectorVT(*DAG.getContext(),
17193                                   MVT::i1, VT.getVectorNumElements());
17194     EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17195                                      Mask.getValueType().getSizeInBits());
17196     SDLoc dl(Op);
17197
17198     assert(MaskVT.isSimple() && "invalid mask type");
17199
17200     if (isAllOnes(Mask))
17201       return Op;
17202
17203     // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements
17204     // are extracted by EXTRACT_SUBVECTOR.
17205     SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
17206                               DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
17207                               DAG.getIntPtrConstant(0));
17208
17209     switch (Op.getOpcode()) {
17210       default: break;
17211       case X86ISD::PCMPEQM:
17212       case X86ISD::PCMPGTM:
17213       case X86ISD::CMPM:
17214       case X86ISD::CMPMU:
17215         return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
17216     }
17217     if (PreservedSrc.getOpcode() == ISD::UNDEF)
17218       PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
17219     return DAG.getNode(ISD::VSELECT, dl, VT, VMask, Op, PreservedSrc);
17220 }
17221
17222 /// \brief Creates an SDNode for a predicated scalar operation.
17223 /// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
17224 /// The mask is comming as MVT::i8 and it should be truncated
17225 /// to MVT::i1 while lowering masking intrinsics.
17226 /// The main difference between ScalarMaskingNode and VectorMaskingNode is using
17227 /// "X86select" instead of "vselect". We just can't create the "vselect" node for
17228 /// a scalar instruction.
17229 static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
17230                                     SDValue PreservedSrc,
17231                                     const X86Subtarget *Subtarget,
17232                                     SelectionDAG &DAG) {
17233     if (isAllOnes(Mask))
17234       return Op;
17235
17236     EVT VT = Op.getValueType();
17237     SDLoc dl(Op);
17238     // The mask should be of type MVT::i1
17239     SDValue IMask = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Mask);
17240
17241     if (PreservedSrc.getOpcode() == ISD::UNDEF)
17242       PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
17243     return DAG.getNode(X86ISD::SELECT, dl, VT, IMask, Op, PreservedSrc);
17244 }
17245
17246 static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
17247                                        SelectionDAG &DAG) {
17248   SDLoc dl(Op);
17249   unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
17250   EVT VT = Op.getValueType();
17251   const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
17252   if (IntrData) {
17253     switch(IntrData->Type) {
17254     case INTR_TYPE_1OP:
17255       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1));
17256     case INTR_TYPE_2OP:
17257       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
17258         Op.getOperand(2));
17259     case INTR_TYPE_3OP:
17260       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
17261         Op.getOperand(2), Op.getOperand(3));
17262     case INTR_TYPE_1OP_MASK_RM: {
17263       SDValue Src = Op.getOperand(1);
17264       SDValue Src0 = Op.getOperand(2);
17265       SDValue Mask = Op.getOperand(3);
17266       SDValue RoundingMode = Op.getOperand(4);
17267       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
17268                                               RoundingMode),
17269                                   Mask, Src0, Subtarget, DAG);
17270     }
17271     case INTR_TYPE_SCALAR_MASK_RM: {
17272       SDValue Src1 = Op.getOperand(1);
17273       SDValue Src2 = Op.getOperand(2);
17274       SDValue Src0 = Op.getOperand(3);
17275       SDValue Mask = Op.getOperand(4);
17276       SDValue RoundingMode = Op.getOperand(5);
17277       return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
17278                                               RoundingMode),
17279                                   Mask, Src0, Subtarget, DAG);
17280     }
17281     case INTR_TYPE_2OP_MASK: {
17282       SDValue Mask = Op.getOperand(4);
17283       SDValue PassThru = Op.getOperand(3);
17284       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
17285       if (IntrWithRoundingModeOpcode != 0) {
17286         unsigned Round = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue();
17287         if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) {
17288           return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
17289                                       dl, Op.getValueType(),
17290                                       Op.getOperand(1), Op.getOperand(2),
17291                                       Op.getOperand(3), Op.getOperand(5)),
17292                                       Mask, PassThru, Subtarget, DAG);
17293         }
17294       }
17295       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
17296                                               Op.getOperand(1),
17297                                               Op.getOperand(2)),
17298                                   Mask, PassThru, Subtarget, DAG);
17299     }
17300     case FMA_OP_MASK: {
17301       SDValue Src1 = Op.getOperand(1);
17302       SDValue Src2 = Op.getOperand(2);
17303       SDValue Src3 = Op.getOperand(3);
17304       SDValue Mask = Op.getOperand(4);
17305       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
17306       if (IntrWithRoundingModeOpcode != 0) {
17307         SDValue Rnd = Op.getOperand(5);
17308         if (cast<ConstantSDNode>(Rnd)->getZExtValue() !=
17309             X86::STATIC_ROUNDING::CUR_DIRECTION)
17310           return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
17311                                                   dl, Op.getValueType(),
17312                                                   Src1, Src2, Src3, Rnd),
17313                                       Mask, Src1, Subtarget, DAG);
17314       }
17315       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
17316                                               dl, Op.getValueType(),
17317                                               Src1, Src2, Src3),
17318                                   Mask, Src1, Subtarget, DAG);
17319     }
17320     case CMP_MASK:
17321     case CMP_MASK_CC: {
17322       // Comparison intrinsics with masks.
17323       // Example of transformation:
17324       // (i8 (int_x86_avx512_mask_pcmpeq_q_128
17325       //             (v2i64 %a), (v2i64 %b), (i8 %mask))) ->
17326       // (i8 (bitcast
17327       //   (v8i1 (insert_subvector undef,
17328       //           (v2i1 (and (PCMPEQM %a, %b),
17329       //                      (extract_subvector
17330       //                         (v8i1 (bitcast %mask)), 0))), 0))))
17331       EVT VT = Op.getOperand(1).getValueType();
17332       EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17333                                     VT.getVectorNumElements());
17334       SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3);
17335       EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17336                                        Mask.getValueType().getSizeInBits());
17337       SDValue Cmp;
17338       if (IntrData->Type == CMP_MASK_CC) {
17339         Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
17340                     Op.getOperand(2), Op.getOperand(3));
17341       } else {
17342         assert(IntrData->Type == CMP_MASK && "Unexpected intrinsic type!");
17343         Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
17344                     Op.getOperand(2));
17345       }
17346       SDValue CmpMask = getVectorMaskingNode(Cmp, Mask,
17347                                              DAG.getTargetConstant(0, MaskVT),
17348                                              Subtarget, DAG);
17349       SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
17350                                 DAG.getUNDEF(BitcastVT), CmpMask,
17351                                 DAG.getIntPtrConstant(0));
17352       return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
17353     }
17354     case COMI: { // Comparison intrinsics
17355       ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
17356       SDValue LHS = Op.getOperand(1);
17357       SDValue RHS = Op.getOperand(2);
17358       unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG);
17359       assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!");
17360       SDValue Cond = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
17361       SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17362                                   DAG.getConstant(X86CC, MVT::i8), Cond);
17363       return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
17364     }
17365     case VSHIFT:
17366       return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
17367                                  Op.getOperand(1), Op.getOperand(2), DAG);
17368     case VSHIFT_MASK:
17369       return getVectorMaskingNode(getTargetVShiftNode(IntrData->Opc0, dl,
17370                                                       Op.getSimpleValueType(),
17371                                                       Op.getOperand(1),
17372                                                       Op.getOperand(2), DAG),
17373                                   Op.getOperand(4), Op.getOperand(3), Subtarget,
17374                                   DAG);
17375     case COMPRESS_EXPAND_IN_REG: {
17376       SDValue Mask = Op.getOperand(3);
17377       SDValue DataToCompress = Op.getOperand(1);
17378       SDValue PassThru = Op.getOperand(2);
17379       if (isAllOnes(Mask)) // return data as is
17380         return Op.getOperand(1);
17381       EVT VT = Op.getValueType();
17382       EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17383                                     VT.getVectorNumElements());
17384       EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17385                                        Mask.getValueType().getSizeInBits());
17386       SDLoc dl(Op);
17387       SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
17388                                   DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
17389                                   DAG.getIntPtrConstant(0));
17390
17391       return DAG.getNode(IntrData->Opc0, dl, VT, VMask, DataToCompress,
17392                          PassThru);
17393     }
17394     case BLEND: {
17395       SDValue Mask = Op.getOperand(3);
17396       EVT VT = Op.getValueType();
17397       EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17398                                     VT.getVectorNumElements());
17399       EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17400                                        Mask.getValueType().getSizeInBits());
17401       SDLoc dl(Op);
17402       SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
17403                                   DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
17404                                   DAG.getIntPtrConstant(0));
17405       return DAG.getNode(IntrData->Opc0, dl, VT, VMask, Op.getOperand(1),
17406                          Op.getOperand(2));
17407     }
17408     default:
17409       break;
17410     }
17411   }
17412
17413   switch (IntNo) {
17414   default: return SDValue();    // Don't custom lower most intrinsics.
17415
17416   case Intrinsic::x86_avx512_mask_valign_q_512:
17417   case Intrinsic::x86_avx512_mask_valign_d_512:
17418     // Vector source operands are swapped.
17419     return getVectorMaskingNode(DAG.getNode(X86ISD::VALIGN, dl,
17420                                             Op.getValueType(), Op.getOperand(2),
17421                                             Op.getOperand(1),
17422                                             Op.getOperand(3)),
17423                                 Op.getOperand(5), Op.getOperand(4),
17424                                 Subtarget, DAG);
17425
17426   // ptest and testp intrinsics. The intrinsic these come from are designed to
17427   // return an integer value, not just an instruction so lower it to the ptest
17428   // or testp pattern and a setcc for the result.
17429   case Intrinsic::x86_sse41_ptestz:
17430   case Intrinsic::x86_sse41_ptestc:
17431   case Intrinsic::x86_sse41_ptestnzc:
17432   case Intrinsic::x86_avx_ptestz_256:
17433   case Intrinsic::x86_avx_ptestc_256:
17434   case Intrinsic::x86_avx_ptestnzc_256:
17435   case Intrinsic::x86_avx_vtestz_ps:
17436   case Intrinsic::x86_avx_vtestc_ps:
17437   case Intrinsic::x86_avx_vtestnzc_ps:
17438   case Intrinsic::x86_avx_vtestz_pd:
17439   case Intrinsic::x86_avx_vtestc_pd:
17440   case Intrinsic::x86_avx_vtestnzc_pd:
17441   case Intrinsic::x86_avx_vtestz_ps_256:
17442   case Intrinsic::x86_avx_vtestc_ps_256:
17443   case Intrinsic::x86_avx_vtestnzc_ps_256:
17444   case Intrinsic::x86_avx_vtestz_pd_256:
17445   case Intrinsic::x86_avx_vtestc_pd_256:
17446   case Intrinsic::x86_avx_vtestnzc_pd_256: {
17447     bool IsTestPacked = false;
17448     unsigned X86CC;
17449     switch (IntNo) {
17450     default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
17451     case Intrinsic::x86_avx_vtestz_ps:
17452     case Intrinsic::x86_avx_vtestz_pd:
17453     case Intrinsic::x86_avx_vtestz_ps_256:
17454     case Intrinsic::x86_avx_vtestz_pd_256:
17455       IsTestPacked = true; // Fallthrough
17456     case Intrinsic::x86_sse41_ptestz:
17457     case Intrinsic::x86_avx_ptestz_256:
17458       // ZF = 1
17459       X86CC = X86::COND_E;
17460       break;
17461     case Intrinsic::x86_avx_vtestc_ps:
17462     case Intrinsic::x86_avx_vtestc_pd:
17463     case Intrinsic::x86_avx_vtestc_ps_256:
17464     case Intrinsic::x86_avx_vtestc_pd_256:
17465       IsTestPacked = true; // Fallthrough
17466     case Intrinsic::x86_sse41_ptestc:
17467     case Intrinsic::x86_avx_ptestc_256:
17468       // CF = 1
17469       X86CC = X86::COND_B;
17470       break;
17471     case Intrinsic::x86_avx_vtestnzc_ps:
17472     case Intrinsic::x86_avx_vtestnzc_pd:
17473     case Intrinsic::x86_avx_vtestnzc_ps_256:
17474     case Intrinsic::x86_avx_vtestnzc_pd_256:
17475       IsTestPacked = true; // Fallthrough
17476     case Intrinsic::x86_sse41_ptestnzc:
17477     case Intrinsic::x86_avx_ptestnzc_256:
17478       // ZF and CF = 0
17479       X86CC = X86::COND_A;
17480       break;
17481     }
17482
17483     SDValue LHS = Op.getOperand(1);
17484     SDValue RHS = Op.getOperand(2);
17485     unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST;
17486     SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
17487     SDValue CC = DAG.getConstant(X86CC, MVT::i8);
17488     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test);
17489     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
17490   }
17491   case Intrinsic::x86_avx512_kortestz_w:
17492   case Intrinsic::x86_avx512_kortestc_w: {
17493     unsigned X86CC = (IntNo == Intrinsic::x86_avx512_kortestz_w)? X86::COND_E: X86::COND_B;
17494     SDValue LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, Op.getOperand(1));
17495     SDValue RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, Op.getOperand(2));
17496     SDValue CC = DAG.getConstant(X86CC, MVT::i8);
17497     SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
17498     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i1, CC, Test);
17499     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
17500   }
17501
17502   case Intrinsic::x86_sse42_pcmpistria128:
17503   case Intrinsic::x86_sse42_pcmpestria128:
17504   case Intrinsic::x86_sse42_pcmpistric128:
17505   case Intrinsic::x86_sse42_pcmpestric128:
17506   case Intrinsic::x86_sse42_pcmpistrio128:
17507   case Intrinsic::x86_sse42_pcmpestrio128:
17508   case Intrinsic::x86_sse42_pcmpistris128:
17509   case Intrinsic::x86_sse42_pcmpestris128:
17510   case Intrinsic::x86_sse42_pcmpistriz128:
17511   case Intrinsic::x86_sse42_pcmpestriz128: {
17512     unsigned Opcode;
17513     unsigned X86CC;
17514     switch (IntNo) {
17515     default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
17516     case Intrinsic::x86_sse42_pcmpistria128:
17517       Opcode = X86ISD::PCMPISTRI;
17518       X86CC = X86::COND_A;
17519       break;
17520     case Intrinsic::x86_sse42_pcmpestria128:
17521       Opcode = X86ISD::PCMPESTRI;
17522       X86CC = X86::COND_A;
17523       break;
17524     case Intrinsic::x86_sse42_pcmpistric128:
17525       Opcode = X86ISD::PCMPISTRI;
17526       X86CC = X86::COND_B;
17527       break;
17528     case Intrinsic::x86_sse42_pcmpestric128:
17529       Opcode = X86ISD::PCMPESTRI;
17530       X86CC = X86::COND_B;
17531       break;
17532     case Intrinsic::x86_sse42_pcmpistrio128:
17533       Opcode = X86ISD::PCMPISTRI;
17534       X86CC = X86::COND_O;
17535       break;
17536     case Intrinsic::x86_sse42_pcmpestrio128:
17537       Opcode = X86ISD::PCMPESTRI;
17538       X86CC = X86::COND_O;
17539       break;
17540     case Intrinsic::x86_sse42_pcmpistris128:
17541       Opcode = X86ISD::PCMPISTRI;
17542       X86CC = X86::COND_S;
17543       break;
17544     case Intrinsic::x86_sse42_pcmpestris128:
17545       Opcode = X86ISD::PCMPESTRI;
17546       X86CC = X86::COND_S;
17547       break;
17548     case Intrinsic::x86_sse42_pcmpistriz128:
17549       Opcode = X86ISD::PCMPISTRI;
17550       X86CC = X86::COND_E;
17551       break;
17552     case Intrinsic::x86_sse42_pcmpestriz128:
17553       Opcode = X86ISD::PCMPESTRI;
17554       X86CC = X86::COND_E;
17555       break;
17556     }
17557     SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
17558     SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
17559     SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps);
17560     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17561                                 DAG.getConstant(X86CC, MVT::i8),
17562                                 SDValue(PCMP.getNode(), 1));
17563     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
17564   }
17565
17566   case Intrinsic::x86_sse42_pcmpistri128:
17567   case Intrinsic::x86_sse42_pcmpestri128: {
17568     unsigned Opcode;
17569     if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
17570       Opcode = X86ISD::PCMPISTRI;
17571     else
17572       Opcode = X86ISD::PCMPESTRI;
17573
17574     SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
17575     SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
17576     return DAG.getNode(Opcode, dl, VTs, NewOps);
17577   }
17578   }
17579 }
17580
17581 static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
17582                               SDValue Src, SDValue Mask, SDValue Base,
17583                               SDValue Index, SDValue ScaleOp, SDValue Chain,
17584                               const X86Subtarget * Subtarget) {
17585   SDLoc dl(Op);
17586   ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
17587   assert(C && "Invalid scale type");
17588   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
17589   EVT MaskVT = MVT::getVectorVT(MVT::i1,
17590                              Index.getSimpleValueType().getVectorNumElements());
17591   SDValue MaskInReg;
17592   ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
17593   if (MaskC)
17594     MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT);
17595   else
17596     MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
17597   SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
17598   SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
17599   SDValue Segment = DAG.getRegister(0, MVT::i32);
17600   if (Src.getOpcode() == ISD::UNDEF)
17601     Src = getZeroVector(Op.getValueType(), Subtarget, DAG, dl);
17602   SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain};
17603   SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
17604   SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
17605   return DAG.getMergeValues(RetOps, dl);
17606 }
17607
17608 static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
17609                                SDValue Src, SDValue Mask, SDValue Base,
17610                                SDValue Index, SDValue ScaleOp, SDValue Chain) {
17611   SDLoc dl(Op);
17612   ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
17613   assert(C && "Invalid scale type");
17614   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
17615   SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
17616   SDValue Segment = DAG.getRegister(0, MVT::i32);
17617   EVT MaskVT = MVT::getVectorVT(MVT::i1,
17618                              Index.getSimpleValueType().getVectorNumElements());
17619   SDValue MaskInReg;
17620   ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
17621   if (MaskC)
17622     MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT);
17623   else
17624     MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
17625   SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
17626   SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain};
17627   SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
17628   return SDValue(Res, 1);
17629 }
17630
17631 static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
17632                                SDValue Mask, SDValue Base, SDValue Index,
17633                                SDValue ScaleOp, SDValue Chain) {
17634   SDLoc dl(Op);
17635   ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
17636   assert(C && "Invalid scale type");
17637   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
17638   SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
17639   SDValue Segment = DAG.getRegister(0, MVT::i32);
17640   EVT MaskVT =
17641     MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
17642   SDValue MaskInReg;
17643   ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
17644   if (MaskC)
17645     MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT);
17646   else
17647     MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
17648   //SDVTList VTs = DAG.getVTList(MVT::Other);
17649   SDValue Ops[] = {MaskInReg, Base, Scale, Index, Disp, Segment, Chain};
17650   SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
17651   return SDValue(Res, 0);
17652 }
17653
17654 // getReadPerformanceCounter - Handles the lowering of builtin intrinsics that
17655 // read performance monitor counters (x86_rdpmc).
17656 static void getReadPerformanceCounter(SDNode *N, SDLoc DL,
17657                               SelectionDAG &DAG, const X86Subtarget *Subtarget,
17658                               SmallVectorImpl<SDValue> &Results) {
17659   assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
17660   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
17661   SDValue LO, HI;
17662
17663   // The ECX register is used to select the index of the performance counter
17664   // to read.
17665   SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX,
17666                                    N->getOperand(2));
17667   SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain);
17668
17669   // Reads the content of a 64-bit performance counter and returns it in the
17670   // registers EDX:EAX.
17671   if (Subtarget->is64Bit()) {
17672     LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
17673     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
17674                             LO.getValue(2));
17675   } else {
17676     LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
17677     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
17678                             LO.getValue(2));
17679   }
17680   Chain = HI.getValue(1);
17681
17682   if (Subtarget->is64Bit()) {
17683     // The EAX register is loaded with the low-order 32 bits. The EDX register
17684     // is loaded with the supported high-order bits of the counter.
17685     SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
17686                               DAG.getConstant(32, MVT::i8));
17687     Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
17688     Results.push_back(Chain);
17689     return;
17690   }
17691
17692   // Use a buildpair to merge the two 32-bit values into a 64-bit one.
17693   SDValue Ops[] = { LO, HI };
17694   SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
17695   Results.push_back(Pair);
17696   Results.push_back(Chain);
17697 }
17698
17699 // getReadTimeStampCounter - Handles the lowering of builtin intrinsics that
17700 // read the time stamp counter (x86_rdtsc and x86_rdtscp). This function is
17701 // also used to custom lower READCYCLECOUNTER nodes.
17702 static void getReadTimeStampCounter(SDNode *N, SDLoc DL, unsigned Opcode,
17703                               SelectionDAG &DAG, const X86Subtarget *Subtarget,
17704                               SmallVectorImpl<SDValue> &Results) {
17705   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
17706   SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0));
17707   SDValue LO, HI;
17708
17709   // The processor's time-stamp counter (a 64-bit MSR) is stored into the
17710   // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
17711   // and the EAX register is loaded with the low-order 32 bits.
17712   if (Subtarget->is64Bit()) {
17713     LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
17714     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
17715                             LO.getValue(2));
17716   } else {
17717     LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
17718     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
17719                             LO.getValue(2));
17720   }
17721   SDValue Chain = HI.getValue(1);
17722
17723   if (Opcode == X86ISD::RDTSCP_DAG) {
17724     assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
17725
17726     // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
17727     // the ECX register. Add 'ecx' explicitly to the chain.
17728     SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32,
17729                                      HI.getValue(2));
17730     // Explicitly store the content of ECX at the location passed in input
17731     // to the 'rdtscp' intrinsic.
17732     Chain = DAG.getStore(ecx.getValue(1), DL, ecx, N->getOperand(2),
17733                          MachinePointerInfo(), false, false, 0);
17734   }
17735
17736   if (Subtarget->is64Bit()) {
17737     // The EDX register is loaded with the high-order 32 bits of the MSR, and
17738     // the EAX register is loaded with the low-order 32 bits.
17739     SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
17740                               DAG.getConstant(32, MVT::i8));
17741     Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
17742     Results.push_back(Chain);
17743     return;
17744   }
17745
17746   // Use a buildpair to merge the two 32-bit values into a 64-bit one.
17747   SDValue Ops[] = { LO, HI };
17748   SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
17749   Results.push_back(Pair);
17750   Results.push_back(Chain);
17751 }
17752
17753 static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget,
17754                                      SelectionDAG &DAG) {
17755   SmallVector<SDValue, 2> Results;
17756   SDLoc DL(Op);
17757   getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget,
17758                           Results);
17759   return DAG.getMergeValues(Results, DL);
17760 }
17761
17762
17763 static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
17764                                       SelectionDAG &DAG) {
17765   unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
17766
17767   const IntrinsicData* IntrData = getIntrinsicWithChain(IntNo);
17768   if (!IntrData)
17769     return SDValue();
17770
17771   SDLoc dl(Op);
17772   switch(IntrData->Type) {
17773   default:
17774     llvm_unreachable("Unknown Intrinsic Type");
17775     break;
17776   case RDSEED:
17777   case RDRAND: {
17778     // Emit the node with the right value type.
17779     SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other);
17780     SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
17781
17782     // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
17783     // Otherwise return the value from Rand, which is always 0, casted to i32.
17784     SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
17785                       DAG.getConstant(1, Op->getValueType(1)),
17786                       DAG.getConstant(X86::COND_B, MVT::i32),
17787                       SDValue(Result.getNode(), 1) };
17788     SDValue isValid = DAG.getNode(X86ISD::CMOV, dl,
17789                                   DAG.getVTList(Op->getValueType(1), MVT::Glue),
17790                                   Ops);
17791
17792     // Return { result, isValid, chain }.
17793     return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
17794                        SDValue(Result.getNode(), 2));
17795   }
17796   case GATHER: {
17797   //gather(v1, mask, index, base, scale);
17798     SDValue Chain = Op.getOperand(0);
17799     SDValue Src   = Op.getOperand(2);
17800     SDValue Base  = Op.getOperand(3);
17801     SDValue Index = Op.getOperand(4);
17802     SDValue Mask  = Op.getOperand(5);
17803     SDValue Scale = Op.getOperand(6);
17804     return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain,
17805                           Subtarget);
17806   }
17807   case SCATTER: {
17808   //scatter(base, mask, index, v1, scale);
17809     SDValue Chain = Op.getOperand(0);
17810     SDValue Base  = Op.getOperand(2);
17811     SDValue Mask  = Op.getOperand(3);
17812     SDValue Index = Op.getOperand(4);
17813     SDValue Src   = Op.getOperand(5);
17814     SDValue Scale = Op.getOperand(6);
17815     return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain);
17816   }
17817   case PREFETCH: {
17818     SDValue Hint = Op.getOperand(6);
17819     unsigned HintVal;
17820     if (dyn_cast<ConstantSDNode> (Hint) == nullptr ||
17821         (HintVal = dyn_cast<ConstantSDNode> (Hint)->getZExtValue()) > 1)
17822       llvm_unreachable("Wrong prefetch hint in intrinsic: should be 0 or 1");
17823     unsigned Opcode = (HintVal ? IntrData->Opc1 : IntrData->Opc0);
17824     SDValue Chain = Op.getOperand(0);
17825     SDValue Mask  = Op.getOperand(2);
17826     SDValue Index = Op.getOperand(3);
17827     SDValue Base  = Op.getOperand(4);
17828     SDValue Scale = Op.getOperand(5);
17829     return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain);
17830   }
17831   // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
17832   case RDTSC: {
17833     SmallVector<SDValue, 2> Results;
17834     getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget, Results);
17835     return DAG.getMergeValues(Results, dl);
17836   }
17837   // Read Performance Monitoring Counters.
17838   case RDPMC: {
17839     SmallVector<SDValue, 2> Results;
17840     getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results);
17841     return DAG.getMergeValues(Results, dl);
17842   }
17843   // XTEST intrinsics.
17844   case XTEST: {
17845     SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
17846     SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
17847     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17848                                 DAG.getConstant(X86::COND_NE, MVT::i8),
17849                                 InTrans);
17850     SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
17851     return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
17852                        Ret, SDValue(InTrans.getNode(), 1));
17853   }
17854   // ADC/ADCX/SBB
17855   case ADX: {
17856     SmallVector<SDValue, 2> Results;
17857     SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
17858     SDVTList VTs = DAG.getVTList(Op.getOperand(3)->getValueType(0), MVT::Other);
17859     SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2),
17860                                 DAG.getConstant(-1, MVT::i8));
17861     SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3),
17862                               Op.getOperand(4), GenCF.getValue(1));
17863     SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0),
17864                                  Op.getOperand(5), MachinePointerInfo(),
17865                                  false, false, 0);
17866     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17867                                 DAG.getConstant(X86::COND_B, MVT::i8),
17868                                 Res.getValue(1));
17869     Results.push_back(SetCC);
17870     Results.push_back(Store);
17871     return DAG.getMergeValues(Results, dl);
17872   }
17873   case COMPRESS_TO_MEM: {
17874     SDLoc dl(Op);
17875     SDValue Mask = Op.getOperand(4);
17876     SDValue DataToCompress = Op.getOperand(3);
17877     SDValue Addr = Op.getOperand(2);
17878     SDValue Chain = Op.getOperand(0);
17879
17880     if (isAllOnes(Mask)) // return just a store
17881       return DAG.getStore(Chain, dl, DataToCompress, Addr,
17882                           MachinePointerInfo(), false, false, 0);
17883
17884     EVT VT = DataToCompress.getValueType();
17885     EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17886                                   VT.getVectorNumElements());
17887     EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17888                                      Mask.getValueType().getSizeInBits());
17889     SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
17890                                 DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
17891                                 DAG.getIntPtrConstant(0));
17892
17893     SDValue Compressed =  DAG.getNode(IntrData->Opc0, dl, VT, VMask,
17894                                       DataToCompress, DAG.getUNDEF(VT));
17895     return DAG.getStore(Chain, dl, Compressed, Addr,
17896                         MachinePointerInfo(), false, false, 0);
17897   }
17898   case EXPAND_FROM_MEM: {
17899     SDLoc dl(Op);
17900     SDValue Mask = Op.getOperand(4);
17901     SDValue PathThru = Op.getOperand(3);
17902     SDValue Addr = Op.getOperand(2);
17903     SDValue Chain = Op.getOperand(0);
17904     EVT VT = Op.getValueType();
17905
17906     if (isAllOnes(Mask)) // return just a load
17907       return DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(), false, false,
17908                          false, 0);
17909     EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17910                                   VT.getVectorNumElements());
17911     EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17912                                      Mask.getValueType().getSizeInBits());
17913     SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
17914                                 DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
17915                                 DAG.getIntPtrConstant(0));
17916
17917     SDValue DataToExpand = DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(),
17918                                    false, false, false, 0);
17919
17920     SmallVector<SDValue, 2> Results;
17921     Results.push_back(DAG.getNode(IntrData->Opc0, dl, VT, VMask, DataToExpand,
17922                                   PathThru));
17923     Results.push_back(Chain);
17924     return DAG.getMergeValues(Results, dl);
17925   }
17926   }
17927 }
17928
17929 SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
17930                                            SelectionDAG &DAG) const {
17931   MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
17932   MFI->setReturnAddressIsTaken(true);
17933
17934   if (verifyReturnAddressArgumentIsConstant(Op, DAG))
17935     return SDValue();
17936
17937   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
17938   SDLoc dl(Op);
17939   EVT PtrVT = getPointerTy();
17940
17941   if (Depth > 0) {
17942     SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
17943     const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
17944     SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), PtrVT);
17945     return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
17946                        DAG.getNode(ISD::ADD, dl, PtrVT,
17947                                    FrameAddr, Offset),
17948                        MachinePointerInfo(), false, false, false, 0);
17949   }
17950
17951   // Just load the return address.
17952   SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
17953   return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
17954                      RetAddrFI, MachinePointerInfo(), false, false, false, 0);
17955 }
17956
17957 SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
17958   MachineFunction &MF = DAG.getMachineFunction();
17959   MachineFrameInfo *MFI = MF.getFrameInfo();
17960   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
17961   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
17962   EVT VT = Op.getValueType();
17963
17964   MFI->setFrameAddressIsTaken(true);
17965
17966   if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
17967     // Depth > 0 makes no sense on targets which use Windows unwind codes.  It
17968     // is not possible to crawl up the stack without looking at the unwind codes
17969     // simultaneously.
17970     int FrameAddrIndex = FuncInfo->getFAIndex();
17971     if (!FrameAddrIndex) {
17972       // Set up a frame object for the return address.
17973       unsigned SlotSize = RegInfo->getSlotSize();
17974       FrameAddrIndex = MF.getFrameInfo()->CreateFixedObject(
17975           SlotSize, /*Offset=*/INT64_MIN, /*IsImmutable=*/false);
17976       FuncInfo->setFAIndex(FrameAddrIndex);
17977     }
17978     return DAG.getFrameIndex(FrameAddrIndex, VT);
17979   }
17980
17981   unsigned FrameReg =
17982       RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
17983   SDLoc dl(Op);  // FIXME probably not meaningful
17984   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
17985   assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
17986           (FrameReg == X86::EBP && VT == MVT::i32)) &&
17987          "Invalid Frame Register!");
17988   SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
17989   while (Depth--)
17990     FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
17991                             MachinePointerInfo(),
17992                             false, false, false, 0);
17993   return FrameAddr;
17994 }
17995
17996 // FIXME? Maybe this could be a TableGen attribute on some registers and
17997 // this table could be generated automatically from RegInfo.
17998 unsigned X86TargetLowering::getRegisterByName(const char* RegName,
17999                                               EVT VT) const {
18000   unsigned Reg = StringSwitch<unsigned>(RegName)
18001                        .Case("esp", X86::ESP)
18002                        .Case("rsp", X86::RSP)
18003                        .Default(0);
18004   if (Reg)
18005     return Reg;
18006   report_fatal_error("Invalid register name global variable");
18007 }
18008
18009 SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
18010                                                      SelectionDAG &DAG) const {
18011   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
18012   return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize());
18013 }
18014
18015 SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
18016   SDValue Chain     = Op.getOperand(0);
18017   SDValue Offset    = Op.getOperand(1);
18018   SDValue Handler   = Op.getOperand(2);
18019   SDLoc dl      (Op);
18020
18021   EVT PtrVT = getPointerTy();
18022   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
18023   unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
18024   assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
18025           (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
18026          "Invalid Frame Register!");
18027   SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
18028   unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
18029
18030   SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
18031                                  DAG.getIntPtrConstant(RegInfo->getSlotSize()));
18032   StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
18033   Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(),
18034                        false, false, 0);
18035   Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
18036
18037   return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
18038                      DAG.getRegister(StoreAddrReg, PtrVT));
18039 }
18040
18041 SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
18042                                                SelectionDAG &DAG) const {
18043   SDLoc DL(Op);
18044   return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
18045                      DAG.getVTList(MVT::i32, MVT::Other),
18046                      Op.getOperand(0), Op.getOperand(1));
18047 }
18048
18049 SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
18050                                                 SelectionDAG &DAG) const {
18051   SDLoc DL(Op);
18052   return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
18053                      Op.getOperand(0), Op.getOperand(1));
18054 }
18055
18056 static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
18057   return Op.getOperand(0);
18058 }
18059
18060 SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
18061                                                 SelectionDAG &DAG) const {
18062   SDValue Root = Op.getOperand(0);
18063   SDValue Trmp = Op.getOperand(1); // trampoline
18064   SDValue FPtr = Op.getOperand(2); // nested function
18065   SDValue Nest = Op.getOperand(3); // 'nest' parameter value
18066   SDLoc dl (Op);
18067
18068   const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
18069   const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
18070
18071   if (Subtarget->is64Bit()) {
18072     SDValue OutChains[6];
18073
18074     // Large code-model.
18075     const unsigned char JMP64r  = 0xFF; // 64-bit jmp through register opcode.
18076     const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
18077
18078     const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
18079     const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
18080
18081     const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
18082
18083     // Load the pointer to the nested function into R11.
18084     unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
18085     SDValue Addr = Trmp;
18086     OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
18087                                 Addr, MachinePointerInfo(TrmpAddr),
18088                                 false, false, 0);
18089
18090     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
18091                        DAG.getConstant(2, MVT::i64));
18092     OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
18093                                 MachinePointerInfo(TrmpAddr, 2),
18094                                 false, false, 2);
18095
18096     // Load the 'nest' parameter value into R10.
18097     // R10 is specified in X86CallingConv.td
18098     OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
18099     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
18100                        DAG.getConstant(10, MVT::i64));
18101     OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
18102                                 Addr, MachinePointerInfo(TrmpAddr, 10),
18103                                 false, false, 0);
18104
18105     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
18106                        DAG.getConstant(12, MVT::i64));
18107     OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
18108                                 MachinePointerInfo(TrmpAddr, 12),
18109                                 false, false, 2);
18110
18111     // Jump to the nested function.
18112     OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
18113     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
18114                        DAG.getConstant(20, MVT::i64));
18115     OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
18116                                 Addr, MachinePointerInfo(TrmpAddr, 20),
18117                                 false, false, 0);
18118
18119     unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
18120     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
18121                        DAG.getConstant(22, MVT::i64));
18122     OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr,
18123                                 MachinePointerInfo(TrmpAddr, 22),
18124                                 false, false, 0);
18125
18126     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
18127   } else {
18128     const Function *Func =
18129       cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
18130     CallingConv::ID CC = Func->getCallingConv();
18131     unsigned NestReg;
18132
18133     switch (CC) {
18134     default:
18135       llvm_unreachable("Unsupported calling convention");
18136     case CallingConv::C:
18137     case CallingConv::X86_StdCall: {
18138       // Pass 'nest' parameter in ECX.
18139       // Must be kept in sync with X86CallingConv.td
18140       NestReg = X86::ECX;
18141
18142       // Check that ECX wasn't needed by an 'inreg' parameter.
18143       FunctionType *FTy = Func->getFunctionType();
18144       const AttributeSet &Attrs = Func->getAttributes();
18145
18146       if (!Attrs.isEmpty() && !Func->isVarArg()) {
18147         unsigned InRegCount = 0;
18148         unsigned Idx = 1;
18149
18150         for (FunctionType::param_iterator I = FTy->param_begin(),
18151              E = FTy->param_end(); I != E; ++I, ++Idx)
18152           if (Attrs.hasAttribute(Idx, Attribute::InReg))
18153             // FIXME: should only count parameters that are lowered to integers.
18154             InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32;
18155
18156         if (InRegCount > 2) {
18157           report_fatal_error("Nest register in use - reduce number of inreg"
18158                              " parameters!");
18159         }
18160       }
18161       break;
18162     }
18163     case CallingConv::X86_FastCall:
18164     case CallingConv::X86_ThisCall:
18165     case CallingConv::Fast:
18166       // Pass 'nest' parameter in EAX.
18167       // Must be kept in sync with X86CallingConv.td
18168       NestReg = X86::EAX;
18169       break;
18170     }
18171
18172     SDValue OutChains[4];
18173     SDValue Addr, Disp;
18174
18175     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
18176                        DAG.getConstant(10, MVT::i32));
18177     Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
18178
18179     // This is storing the opcode for MOV32ri.
18180     const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
18181     const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
18182     OutChains[0] = DAG.getStore(Root, dl,
18183                                 DAG.getConstant(MOV32ri|N86Reg, MVT::i8),
18184                                 Trmp, MachinePointerInfo(TrmpAddr),
18185                                 false, false, 0);
18186
18187     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
18188                        DAG.getConstant(1, MVT::i32));
18189     OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
18190                                 MachinePointerInfo(TrmpAddr, 1),
18191                                 false, false, 1);
18192
18193     const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
18194     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
18195                        DAG.getConstant(5, MVT::i32));
18196     OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr,
18197                                 MachinePointerInfo(TrmpAddr, 5),
18198                                 false, false, 1);
18199
18200     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
18201                        DAG.getConstant(6, MVT::i32));
18202     OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
18203                                 MachinePointerInfo(TrmpAddr, 6),
18204                                 false, false, 1);
18205
18206     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
18207   }
18208 }
18209
18210 SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
18211                                             SelectionDAG &DAG) const {
18212   /*
18213    The rounding mode is in bits 11:10 of FPSR, and has the following
18214    settings:
18215      00 Round to nearest
18216      01 Round to -inf
18217      10 Round to +inf
18218      11 Round to 0
18219
18220   FLT_ROUNDS, on the other hand, expects the following:
18221     -1 Undefined
18222      0 Round to 0
18223      1 Round to nearest
18224      2 Round to +inf
18225      3 Round to -inf
18226
18227   To perform the conversion, we do:
18228     (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3)
18229   */
18230
18231   MachineFunction &MF = DAG.getMachineFunction();
18232   const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
18233   unsigned StackAlignment = TFI.getStackAlignment();
18234   MVT VT = Op.getSimpleValueType();
18235   SDLoc DL(Op);
18236
18237   // Save FP Control Word to stack slot
18238   int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false);
18239   SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
18240
18241   MachineMemOperand *MMO =
18242    MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
18243                            MachineMemOperand::MOStore, 2, 2);
18244
18245   SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
18246   SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
18247                                           DAG.getVTList(MVT::Other),
18248                                           Ops, MVT::i16, MMO);
18249
18250   // Load FP Control Word from stack slot
18251   SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot,
18252                             MachinePointerInfo(), false, false, false, 0);
18253
18254   // Transform as necessary
18255   SDValue CWD1 =
18256     DAG.getNode(ISD::SRL, DL, MVT::i16,
18257                 DAG.getNode(ISD::AND, DL, MVT::i16,
18258                             CWD, DAG.getConstant(0x800, MVT::i16)),
18259                 DAG.getConstant(11, MVT::i8));
18260   SDValue CWD2 =
18261     DAG.getNode(ISD::SRL, DL, MVT::i16,
18262                 DAG.getNode(ISD::AND, DL, MVT::i16,
18263                             CWD, DAG.getConstant(0x400, MVT::i16)),
18264                 DAG.getConstant(9, MVT::i8));
18265
18266   SDValue RetVal =
18267     DAG.getNode(ISD::AND, DL, MVT::i16,
18268                 DAG.getNode(ISD::ADD, DL, MVT::i16,
18269                             DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),
18270                             DAG.getConstant(1, MVT::i16)),
18271                 DAG.getConstant(3, MVT::i16));
18272
18273   return DAG.getNode((VT.getSizeInBits() < 16 ?
18274                       ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
18275 }
18276
18277 static SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) {
18278   MVT VT = Op.getSimpleValueType();
18279   EVT OpVT = VT;
18280   unsigned NumBits = VT.getSizeInBits();
18281   SDLoc dl(Op);
18282
18283   Op = Op.getOperand(0);
18284   if (VT == MVT::i8) {
18285     // Zero extend to i32 since there is not an i8 bsr.
18286     OpVT = MVT::i32;
18287     Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
18288   }
18289
18290   // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
18291   SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
18292   Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
18293
18294   // If src is zero (i.e. bsr sets ZF), returns NumBits.
18295   SDValue Ops[] = {
18296     Op,
18297     DAG.getConstant(NumBits+NumBits-1, OpVT),
18298     DAG.getConstant(X86::COND_E, MVT::i8),
18299     Op.getValue(1)
18300   };
18301   Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
18302
18303   // Finally xor with NumBits-1.
18304   Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));
18305
18306   if (VT == MVT::i8)
18307     Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
18308   return Op;
18309 }
18310
18311 static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) {
18312   MVT VT = Op.getSimpleValueType();
18313   EVT OpVT = VT;
18314   unsigned NumBits = VT.getSizeInBits();
18315   SDLoc dl(Op);
18316
18317   Op = Op.getOperand(0);
18318   if (VT == MVT::i8) {
18319     // Zero extend to i32 since there is not an i8 bsr.
18320     OpVT = MVT::i32;
18321     Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
18322   }
18323
18324   // Issue a bsr (scan bits in reverse).
18325   SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
18326   Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
18327
18328   // And xor with NumBits-1.
18329   Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));
18330
18331   if (VT == MVT::i8)
18332     Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
18333   return Op;
18334 }
18335
18336 static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
18337   MVT VT = Op.getSimpleValueType();
18338   unsigned NumBits = VT.getSizeInBits();
18339   SDLoc dl(Op);
18340   Op = Op.getOperand(0);
18341
18342   // Issue a bsf (scan bits forward) which also sets EFLAGS.
18343   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
18344   Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op);
18345
18346   // If src is zero (i.e. bsf sets ZF), returns NumBits.
18347   SDValue Ops[] = {
18348     Op,
18349     DAG.getConstant(NumBits, VT),
18350     DAG.getConstant(X86::COND_E, MVT::i8),
18351     Op.getValue(1)
18352   };
18353   return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
18354 }
18355
18356 // Lower256IntArith - Break a 256-bit integer operation into two new 128-bit
18357 // ones, and then concatenate the result back.
18358 static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
18359   MVT VT = Op.getSimpleValueType();
18360
18361   assert(VT.is256BitVector() && VT.isInteger() &&
18362          "Unsupported value type for operation");
18363
18364   unsigned NumElems = VT.getVectorNumElements();
18365   SDLoc dl(Op);
18366
18367   // Extract the LHS vectors
18368   SDValue LHS = Op.getOperand(0);
18369   SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
18370   SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
18371
18372   // Extract the RHS vectors
18373   SDValue RHS = Op.getOperand(1);
18374   SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl);
18375   SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl);
18376
18377   MVT EltVT = VT.getVectorElementType();
18378   MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
18379
18380   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
18381                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
18382                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
18383 }
18384
18385 static SDValue LowerADD(SDValue Op, SelectionDAG &DAG) {
18386   assert(Op.getSimpleValueType().is256BitVector() &&
18387          Op.getSimpleValueType().isInteger() &&
18388          "Only handle AVX 256-bit vector integer operation");
18389   return Lower256IntArith(Op, DAG);
18390 }
18391
18392 static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) {
18393   assert(Op.getSimpleValueType().is256BitVector() &&
18394          Op.getSimpleValueType().isInteger() &&
18395          "Only handle AVX 256-bit vector integer operation");
18396   return Lower256IntArith(Op, DAG);
18397 }
18398
18399 static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
18400                         SelectionDAG &DAG) {
18401   SDLoc dl(Op);
18402   MVT VT = Op.getSimpleValueType();
18403
18404   // Decompose 256-bit ops into smaller 128-bit ops.
18405   if (VT.is256BitVector() && !Subtarget->hasInt256())
18406     return Lower256IntArith(Op, DAG);
18407
18408   SDValue A = Op.getOperand(0);
18409   SDValue B = Op.getOperand(1);
18410
18411   // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
18412   if (VT == MVT::v4i32) {
18413     assert(Subtarget->hasSSE2() && !Subtarget->hasSSE41() &&
18414            "Should not custom lower when pmuldq is available!");
18415
18416     // Extract the odd parts.
18417     static const int UnpackMask[] = { 1, -1, 3, -1 };
18418     SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
18419     SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
18420
18421     // Multiply the even parts.
18422     SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B);
18423     // Now multiply odd parts.
18424     SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds);
18425
18426     Evens = DAG.getNode(ISD::BITCAST, dl, VT, Evens);
18427     Odds = DAG.getNode(ISD::BITCAST, dl, VT, Odds);
18428
18429     // Merge the two vectors back together with a shuffle. This expands into 2
18430     // shuffles.
18431     static const int ShufMask[] = { 0, 4, 2, 6 };
18432     return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
18433   }
18434
18435   assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
18436          "Only know how to lower V2I64/V4I64/V8I64 multiply");
18437
18438   //  Ahi = psrlqi(a, 32);
18439   //  Bhi = psrlqi(b, 32);
18440   //
18441   //  AloBlo = pmuludq(a, b);
18442   //  AloBhi = pmuludq(a, Bhi);
18443   //  AhiBlo = pmuludq(Ahi, b);
18444
18445   //  AloBhi = psllqi(AloBhi, 32);
18446   //  AhiBlo = psllqi(AhiBlo, 32);
18447   //  return AloBlo + AloBhi + AhiBlo;
18448
18449   SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
18450   SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
18451
18452   // Bit cast to 32-bit vectors for MULUDQ
18453   EVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 :
18454                                   (VT == MVT::v4i64) ? MVT::v8i32 : MVT::v16i32;
18455   A = DAG.getNode(ISD::BITCAST, dl, MulVT, A);
18456   B = DAG.getNode(ISD::BITCAST, dl, MulVT, B);
18457   Ahi = DAG.getNode(ISD::BITCAST, dl, MulVT, Ahi);
18458   Bhi = DAG.getNode(ISD::BITCAST, dl, MulVT, Bhi);
18459
18460   SDValue AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
18461   SDValue AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
18462   SDValue AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
18463
18464   AloBhi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AloBhi, 32, DAG);
18465   AhiBlo = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AhiBlo, 32, DAG);
18466
18467   SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi);
18468   return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
18469 }
18470
18471 SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
18472   assert(Subtarget->isTargetWin64() && "Unexpected target");
18473   EVT VT = Op.getValueType();
18474   assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
18475          "Unexpected return type for lowering");
18476
18477   RTLIB::Libcall LC;
18478   bool isSigned;
18479   switch (Op->getOpcode()) {
18480   default: llvm_unreachable("Unexpected request for libcall!");
18481   case ISD::SDIV:      isSigned = true;  LC = RTLIB::SDIV_I128;    break;
18482   case ISD::UDIV:      isSigned = false; LC = RTLIB::UDIV_I128;    break;
18483   case ISD::SREM:      isSigned = true;  LC = RTLIB::SREM_I128;    break;
18484   case ISD::UREM:      isSigned = false; LC = RTLIB::UREM_I128;    break;
18485   case ISD::SDIVREM:   isSigned = true;  LC = RTLIB::SDIVREM_I128; break;
18486   case ISD::UDIVREM:   isSigned = false; LC = RTLIB::UDIVREM_I128; break;
18487   }
18488
18489   SDLoc dl(Op);
18490   SDValue InChain = DAG.getEntryNode();
18491
18492   TargetLowering::ArgListTy Args;
18493   TargetLowering::ArgListEntry Entry;
18494   for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
18495     EVT ArgVT = Op->getOperand(i).getValueType();
18496     assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
18497            "Unexpected argument type for lowering");
18498     SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
18499     Entry.Node = StackPtr;
18500     InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MachinePointerInfo(),
18501                            false, false, 16);
18502     Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
18503     Entry.Ty = PointerType::get(ArgTy,0);
18504     Entry.isSExt = false;
18505     Entry.isZExt = false;
18506     Args.push_back(Entry);
18507   }
18508
18509   SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
18510                                          getPointerTy());
18511
18512   TargetLowering::CallLoweringInfo CLI(DAG);
18513   CLI.setDebugLoc(dl).setChain(InChain)
18514     .setCallee(getLibcallCallingConv(LC),
18515                static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()),
18516                Callee, std::move(Args), 0)
18517     .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);
18518
18519   std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
18520   return DAG.getNode(ISD::BITCAST, dl, VT, CallInfo.first);
18521 }
18522
18523 static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget,
18524                              SelectionDAG &DAG) {
18525   SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
18526   EVT VT = Op0.getValueType();
18527   SDLoc dl(Op);
18528
18529   assert((VT == MVT::v4i32 && Subtarget->hasSSE2()) ||
18530          (VT == MVT::v8i32 && Subtarget->hasInt256()));
18531
18532   // PMULxD operations multiply each even value (starting at 0) of LHS with
18533   // the related value of RHS and produce a widen result.
18534   // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
18535   // => <2 x i64> <ae|cg>
18536   //
18537   // In other word, to have all the results, we need to perform two PMULxD:
18538   // 1. one with the even values.
18539   // 2. one with the odd values.
18540   // To achieve #2, with need to place the odd values at an even position.
18541   //
18542   // Place the odd value at an even position (basically, shift all values 1
18543   // step to the left):
18544   const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1};
18545   // <a|b|c|d> => <b|undef|d|undef>
18546   SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0, Mask);
18547   // <e|f|g|h> => <f|undef|h|undef>
18548   SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1, Mask);
18549
18550   // Emit two multiplies, one for the lower 2 ints and one for the higher 2
18551   // ints.
18552   MVT MulVT = VT == MVT::v4i32 ? MVT::v2i64 : MVT::v4i64;
18553   bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
18554   unsigned Opcode =
18555       (!IsSigned || !Subtarget->hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
18556   // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
18557   // => <2 x i64> <ae|cg>
18558   SDValue Mul1 = DAG.getNode(ISD::BITCAST, dl, VT,
18559                              DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
18560   // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
18561   // => <2 x i64> <bf|dh>
18562   SDValue Mul2 = DAG.getNode(ISD::BITCAST, dl, VT,
18563                              DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1));
18564
18565   // Shuffle it back into the right order.
18566   SDValue Highs, Lows;
18567   if (VT == MVT::v8i32) {
18568     const int HighMask[] = {1, 9, 3, 11, 5, 13, 7, 15};
18569     Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
18570     const int LowMask[] = {0, 8, 2, 10, 4, 12, 6, 14};
18571     Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
18572   } else {
18573     const int HighMask[] = {1, 5, 3, 7};
18574     Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
18575     const int LowMask[] = {0, 4, 2, 6};
18576     Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
18577   }
18578
18579   // If we have a signed multiply but no PMULDQ fix up the high parts of a
18580   // unsigned multiply.
18581   if (IsSigned && !Subtarget->hasSSE41()) {
18582     SDValue ShAmt =
18583         DAG.getConstant(31, DAG.getTargetLoweringInfo().getShiftAmountTy(VT));
18584     SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
18585                              DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
18586     SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
18587                              DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0);
18588
18589     SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
18590     Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
18591   }
18592
18593   // The first result of MUL_LOHI is actually the low value, followed by the
18594   // high value.
18595   SDValue Ops[] = {Lows, Highs};
18596   return DAG.getMergeValues(Ops, dl);
18597 }
18598
18599 static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
18600                                          const X86Subtarget *Subtarget) {
18601   MVT VT = Op.getSimpleValueType();
18602   SDLoc dl(Op);
18603   SDValue R = Op.getOperand(0);
18604   SDValue Amt = Op.getOperand(1);
18605
18606   // Optimize shl/srl/sra with constant shift amount.
18607   if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
18608     if (auto *ShiftConst = BVAmt->getConstantSplatNode()) {
18609       uint64_t ShiftAmt = ShiftConst->getZExtValue();
18610
18611       if (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 ||
18612           (Subtarget->hasInt256() &&
18613            (VT == MVT::v4i64 || VT == MVT::v8i32 || VT == MVT::v16i16)) ||
18614           (Subtarget->hasAVX512() &&
18615            (VT == MVT::v8i64 || VT == MVT::v16i32))) {
18616         if (Op.getOpcode() == ISD::SHL)
18617           return getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, R, ShiftAmt,
18618                                             DAG);
18619         if (Op.getOpcode() == ISD::SRL)
18620           return getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt,
18621                                             DAG);
18622         if (Op.getOpcode() == ISD::SRA && VT != MVT::v2i64 && VT != MVT::v4i64)
18623           return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, ShiftAmt,
18624                                             DAG);
18625       }
18626
18627       if (VT == MVT::v16i8) {
18628         if (Op.getOpcode() == ISD::SHL) {
18629           // Make a large shift.
18630           SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl,
18631                                                    MVT::v8i16, R, ShiftAmt,
18632                                                    DAG);
18633           SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL);
18634           // Zero out the rightmost bits.
18635           SmallVector<SDValue, 16> V(16,
18636                                      DAG.getConstant(uint8_t(-1U << ShiftAmt),
18637                                                      MVT::i8));
18638           return DAG.getNode(ISD::AND, dl, VT, SHL,
18639                              DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
18640         }
18641         if (Op.getOpcode() == ISD::SRL) {
18642           // Make a large shift.
18643           SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl,
18644                                                    MVT::v8i16, R, ShiftAmt,
18645                                                    DAG);
18646           SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL);
18647           // Zero out the leftmost bits.
18648           SmallVector<SDValue, 16> V(16,
18649                                      DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
18650                                                      MVT::i8));
18651           return DAG.getNode(ISD::AND, dl, VT, SRL,
18652                              DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
18653         }
18654         if (Op.getOpcode() == ISD::SRA) {
18655           if (ShiftAmt == 7) {
18656             // R s>> 7  ===  R s< 0
18657             SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
18658             return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
18659           }
18660
18661           // R s>> a === ((R u>> a) ^ m) - m
18662           SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
18663           SmallVector<SDValue, 16> V(16, DAG.getConstant(128 >> ShiftAmt,
18664                                                          MVT::i8));
18665           SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V);
18666           Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
18667           Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
18668           return Res;
18669         }
18670         llvm_unreachable("Unknown shift opcode.");
18671       }
18672
18673       if (Subtarget->hasInt256() && VT == MVT::v32i8) {
18674         if (Op.getOpcode() == ISD::SHL) {
18675           // Make a large shift.
18676           SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl,
18677                                                    MVT::v16i16, R, ShiftAmt,
18678                                                    DAG);
18679           SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL);
18680           // Zero out the rightmost bits.
18681           SmallVector<SDValue, 32> V(32,
18682                                      DAG.getConstant(uint8_t(-1U << ShiftAmt),
18683                                                      MVT::i8));
18684           return DAG.getNode(ISD::AND, dl, VT, SHL,
18685                              DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
18686         }
18687         if (Op.getOpcode() == ISD::SRL) {
18688           // Make a large shift.
18689           SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl,
18690                                                    MVT::v16i16, R, ShiftAmt,
18691                                                    DAG);
18692           SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL);
18693           // Zero out the leftmost bits.
18694           SmallVector<SDValue, 32> V(32,
18695                                      DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
18696                                                      MVT::i8));
18697           return DAG.getNode(ISD::AND, dl, VT, SRL,
18698                              DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
18699         }
18700         if (Op.getOpcode() == ISD::SRA) {
18701           if (ShiftAmt == 7) {
18702             // R s>> 7  ===  R s< 0
18703             SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
18704             return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
18705           }
18706
18707           // R s>> a === ((R u>> a) ^ m) - m
18708           SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
18709           SmallVector<SDValue, 32> V(32, DAG.getConstant(128 >> ShiftAmt,
18710                                                          MVT::i8));
18711           SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V);
18712           Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
18713           Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
18714           return Res;
18715         }
18716         llvm_unreachable("Unknown shift opcode.");
18717       }
18718     }
18719   }
18720
18721   // Special case in 32-bit mode, where i64 is expanded into high and low parts.
18722   if (!Subtarget->is64Bit() &&
18723       (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64)) &&
18724       Amt.getOpcode() == ISD::BITCAST &&
18725       Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
18726     Amt = Amt.getOperand(0);
18727     unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
18728                      VT.getVectorNumElements();
18729     unsigned RatioInLog2 = Log2_32_Ceil(Ratio);
18730     uint64_t ShiftAmt = 0;
18731     for (unsigned i = 0; i != Ratio; ++i) {
18732       ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i));
18733       if (!C)
18734         return SDValue();
18735       // 6 == Log2(64)
18736       ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2)));
18737     }
18738     // Check remaining shift amounts.
18739     for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
18740       uint64_t ShAmt = 0;
18741       for (unsigned j = 0; j != Ratio; ++j) {
18742         ConstantSDNode *C =
18743           dyn_cast<ConstantSDNode>(Amt.getOperand(i + j));
18744         if (!C)
18745           return SDValue();
18746         // 6 == Log2(64)
18747         ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2)));
18748       }
18749       if (ShAmt != ShiftAmt)
18750         return SDValue();
18751     }
18752     switch (Op.getOpcode()) {
18753     default:
18754       llvm_unreachable("Unknown shift opcode!");
18755     case ISD::SHL:
18756       return getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, R, ShiftAmt,
18757                                         DAG);
18758     case ISD::SRL:
18759       return getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt,
18760                                         DAG);
18761     case ISD::SRA:
18762       return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, ShiftAmt,
18763                                         DAG);
18764     }
18765   }
18766
18767   return SDValue();
18768 }
18769
18770 static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
18771                                         const X86Subtarget* Subtarget) {
18772   MVT VT = Op.getSimpleValueType();
18773   SDLoc dl(Op);
18774   SDValue R = Op.getOperand(0);
18775   SDValue Amt = Op.getOperand(1);
18776
18777   if ((VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) ||
18778       VT == MVT::v4i32 || VT == MVT::v8i16 ||
18779       (Subtarget->hasInt256() &&
18780        ((VT == MVT::v4i64 && Op.getOpcode() != ISD::SRA) ||
18781         VT == MVT::v8i32 || VT == MVT::v16i16)) ||
18782        (Subtarget->hasAVX512() && (VT == MVT::v8i64 || VT == MVT::v16i32))) {
18783     SDValue BaseShAmt;
18784     EVT EltVT = VT.getVectorElementType();
18785
18786     if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Amt)) {
18787       // Check if this build_vector node is doing a splat.
18788       // If so, then set BaseShAmt equal to the splat value.
18789       BaseShAmt = BV->getSplatValue();
18790       if (BaseShAmt && BaseShAmt.getOpcode() == ISD::UNDEF)
18791         BaseShAmt = SDValue();
18792     } else {
18793       if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR)
18794         Amt = Amt.getOperand(0);
18795
18796       ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt);
18797       if (SVN && SVN->isSplat()) {
18798         unsigned SplatIdx = (unsigned)SVN->getSplatIndex();
18799         SDValue InVec = Amt.getOperand(0);
18800         if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
18801           assert((SplatIdx < InVec.getValueType().getVectorNumElements()) &&
18802                  "Unexpected shuffle index found!");
18803           BaseShAmt = InVec.getOperand(SplatIdx);
18804         } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
18805            if (ConstantSDNode *C =
18806                dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
18807              if (C->getZExtValue() == SplatIdx)
18808                BaseShAmt = InVec.getOperand(1);
18809            }
18810         }
18811
18812         if (!BaseShAmt)
18813           // Avoid introducing an extract element from a shuffle.
18814           BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InVec,
18815                                     DAG.getIntPtrConstant(SplatIdx));
18816       }
18817     }
18818
18819     if (BaseShAmt.getNode()) {
18820       assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
18821       if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
18822         BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
18823       else if (EltVT.bitsLT(MVT::i32))
18824         BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
18825
18826       switch (Op.getOpcode()) {
18827       default:
18828         llvm_unreachable("Unknown shift opcode!");
18829       case ISD::SHL:
18830         switch (VT.SimpleTy) {
18831         default: return SDValue();
18832         case MVT::v2i64:
18833         case MVT::v4i32:
18834         case MVT::v8i16:
18835         case MVT::v4i64:
18836         case MVT::v8i32:
18837         case MVT::v16i16:
18838         case MVT::v16i32:
18839         case MVT::v8i64:
18840           return getTargetVShiftNode(X86ISD::VSHLI, dl, VT, R, BaseShAmt, DAG);
18841         }
18842       case ISD::SRA:
18843         switch (VT.SimpleTy) {
18844         default: return SDValue();
18845         case MVT::v4i32:
18846         case MVT::v8i16:
18847         case MVT::v8i32:
18848         case MVT::v16i16:
18849         case MVT::v16i32:
18850         case MVT::v8i64:
18851           return getTargetVShiftNode(X86ISD::VSRAI, dl, VT, R, BaseShAmt, DAG);
18852         }
18853       case ISD::SRL:
18854         switch (VT.SimpleTy) {
18855         default: return SDValue();
18856         case MVT::v2i64:
18857         case MVT::v4i32:
18858         case MVT::v8i16:
18859         case MVT::v4i64:
18860         case MVT::v8i32:
18861         case MVT::v16i16:
18862         case MVT::v16i32:
18863         case MVT::v8i64:
18864           return getTargetVShiftNode(X86ISD::VSRLI, dl, VT, R, BaseShAmt, DAG);
18865         }
18866       }
18867     }
18868   }
18869
18870   // Special case in 32-bit mode, where i64 is expanded into high and low parts.
18871   if (!Subtarget->is64Bit() &&
18872       (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64) ||
18873       (Subtarget->hasAVX512() && VT == MVT::v8i64)) &&
18874       Amt.getOpcode() == ISD::BITCAST &&
18875       Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
18876     Amt = Amt.getOperand(0);
18877     unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
18878                      VT.getVectorNumElements();
18879     std::vector<SDValue> Vals(Ratio);
18880     for (unsigned i = 0; i != Ratio; ++i)
18881       Vals[i] = Amt.getOperand(i);
18882     for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
18883       for (unsigned j = 0; j != Ratio; ++j)
18884         if (Vals[j] != Amt.getOperand(i + j))
18885           return SDValue();
18886     }
18887     switch (Op.getOpcode()) {
18888     default:
18889       llvm_unreachable("Unknown shift opcode!");
18890     case ISD::SHL:
18891       return DAG.getNode(X86ISD::VSHL, dl, VT, R, Op.getOperand(1));
18892     case ISD::SRL:
18893       return DAG.getNode(X86ISD::VSRL, dl, VT, R, Op.getOperand(1));
18894     case ISD::SRA:
18895       return DAG.getNode(X86ISD::VSRA, dl, VT, R, Op.getOperand(1));
18896     }
18897   }
18898
18899   return SDValue();
18900 }
18901
18902 static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
18903                           SelectionDAG &DAG) {
18904   MVT VT = Op.getSimpleValueType();
18905   SDLoc dl(Op);
18906   SDValue R = Op.getOperand(0);
18907   SDValue Amt = Op.getOperand(1);
18908   SDValue V;
18909
18910   assert(VT.isVector() && "Custom lowering only for vector shifts!");
18911   assert(Subtarget->hasSSE2() && "Only custom lower when we have SSE2!");
18912
18913   V = LowerScalarImmediateShift(Op, DAG, Subtarget);
18914   if (V.getNode())
18915     return V;
18916
18917   V = LowerScalarVariableShift(Op, DAG, Subtarget);
18918   if (V.getNode())
18919       return V;
18920
18921   if (Subtarget->hasAVX512() && (VT == MVT::v16i32 || VT == MVT::v8i64))
18922     return Op;
18923   // AVX2 has VPSLLV/VPSRAV/VPSRLV.
18924   if (Subtarget->hasInt256()) {
18925     if (Op.getOpcode() == ISD::SRL &&
18926         (VT == MVT::v2i64 || VT == MVT::v4i32 ||
18927          VT == MVT::v4i64 || VT == MVT::v8i32))
18928       return Op;
18929     if (Op.getOpcode() == ISD::SHL &&
18930         (VT == MVT::v2i64 || VT == MVT::v4i32 ||
18931          VT == MVT::v4i64 || VT == MVT::v8i32))
18932       return Op;
18933     if (Op.getOpcode() == ISD::SRA && (VT == MVT::v4i32 || VT == MVT::v8i32))
18934       return Op;
18935   }
18936
18937   // If possible, lower this packed shift into a vector multiply instead of
18938   // expanding it into a sequence of scalar shifts.
18939   // Do this only if the vector shift count is a constant build_vector.
18940   if (Op.getOpcode() == ISD::SHL &&
18941       (VT == MVT::v8i16 || VT == MVT::v4i32 ||
18942        (Subtarget->hasInt256() && VT == MVT::v16i16)) &&
18943       ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
18944     SmallVector<SDValue, 8> Elts;
18945     EVT SVT = VT.getScalarType();
18946     unsigned SVTBits = SVT.getSizeInBits();
18947     const APInt &One = APInt(SVTBits, 1);
18948     unsigned NumElems = VT.getVectorNumElements();
18949
18950     for (unsigned i=0; i !=NumElems; ++i) {
18951       SDValue Op = Amt->getOperand(i);
18952       if (Op->getOpcode() == ISD::UNDEF) {
18953         Elts.push_back(Op);
18954         continue;
18955       }
18956
18957       ConstantSDNode *ND = cast<ConstantSDNode>(Op);
18958       const APInt &C = APInt(SVTBits, ND->getAPIntValue().getZExtValue());
18959       uint64_t ShAmt = C.getZExtValue();
18960       if (ShAmt >= SVTBits) {
18961         Elts.push_back(DAG.getUNDEF(SVT));
18962         continue;
18963       }
18964       Elts.push_back(DAG.getConstant(One.shl(ShAmt), SVT));
18965     }
18966     SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Elts);
18967     return DAG.getNode(ISD::MUL, dl, VT, R, BV);
18968   }
18969
18970   // Lower SHL with variable shift amount.
18971   if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
18972     Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, VT));
18973
18974     Op = DAG.getNode(ISD::ADD, dl, VT, Op, DAG.getConstant(0x3f800000U, VT));
18975     Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, Op);
18976     Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
18977     return DAG.getNode(ISD::MUL, dl, VT, Op, R);
18978   }
18979
18980   // If possible, lower this shift as a sequence of two shifts by
18981   // constant plus a MOVSS/MOVSD instead of scalarizing it.
18982   // Example:
18983   //   (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
18984   //
18985   // Could be rewritten as:
18986   //   (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
18987   //
18988   // The advantage is that the two shifts from the example would be
18989   // lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing
18990   // the vector shift into four scalar shifts plus four pairs of vector
18991   // insert/extract.
18992   if ((VT == MVT::v8i16 || VT == MVT::v4i32) &&
18993       ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
18994     unsigned TargetOpcode = X86ISD::MOVSS;
18995     bool CanBeSimplified;
18996     // The splat value for the first packed shift (the 'X' from the example).
18997     SDValue Amt1 = Amt->getOperand(0);
18998     // The splat value for the second packed shift (the 'Y' from the example).
18999     SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) :
19000                                         Amt->getOperand(2);
19001
19002     // See if it is possible to replace this node with a sequence of
19003     // two shifts followed by a MOVSS/MOVSD
19004     if (VT == MVT::v4i32) {
19005       // Check if it is legal to use a MOVSS.
19006       CanBeSimplified = Amt2 == Amt->getOperand(2) &&
19007                         Amt2 == Amt->getOperand(3);
19008       if (!CanBeSimplified) {
19009         // Otherwise, check if we can still simplify this node using a MOVSD.
19010         CanBeSimplified = Amt1 == Amt->getOperand(1) &&
19011                           Amt->getOperand(2) == Amt->getOperand(3);
19012         TargetOpcode = X86ISD::MOVSD;
19013         Amt2 = Amt->getOperand(2);
19014       }
19015     } else {
19016       // Do similar checks for the case where the machine value type
19017       // is MVT::v8i16.
19018       CanBeSimplified = Amt1 == Amt->getOperand(1);
19019       for (unsigned i=3; i != 8 && CanBeSimplified; ++i)
19020         CanBeSimplified = Amt2 == Amt->getOperand(i);
19021
19022       if (!CanBeSimplified) {
19023         TargetOpcode = X86ISD::MOVSD;
19024         CanBeSimplified = true;
19025         Amt2 = Amt->getOperand(4);
19026         for (unsigned i=0; i != 4 && CanBeSimplified; ++i)
19027           CanBeSimplified = Amt1 == Amt->getOperand(i);
19028         for (unsigned j=4; j != 8 && CanBeSimplified; ++j)
19029           CanBeSimplified = Amt2 == Amt->getOperand(j);
19030       }
19031     }
19032
19033     if (CanBeSimplified && isa<ConstantSDNode>(Amt1) &&
19034         isa<ConstantSDNode>(Amt2)) {
19035       // Replace this node with two shifts followed by a MOVSS/MOVSD.
19036       EVT CastVT = MVT::v4i32;
19037       SDValue Splat1 =
19038         DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), VT);
19039       SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
19040       SDValue Splat2 =
19041         DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), VT);
19042       SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
19043       if (TargetOpcode == X86ISD::MOVSD)
19044         CastVT = MVT::v2i64;
19045       SDValue BitCast1 = DAG.getNode(ISD::BITCAST, dl, CastVT, Shift1);
19046       SDValue BitCast2 = DAG.getNode(ISD::BITCAST, dl, CastVT, Shift2);
19047       SDValue Result = getTargetShuffleNode(TargetOpcode, dl, CastVT, BitCast2,
19048                                             BitCast1, DAG);
19049       return DAG.getNode(ISD::BITCAST, dl, VT, Result);
19050     }
19051   }
19052
19053   if (VT == MVT::v16i8 && Op->getOpcode() == ISD::SHL) {
19054     assert(Subtarget->hasSSE2() && "Need SSE2 for pslli/pcmpeq.");
19055
19056     // a = a << 5;
19057     Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(5, VT));
19058     Op = DAG.getNode(ISD::BITCAST, dl, VT, Op);
19059
19060     // Turn 'a' into a mask suitable for VSELECT
19061     SDValue VSelM = DAG.getConstant(0x80, VT);
19062     SDValue OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
19063     OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
19064
19065     SDValue CM1 = DAG.getConstant(0x0f, VT);
19066     SDValue CM2 = DAG.getConstant(0x3f, VT);
19067
19068     // r = VSELECT(r, psllw(r & (char16)15, 4), a);
19069     SDValue M = DAG.getNode(ISD::AND, dl, VT, R, CM1);
19070     M = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, MVT::v8i16, M, 4, DAG);
19071     M = DAG.getNode(ISD::BITCAST, dl, VT, M);
19072     R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R);
19073
19074     // a += a
19075     Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);
19076     OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
19077     OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
19078
19079     // r = VSELECT(r, psllw(r & (char16)63, 2), a);
19080     M = DAG.getNode(ISD::AND, dl, VT, R, CM2);
19081     M = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, MVT::v8i16, M, 2, DAG);
19082     M = DAG.getNode(ISD::BITCAST, dl, VT, M);
19083     R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R);
19084
19085     // a += a
19086     Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);
19087     OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
19088     OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
19089
19090     // return VSELECT(r, r+r, a);
19091     R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel,
19092                     DAG.getNode(ISD::ADD, dl, VT, R, R), R);
19093     return R;
19094   }
19095
19096   // It's worth extending once and using the v8i32 shifts for 16-bit types, but
19097   // the extra overheads to get from v16i8 to v8i32 make the existing SSE
19098   // solution better.
19099   if (Subtarget->hasInt256() && VT == MVT::v8i16) {
19100     MVT NewVT = VT == MVT::v8i16 ? MVT::v8i32 : MVT::v16i16;
19101     unsigned ExtOpc =
19102         Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
19103     R = DAG.getNode(ExtOpc, dl, NewVT, R);
19104     Amt = DAG.getNode(ISD::ANY_EXTEND, dl, NewVT, Amt);
19105     return DAG.getNode(ISD::TRUNCATE, dl, VT,
19106                        DAG.getNode(Op.getOpcode(), dl, NewVT, R, Amt));
19107     }
19108
19109   // Decompose 256-bit shifts into smaller 128-bit shifts.
19110   if (VT.is256BitVector()) {
19111     unsigned NumElems = VT.getVectorNumElements();
19112     MVT EltVT = VT.getVectorElementType();
19113     EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
19114
19115     // Extract the two vectors
19116     SDValue V1 = Extract128BitVector(R, 0, DAG, dl);
19117     SDValue V2 = Extract128BitVector(R, NumElems/2, DAG, dl);
19118
19119     // Recreate the shift amount vectors
19120     SDValue Amt1, Amt2;
19121     if (Amt.getOpcode() == ISD::BUILD_VECTOR) {
19122       // Constant shift amount
19123       SmallVector<SDValue, 4> Amt1Csts;
19124       SmallVector<SDValue, 4> Amt2Csts;
19125       for (unsigned i = 0; i != NumElems/2; ++i)
19126         Amt1Csts.push_back(Amt->getOperand(i));
19127       for (unsigned i = NumElems/2; i != NumElems; ++i)
19128         Amt2Csts.push_back(Amt->getOperand(i));
19129
19130       Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt1Csts);
19131       Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt2Csts);
19132     } else {
19133       // Variable shift amount
19134       Amt1 = Extract128BitVector(Amt, 0, DAG, dl);
19135       Amt2 = Extract128BitVector(Amt, NumElems/2, DAG, dl);
19136     }
19137
19138     // Issue new vector shifts for the smaller types
19139     V1 = DAG.getNode(Op.getOpcode(), dl, NewVT, V1, Amt1);
19140     V2 = DAG.getNode(Op.getOpcode(), dl, NewVT, V2, Amt2);
19141
19142     // Concatenate the result back
19143     return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, V1, V2);
19144   }
19145
19146   return SDValue();
19147 }
19148
19149 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
19150   // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
19151   // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
19152   // looks for this combo and may remove the "setcc" instruction if the "setcc"
19153   // has only one use.
19154   SDNode *N = Op.getNode();
19155   SDValue LHS = N->getOperand(0);
19156   SDValue RHS = N->getOperand(1);
19157   unsigned BaseOp = 0;
19158   unsigned Cond = 0;
19159   SDLoc DL(Op);
19160   switch (Op.getOpcode()) {
19161   default: llvm_unreachable("Unknown ovf instruction!");
19162   case ISD::SADDO:
19163     // A subtract of one will be selected as a INC. Note that INC doesn't
19164     // set CF, so we can't do this for UADDO.
19165     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
19166       if (C->isOne()) {
19167         BaseOp = X86ISD::INC;
19168         Cond = X86::COND_O;
19169         break;
19170       }
19171     BaseOp = X86ISD::ADD;
19172     Cond = X86::COND_O;
19173     break;
19174   case ISD::UADDO:
19175     BaseOp = X86ISD::ADD;
19176     Cond = X86::COND_B;
19177     break;
19178   case ISD::SSUBO:
19179     // A subtract of one will be selected as a DEC. Note that DEC doesn't
19180     // set CF, so we can't do this for USUBO.
19181     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
19182       if (C->isOne()) {
19183         BaseOp = X86ISD::DEC;
19184         Cond = X86::COND_O;
19185         break;
19186       }
19187     BaseOp = X86ISD::SUB;
19188     Cond = X86::COND_O;
19189     break;
19190   case ISD::USUBO:
19191     BaseOp = X86ISD::SUB;
19192     Cond = X86::COND_B;
19193     break;
19194   case ISD::SMULO:
19195     BaseOp = N->getValueType(0) == MVT::i8 ? X86ISD::SMUL8 : X86ISD::SMUL;
19196     Cond = X86::COND_O;
19197     break;
19198   case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
19199     if (N->getValueType(0) == MVT::i8) {
19200       BaseOp = X86ISD::UMUL8;
19201       Cond = X86::COND_O;
19202       break;
19203     }
19204     SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0),
19205                                  MVT::i32);
19206     SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS);
19207
19208     SDValue SetCC =
19209       DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
19210                   DAG.getConstant(X86::COND_O, MVT::i32),
19211                   SDValue(Sum.getNode(), 2));
19212
19213     return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
19214   }
19215   }
19216
19217   // Also sets EFLAGS.
19218   SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
19219   SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
19220
19221   SDValue SetCC =
19222     DAG.getNode(X86ISD::SETCC, DL, N->getValueType(1),
19223                 DAG.getConstant(Cond, MVT::i32),
19224                 SDValue(Sum.getNode(), 1));
19225
19226   return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
19227 }
19228
19229 // Sign extension of the low part of vector elements. This may be used either
19230 // when sign extend instructions are not available or if the vector element
19231 // sizes already match the sign-extended size. If the vector elements are in
19232 // their pre-extended size and sign extend instructions are available, that will
19233 // be handled by LowerSIGN_EXTEND.
19234 SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
19235                                                   SelectionDAG &DAG) const {
19236   SDLoc dl(Op);
19237   EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
19238   MVT VT = Op.getSimpleValueType();
19239
19240   if (!Subtarget->hasSSE2() || !VT.isVector())
19241     return SDValue();
19242
19243   unsigned BitsDiff = VT.getScalarType().getSizeInBits() -
19244                       ExtraVT.getScalarType().getSizeInBits();
19245
19246   switch (VT.SimpleTy) {
19247     default: return SDValue();
19248     case MVT::v8i32:
19249     case MVT::v16i16:
19250       if (!Subtarget->hasFp256())
19251         return SDValue();
19252       if (!Subtarget->hasInt256()) {
19253         // needs to be split
19254         unsigned NumElems = VT.getVectorNumElements();
19255
19256         // Extract the LHS vectors
19257         SDValue LHS = Op.getOperand(0);
19258         SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
19259         SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
19260
19261         MVT EltVT = VT.getVectorElementType();
19262         EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
19263
19264         EVT ExtraEltVT = ExtraVT.getVectorElementType();
19265         unsigned ExtraNumElems = ExtraVT.getVectorNumElements();
19266         ExtraVT = EVT::getVectorVT(*DAG.getContext(), ExtraEltVT,
19267                                    ExtraNumElems/2);
19268         SDValue Extra = DAG.getValueType(ExtraVT);
19269
19270         LHS1 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, Extra);
19271         LHS2 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, Extra);
19272
19273         return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LHS1, LHS2);
19274       }
19275       // fall through
19276     case MVT::v4i32:
19277     case MVT::v8i16: {
19278       SDValue Op0 = Op.getOperand(0);
19279
19280       // This is a sign extension of some low part of vector elements without
19281       // changing the size of the vector elements themselves:
19282       // Shift-Left + Shift-Right-Algebraic.
19283       SDValue Shl = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Op0,
19284                                                BitsDiff, DAG);
19285       return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Shl, BitsDiff,
19286                                         DAG);
19287     }
19288   }
19289 }
19290
19291 /// Returns true if the operand type is exactly twice the native width, and
19292 /// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
19293 /// Used to know whether to use cmpxchg8/16b when expanding atomic operations
19294 /// (otherwise we leave them alone to become __sync_fetch_and_... calls).
19295 bool X86TargetLowering::needsCmpXchgNb(const Type *MemType) const {
19296   unsigned OpWidth = MemType->getPrimitiveSizeInBits();
19297
19298   if (OpWidth == 64)
19299     return !Subtarget->is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b
19300   else if (OpWidth == 128)
19301     return Subtarget->hasCmpxchg16b();
19302   else
19303     return false;
19304 }
19305
19306 bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
19307   return needsCmpXchgNb(SI->getValueOperand()->getType());
19308 }
19309
19310 // Note: this turns large loads into lock cmpxchg8b/16b.
19311 // FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b.
19312 bool X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
19313   auto PTy = cast<PointerType>(LI->getPointerOperand()->getType());
19314   return needsCmpXchgNb(PTy->getElementType());
19315 }
19316
19317 bool X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
19318   unsigned NativeWidth = Subtarget->is64Bit() ? 64 : 32;
19319   const Type *MemType = AI->getType();
19320
19321   // If the operand is too big, we must see if cmpxchg8/16b is available
19322   // and default to library calls otherwise.
19323   if (MemType->getPrimitiveSizeInBits() > NativeWidth)
19324     return needsCmpXchgNb(MemType);
19325
19326   AtomicRMWInst::BinOp Op = AI->getOperation();
19327   switch (Op) {
19328   default:
19329     llvm_unreachable("Unknown atomic operation");
19330   case AtomicRMWInst::Xchg:
19331   case AtomicRMWInst::Add:
19332   case AtomicRMWInst::Sub:
19333     // It's better to use xadd, xsub or xchg for these in all cases.
19334     return false;
19335   case AtomicRMWInst::Or:
19336   case AtomicRMWInst::And:
19337   case AtomicRMWInst::Xor:
19338     // If the atomicrmw's result isn't actually used, we can just add a "lock"
19339     // prefix to a normal instruction for these operations.
19340     return !AI->use_empty();
19341   case AtomicRMWInst::Nand:
19342   case AtomicRMWInst::Max:
19343   case AtomicRMWInst::Min:
19344   case AtomicRMWInst::UMax:
19345   case AtomicRMWInst::UMin:
19346     // These always require a non-trivial set of data operations on x86. We must
19347     // use a cmpxchg loop.
19348     return true;
19349   }
19350 }
19351
19352 static bool hasMFENCE(const X86Subtarget& Subtarget) {
19353   // Use mfence if we have SSE2 or we're on x86-64 (even if we asked for
19354   // no-sse2). There isn't any reason to disable it if the target processor
19355   // supports it.
19356   return Subtarget.hasSSE2() || Subtarget.is64Bit();
19357 }
19358
19359 LoadInst *
19360 X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
19361   unsigned NativeWidth = Subtarget->is64Bit() ? 64 : 32;
19362   const Type *MemType = AI->getType();
19363   // Accesses larger than the native width are turned into cmpxchg/libcalls, so
19364   // there is no benefit in turning such RMWs into loads, and it is actually
19365   // harmful as it introduces a mfence.
19366   if (MemType->getPrimitiveSizeInBits() > NativeWidth)
19367     return nullptr;
19368
19369   auto Builder = IRBuilder<>(AI);
19370   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
19371   auto SynchScope = AI->getSynchScope();
19372   // We must restrict the ordering to avoid generating loads with Release or
19373   // ReleaseAcquire orderings.
19374   auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
19375   auto Ptr = AI->getPointerOperand();
19376
19377   // Before the load we need a fence. Here is an example lifted from
19378   // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
19379   // is required:
19380   // Thread 0:
19381   //   x.store(1, relaxed);
19382   //   r1 = y.fetch_add(0, release);
19383   // Thread 1:
19384   //   y.fetch_add(42, acquire);
19385   //   r2 = x.load(relaxed);
19386   // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
19387   // lowered to just a load without a fence. A mfence flushes the store buffer,
19388   // making the optimization clearly correct.
19389   // FIXME: it is required if isAtLeastRelease(Order) but it is not clear
19390   // otherwise, we might be able to be more agressive on relaxed idempotent
19391   // rmw. In practice, they do not look useful, so we don't try to be
19392   // especially clever.
19393   if (SynchScope == SingleThread) {
19394     // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
19395     // the IR level, so we must wrap it in an intrinsic.
19396     return nullptr;
19397   } else if (hasMFENCE(*Subtarget)) {
19398     Function *MFence = llvm::Intrinsic::getDeclaration(M,
19399             Intrinsic::x86_sse2_mfence);
19400     Builder.CreateCall(MFence);
19401   } else {
19402     // FIXME: it might make sense to use a locked operation here but on a
19403     // different cache-line to prevent cache-line bouncing. In practice it
19404     // is probably a small win, and x86 processors without mfence are rare
19405     // enough that we do not bother.
19406     return nullptr;
19407   }
19408
19409   // Finally we can emit the atomic load.
19410   LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr,
19411           AI->getType()->getPrimitiveSizeInBits());
19412   Loaded->setAtomic(Order, SynchScope);
19413   AI->replaceAllUsesWith(Loaded);
19414   AI->eraseFromParent();
19415   return Loaded;
19416 }
19417
19418 static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget,
19419                                  SelectionDAG &DAG) {
19420   SDLoc dl(Op);
19421   AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
19422     cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
19423   SynchronizationScope FenceScope = static_cast<SynchronizationScope>(
19424     cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
19425
19426   // The only fence that needs an instruction is a sequentially-consistent
19427   // cross-thread fence.
19428   if (FenceOrdering == SequentiallyConsistent && FenceScope == CrossThread) {
19429     if (hasMFENCE(*Subtarget))
19430       return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
19431
19432     SDValue Chain = Op.getOperand(0);
19433     SDValue Zero = DAG.getConstant(0, MVT::i32);
19434     SDValue Ops[] = {
19435       DAG.getRegister(X86::ESP, MVT::i32), // Base
19436       DAG.getTargetConstant(1, MVT::i8),   // Scale
19437       DAG.getRegister(0, MVT::i32),        // Index
19438       DAG.getTargetConstant(0, MVT::i32),  // Disp
19439       DAG.getRegister(0, MVT::i32),        // Segment.
19440       Zero,
19441       Chain
19442     };
19443     SDNode *Res = DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops);
19444     return SDValue(Res, 0);
19445   }
19446
19447   // MEMBARRIER is a compiler barrier; it codegens to a no-op.
19448   return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
19449 }
19450
19451 static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget,
19452                              SelectionDAG &DAG) {
19453   MVT T = Op.getSimpleValueType();
19454   SDLoc DL(Op);
19455   unsigned Reg = 0;
19456   unsigned size = 0;
19457   switch(T.SimpleTy) {
19458   default: llvm_unreachable("Invalid value type!");
19459   case MVT::i8:  Reg = X86::AL;  size = 1; break;
19460   case MVT::i16: Reg = X86::AX;  size = 2; break;
19461   case MVT::i32: Reg = X86::EAX; size = 4; break;
19462   case MVT::i64:
19463     assert(Subtarget->is64Bit() && "Node not type legal!");
19464     Reg = X86::RAX; size = 8;
19465     break;
19466   }
19467   SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
19468                                   Op.getOperand(2), SDValue());
19469   SDValue Ops[] = { cpIn.getValue(0),
19470                     Op.getOperand(1),
19471                     Op.getOperand(3),
19472                     DAG.getTargetConstant(size, MVT::i8),
19473                     cpIn.getValue(1) };
19474   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
19475   MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
19476   SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
19477                                            Ops, T, MMO);
19478
19479   SDValue cpOut =
19480     DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
19481   SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
19482                                       MVT::i32, cpOut.getValue(2));
19483   SDValue Success = DAG.getNode(X86ISD::SETCC, DL, Op->getValueType(1),
19484                                 DAG.getConstant(X86::COND_E, MVT::i8), EFLAGS);
19485
19486   DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut);
19487   DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
19488   DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1));
19489   return SDValue();
19490 }
19491
19492 static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget,
19493                             SelectionDAG &DAG) {
19494   MVT SrcVT = Op.getOperand(0).getSimpleValueType();
19495   MVT DstVT = Op.getSimpleValueType();
19496
19497   if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) {
19498     assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
19499     if (DstVT != MVT::f64)
19500       // This conversion needs to be expanded.
19501       return SDValue();
19502
19503     SDValue InVec = Op->getOperand(0);
19504     SDLoc dl(Op);
19505     unsigned NumElts = SrcVT.getVectorNumElements();
19506     EVT SVT = SrcVT.getVectorElementType();
19507
19508     // Widen the vector in input in the case of MVT::v2i32.
19509     // Example: from MVT::v2i32 to MVT::v4i32.
19510     SmallVector<SDValue, 16> Elts;
19511     for (unsigned i = 0, e = NumElts; i != e; ++i)
19512       Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, InVec,
19513                                  DAG.getIntPtrConstant(i)));
19514
19515     // Explicitly mark the extra elements as Undef.
19516     SDValue Undef = DAG.getUNDEF(SVT);
19517     for (unsigned i = NumElts, e = NumElts * 2; i != e; ++i)
19518       Elts.push_back(Undef);
19519
19520     EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
19521     SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Elts);
19522     SDValue ToV2F64 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, BV);
19523     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64,
19524                        DAG.getIntPtrConstant(0));
19525   }
19526
19527   assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() &&
19528          Subtarget->hasMMX() && "Unexpected custom BITCAST");
19529   assert((DstVT == MVT::i64 ||
19530           (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
19531          "Unexpected custom BITCAST");
19532   // i64 <=> MMX conversions are Legal.
19533   if (SrcVT==MVT::i64 && DstVT.isVector())
19534     return Op;
19535   if (DstVT==MVT::i64 && SrcVT.isVector())
19536     return Op;
19537   // MMX <=> MMX conversions are Legal.
19538   if (SrcVT.isVector() && DstVT.isVector())
19539     return Op;
19540   // All other conversions need to be expanded.
19541   return SDValue();
19542 }
19543
19544 static SDValue LowerCTPOP(SDValue Op, const X86Subtarget *Subtarget,
19545                           SelectionDAG &DAG) {
19546   SDNode *Node = Op.getNode();
19547   SDLoc dl(Node);
19548
19549   Op = Op.getOperand(0);
19550   EVT VT = Op.getValueType();
19551   assert((VT.is128BitVector() || VT.is256BitVector()) &&
19552          "CTPOP lowering only implemented for 128/256-bit wide vector types");
19553
19554   unsigned NumElts = VT.getVectorNumElements();
19555   EVT EltVT = VT.getVectorElementType();
19556   unsigned Len = EltVT.getSizeInBits();
19557
19558   // This is the vectorized version of the "best" algorithm from
19559   // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
19560   // with a minor tweak to use a series of adds + shifts instead of vector
19561   // multiplications. Implemented for the v2i64, v4i64, v4i32, v8i32 types:
19562   //
19563   //  v2i64, v4i64, v4i32 => Only profitable w/ popcnt disabled
19564   //  v8i32 => Always profitable
19565   //
19566   // FIXME: There a couple of possible improvements:
19567   //
19568   // 1) Support for i8 and i16 vectors (needs measurements if popcnt enabled).
19569   // 2) Use strategies from http://wm.ite.pl/articles/sse-popcount.html
19570   //
19571   assert(EltVT.isInteger() && (Len == 32 || Len == 64) && Len % 8 == 0 &&
19572          "CTPOP not implemented for this vector element type.");
19573
19574   // X86 canonicalize ANDs to vXi64, generate the appropriate bitcasts to avoid
19575   // extra legalization.
19576   bool NeedsBitcast = EltVT == MVT::i32;
19577   MVT BitcastVT = VT.is256BitVector() ? MVT::v4i64 : MVT::v2i64;
19578
19579   SDValue Cst55 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x55)), EltVT);
19580   SDValue Cst33 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x33)), EltVT);
19581   SDValue Cst0F = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x0F)), EltVT);
19582
19583   // v = v - ((v >> 1) & 0x55555555...)
19584   SmallVector<SDValue, 8> Ones(NumElts, DAG.getConstant(1, EltVT));
19585   SDValue OnesV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ones);
19586   SDValue Srl = DAG.getNode(ISD::SRL, dl, VT, Op, OnesV);
19587   if (NeedsBitcast)
19588     Srl = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Srl);
19589
19590   SmallVector<SDValue, 8> Mask55(NumElts, Cst55);
19591   SDValue M55 = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask55);
19592   if (NeedsBitcast)
19593     M55 = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M55);
19594
19595   SDValue And = DAG.getNode(ISD::AND, dl, Srl.getValueType(), Srl, M55);
19596   if (VT != And.getValueType())
19597     And = DAG.getNode(ISD::BITCAST, dl, VT, And);
19598   SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, Op, And);
19599
19600   // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
19601   SmallVector<SDValue, 8> Mask33(NumElts, Cst33);
19602   SDValue M33 = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask33);
19603   SmallVector<SDValue, 8> Twos(NumElts, DAG.getConstant(2, EltVT));
19604   SDValue TwosV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Twos);
19605
19606   Srl = DAG.getNode(ISD::SRL, dl, VT, Sub, TwosV);
19607   if (NeedsBitcast) {
19608     Srl = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Srl);
19609     M33 = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M33);
19610     Sub = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Sub);
19611   }
19612
19613   SDValue AndRHS = DAG.getNode(ISD::AND, dl, M33.getValueType(), Srl, M33);
19614   SDValue AndLHS = DAG.getNode(ISD::AND, dl, M33.getValueType(), Sub, M33);
19615   if (VT != AndRHS.getValueType()) {
19616     AndRHS = DAG.getNode(ISD::BITCAST, dl, VT, AndRHS);
19617     AndLHS = DAG.getNode(ISD::BITCAST, dl, VT, AndLHS);
19618   }
19619   SDValue Add = DAG.getNode(ISD::ADD, dl, VT, AndLHS, AndRHS);
19620
19621   // v = (v + (v >> 4)) & 0x0F0F0F0F...
19622   SmallVector<SDValue, 8> Fours(NumElts, DAG.getConstant(4, EltVT));
19623   SDValue FoursV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Fours);
19624   Srl = DAG.getNode(ISD::SRL, dl, VT, Add, FoursV);
19625   Add = DAG.getNode(ISD::ADD, dl, VT, Add, Srl);
19626
19627   SmallVector<SDValue, 8> Mask0F(NumElts, Cst0F);
19628   SDValue M0F = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask0F);
19629   if (NeedsBitcast) {
19630     Add = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Add);
19631     M0F = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M0F);
19632   }
19633   And = DAG.getNode(ISD::AND, dl, M0F.getValueType(), Add, M0F);
19634   if (VT != And.getValueType())
19635     And = DAG.getNode(ISD::BITCAST, dl, VT, And);
19636
19637   // The algorithm mentioned above uses:
19638   //    v = (v * 0x01010101...) >> (Len - 8)
19639   //
19640   // Change it to use vector adds + vector shifts which yield faster results on
19641   // Haswell than using vector integer multiplication.
19642   //
19643   // For i32 elements:
19644   //    v = v + (v >> 8)
19645   //    v = v + (v >> 16)
19646   //
19647   // For i64 elements:
19648   //    v = v + (v >> 8)
19649   //    v = v + (v >> 16)
19650   //    v = v + (v >> 32)
19651   //
19652   Add = And;
19653   SmallVector<SDValue, 8> Csts;
19654   for (unsigned i = 8; i <= Len/2; i *= 2) {
19655     Csts.assign(NumElts, DAG.getConstant(i, EltVT));
19656     SDValue CstsV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Csts);
19657     Srl = DAG.getNode(ISD::SRL, dl, VT, Add, CstsV);
19658     Add = DAG.getNode(ISD::ADD, dl, VT, Add, Srl);
19659     Csts.clear();
19660   }
19661
19662   // The result is on the least significant 6-bits on i32 and 7-bits on i64.
19663   SDValue Cst3F = DAG.getConstant(APInt(Len, Len == 32 ? 0x3F : 0x7F), EltVT);
19664   SmallVector<SDValue, 8> Cst3FV(NumElts, Cst3F);
19665   SDValue M3F = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Cst3FV);
19666   if (NeedsBitcast) {
19667     Add = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Add);
19668     M3F = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M3F);
19669   }
19670   And = DAG.getNode(ISD::AND, dl, M3F.getValueType(), Add, M3F);
19671   if (VT != And.getValueType())
19672     And = DAG.getNode(ISD::BITCAST, dl, VT, And);
19673
19674   return And;
19675 }
19676
19677 static SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) {
19678   SDNode *Node = Op.getNode();
19679   SDLoc dl(Node);
19680   EVT T = Node->getValueType(0);
19681   SDValue negOp = DAG.getNode(ISD::SUB, dl, T,
19682                               DAG.getConstant(0, T), Node->getOperand(2));
19683   return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl,
19684                        cast<AtomicSDNode>(Node)->getMemoryVT(),
19685                        Node->getOperand(0),
19686                        Node->getOperand(1), negOp,
19687                        cast<AtomicSDNode>(Node)->getMemOperand(),
19688                        cast<AtomicSDNode>(Node)->getOrdering(),
19689                        cast<AtomicSDNode>(Node)->getSynchScope());
19690 }
19691
19692 static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
19693   SDNode *Node = Op.getNode();
19694   SDLoc dl(Node);
19695   EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
19696
19697   // Convert seq_cst store -> xchg
19698   // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
19699   // FIXME: On 32-bit, store -> fist or movq would be more efficient
19700   //        (The only way to get a 16-byte store is cmpxchg16b)
19701   // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
19702   if (cast<AtomicSDNode>(Node)->getOrdering() == SequentiallyConsistent ||
19703       !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
19704     SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
19705                                  cast<AtomicSDNode>(Node)->getMemoryVT(),
19706                                  Node->getOperand(0),
19707                                  Node->getOperand(1), Node->getOperand(2),
19708                                  cast<AtomicSDNode>(Node)->getMemOperand(),
19709                                  cast<AtomicSDNode>(Node)->getOrdering(),
19710                                  cast<AtomicSDNode>(Node)->getSynchScope());
19711     return Swap.getValue(1);
19712   }
19713   // Other atomic stores have a simple pattern.
19714   return Op;
19715 }
19716
19717 static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
19718   EVT VT = Op.getNode()->getSimpleValueType(0);
19719
19720   // Let legalize expand this if it isn't a legal type yet.
19721   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
19722     return SDValue();
19723
19724   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
19725
19726   unsigned Opc;
19727   bool ExtraOp = false;
19728   switch (Op.getOpcode()) {
19729   default: llvm_unreachable("Invalid code");
19730   case ISD::ADDC: Opc = X86ISD::ADD; break;
19731   case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break;
19732   case ISD::SUBC: Opc = X86ISD::SUB; break;
19733   case ISD::SUBE: Opc = X86ISD::SBB; ExtraOp = true; break;
19734   }
19735
19736   if (!ExtraOp)
19737     return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
19738                        Op.getOperand(1));
19739   return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
19740                      Op.getOperand(1), Op.getOperand(2));
19741 }
19742
19743 static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget *Subtarget,
19744                             SelectionDAG &DAG) {
19745   assert(Subtarget->isTargetDarwin() && Subtarget->is64Bit());
19746
19747   // For MacOSX, we want to call an alternative entry point: __sincos_stret,
19748   // which returns the values as { float, float } (in XMM0) or
19749   // { double, double } (which is returned in XMM0, XMM1).
19750   SDLoc dl(Op);
19751   SDValue Arg = Op.getOperand(0);
19752   EVT ArgVT = Arg.getValueType();
19753   Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
19754
19755   TargetLowering::ArgListTy Args;
19756   TargetLowering::ArgListEntry Entry;
19757
19758   Entry.Node = Arg;
19759   Entry.Ty = ArgTy;
19760   Entry.isSExt = false;
19761   Entry.isZExt = false;
19762   Args.push_back(Entry);
19763
19764   bool isF64 = ArgVT == MVT::f64;
19765   // Only optimize x86_64 for now. i386 is a bit messy. For f32,
19766   // the small struct {f32, f32} is returned in (eax, edx). For f64,
19767   // the results are returned via SRet in memory.
19768   const char *LibcallName =  isF64 ? "__sincos_stret" : "__sincosf_stret";
19769   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19770   SDValue Callee = DAG.getExternalSymbol(LibcallName, TLI.getPointerTy());
19771
19772   Type *RetTy = isF64
19773     ? (Type*)StructType::get(ArgTy, ArgTy, nullptr)
19774     : (Type*)VectorType::get(ArgTy, 4);
19775
19776   TargetLowering::CallLoweringInfo CLI(DAG);
19777   CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
19778     .setCallee(CallingConv::C, RetTy, Callee, std::move(Args), 0);
19779
19780   std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
19781
19782   if (isF64)
19783     // Returned in xmm0 and xmm1.
19784     return CallResult.first;
19785
19786   // Returned in bits 0:31 and 32:64 xmm0.
19787   SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
19788                                CallResult.first, DAG.getIntPtrConstant(0));
19789   SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
19790                                CallResult.first, DAG.getIntPtrConstant(1));
19791   SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
19792   return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
19793 }
19794
19795 /// LowerOperation - Provide custom lowering hooks for some operations.
19796 ///
19797 SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
19798   switch (Op.getOpcode()) {
19799   default: llvm_unreachable("Should not custom lower this!");
19800   case ISD::SIGN_EXTEND_INREG:  return LowerSIGN_EXTEND_INREG(Op,DAG);
19801   case ISD::ATOMIC_FENCE:       return LowerATOMIC_FENCE(Op, Subtarget, DAG);
19802   case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
19803     return LowerCMP_SWAP(Op, Subtarget, DAG);
19804   case ISD::CTPOP:              return LowerCTPOP(Op, Subtarget, DAG);
19805   case ISD::ATOMIC_LOAD_SUB:    return LowerLOAD_SUB(Op,DAG);
19806   case ISD::ATOMIC_STORE:       return LowerATOMIC_STORE(Op,DAG);
19807   case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
19808   case ISD::CONCAT_VECTORS:     return LowerCONCAT_VECTORS(Op, DAG);
19809   case ISD::VECTOR_SHUFFLE:     return LowerVECTOR_SHUFFLE(Op, DAG);
19810   case ISD::VSELECT:            return LowerVSELECT(Op, DAG);
19811   case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
19812   case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
19813   case ISD::EXTRACT_SUBVECTOR:  return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
19814   case ISD::INSERT_SUBVECTOR:   return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
19815   case ISD::SCALAR_TO_VECTOR:   return LowerSCALAR_TO_VECTOR(Op, DAG);
19816   case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
19817   case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
19818   case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
19819   case ISD::ExternalSymbol:     return LowerExternalSymbol(Op, DAG);
19820   case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
19821   case ISD::SHL_PARTS:
19822   case ISD::SRA_PARTS:
19823   case ISD::SRL_PARTS:          return LowerShiftParts(Op, DAG);
19824   case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG);
19825   case ISD::UINT_TO_FP:         return LowerUINT_TO_FP(Op, DAG);
19826   case ISD::TRUNCATE:           return LowerTRUNCATE(Op, DAG);
19827   case ISD::ZERO_EXTEND:        return LowerZERO_EXTEND(Op, Subtarget, DAG);
19828   case ISD::SIGN_EXTEND:        return LowerSIGN_EXTEND(Op, Subtarget, DAG);
19829   case ISD::ANY_EXTEND:         return LowerANY_EXTEND(Op, Subtarget, DAG);
19830   case ISD::FP_TO_SINT:         return LowerFP_TO_SINT(Op, DAG);
19831   case ISD::FP_TO_UINT:         return LowerFP_TO_UINT(Op, DAG);
19832   case ISD::FP_EXTEND:          return LowerFP_EXTEND(Op, DAG);
19833   case ISD::LOAD:               return LowerExtendedLoad(Op, Subtarget, DAG);
19834   case ISD::FABS:
19835   case ISD::FNEG:               return LowerFABSorFNEG(Op, DAG);
19836   case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
19837   case ISD::FGETSIGN:           return LowerFGETSIGN(Op, DAG);
19838   case ISD::SETCC:              return LowerSETCC(Op, DAG);
19839   case ISD::SELECT:             return LowerSELECT(Op, DAG);
19840   case ISD::BRCOND:             return LowerBRCOND(Op, DAG);
19841   case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
19842   case ISD::VASTART:            return LowerVASTART(Op, DAG);
19843   case ISD::VAARG:              return LowerVAARG(Op, DAG);
19844   case ISD::VACOPY:             return LowerVACOPY(Op, Subtarget, DAG);
19845   case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, Subtarget, DAG);
19846   case ISD::INTRINSIC_VOID:
19847   case ISD::INTRINSIC_W_CHAIN:  return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
19848   case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
19849   case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
19850   case ISD::FRAME_TO_ARGS_OFFSET:
19851                                 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
19852   case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
19853   case ISD::EH_RETURN:          return LowerEH_RETURN(Op, DAG);
19854   case ISD::EH_SJLJ_SETJMP:     return lowerEH_SJLJ_SETJMP(Op, DAG);
19855   case ISD::EH_SJLJ_LONGJMP:    return lowerEH_SJLJ_LONGJMP(Op, DAG);
19856   case ISD::INIT_TRAMPOLINE:    return LowerINIT_TRAMPOLINE(Op, DAG);
19857   case ISD::ADJUST_TRAMPOLINE:  return LowerADJUST_TRAMPOLINE(Op, DAG);
19858   case ISD::FLT_ROUNDS_:        return LowerFLT_ROUNDS_(Op, DAG);
19859   case ISD::CTLZ:               return LowerCTLZ(Op, DAG);
19860   case ISD::CTLZ_ZERO_UNDEF:    return LowerCTLZ_ZERO_UNDEF(Op, DAG);
19861   case ISD::CTTZ:               return LowerCTTZ(Op, DAG);
19862   case ISD::MUL:                return LowerMUL(Op, Subtarget, DAG);
19863   case ISD::UMUL_LOHI:
19864   case ISD::SMUL_LOHI:          return LowerMUL_LOHI(Op, Subtarget, DAG);
19865   case ISD::SRA:
19866   case ISD::SRL:
19867   case ISD::SHL:                return LowerShift(Op, Subtarget, DAG);
19868   case ISD::SADDO:
19869   case ISD::UADDO:
19870   case ISD::SSUBO:
19871   case ISD::USUBO:
19872   case ISD::SMULO:
19873   case ISD::UMULO:              return LowerXALUO(Op, DAG);
19874   case ISD::READCYCLECOUNTER:   return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
19875   case ISD::BITCAST:            return LowerBITCAST(Op, Subtarget, DAG);
19876   case ISD::ADDC:
19877   case ISD::ADDE:
19878   case ISD::SUBC:
19879   case ISD::SUBE:               return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
19880   case ISD::ADD:                return LowerADD(Op, DAG);
19881   case ISD::SUB:                return LowerSUB(Op, DAG);
19882   case ISD::FSINCOS:            return LowerFSINCOS(Op, Subtarget, DAG);
19883   }
19884 }
19885
19886 /// ReplaceNodeResults - Replace a node with an illegal result type
19887 /// with a new node built out of custom code.
19888 void X86TargetLowering::ReplaceNodeResults(SDNode *N,
19889                                            SmallVectorImpl<SDValue>&Results,
19890                                            SelectionDAG &DAG) const {
19891   SDLoc dl(N);
19892   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19893   switch (N->getOpcode()) {
19894   default:
19895     llvm_unreachable("Do not know how to custom type legalize this operation!");
19896   // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
19897   case X86ISD::FMINC:
19898   case X86ISD::FMIN:
19899   case X86ISD::FMAXC:
19900   case X86ISD::FMAX: {
19901     EVT VT = N->getValueType(0);
19902     if (VT != MVT::v2f32)
19903       llvm_unreachable("Unexpected type (!= v2f32) on FMIN/FMAX.");
19904     SDValue UNDEF = DAG.getUNDEF(VT);
19905     SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
19906                               N->getOperand(0), UNDEF);
19907     SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
19908                               N->getOperand(1), UNDEF);
19909     Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
19910     return;
19911   }
19912   case ISD::SIGN_EXTEND_INREG:
19913   case ISD::ADDC:
19914   case ISD::ADDE:
19915   case ISD::SUBC:
19916   case ISD::SUBE:
19917     // We don't want to expand or promote these.
19918     return;
19919   case ISD::SDIV:
19920   case ISD::UDIV:
19921   case ISD::SREM:
19922   case ISD::UREM:
19923   case ISD::SDIVREM:
19924   case ISD::UDIVREM: {
19925     SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
19926     Results.push_back(V);
19927     return;
19928   }
19929   case ISD::FP_TO_SINT:
19930   case ISD::FP_TO_UINT: {
19931     bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
19932
19933     if (!IsSigned && !isIntegerTypeFTOL(SDValue(N, 0).getValueType()))
19934       return;
19935
19936     std::pair<SDValue,SDValue> Vals =
19937         FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
19938     SDValue FIST = Vals.first, StackSlot = Vals.second;
19939     if (FIST.getNode()) {
19940       EVT VT = N->getValueType(0);
19941       // Return a load from the stack slot.
19942       if (StackSlot.getNode())
19943         Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot,
19944                                       MachinePointerInfo(),
19945                                       false, false, false, 0));
19946       else
19947         Results.push_back(FIST);
19948     }
19949     return;
19950   }
19951   case ISD::UINT_TO_FP: {
19952     assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
19953     if (N->getOperand(0).getValueType() != MVT::v2i32 ||
19954         N->getValueType(0) != MVT::v2f32)
19955       return;
19956     SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64,
19957                                  N->getOperand(0));
19958     SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
19959                                      MVT::f64);
19960     SDValue VBias = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2f64, Bias, Bias);
19961     SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
19962                              DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, VBias));
19963     Or = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or);
19964     SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
19965     Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
19966     return;
19967   }
19968   case ISD::FP_ROUND: {
19969     if (!TLI.isTypeLegal(N->getOperand(0).getValueType()))
19970         return;
19971     SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
19972     Results.push_back(V);
19973     return;
19974   }
19975   case ISD::INTRINSIC_W_CHAIN: {
19976     unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
19977     switch (IntNo) {
19978     default : llvm_unreachable("Do not know how to custom type "
19979                                "legalize this intrinsic operation!");
19980     case Intrinsic::x86_rdtsc:
19981       return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
19982                                      Results);
19983     case Intrinsic::x86_rdtscp:
19984       return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget,
19985                                      Results);
19986     case Intrinsic::x86_rdpmc:
19987       return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results);
19988     }
19989   }
19990   case ISD::READCYCLECOUNTER: {
19991     return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
19992                                    Results);
19993   }
19994   case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
19995     EVT T = N->getValueType(0);
19996     assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
19997     bool Regs64bit = T == MVT::i128;
19998     EVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
19999     SDValue cpInL, cpInH;
20000     cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
20001                         DAG.getConstant(0, HalfT));
20002     cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
20003                         DAG.getConstant(1, HalfT));
20004     cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
20005                              Regs64bit ? X86::RAX : X86::EAX,
20006                              cpInL, SDValue());
20007     cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
20008                              Regs64bit ? X86::RDX : X86::EDX,
20009                              cpInH, cpInL.getValue(1));
20010     SDValue swapInL, swapInH;
20011     swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
20012                           DAG.getConstant(0, HalfT));
20013     swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
20014                           DAG.getConstant(1, HalfT));
20015     swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl,
20016                                Regs64bit ? X86::RBX : X86::EBX,
20017                                swapInL, cpInH.getValue(1));
20018     swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl,
20019                                Regs64bit ? X86::RCX : X86::ECX,
20020                                swapInH, swapInL.getValue(1));
20021     SDValue Ops[] = { swapInH.getValue(0),
20022                       N->getOperand(1),
20023                       swapInH.getValue(1) };
20024     SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
20025     MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
20026     unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_DAG :
20027                                   X86ISD::LCMPXCHG8_DAG;
20028     SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
20029     SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
20030                                         Regs64bit ? X86::RAX : X86::EAX,
20031                                         HalfT, Result.getValue(1));
20032     SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
20033                                         Regs64bit ? X86::RDX : X86::EDX,
20034                                         HalfT, cpOutL.getValue(2));
20035     SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
20036
20037     SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
20038                                         MVT::i32, cpOutH.getValue(2));
20039     SDValue Success =
20040         DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
20041                     DAG.getConstant(X86::COND_E, MVT::i8), EFLAGS);
20042     Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
20043
20044     Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
20045     Results.push_back(Success);
20046     Results.push_back(EFLAGS.getValue(1));
20047     return;
20048   }
20049   case ISD::ATOMIC_SWAP:
20050   case ISD::ATOMIC_LOAD_ADD:
20051   case ISD::ATOMIC_LOAD_SUB:
20052   case ISD::ATOMIC_LOAD_AND:
20053   case ISD::ATOMIC_LOAD_OR:
20054   case ISD::ATOMIC_LOAD_XOR:
20055   case ISD::ATOMIC_LOAD_NAND:
20056   case ISD::ATOMIC_LOAD_MIN:
20057   case ISD::ATOMIC_LOAD_MAX:
20058   case ISD::ATOMIC_LOAD_UMIN:
20059   case ISD::ATOMIC_LOAD_UMAX:
20060   case ISD::ATOMIC_LOAD: {
20061     // Delegate to generic TypeLegalization. Situations we can really handle
20062     // should have already been dealt with by AtomicExpandPass.cpp.
20063     break;
20064   }
20065   case ISD::BITCAST: {
20066     assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
20067     EVT DstVT = N->getValueType(0);
20068     EVT SrcVT = N->getOperand(0)->getValueType(0);
20069
20070     if (SrcVT != MVT::f64 ||
20071         (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8))
20072       return;
20073
20074     unsigned NumElts = DstVT.getVectorNumElements();
20075     EVT SVT = DstVT.getVectorElementType();
20076     EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
20077     SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
20078                                    MVT::v2f64, N->getOperand(0));
20079     SDValue ToVecInt = DAG.getNode(ISD::BITCAST, dl, WiderVT, Expanded);
20080
20081     if (ExperimentalVectorWideningLegalization) {
20082       // If we are legalizing vectors by widening, we already have the desired
20083       // legal vector type, just return it.
20084       Results.push_back(ToVecInt);
20085       return;
20086     }
20087
20088     SmallVector<SDValue, 8> Elts;
20089     for (unsigned i = 0, e = NumElts; i != e; ++i)
20090       Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT,
20091                                    ToVecInt, DAG.getIntPtrConstant(i)));
20092
20093     Results.push_back(DAG.getNode(ISD::BUILD_VECTOR, dl, DstVT, Elts));
20094   }
20095   }
20096 }
20097
20098 const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
20099   switch (Opcode) {
20100   default: return nullptr;
20101   case X86ISD::BSF:                return "X86ISD::BSF";
20102   case X86ISD::BSR:                return "X86ISD::BSR";
20103   case X86ISD::SHLD:               return "X86ISD::SHLD";
20104   case X86ISD::SHRD:               return "X86ISD::SHRD";
20105   case X86ISD::FAND:               return "X86ISD::FAND";
20106   case X86ISD::FANDN:              return "X86ISD::FANDN";
20107   case X86ISD::FOR:                return "X86ISD::FOR";
20108   case X86ISD::FXOR:               return "X86ISD::FXOR";
20109   case X86ISD::FSRL:               return "X86ISD::FSRL";
20110   case X86ISD::FILD:               return "X86ISD::FILD";
20111   case X86ISD::FILD_FLAG:          return "X86ISD::FILD_FLAG";
20112   case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
20113   case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
20114   case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
20115   case X86ISD::FLD:                return "X86ISD::FLD";
20116   case X86ISD::FST:                return "X86ISD::FST";
20117   case X86ISD::CALL:               return "X86ISD::CALL";
20118   case X86ISD::RDTSC_DAG:          return "X86ISD::RDTSC_DAG";
20119   case X86ISD::RDTSCP_DAG:         return "X86ISD::RDTSCP_DAG";
20120   case X86ISD::RDPMC_DAG:          return "X86ISD::RDPMC_DAG";
20121   case X86ISD::BT:                 return "X86ISD::BT";
20122   case X86ISD::CMP:                return "X86ISD::CMP";
20123   case X86ISD::COMI:               return "X86ISD::COMI";
20124   case X86ISD::UCOMI:              return "X86ISD::UCOMI";
20125   case X86ISD::CMPM:               return "X86ISD::CMPM";
20126   case X86ISD::CMPMU:              return "X86ISD::CMPMU";
20127   case X86ISD::SETCC:              return "X86ISD::SETCC";
20128   case X86ISD::SETCC_CARRY:        return "X86ISD::SETCC_CARRY";
20129   case X86ISD::FSETCC:             return "X86ISD::FSETCC";
20130   case X86ISD::CMOV:               return "X86ISD::CMOV";
20131   case X86ISD::BRCOND:             return "X86ISD::BRCOND";
20132   case X86ISD::RET_FLAG:           return "X86ISD::RET_FLAG";
20133   case X86ISD::REP_STOS:           return "X86ISD::REP_STOS";
20134   case X86ISD::REP_MOVS:           return "X86ISD::REP_MOVS";
20135   case X86ISD::GlobalBaseReg:      return "X86ISD::GlobalBaseReg";
20136   case X86ISD::Wrapper:            return "X86ISD::Wrapper";
20137   case X86ISD::WrapperRIP:         return "X86ISD::WrapperRIP";
20138   case X86ISD::PEXTRB:             return "X86ISD::PEXTRB";
20139   case X86ISD::PEXTRW:             return "X86ISD::PEXTRW";
20140   case X86ISD::INSERTPS:           return "X86ISD::INSERTPS";
20141   case X86ISD::PINSRB:             return "X86ISD::PINSRB";
20142   case X86ISD::PINSRW:             return "X86ISD::PINSRW";
20143   case X86ISD::PSHUFB:             return "X86ISD::PSHUFB";
20144   case X86ISD::ANDNP:              return "X86ISD::ANDNP";
20145   case X86ISD::PSIGN:              return "X86ISD::PSIGN";
20146   case X86ISD::BLENDI:             return "X86ISD::BLENDI";
20147   case X86ISD::SHRUNKBLEND:        return "X86ISD::SHRUNKBLEND";
20148   case X86ISD::SUBUS:              return "X86ISD::SUBUS";
20149   case X86ISD::HADD:               return "X86ISD::HADD";
20150   case X86ISD::HSUB:               return "X86ISD::HSUB";
20151   case X86ISD::FHADD:              return "X86ISD::FHADD";
20152   case X86ISD::FHSUB:              return "X86ISD::FHSUB";
20153   case X86ISD::UMAX:               return "X86ISD::UMAX";
20154   case X86ISD::UMIN:               return "X86ISD::UMIN";
20155   case X86ISD::SMAX:               return "X86ISD::SMAX";
20156   case X86ISD::SMIN:               return "X86ISD::SMIN";
20157   case X86ISD::FMAX:               return "X86ISD::FMAX";
20158   case X86ISD::FMIN:               return "X86ISD::FMIN";
20159   case X86ISD::FMAXC:              return "X86ISD::FMAXC";
20160   case X86ISD::FMINC:              return "X86ISD::FMINC";
20161   case X86ISD::FRSQRT:             return "X86ISD::FRSQRT";
20162   case X86ISD::FRCP:               return "X86ISD::FRCP";
20163   case X86ISD::TLSADDR:            return "X86ISD::TLSADDR";
20164   case X86ISD::TLSBASEADDR:        return "X86ISD::TLSBASEADDR";
20165   case X86ISD::TLSCALL:            return "X86ISD::TLSCALL";
20166   case X86ISD::EH_SJLJ_SETJMP:     return "X86ISD::EH_SJLJ_SETJMP";
20167   case X86ISD::EH_SJLJ_LONGJMP:    return "X86ISD::EH_SJLJ_LONGJMP";
20168   case X86ISD::EH_RETURN:          return "X86ISD::EH_RETURN";
20169   case X86ISD::TC_RETURN:          return "X86ISD::TC_RETURN";
20170   case X86ISD::FNSTCW16m:          return "X86ISD::FNSTCW16m";
20171   case X86ISD::FNSTSW16r:          return "X86ISD::FNSTSW16r";
20172   case X86ISD::LCMPXCHG_DAG:       return "X86ISD::LCMPXCHG_DAG";
20173   case X86ISD::LCMPXCHG8_DAG:      return "X86ISD::LCMPXCHG8_DAG";
20174   case X86ISD::LCMPXCHG16_DAG:     return "X86ISD::LCMPXCHG16_DAG";
20175   case X86ISD::VZEXT_MOVL:         return "X86ISD::VZEXT_MOVL";
20176   case X86ISD::VZEXT_LOAD:         return "X86ISD::VZEXT_LOAD";
20177   case X86ISD::VZEXT:              return "X86ISD::VZEXT";
20178   case X86ISD::VSEXT:              return "X86ISD::VSEXT";
20179   case X86ISD::VTRUNC:             return "X86ISD::VTRUNC";
20180   case X86ISD::VTRUNCM:            return "X86ISD::VTRUNCM";
20181   case X86ISD::VINSERT:            return "X86ISD::VINSERT";
20182   case X86ISD::VFPEXT:             return "X86ISD::VFPEXT";
20183   case X86ISD::VFPROUND:           return "X86ISD::VFPROUND";
20184   case X86ISD::VSHLDQ:             return "X86ISD::VSHLDQ";
20185   case X86ISD::VSRLDQ:             return "X86ISD::VSRLDQ";
20186   case X86ISD::VSHL:               return "X86ISD::VSHL";
20187   case X86ISD::VSRL:               return "X86ISD::VSRL";
20188   case X86ISD::VSRA:               return "X86ISD::VSRA";
20189   case X86ISD::VSHLI:              return "X86ISD::VSHLI";
20190   case X86ISD::VSRLI:              return "X86ISD::VSRLI";
20191   case X86ISD::VSRAI:              return "X86ISD::VSRAI";
20192   case X86ISD::CMPP:               return "X86ISD::CMPP";
20193   case X86ISD::PCMPEQ:             return "X86ISD::PCMPEQ";
20194   case X86ISD::PCMPGT:             return "X86ISD::PCMPGT";
20195   case X86ISD::PCMPEQM:            return "X86ISD::PCMPEQM";
20196   case X86ISD::PCMPGTM:            return "X86ISD::PCMPGTM";
20197   case X86ISD::ADD:                return "X86ISD::ADD";
20198   case X86ISD::SUB:                return "X86ISD::SUB";
20199   case X86ISD::ADC:                return "X86ISD::ADC";
20200   case X86ISD::SBB:                return "X86ISD::SBB";
20201   case X86ISD::SMUL:               return "X86ISD::SMUL";
20202   case X86ISD::UMUL:               return "X86ISD::UMUL";
20203   case X86ISD::SMUL8:              return "X86ISD::SMUL8";
20204   case X86ISD::UMUL8:              return "X86ISD::UMUL8";
20205   case X86ISD::SDIVREM8_SEXT_HREG: return "X86ISD::SDIVREM8_SEXT_HREG";
20206   case X86ISD::UDIVREM8_ZEXT_HREG: return "X86ISD::UDIVREM8_ZEXT_HREG";
20207   case X86ISD::INC:                return "X86ISD::INC";
20208   case X86ISD::DEC:                return "X86ISD::DEC";
20209   case X86ISD::OR:                 return "X86ISD::OR";
20210   case X86ISD::XOR:                return "X86ISD::XOR";
20211   case X86ISD::AND:                return "X86ISD::AND";
20212   case X86ISD::BEXTR:              return "X86ISD::BEXTR";
20213   case X86ISD::MUL_IMM:            return "X86ISD::MUL_IMM";
20214   case X86ISD::PTEST:              return "X86ISD::PTEST";
20215   case X86ISD::TESTP:              return "X86ISD::TESTP";
20216   case X86ISD::TESTM:              return "X86ISD::TESTM";
20217   case X86ISD::TESTNM:             return "X86ISD::TESTNM";
20218   case X86ISD::KORTEST:            return "X86ISD::KORTEST";
20219   case X86ISD::PACKSS:             return "X86ISD::PACKSS";
20220   case X86ISD::PACKUS:             return "X86ISD::PACKUS";
20221   case X86ISD::PALIGNR:            return "X86ISD::PALIGNR";
20222   case X86ISD::VALIGN:             return "X86ISD::VALIGN";
20223   case X86ISD::PSHUFD:             return "X86ISD::PSHUFD";
20224   case X86ISD::PSHUFHW:            return "X86ISD::PSHUFHW";
20225   case X86ISD::PSHUFLW:            return "X86ISD::PSHUFLW";
20226   case X86ISD::SHUFP:              return "X86ISD::SHUFP";
20227   case X86ISD::MOVLHPS:            return "X86ISD::MOVLHPS";
20228   case X86ISD::MOVLHPD:            return "X86ISD::MOVLHPD";
20229   case X86ISD::MOVHLPS:            return "X86ISD::MOVHLPS";
20230   case X86ISD::MOVLPS:             return "X86ISD::MOVLPS";
20231   case X86ISD::MOVLPD:             return "X86ISD::MOVLPD";
20232   case X86ISD::MOVDDUP:            return "X86ISD::MOVDDUP";
20233   case X86ISD::MOVSHDUP:           return "X86ISD::MOVSHDUP";
20234   case X86ISD::MOVSLDUP:           return "X86ISD::MOVSLDUP";
20235   case X86ISD::MOVSD:              return "X86ISD::MOVSD";
20236   case X86ISD::MOVSS:              return "X86ISD::MOVSS";
20237   case X86ISD::UNPCKL:             return "X86ISD::UNPCKL";
20238   case X86ISD::UNPCKH:             return "X86ISD::UNPCKH";
20239   case X86ISD::VBROADCAST:         return "X86ISD::VBROADCAST";
20240   case X86ISD::VBROADCASTM:        return "X86ISD::VBROADCASTM";
20241   case X86ISD::VEXTRACT:           return "X86ISD::VEXTRACT";
20242   case X86ISD::VPERMILPI:          return "X86ISD::VPERMILPI";
20243   case X86ISD::VPERM2X128:         return "X86ISD::VPERM2X128";
20244   case X86ISD::VPERMV:             return "X86ISD::VPERMV";
20245   case X86ISD::VPERMV3:            return "X86ISD::VPERMV3";
20246   case X86ISD::VPERMIV3:           return "X86ISD::VPERMIV3";
20247   case X86ISD::VPERMI:             return "X86ISD::VPERMI";
20248   case X86ISD::PMULUDQ:            return "X86ISD::PMULUDQ";
20249   case X86ISD::PMULDQ:             return "X86ISD::PMULDQ";
20250   case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
20251   case X86ISD::VAARG_64:           return "X86ISD::VAARG_64";
20252   case X86ISD::WIN_ALLOCA:         return "X86ISD::WIN_ALLOCA";
20253   case X86ISD::MEMBARRIER:         return "X86ISD::MEMBARRIER";
20254   case X86ISD::SEG_ALLOCA:         return "X86ISD::SEG_ALLOCA";
20255   case X86ISD::WIN_FTOL:           return "X86ISD::WIN_FTOL";
20256   case X86ISD::SAHF:               return "X86ISD::SAHF";
20257   case X86ISD::RDRAND:             return "X86ISD::RDRAND";
20258   case X86ISD::RDSEED:             return "X86ISD::RDSEED";
20259   case X86ISD::FMADD:              return "X86ISD::FMADD";
20260   case X86ISD::FMSUB:              return "X86ISD::FMSUB";
20261   case X86ISD::FNMADD:             return "X86ISD::FNMADD";
20262   case X86ISD::FNMSUB:             return "X86ISD::FNMSUB";
20263   case X86ISD::FMADDSUB:           return "X86ISD::FMADDSUB";
20264   case X86ISD::FMSUBADD:           return "X86ISD::FMSUBADD";
20265   case X86ISD::PCMPESTRI:          return "X86ISD::PCMPESTRI";
20266   case X86ISD::PCMPISTRI:          return "X86ISD::PCMPISTRI";
20267   case X86ISD::XTEST:              return "X86ISD::XTEST";
20268   case X86ISD::COMPRESS:           return "X86ISD::COMPRESS";
20269   case X86ISD::EXPAND:             return "X86ISD::EXPAND";
20270   case X86ISD::SELECT:             return "X86ISD::SELECT";
20271   case X86ISD::ADDSUB:             return "X86ISD::ADDSUB";
20272   case X86ISD::RCP28:              return "X86ISD::RCP28";
20273   case X86ISD::RSQRT28:            return "X86ISD::RSQRT28";
20274   }
20275 }
20276
20277 // isLegalAddressingMode - Return true if the addressing mode represented
20278 // by AM is legal for this target, for a load/store of the specified type.
20279 bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM,
20280                                               Type *Ty) const {
20281   // X86 supports extremely general addressing modes.
20282   CodeModel::Model M = getTargetMachine().getCodeModel();
20283   Reloc::Model R = getTargetMachine().getRelocationModel();
20284
20285   // X86 allows a sign-extended 32-bit immediate field as a displacement.
20286   if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
20287     return false;
20288
20289   if (AM.BaseGV) {
20290     unsigned GVFlags =
20291       Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine());
20292
20293     // If a reference to this global requires an extra load, we can't fold it.
20294     if (isGlobalStubReference(GVFlags))
20295       return false;
20296
20297     // If BaseGV requires a register for the PIC base, we cannot also have a
20298     // BaseReg specified.
20299     if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
20300       return false;
20301
20302     // If lower 4G is not available, then we must use rip-relative addressing.
20303     if ((M != CodeModel::Small || R != Reloc::Static) &&
20304         Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1))
20305       return false;
20306   }
20307
20308   switch (AM.Scale) {
20309   case 0:
20310   case 1:
20311   case 2:
20312   case 4:
20313   case 8:
20314     // These scales always work.
20315     break;
20316   case 3:
20317   case 5:
20318   case 9:
20319     // These scales are formed with basereg+scalereg.  Only accept if there is
20320     // no basereg yet.
20321     if (AM.HasBaseReg)
20322       return false;
20323     break;
20324   default:  // Other stuff never works.
20325     return false;
20326   }
20327
20328   return true;
20329 }
20330
20331 bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
20332   unsigned Bits = Ty->getScalarSizeInBits();
20333
20334   // 8-bit shifts are always expensive, but versions with a scalar amount aren't
20335   // particularly cheaper than those without.
20336   if (Bits == 8)
20337     return false;
20338
20339   // On AVX2 there are new vpsllv[dq] instructions (and other shifts), that make
20340   // variable shifts just as cheap as scalar ones.
20341   if (Subtarget->hasInt256() && (Bits == 32 || Bits == 64))
20342     return false;
20343
20344   // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
20345   // fully general vector.
20346   return true;
20347 }
20348
20349 bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
20350   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
20351     return false;
20352   unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
20353   unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
20354   return NumBits1 > NumBits2;
20355 }
20356
20357 bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
20358   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
20359     return false;
20360
20361   if (!isTypeLegal(EVT::getEVT(Ty1)))
20362     return false;
20363
20364   assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
20365
20366   // Assuming the caller doesn't have a zeroext or signext return parameter,
20367   // truncation all the way down to i1 is valid.
20368   return true;
20369 }
20370
20371 bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
20372   return isInt<32>(Imm);
20373 }
20374
20375 bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
20376   // Can also use sub to handle negated immediates.
20377   return isInt<32>(Imm);
20378 }
20379
20380 bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
20381   if (!VT1.isInteger() || !VT2.isInteger())
20382     return false;
20383   unsigned NumBits1 = VT1.getSizeInBits();
20384   unsigned NumBits2 = VT2.getSizeInBits();
20385   return NumBits1 > NumBits2;
20386 }
20387
20388 bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
20389   // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
20390   return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit();
20391 }
20392
20393 bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
20394   // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
20395   return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit();
20396 }
20397
20398 bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
20399   EVT VT1 = Val.getValueType();
20400   if (isZExtFree(VT1, VT2))
20401     return true;
20402
20403   if (Val.getOpcode() != ISD::LOAD)
20404     return false;
20405
20406   if (!VT1.isSimple() || !VT1.isInteger() ||
20407       !VT2.isSimple() || !VT2.isInteger())
20408     return false;
20409
20410   switch (VT1.getSimpleVT().SimpleTy) {
20411   default: break;
20412   case MVT::i8:
20413   case MVT::i16:
20414   case MVT::i32:
20415     // X86 has 8, 16, and 32-bit zero-extending loads.
20416     return true;
20417   }
20418
20419   return false;
20420 }
20421
20422 bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const { return true; }
20423
20424 bool
20425 X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
20426   if (!(Subtarget->hasFMA() || Subtarget->hasFMA4()))
20427     return false;
20428
20429   VT = VT.getScalarType();
20430
20431   if (!VT.isSimple())
20432     return false;
20433
20434   switch (VT.getSimpleVT().SimpleTy) {
20435   case MVT::f32:
20436   case MVT::f64:
20437     return true;
20438   default:
20439     break;
20440   }
20441
20442   return false;
20443 }
20444
20445 bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
20446   // i16 instructions are longer (0x66 prefix) and potentially slower.
20447   return !(VT1 == MVT::i32 && VT2 == MVT::i16);
20448 }
20449
20450 /// isShuffleMaskLegal - Targets can use this to indicate that they only
20451 /// support *some* VECTOR_SHUFFLE operations, those with specific masks.
20452 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
20453 /// are assumed to be legal.
20454 bool
20455 X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
20456                                       EVT VT) const {
20457   if (!VT.isSimple())
20458     return false;
20459
20460   MVT SVT = VT.getSimpleVT();
20461
20462   // Very little shuffling can be done for 64-bit vectors right now.
20463   if (VT.getSizeInBits() == 64)
20464     return false;
20465
20466   // This is an experimental legality test that is tailored to match the
20467   // legality test of the experimental lowering more closely. They are gated
20468   // separately to ease testing of performance differences.
20469   if (ExperimentalVectorShuffleLegality)
20470     // We only care that the types being shuffled are legal. The lowering can
20471     // handle any possible shuffle mask that results.
20472     return isTypeLegal(SVT);
20473
20474   // If this is a single-input shuffle with no 128 bit lane crossings we can
20475   // lower it into pshufb.
20476   if ((SVT.is128BitVector() && Subtarget->hasSSSE3()) ||
20477       (SVT.is256BitVector() && Subtarget->hasInt256())) {
20478     bool isLegal = true;
20479     for (unsigned I = 0, E = M.size(); I != E; ++I) {
20480       if (M[I] >= (int)SVT.getVectorNumElements() ||
20481           ShuffleCrosses128bitLane(SVT, I, M[I])) {
20482         isLegal = false;
20483         break;
20484       }
20485     }
20486     if (isLegal)
20487       return true;
20488   }
20489
20490   // FIXME: blends, shifts.
20491   return (SVT.getVectorNumElements() == 2 ||
20492           ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
20493           isMOVLMask(M, SVT) ||
20494           isCommutedMOVLMask(M, SVT) ||
20495           isMOVHLPSMask(M, SVT) ||
20496           isSHUFPMask(M, SVT) ||
20497           isSHUFPMask(M, SVT, /* Commuted */ true) ||
20498           isPSHUFDMask(M, SVT) ||
20499           isPSHUFDMask(M, SVT, /* SecondOperand */ true) ||
20500           isPSHUFHWMask(M, SVT, Subtarget->hasInt256()) ||
20501           isPSHUFLWMask(M, SVT, Subtarget->hasInt256()) ||
20502           isPALIGNRMask(M, SVT, Subtarget) ||
20503           isUNPCKLMask(M, SVT, Subtarget->hasInt256()) ||
20504           isUNPCKHMask(M, SVT, Subtarget->hasInt256()) ||
20505           isUNPCKL_v_undef_Mask(M, SVT, Subtarget->hasInt256()) ||
20506           isUNPCKH_v_undef_Mask(M, SVT, Subtarget->hasInt256()) ||
20507           isBlendMask(M, SVT, Subtarget->hasSSE41(), Subtarget->hasInt256()) ||
20508           (Subtarget->hasSSE41() && isINSERTPSMask(M, SVT)));
20509 }
20510
20511 bool
20512 X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
20513                                           EVT VT) const {
20514   if (!VT.isSimple())
20515     return false;
20516
20517   MVT SVT = VT.getSimpleVT();
20518
20519   // This is an experimental legality test that is tailored to match the
20520   // legality test of the experimental lowering more closely. They are gated
20521   // separately to ease testing of performance differences.
20522   if (ExperimentalVectorShuffleLegality)
20523     // The new vector shuffle lowering is very good at managing zero-inputs.
20524     return isShuffleMaskLegal(Mask, VT);
20525
20526   unsigned NumElts = SVT.getVectorNumElements();
20527   // FIXME: This collection of masks seems suspect.
20528   if (NumElts == 2)
20529     return true;
20530   if (NumElts == 4 && SVT.is128BitVector()) {
20531     return (isMOVLMask(Mask, SVT)  ||
20532             isCommutedMOVLMask(Mask, SVT, true) ||
20533             isSHUFPMask(Mask, SVT) ||
20534             isSHUFPMask(Mask, SVT, /* Commuted */ true) ||
20535             isBlendMask(Mask, SVT, Subtarget->hasSSE41(),
20536                         Subtarget->hasInt256()));
20537   }
20538   return false;
20539 }
20540
20541 //===----------------------------------------------------------------------===//
20542 //                           X86 Scheduler Hooks
20543 //===----------------------------------------------------------------------===//
20544
20545 /// Utility function to emit xbegin specifying the start of an RTM region.
20546 static MachineBasicBlock *EmitXBegin(MachineInstr *MI, MachineBasicBlock *MBB,
20547                                      const TargetInstrInfo *TII) {
20548   DebugLoc DL = MI->getDebugLoc();
20549
20550   const BasicBlock *BB = MBB->getBasicBlock();
20551   MachineFunction::iterator I = MBB;
20552   ++I;
20553
20554   // For the v = xbegin(), we generate
20555   //
20556   // thisMBB:
20557   //  xbegin sinkMBB
20558   //
20559   // mainMBB:
20560   //  eax = -1
20561   //
20562   // sinkMBB:
20563   //  v = eax
20564
20565   MachineBasicBlock *thisMBB = MBB;
20566   MachineFunction *MF = MBB->getParent();
20567   MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
20568   MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
20569   MF->insert(I, mainMBB);
20570   MF->insert(I, sinkMBB);
20571
20572   // Transfer the remainder of BB and its successor edges to sinkMBB.
20573   sinkMBB->splice(sinkMBB->begin(), MBB,
20574                   std::next(MachineBasicBlock::iterator(MI)), MBB->end());
20575   sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
20576
20577   // thisMBB:
20578   //  xbegin sinkMBB
20579   //  # fallthrough to mainMBB
20580   //  # abortion to sinkMBB
20581   BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(sinkMBB);
20582   thisMBB->addSuccessor(mainMBB);
20583   thisMBB->addSuccessor(sinkMBB);
20584
20585   // mainMBB:
20586   //  EAX = -1
20587   BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), X86::EAX).addImm(-1);
20588   mainMBB->addSuccessor(sinkMBB);
20589
20590   // sinkMBB:
20591   // EAX is live into the sinkMBB
20592   sinkMBB->addLiveIn(X86::EAX);
20593   BuildMI(*sinkMBB, sinkMBB->begin(), DL,
20594           TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
20595     .addReg(X86::EAX);
20596
20597   MI->eraseFromParent();
20598   return sinkMBB;
20599 }
20600
20601 // FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
20602 // or XMM0_V32I8 in AVX all of this code can be replaced with that
20603 // in the .td file.
20604 static MachineBasicBlock *EmitPCMPSTRM(MachineInstr *MI, MachineBasicBlock *BB,
20605                                        const TargetInstrInfo *TII) {
20606   unsigned Opc;
20607   switch (MI->getOpcode()) {
20608   default: llvm_unreachable("illegal opcode!");
20609   case X86::PCMPISTRM128REG:  Opc = X86::PCMPISTRM128rr;  break;
20610   case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break;
20611   case X86::PCMPISTRM128MEM:  Opc = X86::PCMPISTRM128rm;  break;
20612   case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break;
20613   case X86::PCMPESTRM128REG:  Opc = X86::PCMPESTRM128rr;  break;
20614   case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break;
20615   case X86::PCMPESTRM128MEM:  Opc = X86::PCMPESTRM128rm;  break;
20616   case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break;
20617   }
20618
20619   DebugLoc dl = MI->getDebugLoc();
20620   MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
20621
20622   unsigned NumArgs = MI->getNumOperands();
20623   for (unsigned i = 1; i < NumArgs; ++i) {
20624     MachineOperand &Op = MI->getOperand(i);
20625     if (!(Op.isReg() && Op.isImplicit()))
20626       MIB.addOperand(Op);
20627   }
20628   if (MI->hasOneMemOperand())
20629     MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
20630
20631   BuildMI(*BB, MI, dl,
20632     TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
20633     .addReg(X86::XMM0);
20634
20635   MI->eraseFromParent();
20636   return BB;
20637 }
20638
20639 // FIXME: Custom handling because TableGen doesn't support multiple implicit
20640 // defs in an instruction pattern
20641 static MachineBasicBlock *EmitPCMPSTRI(MachineInstr *MI, MachineBasicBlock *BB,
20642                                        const TargetInstrInfo *TII) {
20643   unsigned Opc;
20644   switch (MI->getOpcode()) {
20645   default: llvm_unreachable("illegal opcode!");
20646   case X86::PCMPISTRIREG:  Opc = X86::PCMPISTRIrr;  break;
20647   case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break;
20648   case X86::PCMPISTRIMEM:  Opc = X86::PCMPISTRIrm;  break;
20649   case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break;
20650   case X86::PCMPESTRIREG:  Opc = X86::PCMPESTRIrr;  break;
20651   case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break;
20652   case X86::PCMPESTRIMEM:  Opc = X86::PCMPESTRIrm;  break;
20653   case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break;
20654   }
20655
20656   DebugLoc dl = MI->getDebugLoc();
20657   MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
20658
20659   unsigned NumArgs = MI->getNumOperands(); // remove the results
20660   for (unsigned i = 1; i < NumArgs; ++i) {
20661     MachineOperand &Op = MI->getOperand(i);
20662     if (!(Op.isReg() && Op.isImplicit()))
20663       MIB.addOperand(Op);
20664   }
20665   if (MI->hasOneMemOperand())
20666     MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
20667
20668   BuildMI(*BB, MI, dl,
20669     TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
20670     .addReg(X86::ECX);
20671
20672   MI->eraseFromParent();
20673   return BB;
20674 }
20675
20676 static MachineBasicBlock *EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB,
20677                                       const X86Subtarget *Subtarget) {
20678   DebugLoc dl = MI->getDebugLoc();
20679   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
20680   // Address into RAX/EAX, other two args into ECX, EDX.
20681   unsigned MemOpc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r;
20682   unsigned MemReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
20683   MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
20684   for (int i = 0; i < X86::AddrNumOperands; ++i)
20685     MIB.addOperand(MI->getOperand(i));
20686
20687   unsigned ValOps = X86::AddrNumOperands;
20688   BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
20689     .addReg(MI->getOperand(ValOps).getReg());
20690   BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
20691     .addReg(MI->getOperand(ValOps+1).getReg());
20692
20693   // The instruction doesn't actually take any operands though.
20694   BuildMI(*BB, MI, dl, TII->get(X86::MONITORrrr));
20695
20696   MI->eraseFromParent(); // The pseudo is gone now.
20697   return BB;
20698 }
20699
20700 MachineBasicBlock *
20701 X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr *MI,
20702                                                  MachineBasicBlock *MBB) const {
20703   // Emit va_arg instruction on X86-64.
20704
20705   // Operands to this pseudo-instruction:
20706   // 0  ) Output        : destination address (reg)
20707   // 1-5) Input         : va_list address (addr, i64mem)
20708   // 6  ) ArgSize       : Size (in bytes) of vararg type
20709   // 7  ) ArgMode       : 0=overflow only, 1=use gp_offset, 2=use fp_offset
20710   // 8  ) Align         : Alignment of type
20711   // 9  ) EFLAGS (implicit-def)
20712
20713   assert(MI->getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
20714   assert(X86::AddrNumOperands == 5 && "VAARG_64 assumes 5 address operands");
20715
20716   unsigned DestReg = MI->getOperand(0).getReg();
20717   MachineOperand &Base = MI->getOperand(1);
20718   MachineOperand &Scale = MI->getOperand(2);
20719   MachineOperand &Index = MI->getOperand(3);
20720   MachineOperand &Disp = MI->getOperand(4);
20721   MachineOperand &Segment = MI->getOperand(5);
20722   unsigned ArgSize = MI->getOperand(6).getImm();
20723   unsigned ArgMode = MI->getOperand(7).getImm();
20724   unsigned Align = MI->getOperand(8).getImm();
20725
20726   // Memory Reference
20727   assert(MI->hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
20728   MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
20729   MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
20730
20731   // Machine Information
20732   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
20733   MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
20734   const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
20735   const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
20736   DebugLoc DL = MI->getDebugLoc();
20737
20738   // struct va_list {
20739   //   i32   gp_offset
20740   //   i32   fp_offset
20741   //   i64   overflow_area (address)
20742   //   i64   reg_save_area (address)
20743   // }
20744   // sizeof(va_list) = 24
20745   // alignment(va_list) = 8
20746
20747   unsigned TotalNumIntRegs = 6;
20748   unsigned TotalNumXMMRegs = 8;
20749   bool UseGPOffset = (ArgMode == 1);
20750   bool UseFPOffset = (ArgMode == 2);
20751   unsigned MaxOffset = TotalNumIntRegs * 8 +
20752                        (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
20753
20754   /* Align ArgSize to a multiple of 8 */
20755   unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
20756   bool NeedsAlign = (Align > 8);
20757
20758   MachineBasicBlock *thisMBB = MBB;
20759   MachineBasicBlock *overflowMBB;
20760   MachineBasicBlock *offsetMBB;
20761   MachineBasicBlock *endMBB;
20762
20763   unsigned OffsetDestReg = 0;    // Argument address computed by offsetMBB
20764   unsigned OverflowDestReg = 0;  // Argument address computed by overflowMBB
20765   unsigned OffsetReg = 0;
20766
20767   if (!UseGPOffset && !UseFPOffset) {
20768     // If we only pull from the overflow region, we don't create a branch.
20769     // We don't need to alter control flow.
20770     OffsetDestReg = 0; // unused
20771     OverflowDestReg = DestReg;
20772
20773     offsetMBB = nullptr;
20774     overflowMBB = thisMBB;
20775     endMBB = thisMBB;
20776   } else {
20777     // First emit code to check if gp_offset (or fp_offset) is below the bound.
20778     // If so, pull the argument from reg_save_area. (branch to offsetMBB)
20779     // If not, pull from overflow_area. (branch to overflowMBB)
20780     //
20781     //       thisMBB
20782     //         |     .
20783     //         |        .
20784     //     offsetMBB   overflowMBB
20785     //         |        .
20786     //         |     .
20787     //        endMBB
20788
20789     // Registers for the PHI in endMBB
20790     OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
20791     OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
20792
20793     const BasicBlock *LLVM_BB = MBB->getBasicBlock();
20794     MachineFunction *MF = MBB->getParent();
20795     overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
20796     offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
20797     endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
20798
20799     MachineFunction::iterator MBBIter = MBB;
20800     ++MBBIter;
20801
20802     // Insert the new basic blocks
20803     MF->insert(MBBIter, offsetMBB);
20804     MF->insert(MBBIter, overflowMBB);
20805     MF->insert(MBBIter, endMBB);
20806
20807     // Transfer the remainder of MBB and its successor edges to endMBB.
20808     endMBB->splice(endMBB->begin(), thisMBB,
20809                    std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
20810     endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
20811
20812     // Make offsetMBB and overflowMBB successors of thisMBB
20813     thisMBB->addSuccessor(offsetMBB);
20814     thisMBB->addSuccessor(overflowMBB);
20815
20816     // endMBB is a successor of both offsetMBB and overflowMBB
20817     offsetMBB->addSuccessor(endMBB);
20818     overflowMBB->addSuccessor(endMBB);
20819
20820     // Load the offset value into a register
20821     OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
20822     BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
20823       .addOperand(Base)
20824       .addOperand(Scale)
20825       .addOperand(Index)
20826       .addDisp(Disp, UseFPOffset ? 4 : 0)
20827       .addOperand(Segment)
20828       .setMemRefs(MMOBegin, MMOEnd);
20829
20830     // Check if there is enough room left to pull this argument.
20831     BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
20832       .addReg(OffsetReg)
20833       .addImm(MaxOffset + 8 - ArgSizeA8);
20834
20835     // Branch to "overflowMBB" if offset >= max
20836     // Fall through to "offsetMBB" otherwise
20837     BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
20838       .addMBB(overflowMBB);
20839   }
20840
20841   // In offsetMBB, emit code to use the reg_save_area.
20842   if (offsetMBB) {
20843     assert(OffsetReg != 0);
20844
20845     // Read the reg_save_area address.
20846     unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
20847     BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
20848       .addOperand(Base)
20849       .addOperand(Scale)
20850       .addOperand(Index)
20851       .addDisp(Disp, 16)
20852       .addOperand(Segment)
20853       .setMemRefs(MMOBegin, MMOEnd);
20854
20855     // Zero-extend the offset
20856     unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
20857       BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
20858         .addImm(0)
20859         .addReg(OffsetReg)
20860         .addImm(X86::sub_32bit);
20861
20862     // Add the offset to the reg_save_area to get the final address.
20863     BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
20864       .addReg(OffsetReg64)
20865       .addReg(RegSaveReg);
20866
20867     // Compute the offset for the next argument
20868     unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
20869     BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
20870       .addReg(OffsetReg)
20871       .addImm(UseFPOffset ? 16 : 8);
20872
20873     // Store it back into the va_list.
20874     BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
20875       .addOperand(Base)
20876       .addOperand(Scale)
20877       .addOperand(Index)
20878       .addDisp(Disp, UseFPOffset ? 4 : 0)
20879       .addOperand(Segment)
20880       .addReg(NextOffsetReg)
20881       .setMemRefs(MMOBegin, MMOEnd);
20882
20883     // Jump to endMBB
20884     BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
20885       .addMBB(endMBB);
20886   }
20887
20888   //
20889   // Emit code to use overflow area
20890   //
20891
20892   // Load the overflow_area address into a register.
20893   unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
20894   BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
20895     .addOperand(Base)
20896     .addOperand(Scale)
20897     .addOperand(Index)
20898     .addDisp(Disp, 8)
20899     .addOperand(Segment)
20900     .setMemRefs(MMOBegin, MMOEnd);
20901
20902   // If we need to align it, do so. Otherwise, just copy the address
20903   // to OverflowDestReg.
20904   if (NeedsAlign) {
20905     // Align the overflow address
20906     assert((Align & (Align-1)) == 0 && "Alignment must be a power of 2");
20907     unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);
20908
20909     // aligned_addr = (addr + (align-1)) & ~(align-1)
20910     BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
20911       .addReg(OverflowAddrReg)
20912       .addImm(Align-1);
20913
20914     BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
20915       .addReg(TmpReg)
20916       .addImm(~(uint64_t)(Align-1));
20917   } else {
20918     BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
20919       .addReg(OverflowAddrReg);
20920   }
20921
20922   // Compute the next overflow address after this argument.
20923   // (the overflow address should be kept 8-byte aligned)
20924   unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
20925   BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
20926     .addReg(OverflowDestReg)
20927     .addImm(ArgSizeA8);
20928
20929   // Store the new overflow address.
20930   BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
20931     .addOperand(Base)
20932     .addOperand(Scale)
20933     .addOperand(Index)
20934     .addDisp(Disp, 8)
20935     .addOperand(Segment)
20936     .addReg(NextAddrReg)
20937     .setMemRefs(MMOBegin, MMOEnd);
20938
20939   // If we branched, emit the PHI to the front of endMBB.
20940   if (offsetMBB) {
20941     BuildMI(*endMBB, endMBB->begin(), DL,
20942             TII->get(X86::PHI), DestReg)
20943       .addReg(OffsetDestReg).addMBB(offsetMBB)
20944       .addReg(OverflowDestReg).addMBB(overflowMBB);
20945   }
20946
20947   // Erase the pseudo instruction
20948   MI->eraseFromParent();
20949
20950   return endMBB;
20951 }
20952
20953 MachineBasicBlock *
20954 X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
20955                                                  MachineInstr *MI,
20956                                                  MachineBasicBlock *MBB) const {
20957   // Emit code to save XMM registers to the stack. The ABI says that the
20958   // number of registers to save is given in %al, so it's theoretically
20959   // possible to do an indirect jump trick to avoid saving all of them,
20960   // however this code takes a simpler approach and just executes all
20961   // of the stores if %al is non-zero. It's less code, and it's probably
20962   // easier on the hardware branch predictor, and stores aren't all that
20963   // expensive anyway.
20964
20965   // Create the new basic blocks. One block contains all the XMM stores,
20966   // and one block is the final destination regardless of whether any
20967   // stores were performed.
20968   const BasicBlock *LLVM_BB = MBB->getBasicBlock();
20969   MachineFunction *F = MBB->getParent();
20970   MachineFunction::iterator MBBIter = MBB;
20971   ++MBBIter;
20972   MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
20973   MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
20974   F->insert(MBBIter, XMMSaveMBB);
20975   F->insert(MBBIter, EndMBB);
20976
20977   // Transfer the remainder of MBB and its successor edges to EndMBB.
20978   EndMBB->splice(EndMBB->begin(), MBB,
20979                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
20980   EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
20981
20982   // The original block will now fall through to the XMM save block.
20983   MBB->addSuccessor(XMMSaveMBB);
20984   // The XMMSaveMBB will fall through to the end block.
20985   XMMSaveMBB->addSuccessor(EndMBB);
20986
20987   // Now add the instructions.
20988   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
20989   DebugLoc DL = MI->getDebugLoc();
20990
20991   unsigned CountReg = MI->getOperand(0).getReg();
20992   int64_t RegSaveFrameIndex = MI->getOperand(1).getImm();
20993   int64_t VarArgsFPOffset = MI->getOperand(2).getImm();
20994
20995   if (!Subtarget->isTargetWin64()) {
20996     // If %al is 0, branch around the XMM save block.
20997     BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
20998     BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB);
20999     MBB->addSuccessor(EndMBB);
21000   }
21001
21002   // Make sure the last operand is EFLAGS, which gets clobbered by the branch
21003   // that was just emitted, but clearly shouldn't be "saved".
21004   assert((MI->getNumOperands() <= 3 ||
21005           !MI->getOperand(MI->getNumOperands() - 1).isReg() ||
21006           MI->getOperand(MI->getNumOperands() - 1).getReg() == X86::EFLAGS)
21007          && "Expected last argument to be EFLAGS");
21008   unsigned MOVOpc = Subtarget->hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr;
21009   // In the XMM save block, save all the XMM argument registers.
21010   for (int i = 3, e = MI->getNumOperands() - 1; i != e; ++i) {
21011     int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
21012     MachineMemOperand *MMO =
21013       F->getMachineMemOperand(
21014           MachinePointerInfo::getFixedStack(RegSaveFrameIndex, Offset),
21015         MachineMemOperand::MOStore,
21016         /*Size=*/16, /*Align=*/16);
21017     BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
21018       .addFrameIndex(RegSaveFrameIndex)
21019       .addImm(/*Scale=*/1)
21020       .addReg(/*IndexReg=*/0)
21021       .addImm(/*Disp=*/Offset)
21022       .addReg(/*Segment=*/0)
21023       .addReg(MI->getOperand(i).getReg())
21024       .addMemOperand(MMO);
21025   }
21026
21027   MI->eraseFromParent();   // The pseudo instruction is gone now.
21028
21029   return EndMBB;
21030 }
21031
21032 // The EFLAGS operand of SelectItr might be missing a kill marker
21033 // because there were multiple uses of EFLAGS, and ISel didn't know
21034 // which to mark. Figure out whether SelectItr should have had a
21035 // kill marker, and set it if it should. Returns the correct kill
21036 // marker value.
21037 static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
21038                                      MachineBasicBlock* BB,
21039                                      const TargetRegisterInfo* TRI) {
21040   // Scan forward through BB for a use/def of EFLAGS.
21041   MachineBasicBlock::iterator miI(std::next(SelectItr));
21042   for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
21043     const MachineInstr& mi = *miI;
21044     if (mi.readsRegister(X86::EFLAGS))
21045       return false;
21046     if (mi.definesRegister(X86::EFLAGS))
21047       break; // Should have kill-flag - update below.
21048   }
21049
21050   // If we hit the end of the block, check whether EFLAGS is live into a
21051   // successor.
21052   if (miI == BB->end()) {
21053     for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
21054                                           sEnd = BB->succ_end();
21055          sItr != sEnd; ++sItr) {
21056       MachineBasicBlock* succ = *sItr;
21057       if (succ->isLiveIn(X86::EFLAGS))
21058         return false;
21059     }
21060   }
21061
21062   // We found a def, or hit the end of the basic block and EFLAGS wasn't live
21063   // out. SelectMI should have a kill flag on EFLAGS.
21064   SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
21065   return true;
21066 }
21067
21068 MachineBasicBlock *
21069 X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
21070                                      MachineBasicBlock *BB) const {
21071   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
21072   DebugLoc DL = MI->getDebugLoc();
21073
21074   // To "insert" a SELECT_CC instruction, we actually have to insert the
21075   // diamond control-flow pattern.  The incoming instruction knows the
21076   // destination vreg to set, the condition code register to branch on, the
21077   // true/false values to select between, and a branch opcode to use.
21078   const BasicBlock *LLVM_BB = BB->getBasicBlock();
21079   MachineFunction::iterator It = BB;
21080   ++It;
21081
21082   //  thisMBB:
21083   //  ...
21084   //   TrueVal = ...
21085   //   cmpTY ccX, r1, r2
21086   //   bCC copy1MBB
21087   //   fallthrough --> copy0MBB
21088   MachineBasicBlock *thisMBB = BB;
21089   MachineFunction *F = BB->getParent();
21090   MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
21091   MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
21092   F->insert(It, copy0MBB);
21093   F->insert(It, sinkMBB);
21094
21095   // If the EFLAGS register isn't dead in the terminator, then claim that it's
21096   // live into the sink and copy blocks.
21097   const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
21098   if (!MI->killsRegister(X86::EFLAGS) &&
21099       !checkAndUpdateEFLAGSKill(MI, BB, TRI)) {
21100     copy0MBB->addLiveIn(X86::EFLAGS);
21101     sinkMBB->addLiveIn(X86::EFLAGS);
21102   }
21103
21104   // Transfer the remainder of BB and its successor edges to sinkMBB.
21105   sinkMBB->splice(sinkMBB->begin(), BB,
21106                   std::next(MachineBasicBlock::iterator(MI)), BB->end());
21107   sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
21108
21109   // Add the true and fallthrough blocks as its successors.
21110   BB->addSuccessor(copy0MBB);
21111   BB->addSuccessor(sinkMBB);
21112
21113   // Create the conditional branch instruction.
21114   unsigned Opc =
21115     X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm());
21116   BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);
21117
21118   //  copy0MBB:
21119   //   %FalseValue = ...
21120   //   # fallthrough to sinkMBB
21121   copy0MBB->addSuccessor(sinkMBB);
21122
21123   //  sinkMBB:
21124   //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
21125   //  ...
21126   BuildMI(*sinkMBB, sinkMBB->begin(), DL,
21127           TII->get(X86::PHI), MI->getOperand(0).getReg())
21128     .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
21129     .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
21130
21131   MI->eraseFromParent();   // The pseudo instruction is gone now.
21132   return sinkMBB;
21133 }
21134
21135 MachineBasicBlock *
21136 X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI,
21137                                         MachineBasicBlock *BB) const {
21138   MachineFunction *MF = BB->getParent();
21139   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
21140   DebugLoc DL = MI->getDebugLoc();
21141   const BasicBlock *LLVM_BB = BB->getBasicBlock();
21142
21143   assert(MF->shouldSplitStack());
21144
21145   const bool Is64Bit = Subtarget->is64Bit();
21146   const bool IsLP64 = Subtarget->isTarget64BitLP64();
21147
21148   const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
21149   const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
21150
21151   // BB:
21152   //  ... [Till the alloca]
21153   // If stacklet is not large enough, jump to mallocMBB
21154   //
21155   // bumpMBB:
21156   //  Allocate by subtracting from RSP
21157   //  Jump to continueMBB
21158   //
21159   // mallocMBB:
21160   //  Allocate by call to runtime
21161   //
21162   // continueMBB:
21163   //  ...
21164   //  [rest of original BB]
21165   //
21166
21167   MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
21168   MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
21169   MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
21170
21171   MachineRegisterInfo &MRI = MF->getRegInfo();
21172   const TargetRegisterClass *AddrRegClass =
21173     getRegClassFor(getPointerTy());
21174
21175   unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
21176     bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
21177     tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
21178     SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
21179     sizeVReg = MI->getOperand(1).getReg(),
21180     physSPReg = IsLP64 || Subtarget->isTargetNaCl64() ? X86::RSP : X86::ESP;
21181
21182   MachineFunction::iterator MBBIter = BB;
21183   ++MBBIter;
21184
21185   MF->insert(MBBIter, bumpMBB);
21186   MF->insert(MBBIter, mallocMBB);
21187   MF->insert(MBBIter, continueMBB);
21188
21189   continueMBB->splice(continueMBB->begin(), BB,
21190                       std::next(MachineBasicBlock::iterator(MI)), BB->end());
21191   continueMBB->transferSuccessorsAndUpdatePHIs(BB);
21192
21193   // Add code to the main basic block to check if the stack limit has been hit,
21194   // and if so, jump to mallocMBB otherwise to bumpMBB.
21195   BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
21196   BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
21197     .addReg(tmpSPVReg).addReg(sizeVReg);
21198   BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
21199     .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
21200     .addReg(SPLimitVReg);
21201   BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB);
21202
21203   // bumpMBB simply decreases the stack pointer, since we know the current
21204   // stacklet has enough space.
21205   BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
21206     .addReg(SPLimitVReg);
21207   BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
21208     .addReg(SPLimitVReg);
21209   BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
21210
21211   // Calls into a routine in libgcc to allocate more space from the heap.
21212   const uint32_t *RegMask =
21213       Subtarget->getRegisterInfo()->getCallPreservedMask(CallingConv::C);
21214   if (IsLP64) {
21215     BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
21216       .addReg(sizeVReg);
21217     BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
21218       .addExternalSymbol("__morestack_allocate_stack_space")
21219       .addRegMask(RegMask)
21220       .addReg(X86::RDI, RegState::Implicit)
21221       .addReg(X86::RAX, RegState::ImplicitDefine);
21222   } else if (Is64Bit) {
21223     BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
21224       .addReg(sizeVReg);
21225     BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
21226       .addExternalSymbol("__morestack_allocate_stack_space")
21227       .addRegMask(RegMask)
21228       .addReg(X86::EDI, RegState::Implicit)
21229       .addReg(X86::EAX, RegState::ImplicitDefine);
21230   } else {
21231     BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
21232       .addImm(12);
21233     BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
21234     BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
21235       .addExternalSymbol("__morestack_allocate_stack_space")
21236       .addRegMask(RegMask)
21237       .addReg(X86::EAX, RegState::ImplicitDefine);
21238   }
21239
21240   if (!Is64Bit)
21241     BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
21242       .addImm(16);
21243
21244   BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
21245     .addReg(IsLP64 ? X86::RAX : X86::EAX);
21246   BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
21247
21248   // Set up the CFG correctly.
21249   BB->addSuccessor(bumpMBB);
21250   BB->addSuccessor(mallocMBB);
21251   mallocMBB->addSuccessor(continueMBB);
21252   bumpMBB->addSuccessor(continueMBB);
21253
21254   // Take care of the PHI nodes.
21255   BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
21256           MI->getOperand(0).getReg())
21257     .addReg(mallocPtrVReg).addMBB(mallocMBB)
21258     .addReg(bumpSPPtrVReg).addMBB(bumpMBB);
21259
21260   // Delete the original pseudo instruction.
21261   MI->eraseFromParent();
21262
21263   // And we're done.
21264   return continueMBB;
21265 }
21266
21267 MachineBasicBlock *
21268 X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI,
21269                                         MachineBasicBlock *BB) const {
21270   DebugLoc DL = MI->getDebugLoc();
21271
21272   assert(!Subtarget->isTargetMachO());
21273
21274   X86FrameLowering::emitStackProbeCall(*BB->getParent(), *BB, MI, DL);
21275
21276   MI->eraseFromParent();   // The pseudo instruction is gone now.
21277   return BB;
21278 }
21279
21280 MachineBasicBlock *
21281 X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI,
21282                                       MachineBasicBlock *BB) const {
21283   // This is pretty easy.  We're taking the value that we received from
21284   // our load from the relocation, sticking it in either RDI (x86-64)
21285   // or EAX and doing an indirect call.  The return value will then
21286   // be in the normal return register.
21287   MachineFunction *F = BB->getParent();
21288   const X86InstrInfo *TII = Subtarget->getInstrInfo();
21289   DebugLoc DL = MI->getDebugLoc();
21290
21291   assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?");
21292   assert(MI->getOperand(3).isGlobal() && "This should be a global");
21293
21294   // Get a register mask for the lowered call.
21295   // FIXME: The 32-bit calls have non-standard calling conventions. Use a
21296   // proper register mask.
21297   const uint32_t *RegMask =
21298       Subtarget->getRegisterInfo()->getCallPreservedMask(CallingConv::C);
21299   if (Subtarget->is64Bit()) {
21300     MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
21301                                       TII->get(X86::MOV64rm), X86::RDI)
21302     .addReg(X86::RIP)
21303     .addImm(0).addReg(0)
21304     .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
21305                       MI->getOperand(3).getTargetFlags())
21306     .addReg(0);
21307     MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
21308     addDirectMem(MIB, X86::RDI);
21309     MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
21310   } else if (F->getTarget().getRelocationModel() != Reloc::PIC_) {
21311     MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
21312                                       TII->get(X86::MOV32rm), X86::EAX)
21313     .addReg(0)
21314     .addImm(0).addReg(0)
21315     .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
21316                       MI->getOperand(3).getTargetFlags())
21317     .addReg(0);
21318     MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
21319     addDirectMem(MIB, X86::EAX);
21320     MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
21321   } else {
21322     MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
21323                                       TII->get(X86::MOV32rm), X86::EAX)
21324     .addReg(TII->getGlobalBaseReg(F))
21325     .addImm(0).addReg(0)
21326     .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
21327                       MI->getOperand(3).getTargetFlags())
21328     .addReg(0);
21329     MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
21330     addDirectMem(MIB, X86::EAX);
21331     MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
21332   }
21333
21334   MI->eraseFromParent(); // The pseudo instruction is gone now.
21335   return BB;
21336 }
21337
21338 MachineBasicBlock *
21339 X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
21340                                     MachineBasicBlock *MBB) const {
21341   DebugLoc DL = MI->getDebugLoc();
21342   MachineFunction *MF = MBB->getParent();
21343   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
21344   MachineRegisterInfo &MRI = MF->getRegInfo();
21345
21346   const BasicBlock *BB = MBB->getBasicBlock();
21347   MachineFunction::iterator I = MBB;
21348   ++I;
21349
21350   // Memory Reference
21351   MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
21352   MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
21353
21354   unsigned DstReg;
21355   unsigned MemOpndSlot = 0;
21356
21357   unsigned CurOp = 0;
21358
21359   DstReg = MI->getOperand(CurOp++).getReg();
21360   const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
21361   assert(RC->hasType(MVT::i32) && "Invalid destination!");
21362   unsigned mainDstReg = MRI.createVirtualRegister(RC);
21363   unsigned restoreDstReg = MRI.createVirtualRegister(RC);
21364
21365   MemOpndSlot = CurOp;
21366
21367   MVT PVT = getPointerTy();
21368   assert((PVT == MVT::i64 || PVT == MVT::i32) &&
21369          "Invalid Pointer Size!");
21370
21371   // For v = setjmp(buf), we generate
21372   //
21373   // thisMBB:
21374   //  buf[LabelOffset] = restoreMBB
21375   //  SjLjSetup restoreMBB
21376   //
21377   // mainMBB:
21378   //  v_main = 0
21379   //
21380   // sinkMBB:
21381   //  v = phi(main, restore)
21382   //
21383   // restoreMBB:
21384   //  if base pointer being used, load it from frame
21385   //  v_restore = 1
21386
21387   MachineBasicBlock *thisMBB = MBB;
21388   MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
21389   MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
21390   MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
21391   MF->insert(I, mainMBB);
21392   MF->insert(I, sinkMBB);
21393   MF->push_back(restoreMBB);
21394
21395   MachineInstrBuilder MIB;
21396
21397   // Transfer the remainder of BB and its successor edges to sinkMBB.
21398   sinkMBB->splice(sinkMBB->begin(), MBB,
21399                   std::next(MachineBasicBlock::iterator(MI)), MBB->end());
21400   sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
21401
21402   // thisMBB:
21403   unsigned PtrStoreOpc = 0;
21404   unsigned LabelReg = 0;
21405   const int64_t LabelOffset = 1 * PVT.getStoreSize();
21406   Reloc::Model RM = MF->getTarget().getRelocationModel();
21407   bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
21408                      (RM == Reloc::Static || RM == Reloc::DynamicNoPIC);
21409
21410   // Prepare IP either in reg or imm.
21411   if (!UseImmLabel) {
21412     PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
21413     const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
21414     LabelReg = MRI.createVirtualRegister(PtrRC);
21415     if (Subtarget->is64Bit()) {
21416       MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
21417               .addReg(X86::RIP)
21418               .addImm(0)
21419               .addReg(0)
21420               .addMBB(restoreMBB)
21421               .addReg(0);
21422     } else {
21423       const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
21424       MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
21425               .addReg(XII->getGlobalBaseReg(MF))
21426               .addImm(0)
21427               .addReg(0)
21428               .addMBB(restoreMBB, Subtarget->ClassifyBlockAddressReference())
21429               .addReg(0);
21430     }
21431   } else
21432     PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
21433   // Store IP
21434   MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
21435   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
21436     if (i == X86::AddrDisp)
21437       MIB.addDisp(MI->getOperand(MemOpndSlot + i), LabelOffset);
21438     else
21439       MIB.addOperand(MI->getOperand(MemOpndSlot + i));
21440   }
21441   if (!UseImmLabel)
21442     MIB.addReg(LabelReg);
21443   else
21444     MIB.addMBB(restoreMBB);
21445   MIB.setMemRefs(MMOBegin, MMOEnd);
21446   // Setup
21447   MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
21448           .addMBB(restoreMBB);
21449
21450   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
21451   MIB.addRegMask(RegInfo->getNoPreservedMask());
21452   thisMBB->addSuccessor(mainMBB);
21453   thisMBB->addSuccessor(restoreMBB);
21454
21455   // mainMBB:
21456   //  EAX = 0
21457   BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
21458   mainMBB->addSuccessor(sinkMBB);
21459
21460   // sinkMBB:
21461   BuildMI(*sinkMBB, sinkMBB->begin(), DL,
21462           TII->get(X86::PHI), DstReg)
21463     .addReg(mainDstReg).addMBB(mainMBB)
21464     .addReg(restoreDstReg).addMBB(restoreMBB);
21465
21466   // restoreMBB:
21467   if (RegInfo->hasBasePointer(*MF)) {
21468     const bool Uses64BitFramePtr =
21469         Subtarget->isTarget64BitLP64() || Subtarget->isTargetNaCl64();
21470     X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
21471     X86FI->setRestoreBasePointer(MF);
21472     unsigned FramePtr = RegInfo->getFrameRegister(*MF);
21473     unsigned BasePtr = RegInfo->getBaseRegister();
21474     unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
21475     addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
21476                  FramePtr, true, X86FI->getRestoreBasePointerOffset())
21477       .setMIFlag(MachineInstr::FrameSetup);
21478   }
21479   BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
21480   BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
21481   restoreMBB->addSuccessor(sinkMBB);
21482
21483   MI->eraseFromParent();
21484   return sinkMBB;
21485 }
21486
21487 MachineBasicBlock *
21488 X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
21489                                      MachineBasicBlock *MBB) const {
21490   DebugLoc DL = MI->getDebugLoc();
21491   MachineFunction *MF = MBB->getParent();
21492   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
21493   MachineRegisterInfo &MRI = MF->getRegInfo();
21494
21495   // Memory Reference
21496   MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
21497   MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
21498
21499   MVT PVT = getPointerTy();
21500   assert((PVT == MVT::i64 || PVT == MVT::i32) &&
21501          "Invalid Pointer Size!");
21502
21503   const TargetRegisterClass *RC =
21504     (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
21505   unsigned Tmp = MRI.createVirtualRegister(RC);
21506   // Since FP is only updated here but NOT referenced, it's treated as GPR.
21507   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
21508   unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
21509   unsigned SP = RegInfo->getStackRegister();
21510
21511   MachineInstrBuilder MIB;
21512
21513   const int64_t LabelOffset = 1 * PVT.getStoreSize();
21514   const int64_t SPOffset = 2 * PVT.getStoreSize();
21515
21516   unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
21517   unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
21518
21519   // Reload FP
21520   MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP);
21521   for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
21522     MIB.addOperand(MI->getOperand(i));
21523   MIB.setMemRefs(MMOBegin, MMOEnd);
21524   // Reload IP
21525   MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
21526   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
21527     if (i == X86::AddrDisp)
21528       MIB.addDisp(MI->getOperand(i), LabelOffset);
21529     else
21530       MIB.addOperand(MI->getOperand(i));
21531   }
21532   MIB.setMemRefs(MMOBegin, MMOEnd);
21533   // Reload SP
21534   MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP);
21535   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
21536     if (i == X86::AddrDisp)
21537       MIB.addDisp(MI->getOperand(i), SPOffset);
21538     else
21539       MIB.addOperand(MI->getOperand(i));
21540   }
21541   MIB.setMemRefs(MMOBegin, MMOEnd);
21542   // Jump
21543   BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
21544
21545   MI->eraseFromParent();
21546   return MBB;
21547 }
21548
21549 // Replace 213-type (isel default) FMA3 instructions with 231-type for
21550 // accumulator loops. Writing back to the accumulator allows the coalescer
21551 // to remove extra copies in the loop.
21552 MachineBasicBlock *
21553 X86TargetLowering::emitFMA3Instr(MachineInstr *MI,
21554                                  MachineBasicBlock *MBB) const {
21555   MachineOperand &AddendOp = MI->getOperand(3);
21556
21557   // Bail out early if the addend isn't a register - we can't switch these.
21558   if (!AddendOp.isReg())
21559     return MBB;
21560
21561   MachineFunction &MF = *MBB->getParent();
21562   MachineRegisterInfo &MRI = MF.getRegInfo();
21563
21564   // Check whether the addend is defined by a PHI:
21565   assert(MRI.hasOneDef(AddendOp.getReg()) && "Multiple defs in SSA?");
21566   MachineInstr &AddendDef = *MRI.def_instr_begin(AddendOp.getReg());
21567   if (!AddendDef.isPHI())
21568     return MBB;
21569
21570   // Look for the following pattern:
21571   // loop:
21572   //   %addend = phi [%entry, 0], [%loop, %result]
21573   //   ...
21574   //   %result<tied1> = FMA213 %m2<tied0>, %m1, %addend
21575
21576   // Replace with:
21577   //   loop:
21578   //   %addend = phi [%entry, 0], [%loop, %result]
21579   //   ...
21580   //   %result<tied1> = FMA231 %addend<tied0>, %m1, %m2
21581
21582   for (unsigned i = 1, e = AddendDef.getNumOperands(); i < e; i += 2) {
21583     assert(AddendDef.getOperand(i).isReg());
21584     MachineOperand PHISrcOp = AddendDef.getOperand(i);
21585     MachineInstr &PHISrcInst = *MRI.def_instr_begin(PHISrcOp.getReg());
21586     if (&PHISrcInst == MI) {
21587       // Found a matching instruction.
21588       unsigned NewFMAOpc = 0;
21589       switch (MI->getOpcode()) {
21590         case X86::VFMADDPDr213r: NewFMAOpc = X86::VFMADDPDr231r; break;
21591         case X86::VFMADDPSr213r: NewFMAOpc = X86::VFMADDPSr231r; break;
21592         case X86::VFMADDSDr213r: NewFMAOpc = X86::VFMADDSDr231r; break;
21593         case X86::VFMADDSSr213r: NewFMAOpc = X86::VFMADDSSr231r; break;
21594         case X86::VFMSUBPDr213r: NewFMAOpc = X86::VFMSUBPDr231r; break;
21595         case X86::VFMSUBPSr213r: NewFMAOpc = X86::VFMSUBPSr231r; break;
21596         case X86::VFMSUBSDr213r: NewFMAOpc = X86::VFMSUBSDr231r; break;
21597         case X86::VFMSUBSSr213r: NewFMAOpc = X86::VFMSUBSSr231r; break;
21598         case X86::VFNMADDPDr213r: NewFMAOpc = X86::VFNMADDPDr231r; break;
21599         case X86::VFNMADDPSr213r: NewFMAOpc = X86::VFNMADDPSr231r; break;
21600         case X86::VFNMADDSDr213r: NewFMAOpc = X86::VFNMADDSDr231r; break;
21601         case X86::VFNMADDSSr213r: NewFMAOpc = X86::VFNMADDSSr231r; break;
21602         case X86::VFNMSUBPDr213r: NewFMAOpc = X86::VFNMSUBPDr231r; break;
21603         case X86::VFNMSUBPSr213r: NewFMAOpc = X86::VFNMSUBPSr231r; break;
21604         case X86::VFNMSUBSDr213r: NewFMAOpc = X86::VFNMSUBSDr231r; break;
21605         case X86::VFNMSUBSSr213r: NewFMAOpc = X86::VFNMSUBSSr231r; break;
21606         case X86::VFMADDSUBPDr213r: NewFMAOpc = X86::VFMADDSUBPDr231r; break;
21607         case X86::VFMADDSUBPSr213r: NewFMAOpc = X86::VFMADDSUBPSr231r; break;
21608         case X86::VFMSUBADDPDr213r: NewFMAOpc = X86::VFMSUBADDPDr231r; break;
21609         case X86::VFMSUBADDPSr213r: NewFMAOpc = X86::VFMSUBADDPSr231r; break;
21610
21611         case X86::VFMADDPDr213rY: NewFMAOpc = X86::VFMADDPDr231rY; break;
21612         case X86::VFMADDPSr213rY: NewFMAOpc = X86::VFMADDPSr231rY; break;
21613         case X86::VFMSUBPDr213rY: NewFMAOpc = X86::VFMSUBPDr231rY; break;
21614         case X86::VFMSUBPSr213rY: NewFMAOpc = X86::VFMSUBPSr231rY; break;
21615         case X86::VFNMADDPDr213rY: NewFMAOpc = X86::VFNMADDPDr231rY; break;
21616         case X86::VFNMADDPSr213rY: NewFMAOpc = X86::VFNMADDPSr231rY; break;
21617         case X86::VFNMSUBPDr213rY: NewFMAOpc = X86::VFNMSUBPDr231rY; break;
21618         case X86::VFNMSUBPSr213rY: NewFMAOpc = X86::VFNMSUBPSr231rY; break;
21619         case X86::VFMADDSUBPDr213rY: NewFMAOpc = X86::VFMADDSUBPDr231rY; break;
21620         case X86::VFMADDSUBPSr213rY: NewFMAOpc = X86::VFMADDSUBPSr231rY; break;
21621         case X86::VFMSUBADDPDr213rY: NewFMAOpc = X86::VFMSUBADDPDr231rY; break;
21622         case X86::VFMSUBADDPSr213rY: NewFMAOpc = X86::VFMSUBADDPSr231rY; break;
21623         default: llvm_unreachable("Unrecognized FMA variant.");
21624       }
21625
21626       const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
21627       MachineInstrBuilder MIB =
21628         BuildMI(MF, MI->getDebugLoc(), TII.get(NewFMAOpc))
21629         .addOperand(MI->getOperand(0))
21630         .addOperand(MI->getOperand(3))
21631         .addOperand(MI->getOperand(2))
21632         .addOperand(MI->getOperand(1));
21633       MBB->insert(MachineBasicBlock::iterator(MI), MIB);
21634       MI->eraseFromParent();
21635     }
21636   }
21637
21638   return MBB;
21639 }
21640
21641 MachineBasicBlock *
21642 X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
21643                                                MachineBasicBlock *BB) const {
21644   switch (MI->getOpcode()) {
21645   default: llvm_unreachable("Unexpected instr type to insert");
21646   case X86::TAILJMPd64:
21647   case X86::TAILJMPr64:
21648   case X86::TAILJMPm64:
21649   case X86::TAILJMPd64_REX:
21650   case X86::TAILJMPr64_REX:
21651   case X86::TAILJMPm64_REX:
21652     llvm_unreachable("TAILJMP64 would not be touched here.");
21653   case X86::TCRETURNdi64:
21654   case X86::TCRETURNri64:
21655   case X86::TCRETURNmi64:
21656     return BB;
21657   case X86::WIN_ALLOCA:
21658     return EmitLoweredWinAlloca(MI, BB);
21659   case X86::SEG_ALLOCA_32:
21660   case X86::SEG_ALLOCA_64:
21661     return EmitLoweredSegAlloca(MI, BB);
21662   case X86::TLSCall_32:
21663   case X86::TLSCall_64:
21664     return EmitLoweredTLSCall(MI, BB);
21665   case X86::CMOV_GR8:
21666   case X86::CMOV_FR32:
21667   case X86::CMOV_FR64:
21668   case X86::CMOV_V4F32:
21669   case X86::CMOV_V2F64:
21670   case X86::CMOV_V2I64:
21671   case X86::CMOV_V8F32:
21672   case X86::CMOV_V4F64:
21673   case X86::CMOV_V4I64:
21674   case X86::CMOV_V16F32:
21675   case X86::CMOV_V8F64:
21676   case X86::CMOV_V8I64:
21677   case X86::CMOV_GR16:
21678   case X86::CMOV_GR32:
21679   case X86::CMOV_RFP32:
21680   case X86::CMOV_RFP64:
21681   case X86::CMOV_RFP80:
21682     return EmitLoweredSelect(MI, BB);
21683
21684   case X86::FP32_TO_INT16_IN_MEM:
21685   case X86::FP32_TO_INT32_IN_MEM:
21686   case X86::FP32_TO_INT64_IN_MEM:
21687   case X86::FP64_TO_INT16_IN_MEM:
21688   case X86::FP64_TO_INT32_IN_MEM:
21689   case X86::FP64_TO_INT64_IN_MEM:
21690   case X86::FP80_TO_INT16_IN_MEM:
21691   case X86::FP80_TO_INT32_IN_MEM:
21692   case X86::FP80_TO_INT64_IN_MEM: {
21693     MachineFunction *F = BB->getParent();
21694     const TargetInstrInfo *TII = Subtarget->getInstrInfo();
21695     DebugLoc DL = MI->getDebugLoc();
21696
21697     // Change the floating point control register to use "round towards zero"
21698     // mode when truncating to an integer value.
21699     int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false);
21700     addFrameReference(BuildMI(*BB, MI, DL,
21701                               TII->get(X86::FNSTCW16m)), CWFrameIdx);
21702
21703     // Load the old value of the high byte of the control word...
21704     unsigned OldCW =
21705       F->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
21706     addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
21707                       CWFrameIdx);
21708
21709     // Set the high part to be round to zero...
21710     addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
21711       .addImm(0xC7F);
21712
21713     // Reload the modified control word now...
21714     addFrameReference(BuildMI(*BB, MI, DL,
21715                               TII->get(X86::FLDCW16m)), CWFrameIdx);
21716
21717     // Restore the memory image of control word to original value
21718     addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
21719       .addReg(OldCW);
21720
21721     // Get the X86 opcode to use.
21722     unsigned Opc;
21723     switch (MI->getOpcode()) {
21724     default: llvm_unreachable("illegal opcode!");
21725     case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
21726     case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
21727     case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
21728     case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
21729     case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
21730     case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
21731     case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
21732     case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
21733     case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
21734     }
21735
21736     X86AddressMode AM;
21737     MachineOperand &Op = MI->getOperand(0);
21738     if (Op.isReg()) {
21739       AM.BaseType = X86AddressMode::RegBase;
21740       AM.Base.Reg = Op.getReg();
21741     } else {
21742       AM.BaseType = X86AddressMode::FrameIndexBase;
21743       AM.Base.FrameIndex = Op.getIndex();
21744     }
21745     Op = MI->getOperand(1);
21746     if (Op.isImm())
21747       AM.Scale = Op.getImm();
21748     Op = MI->getOperand(2);
21749     if (Op.isImm())
21750       AM.IndexReg = Op.getImm();
21751     Op = MI->getOperand(3);
21752     if (Op.isGlobal()) {
21753       AM.GV = Op.getGlobal();
21754     } else {
21755       AM.Disp = Op.getImm();
21756     }
21757     addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
21758                       .addReg(MI->getOperand(X86::AddrNumOperands).getReg());
21759
21760     // Reload the original control word now.
21761     addFrameReference(BuildMI(*BB, MI, DL,
21762                               TII->get(X86::FLDCW16m)), CWFrameIdx);
21763
21764     MI->eraseFromParent();   // The pseudo instruction is gone now.
21765     return BB;
21766   }
21767     // String/text processing lowering.
21768   case X86::PCMPISTRM128REG:
21769   case X86::VPCMPISTRM128REG:
21770   case X86::PCMPISTRM128MEM:
21771   case X86::VPCMPISTRM128MEM:
21772   case X86::PCMPESTRM128REG:
21773   case X86::VPCMPESTRM128REG:
21774   case X86::PCMPESTRM128MEM:
21775   case X86::VPCMPESTRM128MEM:
21776     assert(Subtarget->hasSSE42() &&
21777            "Target must have SSE4.2 or AVX features enabled");
21778     return EmitPCMPSTRM(MI, BB, Subtarget->getInstrInfo());
21779
21780   // String/text processing lowering.
21781   case X86::PCMPISTRIREG:
21782   case X86::VPCMPISTRIREG:
21783   case X86::PCMPISTRIMEM:
21784   case X86::VPCMPISTRIMEM:
21785   case X86::PCMPESTRIREG:
21786   case X86::VPCMPESTRIREG:
21787   case X86::PCMPESTRIMEM:
21788   case X86::VPCMPESTRIMEM:
21789     assert(Subtarget->hasSSE42() &&
21790            "Target must have SSE4.2 or AVX features enabled");
21791     return EmitPCMPSTRI(MI, BB, Subtarget->getInstrInfo());
21792
21793   // Thread synchronization.
21794   case X86::MONITOR:
21795     return EmitMonitor(MI, BB, Subtarget);
21796
21797   // xbegin
21798   case X86::XBEGIN:
21799     return EmitXBegin(MI, BB, Subtarget->getInstrInfo());
21800
21801   case X86::VASTART_SAVE_XMM_REGS:
21802     return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
21803
21804   case X86::VAARG_64:
21805     return EmitVAARG64WithCustomInserter(MI, BB);
21806
21807   case X86::EH_SjLj_SetJmp32:
21808   case X86::EH_SjLj_SetJmp64:
21809     return emitEHSjLjSetJmp(MI, BB);
21810
21811   case X86::EH_SjLj_LongJmp32:
21812   case X86::EH_SjLj_LongJmp64:
21813     return emitEHSjLjLongJmp(MI, BB);
21814
21815   case TargetOpcode::STATEPOINT:
21816     // As an implementation detail, STATEPOINT shares the STACKMAP format at
21817     // this point in the process.  We diverge later.
21818     return emitPatchPoint(MI, BB);
21819
21820   case TargetOpcode::STACKMAP:
21821   case TargetOpcode::PATCHPOINT:
21822     return emitPatchPoint(MI, BB);
21823
21824   case X86::VFMADDPDr213r:
21825   case X86::VFMADDPSr213r:
21826   case X86::VFMADDSDr213r:
21827   case X86::VFMADDSSr213r:
21828   case X86::VFMSUBPDr213r:
21829   case X86::VFMSUBPSr213r:
21830   case X86::VFMSUBSDr213r:
21831   case X86::VFMSUBSSr213r:
21832   case X86::VFNMADDPDr213r:
21833   case X86::VFNMADDPSr213r:
21834   case X86::VFNMADDSDr213r:
21835   case X86::VFNMADDSSr213r:
21836   case X86::VFNMSUBPDr213r:
21837   case X86::VFNMSUBPSr213r:
21838   case X86::VFNMSUBSDr213r:
21839   case X86::VFNMSUBSSr213r:
21840   case X86::VFMADDSUBPDr213r:
21841   case X86::VFMADDSUBPSr213r:
21842   case X86::VFMSUBADDPDr213r:
21843   case X86::VFMSUBADDPSr213r:
21844   case X86::VFMADDPDr213rY:
21845   case X86::VFMADDPSr213rY:
21846   case X86::VFMSUBPDr213rY:
21847   case X86::VFMSUBPSr213rY:
21848   case X86::VFNMADDPDr213rY:
21849   case X86::VFNMADDPSr213rY:
21850   case X86::VFNMSUBPDr213rY:
21851   case X86::VFNMSUBPSr213rY:
21852   case X86::VFMADDSUBPDr213rY:
21853   case X86::VFMADDSUBPSr213rY:
21854   case X86::VFMSUBADDPDr213rY:
21855   case X86::VFMSUBADDPSr213rY:
21856     return emitFMA3Instr(MI, BB);
21857   }
21858 }
21859
21860 //===----------------------------------------------------------------------===//
21861 //                           X86 Optimization Hooks
21862 //===----------------------------------------------------------------------===//
21863
21864 void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
21865                                                       APInt &KnownZero,
21866                                                       APInt &KnownOne,
21867                                                       const SelectionDAG &DAG,
21868                                                       unsigned Depth) const {
21869   unsigned BitWidth = KnownZero.getBitWidth();
21870   unsigned Opc = Op.getOpcode();
21871   assert((Opc >= ISD::BUILTIN_OP_END ||
21872           Opc == ISD::INTRINSIC_WO_CHAIN ||
21873           Opc == ISD::INTRINSIC_W_CHAIN ||
21874           Opc == ISD::INTRINSIC_VOID) &&
21875          "Should use MaskedValueIsZero if you don't know whether Op"
21876          " is a target node!");
21877
21878   KnownZero = KnownOne = APInt(BitWidth, 0);   // Don't know anything.
21879   switch (Opc) {
21880   default: break;
21881   case X86ISD::ADD:
21882   case X86ISD::SUB:
21883   case X86ISD::ADC:
21884   case X86ISD::SBB:
21885   case X86ISD::SMUL:
21886   case X86ISD::UMUL:
21887   case X86ISD::INC:
21888   case X86ISD::DEC:
21889   case X86ISD::OR:
21890   case X86ISD::XOR:
21891   case X86ISD::AND:
21892     // These nodes' second result is a boolean.
21893     if (Op.getResNo() == 0)
21894       break;
21895     // Fallthrough
21896   case X86ISD::SETCC:
21897     KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
21898     break;
21899   case ISD::INTRINSIC_WO_CHAIN: {
21900     unsigned IntId = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
21901     unsigned NumLoBits = 0;
21902     switch (IntId) {
21903     default: break;
21904     case Intrinsic::x86_sse_movmsk_ps:
21905     case Intrinsic::x86_avx_movmsk_ps_256:
21906     case Intrinsic::x86_sse2_movmsk_pd:
21907     case Intrinsic::x86_avx_movmsk_pd_256:
21908     case Intrinsic::x86_mmx_pmovmskb:
21909     case Intrinsic::x86_sse2_pmovmskb_128:
21910     case Intrinsic::x86_avx2_pmovmskb: {
21911       // High bits of movmskp{s|d}, pmovmskb are known zero.
21912       switch (IntId) {
21913         default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
21914         case Intrinsic::x86_sse_movmsk_ps:      NumLoBits = 4; break;
21915         case Intrinsic::x86_avx_movmsk_ps_256:  NumLoBits = 8; break;
21916         case Intrinsic::x86_sse2_movmsk_pd:     NumLoBits = 2; break;
21917         case Intrinsic::x86_avx_movmsk_pd_256:  NumLoBits = 4; break;
21918         case Intrinsic::x86_mmx_pmovmskb:       NumLoBits = 8; break;
21919         case Intrinsic::x86_sse2_pmovmskb_128:  NumLoBits = 16; break;
21920         case Intrinsic::x86_avx2_pmovmskb:      NumLoBits = 32; break;
21921       }
21922       KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits);
21923       break;
21924     }
21925     }
21926     break;
21927   }
21928   }
21929 }
21930
21931 unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
21932   SDValue Op,
21933   const SelectionDAG &,
21934   unsigned Depth) const {
21935   // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
21936   if (Op.getOpcode() == X86ISD::SETCC_CARRY)
21937     return Op.getValueType().getScalarType().getSizeInBits();
21938
21939   // Fallback case.
21940   return 1;
21941 }
21942
21943 /// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the
21944 /// node is a GlobalAddress + offset.
21945 bool X86TargetLowering::isGAPlusOffset(SDNode *N,
21946                                        const GlobalValue* &GA,
21947                                        int64_t &Offset) const {
21948   if (N->getOpcode() == X86ISD::Wrapper) {
21949     if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
21950       GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
21951       Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
21952       return true;
21953     }
21954   }
21955   return TargetLowering::isGAPlusOffset(N, GA, Offset);
21956 }
21957
21958 /// isShuffleHigh128VectorInsertLow - Checks whether the shuffle node is the
21959 /// same as extracting the high 128-bit part of 256-bit vector and then
21960 /// inserting the result into the low part of a new 256-bit vector
21961 static bool isShuffleHigh128VectorInsertLow(ShuffleVectorSDNode *SVOp) {
21962   EVT VT = SVOp->getValueType(0);
21963   unsigned NumElems = VT.getVectorNumElements();
21964
21965   // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
21966   for (unsigned i = 0, j = NumElems/2; i != NumElems/2; ++i, ++j)
21967     if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
21968         SVOp->getMaskElt(j) >= 0)
21969       return false;
21970
21971   return true;
21972 }
21973
21974 /// isShuffleLow128VectorInsertHigh - Checks whether the shuffle node is the
21975 /// same as extracting the low 128-bit part of 256-bit vector and then
21976 /// inserting the result into the high part of a new 256-bit vector
21977 static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) {
21978   EVT VT = SVOp->getValueType(0);
21979   unsigned NumElems = VT.getVectorNumElements();
21980
21981   // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
21982   for (unsigned i = NumElems/2, j = 0; i != NumElems; ++i, ++j)
21983     if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
21984         SVOp->getMaskElt(j) >= 0)
21985       return false;
21986
21987   return true;
21988 }
21989
21990 /// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors.
21991 static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
21992                                         TargetLowering::DAGCombinerInfo &DCI,
21993                                         const X86Subtarget* Subtarget) {
21994   SDLoc dl(N);
21995   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
21996   SDValue V1 = SVOp->getOperand(0);
21997   SDValue V2 = SVOp->getOperand(1);
21998   EVT VT = SVOp->getValueType(0);
21999   unsigned NumElems = VT.getVectorNumElements();
22000
22001   if (V1.getOpcode() == ISD::CONCAT_VECTORS &&
22002       V2.getOpcode() == ISD::CONCAT_VECTORS) {
22003     //
22004     //                   0,0,0,...
22005     //                      |
22006     //    V      UNDEF    BUILD_VECTOR    UNDEF
22007     //     \      /           \           /
22008     //  CONCAT_VECTOR         CONCAT_VECTOR
22009     //         \                  /
22010     //          \                /
22011     //          RESULT: V + zero extended
22012     //
22013     if (V2.getOperand(0).getOpcode() != ISD::BUILD_VECTOR ||
22014         V2.getOperand(1).getOpcode() != ISD::UNDEF ||
22015         V1.getOperand(1).getOpcode() != ISD::UNDEF)
22016       return SDValue();
22017
22018     if (!ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode()))
22019       return SDValue();
22020
22021     // To match the shuffle mask, the first half of the mask should
22022     // be exactly the first vector, and all the rest a splat with the
22023     // first element of the second one.
22024     for (unsigned i = 0; i != NumElems/2; ++i)
22025       if (!isUndefOrEqual(SVOp->getMaskElt(i), i) ||
22026           !isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems))
22027         return SDValue();
22028
22029     // If V1 is coming from a vector load then just fold to a VZEXT_LOAD.
22030     if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(V1.getOperand(0))) {
22031       if (Ld->hasNUsesOfValue(1, 0)) {
22032         SDVTList Tys = DAG.getVTList(MVT::v4i64, MVT::Other);
22033         SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() };
22034         SDValue ResNode =
22035           DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
22036                                   Ld->getMemoryVT(),
22037                                   Ld->getPointerInfo(),
22038                                   Ld->getAlignment(),
22039                                   false/*isVolatile*/, true/*ReadMem*/,
22040                                   false/*WriteMem*/);
22041
22042         // Make sure the newly-created LOAD is in the same position as Ld in
22043         // terms of dependency. We create a TokenFactor for Ld and ResNode,
22044         // and update uses of Ld's output chain to use the TokenFactor.
22045         if (Ld->hasAnyUseOfValue(1)) {
22046           SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
22047                              SDValue(Ld, 1), SDValue(ResNode.getNode(), 1));
22048           DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
22049           DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(Ld, 1),
22050                                  SDValue(ResNode.getNode(), 1));
22051         }
22052
22053         return DAG.getNode(ISD::BITCAST, dl, VT, ResNode);
22054       }
22055     }
22056
22057     // Emit a zeroed vector and insert the desired subvector on its
22058     // first half.
22059     SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
22060     SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0), 0, DAG, dl);
22061     return DCI.CombineTo(N, InsV);
22062   }
22063
22064   //===--------------------------------------------------------------------===//
22065   // Combine some shuffles into subvector extracts and inserts:
22066   //
22067
22068   // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
22069   if (isShuffleHigh128VectorInsertLow(SVOp)) {
22070     SDValue V = Extract128BitVector(V1, NumElems/2, DAG, dl);
22071     SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, 0, DAG, dl);
22072     return DCI.CombineTo(N, InsV);
22073   }
22074
22075   // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
22076   if (isShuffleLow128VectorInsertHigh(SVOp)) {
22077     SDValue V = Extract128BitVector(V1, 0, DAG, dl);
22078     SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, NumElems/2, DAG, dl);
22079     return DCI.CombineTo(N, InsV);
22080   }
22081
22082   return SDValue();
22083 }
22084
22085 /// \brief Combine an arbitrary chain of shuffles into a single instruction if
22086 /// possible.
22087 ///
22088 /// This is the leaf of the recursive combinine below. When we have found some
22089 /// chain of single-use x86 shuffle instructions and accumulated the combined
22090 /// shuffle mask represented by them, this will try to pattern match that mask
22091 /// into either a single instruction if there is a special purpose instruction
22092 /// for this operation, or into a PSHUFB instruction which is a fully general
22093 /// instruction but should only be used to replace chains over a certain depth.
22094 static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
22095                                    int Depth, bool HasPSHUFB, SelectionDAG &DAG,
22096                                    TargetLowering::DAGCombinerInfo &DCI,
22097                                    const X86Subtarget *Subtarget) {
22098   assert(!Mask.empty() && "Cannot combine an empty shuffle mask!");
22099
22100   // Find the operand that enters the chain. Note that multiple uses are OK
22101   // here, we're not going to remove the operand we find.
22102   SDValue Input = Op.getOperand(0);
22103   while (Input.getOpcode() == ISD::BITCAST)
22104     Input = Input.getOperand(0);
22105
22106   MVT VT = Input.getSimpleValueType();
22107   MVT RootVT = Root.getSimpleValueType();
22108   SDLoc DL(Root);
22109
22110   // Just remove no-op shuffle masks.
22111   if (Mask.size() == 1) {
22112     DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Input),
22113                   /*AddTo*/ true);
22114     return true;
22115   }
22116
22117   // Use the float domain if the operand type is a floating point type.
22118   bool FloatDomain = VT.isFloatingPoint();
22119
22120   // For floating point shuffles, we don't have free copies in the shuffle
22121   // instructions or the ability to load as part of the instruction, so
22122   // canonicalize their shuffles to UNPCK or MOV variants.
22123   //
22124   // Note that even with AVX we prefer the PSHUFD form of shuffle for integer
22125   // vectors because it can have a load folded into it that UNPCK cannot. This
22126   // doesn't preclude something switching to the shorter encoding post-RA.
22127   if (FloatDomain) {
22128     if (Mask.equals(0, 0) || Mask.equals(1, 1)) {
22129       bool Lo = Mask.equals(0, 0);
22130       unsigned Shuffle;
22131       MVT ShuffleVT;
22132       // Check if we have SSE3 which will let us use MOVDDUP. That instruction
22133       // is no slower than UNPCKLPD but has the option to fold the input operand
22134       // into even an unaligned memory load.
22135       if (Lo && Subtarget->hasSSE3()) {
22136         Shuffle = X86ISD::MOVDDUP;
22137         ShuffleVT = MVT::v2f64;
22138       } else {
22139         // We have MOVLHPS and MOVHLPS throughout SSE and they encode smaller
22140         // than the UNPCK variants.
22141         Shuffle = Lo ? X86ISD::MOVLHPS : X86ISD::MOVHLPS;
22142         ShuffleVT = MVT::v4f32;
22143       }
22144       if (Depth == 1 && Root->getOpcode() == Shuffle)
22145         return false; // Nothing to do!
22146       Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
22147       DCI.AddToWorklist(Op.getNode());
22148       if (Shuffle == X86ISD::MOVDDUP)
22149         Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op);
22150       else
22151         Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
22152       DCI.AddToWorklist(Op.getNode());
22153       DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
22154                     /*AddTo*/ true);
22155       return true;
22156     }
22157     if (Subtarget->hasSSE3() &&
22158         (Mask.equals(0, 0, 2, 2) || Mask.equals(1, 1, 3, 3))) {
22159       bool Lo = Mask.equals(0, 0, 2, 2);
22160       unsigned Shuffle = Lo ? X86ISD::MOVSLDUP : X86ISD::MOVSHDUP;
22161       MVT ShuffleVT = MVT::v4f32;
22162       if (Depth == 1 && Root->getOpcode() == Shuffle)
22163         return false; // Nothing to do!
22164       Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
22165       DCI.AddToWorklist(Op.getNode());
22166       Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op);
22167       DCI.AddToWorklist(Op.getNode());
22168       DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
22169                     /*AddTo*/ true);
22170       return true;
22171     }
22172     if (Mask.equals(0, 0, 1, 1) || Mask.equals(2, 2, 3, 3)) {
22173       bool Lo = Mask.equals(0, 0, 1, 1);
22174       unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
22175       MVT ShuffleVT = MVT::v4f32;
22176       if (Depth == 1 && Root->getOpcode() == Shuffle)
22177         return false; // Nothing to do!
22178       Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
22179       DCI.AddToWorklist(Op.getNode());
22180       Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
22181       DCI.AddToWorklist(Op.getNode());
22182       DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
22183                     /*AddTo*/ true);
22184       return true;
22185     }
22186   }
22187
22188   // We always canonicalize the 8 x i16 and 16 x i8 shuffles into their UNPCK
22189   // variants as none of these have single-instruction variants that are
22190   // superior to the UNPCK formulation.
22191   if (!FloatDomain &&
22192       (Mask.equals(0, 0, 1, 1, 2, 2, 3, 3) ||
22193        Mask.equals(4, 4, 5, 5, 6, 6, 7, 7) ||
22194        Mask.equals(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7) ||
22195        Mask.equals(8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15,
22196                    15))) {
22197     bool Lo = Mask[0] == 0;
22198     unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
22199     if (Depth == 1 && Root->getOpcode() == Shuffle)
22200       return false; // Nothing to do!
22201     MVT ShuffleVT;
22202     switch (Mask.size()) {
22203     case 8:
22204       ShuffleVT = MVT::v8i16;
22205       break;
22206     case 16:
22207       ShuffleVT = MVT::v16i8;
22208       break;
22209     default:
22210       llvm_unreachable("Impossible mask size!");
22211     };
22212     Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
22213     DCI.AddToWorklist(Op.getNode());
22214     Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
22215     DCI.AddToWorklist(Op.getNode());
22216     DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
22217                   /*AddTo*/ true);
22218     return true;
22219   }
22220
22221   // Don't try to re-form single instruction chains under any circumstances now
22222   // that we've done encoding canonicalization for them.
22223   if (Depth < 2)
22224     return false;
22225
22226   // If we have 3 or more shuffle instructions or a chain involving PSHUFB, we
22227   // can replace them with a single PSHUFB instruction profitably. Intel's
22228   // manuals suggest only using PSHUFB if doing so replacing 5 instructions, but
22229   // in practice PSHUFB tends to be *very* fast so we're more aggressive.
22230   if ((Depth >= 3 || HasPSHUFB) && Subtarget->hasSSSE3()) {
22231     SmallVector<SDValue, 16> PSHUFBMask;
22232     assert(Mask.size() <= 16 && "Can't shuffle elements smaller than bytes!");
22233     int Ratio = 16 / Mask.size();
22234     for (unsigned i = 0; i < 16; ++i) {
22235       if (Mask[i / Ratio] == SM_SentinelUndef) {
22236         PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
22237         continue;
22238       }
22239       int M = Mask[i / Ratio] != SM_SentinelZero
22240                   ? Ratio * Mask[i / Ratio] + i % Ratio
22241                   : 255;
22242       PSHUFBMask.push_back(DAG.getConstant(M, MVT::i8));
22243     }
22244     Op = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Input);
22245     DCI.AddToWorklist(Op.getNode());
22246     SDValue PSHUFBMaskOp =
22247         DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, PSHUFBMask);
22248     DCI.AddToWorklist(PSHUFBMaskOp.getNode());
22249     Op = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, Op, PSHUFBMaskOp);
22250     DCI.AddToWorklist(Op.getNode());
22251     DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
22252                   /*AddTo*/ true);
22253     return true;
22254   }
22255
22256   // Failed to find any combines.
22257   return false;
22258 }
22259
22260 /// \brief Fully generic combining of x86 shuffle instructions.
22261 ///
22262 /// This should be the last combine run over the x86 shuffle instructions. Once
22263 /// they have been fully optimized, this will recursively consider all chains
22264 /// of single-use shuffle instructions, build a generic model of the cumulative
22265 /// shuffle operation, and check for simpler instructions which implement this
22266 /// operation. We use this primarily for two purposes:
22267 ///
22268 /// 1) Collapse generic shuffles to specialized single instructions when
22269 ///    equivalent. In most cases, this is just an encoding size win, but
22270 ///    sometimes we will collapse multiple generic shuffles into a single
22271 ///    special-purpose shuffle.
22272 /// 2) Look for sequences of shuffle instructions with 3 or more total
22273 ///    instructions, and replace them with the slightly more expensive SSSE3
22274 ///    PSHUFB instruction if available. We do this as the last combining step
22275 ///    to ensure we avoid using PSHUFB if we can implement the shuffle with
///    a suitable short sequence of other instructions. The PSHUFB will either
22277 ///    use a register or have to read from memory and so is slightly (but only
22278 ///    slightly) more expensive than the other shuffle instructions.
22279 ///
22280 /// Because this is inherently a quadratic operation (for each shuffle in
22281 /// a chain, we recurse up the chain), the depth is limited to 8 instructions.
22282 /// This should never be an issue in practice as the shuffle lowering doesn't
22283 /// produce sequences of more than 8 instructions.
22284 ///
22285 /// FIXME: We will currently miss some cases where the redundant shuffling
22286 /// would simplify under the threshold for PSHUFB formation because of
22287 /// combine-ordering. To fix this, we should do the redundant instruction
22288 /// combining in this recursive walk.
22289 static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,
22290                                           ArrayRef<int> RootMask,
22291                                           int Depth, bool HasPSHUFB,
22292                                           SelectionDAG &DAG,
22293                                           TargetLowering::DAGCombinerInfo &DCI,
22294                                           const X86Subtarget *Subtarget) {
22295   // Bound the depth of our recursive combine because this is ultimately
22296   // quadratic in nature.
22297   if (Depth > 8)
22298     return false;
22299
22300   // Directly rip through bitcasts to find the underlying operand.
22301   while (Op.getOpcode() == ISD::BITCAST && Op.getOperand(0).hasOneUse())
22302     Op = Op.getOperand(0);
22303
22304   MVT VT = Op.getSimpleValueType();
22305   if (!VT.isVector())
22306     return false; // Bail if we hit a non-vector.
22307   // FIXME: This routine should be taught about 256-bit shuffles, or a 256-bit
22308   // version should be added.
22309   if (VT.getSizeInBits() != 128)
22310     return false;
22311
22312   assert(Root.getSimpleValueType().isVector() &&
22313          "Shuffles operate on vector types!");
22314   assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() &&
22315          "Can only combine shuffles of the same vector register size.");
22316
22317   if (!isTargetShuffle(Op.getOpcode()))
22318     return false;
22319   SmallVector<int, 16> OpMask;
22320   bool IsUnary;
22321   bool HaveMask = getTargetShuffleMask(Op.getNode(), VT, OpMask, IsUnary);
22322   // We only can combine unary shuffles which we can decode the mask for.
22323   if (!HaveMask || !IsUnary)
22324     return false;
22325
22326   assert(VT.getVectorNumElements() == OpMask.size() &&
22327          "Different mask size from vector size!");
22328   assert(((RootMask.size() > OpMask.size() &&
22329            RootMask.size() % OpMask.size() == 0) ||
22330           (OpMask.size() > RootMask.size() &&
22331            OpMask.size() % RootMask.size() == 0) ||
22332           OpMask.size() == RootMask.size()) &&
22333          "The smaller number of elements must divide the larger.");
22334   int RootRatio = std::max<int>(1, OpMask.size() / RootMask.size());
22335   int OpRatio = std::max<int>(1, RootMask.size() / OpMask.size());
22336   assert(((RootRatio == 1 && OpRatio == 1) ||
22337           (RootRatio == 1) != (OpRatio == 1)) &&
22338          "Must not have a ratio for both incoming and op masks!");
22339
22340   SmallVector<int, 16> Mask;
22341   Mask.reserve(std::max(OpMask.size(), RootMask.size()));
22342
22343   // Merge this shuffle operation's mask into our accumulated mask. Note that
22344   // this shuffle's mask will be the first applied to the input, followed by the
22345   // root mask to get us all the way to the root value arrangement. The reason
22346   // for this order is that we are recursing up the operation chain.
22347   for (int i = 0, e = std::max(OpMask.size(), RootMask.size()); i < e; ++i) {
22348     int RootIdx = i / RootRatio;
22349     if (RootMask[RootIdx] < 0) {
22350       // This is a zero or undef lane, we're done.
22351       Mask.push_back(RootMask[RootIdx]);
22352       continue;
22353     }
22354
22355     int RootMaskedIdx = RootMask[RootIdx] * RootRatio + i % RootRatio;
22356     int OpIdx = RootMaskedIdx / OpRatio;
22357     if (OpMask[OpIdx] < 0) {
22358       // The incoming lanes are zero or undef, it doesn't matter which ones we
22359       // are using.
22360       Mask.push_back(OpMask[OpIdx]);
22361       continue;
22362     }
22363
22364     // Ok, we have non-zero lanes, map them through.
22365     Mask.push_back(OpMask[OpIdx] * OpRatio +
22366                    RootMaskedIdx % OpRatio);
22367   }
22368
22369   // See if we can recurse into the operand to combine more things.
22370   switch (Op.getOpcode()) {
22371     case X86ISD::PSHUFB:
22372       HasPSHUFB = true;
22373     case X86ISD::PSHUFD:
22374     case X86ISD::PSHUFHW:
22375     case X86ISD::PSHUFLW:
22376       if (Op.getOperand(0).hasOneUse() &&
22377           combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1,
22378                                         HasPSHUFB, DAG, DCI, Subtarget))
22379         return true;
22380       break;
22381
22382     case X86ISD::UNPCKL:
22383     case X86ISD::UNPCKH:
22384       assert(Op.getOperand(0) == Op.getOperand(1) && "We only combine unary shuffles!");
22385       // We can't check for single use, we have to check that this shuffle is the only user.
22386       if (Op->isOnlyUserOf(Op.getOperand(0).getNode()) &&
22387           combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1,
22388                                         HasPSHUFB, DAG, DCI, Subtarget))
22389           return true;
22390       break;
22391   }
22392
22393   // Minor canonicalization of the accumulated shuffle mask to make it easier
22394   // to match below. All this does is detect masks with squential pairs of
22395   // elements, and shrink them to the half-width mask. It does this in a loop
22396   // so it will reduce the size of the mask to the minimal width mask which
22397   // performs an equivalent shuffle.
22398   SmallVector<int, 16> WidenedMask;
22399   while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
22400     Mask = std::move(WidenedMask);
22401     WidenedMask.clear();
22402   }
22403
22404   return combineX86ShuffleChain(Op, Root, Mask, Depth, HasPSHUFB, DAG, DCI,
22405                                 Subtarget);
22406 }
22407
22408 /// \brief Get the PSHUF-style mask from PSHUF node.
22409 ///
22410 /// This is a very minor wrapper around getTargetShuffleMask to easy forming v4
22411 /// PSHUF-style masks that can be reused with such instructions.
22412 static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
22413   SmallVector<int, 4> Mask;
22414   bool IsUnary;
22415   bool HaveMask = getTargetShuffleMask(N.getNode(), N.getSimpleValueType(), Mask, IsUnary);
22416   (void)HaveMask;
22417   assert(HaveMask);
22418
22419   switch (N.getOpcode()) {
22420   case X86ISD::PSHUFD:
22421     return Mask;
22422   case X86ISD::PSHUFLW:
22423     Mask.resize(4);
22424     return Mask;
22425   case X86ISD::PSHUFHW:
22426     Mask.erase(Mask.begin(), Mask.begin() + 4);
22427     for (int &M : Mask)
22428       M -= 4;
22429     return Mask;
22430   default:
22431     llvm_unreachable("No valid shuffle instruction found!");
22432   }
22433 }
22434
22435 /// \brief Search for a combinable shuffle across a chain ending in pshufd.
22436 ///
22437 /// We walk up the chain and look for a combinable shuffle, skipping over
22438 /// shuffles that we could hoist this shuffle's transformation past without
22439 /// altering anything.
22440 static SDValue
22441 combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
22442                              SelectionDAG &DAG,
22443                              TargetLowering::DAGCombinerInfo &DCI) {
22444   assert(N.getOpcode() == X86ISD::PSHUFD &&
22445          "Called with something other than an x86 128-bit half shuffle!");
22446   SDLoc DL(N);
22447
22448   // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
22449   // of the shuffles in the chain so that we can form a fresh chain to replace
22450   // this one.
22451   SmallVector<SDValue, 8> Chain;
22452   SDValue V = N.getOperand(0);
22453   for (; V.hasOneUse(); V = V.getOperand(0)) {
22454     switch (V.getOpcode()) {
22455     default:
22456       return SDValue(); // Nothing combined!
22457
22458     case ISD::BITCAST:
22459       // Skip bitcasts as we always know the type for the target specific
22460       // instructions.
22461       continue;
22462
22463     case X86ISD::PSHUFD:
22464       // Found another dword shuffle.
22465       break;
22466
22467     case X86ISD::PSHUFLW:
22468       // Check that the low words (being shuffled) are the identity in the
22469       // dword shuffle, and the high words are self-contained.
22470       if (Mask[0] != 0 || Mask[1] != 1 ||
22471           !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
22472         return SDValue();
22473
22474       Chain.push_back(V);
22475       continue;
22476
22477     case X86ISD::PSHUFHW:
22478       // Check that the high words (being shuffled) are the identity in the
22479       // dword shuffle, and the low words are self-contained.
22480       if (Mask[2] != 2 || Mask[3] != 3 ||
22481           !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
22482         return SDValue();
22483
22484       Chain.push_back(V);
22485       continue;
22486
22487     case X86ISD::UNPCKL:
22488     case X86ISD::UNPCKH:
22489       // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
22490       // shuffle into a preceding word shuffle.
22491       if (V.getValueType() != MVT::v16i8 && V.getValueType() != MVT::v8i16)
22492         return SDValue();
22493
22494       // Search for a half-shuffle which we can combine with.
22495       unsigned CombineOp =
22496           V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
22497       if (V.getOperand(0) != V.getOperand(1) ||
22498           !V->isOnlyUserOf(V.getOperand(0).getNode()))
22499         return SDValue();
22500       Chain.push_back(V);
22501       V = V.getOperand(0);
22502       do {
22503         switch (V.getOpcode()) {
22504         default:
22505           return SDValue(); // Nothing to combine.
22506
22507         case X86ISD::PSHUFLW:
22508         case X86ISD::PSHUFHW:
22509           if (V.getOpcode() == CombineOp)
22510             break;
22511
22512           Chain.push_back(V);
22513
22514           // Fallthrough!
22515         case ISD::BITCAST:
22516           V = V.getOperand(0);
22517           continue;
22518         }
22519         break;
22520       } while (V.hasOneUse());
22521       break;
22522     }
22523     // Break out of the loop if we break out of the switch.
22524     break;
22525   }
22526
22527   if (!V.hasOneUse())
22528     // We fell out of the loop without finding a viable combining instruction.
22529     return SDValue();
22530
22531   // Merge this node's mask and our incoming mask.
22532   SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
22533   for (int &M : Mask)
22534     M = VMask[M];
22535   V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
22536                   getV4X86ShuffleImm8ForMask(Mask, DAG));
22537
22538   // Rebuild the chain around this new shuffle.
22539   while (!Chain.empty()) {
22540     SDValue W = Chain.pop_back_val();
22541
22542     if (V.getValueType() != W.getOperand(0).getValueType())
22543       V = DAG.getNode(ISD::BITCAST, DL, W.getOperand(0).getValueType(), V);
22544
22545     switch (W.getOpcode()) {
22546     default:
22547       llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
22548
22549     case X86ISD::UNPCKL:
22550     case X86ISD::UNPCKH:
22551       V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
22552       break;
22553
22554     case X86ISD::PSHUFD:
22555     case X86ISD::PSHUFLW:
22556     case X86ISD::PSHUFHW:
22557       V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
22558       break;
22559     }
22560   }
22561   if (V.getValueType() != N.getValueType())
22562     V = DAG.getNode(ISD::BITCAST, DL, N.getValueType(), V);
22563
22564   // Return the new chain to replace N.
22565   return V;
22566 }
22567
22568 /// \brief Search for a combinable shuffle across a chain ending in pshuflw or pshufhw.
22569 ///
22570 /// We walk up the chain, skipping shuffles of the other half and looking
22571 /// through shuffles which switch halves trying to find a shuffle of the same
22572 /// pair of dwords.
22573 static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask,
22574                                         SelectionDAG &DAG,
22575                                         TargetLowering::DAGCombinerInfo &DCI) {
22576   assert(
22577       (N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) &&
22578       "Called with something other than an x86 128-bit half shuffle!");
22579   SDLoc DL(N);
22580   unsigned CombineOpcode = N.getOpcode();
22581
22582   // Walk up a single-use chain looking for a combinable shuffle.
22583   SDValue V = N.getOperand(0);
22584   for (; V.hasOneUse(); V = V.getOperand(0)) {
22585     switch (V.getOpcode()) {
22586     default:
22587       return false; // Nothing combined!
22588
22589     case ISD::BITCAST:
22590       // Skip bitcasts as we always know the type for the target specific
22591       // instructions.
22592       continue;
22593
22594     case X86ISD::PSHUFLW:
22595     case X86ISD::PSHUFHW:
22596       if (V.getOpcode() == CombineOpcode)
22597         break;
22598
22599       // Other-half shuffles are no-ops.
22600       continue;
22601     }
22602     // Break out of the loop if we break out of the switch.
22603     break;
22604   }
22605
22606   if (!V.hasOneUse())
22607     // We fell out of the loop without finding a viable combining instruction.
22608     return false;
22609
22610   // Combine away the bottom node as its shuffle will be accumulated into
22611   // a preceding shuffle.
22612   DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
22613
22614   // Record the old value.
22615   SDValue Old = V;
22616
22617   // Merge this node's mask and our incoming mask (adjusted to account for all
22618   // the pshufd instructions encountered).
22619   SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
22620   for (int &M : Mask)
22621     M = VMask[M];
22622   V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0),
22623                   getV4X86ShuffleImm8ForMask(Mask, DAG));
22624
22625   // Check that the shuffles didn't cancel each other out. If not, we need to
22626   // combine to the new one.
22627   if (Old != V)
22628     // Replace the combinable shuffle with the combined one, updating all users
22629     // so that we re-evaluate the chain here.
22630     DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true);
22631
22632   return true;
22633 }
22634
22635 /// \brief Try to combine x86 target specific shuffles.
22636 static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,
22637                                            TargetLowering::DAGCombinerInfo &DCI,
22638                                            const X86Subtarget *Subtarget) {
22639   SDLoc DL(N);
22640   MVT VT = N.getSimpleValueType();
22641   SmallVector<int, 4> Mask;
22642
22643   switch (N.getOpcode()) {
22644   case X86ISD::PSHUFD:
22645   case X86ISD::PSHUFLW:
22646   case X86ISD::PSHUFHW:
22647     Mask = getPSHUFShuffleMask(N);
22648     assert(Mask.size() == 4);
22649     break;
22650   default:
22651     return SDValue();
22652   }
22653
22654   // Nuke no-op shuffles that show up after combining.
22655   if (isNoopShuffleMask(Mask))
22656     return DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
22657
22658   // Look for simplifications involving one or two shuffle instructions.
22659   SDValue V = N.getOperand(0);
22660   switch (N.getOpcode()) {
22661   default:
22662     break;
22663   case X86ISD::PSHUFLW:
22664   case X86ISD::PSHUFHW:
22665     assert(VT == MVT::v8i16);
22666     (void)VT;
22667
22668     if (combineRedundantHalfShuffle(N, Mask, DAG, DCI))
22669       return SDValue(); // We combined away this shuffle, so we're done.
22670
22671     // See if this reduces to a PSHUFD which is no more expensive and can
22672     // combine with more operations. Note that it has to at least flip the
22673     // dwords as otherwise it would have been removed as a no-op.
22674     if (Mask[0] == 2 && Mask[1] == 3 && Mask[2] == 0 && Mask[3] == 1) {
22675       int DMask[] = {0, 1, 2, 3};
22676       int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
22677       DMask[DOffset + 0] = DOffset + 1;
22678       DMask[DOffset + 1] = DOffset + 0;
22679       V = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V);
22680       DCI.AddToWorklist(V.getNode());
22681       V = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V,
22682                       getV4X86ShuffleImm8ForMask(DMask, DAG));
22683       DCI.AddToWorklist(V.getNode());
22684       return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V);
22685     }
22686
22687     // Look for shuffle patterns which can be implemented as a single unpack.
22688     // FIXME: This doesn't handle the location of the PSHUFD generically, and
22689     // only works when we have a PSHUFD followed by two half-shuffles.
22690     if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
22691         (V.getOpcode() == X86ISD::PSHUFLW ||
22692          V.getOpcode() == X86ISD::PSHUFHW) &&
22693         V.getOpcode() != N.getOpcode() &&
22694         V.hasOneUse()) {
22695       SDValue D = V.getOperand(0);
22696       while (D.getOpcode() == ISD::BITCAST && D.hasOneUse())
22697         D = D.getOperand(0);
22698       if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {
22699         SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
22700         SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
22701         int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
22702         int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
22703         int WordMask[8];
22704         for (int i = 0; i < 4; ++i) {
22705           WordMask[i + NOffset] = Mask[i] + NOffset;
22706           WordMask[i + VOffset] = VMask[i] + VOffset;
22707         }
22708         // Map the word mask through the DWord mask.
22709         int MappedMask[8];
22710         for (int i = 0; i < 8; ++i)
22711           MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
22712         const int UnpackLoMask[] = {0, 0, 1, 1, 2, 2, 3, 3};
22713         const int UnpackHiMask[] = {4, 4, 5, 5, 6, 6, 7, 7};
22714         if (std::equal(std::begin(MappedMask), std::end(MappedMask),
22715                        std::begin(UnpackLoMask)) ||
22716             std::equal(std::begin(MappedMask), std::end(MappedMask),
22717                        std::begin(UnpackHiMask))) {
22718           // We can replace all three shuffles with an unpack.
22719           V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, D.getOperand(0));
22720           DCI.AddToWorklist(V.getNode());
22721           return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
22722                                                 : X86ISD::UNPCKH,
22723                              DL, MVT::v8i16, V, V);
22724         }
22725       }
22726     }
22727
22728     break;
22729
22730   case X86ISD::PSHUFD:
22731     if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG, DCI))
22732       return NewN;
22733
22734     break;
22735   }
22736
22737   return SDValue();
22738 }
22739
22740 /// \brief Try to combine a shuffle into a target-specific add-sub node.
22741 ///
22742 /// We combine this directly on the abstract vector shuffle nodes so it is
22743 /// easier to generically match. We also insert dummy vector shuffle nodes for
22744 /// the operands which explicitly discard the lanes which are unused by this
22745 /// operation to try to flow through the rest of the combiner the fact that
22746 /// they're unused.
22747 static SDValue combineShuffleToAddSub(SDNode *N, SelectionDAG &DAG) {
22748   SDLoc DL(N);
22749   EVT VT = N->getValueType(0);
22750
22751   // We only handle target-independent shuffles.
22752   // FIXME: It would be easy and harmless to use the target shuffle mask
22753   // extraction tool to support more.
22754   if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
22755     return SDValue();
22756
22757   auto *SVN = cast<ShuffleVectorSDNode>(N);
22758   ArrayRef<int> Mask = SVN->getMask();
22759   SDValue V1 = N->getOperand(0);
22760   SDValue V2 = N->getOperand(1);
22761
22762   // We require the first shuffle operand to be the SUB node, and the second to
22763   // be the ADD node.
22764   // FIXME: We should support the commuted patterns.
22765   if (V1->getOpcode() != ISD::FSUB || V2->getOpcode() != ISD::FADD)
22766     return SDValue();
22767
22768   // If there are other uses of these operations we can't fold them.
22769   if (!V1->hasOneUse() || !V2->hasOneUse())
22770     return SDValue();
22771
22772   // Ensure that both operations have the same operands. Note that we can
22773   // commute the FADD operands.
22774   SDValue LHS = V1->getOperand(0), RHS = V1->getOperand(1);
22775   if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
22776       (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
22777     return SDValue();
22778
22779   // We're looking for blends between FADD and FSUB nodes. We insist on these
22780   // nodes being lined up in a specific expected pattern.
22781   if (!(isShuffleEquivalent(Mask, 0, 3) ||
22782         isShuffleEquivalent(Mask, 0, 5, 2, 7) ||
22783         isShuffleEquivalent(Mask, 0, 9, 2, 11, 4, 13, 6, 15)))
22784     return SDValue();
22785
22786   // Only specific types are legal at this point, assert so we notice if and
22787   // when these change.
22788   assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v8f32 ||
22789           VT == MVT::v4f64) &&
22790          "Unknown vector type encountered!");
22791
22792   return DAG.getNode(X86ISD::ADDSUB, DL, VT, LHS, RHS);
22793 }
22794
22795 /// PerformShuffleCombine - Performs several different shuffle combines.
22796 static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
22797                                      TargetLowering::DAGCombinerInfo &DCI,
22798                                      const X86Subtarget *Subtarget) {
22799   SDLoc dl(N);
22800   SDValue N0 = N->getOperand(0);
22801   SDValue N1 = N->getOperand(1);
22802   EVT VT = N->getValueType(0);
22803
22804   // Don't create instructions with illegal types after legalize types has run.
22805   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
22806   if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType()))
22807     return SDValue();
22808
22809   // If we have legalized the vector types, look for blends of FADD and FSUB
22810   // nodes that we can fuse into an ADDSUB node.
22811   if (TLI.isTypeLegal(VT) && Subtarget->hasSSE3())
22812     if (SDValue AddSub = combineShuffleToAddSub(N, DAG))
22813       return AddSub;
22814
22815   // Combine 256-bit vector shuffles. This is only profitable when in AVX mode
22816   if (Subtarget->hasFp256() && VT.is256BitVector() &&
22817       N->getOpcode() == ISD::VECTOR_SHUFFLE)
22818     return PerformShuffleCombine256(N, DAG, DCI, Subtarget);
22819
22820   // During Type Legalization, when promoting illegal vector types,
22821   // the backend might introduce new shuffle dag nodes and bitcasts.
22822   //
22823   // This code performs the following transformation:
22824   // fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) ->
22825   //       (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>)
22826   //
22827   // We do this only if both the bitcast and the BINOP dag nodes have
22828   // one use. Also, perform this transformation only if the new binary
22829   // operation is legal. This is to avoid introducing dag nodes that
22830   // potentially need to be further expanded (or custom lowered) into a
22831   // less optimal sequence of dag nodes.
22832   if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() &&
22833       N1.getOpcode() == ISD::UNDEF && N0.hasOneUse() &&
22834       N0.getOpcode() == ISD::BITCAST) {
22835     SDValue BC0 = N0.getOperand(0);
22836     EVT SVT = BC0.getValueType();
22837     unsigned Opcode = BC0.getOpcode();
22838     unsigned NumElts = VT.getVectorNumElements();
22839
22840     if (BC0.hasOneUse() && SVT.isVector() &&
22841         SVT.getVectorNumElements() * 2 == NumElts &&
22842         TLI.isOperationLegal(Opcode, VT)) {
22843       bool CanFold = false;
22844       switch (Opcode) {
22845       default : break;
22846       case ISD::ADD :
22847       case ISD::FADD :
22848       case ISD::SUB :
22849       case ISD::FSUB :
22850       case ISD::MUL :
22851       case ISD::FMUL :
22852         CanFold = true;
22853       }
22854
22855       unsigned SVTNumElts = SVT.getVectorNumElements();
22856       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
22857       for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i)
22858         CanFold = SVOp->getMaskElt(i) == (int)(i * 2);
22859       for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i)
22860         CanFold = SVOp->getMaskElt(i) < 0;
22861
22862       if (CanFold) {
22863         SDValue BC00 = DAG.getNode(ISD::BITCAST, dl, VT, BC0.getOperand(0));
22864         SDValue BC01 = DAG.getNode(ISD::BITCAST, dl, VT, BC0.getOperand(1));
22865         SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01);
22866         return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, &SVOp->getMask()[0]);
22867       }
22868     }
22869   }
22870
22871   // Only handle 128 wide vector from here on.
22872   if (!VT.is128BitVector())
22873     return SDValue();
22874
22875   // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
22876   // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
22877   // consecutive, non-overlapping, and in the right order.
22878   SmallVector<SDValue, 16> Elts;
22879   for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
22880     Elts.push_back(getShuffleScalarElt(N, i, DAG, 0));
22881
22882   SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true);
22883   if (LD.getNode())
22884     return LD;
22885
22886   if (isTargetShuffle(N->getOpcode())) {
22887     SDValue Shuffle =
22888         PerformTargetShuffleCombine(SDValue(N, 0), DAG, DCI, Subtarget);
22889     if (Shuffle.getNode())
22890       return Shuffle;
22891
22892     // Try recursively combining arbitrary sequences of x86 shuffle
22893     // instructions into higher-order shuffles. We do this after combining
22894     // specific PSHUF instruction sequences into their minimal form so that we
22895     // can evaluate how many specialized shuffle instructions are involved in
22896     // a particular chain.
22897     SmallVector<int, 1> NonceMask; // Just a placeholder.
22898     NonceMask.push_back(0);
22899     if (combineX86ShufflesRecursively(SDValue(N, 0), SDValue(N, 0), NonceMask,
22900                                       /*Depth*/ 1, /*HasPSHUFB*/ false, DAG,
22901                                       DCI, Subtarget))
22902       return SDValue(); // This routine will use CombineTo to replace N.
22903   }
22904
22905   return SDValue();
22906 }
22907
22908 /// PerformTruncateCombine - Converts truncate operation to
22909 /// a sequence of vector shuffle operations.
22910 /// It is possible when we truncate 256-bit vector to 128-bit vector
22911 static SDValue PerformTruncateCombine(SDNode *N, SelectionDAG &DAG,
22912                                       TargetLowering::DAGCombinerInfo &DCI,
22913                                       const X86Subtarget *Subtarget)  {
22914   return SDValue();
22915 }
22916
22917 /// XFormVExtractWithShuffleIntoLoad - Check if a vector extract from a target
22918 /// specific shuffle of a load can be folded into a single element load.
22919 /// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
22920 /// shuffles have been custom lowered so we need to handle those here.
22921 static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
22922                                          TargetLowering::DAGCombinerInfo &DCI) {
22923   if (DCI.isBeforeLegalizeOps())
22924     return SDValue();
22925
22926   SDValue InVec = N->getOperand(0);
22927   SDValue EltNo = N->getOperand(1);
22928
22929   if (!isa<ConstantSDNode>(EltNo))
22930     return SDValue();
22931
22932   EVT OriginalVT = InVec.getValueType();
22933
22934   if (InVec.getOpcode() == ISD::BITCAST) {
22935     // Don't duplicate a load with other uses.
22936     if (!InVec.hasOneUse())
22937       return SDValue();
22938     EVT BCVT = InVec.getOperand(0).getValueType();
22939     if (BCVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
22940       return SDValue();
22941     InVec = InVec.getOperand(0);
22942   }
22943
22944   EVT CurrentVT = InVec.getValueType();
22945
22946   if (!isTargetShuffle(InVec.getOpcode()))
22947     return SDValue();
22948
22949   // Don't duplicate a load with other uses.
22950   if (!InVec.hasOneUse())
22951     return SDValue();
22952
22953   SmallVector<int, 16> ShuffleMask;
22954   bool UnaryShuffle;
22955   if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(),
22956                             ShuffleMask, UnaryShuffle))
22957     return SDValue();
22958
22959   // Select the input vector, guarding against out of range extract vector.
22960   unsigned NumElems = CurrentVT.getVectorNumElements();
22961   int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
22962   int Idx = (Elt > (int)NumElems) ? -1 : ShuffleMask[Elt];
22963   SDValue LdNode = (Idx < (int)NumElems) ? InVec.getOperand(0)
22964                                          : InVec.getOperand(1);
22965
22966   // If inputs to shuffle are the same for both ops, then allow 2 uses
22967   unsigned AllowedUses = InVec.getNumOperands() > 1 &&
22968                          InVec.getOperand(0) == InVec.getOperand(1) ? 2 : 1;
22969
22970   if (LdNode.getOpcode() == ISD::BITCAST) {
22971     // Don't duplicate a load with other uses.
22972     if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
22973       return SDValue();
22974
22975     AllowedUses = 1; // only allow 1 load use if we have a bitcast
22976     LdNode = LdNode.getOperand(0);
22977   }
22978
22979   if (!ISD::isNormalLoad(LdNode.getNode()))
22980     return SDValue();
22981
22982   LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);
22983
22984   if (!LN0 ||!LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
22985     return SDValue();
22986
22987   EVT EltVT = N->getValueType(0);
22988   // If there's a bitcast before the shuffle, check if the load type and
22989   // alignment is valid.
22990   unsigned Align = LN0->getAlignment();
22991   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
22992   unsigned NewAlign = TLI.getDataLayout()->getABITypeAlignment(
22993       EltVT.getTypeForEVT(*DAG.getContext()));
22994
22995   if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT))
22996     return SDValue();
22997
22998   // All checks match so transform back to vector_shuffle so that DAG combiner
22999   // can finish the job
23000   SDLoc dl(N);
23001
23002   // Create shuffle node taking into account the case that its a unary shuffle
23003   SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT)
23004                                    : InVec.getOperand(1);
23005   Shuffle = DAG.getVectorShuffle(CurrentVT, dl,
23006                                  InVec.getOperand(0), Shuffle,
23007                                  &ShuffleMask[0]);
23008   Shuffle = DAG.getNode(ISD::BITCAST, dl, OriginalVT, Shuffle);
23009   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
23010                      EltNo);
23011 }
23012
23013 /// \brief Detect bitcasts between i32 to x86mmx low word. Since MMX types are
23014 /// special and don't usually play with other vector types, it's better to
23015 /// handle them early to be sure we emit efficient code by avoiding
23016 /// store-load conversions.
23017 static SDValue PerformBITCASTCombine(SDNode *N, SelectionDAG &DAG) {
23018   if (N->getValueType(0) != MVT::x86mmx ||
23019       N->getOperand(0)->getOpcode() != ISD::BUILD_VECTOR ||
23020       N->getOperand(0)->getValueType(0) != MVT::v2i32)
23021     return SDValue();
23022
23023   SDValue V = N->getOperand(0);
23024   ConstantSDNode *C = dyn_cast<ConstantSDNode>(V.getOperand(1));
23025   if (C && C->getZExtValue() == 0 && V.getOperand(0).getValueType() == MVT::i32)
23026     return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(V.getOperand(0)),
23027                        N->getValueType(0), V.getOperand(0));
23028
23029   return SDValue();
23030 }
23031
23032 /// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index
23033 /// generation and convert it from being a bunch of shuffles and extracts
23034 /// into a somewhat faster sequence. For i686, the best sequence is apparently
23035 /// storing the value and loading scalars back, while for x64 we should
23036 /// use 64-bit extracts and shifts.
23037 static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
23038                                          TargetLowering::DAGCombinerInfo &DCI) {
23039   SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI);
23040   if (NewOp.getNode())
23041     return NewOp;
23042
23043   SDValue InputVector = N->getOperand(0);
23044
23045   // Detect mmx to i32 conversion through a v2i32 elt extract.
23046   if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
23047       N->getValueType(0) == MVT::i32 &&
23048       InputVector.getValueType() == MVT::v2i32) {
23049
23050     // The bitcast source is a direct mmx result.
23051     SDValue MMXSrc = InputVector.getNode()->getOperand(0);
23052     if (MMXSrc.getValueType() == MVT::x86mmx)
23053       return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector),
23054                          N->getValueType(0),
23055                          InputVector.getNode()->getOperand(0));
23056
23057     // The mmx is indirect: (i64 extract_elt (v1i64 bitcast (x86mmx ...))).
23058     SDValue MMXSrcOp = MMXSrc.getOperand(0);
23059     if (MMXSrc.getOpcode() == ISD::EXTRACT_VECTOR_ELT && MMXSrc.hasOneUse() &&
23060         MMXSrc.getValueType() == MVT::i64 && MMXSrcOp.hasOneUse() &&
23061         MMXSrcOp.getOpcode() == ISD::BITCAST &&
23062         MMXSrcOp.getValueType() == MVT::v1i64 &&
23063         MMXSrcOp.getOperand(0).getValueType() == MVT::x86mmx)
23064       return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector),
23065                          N->getValueType(0),
23066                          MMXSrcOp.getOperand(0));
23067   }
23068
23069   // Only operate on vectors of 4 elements, where the alternative shuffling
23070   // gets to be more expensive.
23071   if (InputVector.getValueType() != MVT::v4i32)
23072     return SDValue();
23073
23074   // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
23075   // single use which is a sign-extend or zero-extend, and all elements are
23076   // used.
23077   SmallVector<SDNode *, 4> Uses;
23078   unsigned ExtractedElements = 0;
23079   for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
23080        UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
23081     if (UI.getUse().getResNo() != InputVector.getResNo())
23082       return SDValue();
23083
23084     SDNode *Extract = *UI;
23085     if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
23086       return SDValue();
23087
23088     if (Extract->getValueType(0) != MVT::i32)
23089       return SDValue();
23090     if (!Extract->hasOneUse())
23091       return SDValue();
23092     if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
23093         Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
23094       return SDValue();
23095     if (!isa<ConstantSDNode>(Extract->getOperand(1)))
23096       return SDValue();
23097
23098     // Record which element was extracted.
23099     ExtractedElements |=
23100       1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue();
23101
23102     Uses.push_back(Extract);
23103   }
23104
23105   // If not all the elements were used, this may not be worthwhile.
23106   if (ExtractedElements != 15)
23107     return SDValue();
23108
23109   // Ok, we've now decided to do the transformation.
23110   // If 64-bit shifts are legal, use the extract-shift sequence,
23111   // otherwise bounce the vector off the cache.
23112   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23113   SDValue Vals[4];
23114   SDLoc dl(InputVector);
23115
23116   if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) {
23117     SDValue Cst = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, InputVector);
23118     EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy();
23119     SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
23120       DAG.getConstant(0, VecIdxTy));
23121     SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
23122       DAG.getConstant(1, VecIdxTy));
23123
23124     SDValue ShAmt = DAG.getConstant(32,
23125       DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64));
23126     Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf);
23127     Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
23128       DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt));
23129     Vals[2] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, TopHalf);
23130     Vals[3] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
23131       DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt));
23132   } else {
23133     // Store the value to a temporary stack slot.
23134     SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
23135     SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
23136       MachinePointerInfo(), false, false, 0);
23137
23138     EVT ElementType = InputVector.getValueType().getVectorElementType();
23139     unsigned EltSize = ElementType.getSizeInBits() / 8;
23140
23141     // Replace each use (extract) with a load of the appropriate element.
23142     for (unsigned i = 0; i < 4; ++i) {
23143       uint64_t Offset = EltSize * i;
23144       SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy());
23145
23146       SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(),
23147                                        StackPtr, OffsetVal);
23148
23149       // Load the scalar.
23150       Vals[i] = DAG.getLoad(ElementType, dl, Ch,
23151                             ScalarAddr, MachinePointerInfo(),
23152                             false, false, false, 0);
23153
23154     }
23155   }
23156
23157   // Replace the extracts
23158   for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
23159     UE = Uses.end(); UI != UE; ++UI) {
23160     SDNode *Extract = *UI;
23161
23162     SDValue Idx = Extract->getOperand(1);
23163     uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
23164     DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]);
23165   }
23166
23167   // The replacement was made in place; don't return anything.
23168   return SDValue();
23169 }
23170
23171 /// \brief Matches a VSELECT onto min/max or return 0 if the node doesn't match.
23172 static std::pair<unsigned, bool>
23173 matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, SDValue RHS,
23174                    SelectionDAG &DAG, const X86Subtarget *Subtarget) {
23175   if (!VT.isVector())
23176     return std::make_pair(0, false);
23177
23178   bool NeedSplit = false;
23179   switch (VT.getSimpleVT().SimpleTy) {
23180   default: return std::make_pair(0, false);
23181   case MVT::v4i64:
23182   case MVT::v2i64:
23183     if (!Subtarget->hasVLX())
23184       return std::make_pair(0, false);
23185     break;
23186   case MVT::v64i8:
23187   case MVT::v32i16:
23188     if (!Subtarget->hasBWI())
23189       return std::make_pair(0, false);
23190     break;
23191   case MVT::v16i32:
23192   case MVT::v8i64:
23193     if (!Subtarget->hasAVX512())
23194       return std::make_pair(0, false);
23195     break;
23196   case MVT::v32i8:
23197   case MVT::v16i16:
23198   case MVT::v8i32:
23199     if (!Subtarget->hasAVX2())
23200       NeedSplit = true;
23201     if (!Subtarget->hasAVX())
23202       return std::make_pair(0, false);
23203     break;
23204   case MVT::v16i8:
23205   case MVT::v8i16:
23206   case MVT::v4i32:
23207     if (!Subtarget->hasSSE2())
23208       return std::make_pair(0, false);
23209   }
23210
23211   // SSE2 has only a small subset of the operations.
23212   bool hasUnsigned = Subtarget->hasSSE41() ||
23213                      (Subtarget->hasSSE2() && VT == MVT::v16i8);
23214   bool hasSigned = Subtarget->hasSSE41() ||
23215                    (Subtarget->hasSSE2() && VT == MVT::v8i16);
23216
23217   ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
23218
23219   unsigned Opc = 0;
23220   // Check for x CC y ? x : y.
23221   if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
23222       DAG.isEqualTo(RHS, Cond.getOperand(1))) {
23223     switch (CC) {
23224     default: break;
23225     case ISD::SETULT:
23226     case ISD::SETULE:
23227       Opc = hasUnsigned ? X86ISD::UMIN : 0; break;
23228     case ISD::SETUGT:
23229     case ISD::SETUGE:
23230       Opc = hasUnsigned ? X86ISD::UMAX : 0; break;
23231     case ISD::SETLT:
23232     case ISD::SETLE:
23233       Opc = hasSigned ? X86ISD::SMIN : 0; break;
23234     case ISD::SETGT:
23235     case ISD::SETGE:
23236       Opc = hasSigned ? X86ISD::SMAX : 0; break;
23237     }
23238   // Check for x CC y ? y : x -- a min/max with reversed arms.
23239   } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
23240              DAG.isEqualTo(RHS, Cond.getOperand(0))) {
23241     switch (CC) {
23242     default: break;
23243     case ISD::SETULT:
23244     case ISD::SETULE:
23245       Opc = hasUnsigned ? X86ISD::UMAX : 0; break;
23246     case ISD::SETUGT:
23247     case ISD::SETUGE:
23248       Opc = hasUnsigned ? X86ISD::UMIN : 0; break;
23249     case ISD::SETLT:
23250     case ISD::SETLE:
23251       Opc = hasSigned ? X86ISD::SMAX : 0; break;
23252     case ISD::SETGT:
23253     case ISD::SETGE:
23254       Opc = hasSigned ? X86ISD::SMIN : 0; break;
23255     }
23256   }
23257
23258   return std::make_pair(Opc, NeedSplit);
23259 }
23260
23261 static SDValue
23262 transformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG,
23263                                       const X86Subtarget *Subtarget) {
23264   SDLoc dl(N);
23265   SDValue Cond = N->getOperand(0);
23266   SDValue LHS = N->getOperand(1);
23267   SDValue RHS = N->getOperand(2);
23268
23269   if (Cond.getOpcode() == ISD::SIGN_EXTEND) {
23270     SDValue CondSrc = Cond->getOperand(0);
23271     if (CondSrc->getOpcode() == ISD::SIGN_EXTEND_INREG)
23272       Cond = CondSrc->getOperand(0);
23273   }
23274
23275   if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
23276     return SDValue();
23277
23278   // A vselect where all conditions and data are constants can be optimized into
23279   // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
23280   if (ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
23281       ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
23282     return SDValue();
23283
23284   unsigned MaskValue = 0;
23285   if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue))
23286     return SDValue();
23287
23288   MVT VT = N->getSimpleValueType(0);
23289   unsigned NumElems = VT.getVectorNumElements();
23290   SmallVector<int, 8> ShuffleMask(NumElems, -1);
23291   for (unsigned i = 0; i < NumElems; ++i) {
23292     // Be sure we emit undef where we can.
23293     if (Cond.getOperand(i)->getOpcode() == ISD::UNDEF)
23294       ShuffleMask[i] = -1;
23295     else
23296       ShuffleMask[i] = i + NumElems * ((MaskValue >> i) & 1);
23297   }
23298
23299   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23300   if (!TLI.isShuffleMaskLegal(ShuffleMask, VT))
23301     return SDValue();
23302   return DAG.getVectorShuffle(VT, dl, LHS, RHS, &ShuffleMask[0]);
23303 }
23304
23305 /// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT
23306 /// nodes.
23307 static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
23308                                     TargetLowering::DAGCombinerInfo &DCI,
23309                                     const X86Subtarget *Subtarget) {
23310   SDLoc DL(N);
23311   SDValue Cond = N->getOperand(0);
23312   // Get the LHS/RHS of the select.
23313   SDValue LHS = N->getOperand(1);
23314   SDValue RHS = N->getOperand(2);
23315   EVT VT = LHS.getValueType();
23316   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23317
23318   // If we have SSE[12] support, try to form min/max nodes. SSE min/max
23319   // instructions match the semantics of the common C idiom x<y?x:y but not
23320   // x<=y?x:y, because of how they handle negative zero (which can be
23321   // ignored in unsafe-math mode).
23322   // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
23323   if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
23324       VT != MVT::f80 && (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
23325       (Subtarget->hasSSE2() ||
23326        (Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) {
23327     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
23328
23329     unsigned Opcode = 0;
23330     // Check for x CC y ? x : y.
23331     if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
23332         DAG.isEqualTo(RHS, Cond.getOperand(1))) {
23333       switch (CC) {
23334       default: break;
23335       case ISD::SETULT:
23336         // Converting this to a min would handle NaNs incorrectly, and swapping
23337         // the operands would cause it to handle comparisons between positive
23338         // and negative zero incorrectly.
23339         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
23340           if (!DAG.getTarget().Options.UnsafeFPMath &&
23341               !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
23342             break;
23343           std::swap(LHS, RHS);
23344         }
23345         Opcode = X86ISD::FMIN;
23346         break;
23347       case ISD::SETOLE:
23348         // Converting this to a min would handle comparisons between positive
23349         // and negative zero incorrectly.
23350         if (!DAG.getTarget().Options.UnsafeFPMath &&
23351             !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
23352           break;
23353         Opcode = X86ISD::FMIN;
23354         break;
23355       case ISD::SETULE:
23356         // Converting this to a min would handle both negative zeros and NaNs
23357         // incorrectly, but we can swap the operands to fix both.
23358         std::swap(LHS, RHS);
23359       case ISD::SETOLT:
23360       case ISD::SETLT:
23361       case ISD::SETLE:
23362         Opcode = X86ISD::FMIN;
23363         break;
23364
23365       case ISD::SETOGE:
23366         // Converting this to a max would handle comparisons between positive
23367         // and negative zero incorrectly.
23368         if (!DAG.getTarget().Options.UnsafeFPMath &&
23369             !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
23370           break;
23371         Opcode = X86ISD::FMAX;
23372         break;
23373       case ISD::SETUGT:
23374         // Converting this to a max would handle NaNs incorrectly, and swapping
23375         // the operands would cause it to handle comparisons between positive
23376         // and negative zero incorrectly.
23377         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
23378           if (!DAG.getTarget().Options.UnsafeFPMath &&
23379               !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
23380             break;
23381           std::swap(LHS, RHS);
23382         }
23383         Opcode = X86ISD::FMAX;
23384         break;
23385       case ISD::SETUGE:
23386         // Converting this to a max would handle both negative zeros and NaNs
23387         // incorrectly, but we can swap the operands to fix both.
23388         std::swap(LHS, RHS);
23389       case ISD::SETOGT:
23390       case ISD::SETGT:
23391       case ISD::SETGE:
23392         Opcode = X86ISD::FMAX;
23393         break;
23394       }
23395     // Check for x CC y ? y : x -- a min/max with reversed arms.
23396     } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
23397                DAG.isEqualTo(RHS, Cond.getOperand(0))) {
23398       switch (CC) {
23399       default: break;
23400       case ISD::SETOGE:
23401         // Converting this to a min would handle comparisons between positive
23402         // and negative zero incorrectly, and swapping the operands would
23403         // cause it to handle NaNs incorrectly.
23404         if (!DAG.getTarget().Options.UnsafeFPMath &&
23405             !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
23406           if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
23407             break;
23408           std::swap(LHS, RHS);
23409         }
23410         Opcode = X86ISD::FMIN;
23411         break;
23412       case ISD::SETUGT:
23413         // Converting this to a min would handle NaNs incorrectly.
23414         if (!DAG.getTarget().Options.UnsafeFPMath &&
23415             (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
23416           break;
23417         Opcode = X86ISD::FMIN;
23418         break;
23419       case ISD::SETUGE:
23420         // Converting this to a min would handle both negative zeros and NaNs
23421         // incorrectly, but we can swap the operands to fix both.
23422         std::swap(LHS, RHS);
23423       case ISD::SETOGT:
23424       case ISD::SETGT:
23425       case ISD::SETGE:
23426         Opcode = X86ISD::FMIN;
23427         break;
23428
23429       case ISD::SETULT:
23430         // Converting this to a max would handle NaNs incorrectly.
23431         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
23432           break;
23433         Opcode = X86ISD::FMAX;
23434         break;
23435       case ISD::SETOLE:
23436         // Converting this to a max would handle comparisons between positive
23437         // and negative zero incorrectly, and swapping the operands would
23438         // cause it to handle NaNs incorrectly.
23439         if (!DAG.getTarget().Options.UnsafeFPMath &&
23440             !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
23441           if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
23442             break;
23443           std::swap(LHS, RHS);
23444         }
23445         Opcode = X86ISD::FMAX;
23446         break;
23447       case ISD::SETULE:
23448         // Converting this to a max would handle both negative zeros and NaNs
23449         // incorrectly, but we can swap the operands to fix both.
23450         std::swap(LHS, RHS);
23451       case ISD::SETOLT:
23452       case ISD::SETLT:
23453       case ISD::SETLE:
23454         Opcode = X86ISD::FMAX;
23455         break;
23456       }
23457     }
23458
23459     if (Opcode)
23460       return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
23461   }
23462
23463   EVT CondVT = Cond.getValueType();
23464   if (Subtarget->hasAVX512() && VT.isVector() && CondVT.isVector() &&
23465       CondVT.getVectorElementType() == MVT::i1) {
23466     // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
23467     // lowering on KNL. In this case we convert it to
23468     // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
23469     // The same situation for all 128 and 256-bit vectors of i8 and i16.
23470     // Since SKX these selects have a proper lowering.
23471     EVT OpVT = LHS.getValueType();
23472     if ((OpVT.is128BitVector() || OpVT.is256BitVector()) &&
23473         (OpVT.getVectorElementType() == MVT::i8 ||
23474          OpVT.getVectorElementType() == MVT::i16) &&
23475         !(Subtarget->hasBWI() && Subtarget->hasVLX())) {
23476       Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, OpVT, Cond);
23477       DCI.AddToWorklist(Cond.getNode());
23478       return DAG.getNode(N->getOpcode(), DL, OpVT, Cond, LHS, RHS);
23479     }
23480   }
23481   // If this is a select between two integer constants, try to do some
23482   // optimizations.
23483   if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) {
23484     if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS))
23485       // Don't do this for crazy integer types.
23486       if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) {
23487         // If this is efficiently invertible, canonicalize the LHSC/RHSC values
23488         // so that TrueC (the true value) is larger than FalseC.
23489         bool NeedsCondInvert = false;
23490
23491         if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
23492             // Efficiently invertible.
23493             (Cond.getOpcode() == ISD::SETCC ||  // setcc -> invertible.
23494              (Cond.getOpcode() == ISD::XOR &&   // xor(X, C) -> invertible.
23495               isa<ConstantSDNode>(Cond.getOperand(1))))) {
23496           NeedsCondInvert = true;
23497           std::swap(TrueC, FalseC);
23498         }
23499
23500         // Optimize C ? 8 : 0 -> zext(C) << 3.  Likewise for any pow2/0.
23501         if (FalseC->getAPIntValue() == 0 &&
23502             TrueC->getAPIntValue().isPowerOf2()) {
23503           if (NeedsCondInvert) // Invert the condition if needed.
23504             Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
23505                                DAG.getConstant(1, Cond.getValueType()));
23506
23507           // Zero extend the condition if needed.
23508           Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);
23509
23510           unsigned ShAmt = TrueC->getAPIntValue().logBase2();
23511           return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
23512                              DAG.getConstant(ShAmt, MVT::i8));
23513         }
23514
23515         // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst.
23516         if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
23517           if (NeedsCondInvert) // Invert the condition if needed.
23518             Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
23519                                DAG.getConstant(1, Cond.getValueType()));
23520
23521           // Zero extend the condition if needed.
23522           Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
23523                              FalseC->getValueType(0), Cond);
23524           return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
23525                              SDValue(FalseC, 0));
23526         }
23527
23528         // Optimize cases that will turn into an LEA instruction.  This requires
23529         // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
23530         if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
23531           uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
23532           if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
23533
23534           bool isFastMultiplier = false;
23535           if (Diff < 10) {
23536             switch ((unsigned char)Diff) {
23537               default: break;
23538               case 1:  // result = add base, cond
23539               case 2:  // result = lea base(    , cond*2)
23540               case 3:  // result = lea base(cond, cond*2)
23541               case 4:  // result = lea base(    , cond*4)
23542               case 5:  // result = lea base(cond, cond*4)
23543               case 8:  // result = lea base(    , cond*8)
23544               case 9:  // result = lea base(cond, cond*8)
23545                 isFastMultiplier = true;
23546                 break;
23547             }
23548           }
23549
23550           if (isFastMultiplier) {
23551             APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
23552             if (NeedsCondInvert) // Invert the condition if needed.
23553               Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
23554                                  DAG.getConstant(1, Cond.getValueType()));
23555
23556             // Zero extend the condition if needed.
23557             Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
23558                                Cond);
23559             // Scale the condition by the difference.
23560             if (Diff != 1)
23561               Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
23562                                  DAG.getConstant(Diff, Cond.getValueType()));
23563
23564             // Add the base if non-zero.
23565             if (FalseC->getAPIntValue() != 0)
23566               Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
23567                                  SDValue(FalseC, 0));
23568             return Cond;
23569           }
23570         }
23571       }
23572   }
23573
23574   // Canonicalize max and min:
23575   // (x > y) ? x : y -> (x >= y) ? x : y
23576   // (x < y) ? x : y -> (x <= y) ? x : y
23577   // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
23578   // the need for an extra compare
23579   // against zero. e.g.
23580   // (x - y) > 0 : (x - y) ? 0 -> (x - y) >= 0 : (x - y) ? 0
23581   // subl   %esi, %edi
23582   // testl  %edi, %edi
23583   // movl   $0, %eax
23584   // cmovgl %edi, %eax
23585   // =>
23586   // xorl   %eax, %eax
23587   // subl   %esi, $edi
23588   // cmovsl %eax, %edi
23589   if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
23590       DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
23591       DAG.isEqualTo(RHS, Cond.getOperand(1))) {
23592     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
23593     switch (CC) {
23594     default: break;
23595     case ISD::SETLT:
23596     case ISD::SETGT: {
23597       ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
23598       Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
23599                           Cond.getOperand(0), Cond.getOperand(1), NewCC);
23600       return DAG.getNode(ISD::SELECT, DL, VT, Cond, LHS, RHS);
23601     }
23602     }
23603   }
23604
23605   // Early exit check
23606   if (!TLI.isTypeLegal(VT))
23607     return SDValue();
23608
23609   // Match VSELECTs into subs with unsigned saturation.
23610   if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
23611       // psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
23612       ((Subtarget->hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
23613        (Subtarget->hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
23614     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
23615
23616     // Check if one of the arms of the VSELECT is a zero vector. If it's on the
23617     // left side invert the predicate to simplify logic below.
23618     SDValue Other;
23619     if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
23620       Other = RHS;
23621       CC = ISD::getSetCCInverse(CC, true);
23622     } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
23623       Other = LHS;
23624     }
23625
23626     if (Other.getNode() && Other->getNumOperands() == 2 &&
23627         DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) {
23628       SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
23629       SDValue CondRHS = Cond->getOperand(1);
23630
23631       // Look for a general sub with unsigned saturation first.
23632       // x >= y ? x-y : 0 --> subus x, y
23633       // x >  y ? x-y : 0 --> subus x, y
23634       if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
23635           Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))
23636         return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
23637
23638       if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS))
23639         if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) {
23640           if (auto *CondRHSBV = dyn_cast<BuildVectorSDNode>(CondRHS))
23641             if (auto *CondRHSConst = CondRHSBV->getConstantSplatNode())
23642               // If the RHS is a constant we have to reverse the const
23643               // canonicalization.
23644               // x > C-1 ? x+-C : 0 --> subus x, C
23645               if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
23646                   CondRHSConst->getAPIntValue() ==
23647                       (-OpRHSConst->getAPIntValue() - 1))
23648                 return DAG.getNode(
23649                     X86ISD::SUBUS, DL, VT, OpLHS,
23650                     DAG.getConstant(-OpRHSConst->getAPIntValue(), VT));
23651
23652           // Another special case: If C was a sign bit, the sub has been
23653           // canonicalized into a xor.
23654           // FIXME: Would it be better to use computeKnownBits to determine
23655           //        whether it's safe to decanonicalize the xor?
23656           // x s< 0 ? x^C : 0 --> subus x, C
23657           if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
23658               ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
23659               OpRHSConst->getAPIntValue().isSignBit())
23660             // Note that we have to rebuild the RHS constant here to ensure we
23661             // don't rely on particular values of undef lanes.
23662             return DAG.getNode(
23663                 X86ISD::SUBUS, DL, VT, OpLHS,
23664                 DAG.getConstant(OpRHSConst->getAPIntValue(), VT));
23665         }
23666     }
23667   }
23668
23669   // Try to match a min/max vector operation.
23670   if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC) {
23671     std::pair<unsigned, bool> ret = matchIntegerMINMAX(Cond, VT, LHS, RHS, DAG, Subtarget);
23672     unsigned Opc = ret.first;
23673     bool NeedSplit = ret.second;
23674
23675     if (Opc && NeedSplit) {
23676       unsigned NumElems = VT.getVectorNumElements();
23677       // Extract the LHS vectors
23678       SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, DL);
23679       SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, DL);
23680
23681       // Extract the RHS vectors
23682       SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, DL);
23683       SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, DL);
23684
23685       // Create min/max for each subvector
23686       LHS = DAG.getNode(Opc, DL, LHS1.getValueType(), LHS1, RHS1);
23687       RHS = DAG.getNode(Opc, DL, LHS2.getValueType(), LHS2, RHS2);
23688
23689       // Merge the result
23690       return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LHS, RHS);
23691     } else if (Opc)
23692       return DAG.getNode(Opc, DL, VT, LHS, RHS);
23693   }
23694
23695   // Simplify vector selection if condition value type matches vselect
23696   // operand type
23697   if (N->getOpcode() == ISD::VSELECT && CondVT == VT) {
23698     assert(Cond.getValueType().isVector() &&
23699            "vector select expects a vector selector!");
23700
23701     bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
23702     bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
23703
23704     // Try invert the condition if true value is not all 1s and false value
23705     // is not all 0s.
23706     if (!TValIsAllOnes && !FValIsAllZeros &&
23707         // Check if the selector will be produced by CMPP*/PCMP*
23708         Cond.getOpcode() == ISD::SETCC &&
23709         // Check if SETCC has already been promoted
23710         TLI.getSetCCResultType(*DAG.getContext(), VT) == CondVT) {
23711       bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
23712       bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
23713
23714       if (TValIsAllZeros || FValIsAllOnes) {
23715         SDValue CC = Cond.getOperand(2);
23716         ISD::CondCode NewCC =
23717           ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
23718                                Cond.getOperand(0).getValueType().isInteger());
23719         Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1), NewCC);
23720         std::swap(LHS, RHS);
23721         TValIsAllOnes = FValIsAllOnes;
23722         FValIsAllZeros = TValIsAllZeros;
23723       }
23724     }
23725
23726     if (TValIsAllOnes || FValIsAllZeros) {
23727       SDValue Ret;
23728
23729       if (TValIsAllOnes && FValIsAllZeros)
23730         Ret = Cond;
23731       else if (TValIsAllOnes)
23732         Ret = DAG.getNode(ISD::OR, DL, CondVT, Cond,
23733                           DAG.getNode(ISD::BITCAST, DL, CondVT, RHS));
23734       else if (FValIsAllZeros)
23735         Ret = DAG.getNode(ISD::AND, DL, CondVT, Cond,
23736                           DAG.getNode(ISD::BITCAST, DL, CondVT, LHS));
23737
23738       return DAG.getNode(ISD::BITCAST, DL, VT, Ret);
23739     }
23740   }
23741
23742   // If we know that this node is legal then we know that it is going to be
23743   // matched by one of the SSE/AVX BLEND instructions. These instructions only
23744   // depend on the highest bit in each word. Try to use SimplifyDemandedBits
23745   // to simplify previous instructions.
23746   if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
23747       !DCI.isBeforeLegalize() &&
23748       // We explicitly check against v8i16 and v16i16 because, although
23749       // they're marked as Custom, they might only be legal when Cond is a
23750       // build_vector of constants. This will be taken care in a later
23751       // condition.
23752       (TLI.isOperationLegalOrCustom(ISD::VSELECT, VT) && VT != MVT::v16i16 &&
23753        VT != MVT::v8i16) &&
23754       // Don't optimize vector of constants. Those are handled by
23755       // the generic code and all the bits must be properly set for
23756       // the generic optimizer.
23757       !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
23758     unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits();
23759
23760     // Don't optimize vector selects that map to mask-registers.
23761     if (BitWidth == 1)
23762       return SDValue();
23763
23764     assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
23765     APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1);
23766
23767     APInt KnownZero, KnownOne;
23768     TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
23769                                           DCI.isBeforeLegalizeOps());
23770     if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) ||
23771         TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne,
23772                                  TLO)) {
23773       // If we changed the computation somewhere in the DAG, this change
23774       // will affect all users of Cond.
23775       // Make sure it is fine and update all the nodes so that we do not
23776       // use the generic VSELECT anymore. Otherwise, we may perform
23777       // wrong optimizations as we messed up with the actual expectation
23778       // for the vector boolean values.
23779       if (Cond != TLO.Old) {
23780         // Check all uses of that condition operand to check whether it will be
23781         // consumed by non-BLEND instructions, which may depend on all bits are
23782         // set properly.
23783         for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
23784              I != E; ++I)
23785           if (I->getOpcode() != ISD::VSELECT)
23786             // TODO: Add other opcodes eventually lowered into BLEND.
23787             return SDValue();
23788
23789         // Update all the users of the condition, before committing the change,
23790         // so that the VSELECT optimizations that expect the correct vector
23791         // boolean value will not be triggered.
23792         for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
23793              I != E; ++I)
23794           DAG.ReplaceAllUsesOfValueWith(
23795               SDValue(*I, 0),
23796               DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(*I), I->getValueType(0),
23797                           Cond, I->getOperand(1), I->getOperand(2)));
23798         DCI.CommitTargetLoweringOpt(TLO);
23799         return SDValue();
23800       }
23801       // At this point, only Cond is changed. Change the condition
23802       // just for N to keep the opportunity to optimize all other
23803       // users their own way.
23804       DAG.ReplaceAllUsesOfValueWith(
23805           SDValue(N, 0),
23806           DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(N), N->getValueType(0),
23807                       TLO.New, N->getOperand(1), N->getOperand(2)));
23808       return SDValue();
23809     }
23810   }
23811
23812   // We should generate an X86ISD::BLENDI from a vselect if its argument
23813   // is a sign_extend_inreg of an any_extend of a BUILD_VECTOR of
23814   // constants. This specific pattern gets generated when we split a
23815   // selector for a 512 bit vector in a machine without AVX512 (but with
23816   // 256-bit vectors), during legalization:
23817   //
23818   // (vselect (sign_extend (any_extend (BUILD_VECTOR)) i1) LHS RHS)
23819   //
23820   // Iff we find this pattern and the build_vectors are built from
23821   // constants, we translate the vselect into a shuffle_vector that we
23822   // know will be matched by LowerVECTOR_SHUFFLEtoBlend.
23823   if ((N->getOpcode() == ISD::VSELECT ||
23824        N->getOpcode() == X86ISD::SHRUNKBLEND) &&
23825       !DCI.isBeforeLegalize()) {
23826     SDValue Shuffle = transformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget);
23827     if (Shuffle.getNode())
23828       return Shuffle;
23829   }
23830
23831   return SDValue();
23832 }
23833
23834 // Check whether a boolean test is testing a boolean value generated by
23835 // X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
23836 // code.
23837 //
23838 // Simplify the following patterns:
23839 // (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
23840 // (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
23841 // to (Op EFLAGS Cond)
23842 //
23843 // (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
23844 // (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
23845 // to (Op EFLAGS !Cond)
23846 //
23847 // where Op could be BRCOND or CMOV.
23848 //
23849 static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
23850   // Quit if not CMP and SUB with its value result used.
23851   if (Cmp.getOpcode() != X86ISD::CMP &&
23852       (Cmp.getOpcode() != X86ISD::SUB || Cmp.getNode()->hasAnyUseOfValue(0)))
23853       return SDValue();
23854
23855   // Quit if not used as a boolean value.
23856   if (CC != X86::COND_E && CC != X86::COND_NE)
23857     return SDValue();
23858
23859   // Check CMP operands. One of them should be 0 or 1 and the other should be
23860   // an SetCC or extended from it.
23861   SDValue Op1 = Cmp.getOperand(0);
23862   SDValue Op2 = Cmp.getOperand(1);
23863
23864   SDValue SetCC;
23865   const ConstantSDNode* C = nullptr;
23866   bool needOppositeCond = (CC == X86::COND_E);
23867   bool checkAgainstTrue = false; // Is it a comparison against 1?
23868
23869   if ((C = dyn_cast<ConstantSDNode>(Op1)))
23870     SetCC = Op2;
23871   else if ((C = dyn_cast<ConstantSDNode>(Op2)))
23872     SetCC = Op1;
23873   else // Quit if all operands are not constants.
23874     return SDValue();
23875
23876   if (C->getZExtValue() == 1) {
23877     needOppositeCond = !needOppositeCond;
23878     checkAgainstTrue = true;
23879   } else if (C->getZExtValue() != 0)
23880     // Quit if the constant is neither 0 or 1.
23881     return SDValue();
23882
23883   bool truncatedToBoolWithAnd = false;
23884   // Skip (zext $x), (trunc $x), or (and $x, 1) node.
23885   while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
23886          SetCC.getOpcode() == ISD::TRUNCATE ||
23887          SetCC.getOpcode() == ISD::AND) {
23888     if (SetCC.getOpcode() == ISD::AND) {
23889       int OpIdx = -1;
23890       ConstantSDNode *CS;
23891       if ((CS = dyn_cast<ConstantSDNode>(SetCC.getOperand(0))) &&
23892           CS->getZExtValue() == 1)
23893         OpIdx = 1;
23894       if ((CS = dyn_cast<ConstantSDNode>(SetCC.getOperand(1))) &&
23895           CS->getZExtValue() == 1)
23896         OpIdx = 0;
23897       if (OpIdx == -1)
23898         break;
23899       SetCC = SetCC.getOperand(OpIdx);
23900       truncatedToBoolWithAnd = true;
23901     } else
23902       SetCC = SetCC.getOperand(0);
23903   }
23904
23905   switch (SetCC.getOpcode()) {
23906   case X86ISD::SETCC_CARRY:
23907     // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
23908     // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
23909     // i.e. it's a comparison against true but the result of SETCC_CARRY is not
23910     // truncated to i1 using 'and'.
23911     if (checkAgainstTrue && !truncatedToBoolWithAnd)
23912       break;
23913     assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
23914            "Invalid use of SETCC_CARRY!");
23915     // FALL THROUGH
23916   case X86ISD::SETCC:
23917     // Set the condition code or opposite one if necessary.
23918     CC = X86::CondCode(SetCC.getConstantOperandVal(0));
23919     if (needOppositeCond)
23920       CC = X86::GetOppositeBranchCondition(CC);
23921     return SetCC.getOperand(1);
23922   case X86ISD::CMOV: {
23923     // Check whether false/true value has canonical one, i.e. 0 or 1.
23924     ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
23925     ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
23926     // Quit if true value is not a constant.
23927     if (!TVal)
23928       return SDValue();
23929     // Quit if false value is not a constant.
23930     if (!FVal) {
23931       SDValue Op = SetCC.getOperand(0);
23932       // Skip 'zext' or 'trunc' node.
23933       if (Op.getOpcode() == ISD::ZERO_EXTEND ||
23934           Op.getOpcode() == ISD::TRUNCATE)
23935         Op = Op.getOperand(0);
23936       // A special case for rdrand/rdseed, where 0 is set if false cond is
23937       // found.
23938       if ((Op.getOpcode() != X86ISD::RDRAND &&
23939            Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
23940         return SDValue();
23941     }
23942     // Quit if false value is not the constant 0 or 1.
23943     bool FValIsFalse = true;
23944     if (FVal && FVal->getZExtValue() != 0) {
23945       if (FVal->getZExtValue() != 1)
23946         return SDValue();
23947       // If FVal is 1, opposite cond is needed.
23948       needOppositeCond = !needOppositeCond;
23949       FValIsFalse = false;
23950     }
23951     // Quit if TVal is not the constant opposite of FVal.
23952     if (FValIsFalse && TVal->getZExtValue() != 1)
23953       return SDValue();
23954     if (!FValIsFalse && TVal->getZExtValue() != 0)
23955       return SDValue();
23956     CC = X86::CondCode(SetCC.getConstantOperandVal(2));
23957     if (needOppositeCond)
23958       CC = X86::GetOppositeBranchCondition(CC);
23959     return SetCC.getOperand(3);
23960   }
23961   }
23962
23963   return SDValue();
23964 }
23965
23966 /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
23967 static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
23968                                   TargetLowering::DAGCombinerInfo &DCI,
23969                                   const X86Subtarget *Subtarget) {
23970   SDLoc DL(N);
23971
23972   // If the flag operand isn't dead, don't touch this CMOV.
23973   if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
23974     return SDValue();
23975
23976   SDValue FalseOp = N->getOperand(0);
23977   SDValue TrueOp = N->getOperand(1);
23978   X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
23979   SDValue Cond = N->getOperand(3);
23980
23981   if (CC == X86::COND_E || CC == X86::COND_NE) {
23982     switch (Cond.getOpcode()) {
23983     default: break;
23984     case X86ISD::BSR:
23985     case X86ISD::BSF:
23986       // If operand of BSR / BSF are proven never zero, then ZF cannot be set.
23987       if (DAG.isKnownNeverZero(Cond.getOperand(0)))
23988         return (CC == X86::COND_E) ? FalseOp : TrueOp;
23989     }
23990   }
23991
23992   SDValue Flags;
23993
23994   Flags = checkBoolTestSetCCCombine(Cond, CC);
23995   if (Flags.getNode() &&
23996       // Extra check as FCMOV only supports a subset of X86 cond.
23997       (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC))) {
23998     SDValue Ops[] = { FalseOp, TrueOp,
23999                       DAG.getConstant(CC, MVT::i8), Flags };
24000     return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
24001   }
24002
24003   // If this is a select between two integer constants, try to do some
24004   // optimizations.  Note that the operands are ordered the opposite of SELECT
24005   // operands.
24006   if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
24007     if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
24008       // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
24009       // larger than FalseC (the false value).
24010       if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
24011         CC = X86::GetOppositeBranchCondition(CC);
24012         std::swap(TrueC, FalseC);
24013         std::swap(TrueOp, FalseOp);
24014       }
24015
24016       // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3.  Likewise for any pow2/0.
24017       // This is efficient for any integer data type (including i8/i16) and
24018       // shift amount.
24019       if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
24020         Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
24021                            DAG.getConstant(CC, MVT::i8), Cond);
24022
24023         // Zero extend the condition if needed.
24024         Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
24025
24026         unsigned ShAmt = TrueC->getAPIntValue().logBase2();
24027         Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
24028                            DAG.getConstant(ShAmt, MVT::i8));
24029         if (N->getNumValues() == 2)  // Dead flag value?
24030           return DCI.CombineTo(N, Cond, SDValue());
24031         return Cond;
24032       }
24033
24034       // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst.  This is efficient
24035       // for any integer data type, including i8/i16.
24036       if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
24037         Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
24038                            DAG.getConstant(CC, MVT::i8), Cond);
24039
24040         // Zero extend the condition if needed.
24041         Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
24042                            FalseC->getValueType(0), Cond);
24043         Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
24044                            SDValue(FalseC, 0));
24045
24046         if (N->getNumValues() == 2)  // Dead flag value?
24047           return DCI.CombineTo(N, Cond, SDValue());
24048         return Cond;
24049       }
24050
24051       // Optimize cases that will turn into an LEA instruction.  This requires
24052       // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
24053       if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
24054         uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
24055         if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
24056
24057         bool isFastMultiplier = false;
24058         if (Diff < 10) {
24059           switch ((unsigned char)Diff) {
24060           default: break;
24061           case 1:  // result = add base, cond
24062           case 2:  // result = lea base(    , cond*2)
24063           case 3:  // result = lea base(cond, cond*2)
24064           case 4:  // result = lea base(    , cond*4)
24065           case 5:  // result = lea base(cond, cond*4)
24066           case 8:  // result = lea base(    , cond*8)
24067           case 9:  // result = lea base(cond, cond*8)
24068             isFastMultiplier = true;
24069             break;
24070           }
24071         }
24072
24073         if (isFastMultiplier) {
24074           APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
24075           Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
24076                              DAG.getConstant(CC, MVT::i8), Cond);
24077           // Zero extend the condition if needed.
24078           Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
24079                              Cond);
24080           // Scale the condition by the difference.
24081           if (Diff != 1)
24082             Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
24083                                DAG.getConstant(Diff, Cond.getValueType()));
24084
24085           // Add the base if non-zero.
24086           if (FalseC->getAPIntValue() != 0)
24087             Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
24088                                SDValue(FalseC, 0));
24089           if (N->getNumValues() == 2)  // Dead flag value?
24090             return DCI.CombineTo(N, Cond, SDValue());
24091           return Cond;
24092         }
24093       }
24094     }
24095   }
24096
24097   // Handle these cases:
24098   //   (select (x != c), e, c) -> select (x != c), e, x),
24099   //   (select (x == c), c, e) -> select (x == c), x, e)
24100   // where the c is an integer constant, and the "select" is the combination
24101   // of CMOV and CMP.
24102   //
24103   // The rationale for this change is that the conditional-move from a constant
24104   // needs two instructions, however, conditional-move from a register needs
24105   // only one instruction.
24106   //
24107   // CAVEAT: By replacing a constant with a symbolic value, it may obscure
24108   //  some instruction-combining opportunities. This opt needs to be
24109   //  postponed as late as possible.
24110   //
24111   if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
24112     // the DCI.xxxx conditions are provided to postpone the optimization as
24113     // late as possible.
24114
24115     ConstantSDNode *CmpAgainst = nullptr;
24116     if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
24117         (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
24118         !isa<ConstantSDNode>(Cond.getOperand(0))) {
24119
24120       if (CC == X86::COND_NE &&
24121           CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
24122         CC = X86::GetOppositeBranchCondition(CC);
24123         std::swap(TrueOp, FalseOp);
24124       }
24125
24126       if (CC == X86::COND_E &&
24127           CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
24128         SDValue Ops[] = { FalseOp, Cond.getOperand(0),
24129                           DAG.getConstant(CC, MVT::i8), Cond };
24130         return DAG.getNode(X86ISD::CMOV, DL, N->getVTList (), Ops);
24131       }
24132     }
24133   }
24134
24135   return SDValue();
24136 }
24137
24138 static SDValue PerformINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG,
24139                                                 const X86Subtarget *Subtarget) {
24140   unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
24141   switch (IntNo) {
24142   default: return SDValue();
24143   // SSE/AVX/AVX2 blend intrinsics.
24144   case Intrinsic::x86_avx2_pblendvb:
24145   case Intrinsic::x86_avx2_pblendw:
24146   case Intrinsic::x86_avx2_pblendd_128:
24147   case Intrinsic::x86_avx2_pblendd_256:
24148     // Don't try to simplify this intrinsic if we don't have AVX2.
24149     if (!Subtarget->hasAVX2())
24150       return SDValue();
24151     // FALL-THROUGH
24152   case Intrinsic::x86_avx_blend_pd_256:
24153   case Intrinsic::x86_avx_blend_ps_256:
24154   case Intrinsic::x86_avx_blendv_pd_256:
24155   case Intrinsic::x86_avx_blendv_ps_256:
24156     // Don't try to simplify this intrinsic if we don't have AVX.
24157     if (!Subtarget->hasAVX())
24158       return SDValue();
24159     // FALL-THROUGH
24160   case Intrinsic::x86_sse41_pblendw:
24161   case Intrinsic::x86_sse41_blendpd:
24162   case Intrinsic::x86_sse41_blendps:
24163   case Intrinsic::x86_sse41_blendvps:
24164   case Intrinsic::x86_sse41_blendvpd:
24165   case Intrinsic::x86_sse41_pblendvb: {
24166     SDValue Op0 = N->getOperand(1);
24167     SDValue Op1 = N->getOperand(2);
24168     SDValue Mask = N->getOperand(3);
24169
24170     // Don't try to simplify this intrinsic if we don't have SSE4.1.
24171     if (!Subtarget->hasSSE41())
24172       return SDValue();
24173
24174     // fold (blend A, A, Mask) -> A
24175     if (Op0 == Op1)
24176       return Op0;
24177     // fold (blend A, B, allZeros) -> A
24178     if (ISD::isBuildVectorAllZeros(Mask.getNode()))
24179       return Op0;
24180     // fold (blend A, B, allOnes) -> B
24181     if (ISD::isBuildVectorAllOnes(Mask.getNode()))
24182       return Op1;
24183
24184     // Simplify the case where the mask is a constant i32 value.
24185     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Mask)) {
24186       if (C->isNullValue())
24187         return Op0;
24188       if (C->isAllOnesValue())
24189         return Op1;
24190     }
24191
24192     return SDValue();
24193   }
24194
24195   // Packed SSE2/AVX2 arithmetic shift immediate intrinsics.
24196   case Intrinsic::x86_sse2_psrai_w:
24197   case Intrinsic::x86_sse2_psrai_d:
24198   case Intrinsic::x86_avx2_psrai_w:
24199   case Intrinsic::x86_avx2_psrai_d:
24200   case Intrinsic::x86_sse2_psra_w:
24201   case Intrinsic::x86_sse2_psra_d:
24202   case Intrinsic::x86_avx2_psra_w:
24203   case Intrinsic::x86_avx2_psra_d: {
24204     SDValue Op0 = N->getOperand(1);
24205     SDValue Op1 = N->getOperand(2);
24206     EVT VT = Op0.getValueType();
24207     assert(VT.isVector() && "Expected a vector type!");
24208
24209     if (isa<BuildVectorSDNode>(Op1))
24210       Op1 = Op1.getOperand(0);
24211
24212     if (!isa<ConstantSDNode>(Op1))
24213       return SDValue();
24214
24215     EVT SVT = VT.getVectorElementType();
24216     unsigned SVTBits = SVT.getSizeInBits();
24217
24218     ConstantSDNode *CND = cast<ConstantSDNode>(Op1);
24219     const APInt &C = APInt(SVTBits, CND->getAPIntValue().getZExtValue());
24220     uint64_t ShAmt = C.getZExtValue();
24221
24222     // Don't try to convert this shift into a ISD::SRA if the shift
24223     // count is bigger than or equal to the element size.
24224     if (ShAmt >= SVTBits)
24225       return SDValue();
24226
24227     // Trivial case: if the shift count is zero, then fold this
24228     // into the first operand.
24229     if (ShAmt == 0)
24230       return Op0;
24231
24232     // Replace this packed shift intrinsic with a target independent
24233     // shift dag node.
24234     SDValue Splat = DAG.getConstant(C, VT);
24235     return DAG.getNode(ISD::SRA, SDLoc(N), VT, Op0, Splat);
24236   }
24237   }
24238 }
24239
24240 /// PerformMulCombine - Optimize a single multiply with constant into two
24241 /// in order to implement it with two cheaper instructions, e.g.
24242 /// LEA + SHL, LEA + LEA.
24243 static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
24244                                  TargetLowering::DAGCombinerInfo &DCI) {
24245   if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
24246     return SDValue();
24247
24248   EVT VT = N->getValueType(0);
24249   if (VT != MVT::i64 && VT != MVT::i32)
24250     return SDValue();
24251
24252   ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
24253   if (!C)
24254     return SDValue();
24255   uint64_t MulAmt = C->getZExtValue();
24256   if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
24257     return SDValue();
24258
24259   uint64_t MulAmt1 = 0;
24260   uint64_t MulAmt2 = 0;
24261   if ((MulAmt % 9) == 0) {
24262     MulAmt1 = 9;
24263     MulAmt2 = MulAmt / 9;
24264   } else if ((MulAmt % 5) == 0) {
24265     MulAmt1 = 5;
24266     MulAmt2 = MulAmt / 5;
24267   } else if ((MulAmt % 3) == 0) {
24268     MulAmt1 = 3;
24269     MulAmt2 = MulAmt / 3;
24270   }
24271   if (MulAmt2 &&
24272       (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
24273     SDLoc DL(N);
24274
24275     if (isPowerOf2_64(MulAmt2) &&
24276         !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
24277       // If second multiplifer is pow2, issue it first. We want the multiply by
24278       // 3, 5, or 9 to be folded into the addressing mode unless the lone use
24279       // is an add.
24280       std::swap(MulAmt1, MulAmt2);
24281
24282     SDValue NewMul;
24283     if (isPowerOf2_64(MulAmt1))
24284       NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
24285                            DAG.getConstant(Log2_64(MulAmt1), MVT::i8));
24286     else
24287       NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
24288                            DAG.getConstant(MulAmt1, VT));
24289
24290     if (isPowerOf2_64(MulAmt2))
24291       NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
24292                            DAG.getConstant(Log2_64(MulAmt2), MVT::i8));
24293     else
24294       NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
24295                            DAG.getConstant(MulAmt2, VT));
24296
24297     // Do not add new nodes to DAG combiner worklist.
24298     DCI.CombineTo(N, NewMul, false);
24299   }
24300   return SDValue();
24301 }
24302
24303 static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
24304   SDValue N0 = N->getOperand(0);
24305   SDValue N1 = N->getOperand(1);
24306   ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
24307   EVT VT = N0.getValueType();
24308
24309   // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
24310   // since the result of setcc_c is all zero's or all ones.
24311   if (VT.isInteger() && !VT.isVector() &&
24312       N1C && N0.getOpcode() == ISD::AND &&
24313       N0.getOperand(1).getOpcode() == ISD::Constant) {
24314     SDValue N00 = N0.getOperand(0);
24315     if (N00.getOpcode() == X86ISD::SETCC_CARRY ||
24316         ((N00.getOpcode() == ISD::ANY_EXTEND ||
24317           N00.getOpcode() == ISD::ZERO_EXTEND) &&
24318          N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) {
24319       APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
24320       APInt ShAmt = N1C->getAPIntValue();
24321       Mask = Mask.shl(ShAmt);
24322       if (Mask != 0)
24323         return DAG.getNode(ISD::AND, SDLoc(N), VT,
24324                            N00, DAG.getConstant(Mask, VT));
24325     }
24326   }
24327
24328   // Hardware support for vector shifts is sparse which makes us scalarize the
24329   // vector operations in many cases. Also, on sandybridge ADD is faster than
24330   // shl.
24331   // (shl V, 1) -> add V,V
24332   if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
24333     if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
24334       assert(N0.getValueType().isVector() && "Invalid vector shift type");
24335       // We shift all of the values by one. In many cases we do not have
24336       // hardware support for this operation. This is better expressed as an ADD
24337       // of two values.
24338       if (N1SplatC->getZExtValue() == 1)
24339         return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
24340     }
24341
24342   return SDValue();
24343 }
24344
24345 /// \brief Returns a vector of 0s if the node in input is a vector logical
24346 /// shift by a constant amount which is known to be bigger than or equal
24347 /// to the vector element size in bits.
24348 static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
24349                                       const X86Subtarget *Subtarget) {
24350   EVT VT = N->getValueType(0);
24351
24352   if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 &&
24353       (!Subtarget->hasInt256() ||
24354        (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16)))
24355     return SDValue();
24356
24357   SDValue Amt = N->getOperand(1);
24358   SDLoc DL(N);
24359   if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Amt))
24360     if (auto *AmtSplat = AmtBV->getConstantSplatNode()) {
24361       APInt ShiftAmt = AmtSplat->getAPIntValue();
24362       unsigned MaxAmount = VT.getVectorElementType().getSizeInBits();
24363
24364       // SSE2/AVX2 logical shifts always return a vector of 0s
24365       // if the shift amount is bigger than or equal to
24366       // the element size. The constant shift amount will be
24367       // encoded as a 8-bit immediate.
24368       if (ShiftAmt.trunc(8).uge(MaxAmount))
24369         return getZeroVector(VT, Subtarget, DAG, DL);
24370     }
24371
24372   return SDValue();
24373 }
24374
24375 /// PerformShiftCombine - Combine shifts.
24376 static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
24377                                    TargetLowering::DAGCombinerInfo &DCI,
24378                                    const X86Subtarget *Subtarget) {
24379   if (N->getOpcode() == ISD::SHL) {
24380     SDValue V = PerformSHLCombine(N, DAG);
24381     if (V.getNode()) return V;
24382   }
24383
24384   if (N->getOpcode() != ISD::SRA) {
24385     // Try to fold this logical shift into a zero vector.
24386     SDValue V = performShiftToAllZeros(N, DAG, Subtarget);
24387     if (V.getNode()) return V;
24388   }
24389
24390   return SDValue();
24391 }
24392
24393 // CMPEQCombine - Recognize the distinctive  (AND (setcc ...) (setcc ..))
24394 // where both setccs reference the same FP CMP, and rewrite for CMPEQSS
24395 // and friends.  Likewise for OR -> CMPNEQSS.
24396 static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG,
24397                             TargetLowering::DAGCombinerInfo &DCI,
24398                             const X86Subtarget *Subtarget) {
24399   unsigned opcode;
24400
24401   // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
24402   // we're requiring SSE2 for both.
24403   if (Subtarget->hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
24404     SDValue N0 = N->getOperand(0);
24405     SDValue N1 = N->getOperand(1);
24406     SDValue CMP0 = N0->getOperand(1);
24407     SDValue CMP1 = N1->getOperand(1);
24408     SDLoc DL(N);
24409
24410     // The SETCCs should both refer to the same CMP.
24411     if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
24412       return SDValue();
24413
24414     SDValue CMP00 = CMP0->getOperand(0);
24415     SDValue CMP01 = CMP0->getOperand(1);
24416     EVT     VT    = CMP00.getValueType();
24417
24418     if (VT == MVT::f32 || VT == MVT::f64) {
24419       bool ExpectingFlags = false;
24420       // Check for any users that want flags:
24421       for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
24422            !ExpectingFlags && UI != UE; ++UI)
24423         switch (UI->getOpcode()) {
24424         default:
24425         case ISD::BR_CC:
24426         case ISD::BRCOND:
24427         case ISD::SELECT:
24428           ExpectingFlags = true;
24429           break;
24430         case ISD::CopyToReg:
24431         case ISD::SIGN_EXTEND:
24432         case ISD::ZERO_EXTEND:
24433         case ISD::ANY_EXTEND:
24434           break;
24435         }
24436
24437       if (!ExpectingFlags) {
24438         enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
24439         enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
24440
24441         if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
24442           X86::CondCode tmp = cc0;
24443           cc0 = cc1;
24444           cc1 = tmp;
24445         }
24446
24447         if ((cc0 == X86::COND_E  && cc1 == X86::COND_NP) ||
24448             (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
24449           // FIXME: need symbolic constants for these magic numbers.
24450           // See X86ATTInstPrinter.cpp:printSSECC().
24451           unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
24452           if (Subtarget->hasAVX512()) {
24453             SDValue FSetCC = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CMP00,
24454                                          CMP01, DAG.getConstant(x86cc, MVT::i8));
24455             if (N->getValueType(0) != MVT::i1)
24456               return DAG.getNode(ISD::ZERO_EXTEND, DL, N->getValueType(0),
24457                                  FSetCC);
24458             return FSetCC;
24459           }
24460           SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
24461                                               CMP00.getValueType(), CMP00, CMP01,
24462                                               DAG.getConstant(x86cc, MVT::i8));
24463
24464           bool is64BitFP = (CMP00.getValueType() == MVT::f64);
24465           MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
24466
24467           if (is64BitFP && !Subtarget->is64Bit()) {
24468             // On a 32-bit target, we cannot bitcast the 64-bit float to a
24469             // 64-bit integer, since that's not a legal type. Since
24470             // OnesOrZeroesF is all ones of all zeroes, we don't need all the
24471             // bits, but can do this little dance to extract the lowest 32 bits
24472             // and work with those going forward.
24473             SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
24474                                            OnesOrZeroesF);
24475             SDValue Vector32 = DAG.getNode(ISD::BITCAST, DL, MVT::v4f32,
24476                                            Vector64);
24477             OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
24478                                         Vector32, DAG.getIntPtrConstant(0));
24479             IntVT = MVT::i32;
24480           }
24481
24482           SDValue OnesOrZeroesI = DAG.getNode(ISD::BITCAST, DL, IntVT, OnesOrZeroesF);
24483           SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
24484                                       DAG.getConstant(1, IntVT));
24485           SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ANDed);
24486           return OneBitOfTruth;
24487         }
24488       }
24489     }
24490   }
24491   return SDValue();
24492 }
24493
24494 /// CanFoldXORWithAllOnes - Test whether the XOR operand is a AllOnes vector
24495 /// so it can be folded inside ANDNP.
24496 static bool CanFoldXORWithAllOnes(const SDNode *N) {
24497   EVT VT = N->getValueType(0);
24498
24499   // Match direct AllOnes for 128 and 256-bit vectors
24500   if (ISD::isBuildVectorAllOnes(N))
24501     return true;
24502
24503   // Look through a bit convert.
24504   if (N->getOpcode() == ISD::BITCAST)
24505     N = N->getOperand(0).getNode();
24506
24507   // Sometimes the operand may come from a insert_subvector building a 256-bit
24508   // allones vector
24509   if (VT.is256BitVector() &&
24510       N->getOpcode() == ISD::INSERT_SUBVECTOR) {
24511     SDValue V1 = N->getOperand(0);
24512     SDValue V2 = N->getOperand(1);
24513
24514     if (V1.getOpcode() == ISD::INSERT_SUBVECTOR &&
24515         V1.getOperand(0).getOpcode() == ISD::UNDEF &&
24516         ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) &&
24517         ISD::isBuildVectorAllOnes(V2.getNode()))
24518       return true;
24519   }
24520
24521   return false;
24522 }
24523
24524 // On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
24525 // register. In most cases we actually compare or select YMM-sized registers
24526 // and mixing the two types creates horrible code. This method optimizes
24527 // some of the transition sequences.
24528 static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
24529                                  TargetLowering::DAGCombinerInfo &DCI,
24530                                  const X86Subtarget *Subtarget) {
24531   EVT VT = N->getValueType(0);
24532   if (!VT.is256BitVector())
24533     return SDValue();
24534
24535   assert((N->getOpcode() == ISD::ANY_EXTEND ||
24536           N->getOpcode() == ISD::ZERO_EXTEND ||
24537           N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
24538
24539   SDValue Narrow = N->getOperand(0);
24540   EVT NarrowVT = Narrow->getValueType(0);
24541   if (!NarrowVT.is128BitVector())
24542     return SDValue();
24543
24544   if (Narrow->getOpcode() != ISD::XOR &&
24545       Narrow->getOpcode() != ISD::AND &&
24546       Narrow->getOpcode() != ISD::OR)
24547     return SDValue();
24548
24549   SDValue N0  = Narrow->getOperand(0);
24550   SDValue N1  = Narrow->getOperand(1);
24551   SDLoc DL(Narrow);
24552
24553   // The Left side has to be a trunc.
24554   if (N0.getOpcode() != ISD::TRUNCATE)
24555     return SDValue();
24556
24557   // The type of the truncated inputs.
24558   EVT WideVT = N0->getOperand(0)->getValueType(0);
24559   if (WideVT != VT)
24560     return SDValue();
24561
24562   // The right side has to be a 'trunc' or a constant vector.
24563   bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE;
24564   ConstantSDNode *RHSConstSplat = nullptr;
24565   if (auto *RHSBV = dyn_cast<BuildVectorSDNode>(N1))
24566     RHSConstSplat = RHSBV->getConstantSplatNode();
24567   if (!RHSTrunc && !RHSConstSplat)
24568     return SDValue();
24569
24570   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24571
24572   if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), WideVT))
24573     return SDValue();
24574
24575   // Set N0 and N1 to hold the inputs to the new wide operation.
24576   N0 = N0->getOperand(0);
24577   if (RHSConstSplat) {
24578     N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getScalarType(),
24579                      SDValue(RHSConstSplat, 0));
24580     SmallVector<SDValue, 8> C(WideVT.getVectorNumElements(), N1);
24581     N1 = DAG.getNode(ISD::BUILD_VECTOR, DL, WideVT, C);
24582   } else if (RHSTrunc) {
24583     N1 = N1->getOperand(0);
24584   }
24585
24586   // Generate the wide operation.
24587   SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, WideVT, N0, N1);
24588   unsigned Opcode = N->getOpcode();
24589   switch (Opcode) {
24590   case ISD::ANY_EXTEND:
24591     return Op;
24592   case ISD::ZERO_EXTEND: {
24593     unsigned InBits = NarrowVT.getScalarType().getSizeInBits();
24594     APInt Mask = APInt::getAllOnesValue(InBits);
24595     Mask = Mask.zext(VT.getScalarType().getSizeInBits());
24596     return DAG.getNode(ISD::AND, DL, VT,
24597                        Op, DAG.getConstant(Mask, VT));
24598   }
24599   case ISD::SIGN_EXTEND:
24600     return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
24601                        Op, DAG.getValueType(NarrowVT));
24602   default:
24603     llvm_unreachable("Unexpected opcode");
24604   }
24605 }
24606
24607 static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
24608                                  TargetLowering::DAGCombinerInfo &DCI,
24609                                  const X86Subtarget *Subtarget) {
24610   EVT VT = N->getValueType(0);
24611   if (DCI.isBeforeLegalizeOps())
24612     return SDValue();
24613
24614   SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget);
24615   if (R.getNode())
24616     return R;
24617
24618   // Create BEXTR instructions
24619   // BEXTR is ((X >> imm) & (2**size-1))
24620   if (VT == MVT::i32 || VT == MVT::i64) {
24621     SDValue N0 = N->getOperand(0);
24622     SDValue N1 = N->getOperand(1);
24623     SDLoc DL(N);
24624
24625     // Check for BEXTR.
24626     if ((Subtarget->hasBMI() || Subtarget->hasTBM()) &&
24627         (N0.getOpcode() == ISD::SRA || N0.getOpcode() == ISD::SRL)) {
24628       ConstantSDNode *MaskNode = dyn_cast<ConstantSDNode>(N1);
24629       ConstantSDNode *ShiftNode = dyn_cast<ConstantSDNode>(N0.getOperand(1));
24630       if (MaskNode && ShiftNode) {
24631         uint64_t Mask = MaskNode->getZExtValue();
24632         uint64_t Shift = ShiftNode->getZExtValue();
24633         if (isMask_64(Mask)) {
24634           uint64_t MaskSize = countPopulation(Mask);
24635           if (Shift + MaskSize <= VT.getSizeInBits())
24636             return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0),
24637                                DAG.getConstant(Shift | (MaskSize << 8), VT));
24638         }
24639       }
24640     } // BEXTR
24641
24642     return SDValue();
24643   }
24644
24645   // Want to form ANDNP nodes:
24646   // 1) In the hopes of then easily combining them with OR and AND nodes
24647   //    to form PBLEND/PSIGN.
24648   // 2) To match ANDN packed intrinsics
24649   if (VT != MVT::v2i64 && VT != MVT::v4i64)
24650     return SDValue();
24651
24652   SDValue N0 = N->getOperand(0);
24653   SDValue N1 = N->getOperand(1);
24654   SDLoc DL(N);
24655
24656   // Check LHS for vnot
24657   if (N0.getOpcode() == ISD::XOR &&
24658       //ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
24659       CanFoldXORWithAllOnes(N0.getOperand(1).getNode()))
24660     return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1);
24661
24662   // Check RHS for vnot
24663   if (N1.getOpcode() == ISD::XOR &&
24664       //ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
24665       CanFoldXORWithAllOnes(N1.getOperand(1).getNode()))
24666     return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0);
24667
24668   return SDValue();
24669 }
24670
24671 static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
24672                                 TargetLowering::DAGCombinerInfo &DCI,
24673                                 const X86Subtarget *Subtarget) {
24674   if (DCI.isBeforeLegalizeOps())
24675     return SDValue();
24676
24677   SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget);
24678   if (R.getNode())
24679     return R;
24680
24681   SDValue N0 = N->getOperand(0);
24682   SDValue N1 = N->getOperand(1);
24683   EVT VT = N->getValueType(0);
24684
24685   // look for psign/blend
24686   if (VT == MVT::v2i64 || VT == MVT::v4i64) {
24687     if (!Subtarget->hasSSSE3() ||
24688         (VT == MVT::v4i64 && !Subtarget->hasInt256()))
24689       return SDValue();
24690
24691     // Canonicalize pandn to RHS
24692     if (N0.getOpcode() == X86ISD::ANDNP)
24693       std::swap(N0, N1);
24694     // or (and (m, y), (pandn m, x))
24695     if (N0.getOpcode() == ISD::AND && N1.getOpcode() == X86ISD::ANDNP) {
24696       SDValue Mask = N1.getOperand(0);
24697       SDValue X    = N1.getOperand(1);
24698       SDValue Y;
24699       if (N0.getOperand(0) == Mask)
24700         Y = N0.getOperand(1);
24701       if (N0.getOperand(1) == Mask)
24702         Y = N0.getOperand(0);
24703
24704       // Check to see if the mask appeared in both the AND and ANDNP and
24705       if (!Y.getNode())
24706         return SDValue();
24707
24708       // Validate that X, Y, and Mask are BIT_CONVERTS, and see through them.
24709       // Look through mask bitcast.
24710       if (Mask.getOpcode() == ISD::BITCAST)
24711         Mask = Mask.getOperand(0);
24712       if (X.getOpcode() == ISD::BITCAST)
24713         X = X.getOperand(0);
24714       if (Y.getOpcode() == ISD::BITCAST)
24715         Y = Y.getOperand(0);
24716
24717       EVT MaskVT = Mask.getValueType();
24718
24719       // Validate that the Mask operand is a vector sra node.
24720       // FIXME: what to do for bytes, since there is a psignb/pblendvb, but
24721       // there is no psrai.b
24722       unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits();
24723       unsigned SraAmt = ~0;
24724       if (Mask.getOpcode() == ISD::SRA) {
24725         if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Mask.getOperand(1)))
24726           if (auto *AmtConst = AmtBV->getConstantSplatNode())
24727             SraAmt = AmtConst->getZExtValue();
24728       } else if (Mask.getOpcode() == X86ISD::VSRAI) {
24729         SDValue SraC = Mask.getOperand(1);
24730         SraAmt  = cast<ConstantSDNode>(SraC)->getZExtValue();
24731       }
24732       if ((SraAmt + 1) != EltBits)
24733         return SDValue();
24734
24735       SDLoc DL(N);
24736
24737       // Now we know we at least have a plendvb with the mask val.  See if
24738       // we can form a psignb/w/d.
24739       // psign = x.type == y.type == mask.type && y = sub(0, x);
24740       if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X &&
24741           ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) &&
24742           X.getValueType() == MaskVT && Y.getValueType() == MaskVT) {
24743         assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
24744                "Unsupported VT for PSIGN");
24745         Mask = DAG.getNode(X86ISD::PSIGN, DL, MaskVT, X, Mask.getOperand(0));
24746         return DAG.getNode(ISD::BITCAST, DL, VT, Mask);
24747       }
24748       // PBLENDVB only available on SSE 4.1
24749       if (!Subtarget->hasSSE41())
24750         return SDValue();
24751
24752       EVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;
24753
24754       X = DAG.getNode(ISD::BITCAST, DL, BlendVT, X);
24755       Y = DAG.getNode(ISD::BITCAST, DL, BlendVT, Y);
24756       Mask = DAG.getNode(ISD::BITCAST, DL, BlendVT, Mask);
24757       Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X);
24758       return DAG.getNode(ISD::BITCAST, DL, VT, Mask);
24759     }
24760   }
24761
24762   if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
24763     return SDValue();
24764
24765   // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
24766   MachineFunction &MF = DAG.getMachineFunction();
24767   bool OptForSize = MF.getFunction()->getAttributes().
24768     hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
24769
24770   // SHLD/SHRD instructions have lower register pressure, but on some
24771   // platforms they have higher latency than the equivalent
24772   // series of shifts/or that would otherwise be generated.
24773   // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
24774   // have higher latencies and we are not optimizing for size.
24775   if (!OptForSize && Subtarget->isSHLDSlow())
24776     return SDValue();
24777
24778   if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
24779     std::swap(N0, N1);
24780   if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
24781     return SDValue();
24782   if (!N0.hasOneUse() || !N1.hasOneUse())
24783     return SDValue();
24784
24785   SDValue ShAmt0 = N0.getOperand(1);
24786   if (ShAmt0.getValueType() != MVT::i8)
24787     return SDValue();
24788   SDValue ShAmt1 = N1.getOperand(1);
24789   if (ShAmt1.getValueType() != MVT::i8)
24790     return SDValue();
24791   if (ShAmt0.getOpcode() == ISD::TRUNCATE)
24792     ShAmt0 = ShAmt0.getOperand(0);
24793   if (ShAmt1.getOpcode() == ISD::TRUNCATE)
24794     ShAmt1 = ShAmt1.getOperand(0);
24795
24796   SDLoc DL(N);
24797   unsigned Opc = X86ISD::SHLD;
24798   SDValue Op0 = N0.getOperand(0);
24799   SDValue Op1 = N1.getOperand(0);
24800   if (ShAmt0.getOpcode() == ISD::SUB) {
24801     Opc = X86ISD::SHRD;
24802     std::swap(Op0, Op1);
24803     std::swap(ShAmt0, ShAmt1);
24804   }
24805
24806   unsigned Bits = VT.getSizeInBits();
24807   if (ShAmt1.getOpcode() == ISD::SUB) {
24808     SDValue Sum = ShAmt1.getOperand(0);
24809     if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
24810       SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
24811       if (ShAmt1Op1.getNode()->getOpcode() == ISD::TRUNCATE)
24812         ShAmt1Op1 = ShAmt1Op1.getOperand(0);
24813       if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
24814         return DAG.getNode(Opc, DL, VT,
24815                            Op0, Op1,
24816                            DAG.getNode(ISD::TRUNCATE, DL,
24817                                        MVT::i8, ShAmt0));
24818     }
24819   } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
24820     ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
24821     if (ShAmt0C &&
24822         ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == Bits)
24823       return DAG.getNode(Opc, DL, VT,
24824                          N0.getOperand(0), N1.getOperand(0),
24825                          DAG.getNode(ISD::TRUNCATE, DL,
24826                                        MVT::i8, ShAmt0));
24827   }
24828
24829   return SDValue();
24830 }
24831
24832 // Generate NEG and CMOV for integer abs.
24833 static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
24834   EVT VT = N->getValueType(0);
24835
24836   // Since X86 does not have CMOV for 8-bit integer, we don't convert
24837   // 8-bit integer abs to NEG and CMOV.
24838   if (VT.isInteger() && VT.getSizeInBits() == 8)
24839     return SDValue();
24840
24841   SDValue N0 = N->getOperand(0);
24842   SDValue N1 = N->getOperand(1);
24843   SDLoc DL(N);
24844
24845   // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
24846   // and change it to SUB and CMOV.
24847   if (VT.isInteger() && N->getOpcode() == ISD::XOR &&
24848       N0.getOpcode() == ISD::ADD &&
24849       N0.getOperand(1) == N1 &&
24850       N1.getOpcode() == ISD::SRA &&
24851       N1.getOperand(0) == N0.getOperand(0))
24852     if (ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1)))
24853       if (Y1C->getAPIntValue() == VT.getSizeInBits()-1) {
24854         // Generate SUB & CMOV.
24855         SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
24856                                   DAG.getConstant(0, VT), N0.getOperand(0));
24857
24858         SDValue Ops[] = { N0.getOperand(0), Neg,
24859                           DAG.getConstant(X86::COND_GE, MVT::i8),
24860                           SDValue(Neg.getNode(), 1) };
24861         return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), Ops);
24862       }
24863   return SDValue();
24864 }
24865
24866 // PerformXorCombine - Attempts to turn XOR nodes into BLSMSK nodes
24867 static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG,
24868                                  TargetLowering::DAGCombinerInfo &DCI,
24869                                  const X86Subtarget *Subtarget) {
24870   if (DCI.isBeforeLegalizeOps())
24871     return SDValue();
24872
24873   if (Subtarget->hasCMov()) {
24874     SDValue RV = performIntegerAbsCombine(N, DAG);
24875     if (RV.getNode())
24876       return RV;
24877   }
24878
24879   return SDValue();
24880 }
24881
24882 /// PerformLOADCombine - Do target-specific dag combines on LOAD nodes.
24883 static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
24884                                   TargetLowering::DAGCombinerInfo &DCI,
24885                                   const X86Subtarget *Subtarget) {
24886   LoadSDNode *Ld = cast<LoadSDNode>(N);
24887   EVT RegVT = Ld->getValueType(0);
24888   EVT MemVT = Ld->getMemoryVT();
24889   SDLoc dl(Ld);
24890   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24891
24892   // For chips with slow 32-byte unaligned loads, break the 32-byte operation
24893   // into two 16-byte operations.
24894   ISD::LoadExtType Ext = Ld->getExtensionType();
24895   unsigned Alignment = Ld->getAlignment();
24896   bool IsAligned = Alignment == 0 || Alignment >= MemVT.getSizeInBits()/8;
24897   if (RegVT.is256BitVector() && Subtarget->isUnalignedMem32Slow() &&
24898       !DCI.isBeforeLegalizeOps() && !IsAligned && Ext == ISD::NON_EXTLOAD) {
24899     unsigned NumElems = RegVT.getVectorNumElements();
24900     if (NumElems < 2)
24901       return SDValue();
24902
24903     SDValue Ptr = Ld->getBasePtr();
24904     SDValue Increment = DAG.getConstant(16, TLI.getPointerTy());
24905
24906     EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
24907                                   NumElems/2);
24908     SDValue Load1 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
24909                                 Ld->getPointerInfo(), Ld->isVolatile(),
24910                                 Ld->isNonTemporal(), Ld->isInvariant(),
24911                                 Alignment);
24912     Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
24913     SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
24914                                 Ld->getPointerInfo(), Ld->isVolatile(),
24915                                 Ld->isNonTemporal(), Ld->isInvariant(),
24916                                 std::min(16U, Alignment));
24917     SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
24918                              Load1.getValue(1),
24919                              Load2.getValue(1));
24920
24921     SDValue NewVec = DAG.getUNDEF(RegVT);
24922     NewVec = Insert128BitVector(NewVec, Load1, 0, DAG, dl);
24923     NewVec = Insert128BitVector(NewVec, Load2, NumElems/2, DAG, dl);
24924     return DCI.CombineTo(N, NewVec, TF, true);
24925   }
24926
24927   return SDValue();
24928 }
24929
24930 /// PerformMLOADCombine - Resolve extending loads
24931 static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG,
24932                                    TargetLowering::DAGCombinerInfo &DCI,
24933                                    const X86Subtarget *Subtarget) {
24934   MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N);
24935   if (Mld->getExtensionType() != ISD::SEXTLOAD)
24936     return SDValue();
24937
24938   EVT VT = Mld->getValueType(0);
24939   unsigned NumElems = VT.getVectorNumElements();
24940   EVT LdVT = Mld->getMemoryVT();
24941   SDLoc dl(Mld);
24942
24943   assert(LdVT != VT && "Cannot extend to the same type");
24944   unsigned ToSz = VT.getVectorElementType().getSizeInBits();
24945   unsigned FromSz = LdVT.getVectorElementType().getSizeInBits();
24946   // From, To sizes and ElemCount must be pow of two
24947   assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
24948     "Unexpected size for extending masked load");
24949
24950   unsigned SizeRatio  = ToSz / FromSz;
24951   assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits());
24952
24953   // Create a type on which we perform the shuffle
24954   EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
24955           LdVT.getScalarType(), NumElems*SizeRatio);
24956   assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
24957
24958   // Convert Src0 value
24959   SDValue WideSrc0 = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mld->getSrc0());
24960   if (Mld->getSrc0().getOpcode() != ISD::UNDEF) {
24961     SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
24962     for (unsigned i = 0; i != NumElems; ++i)
24963       ShuffleVec[i] = i * SizeRatio;
24964
24965     // Can't shuffle using an illegal type.
24966     assert (DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT)
24967             && "WideVecVT should be legal");
24968     WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0,
24969                                     DAG.getUNDEF(WideVecVT), &ShuffleVec[0]);
24970   }
24971   // Prepare the new mask
24972   SDValue NewMask;
24973   SDValue Mask = Mld->getMask();
24974   if (Mask.getValueType() == VT) {
24975     // Mask and original value have the same type
24976     NewMask = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mask);
24977     SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
24978     for (unsigned i = 0; i != NumElems; ++i)
24979       ShuffleVec[i] = i * SizeRatio;
24980     for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
24981       ShuffleVec[i] = NumElems*SizeRatio;
24982     NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
24983                                    DAG.getConstant(0, WideVecVT),
24984                                    &ShuffleVec[0]);
24985   }
24986   else {
24987     assert(Mask.getValueType().getVectorElementType() == MVT::i1);
24988     unsigned WidenNumElts = NumElems*SizeRatio;
24989     unsigned MaskNumElts = VT.getVectorNumElements();
24990     EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(),  MVT::i1,
24991                                      WidenNumElts);
24992
24993     unsigned NumConcat = WidenNumElts / MaskNumElts;
24994     SmallVector<SDValue, 16> Ops(NumConcat);
24995     SDValue ZeroVal = DAG.getConstant(0, Mask.getValueType());
24996     Ops[0] = Mask;
24997     for (unsigned i = 1; i != NumConcat; ++i)
24998       Ops[i] = ZeroVal;
24999
25000     NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
25001   }
25002
25003   SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(),
25004                                      Mld->getBasePtr(), NewMask, WideSrc0,
25005                                      Mld->getMemoryVT(), Mld->getMemOperand(),
25006                                      ISD::NON_EXTLOAD);
25007   SDValue NewVec = DAG.getNode(X86ISD::VSEXT, dl, VT, WideLd);
25008   return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
25009
25010 }
25011 /// PerformMSTORECombine - Resolve truncating stores
25012 static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG,
25013                                     const X86Subtarget *Subtarget) {
25014   MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
25015   if (!Mst->isTruncatingStore())
25016     return SDValue();
25017
25018   EVT VT = Mst->getValue().getValueType();
25019   unsigned NumElems = VT.getVectorNumElements();
25020   EVT StVT = Mst->getMemoryVT();
25021   SDLoc dl(Mst);
25022
25023   assert(StVT != VT && "Cannot truncate to the same type");
25024   unsigned FromSz = VT.getVectorElementType().getSizeInBits();
25025   unsigned ToSz = StVT.getVectorElementType().getSizeInBits();
25026
25027   // From, To sizes and ElemCount must be pow of two
25028   assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
25029     "Unexpected size for truncating masked store");
25030   // We are going to use the original vector elt for storing.
25031   // Accumulated smaller vector elements must be a multiple of the store size.
25032   assert (((NumElems * FromSz) % ToSz) == 0 &&
25033           "Unexpected ratio for truncating masked store");
25034
25035   unsigned SizeRatio  = FromSz / ToSz;
25036   assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
25037
25038   // Create a type on which we perform the shuffle
25039   EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
25040           StVT.getScalarType(), NumElems*SizeRatio);
25041
25042   assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
25043
25044   SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mst->getValue());
25045   SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
25046   for (unsigned i = 0; i != NumElems; ++i)
25047     ShuffleVec[i] = i * SizeRatio;
25048
25049   // Can't shuffle using an illegal type.
25050   assert (DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT)
25051           && "WideVecVT should be legal");
25052
25053   SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
25054                                         DAG.getUNDEF(WideVecVT),
25055                                         &ShuffleVec[0]);
25056
25057   SDValue NewMask;
25058   SDValue Mask = Mst->getMask();
25059   if (Mask.getValueType() == VT) {
25060     // Mask and original value have the same type
25061     NewMask = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mask);
25062     for (unsigned i = 0; i != NumElems; ++i)
25063       ShuffleVec[i] = i * SizeRatio;
25064     for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
25065       ShuffleVec[i] = NumElems*SizeRatio;
25066     NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
25067                                    DAG.getConstant(0, WideVecVT),
25068                                    &ShuffleVec[0]);
25069   }
25070   else {
25071     assert(Mask.getValueType().getVectorElementType() == MVT::i1);
25072     unsigned WidenNumElts = NumElems*SizeRatio;
25073     unsigned MaskNumElts = VT.getVectorNumElements();
25074     EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(),  MVT::i1,
25075                                      WidenNumElts);
25076
25077     unsigned NumConcat = WidenNumElts / MaskNumElts;
25078     SmallVector<SDValue, 16> Ops(NumConcat);
25079     SDValue ZeroVal = DAG.getConstant(0, Mask.getValueType());
25080     Ops[0] = Mask;
25081     for (unsigned i = 1; i != NumConcat; ++i)
25082       Ops[i] = ZeroVal;
25083
25084     NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
25085   }
25086
25087   return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal, Mst->getBasePtr(),
25088                             NewMask, StVT, Mst->getMemOperand(), false);
25089 }
25090 /// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
25091 static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
25092                                    const X86Subtarget *Subtarget) {
25093   StoreSDNode *St = cast<StoreSDNode>(N);
25094   EVT VT = St->getValue().getValueType();
25095   EVT StVT = St->getMemoryVT();
25096   SDLoc dl(St);
25097   SDValue StoredVal = St->getOperand(1);
25098   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25099
25100   // If we are saving a concatenation of two XMM registers and 32-byte stores
25101   // are slow, such as on Sandy Bridge, perform two 16-byte stores.
25102   unsigned Alignment = St->getAlignment();
25103   bool IsAligned = Alignment == 0 || Alignment >= VT.getSizeInBits()/8;
25104   if (VT.is256BitVector() && Subtarget->isUnalignedMem32Slow() &&
25105       StVT == VT && !IsAligned) {
25106     unsigned NumElems = VT.getVectorNumElements();
25107     if (NumElems < 2)
25108       return SDValue();
25109
25110     SDValue Value0 = Extract128BitVector(StoredVal, 0, DAG, dl);
25111     SDValue Value1 = Extract128BitVector(StoredVal, NumElems/2, DAG, dl);
25112
25113     SDValue Stride = DAG.getConstant(16, TLI.getPointerTy());
25114     SDValue Ptr0 = St->getBasePtr();
25115     SDValue Ptr1 = DAG.getNode(ISD::ADD, dl, Ptr0.getValueType(), Ptr0, Stride);
25116
25117     SDValue Ch0 = DAG.getStore(St->getChain(), dl, Value0, Ptr0,
25118                                 St->getPointerInfo(), St->isVolatile(),
25119                                 St->isNonTemporal(), Alignment);
25120     SDValue Ch1 = DAG.getStore(St->getChain(), dl, Value1, Ptr1,
25121                                 St->getPointerInfo(), St->isVolatile(),
25122                                 St->isNonTemporal(),
25123                                 std::min(16U, Alignment));
25124     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
25125   }
25126
25127   // Optimize trunc store (of multiple scalars) to shuffle and store.
25128   // First, pack all of the elements in one place. Next, store to memory
25129   // in fewer chunks.
25130   if (St->isTruncatingStore() && VT.isVector()) {
25131     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25132     unsigned NumElems = VT.getVectorNumElements();
25133     assert(StVT != VT && "Cannot truncate to the same type");
25134     unsigned FromSz = VT.getVectorElementType().getSizeInBits();
25135     unsigned ToSz = StVT.getVectorElementType().getSizeInBits();
25136
25137     // From, To sizes and ElemCount must be pow of two
25138     if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
25139     // We are going to use the original vector elt for storing.
25140     // Accumulated smaller vector elements must be a multiple of the store size.
25141     if (0 != (NumElems * FromSz) % ToSz) return SDValue();
25142
25143     unsigned SizeRatio  = FromSz / ToSz;
25144
25145     assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
25146
25147     // Create a type on which we perform the shuffle
25148     EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
25149             StVT.getScalarType(), NumElems*SizeRatio);
25150
25151     assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
25152
25153     SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, St->getValue());
25154     SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
25155     for (unsigned i = 0; i != NumElems; ++i)
25156       ShuffleVec[i] = i * SizeRatio;
25157
25158     // Can't shuffle using an illegal type.
25159     if (!TLI.isTypeLegal(WideVecVT))
25160       return SDValue();
25161
25162     SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
25163                                          DAG.getUNDEF(WideVecVT),
25164                                          &ShuffleVec[0]);
25165     // At this point all of the data is stored at the bottom of the
25166     // register. We now need to save it to mem.
25167
25168     // Find the largest store unit
25169     MVT StoreType = MVT::i8;
25170     for (MVT Tp : MVT::integer_valuetypes()) {
25171       if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
25172         StoreType = Tp;
25173     }
25174
25175     // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
25176     if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
25177         (64 <= NumElems * ToSz))
25178       StoreType = MVT::f64;
25179
25180     // Bitcast the original vector into a vector of store-size units
25181     EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
25182             StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
25183     assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
25184     SDValue ShuffWide = DAG.getNode(ISD::BITCAST, dl, StoreVecVT, Shuff);
25185     SmallVector<SDValue, 8> Chains;
25186     SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8,
25187                                         TLI.getPointerTy());
25188     SDValue Ptr = St->getBasePtr();
25189
25190     // Perform one or more big stores into memory.
25191     for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) {
25192       SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
25193                                    StoreType, ShuffWide,
25194                                    DAG.getIntPtrConstant(i));
25195       SDValue Ch = DAG.getStore(St->getChain(), dl, SubVec, Ptr,
25196                                 St->getPointerInfo(), St->isVolatile(),
25197                                 St->isNonTemporal(), St->getAlignment());
25198       Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
25199       Chains.push_back(Ch);
25200     }
25201
25202     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
25203   }
25204
25205   // Turn load->store of MMX types into GPR load/stores.  This avoids clobbering
25206   // the FP state in cases where an emms may be missing.
25207   // A preferable solution to the general problem is to figure out the right
25208   // places to insert EMMS.  This qualifies as a quick hack.
25209
25210   // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
25211   if (VT.getSizeInBits() != 64)
25212     return SDValue();
25213
25214   const Function *F = DAG.getMachineFunction().getFunction();
25215   bool NoImplicitFloatOps = F->getAttributes().
25216     hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat);
25217   bool F64IsLegal = !DAG.getTarget().Options.UseSoftFloat && !NoImplicitFloatOps
25218                      && Subtarget->hasSSE2();
25219   if ((VT.isVector() ||
25220        (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) &&
25221       isa<LoadSDNode>(St->getValue()) &&
25222       !cast<LoadSDNode>(St->getValue())->isVolatile() &&
25223       St->getChain().hasOneUse() && !St->isVolatile()) {
25224     SDNode* LdVal = St->getValue().getNode();
25225     LoadSDNode *Ld = nullptr;
25226     int TokenFactorIndex = -1;
25227     SmallVector<SDValue, 8> Ops;
25228     SDNode* ChainVal = St->getChain().getNode();
25229     // Must be a store of a load.  We currently handle two cases:  the load
25230     // is a direct child, and it's under an intervening TokenFactor.  It is
25231     // possible to dig deeper under nested TokenFactors.
25232     if (ChainVal == LdVal)
25233       Ld = cast<LoadSDNode>(St->getChain());
25234     else if (St->getValue().hasOneUse() &&
25235              ChainVal->getOpcode() == ISD::TokenFactor) {
25236       for (unsigned i = 0, e = ChainVal->getNumOperands(); i != e; ++i) {
25237         if (ChainVal->getOperand(i).getNode() == LdVal) {
25238           TokenFactorIndex = i;
25239           Ld = cast<LoadSDNode>(St->getValue());
25240         } else
25241           Ops.push_back(ChainVal->getOperand(i));
25242       }
25243     }
25244
25245     if (!Ld || !ISD::isNormalLoad(Ld))
25246       return SDValue();
25247
25248     // If this is not the MMX case, i.e. we are just turning i64 load/store
25249     // into f64 load/store, avoid the transformation if there are multiple
25250     // uses of the loaded value.
25251     if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
25252       return SDValue();
25253
25254     SDLoc LdDL(Ld);
25255     SDLoc StDL(N);
25256     // If we are a 64-bit capable x86, lower to a single movq load/store pair.
25257     // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
25258     // pair instead.
25259     if (Subtarget->is64Bit() || F64IsLegal) {
25260       EVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64;
25261       SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
25262                                   Ld->getPointerInfo(), Ld->isVolatile(),
25263                                   Ld->isNonTemporal(), Ld->isInvariant(),
25264                                   Ld->getAlignment());
25265       SDValue NewChain = NewLd.getValue(1);
25266       if (TokenFactorIndex != -1) {
25267         Ops.push_back(NewChain);
25268         NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
25269       }
25270       return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
25271                           St->getPointerInfo(),
25272                           St->isVolatile(), St->isNonTemporal(),
25273                           St->getAlignment());
25274     }
25275
25276     // Otherwise, lower to two pairs of 32-bit loads / stores.
25277     SDValue LoAddr = Ld->getBasePtr();
25278     SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr,
25279                                  DAG.getConstant(4, MVT::i32));
25280
25281     SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
25282                                Ld->getPointerInfo(),
25283                                Ld->isVolatile(), Ld->isNonTemporal(),
25284                                Ld->isInvariant(), Ld->getAlignment());
25285     SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
25286                                Ld->getPointerInfo().getWithOffset(4),
25287                                Ld->isVolatile(), Ld->isNonTemporal(),
25288                                Ld->isInvariant(),
25289                                MinAlign(Ld->getAlignment(), 4));
25290
25291     SDValue NewChain = LoLd.getValue(1);
25292     if (TokenFactorIndex != -1) {
25293       Ops.push_back(LoLd);
25294       Ops.push_back(HiLd);
25295       NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
25296     }
25297
25298     LoAddr = St->getBasePtr();
25299     HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr,
25300                          DAG.getConstant(4, MVT::i32));
25301
25302     SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr,
25303                                 St->getPointerInfo(),
25304                                 St->isVolatile(), St->isNonTemporal(),
25305                                 St->getAlignment());
25306     SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr,
25307                                 St->getPointerInfo().getWithOffset(4),
25308                                 St->isVolatile(),
25309                                 St->isNonTemporal(),
25310                                 MinAlign(St->getAlignment(), 4));
25311     return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
25312   }
25313   return SDValue();
25314 }
25315
25316 /// Return 'true' if LHS op RHS is a "horizontal" vector operation, and if so
25317 /// return the operands for the horizontal operation in LHS and RHS.  A
25318 /// horizontal operation performs the binary operation on successive elements
25319 /// of its first operand, then on successive elements of its second operand,
25320 /// returning the resulting values in a vector.  For example, if
25321 ///   A = < float a0, float a1, float a2, float a3 >
25322 /// and
25323 ///   B = < float b0, float b1, float b2, float b3 >
25324 /// then the result of doing a horizontal operation on A and B is
25325 ///   A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
25326 /// On success LHS is set to A and RHS to B (a side that is UNDEF is replaced
25327 /// by the other one); on failure both are left untouched.  If IsCommutative
25328 /// is set, a pair may also appear swapped, e.g. < a1 op a0, ... >.
25329 /// Note that the binary operation should have the property that if one of the
25330 /// operands is UNDEF then the result is UNDEF.
25331 static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
25332   // Look for the following pattern: if
25333   //   A = < float a0, float a1, float a2, float a3 >
25334   //   B = < float b0, float b1, float b2, float b3 >
25335   // and
25336   //   LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
25337   //   RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
25338   // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
25339   // which is A horizontal-op B.
25340
25341   // At least one of the operands should be a vector shuffle.
25342   if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
25343       RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
25344     return false;
25345
25346   MVT VT = LHS.getSimpleValueType();
25347
25348   assert((VT.is128BitVector() || VT.is256BitVector()) &&
25349          "Unsupported vector type for horizontal add/sub");
25350
25351   // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to
25352   // operate independently on 128-bit lanes.
25353   unsigned NumElts = VT.getVectorNumElements();
25354   unsigned NumLanes = VT.getSizeInBits()/128; // 1 or 2, given the assert above
25355   unsigned NumLaneElts = NumElts / NumLanes;
25356   assert((NumLaneElts % 2 == 0) &&
25357          "Vector type should have an even number of elements in each lane");
25358   unsigned HalfLaneElts = NumLaneElts/2; // result elts per source in a lane
25359
25360   // View LHS in the form
25361   //   LHS = VECTOR_SHUFFLE A, B, LMask
25362   // If LHS is not a shuffle then pretend it is the shuffle
25363   //   LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
25364   // NOTE: in what follows a default initialized SDValue represents an UNDEF of
25365   // type VT.
25366   SDValue A, B;
25367   SmallVector<int, 16> LMask(NumElts);
25368   if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
25369     if (LHS.getOperand(0).getOpcode() != ISD::UNDEF)
25370       A = LHS.getOperand(0);
25371     if (LHS.getOperand(1).getOpcode() != ISD::UNDEF)
25372       B = LHS.getOperand(1);
25373     ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
25374     std::copy(Mask.begin(), Mask.end(), LMask.begin());
25375   } else {
25376     if (LHS.getOpcode() != ISD::UNDEF)
25377       A = LHS;
25378     for (unsigned i = 0; i != NumElts; ++i)
25379       LMask[i] = i; // identity mask for the pretend shuffle
25380   }
25381
25382   // Likewise, view RHS in the form
25383   //   RHS = VECTOR_SHUFFLE C, D, RMask
25384   SDValue C, D;
25385   SmallVector<int, 16> RMask(NumElts);
25386   if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
25387     if (RHS.getOperand(0).getOpcode() != ISD::UNDEF)
25388       C = RHS.getOperand(0);
25389     if (RHS.getOperand(1).getOpcode() != ISD::UNDEF)
25390       D = RHS.getOperand(1);
25391     ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
25392     std::copy(Mask.begin(), Mask.end(), RMask.begin());
25393   } else {
25394     if (RHS.getOpcode() != ISD::UNDEF)
25395       C = RHS;
25396     for (unsigned i = 0; i != NumElts; ++i)
25397       RMask[i] = i; // identity mask for the pretend shuffle
25398   }
25399
25400   // Check that the shuffles are both shuffling the same vectors, in the same
25401   if (!(A == C && B == D) && !(A == D && B == C)) // or in swapped order.
25402     return false;
25403
25404   // If everything is UNDEF then bail out: it would be better to fold to UNDEF.
25405   if (!A.getNode() && !B.getNode())
25406     return false;
25407
25408   // If A and B occur in reverse order in RHS, then "swap" them (which means
25409   // rewriting the mask).
25410   if (A != C)
25411     CommuteVectorShuffleMask(RMask, NumElts);
25412
25413   // At this point LHS and RHS are equivalent to
25414   //   LHS = VECTOR_SHUFFLE A, B, LMask
25415   //   RHS = VECTOR_SHUFFLE A, B, RMask
25416   // Check that the masks correspond to performing a horizontal operation.
25417   for (unsigned l = 0; l != NumElts; l += NumLaneElts) { // l: start of lane
25418     for (unsigned i = 0; i != NumLaneElts; ++i) { // i: element within lane
25419       int LIdx = LMask[i+l], RIdx = RMask[i+l];
25420
25421       // Ignore UNDEF components: a negative index or one into an UNDEF input.
25422       if (LIdx < 0 || RIdx < 0 ||
25423           (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
25424           (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
25425         continue;
25426
25427       // Check that successive elements are being operated on.  If not, this is
25428       // not a horizontal operation.
25429       unsigned Src = (i/HalfLaneElts); // 0 = A, 1 = B: lane is split between srcs
25430       int Index = 2*(i%HalfLaneElts) + NumElts*Src + l; // first elt of the pair
25431       if (!(LIdx == Index && RIdx == Index + 1) &&
25432           !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
25433         return false;
25434     }
25435   }
25436
25437   LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
25438   RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
25439   return true;
25440 }
25441
25442 /// Do target-specific dag combines on floating point adds.
25443 static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG,
25444                                   const X86Subtarget *Subtarget) {
25445   EVT VT = N->getValueType(0);
25446   SDValue LHS = N->getOperand(0);
25447   SDValue RHS = N->getOperand(1);
25448
25449   // Try to synthesize horizontal adds from adds of shuffles.
25450   if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
25451        (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
25452       isHorizontalBinOp(LHS, RHS, true))
25453     return DAG.getNode(X86ISD::FHADD, SDLoc(N), VT, LHS, RHS);
25454   return SDValue();
25455 }
25456
25457 /// Do target-specific dag combines on floating point subs.
25458 static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG,
25459                                   const X86Subtarget *Subtarget) {
25460   EVT VT = N->getValueType(0);
25461   SDValue LHS = N->getOperand(0);
25462   SDValue RHS = N->getOperand(1);
25463
25464   // Try to synthesize horizontal subs from subs of shuffles.
25465   if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
25466        (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
25467       isHorizontalBinOp(LHS, RHS, false))
25468     return DAG.getNode(X86ISD::FHSUB, SDLoc(N), VT, LHS, RHS);
25469   return SDValue();
25470 }
25471
25472 /// Target-specific DAG combine for X86ISD::FOR and X86ISD::FXOR nodes:
25473 /// an operand that is the constant +0.0 is dropped, leaving the other one.
25474 static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) {
25475   assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
25476
25477   // Operand 0 is examined first:
25478   //   F[X]OR(0.0, x) -> x
25479   //   F[X]OR(x, 0.0) -> x
25480   for (unsigned Idx = 0; Idx != 2; ++Idx) {
25481     ConstantFPSDNode *Zero = dyn_cast<ConstantFPSDNode>(N->getOperand(Idx));
25482     if (Zero && Zero->getValueAPF().isPosZero())
25483       return N->getOperand(1 - Idx);
25484   }
25485   return SDValue();
25486 }
25487
25488 /// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
25489 static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) {
25490   assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
25491
25492   // Only perform optimizations if UnsafeMath is used.
25493   if (!DAG.getTarget().Options.UnsafeFPMath)
25494     return SDValue();
25495
25496   // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
25497   // into FMINC and FMAXC, which are Commutative operations.
25498   unsigned NewOp = 0;
25499   switch (N->getOpcode()) {
25500     default: llvm_unreachable("unknown opcode");
25501     case X86ISD::FMIN:  NewOp = X86ISD::FMINC; break;
25502     case X86ISD::FMAX:  NewOp = X86ISD::FMAXC; break;
25503   }
25504
25505   return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
25506                      N->getOperand(0), N->getOperand(1));
25507 }
25508
25509 /// Target-specific DAG combine for X86ISD::FAND nodes: an operand that is
25510 /// the constant +0.0 is itself the result.
25511 static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
25512   // Operand 0 is examined first:
25513   //   FAND(0.0, x) -> 0.0
25514   //   FAND(x, 0.0) -> 0.0
25515   for (unsigned Idx = 0; Idx != 2; ++Idx) {
25516     SDValue Candidate = N->getOperand(Idx);
25517     ConstantFPSDNode *Zero = dyn_cast<ConstantFPSDNode>(Candidate);
25518     if (Zero && Zero->getValueAPF().isPosZero())
25519       return Candidate;
25520   }
25521   return SDValue();
25522 }
25523
25524 /// Target-specific DAG combine for X86ISD::FANDN nodes: when either operand
25525 /// is the constant +0.0 the result is operand 1.
25526 static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG) {
25527   // Operand 0 is examined first; both folds yield operand 1:
25528   //   FANDN(0.0, x) -> x
25529   //   FANDN(x, 0.0) -> 0.0
25530   for (unsigned Idx = 0; Idx != 2; ++Idx) {
25531     ConstantFPSDNode *Zero = dyn_cast<ConstantFPSDNode>(N->getOperand(Idx));
25532     if (Zero && Zero->getValueAPF().isPosZero())
25533       return N->getOperand(1);
25534   }
25535
25536   return SDValue();
25537 }
25538
25539 static SDValue PerformBTCombine(SDNode *N,
25540                                 SelectionDAG &DAG,
25541                                 TargetLowering::DAGCombinerInfo &DCI) {
25542   // BT ignores high bits in the bit index operand.
25543   SDValue Op1 = N->getOperand(1);
25544   if (Op1.hasOneUse()) {
25545     unsigned BitWidth = Op1.getValueSizeInBits();
25546     APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
25547     APInt KnownZero, KnownOne;
25548     TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
25549                                           !DCI.isBeforeLegalizeOps());
25550     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25551     if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) ||
25552         TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO))
25553       DCI.CommitTargetLoweringOpt(TLO);
25554   }
25555   return SDValue();
25556 }
25557
25558 /// Fold N into a BITCAST of its operand when that operand (looking through
25559 /// one BITCAST) is a VZEXT_LOAD with the same element size as N's type.
25560 static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) {
25561   SDValue Src = N->getOperand(0);
25562   Src = Src.getOpcode() == ISD::BITCAST ? Src.getOperand(0) : Src;
25563   EVT ResVT = N->getValueType(0), SrcVT = Src.getValueType();
25564   if (Src.getOpcode() != X86ISD::VZEXT_LOAD ||
25565       ResVT.getVectorElementType().getSizeInBits() !=
25566           SrcVT.getVectorElementType().getSizeInBits())
25567     return SDValue();
25568   return DAG.getNode(ISD::BITCAST, SDLoc(N), ResVT, Src);
25569 }
25570
25571 static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG,
25572                                                const X86Subtarget *Subtarget) {
25573   EVT VT = N->getValueType(0);
25574   if (!VT.isVector())
25575     return SDValue();
25576
25577   SDValue N0 = N->getOperand(0);
25578   SDValue N1 = N->getOperand(1);
25579   EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
25580   SDLoc dl(N);
25581
25582   // The SIGN_EXTEND_INREG to v4i64 is expensive operation on the
25583   // both SSE and AVX2 since there is no sign-extended shift right
25584   // operation on a vector with 64-bit elements.
25585   //(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) ->
25586   // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT)))
25587   if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
25588       N0.getOpcode() == ISD::SIGN_EXTEND)) {
25589     SDValue N00 = N0.getOperand(0);
25590
25591     // EXTLOAD has a better solution on AVX2,
25592     // it may be replaced with X86ISD::VSEXT node.
25593     if (N00.getOpcode() == ISD::LOAD && Subtarget->hasInt256())
25594       if (!ISD::isNormalLoad(N00.getNode()))
25595         return SDValue();
25596
25597     if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
25598         SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
25599                                   N00, N1);
25600       return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
25601     }
25602   }
25603   return SDValue();
25604 }
25605
25606 static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
25607                                   TargetLowering::DAGCombinerInfo &DCI,
25608                                   const X86Subtarget *Subtarget) {
25609   SDValue N0 = N->getOperand(0);
25610   EVT VT = N->getValueType(0);
25611
25612   // (i8,i32 sext (sdivrem (i8 x, i8 y)) ->
25613   // (i8,i32 (sdivrem_sext_hreg (i8 x, i8 y)
25614   // This exposes the sext to the sdivrem lowering, so that it directly extends
25615   // from AH (which we otherwise need to do contortions to access).
25616   if (N0.getOpcode() == ISD::SDIVREM && N0.getResNo() == 1 &&
25617       N0.getValueType() == MVT::i8 && VT == MVT::i32) {
25618     SDLoc dl(N);
25619     SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
25620     SDValue R = DAG.getNode(X86ISD::SDIVREM8_SEXT_HREG, dl, NodeTys,
25621                             N0.getOperand(0), N0.getOperand(1));
25622     DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
25623     return R.getValue(1);
25624   }
25625
25626   if (!DCI.isBeforeLegalizeOps())
25627     return SDValue();
25628
25629   if (!Subtarget->hasFp256())
25630     return SDValue();
25631
25632   if (VT.isVector() && VT.getSizeInBits() == 256) {
25633     SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget);
25634     if (R.getNode())
25635       return R;
25636   }
25637
25638   return SDValue();
25639 }
25640
25641 static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG,
25642                                  const X86Subtarget* Subtarget) {
25643   SDLoc dl(N);
25644   EVT VT = N->getValueType(0);
25645
25646   // Let legalize expand this if it isn't a legal type yet.
25647   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
25648     return SDValue();
25649
25650   EVT ScalarVT = VT.getScalarType();
25651   if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
25652       (!Subtarget->hasFMA() && !Subtarget->hasFMA4()))
25653     return SDValue();
25654
25655   SDValue A = N->getOperand(0);
25656   SDValue B = N->getOperand(1);
25657   SDValue C = N->getOperand(2);
25658
25659   bool NegA = (A.getOpcode() == ISD::FNEG);
25660   bool NegB = (B.getOpcode() == ISD::FNEG);
25661   bool NegC = (C.getOpcode() == ISD::FNEG);
25662
25663   // Negative multiplication when NegA xor NegB
25664   bool NegMul = (NegA != NegB);
25665   if (NegA)
25666     A = A.getOperand(0);
25667   if (NegB)
25668     B = B.getOperand(0);
25669   if (NegC)
25670     C = C.getOperand(0);
25671
25672   unsigned Opcode;
25673   if (!NegMul)
25674     Opcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB;
25675   else
25676     Opcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;
25677
25678   return DAG.getNode(Opcode, dl, VT, A, B, C);
25679 }
25680
25681 static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG,
25682                                   TargetLowering::DAGCombinerInfo &DCI,
25683                                   const X86Subtarget *Subtarget) {
25684   // (i32 zext (and (i8  x86isd::setcc_carry), 1)) ->
25685   //           (and (i32 x86isd::setcc_carry), 1)
25686   // This eliminates the zext. This transformation is necessary because
25687   // ISD::SETCC is always legalized to i8.
25688   SDLoc dl(N);
25689   SDValue N0 = N->getOperand(0);
25690   EVT VT = N->getValueType(0);
25691
25692   if (N0.getOpcode() == ISD::AND &&
25693       N0.hasOneUse() &&
25694       N0.getOperand(0).hasOneUse()) {
25695     SDValue N00 = N0.getOperand(0);
25696     if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
25697       ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
25698       if (!C || C->getZExtValue() != 1)
25699         return SDValue();
25700       return DAG.getNode(ISD::AND, dl, VT,
25701                          DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
25702                                      N00.getOperand(0), N00.getOperand(1)),
25703                          DAG.getConstant(1, VT));
25704     }
25705   }
25706
25707   if (N0.getOpcode() == ISD::TRUNCATE &&
25708       N0.hasOneUse() &&
25709       N0.getOperand(0).hasOneUse()) {
25710     SDValue N00 = N0.getOperand(0);
25711     if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
25712       return DAG.getNode(ISD::AND, dl, VT,
25713                          DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
25714                                      N00.getOperand(0), N00.getOperand(1)),
25715                          DAG.getConstant(1, VT));
25716     }
25717   }
25718   if (VT.is256BitVector()) {
25719     SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget);
25720     if (R.getNode())
25721       return R;
25722   }
25723
25724   // (i8,i32 zext (udivrem (i8 x, i8 y)) ->
25725   // (i8,i32 (udivrem_zext_hreg (i8 x, i8 y)
25726   // This exposes the zext to the udivrem lowering, so that it directly extends
25727   // from AH (which we otherwise need to do contortions to access).
25728   if (N0.getOpcode() == ISD::UDIVREM &&
25729       N0.getResNo() == 1 && N0.getValueType() == MVT::i8 &&
25730       (VT == MVT::i32 || VT == MVT::i64)) {
25731     SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
25732     SDValue R = DAG.getNode(X86ISD::UDIVREM8_ZEXT_HREG, dl, NodeTys,
25733                             N0.getOperand(0), N0.getOperand(1));
25734     DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
25735     return R.getValue(1);
25736   }
25737
25738   return SDValue();
25739 }
25740
25741 // Optimize x == -y --> x+y == 0
25742 //          x != -y --> x+y != 0
25743 static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG,
25744                                       const X86Subtarget* Subtarget) {
25745   ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
25746   SDValue LHS = N->getOperand(0);
25747   SDValue RHS = N->getOperand(1);
25748   EVT VT = N->getValueType(0);
25749   SDLoc DL(N);
25750
25751   if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB)
25752     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(LHS.getOperand(0)))
25753       if (C->getAPIntValue() == 0 && LHS.hasOneUse()) {
25754         SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N),
25755                                    LHS.getValueType(), RHS, LHS.getOperand(1));
25756         return DAG.getSetCC(SDLoc(N), N->getValueType(0),
25757                             addV, DAG.getConstant(0, addV.getValueType()), CC);
25758       }
25759   if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB)
25760     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS.getOperand(0)))
25761       if (C->getAPIntValue() == 0 && RHS.hasOneUse()) {
25762         SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N),
25763                                    RHS.getValueType(), LHS, RHS.getOperand(1));
25764         return DAG.getSetCC(SDLoc(N), N->getValueType(0),
25765                             addV, DAG.getConstant(0, addV.getValueType()), CC);
25766       }
25767
25768   if (VT.getScalarType() == MVT::i1) {
25769     bool IsSEXT0 = (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
25770       (LHS.getOperand(0).getValueType().getScalarType() ==  MVT::i1);
25771     bool IsVZero0 = ISD::isBuildVectorAllZeros(LHS.getNode());
25772     if (!IsSEXT0 && !IsVZero0)
25773       return SDValue();
25774     bool IsSEXT1 = (RHS.getOpcode() == ISD::SIGN_EXTEND) &&
25775       (RHS.getOperand(0).getValueType().getScalarType() ==  MVT::i1);
25776     bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
25777
25778     if (!IsSEXT1 && !IsVZero1)
25779       return SDValue();
25780
25781     if (IsSEXT0 && IsVZero1) {
25782       assert(VT == LHS.getOperand(0).getValueType() && "Uexpected operand type");
25783       if (CC == ISD::SETEQ)
25784         return DAG.getNOT(DL, LHS.getOperand(0), VT);
25785       return LHS.getOperand(0);
25786     }
25787     if (IsSEXT1 && IsVZero0) {
25788       assert(VT == RHS.getOperand(0).getValueType() && "Uexpected operand type");
25789       if (CC == ISD::SETEQ)
25790         return DAG.getNOT(DL, RHS.getOperand(0), VT);
25791       return RHS.getOperand(0);
25792     }
25793   }
25794
25795   return SDValue();
25796 }
25797
25798 static SDValue PerformINSERTPSCombine(SDNode *N, SelectionDAG &DAG,
25799                                       const X86Subtarget *Subtarget) {
25800   SDLoc dl(N);
25801   MVT VT = N->getOperand(1)->getSimpleValueType(0);
25802   assert((VT == MVT::v4f32 || VT == MVT::v4i32) &&
25803          "X86insertps is only defined for v4x32");
25804
25805   SDValue Ld = N->getOperand(1);
25806   if (MayFoldLoad(Ld)) {
25807     // Extract the countS bits from the immediate so we can get the proper
25808     // address when narrowing the vector load to a specific element.
25809     // When the second source op is a memory address, interps doesn't use
25810     // countS and just gets an f32 from that address.
25811     unsigned DestIndex =
25812         cast<ConstantSDNode>(N->getOperand(2))->getZExtValue() >> 6;
25813     Ld = NarrowVectorLoadToElement(cast<LoadSDNode>(Ld), DestIndex, DAG);
25814   } else
25815     return SDValue();
25816
25817   // Create this as a scalar to vector to match the instruction pattern.
25818   SDValue LoadScalarToVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Ld);
25819   // countS bits are ignored when loading from memory on insertps, which
25820   // means we don't need to explicitly set them to 0.
25821   return DAG.getNode(X86ISD::INSERTPS, dl, VT, N->getOperand(0),
25822                      LoadScalarToVector, N->getOperand(2));
25823 }
25824
25825 // Helper function of PerformSETCCCombine. It is to materialize "setb reg"
25826 // as "sbb reg,reg", since it can be extended without zext and produces
25827 // an all-ones bit which is more useful than 0/1 in some cases.
25828 static SDValue MaterializeSETB(SDLoc DL, SDValue EFLAGS, SelectionDAG &DAG,
25829                                MVT VT) {
25830   if (VT == MVT::i8)
25831     return DAG.getNode(ISD::AND, DL, VT,
25832                        DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
25833                                    DAG.getConstant(X86::COND_B, MVT::i8), EFLAGS),
25834                        DAG.getConstant(1, VT));
25835   assert (VT == MVT::i1 && "Unexpected type for SECCC node");
25836   return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1,
25837                      DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
25838                                  DAG.getConstant(X86::COND_B, MVT::i8), EFLAGS));
25839 }
25840
25841 // Optimize  RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
25842 static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG,
25843                                    TargetLowering::DAGCombinerInfo &DCI,
25844                                    const X86Subtarget *Subtarget) {
25845   SDLoc DL(N);
25846   X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
25847   SDValue EFLAGS = N->getOperand(1);
25848
25849   if (CC == X86::COND_A) {
25850     // Try to convert COND_A into COND_B in an attempt to facilitate
25851     // materializing "setb reg".
25852     //
25853     // Do not flip "e > c", where "c" is a constant, because Cmp instruction
25854     // cannot take an immediate as its first operand.
25855     //
25856     if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
25857         EFLAGS.getValueType().isInteger() &&
25858         !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
25859       SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
25860                                    EFLAGS.getNode()->getVTList(),
25861                                    EFLAGS.getOperand(1), EFLAGS.getOperand(0));
25862       SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
25863       return MaterializeSETB(DL, NewEFLAGS, DAG, N->getSimpleValueType(0));
25864     }
25865   }
25866
25867   // Materialize "setb reg" as "sbb reg,reg", since it can be extended without
25868   // a zext and produces an all-ones bit which is more useful than 0/1 in some
25869   // cases.
25870   if (CC == X86::COND_B)
25871     return MaterializeSETB(DL, EFLAGS, DAG, N->getSimpleValueType(0));
25872
25873   SDValue Flags;
25874
25875   Flags = checkBoolTestSetCCCombine(EFLAGS, CC);
25876   if (Flags.getNode()) {
25877     SDValue Cond = DAG.getConstant(CC, MVT::i8);
25878     return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags);
25879   }
25880
25881   return SDValue();
25882 }
25883
25884 // Optimize branch condition evaluation.
25885 //
25886 static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG,
25887                                     TargetLowering::DAGCombinerInfo &DCI,
25888                                     const X86Subtarget *Subtarget) {
25889   SDLoc DL(N);
25890   SDValue Chain = N->getOperand(0);
25891   SDValue Dest = N->getOperand(1);
25892   SDValue EFLAGS = N->getOperand(3);
25893   X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
25894
25895   SDValue Flags;
25896
25897   Flags = checkBoolTestSetCCCombine(EFLAGS, CC);
25898   if (Flags.getNode()) {
25899     SDValue Cond = DAG.getConstant(CC, MVT::i8);
25900     return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), Chain, Dest, Cond,
25901                        Flags);
25902   }
25903
25904   return SDValue();
25905 }
25906
25907 static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
25908                                                          SelectionDAG &DAG) {
25909   // Take advantage of vector comparisons producing 0 or -1 in each lane to
25910   // optimize away operation when it's from a constant.
25911   //
25912   // The general transformation is:
25913   //    UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
25914   //       AND(VECTOR_CMP(x,y), constant2)
25915   //    constant2 = UNARYOP(constant)
25916
25917   // Early exit if this isn't a vector operation, the operand of the
25918   // unary operation isn't a bitwise AND, or if the sizes of the operations
25919   // aren't the same.
25920   EVT VT = N->getValueType(0);
25921   if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
25922       N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
25923       VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
25924     return SDValue();
25925
25926   // Now check that the other operand of the AND is a constant. We could
25927   // make the transformation for non-constant splats as well, but it's unclear
25928   // that would be a benefit as it would not eliminate any operations, just
25929   // perform one more step in scalar code before moving to the vector unit.
25930   if (BuildVectorSDNode *BV =
25931           dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
25932     // Bail out if the vector isn't a constant.
25933     if (!BV->isConstant())
25934       return SDValue();
25935
25936     // Everything checks out. Build up the new and improved node.
25937     SDLoc DL(N);
25938     EVT IntVT = BV->getValueType(0);
25939     // Create a new constant of the appropriate type for the transformed
25940     // DAG.
25941     SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
25942     // The AND node needs bitcasts to/from an integer vector type around it.
25943     SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
25944     SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
25945                                  N->getOperand(0)->getOperand(0), MaskConst);
25946     SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
25947     return Res;
25948   }
25949
25950   return SDValue();
25951 }
25952
25953 static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
25954                                         const X86Subtarget *Subtarget) {
25955   // First try to optimize away the conversion entirely when it's
25956   // conditionally from a constant. Vectors only.
25957   SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG);
25958   if (Res != SDValue())
25959     return Res;
25960
25961   // Now move on to more general possibilities.
25962   SDValue Op0 = N->getOperand(0);
25963   EVT InVT = Op0->getValueType(0);
25964
25965   // SINT_TO_FP(v4i8) -> SINT_TO_FP(SEXT(v4i8 to v4i32))
25966   if (InVT == MVT::v8i8 || InVT == MVT::v4i8) {
25967     SDLoc dl(N);
25968     MVT DstVT = InVT == MVT::v4i8 ? MVT::v4i32 : MVT::v8i32;
25969     SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
25970     return DAG.getNode(ISD::SINT_TO_FP, dl, N->getValueType(0), P);
25971   }
25972
25973   // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
25974   // a 32-bit target where SSE doesn't support i64->FP operations.
25975   if (Op0.getOpcode() == ISD::LOAD) {
25976     LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
25977     EVT VT = Ld->getValueType(0);
25978     if (!Ld->isVolatile() && !N->getValueType(0).isVector() &&
25979         ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
25980         !Subtarget->is64Bit() && VT == MVT::i64) {
25981       SDValue FILDChain = Subtarget->getTargetLowering()->BuildFILD(
25982           SDValue(N, 0), Ld->getValueType(0), Ld->getChain(), Op0, DAG);
25983       DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
25984       return FILDChain;
25985     }
25986   }
25987   return SDValue();
25988 }
25989
25990 // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
25991 static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG,
25992                                  X86TargetLowering::DAGCombinerInfo &DCI) {
25993   // If the LHS and RHS of the ADC node are zero, then it can't overflow and
25994   // the result is either zero or one (depending on the input carry bit).
25995   // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
25996   if (X86::isZeroNode(N->getOperand(0)) &&
25997       X86::isZeroNode(N->getOperand(1)) &&
25998       // We don't have a good way to replace an EFLAGS use, so only do this when
25999       // dead right now.
26000       SDValue(N, 1).use_empty()) {
26001     SDLoc DL(N);
26002     EVT VT = N->getValueType(0);
26003     SDValue CarryOut = DAG.getConstant(0, N->getValueType(1));
26004     SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
26005                                DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
26006                                            DAG.getConstant(X86::COND_B,MVT::i8),
26007                                            N->getOperand(2)),
26008                                DAG.getConstant(1, VT));
26009     return DCI.CombineTo(N, Res1, CarryOut);
26010   }
26011
26012   return SDValue();
26013 }
26014
26015 // fold (add Y, (sete  X, 0)) -> adc  0, Y
26016 //      (add Y, (setne X, 0)) -> sbb -1, Y
26017 //      (sub (sete  X, 0), Y) -> sbb  0, Y
26018 //      (sub (setne X, 0), Y) -> adc -1, Y
26019 static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) {
26020   SDLoc DL(N);
26021
26022   // Look through ZExts.
26023   SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 1 : 0);
26024   if (Ext.getOpcode() != ISD::ZERO_EXTEND || !Ext.hasOneUse())
26025     return SDValue();
26026
26027   SDValue SetCC = Ext.getOperand(0);
26028   if (SetCC.getOpcode() != X86ISD::SETCC || !SetCC.hasOneUse())
26029     return SDValue();
26030
26031   X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0);
26032   if (CC != X86::COND_E && CC != X86::COND_NE)
26033     return SDValue();
26034
26035   SDValue Cmp = SetCC.getOperand(1);
26036   if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
26037       !X86::isZeroNode(Cmp.getOperand(1)) ||
26038       !Cmp.getOperand(0).getValueType().isInteger())
26039     return SDValue();
26040
26041   SDValue CmpOp0 = Cmp.getOperand(0);
26042   SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0,
26043                                DAG.getConstant(1, CmpOp0.getValueType()));
26044
26045   SDValue OtherVal = N->getOperand(N->getOpcode() == ISD::SUB ? 0 : 1);
26046   if (CC == X86::COND_NE)
26047     return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::ADC : X86ISD::SBB,
26048                        DL, OtherVal.getValueType(), OtherVal,
26049                        DAG.getConstant(-1ULL, OtherVal.getValueType()), NewCmp);
26050   return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::SBB : X86ISD::ADC,
26051                      DL, OtherVal.getValueType(), OtherVal,
26052                      DAG.getConstant(0, OtherVal.getValueType()), NewCmp);
26053 }
26054
26055 /// PerformADDCombine - Do target-specific dag combines on integer adds.
26056 static SDValue PerformAddCombine(SDNode *N, SelectionDAG &DAG,
26057                                  const X86Subtarget *Subtarget) {
26058   EVT VT = N->getValueType(0);
26059   SDValue Op0 = N->getOperand(0);
26060   SDValue Op1 = N->getOperand(1);
26061
26062   // Try to synthesize horizontal adds from adds of shuffles.
26063   if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
26064        (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
26065       isHorizontalBinOp(Op0, Op1, true))
26066     return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);
26067
26068   return OptimizeConditionalInDecrement(N, DAG);
26069 }
26070
26071 static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG,
26072                                  const X86Subtarget *Subtarget) {
26073   SDValue Op0 = N->getOperand(0);
26074   SDValue Op1 = N->getOperand(1);
26075
26076   // X86 can't encode an immediate LHS of a sub. See if we can push the
26077   // negation into a preceding instruction.
26078   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
26079     // If the RHS of the sub is a XOR with one use and a constant, invert the
26080     // immediate. Then add one to the LHS of the sub so we can turn
26081     // X-Y -> X+~Y+1, saving one register.
26082     if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
26083         isa<ConstantSDNode>(Op1.getOperand(1))) {
26084       APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
26085       EVT VT = Op0.getValueType();
26086       SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
26087                                    Op1.getOperand(0),
26088                                    DAG.getConstant(~XorC, VT));
26089       return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
26090                          DAG.getConstant(C->getAPIntValue()+1, VT));
26091     }
26092   }
26093
26094   // Try to synthesize horizontal adds from adds of shuffles.
26095   EVT VT = N->getValueType(0);
26096   if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
26097        (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
26098       isHorizontalBinOp(Op0, Op1, true))
26099     return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);
26100
26101   return OptimizeConditionalInDecrement(N, DAG);
26102 }
26103
26104 /// performVZEXTCombine - Performs build vector combines
26105 static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG,
26106                                    TargetLowering::DAGCombinerInfo &DCI,
26107                                    const X86Subtarget *Subtarget) {
26108   SDLoc DL(N);
26109   MVT VT = N->getSimpleValueType(0);
26110   SDValue Op = N->getOperand(0);
26111   MVT OpVT = Op.getSimpleValueType();
26112   MVT OpEltVT = OpVT.getVectorElementType();
26113   unsigned InputBits = OpEltVT.getSizeInBits() * VT.getVectorNumElements();
26114
26115   // (vzext (bitcast (vzext (x)) -> (vzext x)
26116   SDValue V = Op;
26117   while (V.getOpcode() == ISD::BITCAST)
26118     V = V.getOperand(0);
26119
26120   if (V != Op && V.getOpcode() == X86ISD::VZEXT) {
26121     MVT InnerVT = V.getSimpleValueType();
26122     MVT InnerEltVT = InnerVT.getVectorElementType();
26123
26124     // If the element sizes match exactly, we can just do one larger vzext. This
26125     // is always an exact type match as vzext operates on integer types.
26126     if (OpEltVT == InnerEltVT) {
26127       assert(OpVT == InnerVT && "Types must match for vzext!");
26128       return DAG.getNode(X86ISD::VZEXT, DL, VT, V.getOperand(0));
26129     }
26130
26131     // The only other way we can combine them is if only a single element of the
26132     // inner vzext is used in the input to the outer vzext.
26133     if (InnerEltVT.getSizeInBits() < InputBits)
26134       return SDValue();
26135
26136     // In this case, the inner vzext is completely dead because we're going to
26137     // only look at bits inside of the low element. Just do the outer vzext on
26138     // a bitcast of the input to the inner.
26139     return DAG.getNode(X86ISD::VZEXT, DL, VT,
26140                        DAG.getNode(ISD::BITCAST, DL, OpVT, V));
26141   }
26142
26143   // Check if we can bypass extracting and re-inserting an element of an input
26144   // vector. Essentialy:
26145   // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
26146   if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
26147       V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
26148       V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) {
26149     SDValue ExtractedV = V.getOperand(0);
26150     SDValue OrigV = ExtractedV.getOperand(0);
26151     if (auto *ExtractIdx = dyn_cast<ConstantSDNode>(ExtractedV.getOperand(1)))
26152       if (ExtractIdx->getZExtValue() == 0) {
26153         MVT OrigVT = OrigV.getSimpleValueType();
26154         // Extract a subvector if necessary...
26155         if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) {
26156           int Ratio = OrigVT.getSizeInBits() / OpVT.getSizeInBits();
26157           OrigVT = MVT::getVectorVT(OrigVT.getVectorElementType(),
26158                                     OrigVT.getVectorNumElements() / Ratio);
26159           OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV,
26160                               DAG.getIntPtrConstant(0));
26161         }
26162         Op = DAG.getNode(ISD::BITCAST, DL, OpVT, OrigV);
26163         return DAG.getNode(X86ISD::VZEXT, DL, VT, Op);
26164       }
26165   }
26166
26167   return SDValue();
26168 }
26169
26170 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
26171                                              DAGCombinerInfo &DCI) const {
26172   SelectionDAG &DAG = DCI.DAG;
26173   switch (N->getOpcode()) {
26174   default: break;
26175   case ISD::EXTRACT_VECTOR_ELT:
26176     return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, DCI);
26177   case ISD::VSELECT:
26178   case ISD::SELECT:
26179   case X86ISD::SHRUNKBLEND:
26180     return PerformSELECTCombine(N, DAG, DCI, Subtarget);
26181   case ISD::BITCAST:        return PerformBITCASTCombine(N, DAG);
26182   case X86ISD::CMOV:        return PerformCMOVCombine(N, DAG, DCI, Subtarget);
26183   case ISD::ADD:            return PerformAddCombine(N, DAG, Subtarget);
26184   case ISD::SUB:            return PerformSubCombine(N, DAG, Subtarget);
26185   case X86ISD::ADC:         return PerformADCCombine(N, DAG, DCI);
26186   case ISD::MUL:            return PerformMulCombine(N, DAG, DCI);
26187   case ISD::SHL:
26188   case ISD::SRA:
26189   case ISD::SRL:            return PerformShiftCombine(N, DAG, DCI, Subtarget);
26190   case ISD::AND:            return PerformAndCombine(N, DAG, DCI, Subtarget);
26191   case ISD::OR:             return PerformOrCombine(N, DAG, DCI, Subtarget);
26192   case ISD::XOR:            return PerformXorCombine(N, DAG, DCI, Subtarget);
26193   case ISD::LOAD:           return PerformLOADCombine(N, DAG, DCI, Subtarget);
26194   case ISD::MLOAD:          return PerformMLOADCombine(N, DAG, DCI, Subtarget);
26195   case ISD::STORE:          return PerformSTORECombine(N, DAG, Subtarget);
26196   case ISD::MSTORE:         return PerformMSTORECombine(N, DAG, Subtarget);
26197   case ISD::SINT_TO_FP:     return PerformSINT_TO_FPCombine(N, DAG, Subtarget);
26198   case ISD::FADD:           return PerformFADDCombine(N, DAG, Subtarget);
26199   case ISD::FSUB:           return PerformFSUBCombine(N, DAG, Subtarget);
26200   case X86ISD::FXOR:
26201   case X86ISD::FOR:         return PerformFORCombine(N, DAG);
26202   case X86ISD::FMIN:
26203   case X86ISD::FMAX:        return PerformFMinFMaxCombine(N, DAG);
26204   case X86ISD::FAND:        return PerformFANDCombine(N, DAG);
26205   case X86ISD::FANDN:       return PerformFANDNCombine(N, DAG);
26206   case X86ISD::BT:          return PerformBTCombine(N, DAG, DCI);
26207   case X86ISD::VZEXT_MOVL:  return PerformVZEXT_MOVLCombine(N, DAG);
26208   case ISD::ANY_EXTEND:
26209   case ISD::ZERO_EXTEND:    return PerformZExtCombine(N, DAG, DCI, Subtarget);
26210   case ISD::SIGN_EXTEND:    return PerformSExtCombine(N, DAG, DCI, Subtarget);
26211   case ISD::SIGN_EXTEND_INREG:
26212     return PerformSIGN_EXTEND_INREGCombine(N, DAG, Subtarget);
26213   case ISD::TRUNCATE:       return PerformTruncateCombine(N, DAG,DCI,Subtarget);
26214   case ISD::SETCC:          return PerformISDSETCCCombine(N, DAG, Subtarget);
26215   case X86ISD::SETCC:       return PerformSETCCCombine(N, DAG, DCI, Subtarget);
26216   case X86ISD::BRCOND:      return PerformBrCondCombine(N, DAG, DCI, Subtarget);
26217   case X86ISD::VZEXT:       return performVZEXTCombine(N, DAG, DCI, Subtarget);
26218   case X86ISD::SHUFP:       // Handle all target specific shuffles
26219   case X86ISD::PALIGNR:
26220   case X86ISD::UNPCKH:
26221   case X86ISD::UNPCKL:
26222   case X86ISD::MOVHLPS:
26223   case X86ISD::MOVLHPS:
26224   case X86ISD::PSHUFB:
26225   case X86ISD::PSHUFD:
26226   case X86ISD::PSHUFHW:
26227   case X86ISD::PSHUFLW:
26228   case X86ISD::MOVSS:
26229   case X86ISD::MOVSD:
26230   case X86ISD::VPERMILPI:
26231   case X86ISD::VPERM2X128:
26232   case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget);
26233   case ISD::FMA:            return PerformFMACombine(N, DAG, Subtarget);
26234   case ISD::INTRINSIC_WO_CHAIN:
26235     return PerformINTRINSIC_WO_CHAINCombine(N, DAG, Subtarget);
26236   case X86ISD::INSERTPS: {
26237     if (getTargetMachine().getOptLevel() > CodeGenOpt::None)
26238       return PerformINSERTPSCombine(N, DAG, Subtarget);
26239     break;
26240   }
26241   case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DAG, Subtarget);
26242   }
26243
26244   return SDValue();
26245 }
26246
26247 /// isTypeDesirableForOp - Return true if the target has native support for
26248 /// the specified value type and it is 'desirable' to use the type for the
26249 /// given node type. e.g. On x86 i16 is legal, but undesirable since i16
26250 /// instruction encodings are longer and some i16 instructions are slow.
26251 bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
26252   if (!isTypeLegal(VT))
26253     return false;
26254   if (VT != MVT::i16)
26255     return true;
26256
26257   switch (Opc) {
26258   default:
26259     return true;
26260   case ISD::LOAD:
26261   case ISD::SIGN_EXTEND:
26262   case ISD::ZERO_EXTEND:
26263   case ISD::ANY_EXTEND:
26264   case ISD::SHL:
26265   case ISD::SRL:
26266   case ISD::SUB:
26267   case ISD::ADD:
26268   case ISD::MUL:
26269   case ISD::AND:
26270   case ISD::OR:
26271   case ISD::XOR:
26272     return false;
26273   }
26274 }
26275
26276 /// IsDesirableToPromoteOp - This method query the target whether it is
26277 /// beneficial for dag combiner to promote the specified node. If true, it
26278 /// should return the desired promotion type by reference.
26279 bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
26280   EVT VT = Op.getValueType();
26281   if (VT != MVT::i16)
26282     return false;
26283
26284   bool Promote = false;
26285   bool Commute = false;
26286   switch (Op.getOpcode()) {
26287   default: break;
26288   case ISD::LOAD: {
26289     LoadSDNode *LD = cast<LoadSDNode>(Op);
26290     // If the non-extending load has a single use and it's not live out, then it
26291     // might be folded.
26292     if (LD->getExtensionType() == ISD::NON_EXTLOAD /*&&
26293                                                      Op.hasOneUse()*/) {
26294       for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
26295              UE = Op.getNode()->use_end(); UI != UE; ++UI) {
26296         // The only case where we'd want to promote LOAD (rather then it being
26297         // promoted as an operand is when it's only use is liveout.
26298         if (UI->getOpcode() != ISD::CopyToReg)
26299           return false;
26300       }
26301     }
26302     Promote = true;
26303     break;
26304   }
26305   case ISD::SIGN_EXTEND:
26306   case ISD::ZERO_EXTEND:
26307   case ISD::ANY_EXTEND:
26308     Promote = true;
26309     break;
26310   case ISD::SHL:
26311   case ISD::SRL: {
26312     SDValue N0 = Op.getOperand(0);
26313     // Look out for (store (shl (load), x)).
26314     if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
26315       return false;
26316     Promote = true;
26317     break;
26318   }
26319   case ISD::ADD:
26320   case ISD::MUL:
26321   case ISD::AND:
26322   case ISD::OR:
26323   case ISD::XOR:
26324     Commute = true;
26325     // fallthrough
26326   case ISD::SUB: {
26327     SDValue N0 = Op.getOperand(0);
26328     SDValue N1 = Op.getOperand(1);
26329     if (!Commute && MayFoldLoad(N1))
26330       return false;
26331     // Avoid disabling potential load folding opportunities.
26332     if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
26333       return false;
26334     if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
26335       return false;
26336     Promote = true;
26337   }
26338   }
26339
26340   PVT = MVT::i32;
26341   return Promote;
26342 }
26343
26344 //===----------------------------------------------------------------------===//
26345 //                           X86 Inline Assembly Support
26346 //===----------------------------------------------------------------------===//
26347
26348 namespace {
26349   // Helper to match a string separated by whitespace.
26350   bool matchAsmImpl(StringRef s, ArrayRef<const StringRef *> args) {
26351     s = s.substr(s.find_first_not_of(" \t")); // Skip leading whitespace.
26352
26353     for (unsigned i = 0, e = args.size(); i != e; ++i) {
26354       StringRef piece(*args[i]);
26355       if (!s.startswith(piece)) // Check if the piece matches.
26356         return false;
26357
26358       s = s.substr(piece.size());
26359       StringRef::size_type pos = s.find_first_not_of(" \t");
26360       if (pos == 0) // We matched a prefix.
26361         return false;
26362
26363       s = s.substr(pos);
26364     }
26365
26366     return s.empty();
26367   }
26368   const VariadicFunction1<bool, StringRef, StringRef, matchAsmImpl> matchAsm={};
26369 }
26370
26371 static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
26372
26373   if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
26374     if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
26375         std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
26376         std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {
26377
26378       if (AsmPieces.size() == 3)
26379         return true;
26380       else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
26381         return true;
26382     }
26383   }
26384   return false;
26385 }
26386
26387 bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
26388   InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
26389
26390   std::string AsmStr = IA->getAsmString();
26391
26392   IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
26393   if (!Ty || Ty->getBitWidth() % 16 != 0)
26394     return false;
26395
26396   // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
26397   SmallVector<StringRef, 4> AsmPieces;
26398   SplitString(AsmStr, AsmPieces, ";\n");
26399
26400   switch (AsmPieces.size()) {
26401   default: return false;
26402   case 1:
26403     // FIXME: this should verify that we are targeting a 486 or better.  If not,
26404     // we will turn this bswap into something that will be lowered to logical
26405     // ops instead of emitting the bswap asm.  For now, we don't support 486 or
26406     // lower so don't worry about this.
26407     // bswap $0
26408     if (matchAsm(AsmPieces[0], "bswap", "$0") ||
26409         matchAsm(AsmPieces[0], "bswapl", "$0") ||
26410         matchAsm(AsmPieces[0], "bswapq", "$0") ||
26411         matchAsm(AsmPieces[0], "bswap", "${0:q}") ||
26412         matchAsm(AsmPieces[0], "bswapl", "${0:q}") ||
26413         matchAsm(AsmPieces[0], "bswapq", "${0:q}")) {
26414       // No need to check constraints, nothing other than the equivalent of
26415       // "=r,0" would be valid here.
26416       return IntrinsicLowering::LowerToByteSwap(CI);
26417     }
26418
26419     // rorw $$8, ${0:w}  -->  llvm.bswap.i16
26420     if (CI->getType()->isIntegerTy(16) &&
26421         IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
26422         (matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") ||
26423          matchAsm(AsmPieces[0], "rolw", "$$8,", "${0:w}"))) {
26424       AsmPieces.clear();
26425       const std::string &ConstraintsStr = IA->getConstraintString();
26426       SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
26427       array_pod_sort(AsmPieces.begin(), AsmPieces.end());
26428       if (clobbersFlagRegisters(AsmPieces))
26429         return IntrinsicLowering::LowerToByteSwap(CI);
26430     }
26431     break;
26432   case 3:
26433     if (CI->getType()->isIntegerTy(32) &&
26434         IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
26435         matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") &&
26436         matchAsm(AsmPieces[1], "rorl", "$$16,", "$0") &&
26437         matchAsm(AsmPieces[2], "rorw", "$$8,", "${0:w}")) {
26438       AsmPieces.clear();
26439       const std::string &ConstraintsStr = IA->getConstraintString();
26440       SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
26441       array_pod_sort(AsmPieces.begin(), AsmPieces.end());
26442       if (clobbersFlagRegisters(AsmPieces))
26443         return IntrinsicLowering::LowerToByteSwap(CI);
26444     }
26445
26446     if (CI->getType()->isIntegerTy(64)) {
26447       InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
26448       if (Constraints.size() >= 2 &&
26449           Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
26450           Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
26451         // bswap %eax / bswap %edx / xchgl %eax, %edx  -> llvm.bswap.i64
26452         if (matchAsm(AsmPieces[0], "bswap", "%eax") &&
26453             matchAsm(AsmPieces[1], "bswap", "%edx") &&
26454             matchAsm(AsmPieces[2], "xchgl", "%eax,", "%edx"))
26455           return IntrinsicLowering::LowerToByteSwap(CI);
26456       }
26457     }
26458     break;
26459   }
26460   return false;
26461 }
26462
26463 /// getConstraintType - Given a constraint letter, return the type of
26464 /// constraint it is for this target.
26465 X86TargetLowering::ConstraintType
26466 X86TargetLowering::getConstraintType(const std::string &Constraint) const {
26467   if (Constraint.size() == 1) {
26468     switch (Constraint[0]) {
26469     case 'R':
26470     case 'q':
26471     case 'Q':
26472     case 'f':
26473     case 't':
26474     case 'u':
26475     case 'y':
26476     case 'x':
26477     case 'Y':
26478     case 'l':
26479       return C_RegisterClass;
26480     case 'a':
26481     case 'b':
26482     case 'c':
26483     case 'd':
26484     case 'S':
26485     case 'D':
26486     case 'A':
26487       return C_Register;
26488     case 'I':
26489     case 'J':
26490     case 'K':
26491     case 'L':
26492     case 'M':
26493     case 'N':
26494     case 'G':
26495     case 'C':
26496     case 'e':
26497     case 'Z':
26498       return C_Other;
26499     default:
26500       break;
26501     }
26502   }
26503   return TargetLowering::getConstraintType(Constraint);
26504 }
26505
26506 /// Examine constraint type and operand type and determine a weight value.
26507 /// This object must already have been set up with the operand type
26508 /// and the current alternative constraint selected.
26509 TargetLowering::ConstraintWeight
26510   X86TargetLowering::getSingleConstraintMatchWeight(
26511     AsmOperandInfo &info, const char *constraint) const {
26512   ConstraintWeight weight = CW_Invalid;
26513   Value *CallOperandVal = info.CallOperandVal;
26514     // If we don't have a value, we can't do a match,
26515     // but allow it at the lowest weight.
26516   if (!CallOperandVal)
26517     return CW_Default;
26518   Type *type = CallOperandVal->getType();
26519   // Look at the constraint type.
26520   switch (*constraint) {
26521   default:
26522     weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
26523   case 'R':
26524   case 'q':
26525   case 'Q':
26526   case 'a':
26527   case 'b':
26528   case 'c':
26529   case 'd':
26530   case 'S':
26531   case 'D':
26532   case 'A':
26533     if (CallOperandVal->getType()->isIntegerTy())
26534       weight = CW_SpecificReg;
26535     break;
26536   case 'f':
26537   case 't':
26538   case 'u':
26539     if (type->isFloatingPointTy())
26540       weight = CW_SpecificReg;
26541     break;
26542   case 'y':
26543     if (type->isX86_MMXTy() && Subtarget->hasMMX())
26544       weight = CW_SpecificReg;
26545     break;
26546   case 'x':
26547   case 'Y':
26548     if (((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasSSE1()) ||
26549         ((type->getPrimitiveSizeInBits() == 256) && Subtarget->hasFp256()))
26550       weight = CW_Register;
26551     break;
26552   case 'I':
26553     if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
26554       if (C->getZExtValue() <= 31)
26555         weight = CW_Constant;
26556     }
26557     break;
26558   case 'J':
26559     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26560       if (C->getZExtValue() <= 63)
26561         weight = CW_Constant;
26562     }
26563     break;
26564   case 'K':
26565     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26566       if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
26567         weight = CW_Constant;
26568     }
26569     break;
26570   case 'L':
26571     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26572       if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
26573         weight = CW_Constant;
26574     }
26575     break;
26576   case 'M':
26577     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26578       if (C->getZExtValue() <= 3)
26579         weight = CW_Constant;
26580     }
26581     break;
26582   case 'N':
26583     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26584       if (C->getZExtValue() <= 0xff)
26585         weight = CW_Constant;
26586     }
26587     break;
26588   case 'G':
26589   case 'C':
26590     if (dyn_cast<ConstantFP>(CallOperandVal)) {
26591       weight = CW_Constant;
26592     }
26593     break;
26594   case 'e':
26595     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26596       if ((C->getSExtValue() >= -0x80000000LL) &&
26597           (C->getSExtValue() <= 0x7fffffffLL))
26598         weight = CW_Constant;
26599     }
26600     break;
26601   case 'Z':
26602     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26603       if (C->getZExtValue() <= 0xffffffff)
26604         weight = CW_Constant;
26605     }
26606     break;
26607   }
26608   return weight;
26609 }
26610
26611 /// LowerXConstraint - try to replace an X constraint, which matches anything,
26612 /// with another that has more specific requirements based on the type of the
26613 /// corresponding operand.
26614 const char *X86TargetLowering::
26615 LowerXConstraint(EVT ConstraintVT) const {
26616   // FP X constraints get lowered to SSE1/2 registers if available, otherwise
26617   // 'f' like normal targets.
26618   if (ConstraintVT.isFloatingPoint()) {
26619     if (Subtarget->hasSSE2())
26620       return "Y";
26621     if (Subtarget->hasSSE1())
26622       return "x";
26623   }
26624
26625   return TargetLowering::LowerXConstraint(ConstraintVT);
26626 }
26627
26628 /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
26629 /// vector.  If it is invalid, don't add anything to Ops.
26630 void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
26631                                                      std::string &Constraint,
26632                                                      std::vector<SDValue>&Ops,
26633                                                      SelectionDAG &DAG) const {
26634   SDValue Result;
26635
26636   // Only support length 1 constraints for now.
26637   if (Constraint.length() > 1) return;
26638
26639   char ConstraintLetter = Constraint[0];
26640   switch (ConstraintLetter) {
26641   default: break;
26642   case 'I':
26643     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26644       if (C->getZExtValue() <= 31) {
26645         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26646         break;
26647       }
26648     }
26649     return;
26650   case 'J':
26651     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26652       if (C->getZExtValue() <= 63) {
26653         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26654         break;
26655       }
26656     }
26657     return;
26658   case 'K':
26659     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26660       if (isInt<8>(C->getSExtValue())) {
26661         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26662         break;
26663       }
26664     }
26665     return;
26666   case 'L':
26667     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26668       if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
26669           (Subtarget->is64Bit() && C->getZExtValue() == 0xffffffff)) {
26670         Result = DAG.getTargetConstant(C->getSExtValue(), Op.getValueType());
26671         break;
26672       }
26673     }
26674     return;
26675   case 'M':
26676     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26677       if (C->getZExtValue() <= 3) {
26678         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26679         break;
26680       }
26681     }
26682     return;
26683   case 'N':
26684     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26685       if (C->getZExtValue() <= 255) {
26686         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26687         break;
26688       }
26689     }
26690     return;
26691   case 'O':
26692     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26693       if (C->getZExtValue() <= 127) {
26694         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26695         break;
26696       }
26697     }
26698     return;
26699   case 'e': {
26700     // 32-bit signed value
26701     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26702       if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
26703                                            C->getSExtValue())) {
26704         // Widen to 64 bits here to get it sign extended.
26705         Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64);
26706         break;
26707       }
26708     // FIXME gcc accepts some relocatable values here too, but only in certain
26709     // memory models; it's complicated.
26710     }
26711     return;
26712   }
26713   case 'Z': {
26714     // 32-bit unsigned value
26715     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26716       if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
26717                                            C->getZExtValue())) {
26718         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26719         break;
26720       }
26721     }
26722     // FIXME gcc accepts some relocatable values here too, but only in certain
26723     // memory models; it's complicated.
26724     return;
26725   }
26726   case 'i': {
26727     // Literal immediates are always ok.
26728     if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
26729       // Widen to 64 bits here to get it sign extended.
26730       Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64);
26731       break;
26732     }
26733
26734     // In any sort of PIC mode addresses need to be computed at runtime by
26735     // adding in a register or some sort of table lookup.  These can't
26736     // be used as immediates.
26737     if (Subtarget->isPICStyleGOT() || Subtarget->isPICStyleStubPIC())
26738       return;
26739
26740     // If we are in non-pic codegen mode, we allow the address of a global (with
26741     // an optional displacement) to be used with 'i'.
26742     GlobalAddressSDNode *GA = nullptr;
26743     int64_t Offset = 0;
26744
26745     // Match either (GA), (GA+C), (GA+C1+C2), etc.
26746     while (1) {
26747       if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
26748         Offset += GA->getOffset();
26749         break;
26750       } else if (Op.getOpcode() == ISD::ADD) {
26751         if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
26752           Offset += C->getZExtValue();
26753           Op = Op.getOperand(0);
26754           continue;
26755         }
26756       } else if (Op.getOpcode() == ISD::SUB) {
26757         if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
26758           Offset += -C->getZExtValue();
26759           Op = Op.getOperand(0);
26760           continue;
26761         }
26762       }
26763
26764       // Otherwise, this isn't something we can handle, reject it.
26765       return;
26766     }
26767
26768     const GlobalValue *GV = GA->getGlobal();
26769     // If we require an extra load to get this address, as in PIC mode, we
26770     // can't accept it.
26771     if (isGlobalStubReference(
26772             Subtarget->ClassifyGlobalReference(GV, DAG.getTarget())))
26773       return;
26774
26775     Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
26776                                         GA->getValueType(0), Offset);
26777     break;
26778   }
26779   }
26780
26781   if (Result.getNode()) {
26782     Ops.push_back(Result);
26783     return;
26784   }
26785   return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
26786 }
26787
26788 std::pair<unsigned, const TargetRegisterClass*>
26789 X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
26790                                                 MVT VT) const {
26791   // First, see if this is a constraint that directly corresponds to an LLVM
26792   // register class.
26793   if (Constraint.size() == 1) {
26794     // GCC Constraint Letters
26795     switch (Constraint[0]) {
26796     default: break;
26797       // TODO: Slight differences here in allocation order and leaving
26798       // RIP in the class. Do they matter any more here than they do
26799       // in the normal allocation?
26800     case 'q':   // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
26801       if (Subtarget->is64Bit()) {
26802         if (VT == MVT::i32 || VT == MVT::f32)
26803           return std::make_pair(0U, &X86::GR32RegClass);
26804         if (VT == MVT::i16)
26805           return std::make_pair(0U, &X86::GR16RegClass);
26806         if (VT == MVT::i8 || VT == MVT::i1)
26807           return std::make_pair(0U, &X86::GR8RegClass);
26808         if (VT == MVT::i64 || VT == MVT::f64)
26809           return std::make_pair(0U, &X86::GR64RegClass);
26810         break;
26811       }
26812       // 32-bit fallthrough
26813     case 'Q':   // Q_REGS
26814       if (VT == MVT::i32 || VT == MVT::f32)
26815         return std::make_pair(0U, &X86::GR32_ABCDRegClass);
26816       if (VT == MVT::i16)
26817         return std::make_pair(0U, &X86::GR16_ABCDRegClass);
26818       if (VT == MVT::i8 || VT == MVT::i1)
26819         return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
26820       if (VT == MVT::i64)
26821         return std::make_pair(0U, &X86::GR64_ABCDRegClass);
26822       break;
26823     case 'r':   // GENERAL_REGS
26824     case 'l':   // INDEX_REGS
26825       if (VT == MVT::i8 || VT == MVT::i1)
26826         return std::make_pair(0U, &X86::GR8RegClass);
26827       if (VT == MVT::i16)
26828         return std::make_pair(0U, &X86::GR16RegClass);
26829       if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget->is64Bit())
26830         return std::make_pair(0U, &X86::GR32RegClass);
26831       return std::make_pair(0U, &X86::GR64RegClass);
26832     case 'R':   // LEGACY_REGS
26833       if (VT == MVT::i8 || VT == MVT::i1)
26834         return std::make_pair(0U, &X86::GR8_NOREXRegClass);
26835       if (VT == MVT::i16)
26836         return std::make_pair(0U, &X86::GR16_NOREXRegClass);
26837       if (VT == MVT::i32 || !Subtarget->is64Bit())
26838         return std::make_pair(0U, &X86::GR32_NOREXRegClass);
26839       return std::make_pair(0U, &X86::GR64_NOREXRegClass);
26840     case 'f':  // FP Stack registers.
26841       // If SSE is enabled for this VT, use f80 to ensure the isel moves the
26842       // value to the correct fpstack register class.
26843       if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
26844         return std::make_pair(0U, &X86::RFP32RegClass);
26845       if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
26846         return std::make_pair(0U, &X86::RFP64RegClass);
26847       return std::make_pair(0U, &X86::RFP80RegClass);
26848     case 'y':   // MMX_REGS if MMX allowed.
26849       if (!Subtarget->hasMMX()) break;
26850       return std::make_pair(0U, &X86::VR64RegClass);
26851     case 'Y':   // SSE_REGS if SSE2 allowed
26852       if (!Subtarget->hasSSE2()) break;
26853       // FALL THROUGH.
26854     case 'x':   // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
26855       if (!Subtarget->hasSSE1()) break;
26856
26857       switch (VT.SimpleTy) {
26858       default: break;
26859       // Scalar SSE types.
26860       case MVT::f32:
26861       case MVT::i32:
26862         return std::make_pair(0U, &X86::FR32RegClass);
26863       case MVT::f64:
26864       case MVT::i64:
26865         return std::make_pair(0U, &X86::FR64RegClass);
26866       // Vector types.
26867       case MVT::v16i8:
26868       case MVT::v8i16:
26869       case MVT::v4i32:
26870       case MVT::v2i64:
26871       case MVT::v4f32:
26872       case MVT::v2f64:
26873         return std::make_pair(0U, &X86::VR128RegClass);
26874       // AVX types.
26875       case MVT::v32i8:
26876       case MVT::v16i16:
26877       case MVT::v8i32:
26878       case MVT::v4i64:
26879       case MVT::v8f32:
26880       case MVT::v4f64:
26881         return std::make_pair(0U, &X86::VR256RegClass);
26882       case MVT::v8f64:
26883       case MVT::v16f32:
26884       case MVT::v16i32:
26885       case MVT::v8i64:
26886         return std::make_pair(0U, &X86::VR512RegClass);
26887       }
26888       break;
26889     }
26890   }
26891
26892   // Use the default implementation in TargetLowering to convert the register
26893   // constraint into a member of a register class.
26894   std::pair<unsigned, const TargetRegisterClass*> Res;
26895   Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
26896
26897   // Not found as a standard register?
26898   if (!Res.second) {
26899     // Map st(0) -> st(7) -> ST0
26900     if (Constraint.size() == 7 && Constraint[0] == '{' &&
26901         tolower(Constraint[1]) == 's' &&
26902         tolower(Constraint[2]) == 't' &&
26903         Constraint[3] == '(' &&
26904         (Constraint[4] >= '0' && Constraint[4] <= '7') &&
26905         Constraint[5] == ')' &&
26906         Constraint[6] == '}') {
26907
26908       Res.first = X86::FP0+Constraint[4]-'0';
26909       Res.second = &X86::RFP80RegClass;
26910       return Res;
26911     }
26912
26913     // GCC allows "st(0)" to be called just plain "st".
26914     if (StringRef("{st}").equals_lower(Constraint)) {
26915       Res.first = X86::FP0;
26916       Res.second = &X86::RFP80RegClass;
26917       return Res;
26918     }
26919
26920     // flags -> EFLAGS
26921     if (StringRef("{flags}").equals_lower(Constraint)) {
26922       Res.first = X86::EFLAGS;
26923       Res.second = &X86::CCRRegClass;
26924       return Res;
26925     }
26926
26927     // 'A' means EAX + EDX.
26928     if (Constraint == "A") {
26929       Res.first = X86::EAX;
26930       Res.second = &X86::GR32_ADRegClass;
26931       return Res;
26932     }
26933     return Res;
26934   }
26935
26936   // Otherwise, check to see if this is a register class of the wrong value
26937   // type.  For example, we want to map "{ax},i32" -> {eax}, we don't want it to
26938   // turn into {ax},{dx}.
26939   if (Res.second->hasType(VT))
26940     return Res;   // Correct type already, nothing to do.
26941
26942   // All of the single-register GCC register classes map their values onto
26943   // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp".  If we
26944   // really want an 8-bit or 32-bit register, map to the appropriate register
26945   // class and return the appropriate register.
26946   if (Res.second == &X86::GR16RegClass) {
26947     if (VT == MVT::i8 || VT == MVT::i1) {
26948       unsigned DestReg = 0;
26949       switch (Res.first) {
26950       default: break;
26951       case X86::AX: DestReg = X86::AL; break;
26952       case X86::DX: DestReg = X86::DL; break;
26953       case X86::CX: DestReg = X86::CL; break;
26954       case X86::BX: DestReg = X86::BL; break;
26955       }
26956       if (DestReg) {
26957         Res.first = DestReg;
26958         Res.second = &X86::GR8RegClass;
26959       }
26960     } else if (VT == MVT::i32 || VT == MVT::f32) {
26961       unsigned DestReg = 0;
26962       switch (Res.first) {
26963       default: break;
26964       case X86::AX: DestReg = X86::EAX; break;
26965       case X86::DX: DestReg = X86::EDX; break;
26966       case X86::CX: DestReg = X86::ECX; break;
26967       case X86::BX: DestReg = X86::EBX; break;
26968       case X86::SI: DestReg = X86::ESI; break;
26969       case X86::DI: DestReg = X86::EDI; break;
26970       case X86::BP: DestReg = X86::EBP; break;
26971       case X86::SP: DestReg = X86::ESP; break;
26972       }
26973       if (DestReg) {
26974         Res.first = DestReg;
26975         Res.second = &X86::GR32RegClass;
26976       }
26977     } else if (VT == MVT::i64 || VT == MVT::f64) {
26978       unsigned DestReg = 0;
26979       switch (Res.first) {
26980       default: break;
26981       case X86::AX: DestReg = X86::RAX; break;
26982       case X86::DX: DestReg = X86::RDX; break;
26983       case X86::CX: DestReg = X86::RCX; break;
26984       case X86::BX: DestReg = X86::RBX; break;
26985       case X86::SI: DestReg = X86::RSI; break;
26986       case X86::DI: DestReg = X86::RDI; break;
26987       case X86::BP: DestReg = X86::RBP; break;
26988       case X86::SP: DestReg = X86::RSP; break;
26989       }
26990       if (DestReg) {
26991         Res.first = DestReg;
26992         Res.second = &X86::GR64RegClass;
26993       }
26994     }
26995   } else if (Res.second == &X86::FR32RegClass ||
26996              Res.second == &X86::FR64RegClass ||
26997              Res.second == &X86::VR128RegClass ||
26998              Res.second == &X86::VR256RegClass ||
26999              Res.second == &X86::FR32XRegClass ||
27000              Res.second == &X86::FR64XRegClass ||
27001              Res.second == &X86::VR128XRegClass ||
27002              Res.second == &X86::VR256XRegClass ||
27003              Res.second == &X86::VR512RegClass) {
27004     // Handle references to XMM physical registers that got mapped into the
27005     // wrong class.  This can happen with constraints like {xmm0} where the
27006     // target independent register mapper will just pick the first match it can
27007     // find, ignoring the required type.
27008
27009     if (VT == MVT::f32 || VT == MVT::i32)
27010       Res.second = &X86::FR32RegClass;
27011     else if (VT == MVT::f64 || VT == MVT::i64)
27012       Res.second = &X86::FR64RegClass;
27013     else if (X86::VR128RegClass.hasType(VT))
27014       Res.second = &X86::VR128RegClass;
27015     else if (X86::VR256RegClass.hasType(VT))
27016       Res.second = &X86::VR256RegClass;
27017     else if (X86::VR512RegClass.hasType(VT))
27018       Res.second = &X86::VR512RegClass;
27019   }
27020
27021   return Res;
27022 }
27023
27024 int X86TargetLowering::getScalingFactorCost(const AddrMode &AM,
27025                                             Type *Ty) const {
27026   // Scaling factors are not free at all.
27027   // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
27028   // will take 2 allocations in the out of order engine instead of 1
27029   // for plain addressing mode, i.e. inst (reg1).
27030   // E.g.,
27031   // vaddps (%rsi,%drx), %ymm0, %ymm1
27032   // Requires two allocations (one for the load, one for the computation)
27033   // whereas:
27034   // vaddps (%rsi), %ymm0, %ymm1
27035   // Requires just 1 allocation, i.e., freeing allocations for other operations
27036   // and having less micro operations to execute.
27037   //
27038   // For some X86 architectures, this is even worse because for instance for
27039   // stores, the complex addressing mode forces the instruction to use the
27040   // "load" ports instead of the dedicated "store" port.
27041   // E.g., on Haswell:
27042   // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
27043   // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
27044   if (isLegalAddressingMode(AM, Ty))
27045     // Scale represents reg2 * scale, thus account for 1
27046     // as soon as we use a second register.
27047     return AM.Scale != 0;
27048   return -1;
27049 }
27050
27051 bool X86TargetLowering::isTargetFTOL() const {
27052   return Subtarget->isTargetKnownWindowsMSVC() && !Subtarget->is64Bit();
27053 }