lib/Target/X86/X86ISelLowering.cpp

   1 //===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 // This file defines the interfaces that X86 uses to lower LLVM code into a
  11 // selection DAG.
  12 //
  13 //===----------------------------------------------------------------------===//
  14
  15 #include "X86ISelLowering.h"
  16 #include "Utils/X86ShuffleDecode.h"
  17 #include "X86CallingConv.h"
  18 #include "X86FrameLowering.h"
  19 #include "X86InstrBuilder.h"
  20 #include "X86MachineFunctionInfo.h"
  21 #include "X86TargetMachine.h"
  22 #include "X86TargetObjectFile.h"
  23 #include "llvm/ADT/SmallBitVector.h"
  24 #include "llvm/ADT/SmallSet.h"
  25 #include "llvm/ADT/Statistic.h"
  26 #include "llvm/ADT/StringExtras.h"
  27 #include "llvm/ADT/StringSwitch.h"
  28 #include "llvm/ADT/VariadicFunction.h"
  29 #include "llvm/CodeGen/IntrinsicLowering.h"
  30 #include "llvm/CodeGen/MachineFrameInfo.h"
  31 #include "llvm/CodeGen/MachineFunction.h"
  32 #include "llvm/CodeGen/MachineInstrBuilder.h"
  33 #include "llvm/CodeGen/MachineJumpTableInfo.h"
  34 #include "llvm/CodeGen/MachineModuleInfo.h"
  35 #include "llvm/CodeGen/MachineRegisterInfo.h"
  36 #include "llvm/IR/CallSite.h"
  37 #include "llvm/IR/CallingConv.h"
  38 #include "llvm/IR/Constants.h"
  39 #include "llvm/IR/DerivedTypes.h"
  40 #include "llvm/IR/Function.h"
  41 #include "llvm/IR/GlobalAlias.h"
  42 #include "llvm/IR/GlobalVariable.h"
  43 #include "llvm/IR/Instructions.h"
  44 #include "llvm/IR/Intrinsics.h"
  45 #include "llvm/MC/MCAsmInfo.h"
  46 #include "llvm/MC/MCContext.h"
  47 #include "llvm/MC/MCExpr.h"
  48 #include "llvm/MC/MCSymbol.h"
  49 #include "llvm/Support/CommandLine.h"
  50 #include "llvm/Support/Debug.h"
  51 #include "llvm/Support/ErrorHandling.h"
  52 #include "llvm/Support/MathExtras.h"
  53 #include "llvm/Target/TargetOptions.h"
  54 #include "X86IntrinsicsInfo.h"
  55 #include <bitset>
  56 #include <numeric>
  57 #include <cctype>
  58 using namespace llvm;
  59
  60 #define DEBUG_TYPE "x86-isel"
  61
  62 STATISTIC(NumTailCalls, "Number of tail calls");
  63
  64 static cl::opt<bool> ExperimentalVectorWideningLegalization(
  65     "x86-experimental-vector-widening-legalization", cl::init(false),
  66     cl::desc("Enable an experimental vector type legalization through widening "
  67              "rather than promotion."),
  68     cl::Hidden);
  69
  70 static cl::opt<bool> ExperimentalVectorShuffleLowering(
  71     "x86-experimental-vector-shuffle-lowering", cl::init(true),
  72     cl::desc("Enable an experimental vector shuffle lowering code path."),
  73     cl::Hidden);
  74
  75 static cl::opt<bool> ExperimentalVectorShuffleLegality(
  76     "x86-experimental-vector-shuffle-legality", cl::init(false),
  77     cl::desc("Enable experimental shuffle legality based on the experimental "
  78              "shuffle lowering. Should only be used with the experimental "
  79              "shuffle lowering."),
  80     cl::Hidden);
  81
  82 static cl::opt<int> ReciprocalEstimateRefinementSteps(
  83     "x86-recip-refinement-steps", cl::init(1),
  84     cl::desc("Specify the number of Newton-Raphson iterations applied to the "
  85              "result of the hardware reciprocal estimate instruction."),
  86     cl::NotHidden);
  87
  88 // Forward declarations.
  89 static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
  90                        SDValue V2);
  91
  92 static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal,
  93                                 SelectionDAG &DAG, SDLoc dl,
  94                                 unsigned vectorWidth) {
  95   assert((vectorWidth == 128 || vectorWidth == 256) &&
  96          "Unsupported vector width");
  97   EVT VT = Vec.getValueType();
  98   EVT ElVT = VT.getVectorElementType();
  99   unsigned Factor = VT.getSizeInBits()/vectorWidth;
 100   EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
 101                                   VT.getVectorNumElements()/Factor);
 102
 103   // Extract from UNDEF is UNDEF.
 104   if (Vec.getOpcode() == ISD::UNDEF)
 105     return DAG.getUNDEF(ResultVT);
 106
 107   // Extract the relevant vectorWidth bits.  Generate an EXTRACT_SUBVECTOR
 108   unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
 109
 110   // This is the index of the first element of the vectorWidth-bit chunk
 111   // we want.
 112   unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / vectorWidth)
 113                                * ElemsPerChunk);
 114
 115   // If the input is a buildvector just emit a smaller one.
 116   if (Vec.getOpcode() == ISD::BUILD_VECTOR)
 117     return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
 118                        makeArrayRef(Vec->op_begin() + NormalizedIdxVal,
 119                                     ElemsPerChunk));
 120
 121   SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
 122   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
 123 }
 124
 125 /// Generate a DAG to grab 128-bits from a vector > 128 bits.  This
 126 /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
 127 /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
 128 /// instructions or a simple subregister reference. Idx is an index in the
 129 /// 128 bits we want.  It need not be aligned to a 128-bit boundary.  That makes
 130 /// lowering EXTRACT_VECTOR_ELT operations easier.
 131 static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
 132                                    SelectionDAG &DAG, SDLoc dl) {
 133   assert((Vec.getValueType().is256BitVector() ||
 134           Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
 135   return ExtractSubVector(Vec, IdxVal, DAG, dl, 128);
 136 }
 137
 138 /// Generate a DAG to grab 256-bits from a 512-bit vector.
 139 static SDValue Extract256BitVector(SDValue Vec, unsigned IdxVal,
 140                                    SelectionDAG &DAG, SDLoc dl) {
 141   assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
 142   return ExtractSubVector(Vec, IdxVal, DAG, dl, 256);
 143 }
 144
 145 static SDValue InsertSubVector(SDValue Result, SDValue Vec,
 146                                unsigned IdxVal, SelectionDAG &DAG,
 147                                SDLoc dl, unsigned vectorWidth) {
 148   assert((vectorWidth == 128 || vectorWidth == 256) &&
 149          "Unsupported vector width");
 150   // Inserting UNDEF is Result
 151   if (Vec.getOpcode() == ISD::UNDEF)
 152     return Result;
 153   EVT VT = Vec.getValueType();
 154   EVT ElVT = VT.getVectorElementType();
 155   EVT ResultVT = Result.getValueType();
 156
 157   // Insert the relevant vectorWidth bits.
 158   unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
 159
 160   // This is the index of the first element of the vectorWidth-bit chunk
 161   // we want.
 162   unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/vectorWidth)
 163                                * ElemsPerChunk);
 164
 165   SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
 166   return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
 167 }
 168
 169 /// Generate a DAG to put 128-bits into a vector > 128 bits.  This
 170 /// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
 171 /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
 172 /// simple superregister reference.  Idx is an index in the 128 bits
 173 /// we want.  It need not be aligned to a 128-bit boundary.  That makes
 174 /// lowering INSERT_VECTOR_ELT operations easier.
 175 static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
 176                                   SelectionDAG &DAG,SDLoc dl) {
 177   assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
 178   return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
 179 }
 180
 181 static SDValue Insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
 182                                   SelectionDAG &DAG, SDLoc dl) {
 183   assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
 184   return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
 185 }
 186
 187 /// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128
 188 /// instructions. This is used because creating CONCAT_VECTOR nodes of
 189 /// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
 190 /// large BUILD_VECTORS.
 191 static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
 192                                    unsigned NumElems, SelectionDAG &DAG,
 193                                    SDLoc dl) {
 194   SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
 195   return Insert128BitVector(V, V2, NumElems/2, DAG, dl);
 196 }
 197
 198 static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
 199                                    unsigned NumElems, SelectionDAG &DAG,
 200                                    SDLoc dl) {
 201   SDValue V = Insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
 202   return Insert256BitVector(V, V2, NumElems/2, DAG, dl);
 203 }
 204
 205 X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
 206                                      const X86Subtarget &STI)
 207     : TargetLowering(TM), Subtarget(&STI) {
 208   X86ScalarSSEf64 = Subtarget->hasSSE2();
 209   X86ScalarSSEf32 = Subtarget->hasSSE1();
 210   TD = getDataLayout();
 211
 212   // Set up the TargetLowering object.
 213   static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };
 214
 215   // X86 is weird. It always uses i8 for shift amounts and setcc results.
 216   setBooleanContents(ZeroOrOneBooleanContent);
 217   // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
 218   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
 219
 220   // For 64-bit, since we have so many registers, use the ILP scheduler.
 221   // For 32-bit, use the register pressure specific scheduling.
 222   // For Atom, always use ILP scheduling.
 223   if (Subtarget->isAtom())
 224     setSchedulingPreference(Sched::ILP);
 225   else if (Subtarget->is64Bit())
 226     setSchedulingPreference(Sched::ILP);
 227   else
 228     setSchedulingPreference(Sched::RegPressure);
 229   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
 230   setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
 231
 232   // Bypass expensive divides on Atom when compiling with O2.
 233   if (TM.getOptLevel() >= CodeGenOpt::Default) {
 234     if (Subtarget->hasSlowDivide32())
 235       addBypassSlowDiv(32, 8);
 236     if (Subtarget->hasSlowDivide64() && Subtarget->is64Bit())
 237       addBypassSlowDiv(64, 16);
 238   }
 239
 240   if (Subtarget->isTargetKnownWindowsMSVC()) {
 241     // Setup Windows compiler runtime calls.
 242     setLibcallName(RTLIB::SDIV_I64, "_alldiv");
 243     setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
 244     setLibcallName(RTLIB::SREM_I64, "_allrem");
 245     setLibcallName(RTLIB::UREM_I64, "_aullrem");
 246     setLibcallName(RTLIB::MUL_I64, "_allmul");
 247     setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
 248     setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
 249     setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
 250     setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
 251     setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
 252
 253     // The _ftol2 runtime function has an unusual calling conv, which
 254     // is modeled by a special pseudo-instruction.
 255     setLibcallName(RTLIB::FPTOUINT_F64_I64, nullptr);
 256     setLibcallName(RTLIB::FPTOUINT_F32_I64, nullptr);
 257     setLibcallName(RTLIB::FPTOUINT_F64_I32, nullptr);
 258     setLibcallName(RTLIB::FPTOUINT_F32_I32, nullptr);
 259   }
 260
 261   if (Subtarget->isTargetDarwin()) {
 262     // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
 263     setUseUnderscoreSetJmp(false);
 264     setUseUnderscoreLongJmp(false);
 265   } else if (Subtarget->isTargetWindowsGNU()) {
 266     // MS runtime is weird: it exports _setjmp, but longjmp!
 267     setUseUnderscoreSetJmp(true);
 268     setUseUnderscoreLongJmp(false);
 269   } else {
 270     setUseUnderscoreSetJmp(true);
 271     setUseUnderscoreLongJmp(true);
 272   }
 273
 274   // Set up the register classes.
 275   addRegisterClass(MVT::i8, &X86::GR8RegClass);
 276   addRegisterClass(MVT::i16, &X86::GR16RegClass);
 277   addRegisterClass(MVT::i32, &X86::GR32RegClass);
 278   if (Subtarget->is64Bit())
 279     addRegisterClass(MVT::i64, &X86::GR64RegClass);
 280
 281   for (MVT VT : MVT::integer_valuetypes())
 282     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
 283
 284   // We don't accept any truncstore of integer registers.
 285   setTruncStoreAction(MVT::i64, MVT::i32, Expand);
 286   setTruncStoreAction(MVT::i64, MVT::i16, Expand);
 287   setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
 288   setTruncStoreAction(MVT::i32, MVT::i16, Expand);
 289   setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
 290   setTruncStoreAction(MVT::i16, MVT::i8,  Expand);
 291
 292   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
 293
 294   // SETOEQ and SETUNE require checking two conditions.
 295   setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
 296   setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
 297   setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
 298   setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
 299   setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
 300   setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
 301
 302   // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
 303   // operation.
 304   setOperationAction(ISD::UINT_TO_FP       , MVT::i1   , Promote);
 305   setOperationAction(ISD::UINT_TO_FP       , MVT::i8   , Promote);
 306   setOperationAction(ISD::UINT_TO_FP       , MVT::i16  , Promote);
 307
 308   if (Subtarget->is64Bit()) {
 309     setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Promote);
 310     setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
 311   } else if (!TM.Options.UseSoftFloat) {
 312     // We have an algorithm for SSE2->double, and we turn this into a
 313     // 64-bit FILD followed by conditional FADD for other targets.
 314     setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
 315     // We have an algorithm for SSE2, and we turn this into a 64-bit
 316     // FILD for other targets.
 317     setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Custom);
 318   }
 319
 320   // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
 321   // this operation.
 322   setOperationAction(ISD::SINT_TO_FP       , MVT::i1   , Promote);
 323   setOperationAction(ISD::SINT_TO_FP       , MVT::i8   , Promote);
 324
 325   if (!TM.Options.UseSoftFloat) {
 326     // SSE has no i16 to fp conversion, only i32
 327     if (X86ScalarSSEf32) {
 328       setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
 329       // f32 and f64 cases are Legal, f80 case is not
 330       setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
 331     } else {
 332       setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Custom);
 333       setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
 334     }
 335   } else {
 336     setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
 337     setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Promote);
 338   }
 339
 340   // In 32-bit mode these are custom lowered.  In 64-bit mode F32 and F64
 341   // are Legal, f80 is custom lowered.
 342   setOperationAction(ISD::FP_TO_SINT     , MVT::i64  , Custom);
 343   setOperationAction(ISD::SINT_TO_FP     , MVT::i64  , Custom);
 344
 345   // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
 346   // this operation.
 347   setOperationAction(ISD::FP_TO_SINT       , MVT::i1   , Promote);
 348   setOperationAction(ISD::FP_TO_SINT       , MVT::i8   , Promote);
 349
 350   if (X86ScalarSSEf32) {
 351     setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
 352     // f32 and f64 cases are Legal, f80 case is not
 353     setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
 354   } else {
 355     setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Custom);
 356     setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
 357   }
 358
 359   // Handle FP_TO_UINT by promoting the destination to a larger signed
 360   // conversion.
 361   setOperationAction(ISD::FP_TO_UINT       , MVT::i1   , Promote);
 362   setOperationAction(ISD::FP_TO_UINT       , MVT::i8   , Promote);
 363   setOperationAction(ISD::FP_TO_UINT       , MVT::i16  , Promote);
 364
 365   if (Subtarget->is64Bit()) {
 366     setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Expand);
 367     setOperationAction(ISD::FP_TO_UINT     , MVT::i32  , Promote);
 368   } else if (!TM.Options.UseSoftFloat) {
 369     // Since AVX is a superset of SSE3, only check for SSE here.
 370     if (Subtarget->hasSSE1() && !Subtarget->hasSSE3())
 371       // Expand FP_TO_UINT into a select.
 372       // FIXME: We would like to use a Custom expander here eventually to do
 373       // the optimal thing for SSE vs. the default expansion in the legalizer.
 374       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Expand);
 375     else
 376       // With SSE3 we can use fisttpll to convert to a signed i64; without
 377       // SSE, we're stuck with a fistpll.
 378       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Custom);
 379   }
 380
 381   if (isTargetFTOL()) {
 382     // Use the _ftol2 runtime function, which has a pseudo-instruction
 383     // to handle its weird calling convention.
 384     setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Custom);
 385   }
 386
 387   // TODO: when we have SSE, these could be more efficient, by using movd/movq.
 388   if (!X86ScalarSSEf64) {
 389     setOperationAction(ISD::BITCAST        , MVT::f32  , Expand);
 390     setOperationAction(ISD::BITCAST        , MVT::i32  , Expand);
 391     if (Subtarget->is64Bit()) {
 392       setOperationAction(ISD::BITCAST      , MVT::f64  , Expand);
 393       // Without SSE, i64->f64 goes through memory.
 394       setOperationAction(ISD::BITCAST      , MVT::i64  , Expand);
 395     }
 396   }
 397
 398   // Scalar integer divide and remainder are lowered to use operations that
 399   // produce two results, to match the available instructions. This exposes
 400   // the two-result form to trivial CSE, which is able to combine x/y and x%y
 401   // into a single instruction.
 402   //
 403   // Scalar integer multiply-high is also lowered to use two-result
 404   // operations, to match the available instructions. However, plain multiply
 405   // (low) operations are left as Legal, as there are single-result
 406   // instructions for this in x86. Using the two-result multiply instructions
 407   // when both high and low results are needed must be arranged by dagcombine.
 408   for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
 409     MVT VT = IntVTs[i];
 410     setOperationAction(ISD::MULHS, VT, Expand);
 411     setOperationAction(ISD::MULHU, VT, Expand);
 412     setOperationAction(ISD::SDIV, VT, Expand);
 413     setOperationAction(ISD::UDIV, VT, Expand);
 414     setOperationAction(ISD::SREM, VT, Expand);
 415     setOperationAction(ISD::UREM, VT, Expand);
 416
 417     // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences.
 418     setOperationAction(ISD::ADDC, VT, Custom);
 419     setOperationAction(ISD::ADDE, VT, Custom);
 420     setOperationAction(ISD::SUBC, VT, Custom);
 421     setOperationAction(ISD::SUBE, VT, Custom);
 422   }
 423
 424   setOperationAction(ISD::BR_JT            , MVT::Other, Expand);
 425   setOperationAction(ISD::BRCOND           , MVT::Other, Custom);
 426   setOperationAction(ISD::BR_CC            , MVT::f32,   Expand);
 427   setOperationAction(ISD::BR_CC            , MVT::f64,   Expand);
 428   setOperationAction(ISD::BR_CC            , MVT::f80,   Expand);
 429   setOperationAction(ISD::BR_CC            , MVT::i8,    Expand);
 430   setOperationAction(ISD::BR_CC            , MVT::i16,   Expand);
 431   setOperationAction(ISD::BR_CC            , MVT::i32,   Expand);
 432   setOperationAction(ISD::BR_CC            , MVT::i64,   Expand);
 433   setOperationAction(ISD::SELECT_CC        , MVT::f32,   Expand);
 434   setOperationAction(ISD::SELECT_CC        , MVT::f64,   Expand);
 435   setOperationAction(ISD::SELECT_CC        , MVT::f80,   Expand);
 436   setOperationAction(ISD::SELECT_CC        , MVT::i8,    Expand);
 437   setOperationAction(ISD::SELECT_CC        , MVT::i16,   Expand);
 438   setOperationAction(ISD::SELECT_CC        , MVT::i32,   Expand);
 439   setOperationAction(ISD::SELECT_CC        , MVT::i64,   Expand);
 440   if (Subtarget->is64Bit())
 441     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
 442   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16  , Legal);
 443   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8   , Legal);
 444   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1   , Expand);
 445   setOperationAction(ISD::FP_ROUND_INREG   , MVT::f32  , Expand);
 446   setOperationAction(ISD::FREM             , MVT::f32  , Expand);
 447   setOperationAction(ISD::FREM             , MVT::f64  , Expand);
 448   setOperationAction(ISD::FREM             , MVT::f80  , Expand);
 449   setOperationAction(ISD::FLT_ROUNDS_      , MVT::i32  , Custom);
 450
 451   // Promote the i8 variants and force them on up to i32 which has a shorter
 452   // encoding.
 453   setOperationAction(ISD::CTTZ             , MVT::i8   , Promote);
 454   AddPromotedToType (ISD::CTTZ             , MVT::i8   , MVT::i32);
 455   setOperationAction(ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , Promote);
 456   AddPromotedToType (ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , MVT::i32);
 457   if (Subtarget->hasBMI()) {
 458     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16  , Expand);
 459     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32  , Expand);
 460     if (Subtarget->is64Bit())
 461       setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
 462   } else {
 463     setOperationAction(ISD::CTTZ           , MVT::i16  , Custom);
 464     setOperationAction(ISD::CTTZ           , MVT::i32  , Custom);
 465     if (Subtarget->is64Bit())
 466       setOperationAction(ISD::CTTZ         , MVT::i64  , Custom);
 467   }
 468
 469   if (Subtarget->hasLZCNT()) {
 470     // When promoting the i8 variants, force them to i32 for a shorter
 471     // encoding.
 472     setOperationAction(ISD::CTLZ           , MVT::i8   , Promote);
 473     AddPromotedToType (ISD::CTLZ           , MVT::i8   , MVT::i32);
 474     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Promote);
 475     AddPromotedToType (ISD::CTLZ_ZERO_UNDEF, MVT::i8   , MVT::i32);
 476     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Expand);
 477     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Expand);
 478     if (Subtarget->is64Bit())
 479       setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
 480   } else {
 481     setOperationAction(ISD::CTLZ           , MVT::i8   , Custom);
 482     setOperationAction(ISD::CTLZ           , MVT::i16  , Custom);
 483     setOperationAction(ISD::CTLZ           , MVT::i32  , Custom);
 484     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Custom);
 485     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Custom);
 486     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Custom);
 487     if (Subtarget->is64Bit()) {
 488       setOperationAction(ISD::CTLZ         , MVT::i64  , Custom);
 489       setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
 490     }
 491   }
 492
 493   // Special handling for half-precision floating point conversions.
 494   // If we don't have F16C support, then lower half float conversions
 495   // into library calls.
 496   if (TM.Options.UseSoftFloat || !Subtarget->hasF16C()) {
 497     setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
 498     setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
 499   }
 500
 501   // There's never any support for operations beyond MVT::f32.
 502   setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
 503   setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
 504   setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
 505   setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);
 506
 507   setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
 508   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
 509   setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
 510   setTruncStoreAction(MVT::f32, MVT::f16, Expand);
 511   setTruncStoreAction(MVT::f64, MVT::f16, Expand);
 512   setTruncStoreAction(MVT::f80, MVT::f16, Expand);
 513
 514   if (Subtarget->hasPOPCNT()) {
 515     setOperationAction(ISD::CTPOP          , MVT::i8   , Promote);
 516   } else {
 517     setOperationAction(ISD::CTPOP          , MVT::i8   , Expand);
 518     setOperationAction(ISD::CTPOP          , MVT::i16  , Expand);
 519     setOperationAction(ISD::CTPOP          , MVT::i32  , Expand);
 520     if (Subtarget->is64Bit())
 521       setOperationAction(ISD::CTPOP        , MVT::i64  , Expand);
 522   }
 523
 524   setOperationAction(ISD::READCYCLECOUNTER , MVT::i64  , Custom);
 525
 526   if (!Subtarget->hasMOVBE())
 527     setOperationAction(ISD::BSWAP          , MVT::i16  , Expand);
 528
 529   // These should be promoted to a larger select which is supported.
 530   setOperationAction(ISD::SELECT          , MVT::i1   , Promote);
 531   // X86 wants to expand cmov itself.
 532   setOperationAction(ISD::SELECT          , MVT::i8   , Custom);
 533   setOperationAction(ISD::SELECT          , MVT::i16  , Custom);
 534   setOperationAction(ISD::SELECT          , MVT::i32  , Custom);
 535   setOperationAction(ISD::SELECT          , MVT::f32  , Custom);
 536   setOperationAction(ISD::SELECT          , MVT::f64  , Custom);
 537   setOperationAction(ISD::SELECT          , MVT::f80  , Custom);
 538   setOperationAction(ISD::SETCC           , MVT::i8   , Custom);
 539   setOperationAction(ISD::SETCC           , MVT::i16  , Custom);
 540   setOperationAction(ISD::SETCC           , MVT::i32  , Custom);
 541   setOperationAction(ISD::SETCC           , MVT::f32  , Custom);
 542   setOperationAction(ISD::SETCC           , MVT::f64  , Custom);
 543   setOperationAction(ISD::SETCC           , MVT::f80  , Custom);
 544   if (Subtarget->is64Bit()) {
 545     setOperationAction(ISD::SELECT        , MVT::i64  , Custom);
 546     setOperationAction(ISD::SETCC         , MVT::i64  , Custom);
 547   }
 548   setOperationAction(ISD::EH_RETURN       , MVT::Other, Custom);
 549   // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support
 550   // SjLj exception handling but a light-weight setjmp/longjmp replacement to
 551   // support continuation, user-level threading, and etc.. As a result, no
 552   // other SjLj exception interfaces are implemented and please don't build
 553   // your own exception handling based on them.
 554   // LLVM/Clang supports zero-cost DWARF exception handling.
 555   setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
 556   setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
 557
 558   // Darwin ABI issue.
 559   setOperationAction(ISD::ConstantPool    , MVT::i32  , Custom);
 560   setOperationAction(ISD::JumpTable       , MVT::i32  , Custom);
 561   setOperationAction(ISD::GlobalAddress   , MVT::i32  , Custom);
 562   setOperationAction(ISD::GlobalTLSAddress, MVT::i32  , Custom);
 563   if (Subtarget->is64Bit())
 564     setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
 565   setOperationAction(ISD::ExternalSymbol  , MVT::i32  , Custom);
 566   setOperationAction(ISD::BlockAddress    , MVT::i32  , Custom);
 567   if (Subtarget->is64Bit()) {
 568     setOperationAction(ISD::ConstantPool  , MVT::i64  , Custom);
 569     setOperationAction(ISD::JumpTable     , MVT::i64  , Custom);
 570     setOperationAction(ISD::GlobalAddress , MVT::i64  , Custom);
 571     setOperationAction(ISD::ExternalSymbol, MVT::i64  , Custom);
 572     setOperationAction(ISD::BlockAddress  , MVT::i64  , Custom);
 573   }
  574   // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
 575   setOperationAction(ISD::SHL_PARTS       , MVT::i32  , Custom);
 576   setOperationAction(ISD::SRA_PARTS       , MVT::i32  , Custom);
 577   setOperationAction(ISD::SRL_PARTS       , MVT::i32  , Custom);
 578   if (Subtarget->is64Bit()) {
 579     setOperationAction(ISD::SHL_PARTS     , MVT::i64  , Custom);
 580     setOperationAction(ISD::SRA_PARTS     , MVT::i64  , Custom);
 581     setOperationAction(ISD::SRL_PARTS     , MVT::i64  , Custom);
 582   }
 583
 584   if (Subtarget->hasSSE1())
 585     setOperationAction(ISD::PREFETCH      , MVT::Other, Legal);
 586
 587   setOperationAction(ISD::ATOMIC_FENCE  , MVT::Other, Custom);
 588
 589   // Expand certain atomics
 590   for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
 591     MVT VT = IntVTs[i];
 592     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
 593     setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
 594     setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
 595   }
 596
 597   if (Subtarget->hasCmpxchg16b()) {
 598     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
 599   }
 600
 601   // FIXME - use subtarget debug flags
 602   if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetELF() &&
 603       !Subtarget->isTargetCygMing() && !Subtarget->isTargetWin64()) {
 604     setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
 605   }
 606
 607   if (Subtarget->is64Bit()) {
 608     setExceptionPointerRegister(X86::RAX);
 609     setExceptionSelectorRegister(X86::RDX);
 610   } else {
 611     setExceptionPointerRegister(X86::EAX);
 612     setExceptionSelectorRegister(X86::EDX);
 613   }
 614   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
 615   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
 616
 617   setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
 618   setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
 619
 620   setOperationAction(ISD::TRAP, MVT::Other, Legal);
 621   setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
 622
 623   // VASTART needs to be custom lowered to use the VarArgsFrameIndex
 624   setOperationAction(ISD::VASTART           , MVT::Other, Custom);
 625   setOperationAction(ISD::VAEND             , MVT::Other, Expand);
 626   if (Subtarget->is64Bit() && !Subtarget->isTargetWin64()) {
 627     // TargetInfo::X86_64ABIBuiltinVaList
 628     setOperationAction(ISD::VAARG           , MVT::Other, Custom);
 629     setOperationAction(ISD::VACOPY          , MVT::Other, Custom);
 630   } else {
 631     // TargetInfo::CharPtrBuiltinVaList
 632     setOperationAction(ISD::VAARG           , MVT::Other, Expand);
 633     setOperationAction(ISD::VACOPY          , MVT::Other, Expand);
 634   }
 635
 636   setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
 637   setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
 638
 639   setOperationAction(ISD::DYNAMIC_STACKALLOC, getPointerTy(), Custom);
 640
 641   if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) {
 642     // f32 and f64 use SSE.
 643     // Set up the FP register classes.
 644     addRegisterClass(MVT::f32, &X86::FR32RegClass);
 645     addRegisterClass(MVT::f64, &X86::FR64RegClass);
 646
 647     // Use ANDPD to simulate FABS.
 648     setOperationAction(ISD::FABS , MVT::f64, Custom);
 649     setOperationAction(ISD::FABS , MVT::f32, Custom);
 650
 651     // Use XORP to simulate FNEG.
 652     setOperationAction(ISD::FNEG , MVT::f64, Custom);
 653     setOperationAction(ISD::FNEG , MVT::f32, Custom);
 654
 655     // Use ANDPD and ORPD to simulate FCOPYSIGN.
 656     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
 657     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
 658
 659     // Lower this to FGETSIGNx86 plus an AND.
 660     setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
 661     setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
 662
 663     // We don't support sin/cos/fmod
 664     setOperationAction(ISD::FSIN   , MVT::f64, Expand);
 665     setOperationAction(ISD::FCOS   , MVT::f64, Expand);
 666     setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
 667     setOperationAction(ISD::FSIN   , MVT::f32, Expand);
 668     setOperationAction(ISD::FCOS   , MVT::f32, Expand);
 669     setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
 670
 671     // Expand FP immediates into loads from the stack, except for the special
 672     // cases we handle.
 673     addLegalFPImmediate(APFloat(+0.0)); // xorpd
 674     addLegalFPImmediate(APFloat(+0.0f)); // xorps
 675   } else if (!TM.Options.UseSoftFloat && X86ScalarSSEf32) {
 676     // Use SSE for f32, x87 for f64.
 677     // Set up the FP register classes.
 678     addRegisterClass(MVT::f32, &X86::FR32RegClass);
 679     addRegisterClass(MVT::f64, &X86::RFP64RegClass);
 680
 681     // Use ANDPS to simulate FABS.
 682     setOperationAction(ISD::FABS , MVT::f32, Custom);
 683
 684     // Use XORP to simulate FNEG.
 685     setOperationAction(ISD::FNEG , MVT::f32, Custom);
 686
 687     setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
 688
 689     // Use ANDPS and ORPS to simulate FCOPYSIGN.
 690     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
 691     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
 692
 693     // We don't support sin/cos/fmod
 694     setOperationAction(ISD::FSIN   , MVT::f32, Expand);
 695     setOperationAction(ISD::FCOS   , MVT::f32, Expand);
 696     setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
 697
 698     // Special cases we handle for FP constants.
 699     addLegalFPImmediate(APFloat(+0.0f)); // xorps
 700     addLegalFPImmediate(APFloat(+0.0)); // FLD0
 701     addLegalFPImmediate(APFloat(+1.0)); // FLD1
 702     addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
 703     addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
 704
 705     if (!TM.Options.UnsafeFPMath) {
 706       setOperationAction(ISD::FSIN   , MVT::f64, Expand);
 707       setOperationAction(ISD::FCOS   , MVT::f64, Expand);
 708       setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
 709     }
 710   } else if (!TM.Options.UseSoftFloat) {
 711     // f32 and f64 in x87.
 712     // Set up the FP register classes.
 713     addRegisterClass(MVT::f64, &X86::RFP64RegClass);
 714     addRegisterClass(MVT::f32, &X86::RFP32RegClass);
 715
 716     setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
 717     setOperationAction(ISD::UNDEF,     MVT::f32, Expand);
 718     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
 719     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
 720
 721     if (!TM.Options.UnsafeFPMath) {
 722       setOperationAction(ISD::FSIN   , MVT::f64, Expand);
 723       setOperationAction(ISD::FSIN   , MVT::f32, Expand);
 724       setOperationAction(ISD::FCOS   , MVT::f64, Expand);
 725       setOperationAction(ISD::FCOS   , MVT::f32, Expand);
 726       setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
 727       setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
 728     }
 729     addLegalFPImmediate(APFloat(+0.0)); // FLD0
 730     addLegalFPImmediate(APFloat(+1.0)); // FLD1
 731     addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
 732     addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
 733     addLegalFPImmediate(APFloat(+0.0f)); // FLD0
 734     addLegalFPImmediate(APFloat(+1.0f)); // FLD1
 735     addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
 736     addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
 737   }
 738
 739   // We don't support FMA.
 740   setOperationAction(ISD::FMA, MVT::f64, Expand);
 741   setOperationAction(ISD::FMA, MVT::f32, Expand);
 742
 743   // Long double always uses X87.
 744   if (!TM.Options.UseSoftFloat) {
 745     addRegisterClass(MVT::f80, &X86::RFP80RegClass);
 746     setOperationAction(ISD::UNDEF,     MVT::f80, Expand);
 747     setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
 748     {
 749       APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended);
 750       addLegalFPImmediate(TmpFlt);  // FLD0
 751       TmpFlt.changeSign();
 752       addLegalFPImmediate(TmpFlt);  // FLD0/FCHS
 753
 754       bool ignored;
 755       APFloat TmpFlt2(+1.0);
 756       TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
 757                       &ignored);
 758       addLegalFPImmediate(TmpFlt2);  // FLD1
 759       TmpFlt2.changeSign();
 760       addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
 761     }
 762
 763     if (!TM.Options.UnsafeFPMath) {
 764       setOperationAction(ISD::FSIN   , MVT::f80, Expand);
 765       setOperationAction(ISD::FCOS   , MVT::f80, Expand);
 766       setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
 767     }
 768
 769     setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
 770     setOperationAction(ISD::FCEIL,  MVT::f80, Expand);
 771     setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
 772     setOperationAction(ISD::FRINT,  MVT::f80, Expand);
 773     setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
 774     setOperationAction(ISD::FMA, MVT::f80, Expand);
 775   }
 776
 777   // Always use a library call for pow.
 778   setOperationAction(ISD::FPOW             , MVT::f32  , Expand);
 779   setOperationAction(ISD::FPOW             , MVT::f64  , Expand);
 780   setOperationAction(ISD::FPOW             , MVT::f80  , Expand);
 781
 782   setOperationAction(ISD::FLOG, MVT::f80, Expand);
 783   setOperationAction(ISD::FLOG2, MVT::f80, Expand);
 784   setOperationAction(ISD::FLOG10, MVT::f80, Expand);
 785   setOperationAction(ISD::FEXP, MVT::f80, Expand);
 786   setOperationAction(ISD::FEXP2, MVT::f80, Expand);
 787   setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
 788   setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
 789
 790   // First set operation action for all vector types to either promote
 791   // (for widening) or expand (for scalarization). Then we will selectively
 792   // turn on ones that can be effectively codegen'd.
 793   for (MVT VT : MVT::vector_valuetypes()) {
 794     setOperationAction(ISD::ADD , VT, Expand);
 795     setOperationAction(ISD::SUB , VT, Expand);
 796     setOperationAction(ISD::FADD, VT, Expand);
 797     setOperationAction(ISD::FNEG, VT, Expand);
 798     setOperationAction(ISD::FSUB, VT, Expand);
 799     setOperationAction(ISD::MUL , VT, Expand);
 800     setOperationAction(ISD::FMUL, VT, Expand);
 801     setOperationAction(ISD::SDIV, VT, Expand);
 802     setOperationAction(ISD::UDIV, VT, Expand);
 803     setOperationAction(ISD::FDIV, VT, Expand);
 804     setOperationAction(ISD::SREM, VT, Expand);
 805     setOperationAction(ISD::UREM, VT, Expand);
 806     setOperationAction(ISD::LOAD, VT, Expand);
 807     setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
 808     setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
 809     setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
 810     setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
 811     setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
 812     setOperationAction(ISD::FABS, VT, Expand);
 813     setOperationAction(ISD::FSIN, VT, Expand);
 814     setOperationAction(ISD::FSINCOS, VT, Expand);
 815     setOperationAction(ISD::FCOS, VT, Expand);
 816     setOperationAction(ISD::FSINCOS, VT, Expand);
 817     setOperationAction(ISD::FREM, VT, Expand);
 818     setOperationAction(ISD::FMA,  VT, Expand);
 819     setOperationAction(ISD::FPOWI, VT, Expand);
 820     setOperationAction(ISD::FSQRT, VT, Expand);
 821     setOperationAction(ISD::FCOPYSIGN, VT, Expand);
 822     setOperationAction(ISD::FFLOOR, VT, Expand);
 823     setOperationAction(ISD::FCEIL, VT, Expand);
 824     setOperationAction(ISD::FTRUNC, VT, Expand);
 825     setOperationAction(ISD::FRINT, VT, Expand);
 826     setOperationAction(ISD::FNEARBYINT, VT, Expand);
 827     setOperationAction(ISD::SMUL_LOHI, VT, Expand);
 828     setOperationAction(ISD::MULHS, VT, Expand);
 829     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
 830     setOperationAction(ISD::MULHU, VT, Expand);
 831     setOperationAction(ISD::SDIVREM, VT, Expand);
 832     setOperationAction(ISD::UDIVREM, VT, Expand);
 833     setOperationAction(ISD::FPOW, VT, Expand);
 834     setOperationAction(ISD::CTPOP, VT, Expand);
 835     setOperationAction(ISD::CTTZ, VT, Expand);
 836     setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
 837     setOperationAction(ISD::CTLZ, VT, Expand);
 838     setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
 839     setOperationAction(ISD::SHL, VT, Expand);
 840     setOperationAction(ISD::SRA, VT, Expand);
 841     setOperationAction(ISD::SRL, VT, Expand);
 842     setOperationAction(ISD::ROTL, VT, Expand);
 843     setOperationAction(ISD::ROTR, VT, Expand);
 844     setOperationAction(ISD::BSWAP, VT, Expand);
 845     setOperationAction(ISD::SETCC, VT, Expand);
 846     setOperationAction(ISD::FLOG, VT, Expand);
 847     setOperationAction(ISD::FLOG2, VT, Expand);
 848     setOperationAction(ISD::FLOG10, VT, Expand);
 849     setOperationAction(ISD::FEXP, VT, Expand);
 850     setOperationAction(ISD::FEXP2, VT, Expand);
 851     setOperationAction(ISD::FP_TO_UINT, VT, Expand);
 852     setOperationAction(ISD::FP_TO_SINT, VT, Expand);
 853     setOperationAction(ISD::UINT_TO_FP, VT, Expand);
 854     setOperationAction(ISD::SINT_TO_FP, VT, Expand);
 855     setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
 856     setOperationAction(ISD::TRUNCATE, VT, Expand);
 857     setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
 858     setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
 859     setOperationAction(ISD::ANY_EXTEND, VT, Expand);
 860     setOperationAction(ISD::VSELECT, VT, Expand);
 861     setOperationAction(ISD::SELECT_CC, VT, Expand);
 862     for (MVT InnerVT : MVT::vector_valuetypes()) {
 863       setTruncStoreAction(InnerVT, VT, Expand);
 864
 865       setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
 866       setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
 867
 868       // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
 869       // types, we have to deal with them whether we ask for Expansion or not.
 870       // Setting Expand causes its own optimisation problems though, so leave
 871       // them legal.
 872       if (VT.getVectorElementType() == MVT::i1)
 873         setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
 874     }
 875   }
 876
 877   // FIXME: In order to prevent SSE instructions being expanded to MMX ones
 878   // with -msoft-float, disable use of MMX as well.
 879   if (!TM.Options.UseSoftFloat && Subtarget->hasMMX()) {
 880     addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
 881     // No operations on x86mmx supported, everything uses intrinsics.
 882   }
 883
 884   // MMX-sized vectors (other than x86mmx) are expected to be expanded
 885   // into smaller operations.
 886   setOperationAction(ISD::MULHS,              MVT::v8i8,  Expand);
 887   setOperationAction(ISD::MULHS,              MVT::v4i16, Expand);
 888   setOperationAction(ISD::MULHS,              MVT::v2i32, Expand);
 889   setOperationAction(ISD::MULHS,              MVT::v1i64, Expand);
 890   setOperationAction(ISD::AND,                MVT::v8i8,  Expand);
 891   setOperationAction(ISD::AND,                MVT::v4i16, Expand);
 892   setOperationAction(ISD::AND,                MVT::v2i32, Expand);
 893   setOperationAction(ISD::AND,                MVT::v1i64, Expand);
 894   setOperationAction(ISD::OR,                 MVT::v8i8,  Expand);
 895   setOperationAction(ISD::OR,                 MVT::v4i16, Expand);
 896   setOperationAction(ISD::OR,                 MVT::v2i32, Expand);
 897   setOperationAction(ISD::OR,                 MVT::v1i64, Expand);
 898   setOperationAction(ISD::XOR,                MVT::v8i8,  Expand);
 899   setOperationAction(ISD::XOR,                MVT::v4i16, Expand);
 900   setOperationAction(ISD::XOR,                MVT::v2i32, Expand);
 901   setOperationAction(ISD::XOR,                MVT::v1i64, Expand);
 902   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i8,  Expand);
 903   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v4i16, Expand);
 904   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v2i32, Expand);
 905   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v1i64, Expand);
 906   setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v1i64, Expand);
 907   setOperationAction(ISD::SELECT,             MVT::v8i8,  Expand);
 908   setOperationAction(ISD::SELECT,             MVT::v4i16, Expand);
 909   setOperationAction(ISD::SELECT,             MVT::v2i32, Expand);
 910   setOperationAction(ISD::SELECT,             MVT::v1i64, Expand);
 911   setOperationAction(ISD::BITCAST,            MVT::v8i8,  Expand);
 912   setOperationAction(ISD::BITCAST,            MVT::v4i16, Expand);
 913   setOperationAction(ISD::BITCAST,            MVT::v2i32, Expand);
 914   setOperationAction(ISD::BITCAST,            MVT::v1i64, Expand);
 915
 916   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE1()) {
 917     addRegisterClass(MVT::v4f32, &X86::VR128RegClass);
 918
 919     setOperationAction(ISD::FADD,               MVT::v4f32, Legal);
 920     setOperationAction(ISD::FSUB,               MVT::v4f32, Legal);
 921     setOperationAction(ISD::FMUL,               MVT::v4f32, Legal);
 922     setOperationAction(ISD::FDIV,               MVT::v4f32, Legal);
 923     setOperationAction(ISD::FSQRT,              MVT::v4f32, Legal);
 924     setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
 925     setOperationAction(ISD::FABS,               MVT::v4f32, Custom);
 926     setOperationAction(ISD::LOAD,               MVT::v4f32, Legal);
 927     setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
 928     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
 929     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
 930     setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
 931     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Custom);
 932   }
 933
 934   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE2()) {
 935     addRegisterClass(MVT::v2f64, &X86::VR128RegClass);
 936
 937     // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
 938     // registers cannot be used even for integer operations.
 939     addRegisterClass(MVT::v16i8, &X86::VR128RegClass);
 940     addRegisterClass(MVT::v8i16, &X86::VR128RegClass);
 941     addRegisterClass(MVT::v4i32, &X86::VR128RegClass);
 942     addRegisterClass(MVT::v2i64, &X86::VR128RegClass);
 943
 944     setOperationAction(ISD::ADD,                MVT::v16i8, Legal);
 945     setOperationAction(ISD::ADD,                MVT::v8i16, Legal);
 946     setOperationAction(ISD::ADD,                MVT::v4i32, Legal);
 947     setOperationAction(ISD::ADD,                MVT::v2i64, Legal);
 948     setOperationAction(ISD::MUL,                MVT::v4i32, Custom);
 949     setOperationAction(ISD::MUL,                MVT::v2i64, Custom);
 950     setOperationAction(ISD::UMUL_LOHI,          MVT::v4i32, Custom);
 951     setOperationAction(ISD::SMUL_LOHI,          MVT::v4i32, Custom);
 952     setOperationAction(ISD::MULHU,              MVT::v8i16, Legal);
 953     setOperationAction(ISD::MULHS,              MVT::v8i16, Legal);
 954     setOperationAction(ISD::SUB,                MVT::v16i8, Legal);
 955     setOperationAction(ISD::SUB,                MVT::v8i16, Legal);
 956     setOperationAction(ISD::SUB,                MVT::v4i32, Legal);
 957     setOperationAction(ISD::SUB,                MVT::v2i64, Legal);
 958     setOperationAction(ISD::MUL,                MVT::v8i16, Legal);
 959     setOperationAction(ISD::FADD,               MVT::v2f64, Legal);
 960     setOperationAction(ISD::FSUB,               MVT::v2f64, Legal);
 961     setOperationAction(ISD::FMUL,               MVT::v2f64, Legal);
 962     setOperationAction(ISD::FDIV,               MVT::v2f64, Legal);
 963     setOperationAction(ISD::FSQRT,              MVT::v2f64, Legal);
 964     setOperationAction(ISD::FNEG,               MVT::v2f64, Custom);
 965     setOperationAction(ISD::FABS,               MVT::v2f64, Custom);
 966
 967     setOperationAction(ISD::SETCC,              MVT::v2i64, Custom);
 968     setOperationAction(ISD::SETCC,              MVT::v16i8, Custom);
 969     setOperationAction(ISD::SETCC,              MVT::v8i16, Custom);
 970     setOperationAction(ISD::SETCC,              MVT::v4i32, Custom);
 971
 972     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v16i8, Custom);
 973     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i16, Custom);
 974     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
 975     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
 976     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
 977
 978     // Only provide customized ctpop vector bit twiddling for vector types we
 979     // know to perform better than using the popcnt instructions on each vector
 980     // element. If popcnt isn't supported, always provide the custom version.
 981     if (!Subtarget->hasPOPCNT()) {
 982       setOperationAction(ISD::CTPOP,            MVT::v4i32, Custom);
 983       setOperationAction(ISD::CTPOP,            MVT::v2i64, Custom);
 984     }
 985
 986     // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
 987     for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
 988       MVT VT = (MVT::SimpleValueType)i;
 989       // Do not attempt to custom lower non-power-of-2 vectors
 990       if (!isPowerOf2_32(VT.getVectorNumElements()))
 991         continue;
 992       // Do not attempt to custom lower non-128-bit vectors
 993       if (!VT.is128BitVector())
 994         continue;
 995       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
 996       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
 997       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
 998     }
 999
1000     // We support custom legalizing of sext and anyext loads for specific
1001     // memory vector types which we can load as a scalar (or sequence of
1002     // scalars) and extend in-register to a legal 128-bit vector type. For sext
1003     // loads these must work with a single scalar load.
1004     for (MVT VT : MVT::integer_vector_valuetypes()) {
1005       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom);
1006       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom);
1007       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom);
1008       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom);
1009       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom);
1010       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom);
1011       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom);
1012       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom);
1013       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom);
1014     }
1015
1016     setOperationAction(ISD::BUILD_VECTOR,       MVT::v2f64, Custom);
1017     setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i64, Custom);
1018     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2f64, Custom);
1019     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i64, Custom);
1020     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2f64, Custom);
1021     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
1022
1023     if (Subtarget->is64Bit()) {
1024       setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
1025       setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
1026     }
1027
1028     // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
1029     for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
1030       MVT VT = (MVT::SimpleValueType)i;
1031
1032       // Do not attempt to promote non-128-bit vectors
1033       if (!VT.is128BitVector())
1034         continue;
1035
1036       setOperationAction(ISD::AND,    VT, Promote);
1037       AddPromotedToType (ISD::AND,    VT, MVT::v2i64);
1038       setOperationAction(ISD::OR,     VT, Promote);
1039       AddPromotedToType (ISD::OR,     VT, MVT::v2i64);
1040       setOperationAction(ISD::XOR,    VT, Promote);
1041       AddPromotedToType (ISD::XOR,    VT, MVT::v2i64);
1042       setOperationAction(ISD::LOAD,   VT, Promote);
1043       AddPromotedToType (ISD::LOAD,   VT, MVT::v2i64);
1044       setOperationAction(ISD::SELECT, VT, Promote);
1045       AddPromotedToType (ISD::SELECT, VT, MVT::v2i64);
1046     }
1047
1048     // Custom lower v2i64 and v2f64 selects.
1049     setOperationAction(ISD::LOAD,               MVT::v2f64, Legal);
1050     setOperationAction(ISD::LOAD,               MVT::v2i64, Legal);
1051     setOperationAction(ISD::SELECT,             MVT::v2f64, Custom);
1052     setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);
1053
1054     setOperationAction(ISD::FP_TO_SINT,         MVT::v4i32, Legal);
1055     setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Legal);
1056
1057     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i8,  Custom);
1058     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i16, Custom);
1059     // As there is no 64-bit GPR available, we need to build a special custom
1060     // sequence to convert from v2i32 to v2f32.
1061     if (!Subtarget->is64Bit())
1062       setOperationAction(ISD::UINT_TO_FP,       MVT::v2f32, Custom);
1063
1064     setOperationAction(ISD::FP_EXTEND,          MVT::v2f32, Custom);
1065     setOperationAction(ISD::FP_ROUND,           MVT::v2f32, Custom);
1066
1067     for (MVT VT : MVT::fp_vector_valuetypes())
1068       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);
1069
1070     setOperationAction(ISD::BITCAST,            MVT::v2i32, Custom);
1071     setOperationAction(ISD::BITCAST,            MVT::v4i16, Custom);
1072     setOperationAction(ISD::BITCAST,            MVT::v8i8,  Custom);
1073   }
1074
1075   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE41()) {
1076     setOperationAction(ISD::FFLOOR,             MVT::f32,   Legal);
1077     setOperationAction(ISD::FCEIL,              MVT::f32,   Legal);
1078     setOperationAction(ISD::FTRUNC,             MVT::f32,   Legal);
1079     setOperationAction(ISD::FRINT,              MVT::f32,   Legal);
1080     setOperationAction(ISD::FNEARBYINT,         MVT::f32,   Legal);
1081     setOperationAction(ISD::FFLOOR,             MVT::f64,   Legal);
1082     setOperationAction(ISD::FCEIL,              MVT::f64,   Legal);
1083     setOperationAction(ISD::FTRUNC,             MVT::f64,   Legal);
1084     setOperationAction(ISD::FRINT,              MVT::f64,   Legal);
1085     setOperationAction(ISD::FNEARBYINT,         MVT::f64,   Legal);
1086
1087     setOperationAction(ISD::FFLOOR,             MVT::v4f32, Legal);
1088     setOperationAction(ISD::FCEIL,              MVT::v4f32, Legal);
1089     setOperationAction(ISD::FTRUNC,             MVT::v4f32, Legal);
1090     setOperationAction(ISD::FRINT,              MVT::v4f32, Legal);
1091     setOperationAction(ISD::FNEARBYINT,         MVT::v4f32, Legal);
1092     setOperationAction(ISD::FFLOOR,             MVT::v2f64, Legal);
1093     setOperationAction(ISD::FCEIL,              MVT::v2f64, Legal);
1094     setOperationAction(ISD::FTRUNC,             MVT::v2f64, Legal);
1095     setOperationAction(ISD::FRINT,              MVT::v2f64, Legal);
1096     setOperationAction(ISD::FNEARBYINT,         MVT::v2f64, Legal);
1097
1098     // FIXME: Do we need to handle scalar-to-vector here?
1099     setOperationAction(ISD::MUL,                MVT::v4i32, Legal);
1100
1101     setOperationAction(ISD::VSELECT,            MVT::v2f64, Custom);
1102     setOperationAction(ISD::VSELECT,            MVT::v2i64, Custom);
1103     setOperationAction(ISD::VSELECT,            MVT::v4i32, Custom);
1104     setOperationAction(ISD::VSELECT,            MVT::v4f32, Custom);
1105     setOperationAction(ISD::VSELECT,            MVT::v8i16, Custom);
1106     // There is no BLENDI for byte vectors. We don't need to custom lower
1107     // some vselects for now.
1108     setOperationAction(ISD::VSELECT,            MVT::v16i8, Legal);
1109
1110     // SSE41 brings specific instructions for doing vector sign extend even in
1111     // cases where we don't have SRA.
1112     for (MVT VT : MVT::integer_vector_valuetypes()) {
1113       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
1114       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
1115       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
1116     }
1117
1118     // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
1119     setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, MVT::v8i8,  Legal);
1120     setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8,  Legal);
1121     setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i8,  Legal);
1122     setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
1123     setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
1124     setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
1125
1126     setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i16, MVT::v8i8,  Legal);
1127     setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8,  Legal);
1128     setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i8,  Legal);
1129     setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
1130     setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
1131     setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
1132
1133     // i8 and i16 vectors are custom because the source register and
1134     // source memory operand types are not the same width.  f32 vectors are
1135     // custom since the immediate controlling the insert encodes additional
1136     // information.
1137     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i8, Custom);
1138     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
1139     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
1140     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
1141
1142     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
1143     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
1144     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
1145     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
1146
1147     // FIXME: these should be Legal, but that's only for the case where
1148     // the index is constant.  For now custom expand to deal with that.
1149     if (Subtarget->is64Bit()) {
1150       setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
1151       setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
1152     }
1153   }
1154
1155   if (Subtarget->hasSSE2()) {
1156     setOperationAction(ISD::SRL,               MVT::v8i16, Custom);
1157     setOperationAction(ISD::SRL,               MVT::v16i8, Custom);
1158
1159     setOperationAction(ISD::SHL,               MVT::v8i16, Custom);
1160     setOperationAction(ISD::SHL,               MVT::v16i8, Custom);
1161
1162     setOperationAction(ISD::SRA,               MVT::v8i16, Custom);
1163     setOperationAction(ISD::SRA,               MVT::v16i8, Custom);
1164
1165     // In the customized shift lowering, the legal cases in AVX2 will be
1166     // recognized.
1167     setOperationAction(ISD::SRL,               MVT::v2i64, Custom);
1168     setOperationAction(ISD::SRL,               MVT::v4i32, Custom);
1169
1170     setOperationAction(ISD::SHL,               MVT::v2i64, Custom);
1171     setOperationAction(ISD::SHL,               MVT::v4i32, Custom);
1172
1173     setOperationAction(ISD::SRA,               MVT::v4i32, Custom);
1174   }
1175
1176   if (!TM.Options.UseSoftFloat && Subtarget->hasFp256()) {
1177     addRegisterClass(MVT::v32i8,  &X86::VR256RegClass);
1178     addRegisterClass(MVT::v16i16, &X86::VR256RegClass);
1179     addRegisterClass(MVT::v8i32,  &X86::VR256RegClass);
1180     addRegisterClass(MVT::v8f32,  &X86::VR256RegClass);
1181     addRegisterClass(MVT::v4i64,  &X86::VR256RegClass);
1182     addRegisterClass(MVT::v4f64,  &X86::VR256RegClass);
1183
1184     setOperationAction(ISD::LOAD,               MVT::v8f32, Legal);
1185     setOperationAction(ISD::LOAD,               MVT::v4f64, Legal);
1186     setOperationAction(ISD::LOAD,               MVT::v4i64, Legal);
1187
1188     setOperationAction(ISD::FADD,               MVT::v8f32, Legal);
1189     setOperationAction(ISD::FSUB,               MVT::v8f32, Legal);
1190     setOperationAction(ISD::FMUL,               MVT::v8f32, Legal);
1191     setOperationAction(ISD::FDIV,               MVT::v8f32, Legal);
1192     setOperationAction(ISD::FSQRT,              MVT::v8f32, Legal);
1193     setOperationAction(ISD::FFLOOR,             MVT::v8f32, Legal);
1194     setOperationAction(ISD::FCEIL,              MVT::v8f32, Legal);
1195     setOperationAction(ISD::FTRUNC,             MVT::v8f32, Legal);
1196     setOperationAction(ISD::FRINT,              MVT::v8f32, Legal);
1197     setOperationAction(ISD::FNEARBYINT,         MVT::v8f32, Legal);
1198     setOperationAction(ISD::FNEG,               MVT::v8f32, Custom);
1199     setOperationAction(ISD::FABS,               MVT::v8f32, Custom);
1200
1201     setOperationAction(ISD::FADD,               MVT::v4f64, Legal);
1202     setOperationAction(ISD::FSUB,               MVT::v4f64, Legal);
1203     setOperationAction(ISD::FMUL,               MVT::v4f64, Legal);
1204     setOperationAction(ISD::FDIV,               MVT::v4f64, Legal);
1205     setOperationAction(ISD::FSQRT,              MVT::v4f64, Legal);
1206     setOperationAction(ISD::FFLOOR,             MVT::v4f64, Legal);
1207     setOperationAction(ISD::FCEIL,              MVT::v4f64, Legal);
1208     setOperationAction(ISD::FTRUNC,             MVT::v4f64, Legal);
1209     setOperationAction(ISD::FRINT,              MVT::v4f64, Legal);
1210     setOperationAction(ISD::FNEARBYINT,         MVT::v4f64, Legal);
1211     setOperationAction(ISD::FNEG,               MVT::v4f64, Custom);
1212     setOperationAction(ISD::FABS,               MVT::v4f64, Custom);
1213
1214     // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1215     // even though v8i16 is a legal type.
1216     setOperationAction(ISD::FP_TO_SINT,         MVT::v8i16, Promote);
1217     setOperationAction(ISD::FP_TO_UINT,         MVT::v8i16, Promote);
1218     setOperationAction(ISD::FP_TO_SINT,         MVT::v8i32, Legal);
1219
1220     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i16, Promote);
1221     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i32, Legal);
1222     setOperationAction(ISD::FP_ROUND,           MVT::v4f32, Legal);
1223
1224     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i8,  Custom);
1225     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i16, Custom);
1226
1227     for (MVT VT : MVT::fp_vector_valuetypes())
1228       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);
1229
1230     setOperationAction(ISD::SRL,               MVT::v16i16, Custom);
1231     setOperationAction(ISD::SRL,               MVT::v32i8, Custom);
1232
1233     setOperationAction(ISD::SHL,               MVT::v16i16, Custom);
1234     setOperationAction(ISD::SHL,               MVT::v32i8, Custom);
1235
1236     setOperationAction(ISD::SRA,               MVT::v16i16, Custom);
1237     setOperationAction(ISD::SRA,               MVT::v32i8, Custom);
1238
1239     setOperationAction(ISD::SETCC,             MVT::v32i8, Custom);
1240     setOperationAction(ISD::SETCC,             MVT::v16i16, Custom);
1241     setOperationAction(ISD::SETCC,             MVT::v8i32, Custom);
1242     setOperationAction(ISD::SETCC,             MVT::v4i64, Custom);
1243
1244     setOperationAction(ISD::SELECT,            MVT::v4f64, Custom);
1245     setOperationAction(ISD::SELECT,            MVT::v4i64, Custom);
1246     setOperationAction(ISD::SELECT,            MVT::v8f32, Custom);
1247
1248     setOperationAction(ISD::VSELECT,           MVT::v4f64, Custom);
1249     setOperationAction(ISD::VSELECT,           MVT::v4i64, Custom);
1250     setOperationAction(ISD::VSELECT,           MVT::v8i32, Custom);
1251     setOperationAction(ISD::VSELECT,           MVT::v8f32, Custom);
1252
1253     setOperationAction(ISD::SIGN_EXTEND,       MVT::v4i64, Custom);
1254     setOperationAction(ISD::SIGN_EXTEND,       MVT::v8i32, Custom);
1255     setOperationAction(ISD::SIGN_EXTEND,       MVT::v16i16, Custom);
1256     setOperationAction(ISD::ZERO_EXTEND,       MVT::v4i64, Custom);
1257     setOperationAction(ISD::ZERO_EXTEND,       MVT::v8i32, Custom);
1258     setOperationAction(ISD::ZERO_EXTEND,       MVT::v16i16, Custom);
1259     setOperationAction(ISD::ANY_EXTEND,        MVT::v4i64, Custom);
1260     setOperationAction(ISD::ANY_EXTEND,        MVT::v8i32, Custom);
1261     setOperationAction(ISD::ANY_EXTEND,        MVT::v16i16, Custom);
1262     setOperationAction(ISD::TRUNCATE,          MVT::v16i8, Custom);
1263     setOperationAction(ISD::TRUNCATE,          MVT::v8i16, Custom);
1264     setOperationAction(ISD::TRUNCATE,          MVT::v4i32, Custom);
1265
1266     if (Subtarget->hasFMA() || Subtarget->hasFMA4()) {
1267       setOperationAction(ISD::FMA,             MVT::v8f32, Legal);
1268       setOperationAction(ISD::FMA,             MVT::v4f64, Legal);
1269       setOperationAction(ISD::FMA,             MVT::v4f32, Legal);
1270       setOperationAction(ISD::FMA,             MVT::v2f64, Legal);
1271       setOperationAction(ISD::FMA,             MVT::f32, Legal);
1272       setOperationAction(ISD::FMA,             MVT::f64, Legal);
1273     }
1274
1275     if (Subtarget->hasInt256()) {
1276       setOperationAction(ISD::ADD,             MVT::v4i64, Legal);
1277       setOperationAction(ISD::ADD,             MVT::v8i32, Legal);
1278       setOperationAction(ISD::ADD,             MVT::v16i16, Legal);
1279       setOperationAction(ISD::ADD,             MVT::v32i8, Legal);
1280
1281       setOperationAction(ISD::SUB,             MVT::v4i64, Legal);
1282       setOperationAction(ISD::SUB,             MVT::v8i32, Legal);
1283       setOperationAction(ISD::SUB,             MVT::v16i16, Legal);
1284       setOperationAction(ISD::SUB,             MVT::v32i8, Legal);
1285
1286       setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
1287       setOperationAction(ISD::MUL,             MVT::v8i32, Legal);
1288       setOperationAction(ISD::MUL,             MVT::v16i16, Legal);
1289       // Don't lower v32i8 because there is no 128-bit byte mul
1290
1291       setOperationAction(ISD::UMUL_LOHI,       MVT::v8i32, Custom);
1292       setOperationAction(ISD::SMUL_LOHI,       MVT::v8i32, Custom);
1293       setOperationAction(ISD::MULHU,           MVT::v16i16, Legal);
1294       setOperationAction(ISD::MULHS,           MVT::v16i16, Legal);
1295
1296       setOperationAction(ISD::VSELECT,         MVT::v16i16, Custom);
1297       setOperationAction(ISD::VSELECT,         MVT::v32i8, Legal);
1298
1299       // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1300       // when we have a 256bit-wide blend with immediate.
1301       setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
1302
1303       // Only provide customized ctpop vector bit twiddling for vector types we
1304       // know to perform better than using the popcnt instructions on each
1305       // vector element. If popcnt isn't supported, always provide the custom
1306       // version.
1307       if (!Subtarget->hasPOPCNT())
1308         setOperationAction(ISD::CTPOP,           MVT::v4i64, Custom);
1309
1310       // Custom CTPOP always performs better on natively supported v8i32
1311       setOperationAction(ISD::CTPOP,             MVT::v8i32, Custom);
1312
1313       // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1314       setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
1315       setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32,  MVT::v8i8,  Legal);
1316       setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64,  MVT::v4i8,  Legal);
1317       setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32,  MVT::v8i16, Legal);
1318       setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64,  MVT::v4i16, Legal);
1319       setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64,  MVT::v4i32, Legal);
1320
1321       setLoadExtAction(ISD::ZEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
1322       setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32,  MVT::v8i8,  Legal);
1323       setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64,  MVT::v4i8,  Legal);
1324       setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32,  MVT::v8i16, Legal);
1325       setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64,  MVT::v4i16, Legal);
1326       setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64,  MVT::v4i32, Legal);
1327     } else {
1328       setOperationAction(ISD::ADD,             MVT::v4i64, Custom);
1329       setOperationAction(ISD::ADD,             MVT::v8i32, Custom);
1330       setOperationAction(ISD::ADD,             MVT::v16i16, Custom);
1331       setOperationAction(ISD::ADD,             MVT::v32i8, Custom);
1332
1333       setOperationAction(ISD::SUB,             MVT::v4i64, Custom);
1334       setOperationAction(ISD::SUB,             MVT::v8i32, Custom);
1335       setOperationAction(ISD::SUB,             MVT::v16i16, Custom);
1336       setOperationAction(ISD::SUB,             MVT::v32i8, Custom);
1337
1338       setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
1339       setOperationAction(ISD::MUL,             MVT::v8i32, Custom);
1340       setOperationAction(ISD::MUL,             MVT::v16i16, Custom);
1341       // Don't lower v32i8 because there is no 128-bit byte mul
1342     }
1343
1344     // In the customized shift lowering, the legal cases in AVX2 will be
1345     // recognized.
1346     setOperationAction(ISD::SRL,               MVT::v4i64, Custom);
1347     setOperationAction(ISD::SRL,               MVT::v8i32, Custom);
1348
1349     setOperationAction(ISD::SHL,               MVT::v4i64, Custom);
1350     setOperationAction(ISD::SHL,               MVT::v8i32, Custom);
1351
1352     setOperationAction(ISD::SRA,               MVT::v8i32, Custom);
1353
1354     // Custom lower several nodes for 256-bit types.
1355     for (MVT VT : MVT::vector_valuetypes()) {
1356       if (VT.getScalarSizeInBits() >= 32) {
1357         setOperationAction(ISD::MLOAD,  VT, Legal);
1358         setOperationAction(ISD::MSTORE, VT, Legal);
1359       }
1360       // Extract subvector is special because the value type
1361       // (result) is 128-bit but the source is 256-bit wide.
1362       if (VT.is128BitVector()) {
1363         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1364       }
1365       // Do not attempt to custom lower other non-256-bit vectors
1366       if (!VT.is256BitVector())
1367         continue;
1368
1369       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
1370       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
1371       setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
1372       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1373       setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
1374       setOperationAction(ISD::INSERT_SUBVECTOR,   VT, Custom);
1375       setOperationAction(ISD::CONCAT_VECTORS,     VT, Custom);
1376     }
1377
1378     // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
1379     for (int i = MVT::v32i8; i != MVT::v4i64; ++i) {
1380       MVT VT = (MVT::SimpleValueType)i;
1381
1382       // Do not attempt to promote non-256-bit vectors
1383       if (!VT.is256BitVector())
1384         continue;
1385
1386       setOperationAction(ISD::AND,    VT, Promote);
1387       AddPromotedToType (ISD::AND,    VT, MVT::v4i64);
1388       setOperationAction(ISD::OR,     VT, Promote);
1389       AddPromotedToType (ISD::OR,     VT, MVT::v4i64);
1390       setOperationAction(ISD::XOR,    VT, Promote);
1391       AddPromotedToType (ISD::XOR,    VT, MVT::v4i64);
1392       setOperationAction(ISD::LOAD,   VT, Promote);
1393       AddPromotedToType (ISD::LOAD,   VT, MVT::v4i64);
1394       setOperationAction(ISD::SELECT, VT, Promote);
1395       AddPromotedToType (ISD::SELECT, VT, MVT::v4i64);
1396     }
1397   }
1398
1399   if (!TM.Options.UseSoftFloat && Subtarget->hasAVX512()) {
1400     addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1401     addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1402     addRegisterClass(MVT::v8i64,  &X86::VR512RegClass);
1403     addRegisterClass(MVT::v8f64,  &X86::VR512RegClass);
1404
1405     addRegisterClass(MVT::i1,     &X86::VK1RegClass);
1406     addRegisterClass(MVT::v8i1,   &X86::VK8RegClass);
1407     addRegisterClass(MVT::v16i1,  &X86::VK16RegClass);
1408
1409     for (MVT VT : MVT::fp_vector_valuetypes())
1410       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);
1411
1412     setOperationAction(ISD::BR_CC,              MVT::i1,    Expand);
1413     setOperationAction(ISD::SETCC,              MVT::i1,    Custom);
1414     setOperationAction(ISD::XOR,                MVT::i1,    Legal);
1415     setOperationAction(ISD::OR,                 MVT::i1,    Legal);
1416     setOperationAction(ISD::AND,                MVT::i1,    Legal);
1417     setOperationAction(ISD::LOAD,               MVT::v16f32, Legal);
1418     setOperationAction(ISD::LOAD,               MVT::v8f64, Legal);
1419     setOperationAction(ISD::LOAD,               MVT::v8i64, Legal);
1420     setOperationAction(ISD::LOAD,               MVT::v16i32, Legal);
1421     setOperationAction(ISD::LOAD,               MVT::v16i1, Legal);
1422
1423     setOperationAction(ISD::FADD,               MVT::v16f32, Legal);
1424     setOperationAction(ISD::FSUB,               MVT::v16f32, Legal);
1425     setOperationAction(ISD::FMUL,               MVT::v16f32, Legal);
1426     setOperationAction(ISD::FDIV,               MVT::v16f32, Legal);
1427     setOperationAction(ISD::FSQRT,              MVT::v16f32, Legal);
1428     setOperationAction(ISD::FNEG,               MVT::v16f32, Custom);
1429
1430     setOperationAction(ISD::FADD,               MVT::v8f64, Legal);
1431     setOperationAction(ISD::FSUB,               MVT::v8f64, Legal);
1432     setOperationAction(ISD::FMUL,               MVT::v8f64, Legal);
1433     setOperationAction(ISD::FDIV,               MVT::v8f64, Legal);
1434     setOperationAction(ISD::FSQRT,              MVT::v8f64, Legal);
1435     setOperationAction(ISD::FNEG,               MVT::v8f64, Custom);
1436     setOperationAction(ISD::FMA,                MVT::v8f64, Legal);
1437     setOperationAction(ISD::FMA,                MVT::v16f32, Legal);
1438
1439     setOperationAction(ISD::FP_TO_SINT,         MVT::i32, Legal);
1440     setOperationAction(ISD::FP_TO_UINT,         MVT::i32, Legal);
1441     setOperationAction(ISD::SINT_TO_FP,         MVT::i32, Legal);
1442     setOperationAction(ISD::UINT_TO_FP,         MVT::i32, Legal);
1443     if (Subtarget->is64Bit()) {
1444       setOperationAction(ISD::FP_TO_UINT,       MVT::i64, Legal);
1445       setOperationAction(ISD::FP_TO_SINT,       MVT::i64, Legal);
1446       setOperationAction(ISD::SINT_TO_FP,       MVT::i64, Legal);
1447       setOperationAction(ISD::UINT_TO_FP,       MVT::i64, Legal);
1448     }
1449     setOperationAction(ISD::FP_TO_SINT,         MVT::v16i32, Legal);
1450     setOperationAction(ISD::FP_TO_UINT,         MVT::v16i32, Legal);
1451     setOperationAction(ISD::FP_TO_UINT,         MVT::v8i32, Legal);
1452     setOperationAction(ISD::FP_TO_UINT,         MVT::v4i32, Legal);
1453     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i32, Legal);
1454     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i1,   Custom);
1455     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i1,  Custom);
1456     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i8,  Promote);
1457     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i16, Promote);
1458     setOperationAction(ISD::UINT_TO_FP,         MVT::v16i32, Legal);
1459     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i32, Legal);
1460     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Legal);
1461     setOperationAction(ISD::FP_ROUND,           MVT::v8f32, Legal);
1462     setOperationAction(ISD::FP_EXTEND,          MVT::v8f32, Legal);
1463
1464     setOperationAction(ISD::TRUNCATE,           MVT::i1, Custom);
1465     setOperationAction(ISD::TRUNCATE,           MVT::v16i8, Custom);
1466     setOperationAction(ISD::TRUNCATE,           MVT::v8i32, Custom);
1467     setOperationAction(ISD::TRUNCATE,           MVT::v8i1, Custom);
1468     setOperationAction(ISD::TRUNCATE,           MVT::v16i1, Custom);
1469     setOperationAction(ISD::TRUNCATE,           MVT::v16i16, Custom);
1470     setOperationAction(ISD::ZERO_EXTEND,        MVT::v16i32, Custom);
1471     setOperationAction(ISD::ZERO_EXTEND,        MVT::v8i64, Custom);
1472     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i32, Custom);
1473     setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i64, Custom);
1474     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i8, Custom);
1475     setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i16, Custom);
1476     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i16, Custom);
1477
1478     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8f64,  Custom);
1479     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i64,  Custom);
1480     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16f32,  Custom);
1481     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i32,  Custom);
1482     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i1,    Custom);
1483     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i1, Legal);
1484
1485     setOperationAction(ISD::SETCC,              MVT::v16i1, Custom);
1486     setOperationAction(ISD::SETCC,              MVT::v8i1, Custom);
1487
1488     setOperationAction(ISD::MUL,              MVT::v8i64, Custom);
1489
1490     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1,  Custom);
1491     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom);
1492     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i1, Custom);
1493     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i1, Custom);
1494     setOperationAction(ISD::BUILD_VECTOR,       MVT::v8i1, Custom);
1495     setOperationAction(ISD::BUILD_VECTOR,       MVT::v16i1, Custom);
1496     setOperationAction(ISD::SELECT,             MVT::v8f64, Custom);
1497     setOperationAction(ISD::SELECT,             MVT::v8i64, Custom);
1498     setOperationAction(ISD::SELECT,             MVT::v16f32, Custom);
1499
1500     setOperationAction(ISD::ADD,                MVT::v8i64, Legal);
1501     setOperationAction(ISD::ADD,                MVT::v16i32, Legal);
1502
1503     setOperationAction(ISD::SUB,                MVT::v8i64, Legal);
1504     setOperationAction(ISD::SUB,                MVT::v16i32, Legal);
1505
1506     setOperationAction(ISD::MUL,                MVT::v16i32, Legal);
1507
1508     setOperationAction(ISD::SRL,                MVT::v8i64, Custom);
1509     setOperationAction(ISD::SRL,                MVT::v16i32, Custom);
1510
1511     setOperationAction(ISD::SHL,                MVT::v8i64, Custom);
1512     setOperationAction(ISD::SHL,                MVT::v16i32, Custom);
1513
1514     setOperationAction(ISD::SRA,                MVT::v8i64, Custom);
1515     setOperationAction(ISD::SRA,                MVT::v16i32, Custom);
1516
1517     setOperationAction(ISD::AND,                MVT::v8i64, Legal);
1518     setOperationAction(ISD::OR,                 MVT::v8i64, Legal);
1519     setOperationAction(ISD::XOR,                MVT::v8i64, Legal);
1520     setOperationAction(ISD::AND,                MVT::v16i32, Legal);
1521     setOperationAction(ISD::OR,                 MVT::v16i32, Legal);
1522     setOperationAction(ISD::XOR,                MVT::v16i32, Legal);
1523
1524     if (Subtarget->hasCDI()) {
1525       setOperationAction(ISD::CTLZ,             MVT::v8i64, Legal);
1526       setOperationAction(ISD::CTLZ,             MVT::v16i32, Legal);
1527     }
1528
1529     // Custom lower several nodes.
1530     for (MVT VT : MVT::vector_valuetypes()) {
1531       unsigned EltSize = VT.getVectorElementType().getSizeInBits();
1532       // Extract subvector is special because the value type
1533       // (result) is 256/128-bit but the source is 512-bit wide.
1534       if (VT.is128BitVector() || VT.is256BitVector()) {
1535         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1536       }
1537       if (VT.getVectorElementType() == MVT::i1)
1538         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1539
1540       // Do not attempt to custom lower other non-512-bit vectors
1541       if (!VT.is512BitVector())
1542         continue;
1543
1544       if ( EltSize >= 32) {
1545         setOperationAction(ISD::VECTOR_SHUFFLE,      VT, Custom);
1546         setOperationAction(ISD::INSERT_VECTOR_ELT,   VT, Custom);
1547         setOperationAction(ISD::BUILD_VECTOR,        VT, Custom);
1548         setOperationAction(ISD::VSELECT,             VT, Legal);
1549         setOperationAction(ISD::EXTRACT_VECTOR_ELT,  VT, Custom);
1550         setOperationAction(ISD::SCALAR_TO_VECTOR,    VT, Custom);
1551         setOperationAction(ISD::INSERT_SUBVECTOR,    VT, Custom);
1552         setOperationAction(ISD::MLOAD,               VT, Legal);
1553         setOperationAction(ISD::MSTORE,              VT, Legal);
1554       }
1555     }
1556     for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
1557       MVT VT = (MVT::SimpleValueType)i;
1558
1559       // Do not attempt to promote non-512-bit vectors.
1560       if (!VT.is512BitVector())
1561         continue;
1562
1563       setOperationAction(ISD::SELECT, VT, Promote);
1564       AddPromotedToType (ISD::SELECT, VT, MVT::v8i64);
1565     }
1566   }// has  AVX-512
1567
1568   if (!TM.Options.UseSoftFloat && Subtarget->hasBWI()) {
1569     addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1570     addRegisterClass(MVT::v64i8,  &X86::VR512RegClass);
1571
1572     addRegisterClass(MVT::v32i1,  &X86::VK32RegClass);
1573     addRegisterClass(MVT::v64i1,  &X86::VK64RegClass);
1574
1575     setOperationAction(ISD::LOAD,               MVT::v32i16, Legal);
1576     setOperationAction(ISD::LOAD,               MVT::v64i8, Legal);
1577     setOperationAction(ISD::SETCC,              MVT::v32i1, Custom);
1578     setOperationAction(ISD::SETCC,              MVT::v64i1, Custom);
1579     setOperationAction(ISD::ADD,                MVT::v32i16, Legal);
1580     setOperationAction(ISD::ADD,                MVT::v64i8, Legal);
1581     setOperationAction(ISD::SUB,                MVT::v32i16, Legal);
1582     setOperationAction(ISD::SUB,                MVT::v64i8, Legal);
1583     setOperationAction(ISD::MUL,                MVT::v32i16, Legal);
1584
1585     for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
1586       const MVT VT = (MVT::SimpleValueType)i;
1587
1588       const unsigned EltSize = VT.getVectorElementType().getSizeInBits();
1589
1590       // Do not attempt to promote non-512-bit vectors.
1591       if (!VT.is512BitVector())
1592         continue;
1593
1594       if (EltSize < 32) {
1595         setOperationAction(ISD::BUILD_VECTOR,        VT, Custom);
1596         setOperationAction(ISD::VSELECT,             VT, Legal);
1597       }
1598     }
1599   }
1600
1601   if (!TM.Options.UseSoftFloat && Subtarget->hasVLX()) {
1602     addRegisterClass(MVT::v4i1,   &X86::VK4RegClass);
1603     addRegisterClass(MVT::v2i1,   &X86::VK2RegClass);
1604
1605     setOperationAction(ISD::SETCC,              MVT::v4i1, Custom);
1606     setOperationAction(ISD::SETCC,              MVT::v2i1, Custom);
1607     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v8i1, Legal);
1608
1609     setOperationAction(ISD::AND,                MVT::v8i32, Legal);
1610     setOperationAction(ISD::OR,                 MVT::v8i32, Legal);
1611     setOperationAction(ISD::XOR,                MVT::v8i32, Legal);
1612     setOperationAction(ISD::AND,                MVT::v4i32, Legal);
1613     setOperationAction(ISD::OR,                 MVT::v4i32, Legal);
1614     setOperationAction(ISD::XOR,                MVT::v4i32, Legal);
1615   }
1616
1617   // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion
1618   // of this type with custom code.
1619   for (MVT VT : MVT::vector_valuetypes())
1620     setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Custom);
1621
1622   // We want to custom lower some of our intrinsics.
1623   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
1624   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
1625   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
1626   if (!Subtarget->is64Bit())
1627     setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
1628
1629   // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
1630   // handle type legalization for these operations here.
1631   //
1632   // FIXME: We really should do custom legalization for addition and
1633   // subtraction on x86-32 once PR3203 is fixed.  We really can't do much better
1634   // than generic legalization for 64-bit multiplication-with-overflow, though.
1635   for (unsigned i = 0, e = 3+Subtarget->is64Bit(); i != e; ++i) {
1636     // Add/Sub/Mul with overflow operations are custom lowered.
1637     MVT VT = IntVTs[i];
1638     setOperationAction(ISD::SADDO, VT, Custom);
1639     setOperationAction(ISD::UADDO, VT, Custom);
1640     setOperationAction(ISD::SSUBO, VT, Custom);
1641     setOperationAction(ISD::USUBO, VT, Custom);
1642     setOperationAction(ISD::SMULO, VT, Custom);
1643     setOperationAction(ISD::UMULO, VT, Custom);
1644   }
1645
1646
1647   if (!Subtarget->is64Bit()) {
1648     // These libcalls are not available in 32-bit.
1649     setLibcallName(RTLIB::SHL_I128, nullptr);
1650     setLibcallName(RTLIB::SRL_I128, nullptr);
1651     setLibcallName(RTLIB::SRA_I128, nullptr);
1652   }
1653
1654   // Combine sin / cos into one node or libcall if possible.
1655   if (Subtarget->hasSinCos()) {
1656     setLibcallName(RTLIB::SINCOS_F32, "sincosf");
1657     setLibcallName(RTLIB::SINCOS_F64, "sincos");
1658     if (Subtarget->isTargetDarwin()) {
1659       // For MacOSX, we don't want the normal expansion of a libcall to sincos.
1660       // We want to issue a libcall to __sincos_stret to avoid memory traffic.
1661       setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
1662       setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
1663     }
1664   }
1665
1666   if (Subtarget->isTargetWin64()) {
1667     setOperationAction(ISD::SDIV, MVT::i128, Custom);
1668     setOperationAction(ISD::UDIV, MVT::i128, Custom);
1669     setOperationAction(ISD::SREM, MVT::i128, Custom);
1670     setOperationAction(ISD::UREM, MVT::i128, Custom);
1671     setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
1672     setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
1673   }
1674
1675   // We have target-specific dag combine patterns for the following nodes:
1676   setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
1677   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
1678   setTargetDAGCombine(ISD::BITCAST);
1679   setTargetDAGCombine(ISD::VSELECT);
1680   setTargetDAGCombine(ISD::SELECT);
1681   setTargetDAGCombine(ISD::SHL);
1682   setTargetDAGCombine(ISD::SRA);
1683   setTargetDAGCombine(ISD::SRL);
1684   setTargetDAGCombine(ISD::OR);
1685   setTargetDAGCombine(ISD::AND);
1686   setTargetDAGCombine(ISD::ADD);
1687   setTargetDAGCombine(ISD::FADD);
1688   setTargetDAGCombine(ISD::FSUB);
1689   setTargetDAGCombine(ISD::FMA);
1690   setTargetDAGCombine(ISD::SUB);
1691   setTargetDAGCombine(ISD::LOAD);
1692   setTargetDAGCombine(ISD::MLOAD);
1693   setTargetDAGCombine(ISD::STORE);
1694   setTargetDAGCombine(ISD::MSTORE);
1695   setTargetDAGCombine(ISD::ZERO_EXTEND);
1696   setTargetDAGCombine(ISD::ANY_EXTEND);
1697   setTargetDAGCombine(ISD::SIGN_EXTEND);
1698   setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
1699   setTargetDAGCombine(ISD::TRUNCATE);
1700   setTargetDAGCombine(ISD::SINT_TO_FP);
1701   setTargetDAGCombine(ISD::SETCC);
1702   setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
1703   setTargetDAGCombine(ISD::BUILD_VECTOR);
1704   setTargetDAGCombine(ISD::MUL);
1705   setTargetDAGCombine(ISD::XOR);
1706
1707   computeRegisterProperties();
1708
1709   // On Darwin, -Os means optimize for size without hurting performance,
1710   // do not reduce the limit.
1711   MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
1712   MaxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8;
1713   MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
1714   MaxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
1715   MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
1716   MaxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
1717   setPrefLoopAlignment(4); // 2^4 bytes.
1718
1719   // Predictable cmov don't hurt on atom because it's in-order.
1720   PredictableSelectIsExpensive = !Subtarget->isAtom();
1721   EnableExtLdPromotion = true;
1722   setPrefFunctionAlignment(4); // 2^4 bytes.
1723
1724   verifyIntrinsicTables();
1725 }
1726
1727 // This has so far only been implemented for 64-bit MachO.
1728 bool X86TargetLowering::useLoadStackGuardNode() const {
1729   return Subtarget->isTargetMachO() && Subtarget->is64Bit();
1730 }
1731
1732 TargetLoweringBase::LegalizeTypeAction
1733 X86TargetLowering::getPreferredVectorAction(EVT VT) const {
1734   if (ExperimentalVectorWideningLegalization &&
1735       VT.getVectorNumElements() != 1 &&
1736       VT.getVectorElementType().getSimpleVT() != MVT::i1)
1737     return TypeWidenVector;
1738
1739   return TargetLoweringBase::getPreferredVectorAction(VT);
1740 }
1741
1742 EVT X86TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
1743   if (!VT.isVector())
1744     return Subtarget->hasAVX512() ? MVT::i1: MVT::i8;
1745
1746   const unsigned NumElts = VT.getVectorNumElements();
1747   const EVT EltVT = VT.getVectorElementType();
1748   if (VT.is512BitVector()) {
1749     if (Subtarget->hasAVX512())
1750       if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
1751           EltVT == MVT::f32 || EltVT == MVT::f64)
1752         switch(NumElts) {
1753         case  8: return MVT::v8i1;
1754         case 16: return MVT::v16i1;
1755       }
1756     if (Subtarget->hasBWI())
1757       if (EltVT == MVT::i8 || EltVT == MVT::i16)
1758         switch(NumElts) {
1759         case 32: return MVT::v32i1;
1760         case 64: return MVT::v64i1;
1761       }
1762   }
1763
1764   if (VT.is256BitVector() || VT.is128BitVector()) {
1765     if (Subtarget->hasVLX())
1766       if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
1767           EltVT == MVT::f32 || EltVT == MVT::f64)
1768         switch(NumElts) {
1769         case 2: return MVT::v2i1;
1770         case 4: return MVT::v4i1;
1771         case 8: return MVT::v8i1;
1772       }
1773     if (Subtarget->hasBWI() && Subtarget->hasVLX())
1774       if (EltVT == MVT::i8 || EltVT == MVT::i16)
1775         switch(NumElts) {
1776         case  8: return MVT::v8i1;
1777         case 16: return MVT::v16i1;
1778         case 32: return MVT::v32i1;
1779       }
1780   }
1781
1782   return VT.changeVectorElementTypeToInteger();
1783 }
1784
1785 /// Helper for getByValTypeAlignment to determine
1786 /// the desired ByVal argument alignment.
1787 static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
1788   if (MaxAlign == 16)
1789     return;
1790   if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
1791     if (VTy->getBitWidth() == 128)
1792       MaxAlign = 16;
1793   } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
1794     unsigned EltAlign = 0;
1795     getMaxByValAlign(ATy->getElementType(), EltAlign);
1796     if (EltAlign > MaxAlign)
1797       MaxAlign = EltAlign;
1798   } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
1799     for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
1800       unsigned EltAlign = 0;
1801       getMaxByValAlign(STy->getElementType(i), EltAlign);
1802       if (EltAlign > MaxAlign)
1803         MaxAlign = EltAlign;
1804       if (MaxAlign == 16)
1805         break;
1806     }
1807   }
1808 }
1809
1810 /// Return the desired alignment for ByVal aggregate
1811 /// function arguments in the caller parameter area. For X86, aggregates
1812 /// that contain SSE vectors are placed at 16-byte boundaries while the rest
1813 /// are at 4-byte boundaries.
1814 unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const {
1815   if (Subtarget->is64Bit()) {
1816     // Max of 8 and alignment of type.
1817     unsigned TyAlign = TD->getABITypeAlignment(Ty);
1818     if (TyAlign > 8)
1819       return TyAlign;
1820     return 8;
1821   }
1822
1823   unsigned Align = 4;
1824   if (Subtarget->hasSSE1())
1825     getMaxByValAlign(Ty, Align);
1826   return Align;
1827 }
1828
1829 /// Returns the target specific optimal type for load
1830 /// and store operations as a result of memset, memcpy, and memmove
1831 /// lowering. If DstAlign is zero that means it's safe to destination
1832 /// alignment can satisfy any constraint. Similarly if SrcAlign is zero it
1833 /// means there isn't a need to check it against alignment requirement,
1834 /// probably because the source does not need to be loaded. If 'IsMemset' is
1835 /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
1836 /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
1837 /// source is constant so it does not need to be loaded.
1838 /// It returns EVT::Other if the type should be determined using generic
1839 /// target-independent logic.
1840 EVT
1841 X86TargetLowering::getOptimalMemOpType(uint64_t Size,
1842                                        unsigned DstAlign, unsigned SrcAlign,
1843                                        bool IsMemset, bool ZeroMemset,
1844                                        bool MemcpyStrSrc,
1845                                        MachineFunction &MF) const {
1846   const Function *F = MF.getFunction();
1847   if ((!IsMemset || ZeroMemset) &&
1848       !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
1849                                        Attribute::NoImplicitFloat)) {
1850     if (Size >= 16 &&
1851         (Subtarget->isUnalignedMemAccessFast() ||
1852          ((DstAlign == 0 || DstAlign >= 16) &&
1853           (SrcAlign == 0 || SrcAlign >= 16)))) {
1854       if (Size >= 32) {
1855         if (Subtarget->hasInt256())
1856           return MVT::v8i32;
1857         if (Subtarget->hasFp256())
1858           return MVT::v8f32;
1859       }
1860       if (Subtarget->hasSSE2())
1861         return MVT::v4i32;
1862       if (Subtarget->hasSSE1())
1863         return MVT::v4f32;
1864     } else if (!MemcpyStrSrc && Size >= 8 &&
1865                !Subtarget->is64Bit() &&
1866                Subtarget->hasSSE2()) {
1867       // Do not use f64 to lower memcpy if source is string constant. It's
1868       // better to use i32 to avoid the loads.
1869       return MVT::f64;
1870     }
1871   }
1872   if (Subtarget->is64Bit() && Size >= 8)
1873     return MVT::i64;
1874   return MVT::i32;
1875 }
1876
1877 bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
1878   if (VT == MVT::f32)
1879     return X86ScalarSSEf32;
1880   else if (VT == MVT::f64)
1881     return X86ScalarSSEf64;
1882   return true;
1883 }
1884
1885 bool
1886 X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
1887                                                   unsigned,
1888                                                   unsigned,
1889                                                   bool *Fast) const {
1890   if (Fast)
1891     *Fast = Subtarget->isUnalignedMemAccessFast();
1892   return true;
1893 }
1894
1895 /// Return the entry encoding for a jump table in the
1896 /// current function.  The returned value is a member of the
1897 /// MachineJumpTableInfo::JTEntryKind enum.
1898 unsigned X86TargetLowering::getJumpTableEncoding() const {
1899   // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
1900   // symbol.
1901   if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
1902       Subtarget->isPICStyleGOT())
1903     return MachineJumpTableInfo::EK_Custom32;
1904
1905   // Otherwise, use the normal jump table encoding heuristics.
1906   return TargetLowering::getJumpTableEncoding();
1907 }
1908
1909 const MCExpr *
1910 X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
1911                                              const MachineBasicBlock *MBB,
1912                                              unsigned uid,MCContext &Ctx) const{
1913   assert(MBB->getParent()->getTarget().getRelocationModel() == Reloc::PIC_ &&
1914          Subtarget->isPICStyleGOT());
1915   // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
1916   // entries.
1917   return MCSymbolRefExpr::Create(MBB->getSymbol(),
1918                                  MCSymbolRefExpr::VK_GOTOFF, Ctx);
1919 }
1920
1921 /// Returns relocation base for the given PIC jumptable.
1922 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
1923                                                     SelectionDAG &DAG) const {
1924   if (!Subtarget->is64Bit())
1925     // This doesn't have SDLoc associated with it, but is not really the
1926     // same as a Register.
1927     return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy());
1928   return Table;
1929 }
1930
1931 /// This returns the relocation base for the given PIC jumptable,
1932 /// the same as getPICJumpTableRelocBase, but as an MCExpr.
1933 const MCExpr *X86TargetLowering::
1934 getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
1935                              MCContext &Ctx) const {
1936   // X86-64 uses RIP relative addressing based on the jump table label.
1937   if (Subtarget->isPICStyleRIPRel())
1938     return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
1939
1940   // Otherwise, the reference is relative to the PIC base.
1941   return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx);
1942 }
1943
1944 // FIXME: Why this routine is here? Move to RegInfo!
1945 std::pair<const TargetRegisterClass*, uint8_t>
1946 X86TargetLowering::findRepresentativeClass(MVT VT) const{
1947   const TargetRegisterClass *RRC = nullptr;
1948   uint8_t Cost = 1;
1949   switch (VT.SimpleTy) {
1950   default:
1951     return TargetLowering::findRepresentativeClass(VT);
1952   case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
1953     RRC = Subtarget->is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
1954     break;
1955   case MVT::x86mmx:
1956     RRC = &X86::VR64RegClass;
1957     break;
1958   case MVT::f32: case MVT::f64:
1959   case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
1960   case MVT::v4f32: case MVT::v2f64:
1961   case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32:
1962   case MVT::v4f64:
1963     RRC = &X86::VR128RegClass;
1964     break;
1965   }
1966   return std::make_pair(RRC, Cost);
1967 }
1968
1969 bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
1970                                                unsigned &Offset) const {
1971   if (!Subtarget->isTargetLinux())
1972     return false;
1973
1974   if (Subtarget->is64Bit()) {
1975     // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs:
1976     Offset = 0x28;
1977     if (getTargetMachine().getCodeModel() == CodeModel::Kernel)
1978       AddressSpace = 256;
1979     else
1980       AddressSpace = 257;
1981   } else {
1982     // %gs:0x14 on i386
1983     Offset = 0x14;
1984     AddressSpace = 256;
1985   }
1986   return true;
1987 }
1988
1989 bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
1990                                             unsigned DestAS) const {
1991   assert(SrcAS != DestAS && "Expected different address spaces!");
1992
1993   return SrcAS < 256 && DestAS < 256;
1994 }
1995
1996 //===----------------------------------------------------------------------===//
1997 //               Return Value Calling Convention Implementation
1998 //===----------------------------------------------------------------------===//
1999
2000 #include "X86GenCallingConv.inc"
2001
2002 bool
2003 X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv,
2004                                   MachineFunction &MF, bool isVarArg,
2005                         const SmallVectorImpl<ISD::OutputArg> &Outs,
2006                         LLVMContext &Context) const {
2007   SmallVector<CCValAssign, 16> RVLocs;
2008   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
2009   return CCInfo.CheckReturn(Outs, RetCC_X86);
2010 }
2011
2012 const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
2013   static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
2014   return ScratchRegs;
2015 }
2016
2017 SDValue
2018 X86TargetLowering::LowerReturn(SDValue Chain,
2019                                CallingConv::ID CallConv, bool isVarArg,
2020                                const SmallVectorImpl<ISD::OutputArg> &Outs,
2021                                const SmallVectorImpl<SDValue> &OutVals,
2022                                SDLoc dl, SelectionDAG &DAG) const {
2023   MachineFunction &MF = DAG.getMachineFunction();
2024   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2025
2026   SmallVector<CCValAssign, 16> RVLocs;
2027   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
2028   CCInfo.AnalyzeReturn(Outs, RetCC_X86);
2029
2030   SDValue Flag;
2031   SmallVector<SDValue, 6> RetOps;
2032   RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2033   // Operand #1 = Bytes To Pop
2034   RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(),
2035                    MVT::i16));
2036
2037   // Copy the result values into the output registers.
2038   for (unsigned i = 0; i != RVLocs.size(); ++i) {
2039     CCValAssign &VA = RVLocs[i];
2040     assert(VA.isRegLoc() && "Can only return in registers!");
2041     SDValue ValToCopy = OutVals[i];
2042     EVT ValVT = ValToCopy.getValueType();
2043
2044     // Promote values to the appropriate types.
2045     if (VA.getLocInfo() == CCValAssign::SExt)
2046       ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
2047     else if (VA.getLocInfo() == CCValAssign::ZExt)
2048       ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
2049     else if (VA.getLocInfo() == CCValAssign::AExt)
2050       ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
2051     else if (VA.getLocInfo() == CCValAssign::BCvt)
2052       ValToCopy = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), ValToCopy);
2053
2054     assert(VA.getLocInfo() != CCValAssign::FPExt &&
2055            "Unexpected FP-extend for return value.");
2056
2057     // If this is x86-64, and we disabled SSE, we can't return FP values,
2058     // or SSE or MMX vectors.
2059     if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
2060          VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
2061           (Subtarget->is64Bit() && !Subtarget->hasSSE1())) {
2062       report_fatal_error("SSE register return with SSE disabled");
2063     }
2064     // Likewise we can't return F64 values with SSE1 only.  gcc does so, but
2065     // llvm-gcc has never done it right and no one has noticed, so this
2066     // should be OK for now.
2067     if (ValVT == MVT::f64 &&
2068         (Subtarget->is64Bit() && !Subtarget->hasSSE2()))
2069       report_fatal_error("SSE2 register return with SSE2 disabled");
2070
2071     // Returns in ST0/ST1 are handled specially: these are pushed as operands to
2072     // the RET instruction and handled by the FP Stackifier.
2073     if (VA.getLocReg() == X86::FP0 ||
2074         VA.getLocReg() == X86::FP1) {
2075       // If this is a copy from an xmm register to ST(0), use an FPExtend to
2076       // change the value to the FP stack register class.
2077       if (isScalarFPTypeInSSEReg(VA.getValVT()))
2078         ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
2079       RetOps.push_back(ValToCopy);
2080       // Don't emit a copytoreg.
2081       continue;
2082     }
2083
2084     // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
2085     // which is returned in RAX / RDX.
2086     if (Subtarget->is64Bit()) {
2087       if (ValVT == MVT::x86mmx) {
2088         if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
2089           ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ValToCopy);
2090           ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
2091                                   ValToCopy);
2092           // If we don't have SSE2 available, convert to v4f32 so the generated
2093           // register is legal.
2094           if (!Subtarget->hasSSE2())
2095             ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,ValToCopy);
2096         }
2097       }
2098     }
2099
2100     Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
2101     Flag = Chain.getValue(1);
2102     RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
2103   }
2104
2105   // The x86-64 ABIs require that for returning structs by value we copy
2106   // the sret argument into %rax/%eax (depending on ABI) for the return.
2107   // Win32 requires us to put the sret argument to %eax as well.
2108   // We saved the argument into a virtual register in the entry block,
2109   // so now we copy the value out and into %rax/%eax.
2110   //
2111   // Checking Function.hasStructRetAttr() here is insufficient because the IR
2112   // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
2113   // false, then an sret argument may be implicitly inserted in the SelDAG. In
2114   // either case FuncInfo->setSRetReturnReg() will have been called.
2115   if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
2116     assert((Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC()) &&
2117            "No need for an sret register");
2118     SDValue Val = DAG.getCopyFromReg(Chain, dl, SRetReg, getPointerTy());
2119
2120     unsigned RetValReg
2121         = (Subtarget->is64Bit() && !Subtarget->isTarget64BitILP32()) ?
2122           X86::RAX : X86::EAX;
2123     Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
2124     Flag = Chain.getValue(1);
2125
2126     // RAX/EAX now acts like a return value.
2127     RetOps.push_back(DAG.getRegister(RetValReg, getPointerTy()));
2128   }
2129
2130   RetOps[0] = Chain;  // Update chain.
2131
2132   // Add the flag if we have it.
2133   if (Flag.getNode())
2134     RetOps.push_back(Flag);
2135
2136   return DAG.getNode(X86ISD::RET_FLAG, dl, MVT::Other, RetOps);
2137 }
2138
2139 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
2140   if (N->getNumValues() != 1)
2141     return false;
2142   if (!N->hasNUsesOfValue(1, 0))
2143     return false;
2144
2145   SDValue TCChain = Chain;
2146   SDNode *Copy = *N->use_begin();
2147   if (Copy->getOpcode() == ISD::CopyToReg) {
2148     // If the copy has a glue operand, we conservatively assume it isn't safe to
2149     // perform a tail call.
2150     if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
2151       return false;
2152     TCChain = Copy->getOperand(0);
2153   } else if (Copy->getOpcode() != ISD::FP_EXTEND)
2154     return false;
2155
2156   bool HasRet = false;
2157   for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
2158        UI != UE; ++UI) {
2159     if (UI->getOpcode() != X86ISD::RET_FLAG)
2160       return false;
2161     // If we are returning more than one value, we can definitely
2162     // not make a tail call see PR19530
2163     if (UI->getNumOperands() > 4)
2164       return false;
2165     if (UI->getNumOperands() == 4 &&
2166         UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
2167       return false;
2168     HasRet = true;
2169   }
2170
2171   if (!HasRet)
2172     return false;
2173
2174   Chain = TCChain;
2175   return true;
2176 }
2177
2178 EVT
2179 X86TargetLowering::getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT,
2180                                             ISD::NodeType ExtendKind) const {
2181   MVT ReturnMVT;
2182   // TODO: Is this also valid on 32-bit?
2183   if (Subtarget->is64Bit() && VT == MVT::i1 && ExtendKind == ISD::ZERO_EXTEND)
2184     ReturnMVT = MVT::i8;
2185   else
2186     ReturnMVT = MVT::i32;
2187
2188   EVT MinVT = getRegisterType(Context, ReturnMVT);
2189   return VT.bitsLT(MinVT) ? MinVT : VT;
2190 }
2191
2192 /// Lower the result values of a call into the
2193 /// appropriate copies out of appropriate physical registers.
2194 ///
2195 SDValue
2196 X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
2197                                    CallingConv::ID CallConv, bool isVarArg,
2198                                    const SmallVectorImpl<ISD::InputArg> &Ins,
2199                                    SDLoc dl, SelectionDAG &DAG,
2200                                    SmallVectorImpl<SDValue> &InVals) const {
2201
2202   // Assign locations to each value returned by this call.
2203   SmallVector<CCValAssign, 16> RVLocs;
2204   bool Is64Bit = Subtarget->is64Bit();
2205   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2206                  *DAG.getContext());
2207   CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
2208
2209   // Copy all of the result registers out of their specified physreg.
2210   for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
2211     CCValAssign &VA = RVLocs[i];
2212     EVT CopyVT = VA.getValVT();
2213
2214     // If this is x86-64, and we disabled SSE, we can't return FP values
2215     if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
2216         ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
2217       report_fatal_error("SSE register return with SSE disabled");
2218     }
2219
2220     // If we prefer to use the value in xmm registers, copy it out as f80 and
2221     // use a truncate to move it from fp stack reg to xmm reg.
2222     if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
2223         isScalarFPTypeInSSEReg(VA.getValVT()))
2224       CopyVT = MVT::f80;
2225
2226     Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
2227                                CopyVT, InFlag).getValue(1);
2228     SDValue Val = Chain.getValue(0);
2229
2230     if (CopyVT != VA.getValVT())
2231       Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
2232                         // This truncation won't change the value.
2233                         DAG.getIntPtrConstant(1));
2234
2235     InFlag = Chain.getValue(2);
2236     InVals.push_back(Val);
2237   }
2238
2239   return Chain;
2240 }
2241
2242 //===----------------------------------------------------------------------===//
2243 //                C & StdCall & Fast Calling Convention implementation
2244 //===----------------------------------------------------------------------===//
//  The StdCall calling convention seems to be standard for many Windows API
//  routines. It differs from the C calling convention just a little: the
//  callee should clean up the stack, not the caller. Symbols should also be
//  decorated in some fancy way :) It doesn't support any vector arguments.
//  For info on the fast calling convention, see the Fast Calling Convention
//  (tail call) implementation LowerX86_32FastCCCallTo.
2251
2252 /// CallIsStructReturn - Determines whether a call uses struct return
2253 /// semantics.
2254 enum StructReturnType {
2255   NotStructReturn,
2256   RegStructReturn,
2257   StackStructReturn
2258 };
2259 static StructReturnType
2260 callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
2261   if (Outs.empty())
2262     return NotStructReturn;
2263
2264   const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
2265   if (!Flags.isSRet())
2266     return NotStructReturn;
2267   if (Flags.isInReg())
2268     return RegStructReturn;
2269   return StackStructReturn;
2270 }
2271
2272 /// Determines whether a function uses struct return semantics.
2273 static StructReturnType
2274 argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
2275   if (Ins.empty())
2276     return NotStructReturn;
2277
2278   const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
2279   if (!Flags.isSRet())
2280     return NotStructReturn;
2281   if (Flags.isInReg())
2282     return RegStructReturn;
2283   return StackStructReturn;
2284 }
2285
2286 /// Make a copy of an aggregate at address specified by "Src" to address
2287 /// "Dst" with size and alignment information specified by the specific
2288 /// parameter attribute. The copy will be passed as a byval function parameter.
2289 static SDValue
2290 CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
2291                           ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
2292                           SDLoc dl) {
2293   SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);
2294
2295   return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
2296                        /*isVolatile*/false, /*AlwaysInline=*/true,
2297                        MachinePointerInfo(), MachinePointerInfo());
2298 }
2299
2300 /// Return true if the calling convention is one that
2301 /// supports tail call optimization.
2302 static bool IsTailCallConvention(CallingConv::ID CC) {
2303   return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
2304           CC == CallingConv::HiPE);
2305 }
2306
2307 /// \brief Return true if the calling convention is a C calling convention.
2308 static bool IsCCallConvention(CallingConv::ID CC) {
2309   return (CC == CallingConv::C || CC == CallingConv::X86_64_Win64 ||
2310           CC == CallingConv::X86_64_SysV);
2311 }
2312
2313 bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
2314   if (!CI->isTailCall() || getTargetMachine().Options.DisableTailCalls)
2315     return false;
2316
2317   CallSite CS(CI);
2318   CallingConv::ID CalleeCC = CS.getCallingConv();
2319   if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
2320     return false;
2321
2322   return true;
2323 }
2324
2325 /// Return true if the function is being made into
2326 /// a tailcall target by changing its ABI.
2327 static bool FuncIsMadeTailCallSafe(CallingConv::ID CC,
2328                                    bool GuaranteedTailCallOpt) {
2329   return GuaranteedTailCallOpt && IsTailCallConvention(CC);
2330 }
2331
2332 SDValue
2333 X86TargetLowering::LowerMemArgument(SDValue Chain,
2334                                     CallingConv::ID CallConv,
2335                                     const SmallVectorImpl<ISD::InputArg> &Ins,
2336                                     SDLoc dl, SelectionDAG &DAG,
2337                                     const CCValAssign &VA,
2338                                     MachineFrameInfo *MFI,
2339                                     unsigned i) const {
2340   // Create the nodes corresponding to a load from this parameter slot.
2341   ISD::ArgFlagsTy Flags = Ins[i].Flags;
2342   bool AlwaysUseMutable = FuncIsMadeTailCallSafe(
2343       CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
2344   bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
2345   EVT ValVT;
2346
2347   // If value is passed by pointer we have address passed instead of the value
2348   // itself.
2349   if (VA.getLocInfo() == CCValAssign::Indirect)
2350     ValVT = VA.getLocVT();
2351   else
2352     ValVT = VA.getValVT();
2353
2354   // FIXME: For now, all byval parameter objects are marked mutable. This can be
2355   // changed with more analysis.
2356   // In case of tail call optimization mark all arguments mutable. Since they
2357   // could be overwritten by lowering of arguments in case of a tail call.
2358   if (Flags.isByVal()) {
2359     unsigned Bytes = Flags.getByValSize();
2360     if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
2361     int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
2362     return DAG.getFrameIndex(FI, getPointerTy());
2363   } else {
2364     int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
2365                                     VA.getLocMemOffset(), isImmutable);
2366     SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
2367     return DAG.getLoad(ValVT, dl, Chain, FIN,
2368                        MachinePointerInfo::getFixedStack(FI),
2369                        false, false, false, 0);
2370   }
2371 }
2372
2373 // FIXME: Get this from tablegen.
2374 static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
2375                                                 const X86Subtarget *Subtarget) {
2376   assert(Subtarget->is64Bit());
2377
2378   if (Subtarget->isCallingConvWin64(CallConv)) {
2379     static const MCPhysReg GPR64ArgRegsWin64[] = {
2380       X86::RCX, X86::RDX, X86::R8,  X86::R9
2381     };
2382     return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
2383   }
2384
2385   static const MCPhysReg GPR64ArgRegs64Bit[] = {
2386     X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
2387   };
2388   return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
2389 }
2390
2391 // FIXME: Get this from tablegen.
2392 static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
2393                                                 CallingConv::ID CallConv,
2394                                                 const X86Subtarget *Subtarget) {
2395   assert(Subtarget->is64Bit());
2396   if (Subtarget->isCallingConvWin64(CallConv)) {
2397     // The XMM registers which might contain var arg parameters are shadowed
2398     // in their paired GPR.  So we only need to save the GPR to their home
2399     // slots.
2400     // TODO: __vectorcall will change this.
2401     return None;
2402   }
2403
2404   const Function *Fn = MF.getFunction();
2405   bool NoImplicitFloatOps = Fn->getAttributes().
2406       hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat);
2407   assert(!(MF.getTarget().Options.UseSoftFloat && NoImplicitFloatOps) &&
2408          "SSE register cannot be used when SSE is disabled!");
2409   if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps ||
2410       !Subtarget->hasSSE1())
2411     // Kernel mode asks for SSE to be disabled, so there are no XMM argument
2412     // registers.
2413     return None;
2414
2415   static const MCPhysReg XMMArgRegs64Bit[] = {
2416     X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2417     X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2418   };
2419   return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
2420 }
2421
2422 SDValue
2423 X86TargetLowering::LowerFormalArguments(SDValue Chain,
2424                                         CallingConv::ID CallConv,
2425                                         bool isVarArg,
2426                                       const SmallVectorImpl<ISD::InputArg> &Ins,
2427                                         SDLoc dl,
2428                                         SelectionDAG &DAG,
2429                                         SmallVectorImpl<SDValue> &InVals)
2430                                           const {
2431   MachineFunction &MF = DAG.getMachineFunction();
2432   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2433
2434   const Function* Fn = MF.getFunction();
2435   if (Fn->hasExternalLinkage() &&
2436       Subtarget->isTargetCygMing() &&
2437       Fn->getName() == "main")
2438     FuncInfo->setForceFramePointer(true);
2439
2440   MachineFrameInfo *MFI = MF.getFrameInfo();
2441   bool Is64Bit = Subtarget->is64Bit();
2442   bool IsWin64 = Subtarget->isCallingConvWin64(CallConv);
2443
2444   assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
2445          "Var args not supported with calling convention fastcc, ghc or hipe");
2446
2447   // Assign locations to all of the incoming arguments.
2448   SmallVector<CCValAssign, 16> ArgLocs;
2449   CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
2450
2451   // Allocate shadow area for Win64
2452   if (IsWin64)
2453     CCInfo.AllocateStack(32, 8);
2454
2455   CCInfo.AnalyzeFormalArguments(Ins, CC_X86);
2456
2457   unsigned LastVal = ~0U;
2458   SDValue ArgValue;
2459   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2460     CCValAssign &VA = ArgLocs[i];
2461     // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
2462     // places.
2463     assert(VA.getValNo() != LastVal &&
2464            "Don't support value assigned to multiple locs yet");
2465     (void)LastVal;
2466     LastVal = VA.getValNo();
2467
2468     if (VA.isRegLoc()) {
2469       EVT RegVT = VA.getLocVT();
2470       const TargetRegisterClass *RC;
2471       if (RegVT == MVT::i32)
2472         RC = &X86::GR32RegClass;
2473       else if (Is64Bit && RegVT == MVT::i64)
2474         RC = &X86::GR64RegClass;
2475       else if (RegVT == MVT::f32)
2476         RC = &X86::FR32RegClass;
2477       else if (RegVT == MVT::f64)
2478         RC = &X86::FR64RegClass;
2479       else if (RegVT.is512BitVector())
2480         RC = &X86::VR512RegClass;
2481       else if (RegVT.is256BitVector())
2482         RC = &X86::VR256RegClass;
2483       else if (RegVT.is128BitVector())
2484         RC = &X86::VR128RegClass;
2485       else if (RegVT == MVT::x86mmx)
2486         RC = &X86::VR64RegClass;
2487       else if (RegVT == MVT::i1)
2488         RC = &X86::VK1RegClass;
2489       else if (RegVT == MVT::v8i1)
2490         RC = &X86::VK8RegClass;
2491       else if (RegVT == MVT::v16i1)
2492         RC = &X86::VK16RegClass;
2493       else if (RegVT == MVT::v32i1)
2494         RC = &X86::VK32RegClass;
2495       else if (RegVT == MVT::v64i1)
2496         RC = &X86::VK64RegClass;
2497       else
2498         llvm_unreachable("Unknown argument type!");
2499
2500       unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
2501       ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
2502
2503       // If this is an 8 or 16-bit value, it is really passed promoted to 32
2504       // bits.  Insert an assert[sz]ext to capture this, then truncate to the
2505       // right size.
2506       if (VA.getLocInfo() == CCValAssign::SExt)
2507         ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
2508                                DAG.getValueType(VA.getValVT()));
2509       else if (VA.getLocInfo() == CCValAssign::ZExt)
2510         ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
2511                                DAG.getValueType(VA.getValVT()));
2512       else if (VA.getLocInfo() == CCValAssign::BCvt)
2513         ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
2514
2515       if (VA.isExtInLoc()) {
2516         // Handle MMX values passed in XMM regs.
2517         if (RegVT.isVector())
2518           ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
2519         else
2520           ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
2521       }
2522     } else {
2523       assert(VA.isMemLoc());
2524       ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i);
2525     }
2526
2527     // If value is passed via pointer - do a load.
2528     if (VA.getLocInfo() == CCValAssign::Indirect)
2529       ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue,
2530                              MachinePointerInfo(), false, false, false, 0);
2531
2532     InVals.push_back(ArgValue);
2533   }
2534
2535   if (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC()) {
2536     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2537       // The x86-64 ABIs require that for returning structs by value we copy
2538       // the sret argument into %rax/%eax (depending on ABI) for the return.
2539       // Win32 requires us to put the sret argument to %eax as well.
2540       // Save the argument into a virtual register so that we can access it
2541       // from the return points.
2542       if (Ins[i].Flags.isSRet()) {
2543         unsigned Reg = FuncInfo->getSRetReturnReg();
2544         if (!Reg) {
2545           MVT PtrTy = getPointerTy();
2546           Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
2547           FuncInfo->setSRetReturnReg(Reg);
2548         }
2549         SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[i]);
2550         Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
2551         break;
2552       }
2553     }
2554   }
2555
2556   unsigned StackSize = CCInfo.getNextStackOffset();
2557   // Align stack specially for tail calls.
2558   if (FuncIsMadeTailCallSafe(CallConv,
2559                              MF.getTarget().Options.GuaranteedTailCallOpt))
2560     StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
2561
2562   // If the function takes variable number of arguments, make a frame index for
2563   // the start of the first vararg value... for expansion of llvm.va_start. We
2564   // can skip this if there are no va_start calls.
2565   if (MFI->hasVAStart() &&
2566       (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
2567                    CallConv != CallingConv::X86_ThisCall))) {
2568     FuncInfo->setVarArgsFrameIndex(
2569         MFI->CreateFixedObject(1, StackSize, true));
2570   }
2571
2572   // Figure out if XMM registers are in use.
2573   assert(!(MF.getTarget().Options.UseSoftFloat &&
2574            Fn->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
2575                                             Attribute::NoImplicitFloat)) &&
2576          "SSE register cannot be used when SSE is disabled!");
2577
2578   // 64-bit calling conventions support varargs and register parameters, so we
2579   // have to do extra work to spill them in the prologue.
2580   if (Is64Bit && isVarArg && MFI->hasVAStart()) {
2581     // Find the first unallocated argument registers.
2582     ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
2583     ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
2584     unsigned NumIntRegs =
2585         CCInfo.getFirstUnallocated(ArgGPRs.data(), ArgGPRs.size());
2586     unsigned NumXMMRegs =
2587         CCInfo.getFirstUnallocated(ArgXMMs.data(), ArgXMMs.size());
2588     assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
2589            "SSE register cannot be used when SSE is disabled!");
2590
2591     // Gather all the live in physical registers.
2592     SmallVector<SDValue, 6> LiveGPRs;
2593     SmallVector<SDValue, 8> LiveXMMRegs;
2594     SDValue ALVal;
2595     for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
2596       unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
2597       LiveGPRs.push_back(
2598           DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
2599     }
2600     if (!ArgXMMs.empty()) {
2601       unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
2602       ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
2603       for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
2604         unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
2605         LiveXMMRegs.push_back(
2606             DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
2607       }
2608     }
2609
2610     if (IsWin64) {
2611       const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
2612       // Get to the caller-allocated home save location.  Add 8 to account
2613       // for the return address.
2614       int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
2615       FuncInfo->setRegSaveFrameIndex(
2616           MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
2617       // Fixup to set vararg frame on shadow area (4 x i64).
2618       if (NumIntRegs < 4)
2619         FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
2620     } else {
2621       // For X86-64, if there are vararg parameters that are passed via
2622       // registers, then we must store them to their spots on the stack so
2623       // they may be loaded by dereferencing the result of va_next.
2624       FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
2625       FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
2626       FuncInfo->setRegSaveFrameIndex(MFI->CreateStackObject(
2627           ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
2628     }
2629
2630     // Store the integer parameter registers.
2631     SmallVector<SDValue, 8> MemOps;
2632     SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
2633                                       getPointerTy());
2634     unsigned Offset = FuncInfo->getVarArgsGPOffset();
2635     for (SDValue Val : LiveGPRs) {
2636       SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
2637                                 DAG.getIntPtrConstant(Offset));
2638       SDValue Store =
2639         DAG.getStore(Val.getValue(1), dl, Val, FIN,
2640                      MachinePointerInfo::getFixedStack(
2641                        FuncInfo->getRegSaveFrameIndex(), Offset),
2642                      false, false, 0);
2643       MemOps.push_back(Store);
2644       Offset += 8;
2645     }
2646
2647     if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
2648       // Now store the XMM (fp + vector) parameter registers.
2649       SmallVector<SDValue, 12> SaveXMMOps;
2650       SaveXMMOps.push_back(Chain);
2651       SaveXMMOps.push_back(ALVal);
2652       SaveXMMOps.push_back(DAG.getIntPtrConstant(
2653                              FuncInfo->getRegSaveFrameIndex()));
2654       SaveXMMOps.push_back(DAG.getIntPtrConstant(
2655                              FuncInfo->getVarArgsFPOffset()));
2656       SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
2657                         LiveXMMRegs.end());
2658       MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
2659                                    MVT::Other, SaveXMMOps));
2660     }
2661
2662     if (!MemOps.empty())
2663       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
2664   }
2665
2666   if (isVarArg && MFI->hasMustTailInVarArgFunc()) {
2667     // Find the largest legal vector type.
2668     MVT VecVT = MVT::Other;
2669     // FIXME: Only some x86_32 calling conventions support AVX512.
2670     if (Subtarget->hasAVX512() &&
2671         (Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
2672                      CallConv == CallingConv::Intel_OCL_BI)))
2673       VecVT = MVT::v16f32;
2674     else if (Subtarget->hasAVX())
2675       VecVT = MVT::v8f32;
2676     else if (Subtarget->hasSSE2())
2677       VecVT = MVT::v4f32;
2678
2679     // We forward some GPRs and some vector types.
2680     SmallVector<MVT, 2> RegParmTypes;
2681     MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
2682     RegParmTypes.push_back(IntVT);
2683     if (VecVT != MVT::Other)
2684       RegParmTypes.push_back(VecVT);
2685
2686     // Compute the set of forwarded registers. The rest are scratch.
2687     SmallVectorImpl<ForwardedRegister> &Forwards =
2688         FuncInfo->getForwardedMustTailRegParms();
2689     CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
2690
2691     // Conservatively forward AL on x86_64, since it might be used for varargs.
2692     if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
2693       unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
2694       Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
2695     }
2696
2697     // Copy all forwards from physical to virtual registers.
2698     for (ForwardedRegister &F : Forwards) {
2699       // FIXME: Can we use a less constrained schedule?
2700       SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
2701       F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT));
2702       Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal);
2703     }
2704   }
2705
2706   // Some CCs need callee pop.
2707   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
2708                        MF.getTarget().Options.GuaranteedTailCallOpt)) {
2709     FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
2710   } else {
2711     FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
2712     // If this is an sret function, the return should pop the hidden pointer.
2713     if (!Is64Bit && !IsTailCallConvention(CallConv) &&
2714         !Subtarget->getTargetTriple().isOSMSVCRT() &&
2715         argsAreStructReturn(Ins) == StackStructReturn)
2716       FuncInfo->setBytesToPopOnReturn(4);
2717   }
2718
2719   if (!Is64Bit) {
2720     // RegSaveFrameIndex is X86-64 only.
2721     FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
2722     if (CallConv == CallingConv::X86_FastCall ||
2723         CallConv == CallingConv::X86_ThisCall)
2724       // fastcc functions can't have varargs.
2725       FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
2726   }
2727
2728   FuncInfo->setArgumentStackSize(StackSize);
2729
2730   return Chain;
2731 }
2732
2733 SDValue
2734 X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
2735                                     SDValue StackPtr, SDValue Arg,
2736                                     SDLoc dl, SelectionDAG &DAG,
2737                                     const CCValAssign &VA,
2738                                     ISD::ArgFlagsTy Flags) const {
2739   unsigned LocMemOffset = VA.getLocMemOffset();
2740   SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
2741   PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
2742   if (Flags.isByVal())
2743     return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
2744
2745   return DAG.getStore(Chain, dl, Arg, PtrOff,
2746                       MachinePointerInfo::getStack(LocMemOffset),
2747                       false, false, 0);
2748 }
2749
2750 /// Emit a load of return address if tail call
2751 /// optimization is performed and it is required.
2752 SDValue
2753 X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
2754                                            SDValue &OutRetAddr, SDValue Chain,
2755                                            bool IsTailCall, bool Is64Bit,
2756                                            int FPDiff, SDLoc dl) const {
2757   // Adjust the Return address stack slot.
2758   EVT VT = getPointerTy();
2759   OutRetAddr = getReturnAddressFrameIndex(DAG);
2760
2761   // Load the "old" Return address.
2762   OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(),
2763                            false, false, false, 0);
2764   return SDValue(OutRetAddr.getNode(), 1);
2765 }
2766
2767 /// Emit a store of the return address if tail call
2768 /// optimization is performed and it is required (FPDiff!=0).
2769 static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
2770                                         SDValue Chain, SDValue RetAddrFrIdx,
2771                                         EVT PtrVT, unsigned SlotSize,
2772                                         int FPDiff, SDLoc dl) {
2773   // Store the return address to the appropriate stack slot.
2774   if (!FPDiff) return Chain;
2775   // Calculate the new stack slot for the return address.
2776   int NewReturnAddrFI =
2777     MF.getFrameInfo()->CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
2778                                          false);
2779   SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
2780   Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
2781                        MachinePointerInfo::getFixedStack(NewReturnAddrFI),
2782                        false, false, 0);
2783   return Chain;
2784 }
2785
/// Lower an outgoing call described by CLI into SelectionDAG nodes.
///
/// The steps, in order: decide whether the call is a tail call / sibcall,
/// assign argument locations with CC_X86, open the call sequence, place each
/// argument in its register or stack slot, rewrite the callee operand, emit
/// X86ISD::CALL (or X86ISD::TC_RETURN for tail calls), close the call
/// sequence, and copy the results out through LowerCallResult.
///
/// \param CLI    Call description; CLI.IsTailCall is updated in place with the
///               final tail-call decision.
/// \param InVals Passed through to LowerCallResult.
/// \returns the TC_RETURN node for tail calls, otherwise whatever
///          LowerCallResult returns.
2786 SDValue
2787 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
2788                              SmallVectorImpl<SDValue> &InVals) const {
2789   SelectionDAG &DAG                     = CLI.DAG;
2790   SDLoc &dl                             = CLI.DL;
2791   SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
2792   SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
2793   SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
2794   SDValue Chain                         = CLI.Chain;
2795   SDValue Callee                        = CLI.Callee;
2796   CallingConv::ID CallConv              = CLI.CallConv;
2797   bool &isTailCall                      = CLI.IsTailCall;
2798   bool isVarArg                         = CLI.IsVarArg;
2799
2800   MachineFunction &MF = DAG.getMachineFunction();
2801   bool Is64Bit        = Subtarget->is64Bit();
2802   bool IsWin64        = Subtarget->isCallingConvWin64(CallConv);
2803   StructReturnType SR = callIsStructReturn(Outs);
2804   bool IsSibcall      = false;
2805   X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
2806
2807   if (MF.getTarget().Options.DisableTailCalls)
2808     isTailCall = false;
2809
2810   bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall();
2811   if (IsMustTail) {
2812     // Force this to be a tail call.  The verifier rules are enough to ensure
2813     // that we can lower this successfully without moving the return address
2814     // around.
2815     isTailCall = true;
2816   } else if (isTailCall) {
2817     // Check if it's really possible to do a tail call.
2818     isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
2819                     isVarArg, SR != NotStructReturn,
2820                     MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
2821                     Outs, OutVals, Ins, DAG);
2822
2823     // Sibcalls are automatically detected tailcalls which do not require
2824     // ABI changes.
2825     if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
2826       IsSibcall = true;
2827
2828     if (isTailCall)
2829       ++NumTailCalls;
2830   }
2831
2832   assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
2833          "Var args not supported with calling convention fastcc, ghc or hipe");
2834
2835   // Analyze operands of the call, assigning locations to each operand.
2836   SmallVector<CCValAssign, 16> ArgLocs;
2837   CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
2838
2839   // Allocate shadow area for Win64
2840   if (IsWin64)
2841     CCInfo.AllocateStack(32, 8);
2842
2843   CCInfo.AnalyzeCallOperands(Outs, CC_X86);
2844
2845   // Get a count of how many bytes are to be pushed on the stack.
2846   unsigned NumBytes = CCInfo.getNextStackOffset();
2847   if (IsSibcall)
2848     // This is a sibcall. The memory operands are available in caller's
2849     // own caller's stack.
2850     NumBytes = 0;
2851   else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
2852            IsTailCallConvention(CallConv))
2853     NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
2854
       // FPDiff is the number of bytes this function pops on return minus the
       // number of bytes this call needs.  It is only computed for tail calls
       // that are neither sibcalls nor musttail calls; otherwise it stays 0.
2855   int FPDiff = 0;
2856   if (isTailCall && !IsSibcall && !IsMustTail) {
2857     // Lower arguments at fp - stackoffset + fpdiff.
2858     unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
2859
2860     FPDiff = NumBytesCallerPushed - NumBytes;
2861
2862     // Set the delta of movement of the returnaddr stackslot.
2863     // But only set if the new delta is smaller than the previously recorded one.
2864     if (FPDiff < X86Info->getTCReturnAddrDelta())
2865       X86Info->setTCReturnAddrDelta(FPDiff);
2866   }
2867
2868   unsigned NumBytesToPush = NumBytes;
2869   unsigned NumBytesToPop = NumBytes;
2870
2871   // If we have an inalloca argument, all stack space has already been allocated
2872   // for us and is right at the top of the stack.  We don't support multiple
2873   // arguments passed in memory when using inalloca.
2874   if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
2875     NumBytesToPush = 0;
2876     if (!ArgLocs.back().isMemLoc())
2877       report_fatal_error("cannot use inalloca attribute on a register "
2878                          "parameter");
2879     if (ArgLocs.back().getLocMemOffset() != 0)
2880       report_fatal_error("any parameter with the inalloca attribute must be "
2881                          "the only memory argument");
2882   }
2883
2884   if (!IsSibcall)
2885     Chain = DAG.getCALLSEQ_START(
2886         Chain, DAG.getIntPtrConstant(NumBytesToPush, true), dl);
2887
2888   SDValue RetAddrFrIdx;
2889   // Load return address for tail calls.
2890   if (isTailCall && FPDiff)
2891     Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
2892                                     Is64Bit, FPDiff, dl);
2893
2894   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
2895   SmallVector<SDValue, 8> MemOpChains;
2896   SDValue StackPtr;
2897
2898   // Walk the register/memloc assignments, inserting copies/loads.  In the case
2899   // of tail call optimization arguments are handled later.
2900   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
2901   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2902     // Skip inalloca arguments, they have already been written.
2903     ISD::ArgFlagsTy Flags = Outs[i].Flags;
2904     if (Flags.isInAlloca())
2905       continue;
2906
2907     CCValAssign &VA = ArgLocs[i];
2908     EVT RegVT = VA.getLocVT();
2909     SDValue Arg = OutVals[i];
2910     bool isByVal = Flags.isByVal();
2911
2912     // Promote the value if needed.
2913     switch (VA.getLocInfo()) {
2914     default: llvm_unreachable("Unknown loc info!");
2915     case CCValAssign::Full: break;
2916     case CCValAssign::SExt:
2917       Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
2918       break;
2919     case CCValAssign::ZExt:
2920       Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
2921       break;
2922     case CCValAssign::AExt:
2923       if (RegVT.is128BitVector()) {
2924         // Special case: passing MMX values in XMM registers.
2925         Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
2926         Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
2927         Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
2928       } else
2929         Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
2930       break;
2931     case CCValAssign::BCvt:
2932       Arg = DAG.getNode(ISD::BITCAST, dl, RegVT, Arg);
2933       break;
2934     case CCValAssign::Indirect: {
2935       // Store the argument.
2936       SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
2937       int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
2938       Chain = DAG.getStore(Chain, dl, Arg, SpillSlot,
2939                            MachinePointerInfo::getFixedStack(FI),
2940                            false, false, 0);
2941       Arg = SpillSlot;
2942       break;
2943     }
2944     }
2945
2946     if (VA.isRegLoc()) {
2947       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
2948       if (isVarArg && IsWin64) {
2949         // Win64 ABI requires argument XMM reg to be copied to the corresponding
2950         // shadow reg if callee is a varargs function.
2951         unsigned ShadowReg = 0;
2952         switch (VA.getLocReg()) {
2953         case X86::XMM0: ShadowReg = X86::RCX; break;
2954         case X86::XMM1: ShadowReg = X86::RDX; break;
2955         case X86::XMM2: ShadowReg = X86::R8; break;
2956         case X86::XMM3: ShadowReg = X86::R9; break;
2957         }
2958         if (ShadowReg)
2959           RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
2960       }
2961     } else if (!IsSibcall && (!isTailCall || isByVal)) {
           // Stack arguments are written here for ordinary calls.  For
           // (non-sibcall) tail calls only byval arguments are copied here; the
           // tail-call block further down copies them again from this location
           // into their final slot and stores the remaining stack arguments.
2962       assert(VA.isMemLoc());
2963       if (!StackPtr.getNode())
2964         StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
2965                                       getPointerTy());
2966       MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
2967                                              dl, DAG, VA, Flags));
2968     }
2969   }
2970
2971   if (!MemOpChains.empty())
2972     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
2973
2974   if (Subtarget->isPICStyleGOT()) {
2975     // ELF / PIC requires GOT in the EBX register before function calls via PLT
2976     // GOT pointer.
2977     if (!isTailCall) {
2978       RegsToPass.push_back(std::make_pair(unsigned(X86::EBX),
2979                DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy())));
2980     } else {
2981       // If we are tail calling and generating PIC/GOT style code load the
2982       // address of the callee into ECX. The value in ecx is used as target of
2983       // the tail jump. This is done to circumvent the ebx/callee-saved problem
2984       // for tail calls on PIC/GOT architectures. Normally we would just put the
2985       // address of GOT into ebx and then call target@PLT. But for tail calls
2986       // ebx would be restored (since ebx is callee saved) before jumping to the
2987       // target@PLT.
2988
2989       // Note: The actual moving to ECX is done further down.
2990       GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
2991       if (G && !G->getGlobal()->hasHiddenVisibility() &&
2992           !G->getGlobal()->hasProtectedVisibility())
2993         Callee = LowerGlobalAddress(Callee, DAG);
2994       else if (isa<ExternalSymbolSDNode>(Callee))
2995         Callee = LowerExternalSymbol(Callee, DAG);
2996     }
2997   }
2998
2999   if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
3000     // From AMD64 ABI document:
3001     // For calls that may call functions that use varargs or stdargs
3002     // (prototype-less calls or calls to functions containing ellipsis (...) in
3003     // the declaration) %al is used as hidden argument to specify the number
3004     // of SSE registers used. The contents of %al do not need to match exactly
3005     // the number of registers, but must be an ubound on the number of SSE
3006     // registers used and is in the range 0 - 8 inclusive.
3007
3008     // Count the number of XMM registers allocated.
3009     static const MCPhysReg XMMArgRegs[] = {
3010       X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3011       X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3012     };
3013     unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
3014     assert((Subtarget->hasSSE1() || !NumXMMRegs)
3015            && "SSE registers cannot be used when SSE is disabled");
3016
3017     RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
3018                                         DAG.getConstant(NumXMMRegs, MVT::i8)));
3019   }
3020
       // For vararg musttail calls, pass along every register recorded in the
       // function's forwarded-musttail register list.
3021   if (isVarArg && IsMustTail) {
3022     const auto &Forwards = X86Info->getForwardedMustTailRegParms();
3023     for (const auto &F : Forwards) {
3024       SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
3025       RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
3026     }
3027   }
3028
3029   // For tail calls lower the arguments to the 'real' stack slots.  Sibcalls
3030   // don't need this because the eligibility check rejects calls that require
3031   // shuffling arguments passed in memory.
3032   if (!IsSibcall && isTailCall) {
3033     // Force all the incoming stack arguments to be loaded from the stack
3034     // before any new outgoing arguments are stored to the stack, because the
3035     // outgoing stack slots may alias the incoming argument stack slots, and
3036     // the alias isn't otherwise explicit. This is slightly more conservative
3037     // than necessary, because it means that each store effectively depends
3038     // on every argument instead of just those arguments it would clobber.
3039     SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
3040
3041     SmallVector<SDValue, 8> MemOpChains2;
3042     SDValue FIN;
3043     int FI = 0;
3044     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3045       CCValAssign &VA = ArgLocs[i];
3046       if (VA.isRegLoc())
3047         continue;
3048       assert(VA.isMemLoc());
3049       SDValue Arg = OutVals[i];
3050       ISD::ArgFlagsTy Flags = Outs[i].Flags;
3051       // Skip inalloca arguments.  They don't require any work.
3052       if (Flags.isInAlloca())
3053         continue;
3054       // Create frame index.
3055       int32_t Offset = VA.getLocMemOffset()+FPDiff;
3056       uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
3057       FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
3058       FIN = DAG.getFrameIndex(FI, getPointerTy());
3059
3060       if (Flags.isByVal()) {
3061         // Copy relative to framepointer.
3062         SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
3063         if (!StackPtr.getNode())
3064           StackPtr = DAG.getCopyFromReg(Chain, dl,
3065                                         RegInfo->getStackRegister(),
3066                                         getPointerTy());
3067         Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
3068
3069         MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
3070                                                          ArgChain,
3071                                                          Flags, DAG, dl));
3072       } else {
3073         // Store relative to framepointer.
3074         MemOpChains2.push_back(
3075           DAG.getStore(ArgChain, dl, Arg, FIN,
3076                        MachinePointerInfo::getFixedStack(FI),
3077                        false, false, 0));
3078       }
3079     }
3080
3081     if (!MemOpChains2.empty())
3082       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
3083
3084     // Store the return address to the appropriate stack slot.
3085     Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
3086                                      getPointerTy(), RegInfo->getSlotSize(),
3087                                      FPDiff, dl);
3088   }
3089
3090   // Build a sequence of copy-to-reg nodes chained together with token chain
3091   // and flag operands which copy the outgoing args into registers.
3092   SDValue InFlag;
3093   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
3094     Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
3095                              RegsToPass[i].second, InFlag);
3096     InFlag = Chain.getValue(1);
3097   }
3098
3099   if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
3100     assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
3101     // In the 64-bit large code model, we have to make all calls
3102     // through a register, since the call instruction's 32-bit
3103     // pc-relative offset may not be large enough to hold the whole
3104     // address.
3105   } else if (Callee->getOpcode() == ISD::GlobalAddress) {
3106     // If the callee is a GlobalAddress node (quite common, every direct call
3107     // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
3108     // it.
3109     GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee);
3110
3111     // We should use extra load for direct calls to dllimported functions in
3112     // non-JIT mode.
3113     const GlobalValue *GV = G->getGlobal();
3114     if (!GV->hasDLLImportStorageClass()) {
3115       unsigned char OpFlags = 0;
3116       bool ExtraLoad = false;
3117       unsigned WrapperKind = ISD::DELETED_NODE;
3118
3119       // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
3120       // external symbols must go through the PLT in PIC mode.  If the symbol
3121       // has hidden or protected visibility, or if it is static or local, then
3122       // we don't need to use the PLT - we can directly call it.
3123       if (Subtarget->isTargetELF() &&
3124           DAG.getTarget().getRelocationModel() == Reloc::PIC_ &&
3125           GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
3126         OpFlags = X86II::MO_PLT;
3127       } else if (Subtarget->isPICStyleStubAny() &&
3128                  (GV->isDeclaration() || GV->isWeakForLinker()) &&
3129                  (!Subtarget->getTargetTriple().isMacOSX() ||
3130                   Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
3131         // PC-relative references to external symbols should go through $stub,
3132         // unless we're building with the leopard linker or later, which
3133         // automatically synthesizes these stubs.
3134         OpFlags = X86II::MO_DARWIN_STUB;
3135       } else if (Subtarget->isPICStyleRIPRel() &&
3136                  isa<Function>(GV) &&
3137                  cast<Function>(GV)->getAttributes().
3138                    hasAttribute(AttributeSet::FunctionIndex,
3139                                 Attribute::NonLazyBind)) {
3140         // If the function is marked as non-lazy, generate an indirect call
3141         // which loads from the GOT directly. This avoids runtime overhead
3142         // at the cost of eager binding (and one extra byte of encoding).
3143         OpFlags = X86II::MO_GOTPCREL;
3144         WrapperKind = X86ISD::WrapperRIP;
3145         ExtraLoad = true;
3146       }
3147
3148       Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(),
3149                                           G->getOffset(), OpFlags);
3150
3151       // Add a wrapper if needed.
           // WrapperKind is only ever set to X86ISD::WrapperRIP above, so the
           // hard-coded opcode below matches it.
3152       if (WrapperKind != ISD::DELETED_NODE)
3153         Callee = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Callee);
3154       // Add extra indirection if needed.
3155       if (ExtraLoad)
3156         Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee,
3157                              MachinePointerInfo::getGOT(),
3158                              false, false, false, 0);
3159     }
3160   } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
3161     unsigned char OpFlags = 0;
3162
3163     // On ELF targets, in either X86-64 or X86-32 mode, direct calls to
3164     // external symbols should go through the PLT.
3165     if (Subtarget->isTargetELF() &&
3166         DAG.getTarget().getRelocationModel() == Reloc::PIC_) {
3167       OpFlags = X86II::MO_PLT;
3168     } else if (Subtarget->isPICStyleStubAny() &&
3169                (!Subtarget->getTargetTriple().isMacOSX() ||
3170                 Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
3171       // PC-relative references to external symbols should go through $stub,
3172       // unless we're building with the leopard linker or later, which
3173       // automatically synthesizes these stubs.
3174       OpFlags = X86II::MO_DARWIN_STUB;
3175     }
3176
3177     Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
3178                                          OpFlags);
3179   } else if (Subtarget->isTarget64BitILP32() &&
3180              Callee->getValueType(0) == MVT::i32) {
3181     // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI
3182     Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
3183   }
3184
3185   // Returns a chain & a flag for retval copy to use.
3186   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3187   SmallVector<SDValue, 8> Ops;
3188
       // For non-sibcall tail calls the call sequence is closed here, before the
       // TC_RETURN node is built.
3189   if (!IsSibcall && isTailCall) {
3190     Chain = DAG.getCALLSEQ_END(Chain,
3191                                DAG.getIntPtrConstant(NumBytesToPop, true),
3192                                DAG.getIntPtrConstant(0, true), InFlag, dl);
3193     InFlag = Chain.getValue(1);
3194   }
3195
3196   Ops.push_back(Chain);
3197   Ops.push_back(Callee);
3198
       // Tail calls carry FPDiff as an extra i32 operand after the callee.
3199   if (isTailCall)
3200     Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));
3201
3202   // Add argument registers to the end of the list so that they are known live
3203   // into the call.
3204   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
3205     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
3206                                   RegsToPass[i].second.getValueType()));
3207
3208   // Add a register mask operand representing the call-preserved registers.
3209   const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
3210   const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
3211   assert(Mask && "Missing call preserved mask for calling convention");
3212   Ops.push_back(DAG.getRegisterMask(Mask));
3213
3214   if (InFlag.getNode())
3215     Ops.push_back(InFlag);
3216
3217   if (isTailCall) {
3218     // We used to do:
3219     //// If this is the first return lowered for this function, add the regs
3220     //// to the liveout set for the function.
3221     // This isn't right, although it's probably harmless on x86; liveouts
3222     // should be computed from returns not tail calls.  Consider a void
3223     // function making a tail call to a function returning int.
3224     return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
3225   }
3226
3227   Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
3228   InFlag = Chain.getValue(1);
3229
3230   // Create the CALLSEQ_END node.
3231   unsigned NumBytesForCalleeToPop;
3232   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
3233                        DAG.getTarget().Options.GuaranteedTailCallOpt))
3234     NumBytesForCalleeToPop = NumBytes;    // Callee pops everything
3235   else if (!Is64Bit && !IsTailCallConvention(CallConv) &&
3236            !Subtarget->getTargetTriple().isOSMSVCRT() &&
3237            SR == StackStructReturn)
3238     // If this is a call to a struct-return function, the callee
3239     // pops the hidden struct pointer, so we have to push it back.
3240     // This is common for Darwin/X86, Linux & Mingw32 targets.
3241     // For MSVC Win32 targets, the caller pops the hidden struct pointer.
3242     NumBytesForCalleeToPop = 4;
3243   else
3244     NumBytesForCalleeToPop = 0;  // Callee pops nothing.
3245
3246   // Returns a flag for retval copy to use.
3247   if (!IsSibcall) {
3248     Chain = DAG.getCALLSEQ_END(Chain,
3249                                DAG.getIntPtrConstant(NumBytesToPop, true),
3250                                DAG.getIntPtrConstant(NumBytesForCalleeToPop,
3251                                                      true),
3252                                InFlag, dl);
3253     InFlag = Chain.getValue(1);
3254   }
3255
3256   // Handle result values, copying them out of physregs into vregs that we
3257   // return.
3258   return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
3259                          Ins, dl, DAG, InVals);
3260 }
3261
3262 //===----------------------------------------------------------------------===//
3263 //                Fast Calling Convention (tail call) implementation
3264 //===----------------------------------------------------------------------===//
3265
//  Like stdcall, the callee cleans up its arguments, except that ECX is
//  reserved for storing the tail called function address. Only 2 registers are
//  free for argument passing (inreg). Tail call optimization is performed
//  provided:
3270 //                * tailcallopt is enabled
3271 //                * caller/callee are fastcc
3272 //  On X86_64 architecture with GOT-style position independent code only local
3273 //  (within module) calls are supported at the moment.
3274 //  To keep the stack aligned according to platform abi the function
3275 //  GetAlignedArgumentStackSize ensures that argument delta is always multiples
3276 //  of stack alignment. (Dynamic linkers need this - darwin's dyld for example)
//  If a tail called function (the callee) has more arguments than the caller,
//  the caller needs to make sure that there is room to move the RETADDR to.
//  This is achieved by reserving an area the size of the argument delta right
//  after the original RETADDR, but before the saved framepointer or the
//  spilled registers,
//  e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4)
3282 //  stack layout:
3283 //    arg1
3284 //    arg2
3285 //    RETADDR
3286 //    [ new RETADDR
3287 //      move area ]
3288 //    (possible EBP)
3289 //    ESI
3290 //    EDI
3291 //    local1 ..
3292
3293 /// GetAlignedArgumentStackSize - Make the stack size align e.g 16n + 12 aligned
3294 /// for a 16 byte align requirement.
3295 unsigned
3296 X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
3297                                                SelectionDAG& DAG) const {
3298   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
3299   const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
3300   unsigned StackAlignment = TFI.getStackAlignment();
3301   uint64_t AlignMask = StackAlignment - 1;
3302   int64_t Offset = StackSize;
3303   unsigned SlotSize = RegInfo->getSlotSize();
3304   if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
3305     // Number smaller than 12 so just add the difference.
3306     Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
3307   } else {
3308     // Mask out lower bits, add stackalignment once plus the 12 bytes.
3309     Offset = ((~AlignMask) & Offset) + StackAlignment +
3310       (StackAlignment-SlotSize);
3311   }
3312   return Offset;
3313 }
3314
3315 /// MatchingStackOffset - Return true if the given stack call argument is
3316 /// already available in the same position (relatively) of the caller's
3317 /// incoming argument stack.
3318 static
3319 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
3320                          MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
3321                          const X86InstrInfo *TII) {
3322   unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
3323   int FI = INT_MAX;
3324   if (Arg.getOpcode() == ISD::CopyFromReg) {
3325     unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
3326     if (!TargetRegisterInfo::isVirtualRegister(VR))
3327       return false;
3328     MachineInstr *Def = MRI->getVRegDef(VR);
3329     if (!Def)
3330       return false;
3331     if (!Flags.isByVal()) {
3332       if (!TII->isLoadFromStackSlot(Def, FI))
3333         return false;
3334     } else {
3335       unsigned Opcode = Def->getOpcode();
3336       if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
3337            Opcode == X86::LEA64_32r) &&
3338           Def->getOperand(1).isFI()) {
3339         FI = Def->getOperand(1).getIndex();
3340         Bytes = Flags.getByValSize();
3341       } else
3342         return false;
3343     }
3344   } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
3345     if (Flags.isByVal())
3346       // ByVal argument is passed in as a pointer but it's now being
3347       // dereferenced. e.g.
3348       // define @foo(%struct.X* %A) {
3349       //   tail call @bar(%struct.X* byval %A)
3350       // }
3351       return false;
3352     SDValue Ptr = Ld->getBasePtr();
3353     FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
3354     if (!FINode)
3355       return false;
3356     FI = FINode->getIndex();
3357   } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
3358     FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
3359     FI = FINode->getIndex();
3360     Bytes = Flags.getByValSize();
3361   } else
3362     return false;
3363
3364   assert(FI != INT_MAX);
3365   if (!MFI->isFixedObjectIndex(FI))
3366     return false;
3367   return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI);
3368 }
3369
3370 /// IsEligibleForTailCallOptimization - Check whether the call is eligible
3371 /// for tail call optimization. Targets which want to do tail call
3372 /// optimization should implement this function.
3373 bool
3374 X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
3375                                                      CallingConv::ID CalleeCC,
3376                                                      bool isVarArg,
3377                                                      bool isCalleeStructRet,
3378                                                      bool isCallerStructRet,
3379                                                      Type *RetTy,
3380                                     const SmallVectorImpl<ISD::OutputArg> &Outs,
3381                                     const SmallVectorImpl<SDValue> &OutVals,
3382                                     const SmallVectorImpl<ISD::InputArg> &Ins,
3383                                                      SelectionDAG &DAG) const {
3384   if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
3385     return false;
3386
3387   // If -tailcallopt is specified, make fastcc functions tail-callable.
3388   const MachineFunction &MF = DAG.getMachineFunction();
3389   const Function *CallerF = MF.getFunction();
3390
3391   // If the function return type is x86_fp80 and the callee return type is not,
3392   // then the FP_EXTEND of the call result is not a nop. It's not safe to
3393   // perform a tailcall optimization here.
3394   if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
3395     return false;
3396
3397   CallingConv::ID CallerCC = CallerF->getCallingConv();
3398   bool CCMatch = CallerCC == CalleeCC;
3399   bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC);
3400   bool IsCallerWin64 = Subtarget->isCallingConvWin64(CallerCC);
3401
3402   if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
3403     if (IsTailCallConvention(CalleeCC) && CCMatch)
3404       return true;
3405     return false;
3406   }
3407
3408   // Look for obvious safe cases to perform tail call optimization that do not
3409   // require ABI changes. This is what gcc calls sibcall.
3410
3411   // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
3412   // emit a special epilogue.
3413   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
3414   if (RegInfo->needsStackRealignment(MF))
3415     return false;
3416
3417   // Also avoid sibcall optimization if either caller or callee uses struct
3418   // return semantics.
3419   if (isCalleeStructRet || isCallerStructRet)
3420     return false;
3421
3422   // An stdcall/thiscall caller is expected to clean up its arguments; the
3423   // callee isn't going to do that.
3424   // FIXME: this is more restrictive than needed. We could produce a tailcall
3425   // when the stack adjustment matches. For example, with a thiscall that takes
3426   // only one argument.
3427   if (!CCMatch && (CallerCC == CallingConv::X86_StdCall ||
3428                    CallerCC == CallingConv::X86_ThisCall))
3429     return false;
3430
3431   // Do not sibcall optimize vararg calls unless all arguments are passed via
3432   // registers.
3433   if (isVarArg && !Outs.empty()) {
3434
3435     // Optimizing for varargs on Win64 is unlikely to be safe without
3436     // additional testing.
3437     if (IsCalleeWin64 || IsCallerWin64)
3438       return false;
3439
3440     SmallVector<CCValAssign, 16> ArgLocs;
3441     CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
3442                    *DAG.getContext());
3443
3444     CCInfo.AnalyzeCallOperands(Outs, CC_X86);
3445     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
3446       if (!ArgLocs[i].isRegLoc())
3447         return false;
3448   }
3449
3450   // If the call result is in ST0 / ST1, it needs to be popped off the x87
3451   // stack.  Therefore, if it's not used by the call it is not safe to optimize
3452   // this into a sibcall.
3453   bool Unused = false;
3454   for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
3455     if (!Ins[i].Used) {
3456       Unused = true;
3457       break;
3458     }
3459   }
3460   if (Unused) {
3461     SmallVector<CCValAssign, 16> RVLocs;
3462     CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(), RVLocs,
3463                    *DAG.getContext());
3464     CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
3465     for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
3466       CCValAssign &VA = RVLocs[i];
3467       if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
3468         return false;
3469     }
3470   }
3471
3472   // If the calling conventions do not match, then we'd better make sure the
3473   // results are returned in the same way as what the caller expects.
3474   if (!CCMatch) {
3475     SmallVector<CCValAssign, 16> RVLocs1;
3476     CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), RVLocs1,
3477                     *DAG.getContext());
3478     CCInfo1.AnalyzeCallResult(Ins, RetCC_X86);
3479
3480     SmallVector<CCValAssign, 16> RVLocs2;
3481     CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), RVLocs2,
3482                     *DAG.getContext());
3483     CCInfo2.AnalyzeCallResult(Ins, RetCC_X86);
3484
3485     if (RVLocs1.size() != RVLocs2.size())
3486       return false;
3487     for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
3488       if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
3489         return false;
3490       if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
3491         return false;
3492       if (RVLocs1[i].isRegLoc()) {
3493         if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
3494           return false;
3495       } else {
3496         if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
3497           return false;
3498       }
3499     }
3500   }
3501
3502   // If the callee takes no arguments then go on to check the results of the
3503   // call.
3504   if (!Outs.empty()) {
3505     // Check if stack adjustment is needed. For now, do not do this if any
3506     // argument is passed on the stack.
3507     SmallVector<CCValAssign, 16> ArgLocs;
3508     CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
3509                    *DAG.getContext());
3510
3511     // Allocate shadow area for Win64
3512     if (IsCalleeWin64)
3513       CCInfo.AllocateStack(32, 8);
3514
3515     CCInfo.AnalyzeCallOperands(Outs, CC_X86);
3516     if (CCInfo.getNextStackOffset()) {
3517       MachineFunction &MF = DAG.getMachineFunction();
3518       if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn())
3519         return false;
3520
3521       // Check if the arguments are already laid out in the right way as
3522       // the caller's fixed stack objects.
3523       MachineFrameInfo *MFI = MF.getFrameInfo();
3524       const MachineRegisterInfo *MRI = &MF.getRegInfo();
3525       const X86InstrInfo *TII = Subtarget->getInstrInfo();
3526       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3527         CCValAssign &VA = ArgLocs[i];
3528         SDValue Arg = OutVals[i];
3529         ISD::ArgFlagsTy Flags = Outs[i].Flags;
3530         if (VA.getLocInfo() == CCValAssign::Indirect)
3531           return false;
3532         if (!VA.isRegLoc()) {
3533           if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
3534                                    MFI, MRI, TII))
3535             return false;
3536         }
3537       }
3538     }
3539
3540     // If the tailcall address may be in a register, then make sure it's
3541     // possible to register allocate for it. In 32-bit, the call address can
3542     // only target EAX, EDX, or ECX since the tail call must be scheduled after
3543     // callee-saved registers are restored. These happen to be the same
3544     // registers used to pass 'inreg' arguments so watch out for those.
3545     if (!Subtarget->is64Bit() &&
3546         ((!isa<GlobalAddressSDNode>(Callee) &&
3547           !isa<ExternalSymbolSDNode>(Callee)) ||
3548          DAG.getTarget().getRelocationModel() == Reloc::PIC_)) {
3549       unsigned NumInRegs = 0;
3550       // In PIC we need an extra register to formulate the address computation
3551       // for the callee.
3552       unsigned MaxInRegs =
3553         (DAG.getTarget().getRelocationModel() == Reloc::PIC_) ? 2 : 3;
3554
3555       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3556         CCValAssign &VA = ArgLocs[i];
3557         if (!VA.isRegLoc())
3558           continue;
3559         unsigned Reg = VA.getLocReg();
3560         switch (Reg) {
3561         default: break;
3562         case X86::EAX: case X86::EDX: case X86::ECX:
3563           if (++NumInRegs == MaxInRegs)
3564             return false;
3565           break;
3566         }
3567       }
3568     }
3569   }
3570
3571   return true;
3572 }
3573
3574 FastISel *
3575 X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
3576                                   const TargetLibraryInfo *libInfo) const {
3577   return X86::createFastISel(funcInfo, libInfo);
3578 }
3579
3580 //===----------------------------------------------------------------------===//
3581 //                           Other Lowering Hooks
3582 //===----------------------------------------------------------------------===//
3583
3584 static bool MayFoldLoad(SDValue Op) {
3585   return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
3586 }
3587
3588 static bool MayFoldIntoStore(SDValue Op) {
3589   return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
3590 }
3591
3592 static bool isTargetShuffle(unsigned Opcode) {
3593   switch(Opcode) {
3594   default: return false;
3595   case X86ISD::BLENDI:
3596   case X86ISD::PSHUFB:
3597   case X86ISD::PSHUFD:
3598   case X86ISD::PSHUFHW:
3599   case X86ISD::PSHUFLW:
3600   case X86ISD::SHUFP:
3601   case X86ISD::PALIGNR:
3602   case X86ISD::MOVLHPS:
3603   case X86ISD::MOVLHPD:
3604   case X86ISD::MOVHLPS:
3605   case X86ISD::MOVLPS:
3606   case X86ISD::MOVLPD:
3607   case X86ISD::MOVSHDUP:
3608   case X86ISD::MOVSLDUP:
3609   case X86ISD::MOVDDUP:
3610   case X86ISD::MOVSS:
3611   case X86ISD::MOVSD:
3612   case X86ISD::UNPCKL:
3613   case X86ISD::UNPCKH:
3614   case X86ISD::VPERMILPI:
3615   case X86ISD::VPERM2X128:
3616   case X86ISD::VPERMI:
3617     return true;
3618   }
3619 }
3620
3621 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
3622                                     SDValue V1, SelectionDAG &DAG) {
3623   switch(Opc) {
3624   default: llvm_unreachable("Unknown x86 shuffle node");
3625   case X86ISD::MOVSHDUP:
3626   case X86ISD::MOVSLDUP:
3627   case X86ISD::MOVDDUP:
3628     return DAG.getNode(Opc, dl, VT, V1);
3629   }
3630 }
3631
3632 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
3633                                     SDValue V1, unsigned TargetMask,
3634                                     SelectionDAG &DAG) {
3635   switch(Opc) {
3636   default: llvm_unreachable("Unknown x86 shuffle node");
3637   case X86ISD::PSHUFD:
3638   case X86ISD::PSHUFHW:
3639   case X86ISD::PSHUFLW:
3640   case X86ISD::VPERMILPI:
3641   case X86ISD::VPERMI:
3642     return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8));
3643   }
3644 }
3645
3646 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
3647                                     SDValue V1, SDValue V2, unsigned TargetMask,
3648                                     SelectionDAG &DAG) {
3649   switch(Opc) {
3650   default: llvm_unreachable("Unknown x86 shuffle node");
3651   case X86ISD::PALIGNR:
3652   case X86ISD::VALIGN:
3653   case X86ISD::SHUFP:
3654   case X86ISD::VPERM2X128:
3655     return DAG.getNode(Opc, dl, VT, V1, V2,
3656                        DAG.getConstant(TargetMask, MVT::i8));
3657   }
3658 }
3659
3660 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
3661                                     SDValue V1, SDValue V2, SelectionDAG &DAG) {
3662   switch(Opc) {
3663   default: llvm_unreachable("Unknown x86 shuffle node");
3664   case X86ISD::MOVLHPS:
3665   case X86ISD::MOVLHPD:
3666   case X86ISD::MOVHLPS:
3667   case X86ISD::MOVLPS:
3668   case X86ISD::MOVLPD:
3669   case X86ISD::MOVSS:
3670   case X86ISD::MOVSD:
3671   case X86ISD::UNPCKL:
3672   case X86ISD::UNPCKH:
3673     return DAG.getNode(Opc, dl, VT, V1, V2);
3674   }
3675 }
3676
3677 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
3678   MachineFunction &MF = DAG.getMachineFunction();
3679   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
3680   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
3681   int ReturnAddrIndex = FuncInfo->getRAIndex();
3682
3683   if (ReturnAddrIndex == 0) {
3684     // Set up a frame object for the return address.
3685     unsigned SlotSize = RegInfo->getSlotSize();
3686     ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize,
3687                                                            -(int64_t)SlotSize,
3688                                                            false);
3689     FuncInfo->setRAIndex(ReturnAddrIndex);
3690   }
3691
3692   return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
3693 }
3694
3695 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
3696                                        bool hasSymbolicDisplacement) {
3697   // Offset should fit into 32 bit immediate field.
3698   if (!isInt<32>(Offset))
3699     return false;
3700
3701   // If we don't have a symbolic displacement - we don't have any extra
3702   // restrictions.
3703   if (!hasSymbolicDisplacement)
3704     return true;
3705
3706   // FIXME: Some tweaks might be needed for medium code model.
3707   if (M != CodeModel::Small && M != CodeModel::Kernel)
3708     return false;
3709
3710   // For small code model we assume that latest object is 16MB before end of 31
3711   // bits boundary. We may also accept pretty large negative constants knowing
3712   // that all objects are in the positive half of address space.
3713   if (M == CodeModel::Small && Offset < 16*1024*1024)
3714     return true;
3715
3716   // For kernel code model we know that all object resist in the negative half
3717   // of 32bits address space. We may not accept negative offsets, since they may
3718   // be just off and we may accept pretty large positive ones.
3719   if (M == CodeModel::Kernel && Offset >= 0)
3720     return true;
3721
3722   return false;
3723 }
3724
3725 /// isCalleePop - Determines whether the callee is required to pop its
3726 /// own arguments. Callee pop is necessary to support tail calls.
3727 bool X86::isCalleePop(CallingConv::ID CallingConv,
3728                       bool is64Bit, bool IsVarArg, bool TailCallOpt) {
3729   switch (CallingConv) {
3730   default:
3731     return false;
3732   case CallingConv::X86_StdCall:
3733   case CallingConv::X86_FastCall:
3734   case CallingConv::X86_ThisCall:
3735     return !is64Bit;
3736   case CallingConv::Fast:
3737   case CallingConv::GHC:
3738   case CallingConv::HiPE:
3739     if (IsVarArg)
3740       return false;
3741     return TailCallOpt;
3742   }
3743 }
3744
3745 /// \brief Return true if the condition is an unsigned comparison operation.
3746 static bool isX86CCUnsigned(unsigned X86CC) {
3747   switch (X86CC) {
3748   default: llvm_unreachable("Invalid integer condition!");
3749   case X86::COND_E:     return true;
3750   case X86::COND_G:     return false;
3751   case X86::COND_GE:    return false;
3752   case X86::COND_L:     return false;
3753   case X86::COND_LE:    return false;
3754   case X86::COND_NE:    return true;
3755   case X86::COND_B:     return true;
3756   case X86::COND_A:     return true;
3757   case X86::COND_BE:    return true;
3758   case X86::COND_AE:    return true;
3759   }
3760   llvm_unreachable("covered switch fell through?!");
3761 }
3762
3763 /// TranslateX86CC - do a one to one translation of a ISD::CondCode to the X86
3764 /// specific condition code, returning the condition code and the LHS/RHS of the
3765 /// comparison to make.
3766 static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
3767                                SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
3768   if (!isFP) {
3769     if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
3770       if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
3771         // X > -1   -> X == 0, jump !sign.
3772         RHS = DAG.getConstant(0, RHS.getValueType());
3773         return X86::COND_NS;
3774       }
3775       if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
3776         // X < 0   -> X == 0, jump on sign.
3777         return X86::COND_S;
3778       }
3779       if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
3780         // X < 1   -> X <= 0
3781         RHS = DAG.getConstant(0, RHS.getValueType());
3782         return X86::COND_LE;
3783       }
3784     }
3785
3786     switch (SetCCOpcode) {
3787     default: llvm_unreachable("Invalid integer condition!");
3788     case ISD::SETEQ:  return X86::COND_E;
3789     case ISD::SETGT:  return X86::COND_G;
3790     case ISD::SETGE:  return X86::COND_GE;
3791     case ISD::SETLT:  return X86::COND_L;
3792     case ISD::SETLE:  return X86::COND_LE;
3793     case ISD::SETNE:  return X86::COND_NE;
3794     case ISD::SETULT: return X86::COND_B;
3795     case ISD::SETUGT: return X86::COND_A;
3796     case ISD::SETULE: return X86::COND_BE;
3797     case ISD::SETUGE: return X86::COND_AE;
3798     }
3799   }
3800
3801   // First determine if it is required or is profitable to flip the operands.
3802
3803   // If LHS is a foldable load, but RHS is not, flip the condition.
3804   if (ISD::isNON_EXTLoad(LHS.getNode()) &&
3805       !ISD::isNON_EXTLoad(RHS.getNode())) {
3806     SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
3807     std::swap(LHS, RHS);
3808   }
3809
3810   switch (SetCCOpcode) {
3811   default: break;
3812   case ISD::SETOLT:
3813   case ISD::SETOLE:
3814   case ISD::SETUGT:
3815   case ISD::SETUGE:
3816     std::swap(LHS, RHS);
3817     break;
3818   }
3819
3820   // On a floating point condition, the flags are set as follows:
3821   // ZF  PF  CF   op
3822   //  0 | 0 | 0 | X > Y
3823   //  0 | 0 | 1 | X < Y
3824   //  1 | 0 | 0 | X == Y
3825   //  1 | 1 | 1 | unordered
3826   switch (SetCCOpcode) {
3827   default: llvm_unreachable("Condcode should be pre-legalized away");
3828   case ISD::SETUEQ:
3829   case ISD::SETEQ:   return X86::COND_E;
3830   case ISD::SETOLT:              // flipped
3831   case ISD::SETOGT:
3832   case ISD::SETGT:   return X86::COND_A;
3833   case ISD::SETOLE:              // flipped
3834   case ISD::SETOGE:
3835   case ISD::SETGE:   return X86::COND_AE;
3836   case ISD::SETUGT:              // flipped
3837   case ISD::SETULT:
3838   case ISD::SETLT:   return X86::COND_B;
3839   case ISD::SETUGE:              // flipped
3840   case ISD::SETULE:
3841   case ISD::SETLE:   return X86::COND_BE;
3842   case ISD::SETONE:
3843   case ISD::SETNE:   return X86::COND_NE;
3844   case ISD::SETUO:   return X86::COND_P;
3845   case ISD::SETO:    return X86::COND_NP;
3846   case ISD::SETOEQ:
3847   case ISD::SETUNE:  return X86::COND_INVALID;
3848   }
3849 }
3850
3851 /// hasFPCMov - is there a floating point cmov for the specific X86 condition
3852 /// code. Current x86 isa includes the following FP cmov instructions:
3853 /// fcmovb, fcomvbe, fcomve, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
3854 static bool hasFPCMov(unsigned X86CC) {
3855   switch (X86CC) {
3856   default:
3857     return false;
3858   case X86::COND_B:
3859   case X86::COND_BE:
3860   case X86::COND_E:
3861   case X86::COND_P:
3862   case X86::COND_A:
3863   case X86::COND_AE:
3864   case X86::COND_NE:
3865   case X86::COND_NP:
3866     return true;
3867   }
3868 }
3869
3870 /// isFPImmLegal - Returns true if the target can instruction select the
3871 /// specified FP immediate natively. If false, the legalizer will
3872 /// materialize the FP immediate as a load from a constant pool.
3873 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
3874   for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
3875     if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
3876       return true;
3877   }
3878   return false;
3879 }
3880
3881 bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
3882                                               ISD::LoadExtType ExtTy,
3883                                               EVT NewVT) const {
3884   // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
3885   // relocation target a movq or addq instruction: don't let the load shrink.
3886   SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
3887   if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
3888     if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
3889       return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
3890   return true;
3891 }
3892
3893 /// \brief Returns true if it is beneficial to convert a load of a constant
3894 /// to just the constant itself.
3895 bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
3896                                                           Type *Ty) const {
3897   assert(Ty->isIntegerTy());
3898
3899   unsigned BitSize = Ty->getPrimitiveSizeInBits();
3900   if (BitSize == 0 || BitSize > 64)
3901     return false;
3902   return true;
3903 }
3904
3905 bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT,
3906                                                 unsigned Index) const {
3907   if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
3908     return false;
3909
3910   return (Index == 0 || Index == ResVT.getVectorNumElements());
3911 }
3912
3913 bool X86TargetLowering::isCheapToSpeculateCttz() const {
3914   // Speculate cttz only if we can directly use TZCNT.
3915   return Subtarget->hasBMI();
3916 }
3917
3918 bool X86TargetLowering::isCheapToSpeculateCtlz() const {
3919   // Speculate ctlz only if we can directly use LZCNT.
3920   return Subtarget->hasLZCNT();
3921 }
3922
/// isUndefOrInRange - Return true if Val is undef (negative) or if its value
/// falls within the half-open range [Low, Hi).
static bool isUndefOrInRange(int Val, int Low, int Hi) {
  if (Val < 0)
    return true;
  return Low <= Val && Val < Hi;
}
3928
/// isUndefOrEqual - Return true if Val is undef (less than zero) or equal to
/// CmpVal.
static bool isUndefOrEqual(int Val, int CmpVal) {
  if (Val < 0)
    return true;
  return Val == CmpVal;
}
3934
3935 /// isSequentialOrUndefInRange - Return true if every element in Mask, beginning
3936 /// from position Pos and ending in Pos+Size, falls within the specified
3937 /// sequential range (Low, Low+Size]. or is undef.
3938 static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
3939                                        unsigned Pos, unsigned Size, int Low) {
3940   for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
3941     if (!isUndefOrEqual(Mask[i], Low))
3942       return false;
3943   return true;
3944 }
3945
3946 /// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
3947 /// is suitable for input to PSHUFD. That is, it doesn't reference the other
3948 /// operand - by default will match for first operand.
3949 static bool isPSHUFDMask(ArrayRef<int> Mask, MVT VT,
3950                          bool TestSecondOperand = false) {
3951   if (VT != MVT::v4f32 && VT != MVT::v4i32 &&
3952       VT != MVT::v2f64 && VT != MVT::v2i64)
3953     return false;
3954
3955   unsigned NumElems = VT.getVectorNumElements();
3956   unsigned Lo = TestSecondOperand ? NumElems : 0;
3957   unsigned Hi = Lo + NumElems;
3958
3959   for (unsigned i = 0; i < NumElems; ++i)
3960     if (!isUndefOrInRange(Mask[i], (int)Lo, (int)Hi))
3961       return false;
3962
3963   return true;
3964 }
3965
3966 /// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that
3967 /// is suitable for input to PSHUFHW.
3968 static bool isPSHUFHWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
3969   if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
3970     return false;
3971
3972   // Lower quadword copied in order or undef.
3973   if (!isSequentialOrUndefInRange(Mask, 0, 4, 0))
3974     return false;
3975
3976   // Upper quadword shuffled.
3977   for (unsigned i = 4; i != 8; ++i)
3978     if (!isUndefOrInRange(Mask[i], 4, 8))
3979       return false;
3980
3981   if (VT == MVT::v16i16) {
3982     // Lower quadword copied in order or undef.
3983     if (!isSequentialOrUndefInRange(Mask, 8, 4, 8))
3984       return false;
3985
3986     // Upper quadword shuffled.
3987     for (unsigned i = 12; i != 16; ++i)
3988       if (!isUndefOrInRange(Mask[i], 12, 16))
3989         return false;
3990   }
3991
3992   return true;
3993 }
3994
3995 /// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that
3996 /// is suitable for input to PSHUFLW.
3997 static bool isPSHUFLWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
3998   if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
3999     return false;
4000
4001   // Upper quadword copied in order.
4002   if (!isSequentialOrUndefInRange(Mask, 4, 4, 4))
4003     return false;
4004
4005   // Lower quadword shuffled.
4006   for (unsigned i = 0; i != 4; ++i)
4007     if (!isUndefOrInRange(Mask[i], 0, 4))
4008       return false;
4009
4010   if (VT == MVT::v16i16) {
4011     // Upper quadword copied in order.
4012     if (!isSequentialOrUndefInRange(Mask, 12, 4, 12))
4013       return false;
4014
4015     // Lower quadword shuffled.
4016     for (unsigned i = 8; i != 12; ++i)
4017       if (!isUndefOrInRange(Mask[i], 8, 12))
4018         return false;
4019   }
4020
4021   return true;
4022 }
4023
4024 /// \brief Return true if the mask specifies a shuffle of elements that is
4025 /// suitable for input to intralane (palignr) or interlane (valign) vector
4026 /// right-shift.
4027 static bool isAlignrMask(ArrayRef<int> Mask, MVT VT, bool InterLane) {
4028   unsigned NumElts = VT.getVectorNumElements();
4029   unsigned NumLanes = InterLane ? 1: VT.getSizeInBits()/128;
4030   unsigned NumLaneElts = NumElts/NumLanes;
4031
4032   // Do not handle 64-bit element shuffles with palignr.
4033   if (NumLaneElts == 2)
4034     return false;
4035
4036   for (unsigned l = 0; l != NumElts; l+=NumLaneElts) {
4037     unsigned i;
4038     for (i = 0; i != NumLaneElts; ++i) {
4039       if (Mask[i+l] >= 0)
4040         break;
4041     }
4042
4043     // Lane is all undef, go to next lane
4044     if (i == NumLaneElts)
4045       continue;
4046
4047     int Start = Mask[i+l];
4048
4049     // Make sure its in this lane in one of the sources
4050     if (!isUndefOrInRange(Start, l, l+NumLaneElts) &&
4051         !isUndefOrInRange(Start, l+NumElts, l+NumElts+NumLaneElts))
4052       return false;
4053
4054     // If not lane 0, then we must match lane 0
4055     if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Start, Mask[i]+l))
4056       return false;
4057
4058     // Correct second source to be contiguous with first source
4059     if (Start >= (int)NumElts)
4060       Start -= NumElts - NumLaneElts;
4061
4062     // Make sure we're shifting in the right direction.
4063     if (Start <= (int)(i+l))
4064       return false;
4065
4066     Start -= i;
4067
4068     // Check the rest of the elements to see if they are consecutive.
4069     for (++i; i != NumLaneElts; ++i) {
4070       int Idx = Mask[i+l];
4071
4072       // Make sure its in this lane
4073       if (!isUndefOrInRange(Idx, l, l+NumLaneElts) &&
4074           !isUndefOrInRange(Idx, l+NumElts, l+NumElts+NumLaneElts))
4075         return false;
4076
4077       // If not lane 0, then we must match lane 0
4078       if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Idx, Mask[i]+l))
4079         return false;
4080
4081       if (Idx >= (int)NumElts)
4082         Idx -= NumElts - NumLaneElts;
4083
4084       if (!isUndefOrEqual(Idx, Start+i))
4085         return false;
4086
4087     }
4088   }
4089
4090   return true;
4091 }
4092
4093 /// \brief Return true if the node specifies a shuffle of elements that is
4094 /// suitable for input to PALIGNR.
4095 static bool isPALIGNRMask(ArrayRef<int> Mask, MVT VT,
4096                           const X86Subtarget *Subtarget) {
4097   if ((VT.is128BitVector() && !Subtarget->hasSSSE3()) ||
4098       (VT.is256BitVector() && !Subtarget->hasInt256()) ||
4099       VT.is512BitVector())
4100     // FIXME: Add AVX512BW.
4101     return false;
4102
4103   return isAlignrMask(Mask, VT, false);
4104 }
4105
4106 /// \brief Return true if the node specifies a shuffle of elements that is
4107 /// suitable for input to VALIGN.
4108 static bool isVALIGNMask(ArrayRef<int> Mask, MVT VT,
4109                           const X86Subtarget *Subtarget) {
4110   // FIXME: Add AVX512VL.
4111   if (!VT.is512BitVector() || !Subtarget->hasAVX512())
4112     return false;
4113   return isAlignrMask(Mask, VT, true);
4114 }
4115
4116 /// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
4117 /// the two vector operands have swapped position.
4118 static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask,
4119                                      unsigned NumElems) {
4120   for (unsigned i = 0; i != NumElems; ++i) {
4121     int idx = Mask[i];
4122     if (idx < 0)
4123       continue;
4124     else if (idx < (int)NumElems)
4125       Mask[i] = idx + NumElems;
4126     else
4127       Mask[i] = idx - NumElems;
4128   }
4129 }
4130
4131 /// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
4132 /// specifies a shuffle of elements that is suitable for input to 128/256-bit
4133 /// SHUFPS and SHUFPD. If Commuted is true, then it checks for sources to be
4134 /// reverse of what x86 shuffles want.
4135 static bool isSHUFPMask(ArrayRef<int> Mask, MVT VT, bool Commuted = false) {
4136
4137   unsigned NumElems = VT.getVectorNumElements();
4138   unsigned NumLanes = VT.getSizeInBits()/128;
4139   unsigned NumLaneElems = NumElems/NumLanes;
4140
4141   if (NumLaneElems != 2 && NumLaneElems != 4)
4142     return false;
4143
4144   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
4145   bool symetricMaskRequired =
4146     (VT.getSizeInBits() >= 256) && (EltSize == 32);
4147
4148   // VSHUFPSY divides the resulting vector into 4 chunks.
4149   // The sources are also splitted into 4 chunks, and each destination
4150   // chunk must come from a different source chunk.
4151   //
4152   //  SRC1 =>   X7    X6    X5    X4    X3    X2    X1    X0
4153   //  SRC2 =>   Y7    Y6    Y5    Y4    Y3    Y2    Y1    Y9
4154   //
4155   //  DST  =>  Y7..Y4,   Y7..Y4,   X7..X4,   X7..X4,
4156   //           Y3..Y0,   Y3..Y0,   X3..X0,   X3..X0
4157   //
4158   // VSHUFPDY divides the resulting vector into 4 chunks.
4159   // The sources are also splitted into 4 chunks, and each destination
4160   // chunk must come from a different source chunk.
4161   //
4162   //  SRC1 =>      X3       X2       X1       X0
4163   //  SRC2 =>      Y3       Y2       Y1       Y0
4164   //
4165   //  DST  =>  Y3..Y2,  X3..X2,  Y1..Y0,  X1..X0
4166   //
4167   SmallVector<int, 4> MaskVal(NumLaneElems, -1);
4168   unsigned HalfLaneElems = NumLaneElems/2;
4169   for (unsigned l = 0; l != NumElems; l += NumLaneElems) {
4170     for (unsigned i = 0; i != NumLaneElems; ++i) {
4171       int Idx = Mask[i+l];
4172       unsigned RngStart = l + ((Commuted == (i<HalfLaneElems)) ? NumElems : 0);
4173       if (!isUndefOrInRange(Idx, RngStart, RngStart+NumLaneElems))
4174         return false;
4175       // For VSHUFPSY, the mask of the second half must be the same as the
4176       // first but with the appropriate offsets. This works in the same way as
4177       // VPERMILPS works with masks.
4178       if (!symetricMaskRequired || Idx < 0)
4179         continue;
4180       if (MaskVal[i] < 0) {
4181         MaskVal[i] = Idx - l;
4182         continue;
4183       }
4184       if ((signed)(Idx - l) != MaskVal[i])
4185         return false;
4186     }
4187   }
4188
4189   return true;
4190 }
4191
4192 /// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
4193 /// specifies a shuffle of elements that is suitable for input to MOVHLPS.
4194 static bool isMOVHLPSMask(ArrayRef<int> Mask, MVT VT) {
4195   if (!VT.is128BitVector())
4196     return false;
4197
4198   unsigned NumElems = VT.getVectorNumElements();
4199
4200   if (NumElems != 4)
4201     return false;
4202
4203   // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3
4204   return isUndefOrEqual(Mask[0], 6) &&
4205          isUndefOrEqual(Mask[1], 7) &&
4206          isUndefOrEqual(Mask[2], 2) &&
4207          isUndefOrEqual(Mask[3], 3);
4208 }
4209
4210 /// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
4211 /// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
4212 /// <2, 3, 2, 3>
4213 static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, MVT VT) {
4214   if (!VT.is128BitVector())
4215     return false;
4216
4217   unsigned NumElems = VT.getVectorNumElements();
4218
4219   if (NumElems != 4)
4220     return false;
4221
4222   return isUndefOrEqual(Mask[0], 2) &&
4223          isUndefOrEqual(Mask[1], 3) &&
4224          isUndefOrEqual(Mask[2], 2) &&
4225          isUndefOrEqual(Mask[3], 3);
4226 }
4227
4228 /// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
4229 /// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
4230 static bool isMOVLPMask(ArrayRef<int> Mask, MVT VT) {
4231   if (!VT.is128BitVector())
4232     return false;
4233
4234   unsigned NumElems = VT.getVectorNumElements();
4235
4236   if (NumElems != 2 && NumElems != 4)
4237     return false;
4238
4239   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
4240     if (!isUndefOrEqual(Mask[i], i + NumElems))
4241       return false;
4242
4243   for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
4244     if (!isUndefOrEqual(Mask[i], i))
4245       return false;
4246
4247   return true;
4248 }
4249
4250 /// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand
4251 /// specifies a shuffle of elements that is suitable for input to MOVLHPS.
4252 static bool isMOVLHPSMask(ArrayRef<int> Mask, MVT VT) {
4253   if (!VT.is128BitVector())
4254     return false;
4255
4256   unsigned NumElems = VT.getVectorNumElements();
4257
4258   if (NumElems != 2 && NumElems != 4)
4259     return false;
4260
4261   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
4262     if (!isUndefOrEqual(Mask[i], i))
4263       return false;
4264
4265   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
4266     if (!isUndefOrEqual(Mask[i + e], i + NumElems))
4267       return false;
4268
4269   return true;
4270 }
4271
4272 /// isINSERTPSMask - Return true if the specified VECTOR_SHUFFLE operand
4273 /// specifies a shuffle of elements that is suitable for input to INSERTPS.
4274 /// i. e: If all but one element come from the same vector.
4275 static bool isINSERTPSMask(ArrayRef<int> Mask, MVT VT) {
4276   // TODO: Deal with AVX's VINSERTPS
4277   if (!VT.is128BitVector() || (VT != MVT::v4f32 && VT != MVT::v4i32))
4278     return false;
4279
4280   unsigned CorrectPosV1 = 0;
4281   unsigned CorrectPosV2 = 0;
4282   for (int i = 0, e = (int)VT.getVectorNumElements(); i != e; ++i) {
4283     if (Mask[i] == -1) {
4284       ++CorrectPosV1;
4285       ++CorrectPosV2;
4286       continue;
4287     }
4288
4289     if (Mask[i] == i)
4290       ++CorrectPosV1;
4291     else if (Mask[i] == i + 4)
4292       ++CorrectPosV2;
4293   }
4294
4295   if (CorrectPosV1 == 3 || CorrectPosV2 == 3)
4296     // We have 3 elements (undefs count as elements from any vector) from one
4297     // vector, and one from another.
4298     return true;
4299
4300   return false;
4301 }
4302
4303 //
4304 // Some special combinations that can be optimized.
4305 //
4306 static
4307 SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp,
4308                                SelectionDAG &DAG) {
4309   MVT VT = SVOp->getSimpleValueType(0);
4310   SDLoc dl(SVOp);
4311
4312   if (VT != MVT::v8i32 && VT != MVT::v8f32)
4313     return SDValue();
4314
4315   ArrayRef<int> Mask = SVOp->getMask();
4316
4317   // These are the special masks that may be optimized.
4318   static const int MaskToOptimizeEven[] = {0, 8, 2, 10, 4, 12, 6, 14};
4319   static const int MaskToOptimizeOdd[]  = {1, 9, 3, 11, 5, 13, 7, 15};
4320   bool MatchEvenMask = true;
4321   bool MatchOddMask  = true;
4322   for (int i=0; i<8; ++i) {
4323     if (!isUndefOrEqual(Mask[i], MaskToOptimizeEven[i]))
4324       MatchEvenMask = false;
4325     if (!isUndefOrEqual(Mask[i], MaskToOptimizeOdd[i]))
4326       MatchOddMask = false;
4327   }
4328
4329   if (!MatchEvenMask && !MatchOddMask)
4330     return SDValue();
4331
4332   SDValue UndefNode = DAG.getNode(ISD::UNDEF, dl, VT);
4333
4334   SDValue Op0 = SVOp->getOperand(0);
4335   SDValue Op1 = SVOp->getOperand(1);
4336
4337   if (MatchEvenMask) {
4338     // Shift the second operand right to 32 bits.
4339     static const int ShiftRightMask[] = {-1, 0, -1, 2, -1, 4, -1, 6 };
4340     Op1 = DAG.getVectorShuffle(VT, dl, Op1, UndefNode, ShiftRightMask);
4341   } else {
4342     // Shift the first operand left to 32 bits.
4343     static const int ShiftLeftMask[] = {1, -1, 3, -1, 5, -1, 7, -1 };
4344     Op0 = DAG.getVectorShuffle(VT, dl, Op0, UndefNode, ShiftLeftMask);
4345   }
4346   static const int BlendMask[] = {0, 9, 2, 11, 4, 13, 6, 15};
4347   return DAG.getVectorShuffle(VT, dl, Op0, Op1, BlendMask);
4348 }
4349
4350 /// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
4351 /// specifies a shuffle of elements that is suitable for input to UNPCKL.
4352 static bool isUNPCKLMask(ArrayRef<int> Mask, MVT VT,
4353                          bool HasInt256, bool V2IsSplat = false) {
4354
4355   assert(VT.getSizeInBits() >= 128 &&
4356          "Unsupported vector type for unpckl");
4357
4358   unsigned NumElts = VT.getVectorNumElements();
4359   if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
4360       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
4361     return false;
4362
4363   assert((!VT.is512BitVector() || VT.getScalarType().getSizeInBits() >= 32) &&
4364          "Unsupported vector type for unpckh");
4365
4366   // AVX defines UNPCK* to operate independently on 128-bit lanes.
4367   unsigned NumLanes = VT.getSizeInBits()/128;
4368   unsigned NumLaneElts = NumElts/NumLanes;
4369
4370   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
4371     for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
4372       int BitI  = Mask[l+i];
4373       int BitI1 = Mask[l+i+1];
4374       if (!isUndefOrEqual(BitI, j))
4375         return false;
4376       if (V2IsSplat) {
4377         if (!isUndefOrEqual(BitI1, NumElts))
4378           return false;
4379       } else {
4380         if (!isUndefOrEqual(BitI1, j + NumElts))
4381           return false;
4382       }
4383     }
4384   }
4385
4386   return true;
4387 }
4388
4389 /// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
4390 /// specifies a shuffle of elements that is suitable for input to UNPCKH.
4391 static bool isUNPCKHMask(ArrayRef<int> Mask, MVT VT,
4392                          bool HasInt256, bool V2IsSplat = false) {
4393   assert(VT.getSizeInBits() >= 128 &&
4394          "Unsupported vector type for unpckh");
4395
4396   unsigned NumElts = VT.getVectorNumElements();
4397   if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
4398       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
4399     return false;
4400
4401   assert((!VT.is512BitVector() || VT.getScalarType().getSizeInBits() >= 32) &&
4402          "Unsupported vector type for unpckh");
4403
4404   // AVX defines UNPCK* to operate independently on 128-bit lanes.
4405   unsigned NumLanes = VT.getSizeInBits()/128;
4406   unsigned NumLaneElts = NumElts/NumLanes;
4407
4408   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
4409     for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
4410       int BitI  = Mask[l+i];
4411       int BitI1 = Mask[l+i+1];
4412       if (!isUndefOrEqual(BitI, j))
4413         return false;
4414       if (V2IsSplat) {
4415         if (isUndefOrEqual(BitI1, NumElts))
4416           return false;
4417       } else {
4418         if (!isUndefOrEqual(BitI1, j+NumElts))
4419           return false;
4420       }
4421     }
4422   }
4423   return true;
4424 }
4425
4426 /// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
4427 /// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
4428 /// <0, 0, 1, 1>
4429 static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
4430   unsigned NumElts = VT.getVectorNumElements();
4431   bool Is256BitVec = VT.is256BitVector();
4432
4433   if (VT.is512BitVector())
4434     return false;
4435   assert((VT.is128BitVector() || VT.is256BitVector()) &&
4436          "Unsupported vector type for unpckh");
4437
4438   if (Is256BitVec && NumElts != 4 && NumElts != 8 &&
4439       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
4440     return false;
4441
4442   // For 256-bit i64/f64, use MOVDDUPY instead, so reject the matching pattern
4443   // FIXME: Need a better way to get rid of this, there's no latency difference
4444   // between UNPCKLPD and MOVDDUP, the later should always be checked first and
4445   // the former later. We should also remove the "_undef" special mask.
4446   if (NumElts == 4 && Is256BitVec)
4447     return false;
4448
4449   // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
4450   // independently on 128-bit lanes.
4451   unsigned NumLanes = VT.getSizeInBits()/128;
4452   unsigned NumLaneElts = NumElts/NumLanes;
4453
4454   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
4455     for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
4456       int BitI  = Mask[l+i];
4457       int BitI1 = Mask[l+i+1];
4458
4459       if (!isUndefOrEqual(BitI, j))
4460         return false;
4461       if (!isUndefOrEqual(BitI1, j))
4462         return false;
4463     }
4464   }
4465
4466   return true;
4467 }
4468
4469 /// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
4470 /// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
4471 /// <2, 2, 3, 3>
4472 static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
4473   unsigned NumElts = VT.getVectorNumElements();
4474
4475   if (VT.is512BitVector())
4476     return false;
4477
4478   assert((VT.is128BitVector() || VT.is256BitVector()) &&
4479          "Unsupported vector type for unpckh");
4480
4481   if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
4482       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
4483     return false;
4484
4485   // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
4486   // independently on 128-bit lanes.
4487   unsigned NumLanes = VT.getSizeInBits()/128;
4488   unsigned NumLaneElts = NumElts/NumLanes;
4489
4490   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
4491     for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
4492       int BitI  = Mask[l+i];
4493       int BitI1 = Mask[l+i+1];
4494       if (!isUndefOrEqual(BitI, j))
4495         return false;
4496       if (!isUndefOrEqual(BitI1, j))
4497         return false;
4498     }
4499   }
4500   return true;
4501 }
4502
4503 // Match for INSERTI64x4 INSERTF64x4 instructions (src0[0], src1[0]) or
4504 // (src1[0], src0[1]), manipulation with 256-bit sub-vectors
4505 static bool isINSERT64x4Mask(ArrayRef<int> Mask, MVT VT, unsigned int *Imm) {
4506   if (!VT.is512BitVector())
4507     return false;
4508
4509   unsigned NumElts = VT.getVectorNumElements();
4510   unsigned HalfSize = NumElts/2;
4511   if (isSequentialOrUndefInRange(Mask, 0, HalfSize, 0)) {
4512     if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, NumElts)) {
4513       *Imm = 1;
4514       return true;
4515     }
4516   }
4517   if (isSequentialOrUndefInRange(Mask, 0, HalfSize, NumElts)) {
4518     if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, HalfSize)) {
4519       *Imm = 0;
4520       return true;
4521     }
4522   }
4523   return false;
4524 }
4525
4526 /// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
4527 /// specifies a shuffle of elements that is suitable for input to MOVSS,
4528 /// MOVSD, and MOVD, i.e. setting the lowest element.
4529 static bool isMOVLMask(ArrayRef<int> Mask, EVT VT) {
4530   if (VT.getVectorElementType().getSizeInBits() < 32)
4531     return false;
4532   if (!VT.is128BitVector())
4533     return false;
4534
4535   unsigned NumElts = VT.getVectorNumElements();
4536
4537   if (!isUndefOrEqual(Mask[0], NumElts))
4538     return false;
4539
4540   for (unsigned i = 1; i != NumElts; ++i)
4541     if (!isUndefOrEqual(Mask[i], i))
4542       return false;
4543
4544   return true;
4545 }
4546
4547 /// isVPERM2X128Mask - Match 256-bit shuffles where the elements are considered
4548 /// as permutations between 128-bit chunks or halves. As an example: this
4549 /// shuffle bellow:
4550 ///   vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15>
4551 /// The first half comes from the second half of V1 and the second half from the
4552 /// the second half of V2.
4553 static bool isVPERM2X128Mask(ArrayRef<int> Mask, MVT VT, bool HasFp256) {
4554   if (!HasFp256 || !VT.is256BitVector())
4555     return false;
4556
4557   // The shuffle result is divided into half A and half B. In total the two
4558   // sources have 4 halves, namely: C, D, E, F. The final values of A and
4559   // B must come from C, D, E or F.
4560   unsigned HalfSize = VT.getVectorNumElements()/2;
4561   bool MatchA = false, MatchB = false;
4562
4563   // Check if A comes from one of C, D, E, F.
4564   for (unsigned Half = 0; Half != 4; ++Half) {
4565     if (isSequentialOrUndefInRange(Mask, 0, HalfSize, Half*HalfSize)) {
4566       MatchA = true;
4567       break;
4568     }
4569   }
4570
4571   // Check if B comes from one of C, D, E, F.
4572   for (unsigned Half = 0; Half != 4; ++Half) {
4573     if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, Half*HalfSize)) {
4574       MatchB = true;
4575       break;
4576     }
4577   }
4578
4579   return MatchA && MatchB;
4580 }
4581
4582 /// getShuffleVPERM2X128Immediate - Return the appropriate immediate to shuffle
4583 /// the specified VECTOR_MASK mask with VPERM2F128/VPERM2I128 instructions.
4584 static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) {
4585   MVT VT = SVOp->getSimpleValueType(0);
4586
4587   unsigned HalfSize = VT.getVectorNumElements()/2;
4588
4589   unsigned FstHalf = 0, SndHalf = 0;
4590   for (unsigned i = 0; i < HalfSize; ++i) {
4591     if (SVOp->getMaskElt(i) > 0) {
4592       FstHalf = SVOp->getMaskElt(i)/HalfSize;
4593       break;
4594     }
4595   }
4596   for (unsigned i = HalfSize; i < HalfSize*2; ++i) {
4597     if (SVOp->getMaskElt(i) > 0) {
4598       SndHalf = SVOp->getMaskElt(i)/HalfSize;
4599       break;
4600     }
4601   }
4602
4603   return (FstHalf | (SndHalf << 4));
4604 }
4605
4606 // Symetric in-lane mask. Each lane has 4 elements (for imm8)
4607 static bool isPermImmMask(ArrayRef<int> Mask, MVT VT, unsigned& Imm8) {
4608   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
4609   if (EltSize < 32)
4610     return false;
4611
4612   unsigned NumElts = VT.getVectorNumElements();
4613   Imm8 = 0;
4614   if (VT.is128BitVector() || (VT.is256BitVector() && EltSize == 64)) {
4615     for (unsigned i = 0; i != NumElts; ++i) {
4616       if (Mask[i] < 0)
4617         continue;
4618       Imm8 |= Mask[i] << (i*2);
4619     }
4620     return true;
4621   }
4622
4623   unsigned LaneSize = 4;
4624   SmallVector<int, 4> MaskVal(LaneSize, -1);
4625
4626   for (unsigned l = 0; l != NumElts; l += LaneSize) {
4627     for (unsigned i = 0; i != LaneSize; ++i) {
4628       if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
4629         return false;
4630       if (Mask[i+l] < 0)
4631         continue;
4632       if (MaskVal[i] < 0) {
4633         MaskVal[i] = Mask[i+l] - l;
4634         Imm8 |= MaskVal[i] << (i*2);
4635         continue;
4636       }
4637       if (Mask[i+l] != (signed)(MaskVal[i]+l))
4638         return false;
4639     }
4640   }
4641   return true;
4642 }
4643
4644 /// isVPERMILPMask - Return true if the specified VECTOR_SHUFFLE operand
4645 /// specifies a shuffle of elements that is suitable for input to VPERMILPD*.
4646 /// Note that VPERMIL mask matching is different depending whether theunderlying
4647 /// type is 32 or 64. In the VPERMILPS the high half of the mask should point
4648 /// to the same elements of the low, but to the higher half of the source.
4649 /// In VPERMILPD the two lanes could be shuffled independently of each other
4650 /// with the same restriction that lanes can't be crossed. Also handles PSHUFDY.
4651 static bool isVPERMILPMask(ArrayRef<int> Mask, MVT VT) {
4652   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
4653   if (VT.getSizeInBits() < 256 || EltSize < 32)
4654     return false;
4655   bool symetricMaskRequired = (EltSize == 32);
4656   unsigned NumElts = VT.getVectorNumElements();
4657
4658   unsigned NumLanes = VT.getSizeInBits()/128;
4659   unsigned LaneSize = NumElts/NumLanes;
4660   // 2 or 4 elements in one lane
4661
4662   SmallVector<int, 4> ExpectedMaskVal(LaneSize, -1);
4663   for (unsigned l = 0; l != NumElts; l += LaneSize) {
4664     for (unsigned i = 0; i != LaneSize; ++i) {
4665       if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
4666         return false;
4667       if (symetricMaskRequired) {
4668         if (ExpectedMaskVal[i] < 0 && Mask[i+l] >= 0) {
4669           ExpectedMaskVal[i] = Mask[i+l] - l;
4670           continue;
4671         }
4672         if (!isUndefOrEqual(Mask[i+l], ExpectedMaskVal[i]+l))
4673           return false;
4674       }
4675     }
4676   }
4677   return true;
4678 }
4679
4680 /// isCommutedMOVLMask - Returns true if the shuffle mask is except the reverse
4681 /// of what x86 movss want. X86 movs requires the lowest  element to be lowest
4682 /// element of vector 2 and the other elements to come from vector 1 in order.
4683 static bool isCommutedMOVLMask(ArrayRef<int> Mask, MVT VT,
4684                                bool V2IsSplat = false, bool V2IsUndef = false) {
4685   if (!VT.is128BitVector())
4686     return false;
4687
4688   unsigned NumOps = VT.getVectorNumElements();
4689   if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
4690     return false;
4691
4692   if (!isUndefOrEqual(Mask[0], 0))
4693     return false;
4694
4695   for (unsigned i = 1; i != NumOps; ++i)
4696     if (!(isUndefOrEqual(Mask[i], i+NumOps) ||
4697           (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) ||
4698           (V2IsSplat && isUndefOrEqual(Mask[i], NumOps))))
4699       return false;
4700
4701   return true;
4702 }
4703
4704 /// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
4705 /// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
4706 /// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7>
4707 static bool isMOVSHDUPMask(ArrayRef<int> Mask, MVT VT,
4708                            const X86Subtarget *Subtarget) {
4709   if (!Subtarget->hasSSE3())
4710     return false;
4711
4712   unsigned NumElems = VT.getVectorNumElements();
4713
4714   if ((VT.is128BitVector() && NumElems != 4) ||
4715       (VT.is256BitVector() && NumElems != 8) ||
4716       (VT.is512BitVector() && NumElems != 16))
4717     return false;
4718
4719   // "i+1" is the value the indexed mask element must have
4720   for (unsigned i = 0; i != NumElems; i += 2)
4721     if (!isUndefOrEqual(Mask[i], i+1) ||
4722         !isUndefOrEqual(Mask[i+1], i+1))
4723       return false;
4724
4725   return true;
4726 }
4727
4728 /// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
4729 /// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
4730 /// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6>
4731 static bool isMOVSLDUPMask(ArrayRef<int> Mask, MVT VT,
4732                            const X86Subtarget *Subtarget) {
4733   if (!Subtarget->hasSSE3())
4734     return false;
4735
4736   unsigned NumElems = VT.getVectorNumElements();
4737
4738   if ((VT.is128BitVector() && NumElems != 4) ||
4739       (VT.is256BitVector() && NumElems != 8) ||
4740       (VT.is512BitVector() && NumElems != 16))
4741     return false;
4742
4743   // "i" is the value the indexed mask element must have
4744   for (unsigned i = 0; i != NumElems; i += 2)
4745     if (!isUndefOrEqual(Mask[i], i) ||
4746         !isUndefOrEqual(Mask[i+1], i))
4747       return false;
4748
4749   return true;
4750 }
4751
4752 /// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand
4753 /// specifies a shuffle of elements that is suitable for input to 256-bit
4754 /// version of MOVDDUP.
4755 static bool isMOVDDUPYMask(ArrayRef<int> Mask, MVT VT, bool HasFp256) {
4756   if (!HasFp256 || !VT.is256BitVector())
4757     return false;
4758
4759   unsigned NumElts = VT.getVectorNumElements();
4760   if (NumElts != 4)
4761     return false;
4762
4763   for (unsigned i = 0; i != NumElts/2; ++i)
4764     if (!isUndefOrEqual(Mask[i], 0))
4765       return false;
4766   for (unsigned i = NumElts/2; i != NumElts; ++i)
4767     if (!isUndefOrEqual(Mask[i], NumElts/2))
4768       return false;
4769   return true;
4770 }
4771
4772 /// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand
4773 /// specifies a shuffle of elements that is suitable for input to 128-bit
4774 /// version of MOVDDUP.
4775 static bool isMOVDDUPMask(ArrayRef<int> Mask, MVT VT) {
4776   if (!VT.is128BitVector())
4777     return false;
4778
4779   unsigned e = VT.getVectorNumElements() / 2;
4780   for (unsigned i = 0; i != e; ++i)
4781     if (!isUndefOrEqual(Mask[i], i))
4782       return false;
4783   for (unsigned i = 0; i != e; ++i)
4784     if (!isUndefOrEqual(Mask[e+i], i))
4785       return false;
4786   return true;
4787 }
4788
4789 /// isVEXTRACTIndex - Return true if the specified
4790 /// EXTRACT_SUBVECTOR operand specifies a vector extract that is
4791 /// suitable for instruction that extract 128 or 256 bit vectors
4792 static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
4793   assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
4794   if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
4795     return false;
4796
4797   // The index should be aligned on a vecWidth-bit boundary.
4798   uint64_t Index =
4799     cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
4800
4801   MVT VT = N->getSimpleValueType(0);
4802   unsigned ElSize = VT.getVectorElementType().getSizeInBits();
4803   bool Result = (Index * ElSize) % vecWidth == 0;
4804
4805   return Result;
4806 }
4807
4808 /// isVINSERTIndex - Return true if the specified INSERT_SUBVECTOR
4809 /// operand specifies a subvector insert that is suitable for input to
4810 /// insertion of 128 or 256-bit subvectors
4811 static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) {
4812   assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
4813   if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
4814     return false;
4815   // The index should be aligned on a vecWidth-bit boundary.
4816   uint64_t Index =
4817     cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
4818
4819   MVT VT = N->getSimpleValueType(0);
4820   unsigned ElSize = VT.getVectorElementType().getSizeInBits();
4821   bool Result = (Index * ElSize) % vecWidth == 0;
4822
4823   return Result;
4824 }
4825
4826 bool X86::isVINSERT128Index(SDNode *N) {
4827   return isVINSERTIndex(N, 128);
4828 }
4829
4830 bool X86::isVINSERT256Index(SDNode *N) {
4831   return isVINSERTIndex(N, 256);
4832 }
4833
4834 bool X86::isVEXTRACT128Index(SDNode *N) {
4835   return isVEXTRACTIndex(N, 128);
4836 }
4837
4838 bool X86::isVEXTRACT256Index(SDNode *N) {
4839   return isVEXTRACTIndex(N, 256);
4840 }
4841
4842 /// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
4843 /// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions.
4844 /// Handles 128-bit and 256-bit.
4845 static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) {
4846   MVT VT = N->getSimpleValueType(0);
4847
4848   assert((VT.getSizeInBits() >= 128) &&
4849          "Unsupported vector type for PSHUF/SHUFP");
4850
4851   // Handle 128 and 256-bit vector lengths. AVX defines PSHUF/SHUFP to operate
4852   // independently on 128-bit lanes.
4853   unsigned NumElts = VT.getVectorNumElements();
4854   unsigned NumLanes = VT.getSizeInBits()/128;
4855   unsigned NumLaneElts = NumElts/NumLanes;
4856
4857   assert((NumLaneElts == 2 || NumLaneElts == 4 || NumLaneElts == 8) &&
4858          "Only supports 2, 4 or 8 elements per lane");
4859
4860   unsigned Shift = (NumLaneElts >= 4) ? 1 : 0;
4861   unsigned Mask = 0;
4862   for (unsigned i = 0; i != NumElts; ++i) {
4863     int Elt = N->getMaskElt(i);
4864     if (Elt < 0) continue;
4865     Elt &= NumLaneElts - 1;
4866     unsigned ShAmt = (i << Shift) % 8;
4867     Mask |= Elt << ShAmt;
4868   }
4869
4870   return Mask;
4871 }
4872
4873 /// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
4874 /// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction.
4875 static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) {
4876   MVT VT = N->getSimpleValueType(0);
4877
4878   assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
4879          "Unsupported vector type for PSHUFHW");
4880
4881   unsigned NumElts = VT.getVectorNumElements();
4882
4883   unsigned Mask = 0;
4884   for (unsigned l = 0; l != NumElts; l += 8) {
4885     // 8 nodes per lane, but we only care about the last 4.
4886     for (unsigned i = 0; i < 4; ++i) {
4887       int Elt = N->getMaskElt(l+i+4);
4888       if (Elt < 0) continue;
4889       Elt &= 0x3; // only 2-bits.
4890       Mask |= Elt << (i * 2);
4891     }
4892   }
4893
4894   return Mask;
4895 }
4896
4897 /// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
4898 /// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction.
4899 static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) {
4900   MVT VT = N->getSimpleValueType(0);
4901
4902   assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
4903          "Unsupported vector type for PSHUFHW");
4904
4905   unsigned NumElts = VT.getVectorNumElements();
4906
4907   unsigned Mask = 0;
4908   for (unsigned l = 0; l != NumElts; l += 8) {
4909     // 8 nodes per lane, but we only care about the first 4.
4910     for (unsigned i = 0; i < 4; ++i) {
4911       int Elt = N->getMaskElt(l+i);
4912       if (Elt < 0) continue;
4913       Elt &= 0x3; // only 2-bits
4914       Mask |= Elt << (i * 2);
4915     }
4916   }
4917
4918   return Mask;
4919 }
4920
4921 /// \brief Return the appropriate immediate to shuffle the specified
4922 /// VECTOR_SHUFFLE mask with the PALIGNR (if InterLane is false) or with
4923 /// VALIGN (if Interlane is true) instructions.
4924 static unsigned getShuffleAlignrImmediate(ShuffleVectorSDNode *SVOp,
4925                                            bool InterLane) {
4926   MVT VT = SVOp->getSimpleValueType(0);
4927   unsigned EltSize = InterLane ? 1 :
4928     VT.getVectorElementType().getSizeInBits() >> 3;
4929
4930   unsigned NumElts = VT.getVectorNumElements();
4931   unsigned NumLanes = VT.is512BitVector() ? 1 : VT.getSizeInBits()/128;
4932   unsigned NumLaneElts = NumElts/NumLanes;
4933
4934   int Val = 0;
4935   unsigned i;
4936   for (i = 0; i != NumElts; ++i) {
4937     Val = SVOp->getMaskElt(i);
4938     if (Val >= 0)
4939       break;
4940   }
4941   if (Val >= (int)NumElts)
4942     Val -= NumElts - NumLaneElts;
4943
4944   assert(Val - i > 0 && "PALIGNR imm should be positive");
4945   return (Val - i) * EltSize;
4946 }
4947
4948 /// \brief Return the appropriate immediate to shuffle the specified
4949 /// VECTOR_SHUFFLE mask with the PALIGNR instruction.
4950 static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) {
4951   return getShuffleAlignrImmediate(SVOp, false);
4952 }
4953
4954 /// \brief Return the appropriate immediate to shuffle the specified
4955 /// VECTOR_SHUFFLE mask with the VALIGN instruction.
4956 static unsigned getShuffleVALIGNImmediate(ShuffleVectorSDNode *SVOp) {
4957   return getShuffleAlignrImmediate(SVOp, true);
4958 }
4959
4960
4961 static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
4962   assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
4963   if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
4964     llvm_unreachable("Illegal extract subvector for VEXTRACT");
4965
4966   uint64_t Index =
4967     cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
4968
4969   MVT VecVT = N->getOperand(0).getSimpleValueType();
4970   MVT ElVT = VecVT.getVectorElementType();
4971
4972   unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
4973   return Index / NumElemsPerChunk;
4974 }
4975
4976 static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) {
4977   assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
4978   if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
4979     llvm_unreachable("Illegal insert subvector for VINSERT");
4980
4981   uint64_t Index =
4982     cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
4983
4984   MVT VecVT = N->getSimpleValueType(0);
4985   MVT ElVT = VecVT.getVectorElementType();
4986
4987   unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
4988   return Index / NumElemsPerChunk;
4989 }
4990
4991 /// getExtractVEXTRACT128Immediate - Return the appropriate immediate
4992 /// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128
4993 /// and VINSERTI128 instructions.
4994 unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) {
4995   return getExtractVEXTRACTImmediate(N, 128);
4996 }
4997
4998 /// getExtractVEXTRACT256Immediate - Return the appropriate immediate
4999 /// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF64x4
5000 /// and VINSERTI64x4 instructions.
5001 unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) {
5002   return getExtractVEXTRACTImmediate(N, 256);
5003 }
5004
5005 /// getInsertVINSERT128Immediate - Return the appropriate immediate
5006 /// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128
5007 /// and VINSERTI128 instructions.
5008 unsigned X86::getInsertVINSERT128Immediate(SDNode *N) {
5009   return getInsertVINSERTImmediate(N, 128);
5010 }
5011
5012 /// getInsertVINSERT256Immediate - Return the appropriate immediate
5013 /// to insert at the specified INSERT_SUBVECTOR index with VINSERTF46x4
5014 /// and VINSERTI64x4 instructions.
5015 unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
5016   return getInsertVINSERTImmediate(N, 256);
5017 }
5018
5019 /// isZero - Returns true if Elt is a constant integer zero
5020 static bool isZero(SDValue V) {
5021   ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
5022   return C && C->isNullValue();
5023 }
5024
5025 /// isZeroNode - Returns true if Elt is a constant zero or a floating point
5026 /// constant +0.0.
5027 bool X86::isZeroNode(SDValue Elt) {
5028   if (isZero(Elt))
5029     return true;
5030   if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Elt))
5031     return CFP->getValueAPF().isPosZero();
5032   return false;
5033 }
5034
5035 /// ShouldXformToMOVHLPS - Return true if the node should be transformed to
5036 /// match movhlps. The lower half elements should come from upper half of
5037 /// V1 (and in order), and the upper half elements should come from the upper
5038 /// half of V2 (and in order).
5039 static bool ShouldXformToMOVHLPS(ArrayRef<int> Mask, MVT VT) {
5040   if (!VT.is128BitVector())
5041     return false;
5042   if (VT.getVectorNumElements() != 4)
5043     return false;
5044   for (unsigned i = 0, e = 2; i != e; ++i)
5045     if (!isUndefOrEqual(Mask[i], i+2))
5046       return false;
5047   for (unsigned i = 2; i != 4; ++i)
5048     if (!isUndefOrEqual(Mask[i], i+4))
5049       return false;
5050   return true;
5051 }
5052
5053 /// isScalarLoadToVector - Returns true if the node is a scalar load that
5054 /// is promoted to a vector. It also returns the LoadSDNode by reference if
5055 /// required.
5056 static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = nullptr) {
5057   if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
5058     return false;
5059   N = N->getOperand(0).getNode();
5060   if (!ISD::isNON_EXTLoad(N))
5061     return false;
5062   if (LD)
5063     *LD = cast<LoadSDNode>(N);
5064   return true;
5065 }
5066
5067 // Test whether the given value is a vector value which will be legalized
5068 // into a load.
5069 static bool WillBeConstantPoolLoad(SDNode *N) {
5070   if (N->getOpcode() != ISD::BUILD_VECTOR)
5071     return false;
5072
5073   // Check for any non-constant elements.
5074   for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
5075     switch (N->getOperand(i).getNode()->getOpcode()) {
5076     case ISD::UNDEF:
5077     case ISD::ConstantFP:
5078     case ISD::Constant:
5079       break;
5080     default:
5081       return false;
5082     }
5083
5084   // Vectors of all-zeros and all-ones are materialized with special
5085   // instructions rather than being loaded.
5086   return !ISD::isBuildVectorAllZeros(N) &&
5087          !ISD::isBuildVectorAllOnes(N);
5088 }
5089
5090 /// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
5091 /// match movlp{s|d}. The lower half elements should come from lower half of
5092 /// V1 (and in order), and the upper half elements should come from the upper
5093 /// half of V2 (and in order). And since V1 will become the source of the
5094 /// MOVLP, it must be either a vector load or a scalar load to vector.
5095 static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
5096                                ArrayRef<int> Mask, MVT VT) {
5097   if (!VT.is128BitVector())
5098     return false;
5099
5100   if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
5101     return false;
5102   // Is V2 is a vector load, don't do this transformation. We will try to use
5103   // load folding shufps op.
5104   if (ISD::isNON_EXTLoad(V2) || WillBeConstantPoolLoad(V2))
5105     return false;
5106
5107   unsigned NumElems = VT.getVectorNumElements();
5108
5109   if (NumElems != 2 && NumElems != 4)
5110     return false;
5111   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
5112     if (!isUndefOrEqual(Mask[i], i))
5113       return false;
5114   for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
5115     if (!isUndefOrEqual(Mask[i], i+NumElems))
5116       return false;
5117   return true;
5118 }
5119
5120 /// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
5121 /// to an zero vector.
5122 /// FIXME: move to dag combiner / method on ShuffleVectorSDNode
5123 static bool isZeroShuffle(ShuffleVectorSDNode *N) {
5124   SDValue V1 = N->getOperand(0);
5125   SDValue V2 = N->getOperand(1);
5126   unsigned NumElems = N->getValueType(0).getVectorNumElements();
5127   for (unsigned i = 0; i != NumElems; ++i) {
5128     int Idx = N->getMaskElt(i);
5129     if (Idx >= (int)NumElems) {
5130       unsigned Opc = V2.getOpcode();
5131       if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode()))
5132         continue;
5133       if (Opc != ISD::BUILD_VECTOR ||
5134           !X86::isZeroNode(V2.getOperand(Idx-NumElems)))
5135         return false;
5136     } else if (Idx >= 0) {
5137       unsigned Opc = V1.getOpcode();
5138       if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode()))
5139         continue;
5140       if (Opc != ISD::BUILD_VECTOR ||
5141           !X86::isZeroNode(V1.getOperand(Idx)))
5142         return false;
5143     }
5144   }
5145   return true;
5146 }
5147
5148 /// getZeroVector - Returns a vector of specified type with all zero elements.
5149 ///
5150 static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
5151                              SelectionDAG &DAG, SDLoc dl) {
5152   assert(VT.isVector() && "Expected a vector type");
5153
5154   // Always build SSE zero vectors as <4 x i32> bitcasted
5155   // to their dest type. This ensures they get CSE'd.
5156   SDValue Vec;
5157   if (VT.is128BitVector()) {  // SSE
5158     if (Subtarget->hasSSE2()) {  // SSE2
5159       SDValue Cst = DAG.getConstant(0, MVT::i32);
5160       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
5161     } else { // SSE1
5162       SDValue Cst = DAG.getConstantFP(+0.0, MVT::f32);
5163       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
5164     }
5165   } else if (VT.is256BitVector()) { // AVX
5166     if (Subtarget->hasInt256()) { // AVX2
5167       SDValue Cst = DAG.getConstant(0, MVT::i32);
5168       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
5169       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
5170     } else {
5171       // 256-bit logic and arithmetic instructions in AVX are all
5172       // floating-point, no support for integer ops. Emit fp zeroed vectors.
5173       SDValue Cst = DAG.getConstantFP(+0.0, MVT::f32);
5174       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
5175       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops);
5176     }
5177   } else if (VT.is512BitVector()) { // AVX-512
5178       SDValue Cst = DAG.getConstant(0, MVT::i32);
5179       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
5180                         Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
5181       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops);
5182   } else if (VT.getScalarType() == MVT::i1) {
5183     assert(VT.getVectorNumElements() <= 16 && "Unexpected vector type");
5184     SDValue Cst = DAG.getConstant(0, MVT::i1);
5185     SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
5186     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
5187   } else
5188     llvm_unreachable("Unexpected vector type");
5189
5190   return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
5191 }
5192
5193 /// getOnesVector - Returns a vector of specified type with all bits set.
5194 /// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
5195 /// no AVX2 supprt, use two <4 x i32> inserted in a <8 x i32> appropriately.
5196 /// Then bitcast to their original type, ensuring they get CSE'd.
5197 static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG,
5198                              SDLoc dl) {
5199   assert(VT.isVector() && "Expected a vector type");
5200
5201   SDValue Cst = DAG.getConstant(~0U, MVT::i32);
5202   SDValue Vec;
5203   if (VT.is256BitVector()) {
5204     if (HasInt256) { // AVX2
5205       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
5206       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
5207     } else { // AVX
5208       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
5209       Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
5210     }
5211   } else if (VT.is128BitVector()) {
5212     Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
5213   } else
5214     llvm_unreachable("Unexpected vector type");
5215
5216   return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
5217 }
5218
5219 /// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements
5220 /// that point to V2 points to its first element.
5221 static void NormalizeMask(SmallVectorImpl<int> &Mask, unsigned NumElems) {
5222   for (unsigned i = 0; i != NumElems; ++i) {
5223     if (Mask[i] > (int)NumElems) {
5224       Mask[i] = NumElems;
5225     }
5226   }
5227 }
5228
5229 /// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd
5230 /// operation of specified width.
5231 static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
5232                        SDValue V2) {
5233   unsigned NumElems = VT.getVectorNumElements();
5234   SmallVector<int, 8> Mask;
5235   Mask.push_back(NumElems);
5236   for (unsigned i = 1; i != NumElems; ++i)
5237     Mask.push_back(i);
5238   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
5239 }
5240
5241 /// getUnpackl - Returns a vector_shuffle node for an unpackl operation.
5242 static SDValue getUnpackl(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
5243                           SDValue V2) {
5244   unsigned NumElems = VT.getVectorNumElements();
5245   SmallVector<int, 8> Mask;
5246   for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
5247     Mask.push_back(i);
5248     Mask.push_back(i + NumElems);
5249   }
5250   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
5251 }
5252
5253 /// getUnpackh - Returns a vector_shuffle node for an unpackh operation.
5254 static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
5255                           SDValue V2) {
5256   unsigned NumElems = VT.getVectorNumElements();
5257   SmallVector<int, 8> Mask;
5258   for (unsigned i = 0, Half = NumElems/2; i != Half; ++i) {
5259     Mask.push_back(i + Half);
5260     Mask.push_back(i + NumElems + Half);
5261   }
5262   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
5263 }
5264
5265 // PromoteSplati8i16 - All i16 and i8 vector types can't be used directly by
5266 // a generic shuffle instruction because the target has no such instructions.
5267 // Generate shuffles which repeat i16 and i8 several times until they can be
5268 // represented by v4f32 and then be manipulated by target suported shuffles.
5269 static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) {
5270   MVT VT = V.getSimpleValueType();
5271   int NumElems = VT.getVectorNumElements();
5272   SDLoc dl(V);
5273
5274   while (NumElems > 4) {
5275     if (EltNo < NumElems/2) {
5276       V = getUnpackl(DAG, dl, VT, V, V);
5277     } else {
5278       V = getUnpackh(DAG, dl, VT, V, V);
5279       EltNo -= NumElems/2;
5280     }
5281     NumElems >>= 1;
5282   }
5283   return V;
5284 }
5285
5286 /// getLegalSplat - Generate a legal splat with supported x86 shuffles
5287 static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
5288   MVT VT = V.getSimpleValueType();
5289   SDLoc dl(V);
5290
5291   if (VT.is128BitVector()) {
5292     V = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V);
5293     int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
5294     V = DAG.getVectorShuffle(MVT::v4f32, dl, V, DAG.getUNDEF(MVT::v4f32),
5295                              &SplatMask[0]);
5296   } else if (VT.is256BitVector()) {
5297     // To use VPERMILPS to splat scalars, the second half of indicies must
5298     // refer to the higher part, which is a duplication of the lower one,
5299     // because VPERMILPS can only handle in-lane permutations.
5300     int SplatMask[8] = { EltNo, EltNo, EltNo, EltNo,
5301                          EltNo+4, EltNo+4, EltNo+4, EltNo+4 };
5302
5303     V = DAG.getNode(ISD::BITCAST, dl, MVT::v8f32, V);
5304     V = DAG.getVectorShuffle(MVT::v8f32, dl, V, DAG.getUNDEF(MVT::v8f32),
5305                              &SplatMask[0]);
5306   } else
5307     llvm_unreachable("Vector size not supported");
5308
5309   return DAG.getNode(ISD::BITCAST, dl, VT, V);
5310 }
5311
5312 /// PromoteSplat - Splat is promoted to target supported vector shuffles.
5313 static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
5314   MVT SrcVT = SV->getSimpleValueType(0);
5315   SDValue V1 = SV->getOperand(0);
5316   SDLoc dl(SV);
5317
5318   int EltNo = SV->getSplatIndex();
5319   int NumElems = SrcVT.getVectorNumElements();
5320   bool Is256BitVec = SrcVT.is256BitVector();
5321
5322   assert(((SrcVT.is128BitVector() && NumElems > 4) || Is256BitVec) &&
5323          "Unknown how to promote splat for type");
5324
5325   // Extract the 128-bit part containing the splat element and update
5326   // the splat element index when it refers to the higher register.
5327   if (Is256BitVec) {
5328     V1 = Extract128BitVector(V1, EltNo, DAG, dl);
5329     if (EltNo >= NumElems/2)
5330       EltNo -= NumElems/2;
5331   }
5332
5333   // All i16 and i8 vector types can't be used directly by a generic shuffle
5334   // instruction because the target has no such instruction. Generate shuffles
5335   // which repeat i16 and i8 several times until they fit in i32, and then can
5336   // be manipulated by target suported shuffles.
5337   MVT EltVT = SrcVT.getVectorElementType();
5338   if (EltVT == MVT::i8 || EltVT == MVT::i16)
5339     V1 = PromoteSplati8i16(V1, DAG, EltNo);
5340
5341   // Recreate the 256-bit vector and place the same 128-bit vector
5342   // into the low and high part. This is necessary because we want
5343   // to use VPERM* to shuffle the vectors
5344   if (Is256BitVec) {
5345     V1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT, V1, V1);
5346   }
5347
5348   return getLegalSplat(DAG, V1, EltNo);
5349 }
5350
5351 /// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
5352 /// vector of zero or undef vector.  This produces a shuffle where the low
5353 /// element of V2 is swizzled into the zero/undef vector, landing at element
5354 /// Idx.  This produces a shuffle mask like 4,1,2,3 (idx=0) or  0,1,2,4 (idx=3).
5355 static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
5356                                            bool IsZero,
5357                                            const X86Subtarget *Subtarget,
5358                                            SelectionDAG &DAG) {
5359   MVT VT = V2.getSimpleValueType();
5360   SDValue V1 = IsZero
5361     ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
5362   unsigned NumElems = VT.getVectorNumElements();
5363   SmallVector<int, 16> MaskVec;
5364   for (unsigned i = 0; i != NumElems; ++i)
5365     // If this is the insertion idx, put the low elt of V2 here.
5366     MaskVec.push_back(i == Idx ? NumElems : i);
5367   return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, &MaskVec[0]);
5368 }
5369
5370 /// getTargetShuffleMask - Calculates the shuffle mask corresponding to the
5371 /// target specific opcode. Returns true if the Mask could be calculated. Sets
5372 /// IsUnary to true if only uses one source. Note that this will set IsUnary for
5373 /// shuffles which use a single input multiple times, and in those cases it will
5374 /// adjust the mask to only have indices within that single input.
5375 static bool getTargetShuffleMask(SDNode *N, MVT VT,
5376                                  SmallVectorImpl<int> &Mask, bool &IsUnary) {
5377   unsigned NumElems = VT.getVectorNumElements();
5378   SDValue ImmN;
5379
5380   IsUnary = false;
5381   bool IsFakeUnary = false;
5382   switch(N->getOpcode()) {
5383   case X86ISD::BLENDI:
5384     ImmN = N->getOperand(N->getNumOperands()-1);
5385     DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5386     break;
5387   case X86ISD::SHUFP:
5388     ImmN = N->getOperand(N->getNumOperands()-1);
5389     DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5390     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5391     break;
5392   case X86ISD::UNPCKH:
5393     DecodeUNPCKHMask(VT, Mask);
5394     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5395     break;
5396   case X86ISD::UNPCKL:
5397     DecodeUNPCKLMask(VT, Mask);
5398     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5399     break;
5400   case X86ISD::MOVHLPS:
5401     DecodeMOVHLPSMask(NumElems, Mask);
5402     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5403     break;
5404   case X86ISD::MOVLHPS:
5405     DecodeMOVLHPSMask(NumElems, Mask);
5406     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5407     break;
5408   case X86ISD::PALIGNR:
5409     ImmN = N->getOperand(N->getNumOperands()-1);
5410     DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5411     break;
5412   case X86ISD::PSHUFD:
5413   case X86ISD::VPERMILPI:
5414     ImmN = N->getOperand(N->getNumOperands()-1);
5415     DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5416     IsUnary = true;
5417     break;
5418   case X86ISD::PSHUFHW:
5419     ImmN = N->getOperand(N->getNumOperands()-1);
5420     DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5421     IsUnary = true;
5422     break;
5423   case X86ISD::PSHUFLW:
5424     ImmN = N->getOperand(N->getNumOperands()-1);
5425     DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5426     IsUnary = true;
5427     break;
5428   case X86ISD::PSHUFB: {
5429     IsUnary = true;
5430     SDValue MaskNode = N->getOperand(1);
5431     while (MaskNode->getOpcode() == ISD::BITCAST)
5432       MaskNode = MaskNode->getOperand(0);
5433
5434     if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) {
5435       // If we have a build-vector, then things are easy.
5436       EVT VT = MaskNode.getValueType();
5437       assert(VT.isVector() &&
5438              "Can't produce a non-vector with a build_vector!");
5439       if (!VT.isInteger())
5440         return false;
5441
5442       int NumBytesPerElement = VT.getVectorElementType().getSizeInBits() / 8;
5443
5444       SmallVector<uint64_t, 32> RawMask;
5445       for (int i = 0, e = MaskNode->getNumOperands(); i < e; ++i) {
5446         SDValue Op = MaskNode->getOperand(i);
5447         if (Op->getOpcode() == ISD::UNDEF) {
5448           RawMask.push_back((uint64_t)SM_SentinelUndef);
5449           continue;
5450         }
5451         auto *CN = dyn_cast<ConstantSDNode>(Op.getNode());
5452         if (!CN)
5453           return false;
5454         APInt MaskElement = CN->getAPIntValue();
5455
5456         // We now have to decode the element which could be any integer size and
5457         // extract each byte of it.
5458         for (int j = 0; j < NumBytesPerElement; ++j) {
5459           // Note that this is x86 and so always little endian: the low byte is
5460           // the first byte of the mask.
5461           RawMask.push_back(MaskElement.getLoBits(8).getZExtValue());
5462           MaskElement = MaskElement.lshr(8);
5463         }
5464       }
5465       DecodePSHUFBMask(RawMask, Mask);
5466       break;
5467     }
5468
5469     auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode);
5470     if (!MaskLoad)
5471       return false;
5472
5473     SDValue Ptr = MaskLoad->getBasePtr();
5474     if (Ptr->getOpcode() == X86ISD::Wrapper)
5475       Ptr = Ptr->getOperand(0);
5476
5477     auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr);
5478     if (!MaskCP || MaskCP->isMachineConstantPoolEntry())
5479       return false;
5480
5481     if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) {
5482       DecodePSHUFBMask(C, Mask);
5483       break;
5484     }
5485
5486     return false;
5487   }
5488   case X86ISD::VPERMI:
5489     ImmN = N->getOperand(N->getNumOperands()-1);
5490     DecodeVPERMMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5491     IsUnary = true;
5492     break;
5493   case X86ISD::MOVSS:
5494   case X86ISD::MOVSD:
5495     DecodeScalarMoveMask(VT, /* IsLoad */ false, Mask);
5496     break;
5497   case X86ISD::VPERM2X128:
5498     ImmN = N->getOperand(N->getNumOperands()-1);
5499     DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5500     if (Mask.empty()) return false;
5501     break;
5502   case X86ISD::MOVSLDUP:
5503     DecodeMOVSLDUPMask(VT, Mask);
5504     IsUnary = true;
5505     break;
5506   case X86ISD::MOVSHDUP:
5507     DecodeMOVSHDUPMask(VT, Mask);
5508     IsUnary = true;
5509     break;
5510   case X86ISD::MOVDDUP:
5511     DecodeMOVDDUPMask(VT, Mask);
5512     IsUnary = true;
5513     break;
5514   case X86ISD::MOVLHPD:
5515   case X86ISD::MOVLPD:
5516   case X86ISD::MOVLPS:
5517     // Not yet implemented
5518     return false;
5519   default: llvm_unreachable("unknown target shuffle node");
5520   }
5521
5522   // If we have a fake unary shuffle, the shuffle mask is spread across two
5523   // inputs that are actually the same node. Re-map the mask to always point
5524   // into the first input.
5525   if (IsFakeUnary)
5526     for (int &M : Mask)
5527       if (M >= (int)Mask.size())
5528         M -= Mask.size();
5529
5530   return true;
5531 }
5532
5533 /// getShuffleScalarElt - Returns the scalar element that will make up the ith
5534 /// element of the result of the vector shuffle.
5535 static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
5536                                    unsigned Depth) {
5537   if (Depth == 6)
5538     return SDValue();  // Limit search depth.
5539
5540   SDValue V = SDValue(N, 0);
5541   EVT VT = V.getValueType();
5542   unsigned Opcode = V.getOpcode();
5543
5544   // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
5545   if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
5546     int Elt = SV->getMaskElt(Index);
5547
5548     if (Elt < 0)
5549       return DAG.getUNDEF(VT.getVectorElementType());
5550
5551     unsigned NumElems = VT.getVectorNumElements();
5552     SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
5553                                          : SV->getOperand(1);
5554     return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
5555   }
5556
5557   // Recurse into target specific vector shuffles to find scalars.
5558   if (isTargetShuffle(Opcode)) {
5559     MVT ShufVT = V.getSimpleValueType();
5560     unsigned NumElems = ShufVT.getVectorNumElements();
5561     SmallVector<int, 16> ShuffleMask;
5562     bool IsUnary;
5563
5564     if (!getTargetShuffleMask(N, ShufVT, ShuffleMask, IsUnary))
5565       return SDValue();
5566
5567     int Elt = ShuffleMask[Index];
5568     if (Elt < 0)
5569       return DAG.getUNDEF(ShufVT.getVectorElementType());
5570
5571     SDValue NewV = (Elt < (int)NumElems) ? N->getOperand(0)
5572                                          : N->getOperand(1);
5573     return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
5574                                Depth+1);
5575   }
5576
5577   // Actual nodes that may contain scalar elements
5578   if (Opcode == ISD::BITCAST) {
5579     V = V.getOperand(0);
5580     EVT SrcVT = V.getValueType();
5581     unsigned NumElems = VT.getVectorNumElements();
5582
5583     if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
5584       return SDValue();
5585   }
5586
5587   if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
5588     return (Index == 0) ? V.getOperand(0)
5589                         : DAG.getUNDEF(VT.getVectorElementType());
5590
5591   if (V.getOpcode() == ISD::BUILD_VECTOR)
5592     return V.getOperand(Index);
5593
5594   return SDValue();
5595 }
5596
5597 /// getNumOfConsecutiveZeros - Return the number of elements of a vector
5598 /// shuffle operation which come from a consecutively from a zero. The
5599 /// search can start in two different directions, from left or right.
5600 /// We count undefs as zeros until PreferredNum is reached.
5601 static unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp,
5602                                          unsigned NumElems, bool ZerosFromLeft,
5603                                          SelectionDAG &DAG,
5604                                          unsigned PreferredNum = -1U) {
5605   unsigned NumZeros = 0;
5606   for (unsigned i = 0; i != NumElems; ++i) {
5607     unsigned Index = ZerosFromLeft ? i : NumElems - i - 1;
5608     SDValue Elt = getShuffleScalarElt(SVOp, Index, DAG, 0);
5609     if (!Elt.getNode())
5610       break;
5611
5612     if (X86::isZeroNode(Elt))
5613       ++NumZeros;
5614     else if (Elt.getOpcode() == ISD::UNDEF) // Undef as zero up to PreferredNum.
5615       NumZeros = std::min(NumZeros + 1, PreferredNum);
5616     else
5617       break;
5618   }
5619
5620   return NumZeros;
5621 }
5622
5623 /// isShuffleMaskConsecutive - Check if the shuffle mask indicies [MaskI, MaskE)
5624 /// correspond consecutively to elements from one of the vector operands,
5625 /// starting from its index OpIdx. Also tell OpNum which source vector operand.
5626 static
5627 bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp,
5628                               unsigned MaskI, unsigned MaskE, unsigned OpIdx,
5629                               unsigned NumElems, unsigned &OpNum) {
5630   bool SeenV1 = false;
5631   bool SeenV2 = false;
5632
5633   for (unsigned i = MaskI; i != MaskE; ++i, ++OpIdx) {
5634     int Idx = SVOp->getMaskElt(i);
5635     // Ignore undef indicies
5636     if (Idx < 0)
5637       continue;
5638
5639     if (Idx < (int)NumElems)
5640       SeenV1 = true;
5641     else
5642       SeenV2 = true;
5643
5644     // Only accept consecutive elements from the same vector
5645     if ((Idx % NumElems != OpIdx) || (SeenV1 && SeenV2))
5646       return false;
5647   }
5648
5649   OpNum = SeenV1 ? 0 : 1;
5650   return true;
5651 }
5652
5653 /// isVectorShiftRight - Returns true if the shuffle can be implemented as a
5654 /// logical left shift of a vector.
5655 static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
5656                                bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
5657   unsigned NumElems =
5658     SVOp->getSimpleValueType(0).getVectorNumElements();
5659   unsigned NumZeros = getNumOfConsecutiveZeros(
5660       SVOp, NumElems, false /* check zeros from right */, DAG,
5661       SVOp->getMaskElt(0));
5662   unsigned OpSrc;
5663
5664   if (!NumZeros)
5665     return false;
5666
5667   // Considering the elements in the mask that are not consecutive zeros,
5668   // check if they consecutively come from only one of the source vectors.
5669   //
5670   //               V1 = {X, A, B, C}     0
5671   //                         \  \  \    /
5672   //   vector_shuffle V1, V2 <1, 2, 3, X>
5673   //
5674   if (!isShuffleMaskConsecutive(SVOp,
5675             0,                   // Mask Start Index
5676             NumElems-NumZeros,   // Mask End Index(exclusive)
5677             NumZeros,            // Where to start looking in the src vector
5678             NumElems,            // Number of elements in vector
5679             OpSrc))              // Which source operand ?
5680     return false;
5681
5682   isLeft = false;
5683   ShAmt = NumZeros;
5684   ShVal = SVOp->getOperand(OpSrc);
5685   return true;
5686 }
5687
5688 /// isVectorShiftLeft - Returns true if the shuffle can be implemented as a
5689 /// logical left shift of a vector.
5690 static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
5691                               bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
5692   unsigned NumElems =
5693     SVOp->getSimpleValueType(0).getVectorNumElements();
5694   unsigned NumZeros = getNumOfConsecutiveZeros(
5695       SVOp, NumElems, true /* check zeros from left */, DAG,
5696       NumElems - SVOp->getMaskElt(NumElems - 1) - 1);
5697   unsigned OpSrc;
5698
5699   if (!NumZeros)
5700     return false;
5701
5702   // Considering the elements in the mask that are not consecutive zeros,
5703   // check if they consecutively come from only one of the source vectors.
5704   //
5705   //                           0    { A, B, X, X } = V2
5706   //                          / \    /  /
5707   //   vector_shuffle V1, V2 <X, X, 4, 5>
5708   //
5709   if (!isShuffleMaskConsecutive(SVOp,
5710             NumZeros,     // Mask Start Index
5711             NumElems,     // Mask End Index(exclusive)
5712             0,            // Where to start looking in the src vector
5713             NumElems,     // Number of elements in vector
5714             OpSrc))       // Which source operand ?
5715     return false;
5716
5717   isLeft = true;
5718   ShAmt = NumZeros;
5719   ShVal = SVOp->getOperand(OpSrc);
5720   return true;
5721 }
5722
5723 /// isVectorShift - Returns true if the shuffle can be implemented as a
5724 /// logical left or right shift of a vector.
5725 static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
5726                           bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
5727   // Although the logic below support any bitwidth size, there are no
5728   // shift instructions which handle more than 128-bit vectors.
5729   if (!SVOp->getSimpleValueType(0).is128BitVector())
5730     return false;
5731
5732   if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) ||
5733       isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt))
5734     return true;
5735
5736   return false;
5737 }
5738
5739 /// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
5740 ///
5741 static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
5742                                        unsigned NumNonZero, unsigned NumZero,
5743                                        SelectionDAG &DAG,
5744                                        const X86Subtarget* Subtarget,
5745                                        const TargetLowering &TLI) {
5746   if (NumNonZero > 8)
5747     return SDValue();
5748
5749   SDLoc dl(Op);
5750   SDValue V;
5751   bool First = true;
5752   for (unsigned i = 0; i < 16; ++i) {
5753     bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
5754     if (ThisIsNonZero && First) {
5755       if (NumZero)
5756         V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
5757       else
5758         V = DAG.getUNDEF(MVT::v8i16);
5759       First = false;
5760     }
5761
5762     if ((i & 1) != 0) {
5763       SDValue ThisElt, LastElt;
5764       bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
5765       if (LastIsNonZero) {
5766         LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
5767                               MVT::i16, Op.getOperand(i-1));
5768       }
5769       if (ThisIsNonZero) {
5770         ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
5771         ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
5772                               ThisElt, DAG.getConstant(8, MVT::i8));
5773         if (LastIsNonZero)
5774           ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
5775       } else
5776         ThisElt = LastElt;
5777
5778       if (ThisElt.getNode())
5779         V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
5780                         DAG.getIntPtrConstant(i/2));
5781     }
5782   }
5783
5784   return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V);
5785 }
5786
5787 /// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16.
5788 ///
5789 static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
5790                                      unsigned NumNonZero, unsigned NumZero,
5791                                      SelectionDAG &DAG,
5792                                      const X86Subtarget* Subtarget,
5793                                      const TargetLowering &TLI) {
5794   if (NumNonZero > 4)
5795     return SDValue();
5796
5797   SDLoc dl(Op);
5798   SDValue V;
5799   bool First = true;
5800   for (unsigned i = 0; i < 8; ++i) {
5801     bool isNonZero = (NonZeros & (1 << i)) != 0;
5802     if (isNonZero) {
5803       if (First) {
5804         if (NumZero)
5805           V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
5806         else
5807           V = DAG.getUNDEF(MVT::v8i16);
5808         First = false;
5809       }
5810       V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
5811                       MVT::v8i16, V, Op.getOperand(i),
5812                       DAG.getIntPtrConstant(i));
5813     }
5814   }
5815
5816   return V;
5817 }
5818
/// LowerBuildVectorv4x32 - Custom lower build_vector of v4i32 or v4f32.
///
/// Every operand must be either zeroable (undef or a zero node) or an
/// EXTRACT_VECTOR_ELT with a constant index from a 128-bit vector; otherwise
/// an empty SDValue is returned. Two lowerings are attempted, in order:
///  1. a shuffle that blends one source vector with a zero vector, when every
///     non-zeroable operand i extracts element i of the same vector;
///  2. an X86ISD::INSERTPS node (SSE4.1 only), when exactly one operand breaks
///     the pattern above and all the others still match it.
/// Returns an empty SDValue if neither applies.
static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
                                     const X86Subtarget *Subtarget,
                                     const TargetLowering &TLI) {
  // Find all zeroable elements.
  bool Zeroable[4];
  for (int i=0; i < 4; ++i) {
    SDValue Elt = Op->getOperand(i);
    Zeroable[i] = (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt));
  }
  assert(std::count_if(&Zeroable[0], &Zeroable[4],
                       [](bool M) { return !M; }) > 1 &&
         "We expect at least two non-zero elements!");

  // We only know how to deal with build_vector nodes where elements are either
  // zeroable or extract_vector_elt with constant index.
  SDValue FirstNonZero;
  // Only meaningful once FirstNonZero has been set by the loop below.
  unsigned FirstNonZeroIdx;
  for (unsigned i=0; i < 4; ++i) {
    if (Zeroable[i])
      continue;
    SDValue Elt = Op->getOperand(i);
    if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        !isa<ConstantSDNode>(Elt.getOperand(1)))
      return SDValue();
    // Make sure that this node is extracting from a 128-bit vector.
    MVT VT = Elt.getOperand(0).getSimpleValueType();
    if (!VT.is128BitVector())
      return SDValue();
    if (!FirstNonZero.getNode()) {
      FirstNonZero = Elt;
      FirstNonZeroIdx = i;
    }
  }

  assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
  // V1 is the vector the first non-zeroable operand extracts from; VT is that
  // vector's type (not necessarily the type of Op).
  SDValue V1 = FirstNonZero.getOperand(0);
  MVT VT = V1.getSimpleValueType();

  // See if this build_vector can be lowered as a blend with zero.
  // The loop stops early (EltIdx < 4) at the first operand that does not
  // extract element EltIdx of V1; Elt and EltMaskIdx then describe that
  // operand and are reused by the INSERTPS path below.
  SDValue Elt;
  unsigned EltMaskIdx, EltIdx;
  int Mask[4];
  for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
    if (Zeroable[EltIdx]) {
      // The zero vector will be on the right hand side.
      Mask[EltIdx] = EltIdx+4;
      continue;
    }

    Elt = Op->getOperand(EltIdx);
    // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
    EltMaskIdx = cast<ConstantSDNode>(Elt.getOperand(1))->getZExtValue();
    if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
      break;
    Mask[EltIdx] = EltIdx;
  }

  if (EltIdx == 4) {
    // Let the shuffle legalizer deal with blend operations.
    SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
    // NOTE(review): VT was taken from V1 above, so this condition looks like
    // it can never be true here.
    if (V1.getSimpleValueType() != VT)
      V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), VT, V1);
    return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, &Mask[0]);
  }

  // See if we can lower this build_vector to a INSERTPS.
  if (!Subtarget->hasSSE41())
    return SDValue();

  // V2 is the vector the mismatching operand extracts from. If that operand
  // is also the first non-zeroable one, the destination vector V1 is not
  // known yet and is picked up from the remaining operands below.
  SDValue V2 = Elt.getOperand(0);
  if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
    V1 = SDValue();

  // Every remaining non-zeroable operand i must extract element i of V1.
  bool CanFold = true;
  for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
    if (Zeroable[i])
      continue;

    SDValue Current = Op->getOperand(i);
    SDValue SrcVector = Current->getOperand(0);
    if (!V1.getNode())
      V1 = SrcVector;
    CanFold = SrcVector == V1 &&
      cast<ConstantSDNode>(Current.getOperand(1))->getZExtValue() == i;
  }

  if (!CanFold)
    return SDValue();

  assert(V1.getNode() && "Expected at least two non-zero elements!");
  // The INSERTPS node is built on v4f32, so bitcast both inputs if needed.
  if (V1.getSimpleValueType() != MVT::v4f32)
    V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), MVT::v4f32, V1);
  if (V2.getSimpleValueType() != MVT::v4f32)
    V2 = DAG.getNode(ISD::BITCAST, SDLoc(V2), MVT::v4f32, V2);

  // Ok, we can emit an INSERTPS instruction.
  // Bit i of ZMask is set for every zeroable result element.
  unsigned ZMask = 0;
  for (int i = 0; i < 4; ++i)
    if (Zeroable[i])
      ZMask |= 1 << i;

  // Immediate layout: source element index in bits 7:6, destination element
  // index in bits 5:4, zero mask in bits 3:0.
  unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
  assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
  SDValue Result = DAG.getNode(X86ISD::INSERTPS, SDLoc(Op), MVT::v4f32, V1, V2,
                               DAG.getIntPtrConstant(InsertPSMask));
  return DAG.getNode(ISD::BITCAST, SDLoc(Op), VT, Result);
}
5927
5928 /// Return a vector logical shift node.
5929 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
5930                          unsigned NumBits, SelectionDAG &DAG,
5931                          const TargetLowering &TLI, SDLoc dl) {
5932   assert(VT.is128BitVector() && "Unknown type for VShift");
5933   MVT ShVT = MVT::v2i64;
5934   unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
5935   SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp);
5936   MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(SrcOp.getValueType());
5937   SDValue ShiftVal = DAG.getConstant(NumBits, ScalarShiftTy);
5938   return DAG.getNode(ISD::BITCAST, dl, VT,
5939                      DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
5940 }
5941
5942 static SDValue
5943 LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) {
5944
5945   // Check if the scalar load can be widened into a vector load. And if
5946   // the address is "base + cst" see if the cst can be "absorbed" into
5947   // the shuffle mask.
5948   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
5949     SDValue Ptr = LD->getBasePtr();
5950     if (!ISD::isNormalLoad(LD) || LD->isVolatile())
5951       return SDValue();
5952     EVT PVT = LD->getValueType(0);
5953     if (PVT != MVT::i32 && PVT != MVT::f32)
5954       return SDValue();
5955
5956     int FI = -1;
5957     int64_t Offset = 0;
5958     if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
5959       FI = FINode->getIndex();
5960       Offset = 0;
5961     } else if (DAG.isBaseWithConstantOffset(Ptr) &&
5962                isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
5963       FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
5964       Offset = Ptr.getConstantOperandVal(1);
5965       Ptr = Ptr.getOperand(0);
5966     } else {
5967       return SDValue();
5968     }
5969
5970     // FIXME: 256-bit vector instructions don't require a strict alignment,
5971     // improve this code to support it better.
5972     unsigned RequiredAlign = VT.getSizeInBits()/8;
5973     SDValue Chain = LD->getChain();
5974     // Make sure the stack object alignment is at least 16 or 32.
5975     MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
5976     if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
5977       if (MFI->isFixedObjectIndex(FI)) {
5978         // Can't change the alignment. FIXME: It's possible to compute
5979         // the exact stack offset and reference FI + adjust offset instead.
5980         // If someone *really* cares about this. That's the way to implement it.
5981         return SDValue();
5982       } else {
5983         MFI->setObjectAlignment(FI, RequiredAlign);
5984       }
5985     }
5986
5987     // (Offset % 16 or 32) must be multiple of 4. Then address is then
5988     // Ptr + (Offset & ~15).
5989     if (Offset < 0)
5990       return SDValue();
5991     if ((Offset % RequiredAlign) & 3)
5992       return SDValue();
5993     int64_t StartOffset = Offset & ~(RequiredAlign-1);
5994     if (StartOffset)
5995       Ptr = DAG.getNode(ISD::ADD, SDLoc(Ptr), Ptr.getValueType(),
5996                         Ptr,DAG.getConstant(StartOffset, Ptr.getValueType()));
5997
5998     int EltNo = (Offset - StartOffset) >> 2;
5999     unsigned NumElems = VT.getVectorNumElements();
6000
6001     EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
6002     SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
6003                              LD->getPointerInfo().getWithOffset(StartOffset),
6004                              false, false, false, 0);
6005
6006     SmallVector<int, 8> Mask;
6007     for (unsigned i = 0; i != NumElems; ++i)
6008       Mask.push_back(EltNo);
6009
6010     return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]);
6011   }
6012
6013   return SDValue();
6014 }
6015
/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
/// elements can be replaced by a single large load which has the same value as
/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
///
/// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a
///
/// Returns the replacement value, or an empty SDValue if the elements do not
/// form a supported pattern. When a replacement is produced and the base
/// load's chain result has users, those users are rewired to a TokenFactor of
/// the old and the new load's chains.
///
/// FIXME: we'd also like to handle the case where the last elements are zero
/// rather than undef via VZEXT_LOAD, but we do not detect that case today.
/// There's even a handy isZeroNode for that purpose.
static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
                                        SDLoc &DL, SelectionDAG &DAG,
                                        bool isAfterLegalize) {
  unsigned NumElems = Elts.size();

  // LDBase is the load feeding the first element; LastLoadedElt is the index
  // of the last element that is a load rather than undef.
  LoadSDNode *LDBase = nullptr;
  unsigned LastLoadedElt = -1U;

  // For each element in the initializer, see if we've found a load or an undef.
  // If we don't find an initial load element, or later load elements are
  // non-consecutive, bail out.
  for (unsigned i = 0; i < NumElems; ++i) {
    SDValue Elt = Elts[i];
    // Look through a bitcast.
    if (Elt.getNode() && Elt.getOpcode() == ISD::BITCAST)
      Elt = Elt.getOperand(0);
    if (!Elt.getNode() ||
        (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
      return SDValue();
    if (!LDBase) {
      // The very first element must be a load; a leading undef is rejected.
      if (Elt.getNode()->getOpcode() == ISD::UNDEF)
        return SDValue();
      LDBase = cast<LoadSDNode>(Elt.getNode());
      LastLoadedElt = i;
      continue;
    }
    if (Elt.getOpcode() == ISD::UNDEF)
      continue;

    LoadSDNode *LD = cast<LoadSDNode>(Elt);
    EVT LdVT = Elt.getValueType();
    // Each loaded element must be the correct fractional portion of the
    // requested vector load.
    if (LdVT.getSizeInBits() != VT.getSizeInBits() / NumElems)
      return SDValue();
    // Element i must be loaded from i element-sizes past the base load.
    if (!DAG.isConsecutiveLoad(LD, LDBase, LdVT.getSizeInBits() / 8, i))
      return SDValue();
    LastLoadedElt = i;
  }

  // If we have found an entire vector of loads and undefs, then return a large
  // load of the entire vector width starting at the base pointer.  If we found
  // consecutive loads for the low half, generate a vzext_load node.
  if (LastLoadedElt == NumElems - 1) {
    assert(LDBase && "Did not find base load for merging consecutive loads");
    EVT EltVT = LDBase->getValueType(0);
    // Ensure that the input vector size for the merged loads matches the
    // cumulative size of the input elements.
    if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems)
      return SDValue();

    if (isAfterLegalize &&
        !DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, VT))
      return SDValue();

    SDValue NewLd = SDValue();

    // The wide load inherits chain, pointer, flags and alignment from the
    // base load.
    NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
                        LDBase->getPointerInfo(), LDBase->isVolatile(),
                        LDBase->isNonTemporal(), LDBase->isInvariant(),
                        LDBase->getAlignment());

    // Keep the new load ordered like LDBase: users of LDBase's chain now use
    // a TokenFactor of both chains. The replace-all-uses call also rewrites
    // the TokenFactor's own LDBase operand, so its operands are restored
    // right afterwards.
    if (LDBase->hasAnyUseOfValue(1)) {
      SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
                                     SDValue(LDBase, 1),
                                     SDValue(NewLd.getNode(), 1));
      DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
      DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
                             SDValue(NewLd.getNode(), 1));
    }

    return NewLd;
  }

  //TODO: The code below fires only for loading the low v2i32 / v2f32
  //of a v4i32 / v4f32. It's probably worth generalizing.
  EVT EltVT = VT.getVectorElementType();
  if (NumElems == 4 && LastLoadedElt == 1 && (EltVT.getSizeInBits() == 32) &&
      DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
    // Load 64 bits through a VZEXT_LOAD producing v2i64, then bitcast the
    // result to the requested type.
    SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
    SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
    SDValue ResNode =
        DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, MVT::i64,
                                LDBase->getPointerInfo(),
                                LDBase->getAlignment(),
                                false/*isVolatile*/, true/*ReadMem*/,
                                false/*WriteMem*/);

    // Make sure the newly-created LOAD is in the same position as LDBase in
    // terms of dependency. We create a TokenFactor for LDBase and ResNode, and
    // update uses of LDBase's output chain to use the TokenFactor.
    if (LDBase->hasAnyUseOfValue(1)) {
      SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
                             SDValue(LDBase, 1), SDValue(ResNode.getNode(), 1));
      DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
      DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
                             SDValue(ResNode.getNode(), 1));
    }

    return DAG.getNode(ISD::BITCAST, DL, VT, ResNode);
  }
  return SDValue();
}
6128
/// LowerVectorBroadcast - Attempt to use the vbroadcast instruction
/// to generate a splat value for the following cases:
/// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant.
/// 2. A splat shuffle which uses a scalar_to_vector node which comes from
/// a scalar load, or a constant.
/// The VBROADCAST node is returned when a pattern is found,
/// or SDValue() otherwise.
static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
                                    SelectionDAG &DAG) {
  // VBROADCAST requires AVX.
  // TODO: Splats could be generated for non-AVX CPUs using SSE
  // instructions, but there's less potential gain for only 128-bit vectors.
  if (!Subtarget->hasAVX())
    return SDValue();

  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);

  assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
         "Unsupported vector type for broadcast.");

  // Ld is the scalar being splatted; ConstSplatVal records whether it is an
  // integer or floating-point constant. Both are set by every switch case
  // that does not return.
  SDValue Ld;
  bool ConstSplatVal;

  switch (Op.getOpcode()) {
    default:
      // Unknown pattern found.
      return SDValue();

    case ISD::BUILD_VECTOR: {
      auto *BVOp = cast<BuildVectorSDNode>(Op.getNode());
      BitVector UndefElements;
      SDValue Splat = BVOp->getSplatValue(&UndefElements);

      // We need a splat of a single value to use broadcast, and it doesn't
      // make any sense if the value is only in one element of the vector.
      if (!Splat || (VT.getVectorNumElements() - UndefElements.count()) <= 1)
        return SDValue();

      Ld = Splat;
      ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
                       Ld.getOpcode() == ISD::ConstantFP);

      // Make sure that all of the users of a non-constant load are from the
      // BUILD_VECTOR node.
      if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
        return SDValue();
      break;
    }

    case ISD::VECTOR_SHUFFLE: {
      ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);

      // Shuffles must have a splat mask where the first element is
      // broadcasted.
      if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0)
        return SDValue();

      SDValue Sc = Op.getOperand(0);
      // A shuffle input that is neither scalar_to_vector nor build_vector can
      // only be broadcast from a register.
      if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR &&
          Sc.getOpcode() != ISD::BUILD_VECTOR) {

        if (!Subtarget->hasInt256())
          return SDValue();

        // Use the register form of the broadcast instruction available on AVX2.
        if (VT.getSizeInBits() >= 256)
          Sc = Extract128BitVector(Sc, 0, DAG, dl);
        return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc);
      }

      Ld = Sc.getOperand(0);
      ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
                       Ld.getOpcode() == ISD::ConstantFP);

      // The scalar_to_vector node and the suspected
      // load node must have exactly one user.
      // Constants may have multiple users.

      // AVX-512 has register version of the broadcast
      bool hasRegVer = Subtarget->hasAVX512() && VT.is512BitVector() &&
        Ld.getValueType().getSizeInBits() >= 32;
      if (!ConstSplatVal && ((!Sc.hasOneUse() || !Ld.hasOneUse()) &&
          !hasRegVer))
        return SDValue();
      break;
    }
  }

  // Size in bits of the scalar being broadcast, and whether the result vector
  // is 256 bits or wider.
  unsigned ScalarSize = Ld.getValueType().getSizeInBits();
  bool IsGE256 = (VT.getSizeInBits() >= 256);

  // When optimizing for size, generate up to 5 extra bytes for a broadcast
  // instruction to save 8 or more bytes of constant pool data.
  // TODO: If multiple splats are generated to load the same constant,
  // it may be detrimental to overall size. There needs to be a way to detect
  // that condition to know if this is truly a size win.
  const Function *F = DAG.getMachineFunction().getFunction();
  bool OptForSize = F->getAttributes().
    hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);

  // Handle broadcasting a single constant scalar from the constant pool
  // into a vector.
  // On Sandybridge (no AVX2), it is still better to load a constant vector
  // from the constant pool and not to broadcast it from a scalar.
  // But override that restriction when optimizing for size.
  // TODO: Check if splatting is recommended for other AVX-capable CPUs.
  if (ConstSplatVal && (Subtarget->hasAVX2() || OptForSize)) {
    EVT CVT = Ld.getValueType();
    assert(!CVT.isVector() && "Must not broadcast a vector type");

    // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
    // For size optimization, also splat v2f64 and v2i64, and for size opt
    // with AVX2, also splat i8 and i16.
    // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
    if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
        (OptForSize && (ScalarSize == 64 || Subtarget->hasAVX2()))) {
      const Constant *C = nullptr;
      if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
        C = CI->getConstantIntValue();
      else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
        C = CF->getConstantFPValue();

      assert(C && "Invalid constant type");

      // Place the scalar in the constant pool and broadcast a load of it.
      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
      SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
      unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
      Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP,
                       MachinePointerInfo::getConstantPool(),
                       false, false, false, Alignment);

      return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
    }
  }

  bool IsLoad = ISD::isNormalLoad(Ld.getNode());

  // Handle AVX2 in-register broadcasts.
  if (!IsLoad && Subtarget->hasInt256() &&
      (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
    return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);

  // The scalar source must be a normal load.
  if (!IsLoad)
    return SDValue();

  // Broadcast-from-memory of 32-bit scalars, of 64-bit scalars into 256-bit
  // or wider vectors, and of 64-bit scalars whenever VLX is available.
  if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
      (Subtarget->hasVLX() && ScalarSize == 64))
    return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);

  // The integer check is needed for the 64-bit into 128-bit so it doesn't match
  // double since there is no vbroadcastsd xmm
  if (Subtarget->hasInt256() && Ld.getValueType().isInteger()) {
    if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
      return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
  }

  // Unsupported broadcast.
  return SDValue();
}
6290
6291 /// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real
6292 /// underlying vector and index.
6293 ///
6294 /// Modifies \p ExtractedFromVec to the real vector and returns the real
6295 /// index.
6296 static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
6297                                          SDValue ExtIdx) {
6298   int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
6299   if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
6300     return Idx;
6301
6302   // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
6303   // lowered this:
6304   //   (extract_vector_elt (v8f32 %vreg1), Constant<6>)
6305   // to:
6306   //   (extract_vector_elt (vector_shuffle<2,u,u,u>
6307   //                           (extract_subvector (v8f32 %vreg0), Constant<4>),
6308   //                           undef)
6309   //                       Constant<0>)
6310   // In this case the vector is the extract_subvector expression and the index
6311   // is 2, as specified by the shuffle.
6312   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
6313   SDValue ShuffleVec = SVOp->getOperand(0);
6314   MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
6315   assert(ShuffleVecVT.getVectorElementType() ==
6316          ExtractedFromVec.getSimpleValueType().getVectorElementType());
6317
6318   int ShuffleIdx = SVOp->getMaskElt(Idx);
6319   if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
6320     ExtractedFromVec = ShuffleVec;
6321     return ShuffleIdx;
6322   }
6323   return Idx;
6324 }
6325
6326 static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
6327   MVT VT = Op.getSimpleValueType();
6328
6329   // Skip if insert_vec_elt is not supported.
6330   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6331   if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
6332     return SDValue();
6333
6334   SDLoc DL(Op);
6335   unsigned NumElems = Op.getNumOperands();
6336
6337   SDValue VecIn1;
6338   SDValue VecIn2;
6339   SmallVector<unsigned, 4> InsertIndices;
6340   SmallVector<int, 8> Mask(NumElems, -1);
6341
6342   for (unsigned i = 0; i != NumElems; ++i) {
6343     unsigned Opc = Op.getOperand(i).getOpcode();
6344
6345     if (Opc == ISD::UNDEF)
6346       continue;
6347
6348     if (Opc != ISD::EXTRACT_VECTOR_ELT) {
6349       // Quit if more than 1 elements need inserting.
6350       if (InsertIndices.size() > 1)
6351         return SDValue();
6352
6353       InsertIndices.push_back(i);
6354       continue;
6355     }
6356
6357     SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
6358     SDValue ExtIdx = Op.getOperand(i).getOperand(1);
6359     // Quit if non-constant index.
6360     if (!isa<ConstantSDNode>(ExtIdx))
6361       return SDValue();
6362     int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
6363
6364     // Quit if extracted from vector of different type.
6365     if (ExtractedFromVec.getValueType() != VT)
6366       return SDValue();
6367
6368     if (!VecIn1.getNode())
6369       VecIn1 = ExtractedFromVec;
6370     else if (VecIn1 != ExtractedFromVec) {
6371       if (!VecIn2.getNode())
6372         VecIn2 = ExtractedFromVec;
6373       else if (VecIn2 != ExtractedFromVec)
6374         // Quit if more than 2 vectors to shuffle
6375         return SDValue();
6376     }
6377
6378     if (ExtractedFromVec == VecIn1)
6379       Mask[i] = Idx;
6380     else if (ExtractedFromVec == VecIn2)
6381       Mask[i] = Idx + NumElems;
6382   }
6383
6384   if (!VecIn1.getNode())
6385     return SDValue();
6386
6387   VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
6388   SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, &Mask[0]);
6389   for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) {
6390     unsigned Idx = InsertIndices[i];
6391     NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
6392                      DAG.getIntPtrConstant(Idx));
6393   }
6394
6395   return NV;
6396 }
6397
6398 // Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
6399 SDValue
6400 X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
6401
6402   MVT VT = Op.getSimpleValueType();
6403   assert((VT.getVectorElementType() == MVT::i1) && (VT.getSizeInBits() <= 16) &&
6404          "Unexpected type in LowerBUILD_VECTORvXi1!");
6405
6406   SDLoc dl(Op);
6407   if (ISD::isBuildVectorAllZeros(Op.getNode())) {
6408     SDValue Cst = DAG.getTargetConstant(0, MVT::i1);
6409     SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
6410     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
6411   }
6412
6413   if (ISD::isBuildVectorAllOnes(Op.getNode())) {
6414     SDValue Cst = DAG.getTargetConstant(1, MVT::i1);
6415     SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
6416     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
6417   }
6418
6419   bool AllContants = true;
6420   uint64_t Immediate = 0;
6421   int NonConstIdx = -1;
6422   bool IsSplat = true;
6423   unsigned NumNonConsts = 0;
6424   unsigned NumConsts = 0;
6425   for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
6426     SDValue In = Op.getOperand(idx);
6427     if (In.getOpcode() == ISD::UNDEF)
6428       continue;
6429     if (!isa<ConstantSDNode>(In)) {
6430       AllContants = false;
6431       NonConstIdx = idx;
6432       NumNonConsts++;
6433     } else {
6434       NumConsts++;
6435       if (cast<ConstantSDNode>(In)->getZExtValue())
6436       Immediate |= (1ULL << idx);
6437     }
6438     if (In != Op.getOperand(0))
6439       IsSplat = false;
6440   }
6441
6442   if (AllContants) {
6443     SDValue FullMask = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1,
6444       DAG.getConstant(Immediate, MVT::i16));
6445     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, FullMask,
6446                        DAG.getIntPtrConstant(0));
6447   }
6448
6449   if (NumNonConsts == 1 && NonConstIdx != 0) {
6450     SDValue DstVec;
6451     if (NumConsts) {
6452       SDValue VecAsImm = DAG.getConstant(Immediate,
6453                                          MVT::getIntegerVT(VT.getSizeInBits()));
6454       DstVec = DAG.getNode(ISD::BITCAST, dl, VT, VecAsImm);
6455     }
6456     else
6457       DstVec = DAG.getUNDEF(VT);
6458     return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
6459                        Op.getOperand(NonConstIdx),
6460                        DAG.getIntPtrConstant(NonConstIdx));
6461   }
6462   if (!IsSplat && (NonConstIdx != 0))
6463     llvm_unreachable("Unsupported BUILD_VECTOR operation");
6464   MVT SelectVT = (VT == MVT::v16i1)? MVT::i16 : MVT::i8;
6465   SDValue Select;
6466   if (IsSplat)
6467     Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0),
6468                           DAG.getConstant(-1, SelectVT),
6469                           DAG.getConstant(0, SelectVT));
6470   else
6471     Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0),
6472                          DAG.getConstant((Immediate | 1), SelectVT),
6473                          DAG.getConstant(Immediate, SelectVT));
6474   return DAG.getNode(ISD::BITCAST, dl, VT, Select);
6475 }
6476
6477 /// \brief Return true if \p N implements a horizontal binop and return the
6478 /// operands for the horizontal binop into V0 and V1.
6479 ///
6480 /// This is a helper function of PerformBUILD_VECTORCombine.
6481 /// This function checks that the build_vector \p N in input implements a
6482 /// horizontal operation. Parameter \p Opcode defines the kind of horizontal
6483 /// operation to match.
6484 /// For example, if \p Opcode is equal to ISD::ADD, then this function
6485 /// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
6486 /// is equal to ISD::SUB, then this function checks if this is a horizontal
6487 /// arithmetic sub.
6488 ///
6489 /// This function only analyzes elements of \p N whose indices are
6490 /// in range [BaseIdx, LastIdx).
6491 static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
6492                               SelectionDAG &DAG,
6493                               unsigned BaseIdx, unsigned LastIdx,
6494                               SDValue &V0, SDValue &V1) {
6495   EVT VT = N->getValueType(0);
6496
6497   assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
6498   assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
6499          "Invalid Vector in input!");
6500
6501   bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
6502   bool CanFold = true;
6503   unsigned ExpectedVExtractIdx = BaseIdx;
6504   unsigned NumElts = LastIdx - BaseIdx;
6505   V0 = DAG.getUNDEF(VT);
6506   V1 = DAG.getUNDEF(VT);
6507
6508   // Check if N implements a horizontal binop.
6509   for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
6510     SDValue Op = N->getOperand(i + BaseIdx);
6511
6512     // Skip UNDEFs.
6513     if (Op->getOpcode() == ISD::UNDEF) {
6514       // Update the expected vector extract index.
6515       if (i * 2 == NumElts)
6516         ExpectedVExtractIdx = BaseIdx;
6517       ExpectedVExtractIdx += 2;
6518       continue;
6519     }
6520
6521     CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
6522
6523     if (!CanFold)
6524       break;
6525
6526     SDValue Op0 = Op.getOperand(0);
6527     SDValue Op1 = Op.getOperand(1);
6528
6529     // Try to match the following pattern:
6530     // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
6531     CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
6532         Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
6533         Op0.getOperand(0) == Op1.getOperand(0) &&
6534         isa<ConstantSDNode>(Op0.getOperand(1)) &&
6535         isa<ConstantSDNode>(Op1.getOperand(1)));
6536     if (!CanFold)
6537       break;
6538
6539     unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
6540     unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
6541
6542     if (i * 2 < NumElts) {
6543       if (V0.getOpcode() == ISD::UNDEF)
6544         V0 = Op0.getOperand(0);
6545     } else {
6546       if (V1.getOpcode() == ISD::UNDEF)
6547         V1 = Op0.getOperand(0);
6548       if (i * 2 == NumElts)
6549         ExpectedVExtractIdx = BaseIdx;
6550     }
6551
6552     SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
6553     if (I0 == ExpectedVExtractIdx)
6554       CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
6555     else if (IsCommutable && I1 == ExpectedVExtractIdx) {
6556       // Try to match the following dag sequence:
6557       // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
6558       CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
6559     } else
6560       CanFold = false;
6561
6562     ExpectedVExtractIdx += 2;
6563   }
6564
6565   return CanFold;
6566 }
6567
6568 /// \brief Emit a sequence of two 128-bit horizontal add/sub followed by
6569 /// a concat_vector.
6570 ///
6571 /// This is a helper function of PerformBUILD_VECTORCombine.
6572 /// This function expects two 256-bit vectors called V0 and V1.
6573 /// At first, each vector is split into two separate 128-bit vectors.
6574 /// Then, the resulting 128-bit vectors are used to implement two
6575 /// horizontal binary operations.
6576 ///
6577 /// The kind of horizontal binary operation is defined by \p X86Opcode.
6578 ///
6579 /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
6580 /// the two new horizontal binop.
6581 /// When Mode is set, the first horizontal binop dag node would take as input
6582 /// the lower 128-bit of V0 and the upper 128-bit of V0. The second
6583 /// horizontal binop dag node would take as input the lower 128-bit of V1
6584 /// and the upper 128-bit of V1.
6585 ///   Example:
6586 ///     HADD V0_LO, V0_HI
6587 ///     HADD V1_LO, V1_HI
6588 ///
6589 /// Otherwise, the first horizontal binop dag node takes as input the lower
6590 /// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
6591 /// dag node takes the the upper 128-bit of V0 and the upper 128-bit of V1.
6592 ///   Example:
6593 ///     HADD V0_LO, V1_LO
6594 ///     HADD V0_HI, V1_HI
6595 ///
6596 /// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
6597 /// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
6598 /// the upper 128-bits of the result.
6599 static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
6600                                      SDLoc DL, SelectionDAG &DAG,
6601                                      unsigned X86Opcode, bool Mode,
6602                                      bool isUndefLO, bool isUndefHI) {
6603   EVT VT = V0.getValueType();
6604   assert(VT.is256BitVector() && VT == V1.getValueType() &&
6605          "Invalid nodes in input!");
6606
6607   unsigned NumElts = VT.getVectorNumElements();
6608   SDValue V0_LO = Extract128BitVector(V0, 0, DAG, DL);
6609   SDValue V0_HI = Extract128BitVector(V0, NumElts/2, DAG, DL);
6610   SDValue V1_LO = Extract128BitVector(V1, 0, DAG, DL);
6611   SDValue V1_HI = Extract128BitVector(V1, NumElts/2, DAG, DL);
6612   EVT NewVT = V0_LO.getValueType();
6613
6614   SDValue LO = DAG.getUNDEF(NewVT);
6615   SDValue HI = DAG.getUNDEF(NewVT);
6616
6617   if (Mode) {
6618     // Don't emit a horizontal binop if the result is expected to be UNDEF.
6619     if (!isUndefLO && V0->getOpcode() != ISD::UNDEF)
6620       LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
6621     if (!isUndefHI && V1->getOpcode() != ISD::UNDEF)
6622       HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
6623   } else {
6624     // Don't emit a horizontal binop if the result is expected to be UNDEF.
6625     if (!isUndefLO && (V0_LO->getOpcode() != ISD::UNDEF ||
6626                        V1_LO->getOpcode() != ISD::UNDEF))
6627       LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
6628
6629     if (!isUndefHI && (V0_HI->getOpcode() != ISD::UNDEF ||
6630                        V1_HI->getOpcode() != ISD::UNDEF))
6631       HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
6632   }
6633
6634   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
6635 }
6636
6637 /// \brief Try to fold a build_vector that performs an 'addsub' into the
6638 /// sequence of 'vadd + vsub + blendi'.
6639 static SDValue matchAddSub(const BuildVectorSDNode *BV, SelectionDAG &DAG,
6640                            const X86Subtarget *Subtarget) {
6641   SDLoc DL(BV);
6642   EVT VT = BV->getValueType(0);
6643   unsigned NumElts = VT.getVectorNumElements();
6644   SDValue InVec0 = DAG.getUNDEF(VT);
6645   SDValue InVec1 = DAG.getUNDEF(VT);
6646
6647   assert((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
6648           VT == MVT::v2f64) && "build_vector with an invalid type found!");
6649
6650   // Odd-numbered elements in the input build vector are obtained from
6651   // adding two integer/float elements.
6652   // Even-numbered elements in the input build vector are obtained from
6653   // subtracting two integer/float elements.
6654   unsigned ExpectedOpcode = ISD::FSUB;
6655   unsigned NextExpectedOpcode = ISD::FADD;
6656   bool AddFound = false;
6657   bool SubFound = false;
6658
6659   for (unsigned i = 0, e = NumElts; i != e; i++) {
6660     SDValue Op = BV->getOperand(i);
6661
6662     // Skip 'undef' values.
6663     unsigned Opcode = Op.getOpcode();
6664     if (Opcode == ISD::UNDEF) {
6665       std::swap(ExpectedOpcode, NextExpectedOpcode);
6666       continue;
6667     }
6668
6669     // Early exit if we found an unexpected opcode.
6670     if (Opcode != ExpectedOpcode)
6671       return SDValue();
6672
6673     SDValue Op0 = Op.getOperand(0);
6674     SDValue Op1 = Op.getOperand(1);
6675
6676     // Try to match the following pattern:
6677     // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
6678     // Early exit if we cannot match that sequence.
6679     if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6680         Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6681         !isa<ConstantSDNode>(Op0.getOperand(1)) ||
6682         !isa<ConstantSDNode>(Op1.getOperand(1)) ||
6683         Op0.getOperand(1) != Op1.getOperand(1))
6684       return SDValue();
6685
6686     unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
6687     if (I0 != i)
6688       return SDValue();
6689
6690     // We found a valid add/sub node. Update the information accordingly.
6691     if (i & 1)
6692       AddFound = true;
6693     else
6694       SubFound = true;
6695
6696     // Update InVec0 and InVec1.
6697     if (InVec0.getOpcode() == ISD::UNDEF)
6698       InVec0 = Op0.getOperand(0);
6699     if (InVec1.getOpcode() == ISD::UNDEF)
6700       InVec1 = Op1.getOperand(0);
6701
6702     // Make sure that operands in input to each add/sub node always
6703     // come from a same pair of vectors.
6704     if (InVec0 != Op0.getOperand(0)) {
6705       if (ExpectedOpcode == ISD::FSUB)
6706         return SDValue();
6707
6708       // FADD is commutable. Try to commute the operands
6709       // and then test again.
6710       std::swap(Op0, Op1);
6711       if (InVec0 != Op0.getOperand(0))
6712         return SDValue();
6713     }
6714
6715     if (InVec1 != Op1.getOperand(0))
6716       return SDValue();
6717
6718     // Update the pair of expected opcodes.
6719     std::swap(ExpectedOpcode, NextExpectedOpcode);
6720   }
6721
6722   // Don't try to fold this build_vector into an ADDSUB if the inputs are undef.
6723   if (AddFound && SubFound && InVec0.getOpcode() != ISD::UNDEF &&
6724       InVec1.getOpcode() != ISD::UNDEF)
6725     return DAG.getNode(X86ISD::ADDSUB, DL, VT, InVec0, InVec1);
6726
6727   return SDValue();
6728 }
6729
6730 static SDValue PerformBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG,
6731                                           const X86Subtarget *Subtarget) {
6732   SDLoc DL(N);
6733   EVT VT = N->getValueType(0);
6734   unsigned NumElts = VT.getVectorNumElements();
6735   BuildVectorSDNode *BV = cast<BuildVectorSDNode>(N);
6736   SDValue InVec0, InVec1;
6737
6738   // Try to match an ADDSUB.
6739   if ((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
6740       (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
6741     SDValue Value = matchAddSub(BV, DAG, Subtarget);
6742     if (Value.getNode())
6743       return Value;
6744   }
6745
6746   // Try to match horizontal ADD/SUB.
6747   unsigned NumUndefsLO = 0;
6748   unsigned NumUndefsHI = 0;
6749   unsigned Half = NumElts/2;
6750
6751   // Count the number of UNDEF operands in the build_vector in input.
6752   for (unsigned i = 0, e = Half; i != e; ++i)
6753     if (BV->getOperand(i)->getOpcode() == ISD::UNDEF)
6754       NumUndefsLO++;
6755
6756   for (unsigned i = Half, e = NumElts; i != e; ++i)
6757     if (BV->getOperand(i)->getOpcode() == ISD::UNDEF)
6758       NumUndefsHI++;
6759
6760   // Early exit if this is either a build_vector of all UNDEFs or all the
6761   // operands but one are UNDEF.
6762   if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
6763     return SDValue();
6764
6765   if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget->hasSSE3()) {
6766     // Try to match an SSE3 float HADD/HSUB.
6767     if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
6768       return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
6769
6770     if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
6771       return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
6772   } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget->hasSSSE3()) {
6773     // Try to match an SSSE3 integer HADD/HSUB.
6774     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
6775       return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);
6776
6777     if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
6778       return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
6779   }
6780
6781   if (!Subtarget->hasAVX())
6782     return SDValue();
6783
6784   if ((VT == MVT::v8f32 || VT == MVT::v4f64)) {
6785     // Try to match an AVX horizontal add/sub of packed single/double
6786     // precision floating point values from 256-bit vectors.
6787     SDValue InVec2, InVec3;
6788     if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
6789         isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
6790         ((InVec0.getOpcode() == ISD::UNDEF ||
6791           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
6792         ((InVec1.getOpcode() == ISD::UNDEF ||
6793           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
6794       return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
6795
6796     if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
6797         isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
6798         ((InVec0.getOpcode() == ISD::UNDEF ||
6799           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
6800         ((InVec1.getOpcode() == ISD::UNDEF ||
6801           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
6802       return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
6803   } else if (VT == MVT::v8i32 || VT == MVT::v16i16) {
6804     // Try to match an AVX2 horizontal add/sub of signed integers.
6805     SDValue InVec2, InVec3;
6806     unsigned X86Opcode;
6807     bool CanFold = true;
6808
6809     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
6810         isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
6811         ((InVec0.getOpcode() == ISD::UNDEF ||
6812           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
6813         ((InVec1.getOpcode() == ISD::UNDEF ||
6814           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
6815       X86Opcode = X86ISD::HADD;
6816     else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
6817         isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) &&
6818         ((InVec0.getOpcode() == ISD::UNDEF ||
6819           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
6820         ((InVec1.getOpcode() == ISD::UNDEF ||
6821           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
6822       X86Opcode = X86ISD::HSUB;
6823     else
6824       CanFold = false;
6825
6826     if (CanFold) {
6827       // Fold this build_vector into a single horizontal add/sub.
6828       // Do this only if the target has AVX2.
6829       if (Subtarget->hasAVX2())
6830         return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);
6831
6832       // Do not try to expand this build_vector into a pair of horizontal
6833       // add/sub if we can emit a pair of scalar add/sub.
6834       if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
6835         return SDValue();
6836
6837       // Convert this build_vector into a pair of horizontal binop followed by
6838       // a concat vector.
6839       bool isUndefLO = NumUndefsLO == Half;
6840       bool isUndefHI = NumUndefsHI == Half;
6841       return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
6842                                    isUndefLO, isUndefHI);
6843     }
6844   }
6845
6846   if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
6847        VT == MVT::v16i16) && Subtarget->hasAVX()) {
6848     unsigned X86Opcode;
6849     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
6850       X86Opcode = X86ISD::HADD;
6851     else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
6852       X86Opcode = X86ISD::HSUB;
6853     else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
6854       X86Opcode = X86ISD::FHADD;
6855     else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
6856       X86Opcode = X86ISD::FHSUB;
6857     else
6858       return SDValue();
6859
6860     // Don't try to expand this build_vector into a pair of horizontal add/sub
6861     // if we can simply emit a pair of scalar add/sub.
6862     if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
6863       return SDValue();
6864
6865     // Convert this build_vector into two horizontal add/sub followed by
6866     // a concat vector.
6867     bool isUndefLO = NumUndefsLO == Half;
6868     bool isUndefHI = NumUndefsHI == Half;
6869     return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
6870                                  isUndefLO, isUndefHI);
6871   }
6872
6873   return SDValue();
6874 }
6875
6876 SDValue
6877 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
6878   SDLoc dl(Op);
6879
6880   MVT VT = Op.getSimpleValueType();
6881   MVT ExtVT = VT.getVectorElementType();
6882   unsigned NumElems = Op.getNumOperands();
6883
6884   // Generate vectors for predicate vectors.
6885   if (VT.getScalarType() == MVT::i1 && Subtarget->hasAVX512())
6886     return LowerBUILD_VECTORvXi1(Op, DAG);
6887
6888   // Vectors containing all zeros can be matched by pxor and xorps later
6889   if (ISD::isBuildVectorAllZeros(Op.getNode())) {
6890     // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
6891     // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
6892     if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
6893       return Op;
6894
6895     return getZeroVector(VT, Subtarget, DAG, dl);
6896   }
6897
6898   // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
6899   // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
6900   // vpcmpeqd on 256-bit vectors.
6901   if (Subtarget->hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
6902     if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasInt256()))
6903       return Op;
6904
6905     if (!VT.is512BitVector())
6906       return getOnesVector(VT, Subtarget->hasInt256(), DAG, dl);
6907   }
6908
6909   SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG);
6910   if (Broadcast.getNode())
6911     return Broadcast;
6912
6913   unsigned EVTBits = ExtVT.getSizeInBits();
6914
6915   unsigned NumZero  = 0;
6916   unsigned NumNonZero = 0;
6917   unsigned NonZeros = 0;
6918   bool IsAllConstants = true;
6919   SmallSet<SDValue, 8> Values;
6920   for (unsigned i = 0; i < NumElems; ++i) {
6921     SDValue Elt = Op.getOperand(i);
6922     if (Elt.getOpcode() == ISD::UNDEF)
6923       continue;
6924     Values.insert(Elt);
6925     if (Elt.getOpcode() != ISD::Constant &&
6926         Elt.getOpcode() != ISD::ConstantFP)
6927       IsAllConstants = false;
6928     if (X86::isZeroNode(Elt))
6929       NumZero++;
6930     else {
6931       NonZeros |= (1 << i);
6932       NumNonZero++;
6933     }
6934   }
6935
6936   // All undef vector. Return an UNDEF.  All zero vectors were handled above.
6937   if (NumNonZero == 0)
6938     return DAG.getUNDEF(VT);
6939
6940   // Special case for single non-zero, non-undef, element.
6941   if (NumNonZero == 1) {
6942     unsigned Idx = countTrailingZeros(NonZeros);
6943     SDValue Item = Op.getOperand(Idx);
6944
6945     // If this is an insertion of an i64 value on x86-32, and if the top bits of
6946     // the value are obviously zero, truncate the value to i32 and do the
6947     // insertion that way.  Only do this if the value is non-constant or if the
6948     // value is a constant being inserted into element 0.  It is cheaper to do
6949     // a constant pool load than it is to do a movd + shuffle.
6950     if (ExtVT == MVT::i64 && !Subtarget->is64Bit() &&
6951         (!IsAllConstants || Idx == 0)) {
6952       if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
6953         // Handle SSE only.
6954         assert(VT == MVT::v2i64 && "Expected an SSE value type!");
6955         EVT VecVT = MVT::v4i32;
6956         unsigned VecElts = 4;
6957
6958         // Truncate the value (which may itself be a constant) to i32, and
6959         // convert it to a vector with movd (S2V+shuffle to zero extend).
6960         Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
6961         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
6962
6963         // If using the new shuffle lowering, just directly insert this.
6964         if (ExperimentalVectorShuffleLowering)
6965           return DAG.getNode(
6966               ISD::BITCAST, dl, VT,
6967               getShuffleVectorZeroOrUndef(Item, Idx * 2, true, Subtarget, DAG));
6968
6969         Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
6970
6971         // Now we have our 32-bit value zero extended in the low element of
6972         // a vector.  If Idx != 0, swizzle it into place.
6973         if (Idx != 0) {
6974           SmallVector<int, 4> Mask;
6975           Mask.push_back(Idx);
6976           for (unsigned i = 1; i != VecElts; ++i)
6977             Mask.push_back(i);
6978           Item = DAG.getVectorShuffle(VecVT, dl, Item, DAG.getUNDEF(VecVT),
6979                                       &Mask[0]);
6980         }
6981         return DAG.getNode(ISD::BITCAST, dl, VT, Item);
6982       }
6983     }
6984
6985     // If we have a constant or non-constant insertion into the low element of
6986     // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
6987     // the rest of the elements.  This will be matched as movd/movq/movss/movsd
6988     // depending on what the source datatype is.
6989     if (Idx == 0) {
6990       if (NumZero == 0)
6991         return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
6992
6993       if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
6994           (ExtVT == MVT::i64 && Subtarget->is64Bit())) {
6995         if (VT.is256BitVector() || VT.is512BitVector()) {
6996           SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl);
6997           return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec,
6998                              Item, DAG.getIntPtrConstant(0));
6999         }
7000         assert(VT.is128BitVector() && "Expected an SSE value type!");
7001         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
7002         // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
7003         return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
7004       }
7005
7006       if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
7007         Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
7008         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
7009         if (VT.is256BitVector()) {
7010           SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl);
7011           Item = Insert128BitVector(ZeroVec, Item, 0, DAG, dl);
7012         } else {
7013           assert(VT.is128BitVector() && "Expected an SSE value type!");
7014           Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
7015         }
7016         return DAG.getNode(ISD::BITCAST, dl, VT, Item);
7017       }
7018     }
7019
7020     // Is it a vector logical left shift?
7021     if (NumElems == 2 && Idx == 1 &&
7022         X86::isZeroNode(Op.getOperand(0)) &&
7023         !X86::isZeroNode(Op.getOperand(1))) {
7024       unsigned NumBits = VT.getSizeInBits();
7025       return getVShift(true, VT,
7026                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
7027                                    VT, Op.getOperand(1)),
7028                        NumBits/2, DAG, *this, dl);
7029     }
7030
7031     if (IsAllConstants) // Otherwise, it's better to do a constpool load.
7032       return SDValue();
7033
7034     // Otherwise, if this is a vector with i32 or f32 elements, and the element
7035     // is a non-constant being inserted into an element other than the low one,
7036     // we can't use a constant pool load.  Instead, use SCALAR_TO_VECTOR (aka
7037     // movd/movss) to move this into the low element, then shuffle it into
7038     // place.
7039     if (EVTBits == 32) {
7040       Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
7041
7042       // If using the new shuffle lowering, just directly insert this.
7043       if (ExperimentalVectorShuffleLowering)
7044         return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
7045
7046       // Turn it into a shuffle of zero and zero-extended scalar to vector.
7047       Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, Subtarget, DAG);
7048       SmallVector<int, 8> MaskVec;
7049       for (unsigned i = 0; i != NumElems; ++i)
7050         MaskVec.push_back(i == Idx ? 0 : 1);
7051       return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]);
7052     }
7053   }
7054
7055   // Splat is obviously ok. Let legalizer expand it to a shuffle.
7056   if (Values.size() == 1) {
7057     if (EVTBits == 32) {
7058       // Instead of a shuffle like this:
7059       // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
7060       // Check if it's possible to issue this instead.
7061       // shuffle (vload ptr)), undef, <1, 1, 1, 1>
7062       unsigned Idx = countTrailingZeros(NonZeros);
7063       SDValue Item = Op.getOperand(Idx);
7064       if (Op.getNode()->isOnlyUserOf(Item.getNode()))
7065         return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
7066     }
7067     return SDValue();
7068   }
7069
7070   // A vector full of immediates; various special cases are already
7071   // handled, so this is best done with a single constant-pool load.
7072   if (IsAllConstants)
7073     return SDValue();
7074
7075   // For AVX-length vectors, see if we can use a vector load to get all of the
7076   // elements, otherwise build the individual 128-bit pieces and use
7077   // shuffles to put them in place.
7078   if (VT.is256BitVector() || VT.is512BitVector()) {
7079     SmallVector<SDValue, 64> V;
7080     for (unsigned i = 0; i != NumElems; ++i)
7081       V.push_back(Op.getOperand(i));
7082
7083     // Check for a build vector of consecutive loads.
7084     if (SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false))
7085       return LD;
7086
7087     EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
7088
7089     // Build both the lower and upper subvector.
7090     SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
7091                                 makeArrayRef(&V[0], NumElems/2));
7092     SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
7093                                 makeArrayRef(&V[NumElems / 2], NumElems/2));
7094
7095     // Recreate the wider vector with the lower and upper part.
7096     if (VT.is256BitVector())
7097       return Concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
7098     return Concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
7099   }
7100
7101   // Let legalizer expand 2-wide build_vectors.
7102   if (EVTBits == 64) {
7103     if (NumNonZero == 1) {
7104       // One half is zero or undef.
7105       unsigned Idx = countTrailingZeros(NonZeros);
7106       SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
7107                                  Op.getOperand(Idx));
7108       return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
7109     }
7110     return SDValue();
7111   }
7112
7113   // If element VT is < 32 bits, convert it to inserts into a zero vector.
7114   if (EVTBits == 8 && NumElems == 16) {
7115     SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG,
7116                                         Subtarget, *this);
7117     if (V.getNode()) return V;
7118   }
7119
7120   if (EVTBits == 16 && NumElems == 8) {
7121     SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG,
7122                                       Subtarget, *this);
7123     if (V.getNode()) return V;
7124   }
7125
7126   // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
7127   if (EVTBits == 32 && NumElems == 4) {
7128     SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget, *this);
7129     if (V.getNode())
7130       return V;
7131   }
7132
7133   // If element VT is == 32 bits, turn it into a number of shuffles.
7134   SmallVector<SDValue, 8> V(NumElems);
7135   if (NumElems == 4 && NumZero > 0) {
7136     for (unsigned i = 0; i < 4; ++i) {
7137       bool isZero = !(NonZeros & (1 << i));
7138       if (isZero)
7139         V[i] = getZeroVector(VT, Subtarget, DAG, dl);
7140       else
7141         V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
7142     }
7143
7144     for (unsigned i = 0; i < 2; ++i) {
7145       switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
7146         default: break;
7147         case 0:
7148           V[i] = V[i*2];  // Must be a zero vector.
7149           break;
7150         case 1:
7151           V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]);
7152           break;
7153         case 2:
7154           V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]);
7155           break;
7156         case 3:
7157           V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]);
7158           break;
7159       }
7160     }
7161
7162     bool Reverse1 = (NonZeros & 0x3) == 2;
7163     bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
7164     int MaskVec[] = {
7165       Reverse1 ? 1 : 0,
7166       Reverse1 ? 0 : 1,
7167       static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
7168       static_cast<int>(Reverse2 ? NumElems   : NumElems+1)
7169     };
7170     return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]);
7171   }
7172
7173   if (Values.size() > 1 && VT.is128BitVector()) {
7174     // Check for a build vector of consecutive loads.
7175     for (unsigned i = 0; i < NumElems; ++i)
7176       V[i] = Op.getOperand(i);
7177
7178     // Check for elements which are consecutive loads.
7179     SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false);
7180     if (LD.getNode())
7181       return LD;
7182
7183     // Check for a build vector from mostly shuffle plus few inserting.
7184     SDValue Sh = buildFromShuffleMostly(Op, DAG);
7185     if (Sh.getNode())
7186       return Sh;
7187
7188     // For SSE 4.1, use insertps to put the high elements into the low element.
7189     if (Subtarget->hasSSE41()) {
7190       SDValue Result;
7191       if (Op.getOperand(0).getOpcode() != ISD::UNDEF)
7192         Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
7193       else
7194         Result = DAG.getUNDEF(VT);
7195
7196       for (unsigned i = 1; i < NumElems; ++i) {
7197         if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue;
7198         Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
7199                              Op.getOperand(i), DAG.getIntPtrConstant(i));
7200       }
7201       return Result;
7202     }
7203
7204     // Otherwise, expand into a number of unpckl*, start by extending each of
7205     // our (non-undef) elements to the full vector width with the element in the
7206     // bottom slot of the vector (which generates no code for SSE).
7207     for (unsigned i = 0; i < NumElems; ++i) {
7208       if (Op.getOperand(i).getOpcode() != ISD::UNDEF)
7209         V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
7210       else
7211         V[i] = DAG.getUNDEF(VT);
7212     }
7213
7214     // Next, we iteratively mix elements, e.g. for v4f32:
7215     //   Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
7216     //         : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
7217     //   Step 2: unpcklps X, Y ==>    <3, 2, 1, 0>
7218     unsigned EltStride = NumElems >> 1;
7219     while (EltStride != 0) {
7220       for (unsigned i = 0; i < EltStride; ++i) {
7221         // If V[i+EltStride] is undef and this is the first round of mixing,
7222         // then it is safe to just drop this shuffle: V[i] is already in the
7223         // right place, the one element (since it's the first round) being
7224         // inserted as undef can be dropped.  This isn't safe for successive
7225         // rounds because they will permute elements within both vectors.
7226         if (V[i+EltStride].getOpcode() == ISD::UNDEF &&
7227             EltStride == NumElems/2)
7228           continue;
7229
7230         V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]);
7231       }
7232       EltStride >>= 1;
7233     }
7234     return V[0];
7235   }
7236   return SDValue();
7237 }
7238
7239 // LowerAVXCONCAT_VECTORS - 256-bit AVX can use the vinsertf128 instruction
7240 // to create 256-bit vectors from two other 128-bit ones.
7241 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
7242   SDLoc dl(Op);
7243   MVT ResVT = Op.getSimpleValueType();
7244
7245   assert((ResVT.is256BitVector() ||
7246           ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
7247
7248   SDValue V1 = Op.getOperand(0);
7249   SDValue V2 = Op.getOperand(1);
7250   unsigned NumElems = ResVT.getVectorNumElements();
7251   if(ResVT.is256BitVector())
7252     return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
7253
7254   if (Op.getNumOperands() == 4) {
7255     MVT HalfVT = MVT::getVectorVT(ResVT.getScalarType(),
7256                                 ResVT.getVectorNumElements()/2);
7257     SDValue V3 = Op.getOperand(2);
7258     SDValue V4 = Op.getOperand(3);
7259     return Concat256BitVectors(Concat128BitVectors(V1, V2, HalfVT, NumElems/2, DAG, dl),
7260       Concat128BitVectors(V3, V4, HalfVT, NumElems/2, DAG, dl), ResVT, NumElems, DAG, dl);
7261   }
7262   return Concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
7263 }
7264
7265 static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
7266   MVT LLVM_ATTRIBUTE_UNUSED VT = Op.getSimpleValueType();
7267   assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
7268          (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
7269           Op.getNumOperands() == 4)));
7270
7271   // AVX can use the vinsertf128 instruction to create 256-bit vectors
7272   // from two other 128-bit ones.
7273
7274   // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
7275   return LowerAVXCONCAT_VECTORS(Op, DAG);
7276 }
7277
7278
7279 //===----------------------------------------------------------------------===//
7280 // Vector shuffle lowering
7281 //
// This is an experimental code path for lowering vector shuffles on x86. It is
// designed to handle arbitrary vector shuffles and blends, gracefully
// degrading performance as necessary. It works hard to recognize idiomatic
// shuffles and lower them to optimal instruction patterns, while still
// providing a framework that allows reasonably efficient handling of all
// vector shuffle patterns.
7288 //===----------------------------------------------------------------------===//
7289
7290 /// \brief Tiny helper function to identify a no-op mask.
7291 ///
7292 /// This is a somewhat boring predicate function. It checks whether the mask
7293 /// array input, which is assumed to be a single-input shuffle mask of the kind
7294 /// used by the X86 shuffle instructions (not a fully general
7295 /// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
7296 /// in-place shuffle are 'no-op's.
7297 static bool isNoopShuffleMask(ArrayRef<int> Mask) {
7298   for (int i = 0, Size = Mask.size(); i < Size; ++i)
7299     if (Mask[i] != -1 && Mask[i] != i)
7300       return false;
7301   return true;
7302 }
7303
7304 /// \brief Helper function to classify a mask as a single-input mask.
7305 ///
7306 /// This isn't a generic single-input test because in the vector shuffle
7307 /// lowering we canonicalize single inputs to be the first input operand. This
7308 /// means we can more quickly test for a single input by only checking whether
7309 /// an input from the second operand exists. We also assume that the size of
7310 /// mask corresponds to the size of the input vectors which isn't true in the
7311 /// fully general case.
7312 static bool isSingleInputShuffleMask(ArrayRef<int> Mask) {
7313   for (int M : Mask)
7314     if (M >= (int)Mask.size())
7315       return false;
7316   return true;
7317 }
7318
7319 /// \brief Test whether there are elements crossing 128-bit lanes in this
7320 /// shuffle mask.
7321 ///
7322 /// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
7323 /// and we routinely test for these.
7324 static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
7325   int LaneSize = 128 / VT.getScalarSizeInBits();
7326   int Size = Mask.size();
7327   for (int i = 0; i < Size; ++i)
7328     if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
7329       return true;
7330   return false;
7331 }
7332
7333 /// \brief Test whether a shuffle mask is equivalent within each 128-bit lane.
7334 ///
7335 /// This checks a shuffle mask to see if it is performing the same
7336 /// 128-bit lane-relative shuffle in each 128-bit lane. This trivially implies
7337 /// that it is also not lane-crossing. It may however involve a blend from the
7338 /// same lane of a second vector.
7339 ///
7340 /// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
7341 /// non-trivial to compute in the face of undef lanes. The representation is
7342 /// *not* suitable for use with existing 128-bit shuffles as it will contain
7343 /// entries from both V1 and V2 inputs to the wider mask.
7344 static bool
7345 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
7346                                 SmallVectorImpl<int> &RepeatedMask) {
7347   int LaneSize = 128 / VT.getScalarSizeInBits();
7348   RepeatedMask.resize(LaneSize, -1);
7349   int Size = Mask.size();
7350   for (int i = 0; i < Size; ++i) {
7351     if (Mask[i] < 0)
7352       continue;
7353     if ((Mask[i] % Size) / LaneSize != i / LaneSize)
7354       // This entry crosses lanes, so there is no way to model this shuffle.
7355       return false;
7356
7357     // Ok, handle the in-lane shuffles by detecting if and when they repeat.
7358     if (RepeatedMask[i % LaneSize] == -1)
7359       // This is the first non-undef entry in this slot of a 128-bit lane.
7360       RepeatedMask[i % LaneSize] =
7361           Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + Size;
7362     else if (RepeatedMask[i % LaneSize] + (i / LaneSize) * LaneSize != Mask[i])
7363       // Found a mismatch with the repeated mask.
7364       return false;
7365   }
7366   return true;
7367 }
7368
7369 // Hide this symbol with an anonymous namespace instead of 'static' so that MSVC
7370 // 2013 will allow us to use it as a non-type template parameter.
7371 namespace {
7372
7373 /// \brief Implementation of the \c isShuffleEquivalent variadic functor.
7374 ///
7375 /// See its documentation for details.
7376 bool isShuffleEquivalentImpl(ArrayRef<int> Mask, ArrayRef<const int *> Args) {
7377   if (Mask.size() != Args.size())
7378     return false;
7379   for (int i = 0, e = Mask.size(); i < e; ++i) {
7380     assert(*Args[i] >= 0 && "Arguments must be positive integers!");
7381     if (Mask[i] != -1 && Mask[i] != *Args[i])
7382       return false;
7383   }
7384   return true;
7385 }
7386
7387 } // namespace
7388
7389 /// \brief Checks whether a shuffle mask is equivalent to an explicit list of
7390 /// arguments.
7391 ///
7392 /// This is a fast way to test a shuffle mask against a fixed pattern:
7393 ///
7394 ///   if (isShuffleEquivalent(Mask, 3, 2, 1, 0)) { ... }
7395 ///
7396 /// It returns true if the mask is exactly as wide as the argument list, and
7397 /// each element of the mask is either -1 (signifying undef) or the value given
7398 /// in the argument.
7399 static const VariadicFunction1<
7400     bool, ArrayRef<int>, int, isShuffleEquivalentImpl> isShuffleEquivalent = {};
7401
7402 /// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
7403 ///
7404 /// This helper function produces an 8-bit shuffle immediate corresponding to
7405 /// the ubiquitous shuffle encoding scheme used in x86 instructions for
7406 /// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
7407 /// example.
7408 ///
7409 /// NB: We rely heavily on "undef" masks preserving the input lane.
7410 static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask,
7411                                           SelectionDAG &DAG) {
7412   assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
7413   assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
7414   assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
7415   assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
7416   assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
7417
7418   unsigned Imm = 0;
7419   Imm |= (Mask[0] == -1 ? 0 : Mask[0]) << 0;
7420   Imm |= (Mask[1] == -1 ? 1 : Mask[1]) << 2;
7421   Imm |= (Mask[2] == -1 ? 2 : Mask[2]) << 4;
7422   Imm |= (Mask[3] == -1 ? 3 : Mask[3]) << 6;
7423   return DAG.getConstant(Imm, MVT::i8);
7424 }
7425
7426 /// \brief Try to emit a blend instruction for a shuffle.
7427 ///
7428 /// This doesn't do any checks for the availability of instructions for blending
7429 /// these values. It relies on the availability of the X86ISD::BLENDI pattern to
7430 /// be matched in the backend with the type given. What it does check for is
7431 /// that the shuffle mask is in fact a blend.
7432 static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
7433                                          SDValue V2, ArrayRef<int> Mask,
7434                                          const X86Subtarget *Subtarget,
7435                                          SelectionDAG &DAG) {
7436
7437   unsigned BlendMask = 0;
7438   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7439     if (Mask[i] >= Size) {
7440       if (Mask[i] != i + Size)
7441         return SDValue(); // Shuffled V2 input!
7442       BlendMask |= 1u << i;
7443       continue;
7444     }
7445     if (Mask[i] >= 0 && Mask[i] != i)
7446       return SDValue(); // Shuffled V1 input!
7447   }
7448   switch (VT.SimpleTy) {
7449   case MVT::v2f64:
7450   case MVT::v4f32:
7451   case MVT::v4f64:
7452   case MVT::v8f32:
7453     return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
7454                        DAG.getConstant(BlendMask, MVT::i8));
7455
7456   case MVT::v4i64:
7457   case MVT::v8i32:
7458     assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
7459     // FALLTHROUGH
7460   case MVT::v2i64:
7461   case MVT::v4i32:
7462     // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
7463     // that instruction.
7464     if (Subtarget->hasAVX2()) {
7465       // Scale the blend by the number of 32-bit dwords per element.
7466       int Scale =  VT.getScalarSizeInBits() / 32;
7467       BlendMask = 0;
7468       for (int i = 0, Size = Mask.size(); i < Size; ++i)
7469         if (Mask[i] >= Size)
7470           for (int j = 0; j < Scale; ++j)
7471             BlendMask |= 1u << (i * Scale + j);
7472
7473       MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
7474       V1 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V1);
7475       V2 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V2);
7476       return DAG.getNode(ISD::BITCAST, DL, VT,
7477                          DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
7478                                      DAG.getConstant(BlendMask, MVT::i8)));
7479     }
7480     // FALLTHROUGH
7481   case MVT::v8i16: {
7482     // For integer shuffles we need to expand the mask and cast the inputs to
7483     // v8i16s prior to blending.
7484     int Scale = 8 / VT.getVectorNumElements();
7485     BlendMask = 0;
7486     for (int i = 0, Size = Mask.size(); i < Size; ++i)
7487       if (Mask[i] >= Size)
7488         for (int j = 0; j < Scale; ++j)
7489           BlendMask |= 1u << (i * Scale + j);
7490
7491     V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1);
7492     V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V2);
7493     return DAG.getNode(ISD::BITCAST, DL, VT,
7494                        DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
7495                                    DAG.getConstant(BlendMask, MVT::i8)));
7496   }
7497
7498   case MVT::v16i16: {
7499     assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
7500     SmallVector<int, 8> RepeatedMask;
7501     if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
7502       // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
7503       assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
7504       BlendMask = 0;
7505       for (int i = 0; i < 8; ++i)
7506         if (RepeatedMask[i] >= 16)
7507           BlendMask |= 1u << i;
7508       return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
7509                          DAG.getConstant(BlendMask, MVT::i8));
7510     }
7511   }
7512     // FALLTHROUGH
7513   case MVT::v32i8: {
7514     assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
7515     // Scale the blend by the number of bytes per element.
7516     int Scale =  VT.getScalarSizeInBits() / 8;
7517     assert(Mask.size() * Scale == 32 && "Not a 256-bit vector!");
7518
7519     // Compute the VSELECT mask. Note that VSELECT is really confusing in the
7520     // mix of LLVM's code generator and the x86 backend. We tell the code
7521     // generator that boolean values in the elements of an x86 vector register
7522     // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
7523     // mapping a select to operand #1, and 'false' mapping to operand #2. The
7524     // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
7525     // of the element (the remaining are ignored) and 0 in that high bit would
7526     // mean operand #1 while 1 in the high bit would mean operand #2. So while
7527     // the LLVM model for boolean values in vector elements gets the relevant
7528     // bit set, it is set backwards and over constrained relative to x86's
7529     // actual model.
7530     SDValue VSELECTMask[32];
7531     for (int i = 0, Size = Mask.size(); i < Size; ++i)
7532       for (int j = 0; j < Scale; ++j)
7533         VSELECTMask[Scale * i + j] =
7534             Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
7535                         : DAG.getConstant(Mask[i] < Size ? -1 : 0, MVT::i8);
7536
7537     V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V1);
7538     V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V2);
7539     return DAG.getNode(
7540         ISD::BITCAST, DL, VT,
7541         DAG.getNode(ISD::VSELECT, DL, MVT::v32i8,
7542                     DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, VSELECTMask),
7543                     V1, V2));
7544   }
7545
7546   default:
7547     llvm_unreachable("Not a supported integer vector type!");
7548   }
7549 }
7550
7551 /// \brief Generic routine to lower a shuffle and blend as a decomposed set of
7552 /// unblended shuffles followed by an unshuffled blend.
7553 ///
7554 /// This matches the extremely common pattern for handling combined
7555 /// shuffle+blend operations on newer X86 ISAs where we have very fast blend
7556 /// operations.
7557 static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(SDLoc DL, MVT VT,
7558                                                           SDValue V1,
7559                                                           SDValue V2,
7560                                                           ArrayRef<int> Mask,
7561                                                           SelectionDAG &DAG) {
7562   // Shuffle the input elements into the desired positions in V1 and V2 and
7563   // blend them together.
7564   SmallVector<int, 32> V1Mask(Mask.size(), -1);
7565   SmallVector<int, 32> V2Mask(Mask.size(), -1);
7566   SmallVector<int, 32> BlendMask(Mask.size(), -1);
7567   for (int i = 0, Size = Mask.size(); i < Size; ++i)
7568     if (Mask[i] >= 0 && Mask[i] < Size) {
7569       V1Mask[i] = Mask[i];
7570       BlendMask[i] = i;
7571     } else if (Mask[i] >= Size) {
7572       V2Mask[i] = Mask[i] - Size;
7573       BlendMask[i] = i + Size;
7574     }
7575
7576   V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
7577   V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
7578   return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
7579 }
7580
7581 /// \brief Try to lower a vector shuffle as a byte rotation.
7582 ///
7583 /// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
7584 /// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
7585 /// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
7586 /// try to generically lower a vector shuffle through such an pattern. It
7587 /// does not check for the profitability of lowering either as PALIGNR or
7588 /// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
7589 /// This matches shuffle vectors that look like:
7590 ///
7591 ///   v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
7592 ///
7593 /// Essentially it concatenates V1 and V2, shifts right by some number of
7594 /// elements, and takes the low elements as the result. Note that while this is
7595 /// specified as a *right shift* because x86 is little-endian, it is a *left
7596 /// rotate* of the vector lanes.
7597 ///
7598 /// Note that this only handles 128-bit vector widths currently.
7599 static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1,
7600                                               SDValue V2,
7601                                               ArrayRef<int> Mask,
7602                                               const X86Subtarget *Subtarget,
7603                                               SelectionDAG &DAG) {
7604   assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
7605
7606   // We need to detect various ways of spelling a rotation:
7607   //   [11, 12, 13, 14, 15,  0,  1,  2]
7608   //   [-1, 12, 13, 14, -1, -1,  1, -1]
7609   //   [-1, -1, -1, -1, -1, -1,  1,  2]
7610   //   [ 3,  4,  5,  6,  7,  8,  9, 10]
7611   //   [-1,  4,  5,  6, -1, -1,  9, -1]
7612   //   [-1,  4,  5,  6, -1, -1, -1, -1]
7613   int Rotation = 0;
7614   SDValue Lo, Hi;
7615   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7616     if (Mask[i] == -1)
7617       continue;
7618     assert(Mask[i] >= 0 && "Only -1 is a valid negative mask element!");
7619
7620     // Based on the mod-Size value of this mask element determine where
7621     // a rotated vector would have started.
7622     int StartIdx = i - (Mask[i] % Size);
7623     if (StartIdx == 0)
7624       // The identity rotation isn't interesting, stop.
7625       return SDValue();
7626
7627     // If we found the tail of a vector the rotation must be the missing
7628     // front. If we found the head of a vector, it must be how much of the head.
7629     int CandidateRotation = StartIdx < 0 ? -StartIdx : Size - StartIdx;
7630
7631     if (Rotation == 0)
7632       Rotation = CandidateRotation;
7633     else if (Rotation != CandidateRotation)
7634       // The rotations don't match, so we can't match this mask.
7635       return SDValue();
7636
7637     // Compute which value this mask is pointing at.
7638     SDValue MaskV = Mask[i] < Size ? V1 : V2;
7639
7640     // Compute which of the two target values this index should be assigned to.
7641     // This reflects whether the high elements are remaining or the low elements
7642     // are remaining.
7643     SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
7644
7645     // Either set up this value if we've not encountered it before, or check
7646     // that it remains consistent.
7647     if (!TargetV)
7648       TargetV = MaskV;
7649     else if (TargetV != MaskV)
7650       // This may be a rotation, but it pulls from the inputs in some
7651       // unsupported interleaving.
7652       return SDValue();
7653   }
7654
7655   // Check that we successfully analyzed the mask, and normalize the results.
7656   assert(Rotation != 0 && "Failed to locate a viable rotation!");
7657   assert((Lo || Hi) && "Failed to find a rotated input vector!");
7658   if (!Lo)
7659     Lo = Hi;
7660   else if (!Hi)
7661     Hi = Lo;
7662
7663   assert(VT.getSizeInBits() == 128 &&
7664          "Rotate-based lowering only supports 128-bit lowering!");
7665   assert(Mask.size() <= 16 &&
7666          "Can shuffle at most 16 bytes in a 128-bit vector!");
7667
7668   // The actual rotate instruction rotates bytes, so we need to scale the
7669   // rotation based on how many bytes are in the vector.
7670   int Scale = 16 / Mask.size();
7671
7672   // SSSE3 targets can use the palignr instruction
7673   if (Subtarget->hasSSSE3()) {
7674     // Cast the inputs to v16i8 to match PALIGNR.
7675     Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Lo);
7676     Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Hi);
7677
7678     return DAG.getNode(ISD::BITCAST, DL, VT,
7679                        DAG.getNode(X86ISD::PALIGNR, DL, MVT::v16i8, Hi, Lo,
7680                                    DAG.getConstant(Rotation * Scale, MVT::i8)));
7681   }
7682
7683   // Default SSE2 implementation
7684   int LoByteShift = 16 - Rotation * Scale;
7685   int HiByteShift = Rotation * Scale;
7686
7687   // Cast the inputs to v2i64 to match PSLLDQ/PSRLDQ.
7688   Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Lo);
7689   Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Hi);
7690
7691   SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, Lo,
7692                                 DAG.getConstant(8 * LoByteShift, MVT::i8));
7693   SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, Hi,
7694                                 DAG.getConstant(8 * HiByteShift, MVT::i8));
7695   return DAG.getNode(ISD::BITCAST, DL, VT,
7696                      DAG.getNode(ISD::OR, DL, MVT::v2i64, LoShift, HiShift));
7697 }
7698
7699 /// \brief Compute whether each element of a shuffle is zeroable.
7700 ///
7701 /// A "zeroable" vector shuffle element is one which can be lowered to zero.
7702 /// Either it is an undef element in the shuffle mask, the element of the input
7703 /// referenced is undef, or the element of the input referenced is known to be
7704 /// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
7705 /// as many lanes with this technique as possible to simplify the remaining
7706 /// shuffle.
7707 static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
7708                                                      SDValue V1, SDValue V2) {
7709   SmallBitVector Zeroable(Mask.size(), false);
7710
7711   bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
7712   bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
7713
7714   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7715     int M = Mask[i];
7716     // Handle the easy cases.
7717     if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
7718       Zeroable[i] = true;
7719       continue;
7720     }
7721
7722     // If this is an index into a build_vector node, dig out the input value and
7723     // use it.
7724     SDValue V = M < Size ? V1 : V2;
7725     if (V.getOpcode() != ISD::BUILD_VECTOR)
7726       continue;
7727
7728     SDValue Input = V.getOperand(M % Size);
7729     // The UNDEF opcode check really should be dead code here, but not quite
7730     // worth asserting on (it isn't invalid, just unexpected).
7731     if (Input.getOpcode() == ISD::UNDEF || X86::isZeroNode(Input))
7732       Zeroable[i] = true;
7733   }
7734
7735   return Zeroable;
7736 }
7737
7738 /// \brief Try to emit a bitmask instruction for a shuffle.
7739 ///
7740 /// This handles cases where we can model a blend exactly as a bitmask due to
7741 /// one of the inputs being zeroable.
7742 static SDValue lowerVectorShuffleAsBitMask(SDLoc DL, MVT VT, SDValue V1,
7743                                            SDValue V2, ArrayRef<int> Mask,
7744                                            SelectionDAG &DAG) {
7745   MVT EltVT = VT.getScalarType();
7746   int NumEltBits = EltVT.getSizeInBits();
7747   MVT IntEltVT = MVT::getIntegerVT(NumEltBits);
7748   SDValue Zero = DAG.getConstant(0, IntEltVT);
7749   SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), IntEltVT);
7750   if (EltVT.isFloatingPoint()) {
7751     Zero = DAG.getNode(ISD::BITCAST, DL, EltVT, Zero);
7752     AllOnes = DAG.getNode(ISD::BITCAST, DL, EltVT, AllOnes);
7753   }
7754   SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
7755   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
7756   SDValue V;
7757   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7758     if (Zeroable[i])
7759       continue;
7760     if (Mask[i] % Size != i)
7761       return SDValue(); // Not a blend.
7762     if (!V)
7763       V = Mask[i] < Size ? V1 : V2;
7764     else if (V != (Mask[i] < Size ? V1 : V2))
7765       return SDValue(); // Can only let one input through the mask.
7766
7767     VMaskOps[i] = AllOnes;
7768   }
7769   if (!V)
7770     return SDValue(); // No non-zeroable elements!
7771
7772   SDValue VMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, VMaskOps);
7773   V = DAG.getNode(VT.isFloatingPoint()
7774                   ? (unsigned) X86ISD::FAND : (unsigned) ISD::AND,
7775                   DL, VT, V, VMask);
7776   return V;
7777 }
7778
7779 /// \brief Try to lower a vector shuffle as a byte shift (shifts in zeros).
7780 ///
7781 /// Attempts to match a shuffle mask against the PSRLDQ and PSLLDQ SSE2
7782 /// byte-shift instructions. The mask must consist of a shifted sequential
7783 /// shuffle from one of the input vectors and zeroable elements for the
7784 /// remaining 'shifted in' elements.
7785 ///
7786 /// Note that this only handles 128-bit vector widths currently.
7787 static SDValue lowerVectorShuffleAsByteShift(SDLoc DL, MVT VT, SDValue V1,
7788                                              SDValue V2, ArrayRef<int> Mask,
7789                                              SelectionDAG &DAG) {
7790   assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
7791
7792   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
7793
7794   int Size = Mask.size();
7795   int Scale = 16 / Size;
7796
7797   for (int Shift = 1; Shift < Size; Shift++) {
7798     int ByteShift = Shift * Scale;
7799
7800     // PSRLDQ : (little-endian) right byte shift
7801     // [ 5,  6,  7, zz, zz, zz, zz, zz]
7802     // [ -1, 5,  6,  7, zz, zz, zz, zz]
7803     // [  1, 2, -1, -1, -1, -1, zz, zz]
7804     bool ZeroableRight = true;
7805     for (int i = Size - Shift; i < Size; i++) {
7806       ZeroableRight &= Zeroable[i];
7807     }
7808
7809     if (ZeroableRight) {
7810       bool ValidShiftRight1 =
7811           isSequentialOrUndefInRange(Mask, 0, Size - Shift, Shift);
7812       bool ValidShiftRight2 =
7813           isSequentialOrUndefInRange(Mask, 0, Size - Shift, Size + Shift);
7814
7815       if (ValidShiftRight1 || ValidShiftRight2) {
7816         // Cast the inputs to v2i64 to match PSRLDQ.
7817         SDValue &TargetV = ValidShiftRight1 ? V1 : V2;
7818         SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, TargetV);
7819         SDValue Shifted = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, V,
7820                                       DAG.getConstant(ByteShift * 8, MVT::i8));
7821         return DAG.getNode(ISD::BITCAST, DL, VT, Shifted);
7822       }
7823     }
7824
7825     // PSLLDQ : (little-endian) left byte shift
7826     // [ zz,  0,  1,  2,  3,  4,  5,  6]
7827     // [ zz, zz, -1, -1,  2,  3,  4, -1]
7828     // [ zz, zz, zz, zz, zz, zz, -1,  1]
7829     bool ZeroableLeft = true;
7830     for (int i = 0; i < Shift; i++) {
7831       ZeroableLeft &= Zeroable[i];
7832     }
7833
7834     if (ZeroableLeft) {
7835       bool ValidShiftLeft1 =
7836           isSequentialOrUndefInRange(Mask, Shift, Size - Shift, 0);
7837       bool ValidShiftLeft2 =
7838           isSequentialOrUndefInRange(Mask, Shift, Size - Shift, Size);
7839
7840       if (ValidShiftLeft1 || ValidShiftLeft2) {
7841         // Cast the inputs to v2i64 to match PSLLDQ.
7842         SDValue &TargetV = ValidShiftLeft1 ? V1 : V2;
7843         SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, TargetV);
7844         SDValue Shifted = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, V,
7845                                       DAG.getConstant(ByteShift * 8, MVT::i8));
7846         return DAG.getNode(ISD::BITCAST, DL, VT, Shifted);
7847       }
7848     }
7849   }
7850
7851   return SDValue();
7852 }
7853
7854 /// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros).
7855 ///
7856 /// Attempts to match a shuffle mask against the PSRL(W/D/Q) and PSLL(W/D/Q)
7857 /// SSE2 and AVX2 logical bit-shift instructions. The function matches
7858 /// elements from one of the input vectors shuffled to the left or right
7859 /// with zeroable elements 'shifted in'.
7860 static SDValue lowerVectorShuffleAsBitShift(SDLoc DL, MVT VT, SDValue V1,
7861                                             SDValue V2, ArrayRef<int> Mask,
7862                                             SelectionDAG &DAG) {
7863   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
7864
7865   int Size = Mask.size();
7866   assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
7867
7868   // PSRL : (little-endian) right bit shift.
7869   // [  1, zz,  3, zz]
7870   // [ -1, -1,  7, zz]
7871   // PSHL : (little-endian) left bit shift.
7872   // [ zz, 0, zz,  2 ]
7873   // [ -1, 4, zz, -1 ]
7874   auto MatchBitShift = [&](int Shift, int Scale) -> SDValue {
7875     MVT ShiftSVT = MVT::getIntegerVT(VT.getScalarSizeInBits() * Scale);
7876     MVT ShiftVT = MVT::getVectorVT(ShiftSVT, Size / Scale);
7877     assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
7878            "Illegal integer vector type");
7879
7880     bool MatchLeft = true, MatchRight = true;
7881     for (int i = 0; i != Size; i += Scale) {
7882       for (int j = 0; j != Shift; j++) {
7883         MatchLeft &= Zeroable[i + j];
7884       }
7885       for (int j = Scale - Shift; j != Scale; j++) {
7886         MatchRight &= Zeroable[i + j];
7887       }
7888     }
7889     if (!(MatchLeft || MatchRight))
7890       return SDValue();
7891
7892     bool MatchV1 = true, MatchV2 = true;
7893     for (int i = 0; i != Size; i += Scale) {
7894       unsigned Pos = MatchLeft ? i + Shift : i;
7895       unsigned Low = MatchLeft ? i : i + Shift;
7896       unsigned Len = Scale - Shift;
7897       MatchV1 &= isSequentialOrUndefInRange(Mask, Pos, Len, Low);
7898       MatchV2 &= isSequentialOrUndefInRange(Mask, Pos, Len, Low + Size);
7899     }
7900     if (!(MatchV1 || MatchV2))
7901       return SDValue();
7902
7903     // Cast the inputs to ShiftVT to match VSRLI/VSHLI and back again.
7904     unsigned OpCode = MatchLeft ? X86ISD::VSHLI : X86ISD::VSRLI;
7905     int ShiftAmt = Shift * VT.getScalarSizeInBits();
7906     SDValue V = MatchV1 ? V1 : V2;
7907     V = DAG.getNode(ISD::BITCAST, DL, ShiftVT, V);
7908     V = DAG.getNode(OpCode, DL, ShiftVT, V, DAG.getConstant(ShiftAmt, MVT::i8));
7909     return DAG.getNode(ISD::BITCAST, DL, VT, V);
7910   };
7911
7912   // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
7913   // keep doubling the size of the integer elements up to that. We can
7914   // then shift the elements of the integer vector by whole multiples of
7915   // their width within the elements of the larger integer vector. Test each
7916   // multiple to see if we can find a match with the moved element indices
7917   // and that the shifted in elements are all zeroable.
7918   for (int Scale = 2; Scale * VT.getScalarSizeInBits() <= 64; Scale *= 2)
7919     for (int Shift = 1; Shift != Scale; Shift++)
7920       if (SDValue BitShift = MatchBitShift(Shift, Scale))
7921         return BitShift;
7922
7923   // no match
7924   return SDValue();
7925 }
7926
7927 /// \brief Lower a vector shuffle as a zero or any extension.
7928 ///
7929 /// Given a specific number of elements, element bit width, and extension
7930 /// stride, produce either a zero or any extension based on the available
7931 /// features of the subtarget.
7932 static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
7933     SDLoc DL, MVT VT, int Scale, bool AnyExt, SDValue InputV,
7934     const X86Subtarget *Subtarget, SelectionDAG &DAG) {
7935   assert(Scale > 1 && "Need a scale to extend.");
7936   int NumElements = VT.getVectorNumElements();
7937   int EltBits = VT.getScalarSizeInBits();
7938   assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
7939          "Only 8, 16, and 32 bit elements can be extended.");
7940   assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
7941
7942   // Found a valid zext mask! Try various lowering strategies based on the
7943   // input type and available ISA extensions.
7944   if (Subtarget->hasSSE41()) {
7945     MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
7946                                  NumElements / Scale);
7947     return DAG.getNode(ISD::BITCAST, DL, VT,
7948                        DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV));
7949   }
7950
7951   // For any extends we can cheat for larger element sizes and use shuffle
7952   // instructions that can fold with a load and/or copy.
7953   if (AnyExt && EltBits == 32) {
7954     int PSHUFDMask[4] = {0, -1, 1, -1};
7955     return DAG.getNode(
7956         ISD::BITCAST, DL, VT,
7957         DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
7958                     DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, InputV),
7959                     getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
7960   }
7961   if (AnyExt && EltBits == 16 && Scale > 2) {
7962     int PSHUFDMask[4] = {0, -1, 0, -1};
7963     InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
7964                          DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, InputV),
7965                          getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG));
7966     int PSHUFHWMask[4] = {1, -1, -1, -1};
7967     return DAG.getNode(
7968         ISD::BITCAST, DL, VT,
7969         DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16,
7970                     DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, InputV),
7971                     getV4X86ShuffleImm8ForMask(PSHUFHWMask, DAG)));
7972   }
7973
7974   // If this would require more than 2 unpack instructions to expand, use
7975   // pshufb when available. We can only use more than 2 unpack instructions
7976   // when zero extending i8 elements which also makes it easier to use pshufb.
7977   if (Scale > 4 && EltBits == 8 && Subtarget->hasSSSE3()) {
7978     assert(NumElements == 16 && "Unexpected byte vector width!");
7979     SDValue PSHUFBMask[16];
7980     for (int i = 0; i < 16; ++i)
7981       PSHUFBMask[i] =
7982           DAG.getConstant((i % Scale == 0) ? i / Scale : 0x80, MVT::i8);
7983     InputV = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, InputV);
7984     return DAG.getNode(ISD::BITCAST, DL, VT,
7985                        DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
7986                                    DAG.getNode(ISD::BUILD_VECTOR, DL,
7987                                                MVT::v16i8, PSHUFBMask)));
7988   }
7989
7990   // Otherwise emit a sequence of unpacks.
7991   do {
7992     MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
7993     SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
7994                          : getZeroVector(InputVT, Subtarget, DAG, DL);
7995     InputV = DAG.getNode(ISD::BITCAST, DL, InputVT, InputV);
7996     InputV = DAG.getNode(X86ISD::UNPCKL, DL, InputVT, InputV, Ext);
7997     Scale /= 2;
7998     EltBits *= 2;
7999     NumElements /= 2;
8000   } while (Scale > 1);
8001   return DAG.getNode(ISD::BITCAST, DL, VT, InputV);
8002 }
8003
8004 /// \brief Try to lower a vector shuffle as a zero extension on any microarch.
8005 ///
8006 /// This routine will try to do everything in its power to cleverly lower
8007 /// a shuffle which happens to match the pattern of a zero extend. It doesn't
8008 /// check for the profitability of this lowering,  it tries to aggressively
8009 /// match this pattern. It will use all of the micro-architectural details it
8010 /// can to emit an efficient lowering. It handles both blends with all-zero
8011 /// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
8012 /// masking out later).
8013 ///
8014 /// The reason we have dedicated lowering for zext-style shuffles is that they
8015 /// are both incredibly common and often quite performance sensitive.
8016 static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
8017     SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
8018     const X86Subtarget *Subtarget, SelectionDAG &DAG) {
8019   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
8020
8021   int Bits = VT.getSizeInBits();
8022   int NumElements = VT.getVectorNumElements();
8023   assert(VT.getScalarSizeInBits() <= 32 &&
8024          "Exceeds 32-bit integer zero extension limit");
8025   assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
8026
8027   // Define a helper function to check a particular ext-scale and lower to it if
8028   // valid.
8029   auto Lower = [&](int Scale) -> SDValue {
8030     SDValue InputV;
8031     bool AnyExt = true;
8032     for (int i = 0; i < NumElements; ++i) {
8033       if (Mask[i] == -1)
8034         continue; // Valid anywhere but doesn't tell us anything.
8035       if (i % Scale != 0) {
8036         // Each of the extended elements need to be zeroable.
8037         if (!Zeroable[i])
8038           return SDValue();
8039
8040         // We no longer are in the anyext case.
8041         AnyExt = false;
8042         continue;
8043       }
8044
8045       // Each of the base elements needs to be consecutive indices into the
8046       // same input vector.
8047       SDValue V = Mask[i] < NumElements ? V1 : V2;
8048       if (!InputV)
8049         InputV = V;
8050       else if (InputV != V)
8051         return SDValue(); // Flip-flopping inputs.
8052
8053       if (Mask[i] % NumElements != i / Scale)
8054         return SDValue(); // Non-consecutive strided elements.
8055     }
8056
8057     // If we fail to find an input, we have a zero-shuffle which should always
8058     // have already been handled.
8059     // FIXME: Maybe handle this here in case during blending we end up with one?
8060     if (!InputV)
8061       return SDValue();
8062
8063     return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
8064         DL, VT, Scale, AnyExt, InputV, Subtarget, DAG);
8065   };
8066
8067   // The widest scale possible for extending is to a 64-bit integer.
8068   assert(Bits % 64 == 0 &&
8069          "The number of bits in a vector must be divisible by 64 on x86!");
8070   int NumExtElements = Bits / 64;
8071
8072   // Each iteration, try extending the elements half as much, but into twice as
8073   // many elements.
8074   for (; NumExtElements < NumElements; NumExtElements *= 2) {
8075     assert(NumElements % NumExtElements == 0 &&
8076            "The input vector size must be divisible by the extended size.");
8077     if (SDValue V = Lower(NumElements / NumExtElements))
8078       return V;
8079   }
8080
8081   // General extends failed, but 128-bit vectors may be able to use MOVQ.
8082   if (Bits != 128)
8083     return SDValue();
8084
8085   // Returns one of the source operands if the shuffle can be reduced to a
8086   // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
8087   auto CanZExtLowHalf = [&]() {
8088     for (int i = NumElements / 2; i != NumElements; i++)
8089       if (!Zeroable[i])
8090         return SDValue();
8091     if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
8092       return V1;
8093     if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
8094       return V2;
8095     return SDValue();
8096   };
8097
8098   if (SDValue V = CanZExtLowHalf()) {
8099     V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, V);
8100     V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
8101     return DAG.getNode(ISD::BITCAST, DL, VT, V);
8102   }
8103
8104   // No viable ext lowering found.
8105   return SDValue();
8106 }
8107
8108 /// \brief Try to get a scalar value for a specific element of a vector.
8109 ///
8110 /// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
8111 static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
8112                                               SelectionDAG &DAG) {
8113   MVT VT = V.getSimpleValueType();
8114   MVT EltVT = VT.getVectorElementType();
8115   while (V.getOpcode() == ISD::BITCAST)
8116     V = V.getOperand(0);
8117   // If the bitcasts shift the element size, we can't extract an equivalent
8118   // element from it.
8119   MVT NewVT = V.getSimpleValueType();
8120   if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
8121     return SDValue();
8122
8123   if (V.getOpcode() == ISD::BUILD_VECTOR ||
8124       (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR))
8125     return DAG.getNode(ISD::BITCAST, SDLoc(V), EltVT, V.getOperand(Idx));
8126
8127   return SDValue();
8128 }
8129
8130 /// \brief Helper to test for a load that can be folded with x86 shuffles.
8131 ///
8132 /// This is particularly important because the set of instructions varies
8133 /// significantly based on whether the operand is a load or not.
8134 static bool isShuffleFoldableLoad(SDValue V) {
8135   while (V.getOpcode() == ISD::BITCAST)
8136     V = V.getOperand(0);
8137
8138   return ISD::isNON_EXTLoad(V.getNode());
8139 }
8140
8141 /// \brief Try to lower insertion of a single element into a zero vector.
8142 ///
8143 /// This is a common pattern that we have especially efficient patterns to lower
8144 /// across all subtarget feature sets.
8145 static SDValue lowerVectorShuffleAsElementInsertion(
8146     MVT VT, SDLoc DL, SDValue V1, SDValue V2, ArrayRef<int> Mask,
8147     const X86Subtarget *Subtarget, SelectionDAG &DAG) {
8148   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
8149   MVT ExtVT = VT;
8150   MVT EltVT = VT.getVectorElementType();
8151
8152   int V2Index = std::find_if(Mask.begin(), Mask.end(),
8153                              [&Mask](int M) { return M >= (int)Mask.size(); }) -
8154                 Mask.begin();
8155   bool IsV1Zeroable = true;
8156   for (int i = 0, Size = Mask.size(); i < Size; ++i)
8157     if (i != V2Index && !Zeroable[i]) {
8158       IsV1Zeroable = false;
8159       break;
8160     }
8161
8162   // Check for a single input from a SCALAR_TO_VECTOR node.
8163   // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
8164   // all the smarts here sunk into that routine. However, the current
8165   // lowering of BUILD_VECTOR makes that nearly impossible until the old
8166   // vector shuffle lowering is dead.
8167   if (SDValue V2S = getScalarValueForVectorElement(
8168           V2, Mask[V2Index] - Mask.size(), DAG)) {
8169     // We need to zext the scalar if it is smaller than an i32.
8170     V2S = DAG.getNode(ISD::BITCAST, DL, EltVT, V2S);
8171     if (EltVT == MVT::i8 || EltVT == MVT::i16) {
8172       // Using zext to expand a narrow element won't work for non-zero
8173       // insertions.
8174       if (!IsV1Zeroable)
8175         return SDValue();
8176
8177       // Zero-extend directly to i32.
8178       ExtVT = MVT::v4i32;
8179       V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
8180     }
8181     V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
8182   } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
8183              EltVT == MVT::i16) {
8184     // Either not inserting from the low element of the input or the input
8185     // element size is too small to use VZEXT_MOVL to clear the high bits.
8186     return SDValue();
8187   }
8188
8189   if (!IsV1Zeroable) {
8190     // If V1 can't be treated as a zero vector we have fewer options to lower
8191     // this. We can't support integer vectors or non-zero targets cheaply, and
8192     // the V1 elements can't be permuted in any way.
8193     assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
8194     if (!VT.isFloatingPoint() || V2Index != 0)
8195       return SDValue();
8196     SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
8197     V1Mask[V2Index] = -1;
8198     if (!isNoopShuffleMask(V1Mask))
8199       return SDValue();
8200     // This is essentially a special case blend operation, but if we have
8201     // general purpose blend operations, they are always faster. Bail and let
8202     // the rest of the lowering handle these as blends.
8203     if (Subtarget->hasSSE41())
8204       return SDValue();
8205
8206     // Otherwise, use MOVSD or MOVSS.
8207     assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
8208            "Only two types of floating point element types to handle!");
8209     return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
8210                        ExtVT, V1, V2);
8211   }
8212
8213   V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
8214   if (ExtVT != VT)
8215     V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2);
8216
8217   if (V2Index != 0) {
8218     // If we have 4 or fewer lanes we can cheaply shuffle the element into
8219     // the desired position. Otherwise it is more efficient to do a vector
8220     // shift left. We know that we can do a vector shift left because all
8221     // the inputs are zero.
8222     if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
8223       SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
8224       V2Shuffle[V2Index] = 0;
8225       V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
8226     } else {
8227       V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, V2);
8228       V2 = DAG.getNode(
8229           X86ISD::VSHLDQ, DL, MVT::v2i64, V2,
8230           DAG.getConstant(
8231               V2Index * EltVT.getSizeInBits(),
8232               DAG.getTargetLoweringInfo().getScalarShiftAmountTy(MVT::v2i64)));
8233       V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2);
8234     }
8235   }
8236   return V2;
8237 }
8238
8239 /// \brief Try to lower broadcast of a single element.
8240 ///
8241 /// For convenience, this code also bundles all of the subtarget feature set
8242 /// filtering. While a little annoying to re-dispatch on type here, there isn't
8243 /// a convenient way to factor it out.
8244 static SDValue lowerVectorShuffleAsBroadcast(MVT VT, SDLoc DL, SDValue V,
8245                                              ArrayRef<int> Mask,
8246                                              const X86Subtarget *Subtarget,
8247                                              SelectionDAG &DAG) {
8248   if (!Subtarget->hasAVX())
8249     return SDValue();
8250   if (VT.isInteger() && !Subtarget->hasAVX2())
8251     return SDValue();
8252
8253   // Check that the mask is a broadcast.
8254   int BroadcastIdx = -1;
8255   for (int M : Mask)
8256     if (M >= 0 && BroadcastIdx == -1)
8257       BroadcastIdx = M;
8258     else if (M >= 0 && M != BroadcastIdx)
8259       return SDValue();
8260
8261   assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
8262                                             "a sorted mask where the broadcast "
8263                                             "comes from V1.");
8264
8265   // Go up the chain of (vector) values to try and find a scalar load that
8266   // we can combine with the broadcast.
8267   for (;;) {
8268     switch (V.getOpcode()) {
8269     case ISD::CONCAT_VECTORS: {
8270       int OperandSize = Mask.size() / V.getNumOperands();
8271       V = V.getOperand(BroadcastIdx / OperandSize);
8272       BroadcastIdx %= OperandSize;
8273       continue;
8274     }
8275
8276     case ISD::INSERT_SUBVECTOR: {
8277       SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
8278       auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
8279       if (!ConstantIdx)
8280         break;
8281
8282       int BeginIdx = (int)ConstantIdx->getZExtValue();
8283       int EndIdx =
8284           BeginIdx + (int)VInner.getValueType().getVectorNumElements();
8285       if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) {
8286         BroadcastIdx -= BeginIdx;
8287         V = VInner;
8288       } else {
8289         V = VOuter;
8290       }
8291       continue;
8292     }
8293     }
8294     break;
8295   }
8296
8297   // Check if this is a broadcast of a scalar. We special case lowering
8298   // for scalars so that we can more effectively fold with loads.
8299   if (V.getOpcode() == ISD::BUILD_VECTOR ||
8300       (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
8301     V = V.getOperand(BroadcastIdx);
8302
8303     // If the scalar isn't a load we can't broadcast from it in AVX1, only with
8304     // AVX2.
8305     if (!Subtarget->hasAVX2() && !isShuffleFoldableLoad(V))
8306       return SDValue();
8307   } else if (BroadcastIdx != 0 || !Subtarget->hasAVX2()) {
8308     // We can't broadcast from a vector register w/o AVX2, and we can only
8309     // broadcast from the zero-element of a vector register.
8310     return SDValue();
8311   }
8312
8313   return DAG.getNode(X86ISD::VBROADCAST, DL, VT, V);
8314 }
8315
8316 // Check for whether we can use INSERTPS to perform the shuffle. We only use
8317 // INSERTPS when the V1 elements are already in the correct locations
8318 // because otherwise we can just always use two SHUFPS instructions which
8319 // are much smaller to encode than a SHUFPS and an INSERTPS. We can also
8320 // perform INSERTPS if a single V1 element is out of place and all V2
8321 // elements are zeroable.
8322 static SDValue lowerVectorShuffleAsInsertPS(SDValue Op, SDValue V1, SDValue V2,
8323                                             ArrayRef<int> Mask,
8324                                             SelectionDAG &DAG) {
8325   assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!");
8326   assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
8327   assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
8328   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
8329
8330   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
8331
8332   unsigned ZMask = 0;
8333   int V1DstIndex = -1;
8334   int V2DstIndex = -1;
8335   bool V1UsedInPlace = false;
8336
8337   for (int i = 0; i < 4; i++) {
8338     // Synthesize a zero mask from the zeroable elements (includes undefs).
8339     if (Zeroable[i]) {
8340       ZMask |= 1 << i;
8341       continue;
8342     }
8343
8344     // Flag if we use any V1 inputs in place.
8345     if (i == Mask[i]) {
8346       V1UsedInPlace = true;
8347       continue;
8348     }
8349
8350     // We can only insert a single non-zeroable element.
8351     if (V1DstIndex != -1 || V2DstIndex != -1)
8352       return SDValue();
8353
8354     if (Mask[i] < 4) {
8355       // V1 input out of place for insertion.
8356       V1DstIndex = i;
8357     } else {
8358       // V2 input for insertion.
8359       V2DstIndex = i;
8360     }
8361   }
8362
8363   // Don't bother if we have no (non-zeroable) element for insertion.
8364   if (V1DstIndex == -1 && V2DstIndex == -1)
8365     return SDValue();
8366
8367   // Determine element insertion src/dst indices. The src index is from the
8368   // start of the inserted vector, not the start of the concatenated vector.
8369   unsigned V2SrcIndex = 0;
8370   if (V1DstIndex != -1) {
8371     // If we have a V1 input out of place, we use V1 as the V2 element insertion
8372     // and don't use the original V2 at all.
8373     V2SrcIndex = Mask[V1DstIndex];
8374     V2DstIndex = V1DstIndex;
8375     V2 = V1;
8376   } else {
8377     V2SrcIndex = Mask[V2DstIndex] - 4;
8378   }
8379
8380   // If no V1 inputs are used in place, then the result is created only from
8381   // the zero mask and the V2 insertion - so remove V1 dependency.
8382   if (!V1UsedInPlace)
8383     V1 = DAG.getUNDEF(MVT::v4f32);
8384
8385   unsigned InsertPSMask = V2SrcIndex << 6 | V2DstIndex << 4 | ZMask;
8386   assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
8387
8388   // Insert the V2 element into the desired position.
8389   SDLoc DL(Op);
8390   return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
8391                      DAG.getConstant(InsertPSMask, MVT::i8));
8392 }
8393
8394 /// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
8395 ///
8396 /// This is the basis function for the 2-lane 64-bit shuffles as we have full
8397 /// support for floating point shuffles but not integer shuffles. These
8398 /// instructions will incur a domain crossing penalty on some chips though so
8399 /// it is better to avoid lowering through this for integer vectors where
8400 /// possible.
8401 static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
8402                                        const X86Subtarget *Subtarget,
8403                                        SelectionDAG &DAG) {
8404   SDLoc DL(Op);
8405   assert(Op.getSimpleValueType() == MVT::v2f64 && "Bad shuffle type!");
8406   assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
8407   assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
8408   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
8409   ArrayRef<int> Mask = SVOp->getMask();
8410   assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
8411
8412   if (isSingleInputShuffleMask(Mask)) {
8413     // Use low duplicate instructions for masks that match their pattern.
8414     if (Subtarget->hasSSE3())
8415       if (isShuffleEquivalent(Mask, 0, 0))
8416         return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, V1);
8417
8418     // Straight shuffle of a single input vector. Simulate this by using the
8419     // single input as both of the "inputs" to this instruction..
8420     unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
8421
8422     if (Subtarget->hasAVX()) {
8423       // If we have AVX, we can use VPERMILPS which will allow folding a load
8424       // into the shuffle.
8425       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
8426                          DAG.getConstant(SHUFPDMask, MVT::i8));
8427     }
8428
8429     return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V1,
8430                        DAG.getConstant(SHUFPDMask, MVT::i8));
8431   }
8432   assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!");
8433   assert(Mask[1] >= 2 && "Non-canonicalized blend!");
8434
8435   // Use dedicated unpack instructions for masks that match their pattern.
8436   if (isShuffleEquivalent(Mask, 0, 2))
8437     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2f64, V1, V2);
8438   if (isShuffleEquivalent(Mask, 1, 3))
8439     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2f64, V1, V2);
8440
8441   // If we have a single input, insert that into V1 if we can do so cheaply.
8442   if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
8443     if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
8444             MVT::v2f64, DL, V1, V2, Mask, Subtarget, DAG))
8445       return Insertion;
8446     // Try inverting the insertion since for v2 masks it is easy to do and we
8447     // can't reliably sort the mask one way or the other.
8448     int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
8449                           Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
8450     if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
8451             MVT::v2f64, DL, V2, V1, InverseMask, Subtarget, DAG))
8452       return Insertion;
8453   }
8454
8455   // Try to use one of the special instruction patterns to handle two common
8456   // blend patterns if a zero-blend above didn't work.
8457   if (isShuffleEquivalent(Mask, 0, 3) || isShuffleEquivalent(Mask, 1, 3))
8458     if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
8459       // We can either use a special instruction to load over the low double or
8460       // to move just the low double.
8461       return DAG.getNode(
8462           isShuffleFoldableLoad(V1S) ? X86ISD::MOVLPD : X86ISD::MOVSD,
8463           DL, MVT::v2f64, V2,
8464           DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
8465
8466   if (Subtarget->hasSSE41())
8467     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
8468                                                   Subtarget, DAG))
8469       return Blend;
8470
8471   unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
8472   return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V2,
8473                      DAG.getConstant(SHUFPDMask, MVT::i8));
8474 }
8475
8476 /// \brief Handle lowering of 2-lane 64-bit integer shuffles.
8477 ///
8478 /// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
8479 /// the integer unit to minimize domain crossing penalties. However, for blends
8480 /// it falls back to the floating point shuffle operation with appropriate bit
8481 /// casting.
8482 static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
8483                                        const X86Subtarget *Subtarget,
8484                                        SelectionDAG &DAG) {
8485   SDLoc DL(Op);
8486   assert(Op.getSimpleValueType() == MVT::v2i64 && "Bad shuffle type!");
8487   assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
8488   assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
8489   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
8490   ArrayRef<int> Mask = SVOp->getMask();
8491   assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
8492
8493   if (isSingleInputShuffleMask(Mask)) {
8494     // Check for being able to broadcast a single element.
8495     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v2i64, DL, V1,
8496                                                           Mask, Subtarget, DAG))
8497       return Broadcast;
8498
8499     // Straight shuffle of a single input vector. For everything from SSE2
8500     // onward this has a single fast instruction with no scary immediates.
8501     // We have to map the mask as it is actually a v4i32 shuffle instruction.
8502     V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V1);
8503     int WidenedMask[4] = {
8504         std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
8505         std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
8506     return DAG.getNode(
8507         ISD::BITCAST, DL, MVT::v2i64,
8508         DAG.getNode(X86ISD::PSHUFD, SDLoc(Op), MVT::v4i32, V1,
8509                     getV4X86ShuffleImm8ForMask(WidenedMask, DAG)));
8510   }
8511
8512   // Try to use byte shift instructions.
8513   if (SDValue Shift = lowerVectorShuffleAsByteShift(
8514           DL, MVT::v2i64, V1, V2, Mask, DAG))
8515     return Shift;
8516
8517   // If we have a single input from V2 insert that into V1 if we can do so
8518   // cheaply.
8519   if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
8520     if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
8521             MVT::v2i64, DL, V1, V2, Mask, Subtarget, DAG))
8522       return Insertion;
8523     // Try inverting the insertion since for v2 masks it is easy to do and we
8524     // can't reliably sort the mask one way or the other.
8525     int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
8526                           Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
8527     if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
8528             MVT::v2i64, DL, V2, V1, InverseMask, Subtarget, DAG))
8529       return Insertion;
8530   }
8531
8532   // Use dedicated unpack instructions for masks that match their pattern.
8533   if (isShuffleEquivalent(Mask, 0, 2))
8534     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, V1, V2);
8535   if (isShuffleEquivalent(Mask, 1, 3))
8536     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2i64, V1, V2);
8537
8538   if (Subtarget->hasSSE41())
8539     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
8540                                                   Subtarget, DAG))
8541       return Blend;
8542
8543   // Try to use byte rotation instructions.
8544   // Its more profitable for pre-SSSE3 to use shuffles/unpacks.
8545   if (Subtarget->hasSSSE3())
8546     if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
8547             DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
8548       return Rotate;
8549
8550   // We implement this with SHUFPD which is pretty lame because it will likely
8551   // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
8552   // However, all the alternatives are still more cycles and newer chips don't
8553   // have this problem. It would be really nice if x86 had better shuffles here.
8554   V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, V1);
8555   V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, V2);
8556   return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64,
8557                      DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
8558 }
8559
8560 /// \brief Lower a vector shuffle using the SHUFPS instruction.
8561 ///
8562 /// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
8563 /// It makes no assumptions about whether this is the *best* lowering, it simply
8564 /// uses it.
     ///
     /// Mask entries >= 4 are counted as elements taken from V2; the strategy is
     /// chosen from that count:
     ///  - one V2 element whose neighbour in the same half is -1: the index is
     ///    rebased by 4 and the operands are swapped if it sits in the low half;
     ///  - one V2 element next to a V1 element: an extra SHUFP first packs both
     ///    elements into one vector, which then feeds the final SHUFP;
     ///  - two V2 elements: either each half comes from a single input (indices
     ///    are rebased, operands possibly swapped), or an extra SHUFP gathers
     ///    the four needed elements and the final SHUFP reorders them.
     ///
     /// NOTE(review): for any other count (0, 3 or 4) no branch adjusts the mask
     /// or the operands and the mask is encoded unchanged against (V1, V2);
     /// presumably callers never reach this routine in that state -- confirm.
     ///
     /// \param DL   Location attached to every node created here.
     /// \param VT   Result type of every X86ISD::SHUFP node created here.
     /// \param Mask Shuffle mask; only entries 0..3 are read.
     /// \param V1   First shuffle input (mask values below 4).
     /// \param V2   Second shuffle input (mask values 4 and above).
     /// \param DAG  DAG used to create the nodes.
     /// \returns the final X86ISD::SHUFP node.
8565 static SDValue lowerVectorShuffleWithSHUFPS(SDLoc DL, MVT VT,
8566                                             ArrayRef<int> Mask, SDValue V1,
8567                                             SDValue V2, SelectionDAG &DAG) {
       // LowV and HighV become the first and second operand of the final SHUFP.
8568   SDValue LowV = V1, HighV = V2;
       // Working copy of the mask; rewritten below so that each entry is an index
       // into whichever of LowV/HighV supplies that half.
8569   int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};
8570
       // Number of mask entries that select from V2.
8571   int NumV2Elements =
8572       std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
8573
8574   if (NumV2Elements == 1) {
         // Position within the mask of the single V2 element.
8575     int V2Index =
8576         std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) -
8577         Mask.begin();
8578
8579     // Compute the index adjacent to V2Index and in the same half by toggling
8580     // the low bit.
8581     int V2AdjIndex = V2Index ^ 1;
8582
8583     if (Mask[V2AdjIndex] == -1) {
8584       // Handles all the cases where we have a single V2 element and an undef.
8585       // This will only ever happen in the high lanes because we commute the
8586       // vector otherwise.
           // NOTE(review): the swap below nevertheless covers a V2 element in the
           // low half, should one arrive here.
8587       if (V2Index < 2)
8588         std::swap(LowV, HighV);
           // Rebase the V2 element's index so it is relative to V2 itself.
8589       NewMask[V2Index] -= 4;
8590     } else {
8591       // Handle the case where the V2 element ends up adjacent to a V1 element.
8592       // To make this work, blend them together as the first step.
8593       int V1Index = V2AdjIndex;
           // Lane 0 takes the wanted V2 element and lane 2 the wanted V1 element
           // (note the operand order V2, V1 below); lanes 1 and 3 just use index 0.
8594       int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
8595       V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
8596                        getV4X86ShuffleImm8ForMask(BlendMask, DAG));
8597
8598       // Now proceed to reconstruct the final blend as we have the necessary
8599       // high or low half formed.
8600       if (V2Index < 2) {
8601         LowV = V2;
8602         HighV = V1;
8603       } else {
             // LowV keeps its initial value V1; only the high half uses the blend.
8604         HighV = V2;
8605       }
8606       NewMask[V1Index] = 2; // We put the V1 element in V2[2].
8607       NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
8608     }
8609   } else if (NumV2Elements == 2) {
8610     if (Mask[0] < 4 && Mask[1] < 4) {
8611       // Handle the easy case where we have V1 in the low lanes and V2 in the
8612       // high lanes.
8613       NewMask[2] -= 4;
8614       NewMask[3] -= 4;
8615     } else if (Mask[2] < 4 && Mask[3] < 4) {
8616       // We also handle the reversed case because this utility may get called
8617       // when we detect a SHUFPS pattern but can't easily commute the shuffle to
8618       // arrange things in the right direction.
8619       NewMask[0] -= 4;
8620       NewMask[1] -= 4;
8621       HighV = V1;
8622       LowV = V2;
8623     } else {
8624       // We have a mixture of V1 and V2 in both low and high lanes. Rather than
8625       // trying to place elements directly, just blend them and set up the final
8626       // shuffle to place them.
8627
8628       // The first two blend mask elements are for V1, the second two are for
8629       // V2.
           // In order: the V1 element wanted by the low half, the V1 element wanted
           // by the high half, then the V2 elements (rebased by 4) for the low and
           // high halves.
8630       int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
8631                           Mask[2] < 4 ? Mask[2] : Mask[3],
8632                           (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
8633                           (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
8634       V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
8635                        getV4X86ShuffleImm8ForMask(BlendMask, DAG));
8636
8637       // Now we do a normal shuffle of V1 by giving V1 as both operands to
8638       // a blend.
8639       LowV = HighV = V1;
           // Blend lanes 0 and 2 hold the low half's V1 and V2 elements; lanes 1 and
           // 3 hold the high half's. Pick them in the order the mask asked for.
8640       NewMask[0] = Mask[0] < 4 ? 0 : 2;
8641       NewMask[1] = Mask[0] < 4 ? 2 : 0;
8642       NewMask[2] = Mask[2] < 4 ? 1 : 3;
8643       NewMask[3] = Mask[2] < 4 ? 3 : 1;
8644     }
8645   }
       // Emit the final shuffle with the (possibly rewritten) operands and mask.
8646   return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
8647                      getV4X86ShuffleImm8ForMask(NewMask, DAG));
8648 }
8649
8650 /// \brief Lower 4-lane 32-bit floating point shuffles.
8651 ///
8652 /// Uses instructions exclusively from the floating point unit to minimize
8653 /// domain crossing penalties, as these are sufficient to implement all v4f32
8654 /// shuffles.
     ///
     /// The lowering strategies are tried in a fixed order and the first one that
     /// produces a value wins; the SHUFPS helper is the unconditional fallback.
     ///
     /// \param Op        The shuffle node; must be a ShuffleVectorSDNode of type
     ///                  v4f32 (asserted / cast below).
     /// \param V1        First shuffle input, of type v4f32 (asserted).
     /// \param V2        Second shuffle input, of type v4f32 (asserted).
     /// \param Subtarget Queried for SSE3, SSE4.1 and AVX support and forwarded
     ///                  to the helper routines.
     /// \param DAG       DAG used to create the replacement nodes.
     /// \returns the node(s) implementing the shuffle.
8655 static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
8656                                        const X86Subtarget *Subtarget,
8657                                        SelectionDAG &DAG) {
8658   SDLoc DL(Op);
8659   assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!");
8660   assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
8661   assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
8662   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
8663   ArrayRef<int> Mask = SVOp->getMask();
8664   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
8665
       // Number of mask entries that select from V2 (values 4..7).
8666   int NumV2Elements =
8667       std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
8668
       // Single-input shuffles: every path inside this block returns.
8669   if (NumV2Elements == 0) {
8670     // Check for being able to broadcast a single element.
8671     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4f32, DL, V1,
8672                                                           Mask, Subtarget, DAG))
8673       return Broadcast;
8674
8675     // Use even/odd duplicate instructions for masks that match their pattern.
8676     if (Subtarget->hasSSE3()) {
8677       if (isShuffleEquivalent(Mask, 0, 0, 2, 2))
8678         return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
8679       if (isShuffleEquivalent(Mask, 1, 1, 3, 3))
8680         return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
8681     }
8682
8683     if (Subtarget->hasAVX()) {
8684       // If we have AVX, we can use VPERMILPS which will allow folding a load
8685       // into the shuffle.
8686       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
8687                          getV4X86ShuffleImm8ForMask(Mask, DAG));
8688     }
8689
8690     // Otherwise, use a straight shuffle of a single input vector. We pass the
8691     // input vector to both operands to simulate this with a SHUFPS.
8692     return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
8693                        getV4X86ShuffleImm8ForMask(Mask, DAG));
8694   }
8695
       // From here on at least one mask entry selects from V2.
8696   // Use dedicated unpack instructions for masks that match their pattern.
8697   if (isShuffleEquivalent(Mask, 0, 4, 1, 5))
8698     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f32, V1, V2);
8699   if (isShuffleEquivalent(Mask, 2, 6, 3, 7))
8700     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V1, V2);
8701
8702   // There are special ways we can lower some single-element blends. However, we
8703   // have custom ways we can lower more complex single-element blends below that
8704   // we defer to if both this and BLENDPS fail to match, so restrict this to
8705   // when the V2 input is targeting element 0 of the mask -- that is the fast
8706   // case here.
8707   if (NumV2Elements == 1 && Mask[0] >= 4)
8708     if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v4f32, DL, V1, V2,
8709                                                          Mask, Subtarget, DAG))
8710       return V;
8711
       // The blend and INSERTPS attempts are only made when SSE4.1 is available.
8712   if (Subtarget->hasSSE41()) {
8713     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
8714                                                   Subtarget, DAG))
8715       return Blend;
8716
8717     // Use INSERTPS if we can complete the shuffle efficiently.
8718     if (SDValue V = lowerVectorShuffleAsInsertPS(Op, V1, V2, Mask, DAG))
8719       return V;
8720   }
8721
8722   // Otherwise fall back to a SHUFPS lowering strategy.
8723   return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
8724 }
8725
8726 /// \brief Lower 4-lane i32 vector shuffles.
8727 ///
8728 /// We try to handle these with integer-domain shuffles where we can, but for
8729 /// blends we use the floating point domain blend instructions.
     ///
     /// The lowering strategies are tried in a fixed order and the first one that
     /// produces a value wins. If none matches, both inputs are bitcast to v4f32,
     /// a v4f32 vector shuffle with the same mask is built, and its result is
     /// bitcast back to v4i32.
     ///
     /// \param Op        The shuffle node; must be a ShuffleVectorSDNode of type
     ///                  v4i32 (asserted / cast below).
     /// \param V1        First shuffle input, of type v4i32 (asserted).
     /// \param V2        Second shuffle input, of type v4i32 (asserted).
     /// \param Subtarget Queried for SSE4.1 and SSSE3 support and forwarded to
     ///                  the helper routines.
     /// \param DAG       DAG used to create the replacement nodes.
     /// \returns the node(s) implementing the shuffle.
8730 static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
8731                                        const X86Subtarget *Subtarget,
8732                                        SelectionDAG &DAG) {
8733   SDLoc DL(Op);
8734   assert(Op.getSimpleValueType() == MVT::v4i32 && "Bad shuffle type!");
8735   assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
8736   assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
8737   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
8738   ArrayRef<int> Mask = SVOp->getMask();
8739   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
8740
8741   // Whenever we can lower this as a zext, that instruction is strictly faster
8742   // than any alternative. It also allows us to fold memory operands into the
8743   // shuffle in many cases.
8744   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2,
8745                                                          Mask, Subtarget, DAG))
8746     return ZExt;
8747
       // Number of mask entries that select from V2 (values 4..7).
8748   int NumV2Elements =
8749       std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
8750
       // Single-input shuffles: every path inside this block returns.
8751   if (NumV2Elements == 0) {
8752     // Check for being able to broadcast a single element.
8753     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4i32, DL, V1,
8754                                                           Mask, Subtarget, DAG))
8755       return Broadcast;
8756
8757     // Straight shuffle of a single input vector. For everything from SSE2
8758     // onward this has a single fast instruction with no scary immediates.
8759     // We coerce the shuffle pattern to be compatible with UNPCK instructions
8760     // but we aren't actually going to use the UNPCK instruction because doing
8761     // so prevents folding a load into this instruction or making a copy.
8762     const int UnpackLoMask[] = {0, 0, 1, 1};
8763     const int UnpackHiMask[] = {2, 2, 3, 3};
         // Rebinding Mask to one of these local arrays is safe because they stay in
         // scope until the return below, which is the only remaining use of Mask.
8764     if (isShuffleEquivalent(Mask, 0, 0, 1, 1))
8765       Mask = UnpackLoMask;
8766     else if (isShuffleEquivalent(Mask, 2, 2, 3, 3))
8767       Mask = UnpackHiMask;
8768
8769     return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
8770                        getV4X86ShuffleImm8ForMask(Mask, DAG));
8771   }
8772
       // From here on at least one mask entry selects from V2.
8773   // Try to use bit shift instructions.
8774   if (SDValue Shift = lowerVectorShuffleAsBitShift(
8775           DL, MVT::v4i32, V1, V2, Mask, DAG))
8776     return Shift;
8777
8778   // Try to use byte shift instructions.
8779   if (SDValue Shift = lowerVectorShuffleAsByteShift(
8780           DL, MVT::v4i32, V1, V2, Mask, DAG))
8781     return Shift;
8782
8783   // There are special ways we can lower some single-element blends.
8784   if (NumV2Elements == 1)
8785     if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v4i32, DL, V1, V2,
8786                                                          Mask, Subtarget, DAG))
8787       return V;
8788
       // The blend attempt is only made when SSE4.1 is available.
8789   if (Subtarget->hasSSE41())
8790     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
8791                                                   Subtarget, DAG))
8792       return Blend;
8793
       // Try the bit-mask helper; a non-null result is returned as-is.
8794   if (SDValue Masked =
8795           lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask, DAG))
8796     return Masked;
8797
8798   // Use dedicated unpack instructions for masks that match their pattern.
8799   if (isShuffleEquivalent(Mask, 0, 4, 1, 5))
8800     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i32, V1, V2);
8801   if (isShuffleEquivalent(Mask, 2, 6, 3, 7))
8802     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V1, V2);
8803
8804   // Try to use byte rotation instructions.
8805   // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
8806   if (Subtarget->hasSSSE3())
8807     if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
8808             DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
8809       return Rotate;
8810
8811   // We implement this with SHUFPS because it can blend from two vectors.
8812   // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
8813   // up the inputs, bypassing domain shift penalties that we would incur if we
8814   // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
8815   // relevant.
       // The shuffle is re-expressed on v4f32 bitcasts of the inputs and the result
       // is bitcast back to v4i32.
8816   return DAG.getNode(ISD::BITCAST, DL, MVT::v4i32,
8817                      DAG.getVectorShuffle(
8818                          MVT::v4f32, DL,
8819                          DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, V1),
8820                          DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, V2), Mask));
8821 }
8822
8823 /// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
8824 /// shuffle lowering, and the most complex part.
8825 ///
8826 /// The lowering strategy is to try to form pairs of input lanes which are
8827 /// targeted at the same half of the final vector, and then use a dword shuffle
8828 /// to place them onto the right half, and finally unpack the paired lanes into
8829 /// their final position.
8830 ///
8831 /// The exact breakdown of how to form these dword pairs and align them on the
8832 /// correct sides is really tricky. See the comments within the function for
8833 /// more of the details.
8834 static SDValue lowerV8I16SingleInputVectorShuffle(
8835     SDLoc DL, SDValue V, MutableArrayRef<int> Mask,
8836     const X86Subtarget *Subtarget, SelectionDAG &DAG) {
8837   assert(V.getSimpleValueType() == MVT::v8i16 && "Bad input type!");
8838   MutableArrayRef<int> LoMask = Mask.slice(0, 4);
8839   MutableArrayRef<int> HiMask = Mask.slice(4, 4);
8840
8841   SmallVector<int, 4> LoInputs;
8842   std::copy_if(LoMask.begin(), LoMask.end(), std::back_inserter(LoInputs),
8843                [](int M) { return M >= 0; });
8844   std::sort(LoInputs.begin(), LoInputs.end());
8845   LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
8846   SmallVector<int, 4> HiInputs;
8847   std::copy_if(HiMask.begin(), HiMask.end(), std::back_inserter(HiInputs),
8848                [](int M) { return M >= 0; });
8849   std::sort(HiInputs.begin(), HiInputs.end());
8850   HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
8851   int NumLToL =
8852       std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin();
8853   int NumHToL = LoInputs.size() - NumLToL;
8854   int NumLToH =
8855       std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin();
8856   int NumHToH = HiInputs.size() - NumLToH;
8857   MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
8858   MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
8859   MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
8860   MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
8861
8862   // Check for being able to broadcast a single element.
8863   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8i16, DL, V,
8864                                                         Mask, Subtarget, DAG))
8865     return Broadcast;
8866
8867   // Try to use bit shift instructions.
8868   if (SDValue Shift = lowerVectorShuffleAsBitShift(
8869           DL, MVT::v8i16, V, V, Mask, DAG))
8870     return Shift;
8871
8872   // Try to use byte shift instructions.
8873   if (SDValue Shift = lowerVectorShuffleAsByteShift(
8874           DL, MVT::v8i16, V, V, Mask, DAG))
8875     return Shift;
8876
8877   // Use dedicated unpack instructions for masks that match their pattern.
8878   if (isShuffleEquivalent(Mask, 0, 0, 1, 1, 2, 2, 3, 3))
8879     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V, V);
8880   if (isShuffleEquivalent(Mask, 4, 4, 5, 5, 6, 6, 7, 7))
8881     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V, V);
8882
8883   // Try to use byte rotation instructions.
8884   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
8885           DL, MVT::v8i16, V, V, Mask, Subtarget, DAG))
8886     return Rotate;
8887
8888   // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
8889   // such inputs we can swap two of the dwords across the half mark and end up
8890   // with <=2 inputs to each half in each half. Once there, we can fall through
8891   // to the generic code below. For example:
8892   //
8893   // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
8894   // Mask:  [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
8895   //
8896   // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
8897   // and an existing 2-into-2 on the other half. In this case we may have to
8898   // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
8899   // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
8900   // Fortunately, we don't have to handle anything but a 2-into-2 pattern
8901   // because any other situation (including a 3-into-1 or 1-into-3 in the other
8902   // half than the one we target for fixing) will be fixed when we re-enter this
8903   // path. We will also combine away any sequence of PSHUFD instructions that
8904   // result into a single instruction. Here is an example of the tricky case:
8905   //
8906   // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
8907   // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
8908   //
8909   // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
8910   //
8911   // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
8912   // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
8913   //
8914   // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
8915   // Mask:  [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
8916   //
8917   // The result is fine to be handled by the generic logic.
8918   auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
8919                           ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
8920                           int AOffset, int BOffset) {
8921     assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
8922            "Must call this with A having 3 or 1 inputs from the A half.");
8923     assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
8924            "Must call this with B having 1 or 3 inputs from the B half.");
8925     assert(AToAInputs.size() + BToAInputs.size() == 4 &&
8926            "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
8927
8928     // Compute the index of dword with only one word among the three inputs in
8929     // a half by taking the sum of the half with three inputs and subtracting
8930     // the sum of the actual three inputs. The difference is the remaining
8931     // slot.
8932     int ADWord, BDWord;
8933     int &TripleDWord = AToAInputs.size() == 3 ? ADWord : BDWord;
8934     int &OneInputDWord = AToAInputs.size() == 3 ? BDWord : ADWord;
8935     int TripleInputOffset = AToAInputs.size() == 3 ? AOffset : BOffset;
8936     ArrayRef<int> TripleInputs = AToAInputs.size() == 3 ? AToAInputs : BToAInputs;
8937     int OneInput = AToAInputs.size() == 3 ? BToAInputs[0] : AToAInputs[0];
8938     int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
8939     int TripleNonInputIdx =
8940         TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
8941     TripleDWord = TripleNonInputIdx / 2;
8942
8943     // We use xor with one to compute the adjacent DWord to whichever one the
8944     // OneInput is in.
8945     OneInputDWord = (OneInput / 2) ^ 1;
8946
8947     // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
8948     // and BToA inputs. If there is also such a problem with the BToB and AToB
8949     // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
8950     // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
8951     // is essential that we don't *create* a 3<-1 as then we might oscillate.
8952     if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
8953       // Compute how many inputs will be flipped by swapping these DWords. We
8954       // need
8955       // to balance this to ensure we don't form a 3-1 shuffle in the other
8956       // half.
8957       int NumFlippedAToBInputs =
8958           std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
8959           std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
8960       int NumFlippedBToBInputs =
8961           std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
8962           std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
8963       if ((NumFlippedAToBInputs == 1 &&
8964            (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
8965           (NumFlippedBToBInputs == 1 &&
8966            (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
8967         // We choose whether to fix the A half or B half based on whether that
8968         // half has zero flipped inputs. At zero, we may not be able to fix it
8969         // with that half. We also bias towards fixing the B half because that
8970         // will more commonly be the high half, and we have to bias one way.
8971         auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
8972                                                        ArrayRef<int> Inputs) {
8973           int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
8974           bool IsFixIdxInput = std::find(Inputs.begin(), Inputs.end(),
8975                                          PinnedIdx ^ 1) != Inputs.end();
8976           // Determine whether the free index is in the flipped dword or the
8977           // unflipped dword based on where the pinned index is. We use this bit
8978           // in an xor to conditionally select the adjacent dword.
8979           int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
8980           bool IsFixFreeIdxInput = std::find(Inputs.begin(), Inputs.end(),
8981                                              FixFreeIdx) != Inputs.end();
8982           if (IsFixIdxInput == IsFixFreeIdxInput)
8983             FixFreeIdx += 1;
8984           IsFixFreeIdxInput = std::find(Inputs.begin(), Inputs.end(),
8985                                         FixFreeIdx) != Inputs.end();
8986           assert(IsFixIdxInput != IsFixFreeIdxInput &&
8987                  "We need to be changing the number of flipped inputs!");
8988           int PSHUFHalfMask[] = {0, 1, 2, 3};
8989           std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
8990           V = DAG.getNode(FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
8991                           MVT::v8i16, V,
8992                           getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DAG));
8993
8994           for (int &M : Mask)
8995             if (M != -1 && M == FixIdx)
8996               M = FixFreeIdx;
8997             else if (M != -1 && M == FixFreeIdx)
8998               M = FixIdx;
8999         };
9000         if (NumFlippedBToBInputs != 0) {
9001           int BPinnedIdx =
9002               BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
9003           FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
9004         } else {
9005           assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
9006           int APinnedIdx =
9007               AToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
9008           FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
9009         }
9010       }
9011     }
9012
9013     int PSHUFDMask[] = {0, 1, 2, 3};
9014     PSHUFDMask[ADWord] = BDWord;
9015     PSHUFDMask[BDWord] = ADWord;
9016     V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
9017                     DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
9018                                 DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V),
9019                                 getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
9020
9021     // Adjust the mask to match the new locations of A and B.
9022     for (int &M : Mask)
9023       if (M != -1 && M/2 == ADWord)
9024         M = 2 * BDWord + M % 2;
9025       else if (M != -1 && M/2 == BDWord)
9026         M = 2 * ADWord + M % 2;
9027
9028     // Recurse back into this routine to re-compute state now that this isn't
9029     // a 3 and 1 problem.
9030     return DAG.getVectorShuffle(MVT::v8i16, DL, V, DAG.getUNDEF(MVT::v8i16),
9031                                 Mask);
9032   };
9033   if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
9034     return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
9035   else if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
9036     return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
9037
9038   // At this point there are at most two inputs to the low and high halves from
9039   // each half. That means the inputs can always be grouped into dwords and
9040   // those dwords can then be moved to the correct half with a dword shuffle.
9041   // We use at most one low and one high word shuffle to collect these paired
9042   // inputs into dwords, and finally a dword shuffle to place them.
9043   int PSHUFLMask[4] = {-1, -1, -1, -1};
9044   int PSHUFHMask[4] = {-1, -1, -1, -1};
9045   int PSHUFDMask[4] = {-1, -1, -1, -1};
9046
9047   // First fix the masks for all the inputs that are staying in their
9048   // original halves. This will then dictate the targets of the cross-half
9049   // shuffles.
9050   auto fixInPlaceInputs =
9051       [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
9052                     MutableArrayRef<int> SourceHalfMask,
9053                     MutableArrayRef<int> HalfMask, int HalfOffset) {
9054     if (InPlaceInputs.empty())
9055       return;
9056     if (InPlaceInputs.size() == 1) {
9057       SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
9058           InPlaceInputs[0] - HalfOffset;
9059       PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
9060       return;
9061     }
9062     if (IncomingInputs.empty()) {
9063       // Just fix all of the in place inputs.
9064       for (int Input : InPlaceInputs) {
9065         SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
9066         PSHUFDMask[Input / 2] = Input / 2;
9067       }
9068       return;
9069     }
9070
9071     assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
9072     SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
9073         InPlaceInputs[0] - HalfOffset;
9074     // Put the second input next to the first so that they are packed into
9075     // a dword. We find the adjacent index by toggling the low bit.
9076     int AdjIndex = InPlaceInputs[0] ^ 1;
9077     SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
9078     std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
9079     PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
9080   };
9081   fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
9082   fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
9083
9084   // Now gather the cross-half inputs and place them into a free dword of
9085   // their target half.
9086   // FIXME: This operation could almost certainly be simplified dramatically to
9087   // look more like the 3-1 fixing operation.
9088   auto moveInputsToRightHalf = [&PSHUFDMask](
9089       MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
9090       MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
9091       MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
9092       int DestOffset) {
9093     auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
9094       return SourceHalfMask[Word] != -1 && SourceHalfMask[Word] != Word;
9095     };
9096     auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
9097                                                int Word) {
9098       int LowWord = Word & ~1;
9099       int HighWord = Word | 1;
9100       return isWordClobbered(SourceHalfMask, LowWord) ||
9101              isWordClobbered(SourceHalfMask, HighWord);
9102     };
9103
9104     if (IncomingInputs.empty())
9105       return;
9106
9107     if (ExistingInputs.empty()) {
9108       // Map any dwords with inputs from them into the right half.
9109       for (int Input : IncomingInputs) {
9110         // If the source half mask maps over the inputs, turn those into
9111         // swaps and use the swapped lane.
9112         if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
9113           if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] == -1) {
9114             SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
9115                 Input - SourceOffset;
9116             // We have to swap the uses in our half mask in one sweep.
9117             for (int &M : HalfMask)
9118               if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
9119                 M = Input;
9120               else if (M == Input)
9121                 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
9122           } else {
9123             assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
9124                        Input - SourceOffset &&
9125                    "Previous placement doesn't match!");
9126           }
9127           // Note that this correctly re-maps both when we do a swap and when
9128           // we observe the other side of the swap above. We rely on that to
9129           // avoid swapping the members of the input list directly.
9130           Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
9131         }
9132
9133         // Map the input's dword into the correct half.
9134         if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] == -1)
9135           PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
9136         else
9137           assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
9138                      Input / 2 &&
9139                  "Previous placement doesn't match!");
9140       }
9141
9142       // And just directly shift any other-half mask elements to be same-half
9143       // as we will have mirrored the dword containing the element into the
9144       // same position within that half.
9145       for (int &M : HalfMask)
9146         if (M >= SourceOffset && M < SourceOffset + 4) {
9147           M = M - SourceOffset + DestOffset;
9148           assert(M >= 0 && "This should never wrap below zero!");
9149         }
9150       return;
9151     }
9152
9153     // Ensure we have the input in a viable dword of its current half. This
9154     // is particularly tricky because the original position may be clobbered
9155     // by inputs being moved and *staying* in that half.
9156     if (IncomingInputs.size() == 1) {
9157       if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
9158         int InputFixed = std::find(std::begin(SourceHalfMask),
9159                                    std::end(SourceHalfMask), -1) -
9160                          std::begin(SourceHalfMask) + SourceOffset;
9161         SourceHalfMask[InputFixed - SourceOffset] =
9162             IncomingInputs[0] - SourceOffset;
9163         std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
9164                      InputFixed);
9165         IncomingInputs[0] = InputFixed;
9166       }
9167     } else if (IncomingInputs.size() == 2) {
9168       if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
9169           isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
9170         // We have two non-adjacent or clobbered inputs we need to extract from
9171         // the source half. To do this, we need to map them into some adjacent
9172         // dword slot in the source mask.
9173         int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
9174                               IncomingInputs[1] - SourceOffset};
9175
9176         // If there is a free slot in the source half mask adjacent to one of
9177         // the inputs, place the other input in it. We use (Index XOR 1) to
9178         // compute an adjacent index.
9179         if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
9180             SourceHalfMask[InputsFixed[0] ^ 1] == -1) {
9181           SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
9182           SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
9183           InputsFixed[1] = InputsFixed[0] ^ 1;
9184         } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
9185                    SourceHalfMask[InputsFixed[1] ^ 1] == -1) {
9186           SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
9187           SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
9188           InputsFixed[0] = InputsFixed[1] ^ 1;
9189         } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] == -1 &&
9190                    SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] == -1) {
9191           // The two inputs are in the same DWord but it is clobbered and the
9192           // adjacent DWord isn't used at all. Move both inputs to the free
9193           // slot.
9194           SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
9195           SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
9196           InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
9197           InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
9198         } else {
9199           // The only way we hit this point is if there is no clobbering
9200           // (because there are no off-half inputs to this half) and there is no
9201           // free slot adjacent to one of the inputs. In this case, we have to
9202           // swap an input with a non-input.
9203           for (int i = 0; i < 4; ++i)
9204             assert((SourceHalfMask[i] == -1 || SourceHalfMask[i] == i) &&
9205                    "We can't handle any clobbers here!");
9206           assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
9207                  "Cannot have adjacent inputs here!");
9208
9209           SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
9210           SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
9211
9212           // We also have to update the final source mask in this case because
9213           // it may need to undo the above swap.
9214           for (int &M : FinalSourceHalfMask)
9215             if (M == (InputsFixed[0] ^ 1) + SourceOffset)
9216               M = InputsFixed[1] + SourceOffset;
9217             else if (M == InputsFixed[1] + SourceOffset)
9218               M = (InputsFixed[0] ^ 1) + SourceOffset;
9219
9220           InputsFixed[1] = InputsFixed[0] ^ 1;
9221         }
9222
9223         // Point everything at the fixed inputs.
9224         for (int &M : HalfMask)
9225           if (M == IncomingInputs[0])
9226             M = InputsFixed[0] + SourceOffset;
9227           else if (M == IncomingInputs[1])
9228             M = InputsFixed[1] + SourceOffset;
9229
9230         IncomingInputs[0] = InputsFixed[0] + SourceOffset;
9231         IncomingInputs[1] = InputsFixed[1] + SourceOffset;
9232       }
9233     } else {
9234       llvm_unreachable("Unhandled input size!");
9235     }
9236
9237     // Now hoist the DWord down to the right half.
9238     int FreeDWord = (PSHUFDMask[DestOffset / 2] == -1 ? 0 : 1) + DestOffset / 2;
9239     assert(PSHUFDMask[FreeDWord] == -1 && "DWord not free");
9240     PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
9241     for (int &M : HalfMask)
9242       for (int Input : IncomingInputs)
9243         if (M == Input)
9244           M = FreeDWord * 2 + Input % 2;
9245   };
9246   moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
9247                         /*SourceOffset*/ 4, /*DestOffset*/ 0);
9248   moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
9249                         /*SourceOffset*/ 0, /*DestOffset*/ 4);
9250
9251   // Now enact all the shuffles we've computed to move the inputs into their
9252   // target half.
9253   if (!isNoopShuffleMask(PSHUFLMask))
9254     V = DAG.getNode(X86ISD::PSHUFLW, DL, MVT::v8i16, V,
9255                     getV4X86ShuffleImm8ForMask(PSHUFLMask, DAG));
9256   if (!isNoopShuffleMask(PSHUFHMask))
9257     V = DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16, V,
9258                     getV4X86ShuffleImm8ForMask(PSHUFHMask, DAG));
9259   if (!isNoopShuffleMask(PSHUFDMask))
9260     V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
9261                     DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
9262                                 DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V),
9263                                 getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
9264
9265   // At this point, each half should contain all its inputs, and we can then
9266   // just shuffle them into their final position.
9267   assert(std::count_if(LoMask.begin(), LoMask.end(),
9268                        [](int M) { return M >= 4; }) == 0 &&
9269          "Failed to lift all the high half inputs to the low mask!");
9270   assert(std::count_if(HiMask.begin(), HiMask.end(),
9271                        [](int M) { return M >= 0 && M < 4; }) == 0 &&
9272          "Failed to lift all the low half inputs to the high mask!");
9273
9274   // Do a half shuffle for the low mask.
9275   if (!isNoopShuffleMask(LoMask))
9276     V = DAG.getNode(X86ISD::PSHUFLW, DL, MVT::v8i16, V,
9277                     getV4X86ShuffleImm8ForMask(LoMask, DAG));
9278
9279   // Do a half shuffle with the high mask after shifting its values down.
9280   for (int &M : HiMask)
9281     if (M >= 0)
9282       M -= 4;
9283   if (!isNoopShuffleMask(HiMask))
9284     V = DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16, V,
9285                     getV4X86ShuffleImm8ForMask(HiMask, DAG));
9286
9287   return V;
9288 }
9289
9290 /// \brief Detect whether the mask pattern should be lowered through
9291 /// interleaving.
9292 ///
9293 /// This essentially tests whether viewing the mask as an interleaving of two
9294 /// sub-sequences reduces the cross-input traffic of a blend operation. If so,
9295 /// lowering it through interleaving is a significantly better strategy.
9296 static bool shouldLowerAsInterleaving(ArrayRef<int> Mask) {
9297   int NumEvenInputs[2] = {0, 0};
9298   int NumOddInputs[2] = {0, 0};
9299   int NumLoInputs[2] = {0, 0};
9300   int NumHiInputs[2] = {0, 0};
9301   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
9302     if (Mask[i] < 0)
9303       continue;
9304
9305     int InputIdx = Mask[i] >= Size;
9306
9307     if (i < Size / 2)
9308       ++NumLoInputs[InputIdx];
9309     else
9310       ++NumHiInputs[InputIdx];
9311
9312     if ((i % 2) == 0)
9313       ++NumEvenInputs[InputIdx];
9314     else
9315       ++NumOddInputs[InputIdx];
9316   }
9317
9318   // The minimum number of cross-input results for both the interleaved and
9319   // split cases. If interleaving results in fewer cross-input results, return
9320   // true.
9321   int InterleavedCrosses = std::min(NumEvenInputs[1] + NumOddInputs[0],
9322                                     NumEvenInputs[0] + NumOddInputs[1]);
9323   int SplitCrosses = std::min(NumLoInputs[1] + NumHiInputs[0],
9324                               NumLoInputs[0] + NumHiInputs[1]);
9325   return InterleavedCrosses < SplitCrosses;
9326 }
9327
9328 /// \brief Blend two v8i16 vectors using a naive unpack strategy.
9329 ///
9330 /// This strategy only works when the inputs from each vector fit into a single
9331 /// half of that vector, and generally there are not so many inputs as to leave
9332 /// the in-place shuffles required highly constrained (and thus expensive). It
9333 /// shifts all the inputs into a single side of both input vectors and then
9334 /// uses an unpack to interleave these inputs in a single vector. At that
9335 /// point, we will fall back on the generic single input shuffle lowering.
9336 static SDValue lowerV8I16BasicBlendVectorShuffle(SDLoc DL, SDValue V1,
9337                                                  SDValue V2,
9338                                                  MutableArrayRef<int> Mask,
9339                                                  const X86Subtarget *Subtarget,
9340                                                  SelectionDAG &DAG) {
9341   assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad input type!");
9342   assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad input type!");
9343   SmallVector<int, 3> LoV1Inputs, HiV1Inputs, LoV2Inputs, HiV2Inputs;
9344   for (int i = 0; i < 8; ++i)
9345     if (Mask[i] >= 0 && Mask[i] < 4)
9346       LoV1Inputs.push_back(i);
9347     else if (Mask[i] >= 4 && Mask[i] < 8)
9348       HiV1Inputs.push_back(i);
9349     else if (Mask[i] >= 8 && Mask[i] < 12)
9350       LoV2Inputs.push_back(i);
9351     else if (Mask[i] >= 12)
9352       HiV2Inputs.push_back(i);
9353
9354   int NumV1Inputs = LoV1Inputs.size() + HiV1Inputs.size();
9355   int NumV2Inputs = LoV2Inputs.size() + HiV2Inputs.size();
9356   (void)NumV1Inputs;
9357   (void)NumV2Inputs;
9358   assert(NumV1Inputs > 0 && NumV1Inputs <= 3 && "At most 3 inputs supported");
9359   assert(NumV2Inputs > 0 && NumV2Inputs <= 3 && "At most 3 inputs supported");
9360   assert(NumV1Inputs + NumV2Inputs <= 4 && "At most 4 combined inputs");
9361
9362   bool MergeFromLo = LoV1Inputs.size() + LoV2Inputs.size() >=
9363                      HiV1Inputs.size() + HiV2Inputs.size();
9364
9365   auto moveInputsToHalf = [&](SDValue V, ArrayRef<int> LoInputs,
9366                               ArrayRef<int> HiInputs, bool MoveToLo,
9367                               int MaskOffset) {
9368     ArrayRef<int> GoodInputs = MoveToLo ? LoInputs : HiInputs;
9369     ArrayRef<int> BadInputs = MoveToLo ? HiInputs : LoInputs;
9370     if (BadInputs.empty())
9371       return V;
9372
9373     int MoveMask[] = {-1, -1, -1, -1, -1, -1, -1, -1};
9374     int MoveOffset = MoveToLo ? 0 : 4;
9375
9376     if (GoodInputs.empty()) {
9377       for (int BadInput : BadInputs) {
9378         MoveMask[Mask[BadInput] % 4 + MoveOffset] = Mask[BadInput] - MaskOffset;
9379         Mask[BadInput] = Mask[BadInput] % 4 + MoveOffset + MaskOffset;
9380       }
9381     } else {
9382       if (GoodInputs.size() == 2) {
9383         // If the low inputs are spread across two dwords, pack them into
9384         // a single dword.
9385         MoveMask[MoveOffset] = Mask[GoodInputs[0]] - MaskOffset;
9386         MoveMask[MoveOffset + 1] = Mask[GoodInputs[1]] - MaskOffset;
9387         Mask[GoodInputs[0]] = MoveOffset + MaskOffset;
9388         Mask[GoodInputs[1]] = MoveOffset + 1 + MaskOffset;
9389       } else {
9390         // Otherwise pin the good inputs.
9391         for (int GoodInput : GoodInputs)
9392           MoveMask[Mask[GoodInput] - MaskOffset] = Mask[GoodInput] - MaskOffset;
9393       }
9394
9395       if (BadInputs.size() == 2) {
9396         // If we have two bad inputs then there may be either one or two good
9397         // inputs fixed in place. Find a fixed input, and then find the *other*
9398         // two adjacent indices by using modular arithmetic.
9399         int GoodMaskIdx =
9400             std::find_if(std::begin(MoveMask) + MoveOffset, std::end(MoveMask),
9401                          [](int M) { return M >= 0; }) -
9402             std::begin(MoveMask);
9403         int MoveMaskIdx =
9404             ((((GoodMaskIdx - MoveOffset) & ~1) + 2) % 4) + MoveOffset;
9405         assert(MoveMask[MoveMaskIdx] == -1 && "Expected empty slot");
9406         assert(MoveMask[MoveMaskIdx + 1] == -1 && "Expected empty slot");
9407         MoveMask[MoveMaskIdx] = Mask[BadInputs[0]] - MaskOffset;
9408         MoveMask[MoveMaskIdx + 1] = Mask[BadInputs[1]] - MaskOffset;
9409         Mask[BadInputs[0]] = MoveMaskIdx + MaskOffset;
9410         Mask[BadInputs[1]] = MoveMaskIdx + 1 + MaskOffset;
9411       } else {
9412         assert(BadInputs.size() == 1 && "All sizes handled");
9413         int MoveMaskIdx = std::find(std::begin(MoveMask) + MoveOffset,
9414                                     std::end(MoveMask), -1) -
9415                           std::begin(MoveMask);
9416         MoveMask[MoveMaskIdx] = Mask[BadInputs[0]] - MaskOffset;
9417         Mask[BadInputs[0]] = MoveMaskIdx + MaskOffset;
9418       }
9419     }
9420
9421     return DAG.getVectorShuffle(MVT::v8i16, DL, V, DAG.getUNDEF(MVT::v8i16),
9422                                 MoveMask);
9423   };
9424   V1 = moveInputsToHalf(V1, LoV1Inputs, HiV1Inputs, MergeFromLo,
9425                         /*MaskOffset*/ 0);
9426   V2 = moveInputsToHalf(V2, LoV2Inputs, HiV2Inputs, MergeFromLo,
9427                         /*MaskOffset*/ 8);
9428
9429   // FIXME: Select an interleaving of the merge of V1 and V2 that minimizes
9430   // cross-half traffic in the final shuffle.
9431
9432   // Munge the mask to be a single-input mask after the unpack merges the
9433   // results.
9434   for (int &M : Mask)
9435     if (M != -1)
9436       M = 2 * (M % 4) + (M / 8);
9437
9438   return DAG.getVectorShuffle(
9439       MVT::v8i16, DL, DAG.getNode(MergeFromLo ? X86ISD::UNPCKL : X86ISD::UNPCKH,
9440                                   DL, MVT::v8i16, V1, V2),
9441       DAG.getUNDEF(MVT::v8i16), Mask);
9442 }
9443
9444 /// \brief Generic lowering of 8-lane i16 shuffles.
9445 ///
9446 /// This handles both single-input shuffles and combined shuffle/blends with
9447 /// two inputs. The single input shuffles are immediately delegated to
9448 /// a dedicated lowering routine.
9449 ///
9450 /// The blends are lowered in one of three fundamental ways. If there are few
9451 /// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
9452 /// of the input is significantly cheaper when lowered as an interleaving of
9453 /// the two inputs, try to interleave them. Otherwise, blend the low and high
9454 /// halves of the inputs separately (making them have relatively few inputs)
9455 /// and then concatenate them.
9456 static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
9457                                        const X86Subtarget *Subtarget,
9458                                        SelectionDAG &DAG) {
9459   SDLoc DL(Op);
9460   assert(Op.getSimpleValueType() == MVT::v8i16 && "Bad shuffle type!");
9461   assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
9462   assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
9463   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
9464   ArrayRef<int> OrigMask = SVOp->getMask();
9465   int MaskStorage[8] = {OrigMask[0], OrigMask[1], OrigMask[2], OrigMask[3],
9466                         OrigMask[4], OrigMask[5], OrigMask[6], OrigMask[7]};
9467   MutableArrayRef<int> Mask(MaskStorage);
9468
9469   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
9470
9471   // Whenever we can lower this as a zext, that instruction is strictly faster
9472   // than any alternative.
9473   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
9474           DL, MVT::v8i16, V1, V2, OrigMask, Subtarget, DAG))
9475     return ZExt;
9476
9477   auto isV1 = [](int M) { return M >= 0 && M < 8; };
9478   auto isV2 = [](int M) { return M >= 8; };
9479
9480   int NumV1Inputs = std::count_if(Mask.begin(), Mask.end(), isV1);
9481   int NumV2Inputs = std::count_if(Mask.begin(), Mask.end(), isV2);
9482
9483   if (NumV2Inputs == 0)
9484     return lowerV8I16SingleInputVectorShuffle(DL, V1, Mask, Subtarget, DAG);
9485
9486   assert(NumV1Inputs > 0 && "All single-input shuffles should be canonicalized "
9487                             "to be V1-input shuffles.");
9488
9489   // Try to use bit shift instructions.
9490   if (SDValue Shift = lowerVectorShuffleAsBitShift(
9491           DL, MVT::v8i16, V1, V2, Mask, DAG))
9492     return Shift;
9493
9494   // Try to use byte shift instructions.
9495   if (SDValue Shift = lowerVectorShuffleAsByteShift(
9496           DL, MVT::v8i16, V1, V2, Mask, DAG))
9497     return Shift;
9498
9499   // There are special ways we can lower some single-element blends.
9500   if (NumV2Inputs == 1)
9501     if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v8i16, DL, V1, V2,
9502                                                          Mask, Subtarget, DAG))
9503       return V;
9504
9505   if (Subtarget->hasSSE41())
9506     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
9507                                                   Subtarget, DAG))
9508       return Blend;
9509
9510   if (SDValue Masked =
9511           lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask, DAG))
9512     return Masked;
9513
9514   // Use dedicated unpack instructions for masks that match their pattern.
9515   if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 2, 10, 3, 11))
9516     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V1, V2);
9517   if (isShuffleEquivalent(Mask, 4, 12, 5, 13, 6, 14, 7, 15))
9518     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V1, V2);
9519
9520   // Try to use byte rotation instructions.
9521   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
9522           DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
9523     return Rotate;
9524
9525   if (NumV1Inputs + NumV2Inputs <= 4)
9526     return lowerV8I16BasicBlendVectorShuffle(DL, V1, V2, Mask, Subtarget, DAG);
9527
9528   // Check whether an interleaving lowering is likely to be more efficient.
9529   // This isn't perfect but it is a strong heuristic that tends to work well on
9530   // the kinds of shuffles that show up in practice.
9531   //
9532   // FIXME: Handle 1x, 2x, and 4x interleaving.
9533   if (shouldLowerAsInterleaving(Mask)) {
9534     // FIXME: Figure out whether we should pack these into the low or high
9535     // halves.
9536
9537     int EMask[8], OMask[8];
9538     for (int i = 0; i < 4; ++i) {
9539       EMask[i] = Mask[2*i];
9540       OMask[i] = Mask[2*i + 1];
9541       EMask[i + 4] = -1;
9542       OMask[i + 4] = -1;
9543     }
9544
9545     SDValue Evens = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, EMask);
9546     SDValue Odds = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, OMask);
9547
9548     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, Evens, Odds);
9549   }
9550
9551   int LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9552   int HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9553
9554   for (int i = 0; i < 4; ++i) {
9555     LoBlendMask[i] = Mask[i];
9556     HiBlendMask[i] = Mask[i + 4];
9557   }
9558
9559   SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, LoBlendMask);
9560   SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, HiBlendMask);
9561   LoV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, LoV);
9562   HiV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, HiV);
9563
9564   return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
9565                      DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, LoV, HiV));
9566 }
9567
9568 /// \brief Check whether a compaction lowering can be done by dropping even
9569 /// elements and compute how many times even elements must be dropped.
9570 ///
9571 /// This handles shuffles which take every Nth element where N is a power of
9572 /// two. Example shuffle masks:
9573 ///
9574 ///  N = 1:  0,  2,  4,  6,  8, 10, 12, 14,  0,  2,  4,  6,  8, 10, 12, 14
9575 ///  N = 1:  0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
9576 ///  N = 2:  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12
9577 ///  N = 2:  0,  4,  8, 12, 16, 20, 24, 28,  0,  4,  8, 12, 16, 20, 24, 28
9578 ///  N = 3:  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8
9579 ///  N = 3:  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24
9580 ///
9581 /// Any of these lanes can of course be undef.
9582 ///
9583 /// This routine only supports N <= 3.
9584 /// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
9585 /// for larger N.
9586 ///
9587 /// \returns N above, or the number of times even elements must be dropped if
9588 /// there is such a number. Otherwise returns zero.
9589 static int canLowerByDroppingEvenElements(ArrayRef<int> Mask) {
9590   // Figure out whether we're looping over two inputs or just one.
9591   bool IsSingleInput = isSingleInputShuffleMask(Mask);
9592
9593   // The modulus for the shuffle vector entries is based on whether this is
9594   // a single input or not.
9595   int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
9596   assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
9597          "We should only be called with masks with a power-of-2 size!");
9598
9599   uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
9600
9601   // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
9602   // and 2^3 simultaneously. This is because we may have ambiguity with
9603   // partially undef inputs.
9604   bool ViableForN[3] = {true, true, true};
9605
9606   for (int i = 0, e = Mask.size(); i < e; ++i) {
9607     // Ignore undef lanes, we'll optimistically collapse them to the pattern we
9608     // want.
9609     if (Mask[i] == -1)
9610       continue;
9611
9612     bool IsAnyViable = false;
9613     for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
9614       if (ViableForN[j]) {
9615         uint64_t N = j + 1;
9616
9617         // The shuffle mask must be equal to (i * 2^N) % M.
9618         if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
9619           IsAnyViable = true;
9620         else
9621           ViableForN[j] = false;
9622       }
9623     // Early exit if we exhaust the possible powers of two.
9624     if (!IsAnyViable)
9625       break;
9626   }
9627
9628   for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
9629     if (ViableForN[j])
9630       return j + 1;
9631
9632   // Return 0 as there is no viable power of two.
9633   return 0;
9634 }
9635
9636 /// \brief Generic lowering of v16i8 shuffles.
9637 ///
9638 /// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
9639 /// detect any complexity reducing interleaving. If that doesn't help, it uses
9640 /// UNPCK to spread the i8 elements across two i16-element vectors, and uses
9641 /// the existing lowering for v8i16 blends on each half, finally PACK-ing them
9642 /// back together.
9643 static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
9644                                        const X86Subtarget *Subtarget,
9645                                        SelectionDAG &DAG) {
9646   SDLoc DL(Op);
9647   assert(Op.getSimpleValueType() == MVT::v16i8 && "Bad shuffle type!");
9648   assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
9649   assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
9650   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
9651   ArrayRef<int> OrigMask = SVOp->getMask();
9652   assert(OrigMask.size() == 16 && "Unexpected mask size for v16 shuffle!");
9653
9654   // Try to use bit shift instructions.
9655   if (SDValue Shift = lowerVectorShuffleAsBitShift(
9656           DL, MVT::v16i8, V1, V2, OrigMask, DAG))
9657     return Shift;
9658
9659   // Try to use byte shift instructions.
9660   if (SDValue Shift = lowerVectorShuffleAsByteShift(
9661           DL, MVT::v16i8, V1, V2, OrigMask, DAG))
9662     return Shift;
9663
9664   // Try to use byte rotation instructions.
9665   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
9666           DL, MVT::v16i8, V1, V2, OrigMask, Subtarget, DAG))
9667     return Rotate;
9668
9669   // Try to use a zext lowering.
9670   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
9671           DL, MVT::v16i8, V1, V2, OrigMask, Subtarget, DAG))
9672     return ZExt;
9673
9674   int MaskStorage[16] = {
9675       OrigMask[0],  OrigMask[1],  OrigMask[2],  OrigMask[3],
9676       OrigMask[4],  OrigMask[5],  OrigMask[6],  OrigMask[7],
9677       OrigMask[8],  OrigMask[9],  OrigMask[10], OrigMask[11],
9678       OrigMask[12], OrigMask[13], OrigMask[14], OrigMask[15]};
9679   MutableArrayRef<int> Mask(MaskStorage);
9680   MutableArrayRef<int> LoMask = Mask.slice(0, 8);
9681   MutableArrayRef<int> HiMask = Mask.slice(8, 8);
9682
9683   int NumV2Elements =
9684       std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 16; });
9685
9686   // For single-input shuffles, there are some nicer lowering tricks we can use.
9687   if (NumV2Elements == 0) {
9688     // Check for being able to broadcast a single element.
9689     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v16i8, DL, V1,
9690                                                           Mask, Subtarget, DAG))
9691       return Broadcast;
9692
9693     // Check whether we can widen this to an i16 shuffle by duplicating bytes.
9694     // Notably, this handles splat and partial-splat shuffles more efficiently.
9695     // However, it only makes sense if the pre-duplication shuffle simplifies
9696     // things significantly. Currently, this means we need to be able to
9697     // express the pre-duplication shuffle as an i16 shuffle.
9698     //
9699     // FIXME: We should check for other patterns which can be widened into an
9700     // i16 shuffle as well.
9701     auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
9702       for (int i = 0; i < 16; i += 2)
9703         if (Mask[i] != -1 && Mask[i + 1] != -1 && Mask[i] != Mask[i + 1])
9704           return false;
9705
9706       return true;
9707     };
9708     auto tryToWidenViaDuplication = [&]() -> SDValue {
9709       if (!canWidenViaDuplication(Mask))
9710         return SDValue();
9711       SmallVector<int, 4> LoInputs;
9712       std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(LoInputs),
9713                    [](int M) { return M >= 0 && M < 8; });
9714       std::sort(LoInputs.begin(), LoInputs.end());
9715       LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
9716                      LoInputs.end());
9717       SmallVector<int, 4> HiInputs;
9718       std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(HiInputs),
9719                    [](int M) { return M >= 8; });
9720       std::sort(HiInputs.begin(), HiInputs.end());
9721       HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
9722                      HiInputs.end());
9723
9724       bool TargetLo = LoInputs.size() >= HiInputs.size();
9725       ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
9726       ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
9727
9728       int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
9729       SmallDenseMap<int, int, 8> LaneMap;
9730       for (int I : InPlaceInputs) {
9731         PreDupI16Shuffle[I/2] = I/2;
9732         LaneMap[I] = I;
9733       }
9734       int j = TargetLo ? 0 : 4, je = j + 4;
9735       for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
9736         // Check if j is already a shuffle of this input. This happens when
9737         // there are two adjacent bytes after we move the low one.
9738         if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
9739           // If we haven't yet mapped the input, search for a slot into which
9740           // we can map it.
9741           while (j < je && PreDupI16Shuffle[j] != -1)
9742             ++j;
9743
9744           if (j == je)
9745             // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
9746             return SDValue();
9747
9748           // Map this input with the i16 shuffle.
9749           PreDupI16Shuffle[j] = MovingInputs[i] / 2;
9750         }
9751
9752         // Update the lane map based on the mapping we ended up with.
9753         LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
9754       }
9755       V1 = DAG.getNode(
9756           ISD::BITCAST, DL, MVT::v16i8,
9757           DAG.getVectorShuffle(MVT::v8i16, DL,
9758                                DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1),
9759                                DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
9760
9761       // Unpack the bytes to form the i16s that will be shuffled into place.
9762       V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
9763                        MVT::v16i8, V1, V1);
9764
9765       int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9766       for (int i = 0; i < 16; ++i)
9767         if (Mask[i] != -1) {
9768           int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
9769           assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
9770           if (PostDupI16Shuffle[i / 2] == -1)
9771             PostDupI16Shuffle[i / 2] = MappedMask;
9772           else
9773             assert(PostDupI16Shuffle[i / 2] == MappedMask &&
9774                    "Conflicting entrties in the original shuffle!");
9775         }
9776       return DAG.getNode(
9777           ISD::BITCAST, DL, MVT::v16i8,
9778           DAG.getVectorShuffle(MVT::v8i16, DL,
9779                                DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1),
9780                                DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
9781     };
9782     if (SDValue V = tryToWidenViaDuplication())
9783       return V;
9784   }
9785
9786   // Check whether an interleaving lowering is likely to be more efficient.
9787   // This isn't perfect but it is a strong heuristic that tends to work well on
9788   // the kinds of shuffles that show up in practice.
9789   //
9790   // FIXME: We need to handle other interleaving widths (i16, i32, ...).
9791   if (shouldLowerAsInterleaving(Mask)) {
9792     int NumLoHalf = std::count_if(Mask.begin(), Mask.end(), [](int M) {
9793       return (M >= 0 && M < 8) || (M >= 16 && M < 24);
9794     });
9795     int NumHiHalf = std::count_if(Mask.begin(), Mask.end(), [](int M) {
9796       return (M >= 8 && M < 16) || M >= 24;
9797     });
9798     int EMask[16] = {-1, -1, -1, -1, -1, -1, -1, -1,
9799                      -1, -1, -1, -1, -1, -1, -1, -1};
9800     int OMask[16] = {-1, -1, -1, -1, -1, -1, -1, -1,
9801                      -1, -1, -1, -1, -1, -1, -1, -1};
9802     bool UnpackLo = NumLoHalf >= NumHiHalf;
9803     MutableArrayRef<int> TargetEMask(UnpackLo ? EMask : EMask + 8, 8);
9804     MutableArrayRef<int> TargetOMask(UnpackLo ? OMask : OMask + 8, 8);
9805     for (int i = 0; i < 8; ++i) {
9806       TargetEMask[i] = Mask[2 * i];
9807       TargetOMask[i] = Mask[2 * i + 1];
9808     }
9809
9810     SDValue Evens = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, EMask);
9811     SDValue Odds = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, OMask);
9812
9813     return DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
9814                        MVT::v16i8, Evens, Odds);
9815   }
9816
9817   // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
9818   // with PSHUFB. It is important to do this before we attempt to generate any
9819   // blends but after all of the single-input lowerings. If the single input
9820   // lowerings can find an instruction sequence that is faster than a PSHUFB, we
9821   // want to preserve that and we can DAG combine any longer sequences into
9822   // a PSHUFB in the end. But once we start blending from multiple inputs,
9823   // the complexity of DAG combining bad patterns back into PSHUFB is too high,
9824   // and there are *very* few patterns that would actually be faster than the
9825   // PSHUFB approach because of its ability to zero lanes.
9826   //
9827   // FIXME: The only exceptions to the above are blends which are exact
9828   // interleavings with direct instructions supporting them. We currently don't
9829   // handle those well here.
9830   if (Subtarget->hasSSSE3()) {
9831     SDValue V1Mask[16];
9832     SDValue V2Mask[16];
9833     bool V1InUse = false;
9834     bool V2InUse = false;
9835     SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
9836
9837     for (int i = 0; i < 16; ++i) {
9838       if (Mask[i] == -1) {
9839         V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8);
9840       } else {
9841         const int ZeroMask = 0x80;
9842         int V1Idx = (Mask[i] < 16 ? Mask[i] : ZeroMask);
9843         int V2Idx = (Mask[i] < 16 ? ZeroMask : Mask[i] - 16);
9844         if (Zeroable[i])
9845           V1Idx = V2Idx = ZeroMask;
9846         V1Mask[i] = DAG.getConstant(V1Idx, MVT::i8);
9847         V2Mask[i] = DAG.getConstant(V2Idx, MVT::i8);
9848         V1InUse |= (ZeroMask != V1Idx);
9849         V2InUse |= (ZeroMask != V2Idx);
9850       }
9851     }
9852
9853     if (V1InUse)
9854       V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, V1,
9855                        DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V1Mask));
9856     if (V2InUse)
9857       V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, V2,
9858                        DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V2Mask));
9859
9860     // If we need shuffled inputs from both, blend the two.
9861     if (V1InUse && V2InUse)
9862       return DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
9863     if (V1InUse)
9864       return V1; // Single inputs are easy.
9865     if (V2InUse)
9866       return V2; // Single inputs are easy.
9867     // Shuffling to a zeroable vector.
9868     return getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
9869   }
9870
9871   // There are special ways we can lower some single-element blends.
9872   if (NumV2Elements == 1)
9873     if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v16i8, DL, V1, V2,
9874                                                          Mask, Subtarget, DAG))
9875       return V;
9876
9877   // Check whether a compaction lowering can be done. This handles shuffles
9878   // which take every Nth element for some even N. See the helper function for
9879   // details.
9880   //
9881   // We special case these as they can be particularly efficiently handled with
9882   // the PACKUSB instruction on x86 and they show up in common patterns of
9883   // rearranging bytes to truncate wide elements.
9884   if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask)) {
9885     // NumEvenDrops is the power of two stride of the elements. Another way of
9886     // thinking about it is that we need to drop the even elements this many
9887     // times to get the original input.
9888     bool IsSingleInput = isSingleInputShuffleMask(Mask);
9889
9890     // First we need to zero all the dropped bytes.
9891     assert(NumEvenDrops <= 3 &&
9892            "No support for dropping even elements more than 3 times.");
9893     // We use the mask type to pick which bytes are preserved based on how many
9894     // elements are dropped.
9895     MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 };
9896     SDValue ByteClearMask =
9897         DAG.getNode(ISD::BITCAST, DL, MVT::v16i8,
9898                     DAG.getConstant(0xFF, MaskVTs[NumEvenDrops - 1]));
9899     V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);
9900     if (!IsSingleInput)
9901       V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);
9902
9903     // Now pack things back together.
9904     V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1);
9905     V2 = IsSingleInput ? V1 : DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V2);
9906     SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2);
9907     for (int i = 1; i < NumEvenDrops; ++i) {
9908       Result = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, Result);
9909       Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
9910     }
9911
9912     return Result;
9913   }
9914
9915   int V1LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9916   int V1HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9917   int V2LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9918   int V2HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9919
9920   auto buildBlendMasks = [](MutableArrayRef<int> HalfMask,
9921                             MutableArrayRef<int> V1HalfBlendMask,
9922                             MutableArrayRef<int> V2HalfBlendMask) {
9923     for (int i = 0; i < 8; ++i)
9924       if (HalfMask[i] >= 0 && HalfMask[i] < 16) {
9925         V1HalfBlendMask[i] = HalfMask[i];
9926         HalfMask[i] = i;
9927       } else if (HalfMask[i] >= 16) {
9928         V2HalfBlendMask[i] = HalfMask[i] - 16;
9929         HalfMask[i] = i + 8;
9930       }
9931   };
9932   buildBlendMasks(LoMask, V1LoBlendMask, V2LoBlendMask);
9933   buildBlendMasks(HiMask, V1HiBlendMask, V2HiBlendMask);
9934
9935   SDValue Zero = getZeroVector(MVT::v8i16, Subtarget, DAG, DL);
9936
9937   auto buildLoAndHiV8s = [&](SDValue V, MutableArrayRef<int> LoBlendMask,
9938                              MutableArrayRef<int> HiBlendMask) {
9939     SDValue V1, V2;
9940     // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
9941     // them out and avoid using UNPCK{L,H} to extract the elements of V as
9942     // i16s.
9943     if (std::none_of(LoBlendMask.begin(), LoBlendMask.end(),
9944                      [](int M) { return M >= 0 && M % 2 == 1; }) &&
9945         std::none_of(HiBlendMask.begin(), HiBlendMask.end(),
9946                      [](int M) { return M >= 0 && M % 2 == 1; })) {
9947       // Use a mask to drop the high bytes.
9948       V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V);
9949       V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, V1,
9950                        DAG.getConstant(0x00FF, MVT::v8i16));
9951
9952       // This will be a single vector shuffle instead of a blend so nuke V2.
9953       V2 = DAG.getUNDEF(MVT::v8i16);
9954
9955       // Squash the masks to point directly into V1.
9956       for (int &M : LoBlendMask)
9957         if (M >= 0)
9958           M /= 2;
9959       for (int &M : HiBlendMask)
9960         if (M >= 0)
9961           M /= 2;
9962     } else {
9963       // Otherwise just unpack the low half of V into V1 and the high half into
9964       // V2 so that we can blend them as i16s.
9965       V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
9966                        DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
9967       V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
9968                        DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
9969     }
9970
9971     SDValue BlendedLo = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, LoBlendMask);
9972     SDValue BlendedHi = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, HiBlendMask);
9973     return std::make_pair(BlendedLo, BlendedHi);
9974   };
9975   SDValue V1Lo, V1Hi, V2Lo, V2Hi;
9976   std::tie(V1Lo, V1Hi) = buildLoAndHiV8s(V1, V1LoBlendMask, V1HiBlendMask);
9977   std::tie(V2Lo, V2Hi) = buildLoAndHiV8s(V2, V2LoBlendMask, V2HiBlendMask);
9978
9979   SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, V1Lo, V2Lo, LoMask);
9980   SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, V1Hi, V2Hi, HiMask);
9981
9982   return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
9983 }
9984
9985 /// \brief Dispatching routine to lower various 128-bit x86 vector shuffles.
9986 ///
9987 /// This routine breaks down the specific type of 128-bit shuffle and
9988 /// dispatches to the lowering routines accordingly.
9989 static SDValue lower128BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
9990                                         MVT VT, const X86Subtarget *Subtarget,
9991                                         SelectionDAG &DAG) {
9992   switch (VT.SimpleTy) {
9993   case MVT::v2i64:
9994     return lowerV2I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
9995   case MVT::v2f64:
9996     return lowerV2F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
9997   case MVT::v4i32:
9998     return lowerV4I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
9999   case MVT::v4f32:
10000     return lowerV4F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
10001   case MVT::v8i16:
10002     return lowerV8I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
10003   case MVT::v16i8:
10004     return lowerV16I8VectorShuffle(Op, V1, V2, Subtarget, DAG);
10005
10006   default:
10007     llvm_unreachable("Unimplemented!");
10008   }
10009 }
10010
10011 /// \brief Helper function to test whether a shuffle mask could be
10012 /// simplified by widening the elements being shuffled.
10013 ///
10014 /// Appends the mask for wider elements in WidenedMask if valid. Otherwise
10015 /// leaves it in an unspecified state.
10016 ///
10017 /// NOTE: This must handle normal vector shuffle masks and *target* vector
10018 /// shuffle masks. The latter have the special property of a '-2' representing
10019 /// a zero-ed lane of a vector.
10020 static bool canWidenShuffleElements(ArrayRef<int> Mask,
10021                                     SmallVectorImpl<int> &WidenedMask) {
10022   for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
10023     // If both elements are undef, its trivial.
10024     if (Mask[i] == SM_SentinelUndef && Mask[i + 1] == SM_SentinelUndef) {
10025       WidenedMask.push_back(SM_SentinelUndef);
10026       continue;
10027     }
10028
10029     // Check for an undef mask and a mask value properly aligned to fit with
10030     // a pair of values. If we find such a case, use the non-undef mask's value.
10031     if (Mask[i] == SM_SentinelUndef && Mask[i + 1] >= 0 && Mask[i + 1] % 2 == 1) {
10032       WidenedMask.push_back(Mask[i + 1] / 2);
10033       continue;
10034     }
10035     if (Mask[i + 1] == SM_SentinelUndef && Mask[i] >= 0 && Mask[i] % 2 == 0) {
10036       WidenedMask.push_back(Mask[i] / 2);
10037       continue;
10038     }
10039
10040     // When zeroing, we need to spread the zeroing across both lanes to widen.
10041     if (Mask[i] == SM_SentinelZero || Mask[i + 1] == SM_SentinelZero) {
10042       if ((Mask[i] == SM_SentinelZero || Mask[i] == SM_SentinelUndef) &&
10043           (Mask[i + 1] == SM_SentinelZero || Mask[i + 1] == SM_SentinelUndef)) {
10044         WidenedMask.push_back(SM_SentinelZero);
10045         continue;
10046       }
10047       return false;
10048     }
10049
10050     // Finally check if the two mask values are adjacent and aligned with
10051     // a pair.
10052     if (Mask[i] != SM_SentinelUndef && Mask[i] % 2 == 0 && Mask[i] + 1 == Mask[i + 1]) {
10053       WidenedMask.push_back(Mask[i] / 2);
10054       continue;
10055     }
10056
10057     // Otherwise we can't safely widen the elements used in this shuffle.
10058     return false;
10059   }
10060   assert(WidenedMask.size() == Mask.size() / 2 &&
10061          "Incorrect size of mask after widening the elements!");
10062
10063   return true;
10064 }
10065
10066 /// \brief Generic routine to split ector shuffle into half-sized shuffles.
10067 ///
10068 /// This routine just extracts two subvectors, shuffles them independently, and
10069 /// then concatenates them back together. This should work effectively with all
10070 /// AVX vector shuffle types.
10071 static SDValue splitAndLowerVectorShuffle(SDLoc DL, MVT VT, SDValue V1,
10072                                           SDValue V2, ArrayRef<int> Mask,
10073                                           SelectionDAG &DAG) {
10074   assert(VT.getSizeInBits() >= 256 &&
10075          "Only for 256-bit or wider vector shuffles!");
10076   assert(V1.getSimpleValueType() == VT && "Bad operand type!");
10077   assert(V2.getSimpleValueType() == VT && "Bad operand type!");
10078
10079   ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
10080   ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
10081
10082   int NumElements = VT.getVectorNumElements();
10083   int SplitNumElements = NumElements / 2;
10084   MVT ScalarVT = VT.getScalarType();
10085   MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2);
10086
10087   SDValue LoV1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V1,
10088                              DAG.getIntPtrConstant(0));
10089   SDValue HiV1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V1,
10090                              DAG.getIntPtrConstant(SplitNumElements));
10091   SDValue LoV2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V2,
10092                              DAG.getIntPtrConstant(0));
10093   SDValue HiV2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V2,
10094                              DAG.getIntPtrConstant(SplitNumElements));
10095
10096   // Now create two 4-way blends of these half-width vectors.
10097   auto HalfBlend = [&](ArrayRef<int> HalfMask) {
10098     bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
10099     SmallVector<int, 32> V1BlendMask, V2BlendMask, BlendMask;
10100     for (int i = 0; i < SplitNumElements; ++i) {
10101       int M = HalfMask[i];
10102       if (M >= NumElements) {
10103         if (M >= NumElements + SplitNumElements)
10104           UseHiV2 = true;
10105         else
10106           UseLoV2 = true;
10107         V2BlendMask.push_back(M - NumElements);
10108         V1BlendMask.push_back(-1);
10109         BlendMask.push_back(SplitNumElements + i);
10110       } else if (M >= 0) {
10111         if (M >= SplitNumElements)
10112           UseHiV1 = true;
10113         else
10114           UseLoV1 = true;
10115         V2BlendMask.push_back(-1);
10116         V1BlendMask.push_back(M);
10117         BlendMask.push_back(i);
10118       } else {
10119         V2BlendMask.push_back(-1);
10120         V1BlendMask.push_back(-1);
10121         BlendMask.push_back(-1);
10122       }
10123     }
10124
10125     // Because the lowering happens after all combining takes place, we need to
10126     // manually combine these blend masks as much as possible so that we create
10127     // a minimal number of high-level vector shuffle nodes.
10128
10129     // First try just blending the halves of V1 or V2.
10130     if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
10131       return DAG.getUNDEF(SplitVT);
10132     if (!UseLoV2 && !UseHiV2)
10133       return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
10134     if (!UseLoV1 && !UseHiV1)
10135       return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
10136
10137     SDValue V1Blend, V2Blend;
10138     if (UseLoV1 && UseHiV1) {
10139       V1Blend =
10140         DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
10141     } else {
10142       // We only use half of V1 so map the usage down into the final blend mask.
10143       V1Blend = UseLoV1 ? LoV1 : HiV1;
10144       for (int i = 0; i < SplitNumElements; ++i)
10145         if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
10146           BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
10147     }
10148     if (UseLoV2 && UseHiV2) {
10149       V2Blend =
10150         DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
10151     } else {
10152       // We only use half of V2 so map the usage down into the final blend mask.
10153       V2Blend = UseLoV2 ? LoV2 : HiV2;
10154       for (int i = 0; i < SplitNumElements; ++i)
10155         if (BlendMask[i] >= SplitNumElements)
10156           BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
10157     }
10158     return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
10159   };
10160   SDValue Lo = HalfBlend(LoMask);
10161   SDValue Hi = HalfBlend(HiMask);
10162   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
10163 }
10164
10165 /// \brief Either split a vector in halves or decompose the shuffles and the
10166 /// blend.
10167 ///
10168 /// This is provided as a good fallback for many lowerings of non-single-input
10169 /// shuffles with more than one 128-bit lane. In those cases, we want to select
10170 /// between splitting the shuffle into 128-bit components and stitching those
10171 /// back together vs. extracting the single-input shuffles and blending those
10172 /// results.
10173 static SDValue lowerVectorShuffleAsSplitOrBlend(SDLoc DL, MVT VT, SDValue V1,
10174                                                 SDValue V2, ArrayRef<int> Mask,
10175                                                 SelectionDAG &DAG) {
10176   assert(!isSingleInputShuffleMask(Mask) && "This routine must not be used to "
10177                                             "lower single-input shuffles as it "
10178                                             "could then recurse on itself.");
10179   int Size = Mask.size();
10180
10181   // If this can be modeled as a broadcast of two elements followed by a blend,
10182   // prefer that lowering. This is especially important because broadcasts can
10183   // often fold with memory operands.
10184   auto DoBothBroadcast = [&] {
10185     int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
10186     for (int M : Mask)
10187       if (M >= Size) {
10188         if (V2BroadcastIdx == -1)
10189           V2BroadcastIdx = M - Size;
10190         else if (M - Size != V2BroadcastIdx)
10191           return false;
10192       } else if (M >= 0) {
10193         if (V1BroadcastIdx == -1)
10194           V1BroadcastIdx = M;
10195         else if (M != V1BroadcastIdx)
10196           return false;
10197       }
10198     return true;
10199   };
10200   if (DoBothBroadcast())
10201     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
10202                                                       DAG);
10203
10204   // If the inputs all stem from a single 128-bit lane of each input, then we
10205   // split them rather than blending because the split will decompose to
10206   // unusually few instructions.
10207   int LaneCount = VT.getSizeInBits() / 128;
10208   int LaneSize = Size / LaneCount;
10209   SmallBitVector LaneInputs[2];
10210   LaneInputs[0].resize(LaneCount, false);
10211   LaneInputs[1].resize(LaneCount, false);
10212   for (int i = 0; i < Size; ++i)
10213     if (Mask[i] >= 0)
10214       LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
10215   if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
10216     return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
10217
10218   // Otherwise, just fall back to decomposed shuffles and a blend. This requires
10219   // that the decomposed single-input shuffles don't end up here.
10220   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
10221 }
10222
10223 /// \brief Lower a vector shuffle crossing multiple 128-bit lanes as
10224 /// a permutation and blend of those lanes.
10225 ///
10226 /// This essentially blends the out-of-lane inputs to each lane into the lane
10227 /// from a permuted copy of the vector. This lowering strategy results in four
10228 /// instructions in the worst case for a single-input cross lane shuffle which
10229 /// is lower than any other fully general cross-lane shuffle strategy I'm aware
10230 /// of. Special cases for each particular shuffle pattern should be handled
10231 /// prior to trying this lowering.
10232 static SDValue lowerVectorShuffleAsLanePermuteAndBlend(SDLoc DL, MVT VT,
10233                                                        SDValue V1, SDValue V2,
10234                                                        ArrayRef<int> Mask,
10235                                                        SelectionDAG &DAG) {
10236   // FIXME: This should probably be generalized for 512-bit vectors as well.
10237   assert(VT.getSizeInBits() == 256 && "Only for 256-bit vector shuffles!");
10238   int LaneSize = Mask.size() / 2;
10239
10240   // If there are only inputs from one 128-bit lane, splitting will in fact be
10241   // less expensive. The flags track wether the given lane contains an element
10242   // that crosses to another lane.
10243   bool LaneCrossing[2] = {false, false};
10244   for (int i = 0, Size = Mask.size(); i < Size; ++i)
10245     if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
10246       LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
10247   if (!LaneCrossing[0] || !LaneCrossing[1])
10248     return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
10249
10250   if (isSingleInputShuffleMask(Mask)) {
10251     SmallVector<int, 32> FlippedBlendMask;
10252     for (int i = 0, Size = Mask.size(); i < Size; ++i)
10253       FlippedBlendMask.push_back(
10254           Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize)
10255                                   ? Mask[i]
10256                                   : Mask[i] % LaneSize +
10257                                         (i / LaneSize) * LaneSize + Size));
10258
10259     // Flip the vector, and blend the results which should now be in-lane. The
10260     // VPERM2X128 mask uses the low 2 bits for the low source and bits 4 and
10261     // 5 for the high source. The value 3 selects the high half of source 2 and
10262     // the value 2 selects the low half of source 2. We only use source 2 to
10263     // allow folding it into a memory operand.
10264     unsigned PERMMask = 3 | 2 << 4;
10265     SDValue Flipped = DAG.getNode(X86ISD::VPERM2X128, DL, VT, DAG.getUNDEF(VT),
10266                                   V1, DAG.getConstant(PERMMask, MVT::i8));
10267     return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask);
10268   }
10269
10270   // This now reduces to two single-input shuffles of V1 and V2 which at worst
10271   // will be handled by the above logic and a blend of the results, much like
10272   // other patterns in AVX.
10273   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
10274 }
10275
10276 /// \brief Handle lowering 2-lane 128-bit shuffles.
10277 static SDValue lowerV2X128VectorShuffle(SDLoc DL, MVT VT, SDValue V1,
10278                                         SDValue V2, ArrayRef<int> Mask,
10279                                         const X86Subtarget *Subtarget,
10280                                         SelectionDAG &DAG) {
10281   // Blends are faster and handle all the non-lane-crossing cases.
10282   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
10283                                                 Subtarget, DAG))
10284     return Blend;
10285
10286   MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
10287                                VT.getVectorNumElements() / 2);
10288   // Check for patterns which can be matched with a single insert of a 128-bit
10289   // subvector.
10290   if (isShuffleEquivalent(Mask, 0, 1, 0, 1) ||
10291       isShuffleEquivalent(Mask, 0, 1, 4, 5)) {
10292     SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
10293                               DAG.getIntPtrConstant(0));
10294     SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
10295                               Mask[2] < 4 ? V1 : V2, DAG.getIntPtrConstant(0));
10296     return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
10297   }
10298   if (isShuffleEquivalent(Mask, 0, 1, 6, 7)) {
10299     SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
10300                               DAG.getIntPtrConstant(0));
10301     SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
10302                               DAG.getIntPtrConstant(2));
10303     return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
10304   }
10305
10306   // Otherwise form a 128-bit permutation.
10307   // FIXME: Detect zero-vector inputs and use the VPERM2X128 to zero that half.
10308   unsigned PermMask = Mask[0] / 2 | (Mask[2] / 2) << 4;
10309   return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
10310                      DAG.getConstant(PermMask, MVT::i8));
10311 }
10312
10313 /// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then
10314 /// shuffling each lane.
10315 ///
10316 /// This will only succeed when the result of fixing the 128-bit lanes results
10317 /// in a single-input non-lane-crossing shuffle with a repeating shuffle mask in
10318 /// each 128-bit lanes. This handles many cases where we can quickly blend away
10319 /// the lane crosses early and then use simpler shuffles within each lane.
10320 ///
10321 /// FIXME: It might be worthwhile at some point to support this without
10322 /// requiring the 128-bit lane-relative shuffles to be repeating, but currently
10323 /// in x86 only floating point has interesting non-repeating shuffles, and even
10324 /// those are still *marginally* more expensive.
10325 static SDValue lowerVectorShuffleByMerging128BitLanes(
10326     SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
10327     const X86Subtarget *Subtarget, SelectionDAG &DAG) {
10328   assert(!isSingleInputShuffleMask(Mask) &&
10329          "This is only useful with multiple inputs.");
10330
10331   int Size = Mask.size();
10332   int LaneSize = 128 / VT.getScalarSizeInBits();
10333   int NumLanes = Size / LaneSize;
10334   assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles.");
10335
10336   // See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also
10337   // check whether the in-128-bit lane shuffles share a repeating pattern.
10338   SmallVector<int, 4> Lanes;
10339   Lanes.resize(NumLanes, -1);
10340   SmallVector<int, 4> InLaneMask;
10341   InLaneMask.resize(LaneSize, -1);
10342   for (int i = 0; i < Size; ++i) {
10343     if (Mask[i] < 0)
10344       continue;
10345
10346     int j = i / LaneSize;
10347
10348     if (Lanes[j] < 0) {
10349       // First entry we've seen for this lane.
10350       Lanes[j] = Mask[i] / LaneSize;
10351     } else if (Lanes[j] != Mask[i] / LaneSize) {
10352       // This doesn't match the lane selected previously!
10353       return SDValue();
10354     }
10355
10356     // Check that within each lane we have a consistent shuffle mask.
10357     int k = i % LaneSize;
10358     if (InLaneMask[k] < 0) {
10359       InLaneMask[k] = Mask[i] % LaneSize;
10360     } else if (InLaneMask[k] != Mask[i] % LaneSize) {
10361       // This doesn't fit a repeating in-lane mask.
10362       return SDValue();
10363     }
10364   }
10365
10366   // First shuffle the lanes into place.
10367   MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64,
10368                                 VT.getSizeInBits() / 64);
10369   SmallVector<int, 8> LaneMask;
10370   LaneMask.resize(NumLanes * 2, -1);
10371   for (int i = 0; i < NumLanes; ++i)
10372     if (Lanes[i] >= 0) {
10373       LaneMask[2 * i + 0] = 2*Lanes[i] + 0;
10374       LaneMask[2 * i + 1] = 2*Lanes[i] + 1;
10375     }
10376
10377   V1 = DAG.getNode(ISD::BITCAST, DL, LaneVT, V1);
10378   V2 = DAG.getNode(ISD::BITCAST, DL, LaneVT, V2);
10379   SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask);
10380
10381   // Cast it back to the type we actually want.
10382   LaneShuffle = DAG.getNode(ISD::BITCAST, DL, VT, LaneShuffle);
10383
10384   // Now do a simple shuffle that isn't lane crossing.
10385   SmallVector<int, 8> NewMask;
10386   NewMask.resize(Size, -1);
10387   for (int i = 0; i < Size; ++i)
10388     if (Mask[i] >= 0)
10389       NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize;
10390   assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) &&
10391          "Must not introduce lane crosses at this point!");
10392
10393   return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask);
10394 }
10395
10396 /// \brief Test whether the specified input (0 or 1) is in-place blended by the
10397 /// given mask.
10398 ///
10399 /// This returns true if the elements from a particular input are already in the
10400 /// slot required by the given mask and require no permutation.
10401 static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
10402   assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
10403   int Size = Mask.size();
10404   for (int i = 0; i < Size; ++i)
10405     if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
10406       return false;
10407
10408   return true;
10409 }
10410
10411 /// \brief Handle lowering of 4-lane 64-bit floating point shuffles.
10412 ///
10413 /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
10414 /// isn't available.
10415 static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10416                                        const X86Subtarget *Subtarget,
10417                                        SelectionDAG &DAG) {
10418   SDLoc DL(Op);
10419   assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
10420   assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
10421   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10422   ArrayRef<int> Mask = SVOp->getMask();
10423   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
10424
10425   SmallVector<int, 4> WidenedMask;
10426   if (canWidenShuffleElements(Mask, WidenedMask))
10427     return lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask, Subtarget,
10428                                     DAG);
10429
10430   if (isSingleInputShuffleMask(Mask)) {
10431     // Check for being able to broadcast a single element.
10432     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4f64, DL, V1,
10433                                                           Mask, Subtarget, DAG))
10434       return Broadcast;
10435
10436     // Use low duplicate instructions for masks that match their pattern.
10437     if (isShuffleEquivalent(Mask, 0, 0, 2, 2))
10438       return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
10439
10440     if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
10441       // Non-half-crossing single input shuffles can be lowerid with an
10442       // interleaved permutation.
10443       unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
10444                               ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
10445       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
10446                          DAG.getConstant(VPERMILPMask, MVT::i8));
10447     }
10448
10449     // With AVX2 we have direct support for this permutation.
10450     if (Subtarget->hasAVX2())
10451       return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
10452                          getV4X86ShuffleImm8ForMask(Mask, DAG));
10453
10454     // Otherwise, fall back.
10455     return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask,
10456                                                    DAG);
10457   }
10458
10459   // X86 has dedicated unpack instructions that can handle specific blend
10460   // operations: UNPCKH and UNPCKL.
10461   if (isShuffleEquivalent(Mask, 0, 4, 2, 6))
10462     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f64, V1, V2);
10463   if (isShuffleEquivalent(Mask, 1, 5, 3, 7))
10464     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f64, V1, V2);
10465
10466   // If we have a single input to the zero element, insert that into V1 if we
10467   // can do so cheaply.
10468   int NumV2Elements =
10469       std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
10470   if (NumV2Elements == 1 && Mask[0] >= 4)
10471     if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
10472             MVT::v4f64, DL, V1, V2, Mask, Subtarget, DAG))
10473       return Insertion;
10474
10475   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
10476                                                 Subtarget, DAG))
10477     return Blend;
10478
10479   // Check if the blend happens to exactly fit that of SHUFPD.
10480   if ((Mask[0] == -1 || Mask[0] < 2) &&
10481       (Mask[1] == -1 || (Mask[1] >= 4 && Mask[1] < 6)) &&
10482       (Mask[2] == -1 || (Mask[2] >= 2 && Mask[2] < 4)) &&
10483       (Mask[3] == -1 || Mask[3] >= 6)) {
10484     unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 5) << 1) |
10485                           ((Mask[2] == 3) << 2) | ((Mask[3] == 7) << 3);
10486     return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f64, V1, V2,
10487                        DAG.getConstant(SHUFPDMask, MVT::i8));
10488   }
10489   if ((Mask[0] == -1 || (Mask[0] >= 4 && Mask[0] < 6)) &&
10490       (Mask[1] == -1 || Mask[1] < 2) &&
10491       (Mask[2] == -1 || Mask[2] >= 6) &&
10492       (Mask[3] == -1 || (Mask[3] >= 2 && Mask[3] < 4))) {
10493     unsigned SHUFPDMask = (Mask[0] == 5) | ((Mask[1] == 1) << 1) |
10494                           ((Mask[2] == 7) << 2) | ((Mask[3] == 3) << 3);
10495     return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f64, V2, V1,
10496                        DAG.getConstant(SHUFPDMask, MVT::i8));
10497   }
10498
10499   // Try to simplify this by merging 128-bit lanes to enable a lane-based
10500   // shuffle. However, if we have AVX2 and either inputs are already in place,
10501   // we will be able to shuffle even across lanes the other input in a single
10502   // instruction so skip this pattern.
10503   if (!(Subtarget->hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
10504                                  isShuffleMaskInputInPlace(1, Mask))))
10505     if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
10506             DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
10507       return Result;
10508
10509   // If we have AVX2 then we always want to lower with a blend because at v4 we
10510   // can fully permute the elements.
10511   if (Subtarget->hasAVX2())
10512     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,
10513                                                       Mask, DAG);
10514
10515   // Otherwise fall back on generic lowering.
10516   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG);
10517 }
10518
10519 /// \brief Handle lowering of 4-lane 64-bit integer shuffles.
10520 ///
10521 /// This routine is only called when we have AVX2 and thus a reasonable
10522 /// instruction set for v4i64 shuffling..
10523 static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10524                                        const X86Subtarget *Subtarget,
10525                                        SelectionDAG &DAG) {
10526   SDLoc DL(Op);
10527   assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
10528   assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
10529   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10530   ArrayRef<int> Mask = SVOp->getMask();
10531   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
10532   assert(Subtarget->hasAVX2() && "We can only lower v4i64 with AVX2!");
10533
10534   SmallVector<int, 4> WidenedMask;
10535   if (canWidenShuffleElements(Mask, WidenedMask))
10536     return lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask, Subtarget,
10537                                     DAG);
10538
10539   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
10540                                                 Subtarget, DAG))
10541     return Blend;
10542
10543   // Check for being able to broadcast a single element.
10544   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4i64, DL, V1,
10545                                                         Mask, Subtarget, DAG))
10546     return Broadcast;
10547
10548   // When the shuffle is mirrored between the 128-bit lanes of the unit, we can
10549   // use lower latency instructions that will operate on both 128-bit lanes.
10550   SmallVector<int, 2> RepeatedMask;
10551   if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
10552     if (isSingleInputShuffleMask(Mask)) {
10553       int PSHUFDMask[] = {-1, -1, -1, -1};
10554       for (int i = 0; i < 2; ++i)
10555         if (RepeatedMask[i] >= 0) {
10556           PSHUFDMask[2 * i] = 2 * RepeatedMask[i];
10557           PSHUFDMask[2 * i + 1] = 2 * RepeatedMask[i] + 1;
10558         }
10559       return DAG.getNode(
10560           ISD::BITCAST, DL, MVT::v4i64,
10561           DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
10562                       DAG.getNode(ISD::BITCAST, DL, MVT::v8i32, V1),
10563                       getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
10564     }
10565
10566     // Use dedicated unpack instructions for masks that match their pattern.
10567     if (isShuffleEquivalent(Mask, 0, 4, 2, 6))
10568       return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V1, V2);
10569     if (isShuffleEquivalent(Mask, 1, 5, 3, 7))
10570       return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V1, V2);
10571   }
10572
10573   // AVX2 provides a direct instruction for permuting a single input across
10574   // lanes.
10575   if (isSingleInputShuffleMask(Mask))
10576     return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
10577                        getV4X86ShuffleImm8ForMask(Mask, DAG));
10578
10579   // Try to simplify this by merging 128-bit lanes to enable a lane-based
10580   // shuffle. However, if we have AVX2 and either inputs are already in place,
10581   // we will be able to shuffle even across lanes the other input in a single
10582   // instruction so skip this pattern.
10583   if (!(Subtarget->hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
10584                                  isShuffleMaskInputInPlace(1, Mask))))
10585     if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
10586             DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
10587       return Result;
10588
10589   // Otherwise fall back on generic blend lowering.
10590   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2,
10591                                                     Mask, DAG);
10592 }
10593
10594 /// \brief Handle lowering of 8-lane 32-bit floating point shuffles.
10595 ///
10596 /// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
10597 /// isn't available.
10598 static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10599                                        const X86Subtarget *Subtarget,
10600                                        SelectionDAG &DAG) {
10601   SDLoc DL(Op);
10602   assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
10603   assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
10604   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10605   ArrayRef<int> Mask = SVOp->getMask();
10606   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
10607
10608   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
10609                                                 Subtarget, DAG))
10610     return Blend;
10611
10612   // Check for being able to broadcast a single element.
10613   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8f32, DL, V1,
10614                                                         Mask, Subtarget, DAG))
10615     return Broadcast;
10616
10617   // If the shuffle mask is repeated in each 128-bit lane, we have many more
10618   // options to efficiently lower the shuffle.
10619   SmallVector<int, 4> RepeatedMask;
10620   if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
10621     assert(RepeatedMask.size() == 4 &&
10622            "Repeated masks must be half the mask width!");
10623
10624     // Use even/odd duplicate instructions for masks that match their pattern.
10625     if (isShuffleEquivalent(Mask, 0, 0, 2, 2, 4, 4, 6, 6))
10626       return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
10627     if (isShuffleEquivalent(Mask, 1, 1, 3, 3, 5, 5, 7, 7))
10628       return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
10629
10630     if (isSingleInputShuffleMask(Mask))
10631       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
10632                          getV4X86ShuffleImm8ForMask(RepeatedMask, DAG));
10633
10634     // Use dedicated unpack instructions for masks that match their pattern.
10635     if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 4, 12, 5, 13))
10636       return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f32, V1, V2);
10637     if (isShuffleEquivalent(Mask, 2, 10, 3, 11, 6, 14, 7, 15))
10638       return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f32, V1, V2);
10639
10640     // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
10641     // have already handled any direct blends. We also need to squash the
10642     // repeated mask into a simulated v4f32 mask.
10643     for (int i = 0; i < 4; ++i)
10644       if (RepeatedMask[i] >= 8)
10645         RepeatedMask[i] -= 4;
10646     return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
10647   }
10648
10649   // If we have a single input shuffle with different shuffle patterns in the
10650   // two 128-bit lanes use the variable mask to VPERMILPS.
10651   if (isSingleInputShuffleMask(Mask)) {
10652     SDValue VPermMask[8];
10653     for (int i = 0; i < 8; ++i)
10654       VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32)
10655                                  : DAG.getConstant(Mask[i], MVT::i32);
10656     if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))
10657       return DAG.getNode(
10658           X86ISD::VPERMILPV, DL, MVT::v8f32, V1,
10659           DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask));
10660
10661     if (Subtarget->hasAVX2())
10662       return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32,
10663                          DAG.getNode(ISD::BITCAST, DL, MVT::v8f32,
10664                                      DAG.getNode(ISD::BUILD_VECTOR, DL,
10665                                                  MVT::v8i32, VPermMask)),
10666                          V1);
10667
10668     // Otherwise, fall back.
10669     return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
10670                                                    DAG);
10671   }
10672
10673   // Try to simplify this by merging 128-bit lanes to enable a lane-based
10674   // shuffle.
10675   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
10676           DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
10677     return Result;
10678
10679   // If we have AVX2 then we always want to lower with a blend because at v8 we
10680   // can fully permute the elements.
10681   if (Subtarget->hasAVX2())
10682     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
10683                                                       Mask, DAG);
10684
10685   // Otherwise fall back on generic lowering.
10686   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG);
10687 }
10688
10689 /// \brief Handle lowering of 8-lane 32-bit integer shuffles.
10690 ///
10691 /// This routine is only called when we have AVX2 and thus a reasonable
10692 /// instruction set for v8i32 shuffling..
10693 static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10694                                        const X86Subtarget *Subtarget,
10695                                        SelectionDAG &DAG) {
10696   SDLoc DL(Op);
10697   assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
10698   assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
10699   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10700   ArrayRef<int> Mask = SVOp->getMask();
10701   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
10702   assert(Subtarget->hasAVX2() && "We can only lower v8i32 with AVX2!");
10703
10704   // Whenever we can lower this as a zext, that instruction is strictly faster
10705   // than any alternative. It also allows us to fold memory operands into the
10706   // shuffle in many cases.
10707   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2,
10708                                                          Mask, Subtarget, DAG))
10709     return ZExt;
10710
10711   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
10712                                                 Subtarget, DAG))
10713     return Blend;
10714
10715   // Check for being able to broadcast a single element.
10716   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8i32, DL, V1,
10717                                                         Mask, Subtarget, DAG))
10718     return Broadcast;
10719
10720   // If the shuffle mask is repeated in each 128-bit lane we can use more
10721   // efficient instructions that mirror the shuffles across the two 128-bit
10722   // lanes.
10723   SmallVector<int, 4> RepeatedMask;
10724   if (is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask)) {
10725     assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
10726     if (isSingleInputShuffleMask(Mask))
10727       return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
10728                          getV4X86ShuffleImm8ForMask(RepeatedMask, DAG));
10729
10730     // Use dedicated unpack instructions for masks that match their pattern.
10731     if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 4, 12, 5, 13))
10732       return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i32, V1, V2);
10733     if (isShuffleEquivalent(Mask, 2, 10, 3, 11, 6, 14, 7, 15))
10734       return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i32, V1, V2);
10735   }
10736
10737   // If the shuffle patterns aren't repeated but it is a single input, directly
10738   // generate a cross-lane VPERMD instruction.
10739   if (isSingleInputShuffleMask(Mask)) {
10740     SDValue VPermMask[8];
10741     for (int i = 0; i < 8; ++i)
10742       VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32)
10743                                  : DAG.getConstant(Mask[i], MVT::i32);
10744     return DAG.getNode(
10745         X86ISD::VPERMV, DL, MVT::v8i32,
10746         DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask), V1);
10747   }
10748
10749   // Try to use bit shift instructions.
10750   if (SDValue Shift = lowerVectorShuffleAsBitShift(
10751           DL, MVT::v8i32, V1, V2, Mask, DAG))
10752     return Shift;
10753
10754   // Try to simplify this by merging 128-bit lanes to enable a lane-based
10755   // shuffle.
10756   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
10757           DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
10758     return Result;
10759
10760   // Otherwise fall back on generic blend lowering.
10761   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2,
10762                                                     Mask, DAG);
10763 }
10764
10765 /// \brief Handle lowering of 16-lane 16-bit integer shuffles.
10766 ///
10767 /// This routine is only called when we have AVX2 and thus a reasonable
10768 /// instruction set for v16i16 shuffling..
10769 static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10770                                         const X86Subtarget *Subtarget,
10771                                         SelectionDAG &DAG) {
10772   SDLoc DL(Op);
10773   assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
10774   assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
10775   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10776   ArrayRef<int> Mask = SVOp->getMask();
10777   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
10778   assert(Subtarget->hasAVX2() && "We can only lower v16i16 with AVX2!");
10779
10780   // Whenever we can lower this as a zext, that instruction is strictly faster
10781   // than any alternative. It also allows us to fold memory operands into the
10782   // shuffle in many cases.
10783   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v16i16, V1, V2,
10784                                                          Mask, Subtarget, DAG))
10785     return ZExt;
10786
10787   // Check for being able to broadcast a single element.
10788   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v16i16, DL, V1,
10789                                                         Mask, Subtarget, DAG))
10790     return Broadcast;
10791
10792   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
10793                                                 Subtarget, DAG))
10794     return Blend;
10795
10796   // Use dedicated unpack instructions for masks that match their pattern.
10797   if (isShuffleEquivalent(Mask,
10798                           // First 128-bit lane:
10799                           0, 16, 1, 17, 2, 18, 3, 19,
10800                           // Second 128-bit lane:
10801                           8, 24, 9, 25, 10, 26, 11, 27))
10802     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i16, V1, V2);
10803   if (isShuffleEquivalent(Mask,
10804                           // First 128-bit lane:
10805                           4, 20, 5, 21, 6, 22, 7, 23,
10806                           // Second 128-bit lane:
10807                           12, 28, 13, 29, 14, 30, 15, 31))
10808     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i16, V1, V2);
10809
10810   if (isSingleInputShuffleMask(Mask)) {
10811     // There are no generalized cross-lane shuffle operations available on i16
10812     // element types.
10813     if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
10814       return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
10815                                                      Mask, DAG);
10816
10817     SDValue PSHUFBMask[32];
10818     for (int i = 0; i < 16; ++i) {
10819       if (Mask[i] == -1) {
10820         PSHUFBMask[2 * i] = PSHUFBMask[2 * i + 1] = DAG.getUNDEF(MVT::i8);
10821         continue;
10822       }
10823
10824       int M = i < 8 ? Mask[i] : Mask[i] - 8;
10825       assert(M >= 0 && M < 8 && "Invalid single-input mask!");
10826       PSHUFBMask[2 * i] = DAG.getConstant(2 * M, MVT::i8);
10827       PSHUFBMask[2 * i + 1] = DAG.getConstant(2 * M + 1, MVT::i8);
10828     }
10829     return DAG.getNode(
10830         ISD::BITCAST, DL, MVT::v16i16,
10831         DAG.getNode(
10832             X86ISD::PSHUFB, DL, MVT::v32i8,
10833             DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V1),
10834             DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask)));
10835   }
10836
10837   // Try to use bit shift instructions.
10838   if (SDValue Shift = lowerVectorShuffleAsBitShift(
10839           DL, MVT::v16i16, V1, V2, Mask, DAG))
10840     return Shift;
10841
10842   // Try to simplify this by merging 128-bit lanes to enable a lane-based
10843   // shuffle.
10844   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
10845           DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
10846     return Result;
10847
10848   // Otherwise fall back on generic lowering.
10849   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG);
10850 }
10851
10852 /// \brief Handle lowering of 32-lane 8-bit integer shuffles.
10853 ///
10854 /// This routine is only called when we have AVX2 and thus a reasonable
10855 /// instruction set for v32i8 shuffling..
10856 static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10857                                        const X86Subtarget *Subtarget,
10858                                        SelectionDAG &DAG) {
10859   SDLoc DL(Op);
10860   assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
10861   assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
10862   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10863   ArrayRef<int> Mask = SVOp->getMask();
10864   assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
10865   assert(Subtarget->hasAVX2() && "We can only lower v32i8 with AVX2!");
10866
10867   // Whenever we can lower this as a zext, that instruction is strictly faster
10868   // than any alternative. It also allows us to fold memory operands into the
10869   // shuffle in many cases.
10870   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2,
10871                                                          Mask, Subtarget, DAG))
10872     return ZExt;
10873
10874   // Check for being able to broadcast a single element.
10875   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v32i8, DL, V1,
10876                                                         Mask, Subtarget, DAG))
10877     return Broadcast;
10878
10879   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
10880                                                 Subtarget, DAG))
10881     return Blend;
10882
10883   // Use dedicated unpack instructions for masks that match their pattern.
10884   // Note that these are repeated 128-bit lane unpacks, not unpacks across all
10885   // 256-bit lanes.
10886   if (isShuffleEquivalent(
10887           Mask,
10888           // First 128-bit lane:
10889           0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39,
10890           // Second 128-bit lane:
10891           16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55))
10892     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v32i8, V1, V2);
10893   if (isShuffleEquivalent(
10894           Mask,
10895           // First 128-bit lane:
10896           8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47,
10897           // Second 128-bit lane:
10898           24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63))
10899     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v32i8, V1, V2);
10900
10901   if (isSingleInputShuffleMask(Mask)) {
10902     // There are no generalized cross-lane shuffle operations available on i8
10903     // element types.
10904     if (is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
10905       return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2,
10906                                                      Mask, DAG);
10907
10908     SDValue PSHUFBMask[32];
10909     for (int i = 0; i < 32; ++i)
10910       PSHUFBMask[i] =
10911           Mask[i] < 0
10912               ? DAG.getUNDEF(MVT::i8)
10913               : DAG.getConstant(Mask[i] < 16 ? Mask[i] : Mask[i] - 16, MVT::i8);
10914
10915     return DAG.getNode(
10916         X86ISD::PSHUFB, DL, MVT::v32i8, V1,
10917         DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask));
10918   }
10919
10920   // Try to use bit shift instructions.
10921   if (SDValue Shift = lowerVectorShuffleAsBitShift(
10922           DL, MVT::v32i8, V1, V2, Mask, DAG))
10923     return Shift;
10924
10925   // Try to simplify this by merging 128-bit lanes to enable a lane-based
10926   // shuffle.
10927   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
10928           DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
10929     return Result;
10930
10931   // Otherwise fall back on generic lowering.
10932   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG);
10933 }
10934
10935 /// \brief High-level routine to lower various 256-bit x86 vector shuffles.
10936 ///
10937 /// This routine either breaks down the specific type of a 256-bit x86 vector
10938 /// shuffle or splits it into two 128-bit shuffles and fuses the results back
10939 /// together based on the available instructions.
10940 static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10941                                         MVT VT, const X86Subtarget *Subtarget,
10942                                         SelectionDAG &DAG) {
10943   SDLoc DL(Op);
10944   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10945   ArrayRef<int> Mask = SVOp->getMask();
10946
10947   // There is a really nice hard cut-over between AVX1 and AVX2 that means we can
10948   // check for those subtargets here and avoid much of the subtarget querying in
10949   // the per-vector-type lowering routines. With AVX1 we have essentially *zero*
10950   // ability to manipulate a 256-bit vector with integer types. Since we'll use
10951   // floating point types there eventually, just immediately cast everything to
10952   // a float and operate entirely in that domain.
10953   if (VT.isInteger() && !Subtarget->hasAVX2()) {
10954     int ElementBits = VT.getScalarSizeInBits();
10955     if (ElementBits < 32)
10956       // No floating point type available, decompose into 128-bit vectors.
10957       return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
10958
10959     MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
10960                                 VT.getVectorNumElements());
10961     V1 = DAG.getNode(ISD::BITCAST, DL, FpVT, V1);
10962     V2 = DAG.getNode(ISD::BITCAST, DL, FpVT, V2);
10963     return DAG.getNode(ISD::BITCAST, DL, VT,
10964                        DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
10965   }
10966
10967   switch (VT.SimpleTy) {
10968   case MVT::v4f64:
10969     return lowerV4F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
10970   case MVT::v4i64:
10971     return lowerV4I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
10972   case MVT::v8f32:
10973     return lowerV8F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
10974   case MVT::v8i32:
10975     return lowerV8I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
10976   case MVT::v16i16:
10977     return lowerV16I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
10978   case MVT::v32i8:
10979     return lowerV32I8VectorShuffle(Op, V1, V2, Subtarget, DAG);
10980
10981   default:
10982     llvm_unreachable("Not a valid 256-bit x86 vector type!");
10983   }
10984 }
10985
10986 /// \brief Handle lowering of 8-lane 64-bit floating point shuffles.
10987 static SDValue lowerV8F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10988                                        const X86Subtarget *Subtarget,
10989                                        SelectionDAG &DAG) {
10990   SDLoc DL(Op);
10991   assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
10992   assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
10993   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10994   ArrayRef<int> Mask = SVOp->getMask();
10995   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
10996
10997   // X86 has dedicated unpack instructions that can handle specific blend
10998   // operations: UNPCKH and UNPCKL.
10999   if (isShuffleEquivalent(Mask, 0, 8, 2, 10, 4, 12, 6, 14))
11000     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f64, V1, V2);
11001   if (isShuffleEquivalent(Mask, 1, 9, 3, 11, 5, 13, 7, 15))
11002     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f64, V1, V2);
11003
11004   // FIXME: Implement direct support for this type!
11005   return splitAndLowerVectorShuffle(DL, MVT::v8f64, V1, V2, Mask, DAG);
11006 }
11007
11008 /// \brief Handle lowering of 16-lane 32-bit floating point shuffles.
11009 static SDValue lowerV16F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
11010                                        const X86Subtarget *Subtarget,
11011                                        SelectionDAG &DAG) {
11012   SDLoc DL(Op);
11013   assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
11014   assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
11015   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
11016   ArrayRef<int> Mask = SVOp->getMask();
11017   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
11018
11019   // Use dedicated unpack instructions for masks that match their pattern.
11020   if (isShuffleEquivalent(Mask,
11021                           0, 16, 1, 17, 4, 20, 5, 21,
11022                           8, 24, 9, 25, 12, 28, 13, 29))
11023     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16f32, V1, V2);
11024   if (isShuffleEquivalent(Mask,
11025                           2, 18, 3, 19, 6, 22, 7, 23,
11026                           10, 26, 11, 27, 14, 30, 15, 31))
11027     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16f32, V1, V2);
11028
11029   // FIXME: Implement direct support for this type!
11030   return splitAndLowerVectorShuffle(DL, MVT::v16f32, V1, V2, Mask, DAG);
11031 }
11032
11033 /// \brief Handle lowering of 8-lane 64-bit integer shuffles.
11034 static SDValue lowerV8I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
11035                                        const X86Subtarget *Subtarget,
11036                                        SelectionDAG &DAG) {
11037   SDLoc DL(Op);
11038   assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
11039   assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
11040   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
11041   ArrayRef<int> Mask = SVOp->getMask();
11042   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
11043
11044   // X86 has dedicated unpack instructions that can handle specific blend
11045   // operations: UNPCKH and UNPCKL.
11046   if (isShuffleEquivalent(Mask, 0, 8, 2, 10, 4, 12, 6, 14))
11047     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i64, V1, V2);
11048   if (isShuffleEquivalent(Mask, 1, 9, 3, 11, 5, 13, 7, 15))
11049     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i64, V1, V2);
11050
11051   // FIXME: Implement direct support for this type!
11052   return splitAndLowerVectorShuffle(DL, MVT::v8i64, V1, V2, Mask, DAG);
11053 }
11054
11055 /// \brief Handle lowering of 16-lane 32-bit integer shuffles.
11056 static SDValue lowerV16I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
11057                                        const X86Subtarget *Subtarget,
11058                                        SelectionDAG &DAG) {
11059   SDLoc DL(Op);
11060   assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
11061   assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
11062   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
11063   ArrayRef<int> Mask = SVOp->getMask();
11064   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
11065
11066   // Use dedicated unpack instructions for masks that match their pattern.
11067   if (isShuffleEquivalent(Mask,
11068                           0, 16, 1, 17, 4, 20, 5, 21,
11069                           8, 24, 9, 25, 12, 28, 13, 29))
11070     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i32, V1, V2);
11071   if (isShuffleEquivalent(Mask,
11072                           2, 18, 3, 19, 6, 22, 7, 23,
11073                           10, 26, 11, 27, 14, 30, 15, 31))
11074     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i32, V1, V2);
11075
11076   // FIXME: Implement direct support for this type!
11077   return splitAndLowerVectorShuffle(DL, MVT::v16i32, V1, V2, Mask, DAG);
11078 }
11079
11080 /// \brief Handle lowering of 32-lane 16-bit integer shuffles.
11081 static SDValue lowerV32I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
11082                                         const X86Subtarget *Subtarget,
11083                                         SelectionDAG &DAG) {
11084   SDLoc DL(Op);
11085   assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
11086   assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
11087   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
11088   ArrayRef<int> Mask = SVOp->getMask();
11089   assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
11090   assert(Subtarget->hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
11091
11092   // FIXME: Implement direct support for this type!
11093   return splitAndLowerVectorShuffle(DL, MVT::v32i16, V1, V2, Mask, DAG);
11094 }
11095
11096 /// \brief Handle lowering of 64-lane 8-bit integer shuffles.
      ///
      /// Both operands must be v64i8, the mask of \p Op must have 64 entries, and
      /// the subtarget must have BWI; all of these are only checked by assertions.
      /// There is no v64i8-specific lowering yet, so every shuffle is handed to
      /// splitAndLowerVectorShuffle.
11097 static SDValue lowerV64I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
11098                                        const X86Subtarget *Subtarget,
11099                                        SelectionDAG &DAG) {
11100   SDLoc DL(Op);
11101   assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
11102   assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
11103   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
11104   ArrayRef<int> Mask = SVOp->getMask();
11105   assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
11106   assert(Subtarget->hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
11107
11108   // FIXME: Implement direct support for this type!
11109   return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
11110 }
11111
11112 /// \brief High-level routine to lower various 512-bit x86 vector shuffles.
11113 ///
11114 /// This routine either breaks down the specific type of a 512-bit x86 vector
11115 /// shuffle or splits it into two 256-bit shuffles and fuses the results back
11116 /// together based on the available instructions.
      ///
      /// \p VT must be one of the 512-bit vector types listed in the switch below;
      /// any other type hits llvm_unreachable. AVX-512 availability is only checked
      /// by an assertion.
11117 static SDValue lower512BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
11118                                         MVT VT, const X86Subtarget *Subtarget,
11119                                         SelectionDAG &DAG) {
11120   SDLoc DL(Op);
11121   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
11122   ArrayRef<int> Mask = SVOp->getMask();
11123   assert(Subtarget->hasAVX512() &&
11124          "Cannot lower 512-bit vectors w/ basic ISA!");
11125
11126   // Check for being able to broadcast a single element.
11127   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(VT.SimpleTy, DL, V1,
11128                                                         Mask, Subtarget, DAG))
11129     return Broadcast;
11130
11131   // Dispatch to each element type for lowering. If we don't have support for
11132   // specific element type shuffles at 512 bits, immediately split them and
11133   // lower them. Each lowering routine of a given type is allowed to assume that
11134   // the requisite ISA extensions for that element type are available.
11135   switch (VT.SimpleTy) {
11136   case MVT::v8f64:
11137     return lowerV8F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
11138   case MVT::v16f32:
11139     return lowerV16F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
11140   case MVT::v8i64:
11141     return lowerV8I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
11142   case MVT::v16i32:
11143     return lowerV16I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
11144   case MVT::v32i16:
          // The 16-bit element routine asserts BWI, so only call it when BWI is
          // present; otherwise leave the switch and split below.
11145     if (Subtarget->hasBWI())
11146       return lowerV32I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
11147     break;
11148   case MVT::v64i8:
          // Same as above for 8-bit elements.
11149     if (Subtarget->hasBWI())
11150       return lowerV64I8VectorShuffle(Op, V1, V2, Subtarget, DAG);
11151     break;
11152
11153   default:
11154     llvm_unreachable("Not a valid 512-bit x86 vector type!");
11155   }
11156
11157   // Otherwise fall back on splitting.
11158   return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
11159 }
11160
11161 /// \brief Top-level lowering for x86 vector shuffles.
11162 ///
11163 /// This handles decomposition, canonicalization, and lowering of all x86
11164 /// vector shuffles. Most of the specific lowering strategies are encapsulated
11165 /// above in helper routines. The canonicalization attempts to widen shuffles
11166 /// to involve fewer lanes of wider elements, consolidate symmetric patterns
11167 /// s.t. only one of the two inputs needs to be tested, etc.
      ///
      /// Each canonicalization step returns a freshly built (commuted, re-masked or
      /// widened) shuffle node instead of continuing; presumably that node is then
      /// lowered again by the caller -- confirm against the legalizer. Only 128-,
      /// 256- and 512-bit vector types reach a width-specific routine; 64-bit
      /// (MMX) types are rejected by an assertion and any other width hits
      /// llvm_unreachable.
11168 static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
11169                                   SelectionDAG &DAG) {
11170   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
11171   ArrayRef<int> Mask = SVOp->getMask();
11172   SDValue V1 = Op.getOperand(0);
11173   SDValue V2 = Op.getOperand(1);
11174   MVT VT = Op.getSimpleValueType();
11175   int NumElements = VT.getVectorNumElements();
11176   SDLoc dl(Op);
11177
11178   assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles");
11179
11180   bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
11181   bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
        // A shuffle of two undef inputs is itself undef.
11182   if (V1IsUndef && V2IsUndef)
11183     return DAG.getUNDEF(VT);
11184
11185   // When we create a shuffle node we put the UNDEF node to second operand,
11186   // but in some cases the first operand may be transformed to UNDEF.
11187   // In this case we should just commute the node.
11188   if (V1IsUndef)
11189     return DAG.getCommutedVectorShuffle(*SVOp);
11190
11191   // Check for non-undef masks pointing at an undef vector and make the masks
11192   // undef as well. This makes it easier to match the shuffle based solely on
11193   // the mask.
11194   if (V2IsUndef)
11195     for (int M : Mask)
11196       if (M >= NumElements) {
              // At least one entry selects from the undef V2: rebuild the shuffle
              // with every such entry replaced by -1 (undef).
11197         SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
11198         for (int &M : NewMask)
11199           if (M >= NumElements)
11200             M = -1;
11201         return DAG.getVectorShuffle(VT, dl, V1, V2, NewMask);
11202       }
11203
11204   // Try to collapse shuffles into using a vector type with fewer elements but
11205   // wider element types. We cap this to not form integers or floating point
11206   // elements wider than 64 bits, but it might be interesting to form i128
11207   // integers to handle flipping the low and high halves of AVX 256-bit vectors.
11208   SmallVector<int, 16> WidenedMask;
11209   if (VT.getScalarSizeInBits() < 64 &&
11210       canWidenShuffleElements(Mask, WidenedMask)) {
          // Same element kind (integer or floating point), twice the width, half
          // the element count.
11211     MVT NewEltVT = VT.isFloatingPoint()
11212                        ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
11213                        : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
11214     MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
11215     // Make sure that the new vector type is legal. For example, v2f64 isn't
11216     // legal on SSE1.
11217     if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
11218       V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, V1);
11219       V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, V2);
11220       return DAG.getNode(ISD::BITCAST, dl, VT,
11221                          DAG.getVectorShuffle(NewVT, dl, V1, V2, WidenedMask));
11222     }
11223   }
11224
        // Count how many result elements are undef, come from V1, or come from V2.
11225   int NumV1Elements = 0, NumUndefElements = 0, NumV2Elements = 0;
11226   for (int M : SVOp->getMask())
11227     if (M < 0)
11228       ++NumUndefElements;
11229     else if (M < NumElements)
11230       ++NumV1Elements;
11231     else
11232       ++NumV2Elements;
11233
11234   // Commute the shuffle as needed such that more elements come from V1 than
11235   // V2. This allows us to match the shuffle pattern strictly on how many
11236   // elements come from V1 without handling the symmetric cases.
11237   if (NumV2Elements > NumV1Elements)
11238     return DAG.getCommutedVectorShuffle(*SVOp);
11239
11240   // When the number of V1 and V2 elements are the same, try to minimize the
11241   // number of uses of V2 in the low half of the vector. When that is tied,
11242   // ensure that the sum of indices for V1 is equal to or lower than the sum
11243   // indices for V2. When those are equal, try to ensure that the number of odd
11244   // indices for V1 is lower than the number of odd indices for V2.
        //
        // Note that the "indices" summed and tested for oddness below are result
        // positions (the loop counter i), not the mask values themselves.
11245   if (NumV1Elements == NumV2Elements) {
11246     int LowV1Elements = 0, LowV2Elements = 0;
11247     for (int M : SVOp->getMask().slice(0, NumElements / 2))
11248       if (M >= NumElements)
11249         ++LowV2Elements;
11250       else if (M >= 0)
11251         ++LowV1Elements;
11252     if (LowV2Elements > LowV1Elements) {
11253       return DAG.getCommutedVectorShuffle(*SVOp);
11254     } else if (LowV2Elements == LowV1Elements) {
11255       int SumV1Indices = 0, SumV2Indices = 0;
11256       for (int i = 0, Size = SVOp->getMask().size(); i < Size; ++i)
11257         if (SVOp->getMask()[i] >= NumElements)
11258           SumV2Indices += i;
11259         else if (SVOp->getMask()[i] >= 0)
11260           SumV1Indices += i;
11261       if (SumV2Indices < SumV1Indices) {
11262         return DAG.getCommutedVectorShuffle(*SVOp);
11263       } else if (SumV2Indices == SumV1Indices) {
11264         int NumV1OddIndices = 0, NumV2OddIndices = 0;
11265         for (int i = 0, Size = SVOp->getMask().size(); i < Size; ++i)
11266           if (SVOp->getMask()[i] >= NumElements)
11267             NumV2OddIndices += i % 2;
11268           else if (SVOp->getMask()[i] >= 0)
11269             NumV1OddIndices += i % 2;
11270         if (NumV2OddIndices < NumV1OddIndices)
11271           return DAG.getCommutedVectorShuffle(*SVOp);
11272       }
11273     }
11274   }
11275
11276   // For each vector width, delegate to a specialized lowering routine.
11277   if (VT.getSizeInBits() == 128)
11278     return lower128BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
11279
11280   if (VT.getSizeInBits() == 256)
11281     return lower256BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
11282
11283   // 512-bit vectors go through the AVX-512 routine, which still splits most
11284   // shuffles. FIXME: Implement more complete AVX-512 support!
11285   if (VT.getSizeInBits() == 512)
11286     return lower512BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
11287
11288   llvm_unreachable("Unimplemented!");
11289 }
11290
11291
11292 //===----------------------------------------------------------------------===//
11293 // Legacy vector shuffle lowering
11294 //
11295 // This code is the legacy code handling vector shuffles until the above
11296 // replaces its functionality and performance.
11297 //===----------------------------------------------------------------------===//
11298
      /// Return true if \p MaskVals, interpreted as a shuffle mask for vector type
      /// \p VT, keeps every result element at its own position and merely selects,
      /// per position, whether it comes from the first or the second input.
      ///
      /// \p hasSSE41 and \p hasInt256 are feature flags supplied by the caller. The
      /// function rejects 512-bit types, any type when \p hasSSE41 is false, i8
      /// element types, and v16i16 when \p hasInt256 is false.
      ///
      /// If \p MaskOut is non-null and the mask matches, it receives a bit mask in
      /// which bit i is set when element i of the (first) lane is taken from the
      /// second input. For types with more than 8 elements the second half of the
      /// mask has to agree with the first half (or be undef), so a single set of
      /// bits describes both halves.
11299 static bool isBlendMask(ArrayRef<int> MaskVals, MVT VT, bool hasSSE41,
11300                         bool hasInt256, unsigned *MaskOut = nullptr) {
11301   MVT EltVT = VT.getVectorElementType();
11302
11303   // There is no blend with immediate in AVX-512.
11304   if (VT.is512BitVector())
11305     return false;
11306
11307   if (!hasSSE41 || EltVT == MVT::i8)
11308     return false;
11309   if (!hasInt256 && VT == MVT::v16i16)
11310     return false;
11311
11312   unsigned MaskValue = 0;
11313   unsigned NumElems = VT.getVectorNumElements();
11314   // There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
11315   unsigned NumLanes = (NumElems - 1) / 8 + 1;
11316   unsigned NumElemsInLane = NumElems / NumLanes;
11317
11318   // Blend for v16i16 should be symmetric across both lanes.
11319   for (unsigned i = 0; i < NumElemsInLane; ++i) {
11320
          // SndLaneEltIdx is the mask entry at the same position in the second
          // lane, or -1 (treated as undef) when there is only one lane.
11321     int SndLaneEltIdx = (NumLanes == 2) ? MaskVals[i + NumElemsInLane] : -1;
11322     int EltIdx = MaskVals[i];
11323
          // Both positions are undef or select their own element of the first
          // input: the blend bit for i stays clear.
11324     if ((EltIdx < 0 || EltIdx == (int)i) &&
11325         (SndLaneEltIdx < 0 || SndLaneEltIdx == (int)(i + NumElemsInLane)))
11326       continue;
11327
          // The first-lane position selects its own element of the second input,
          // and the second-lane position is undef or does the same: set the
          // blend bit. Anything else is not a blend.
11328     if (((unsigned)EltIdx == (i + NumElems)) &&
11329         (SndLaneEltIdx < 0 ||
11330          (unsigned)SndLaneEltIdx == i + NumElems + NumElemsInLane))
11331       MaskValue |= (1 << i);
11332     else
11333       return false;
11334   }
11335
11336   if (MaskOut)
11337     *MaskOut = MaskValue;
11338   return true;
11339 }
11340
11341 // Try to lower a shuffle node into a simple blend instruction.
11342 // This function assumes isBlendMask returns true for this
11343 // SuffleVectorSDNode
11344 static SDValue LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp,
11345                                           unsigned MaskValue,
11346                                           const X86Subtarget *Subtarget,
11347                                           SelectionDAG &DAG) {
11348   MVT VT = SVOp->getSimpleValueType(0);
11349   MVT EltVT = VT.getVectorElementType();
11350   assert(isBlendMask(SVOp->getMask(), VT, Subtarget->hasSSE41(),
11351                      Subtarget->hasInt256() && "Trying to lower a "
11352                                                "VECTOR_SHUFFLE to a Blend but "
11353                                                "with the wrong mask"));
11354   SDValue V1 = SVOp->getOperand(0);
11355   SDValue V2 = SVOp->getOperand(1);
11356   SDLoc dl(SVOp);
11357   unsigned NumElems = VT.getVectorNumElements();
11358
11359   // Convert i32 vectors to floating point if it is not AVX2.
11360   // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors.
11361   MVT BlendVT = VT;
11362   if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) {
11363     BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()),
11364                                NumElems);
11365     V1 = DAG.getNode(ISD::BITCAST, dl, VT, V1);
11366     V2 = DAG.getNode(ISD::BITCAST, dl, VT, V2);
11367   }
11368
11369   SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, V1, V2,
11370                             DAG.getConstant(MaskValue, MVT::i32));
11371   return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
11372 }
11373
11374 /// In vector type \p VT, return true if the element at index \p InputIdx
11375 /// falls on a different 128-bit lane than \p OutputIdx.
      ///
      /// Both indices are element indices into a vector of type \p VT; the lane of
      /// an element is its bit offset (index times element size) divided by 128.
11376 static bool ShuffleCrosses128bitLane(MVT VT, unsigned InputIdx,
11377                                      unsigned OutputIdx) {
11378   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
11379   return InputIdx * EltSize / 128 != OutputIdx * EltSize / 128;
11380 }
11381
11382 /// Generate a PSHUFB if possible.  Selects elements from \p V1 according to
11383 /// \p MaskVals.  MaskVals[OutputIdx] = InputIdx specifies that we want to
11384 /// shuffle the element at InputIdx in V1 to OutputIdx in the result.  If \p
11385 /// MaskVals refers to elements outside of \p V1 or is undef (-1), insert a
11386 /// zero.
      ///
      /// \p V1 must be a 128-bit or 256-bit vector (asserted). Returns an empty
      /// SDValue when some selected element would have to move to a different
      /// 128-bit lane. On success the returned node has the i8 vector type of the
      /// same total width as \p V1, not the type of \p V1 itself; callers that need
      /// the original type have to bitcast the result.
11387 static SDValue getPSHUFB(ArrayRef<int> MaskVals, SDValue V1, SDLoc &dl,
11388                          SelectionDAG &DAG) {
11389   MVT VT = V1.getSimpleValueType();
11390   assert(VT.is128BitVector() || VT.is256BitVector());
11391
11392   MVT EltVT = VT.getVectorElementType();
11393   unsigned EltSizeInBytes = EltVT.getSizeInBits() / 8;
11394   unsigned NumElts = VT.getVectorNumElements();
11395
        // Build one i8 control constant per byte of the result.
11396   SmallVector<SDValue, 32> PshufbMask;
11397   for (unsigned OutputIdx = 0; OutputIdx < NumElts; ++OutputIdx) {
11398     int InputIdx = MaskVals[OutputIdx];
11399     unsigned InputByteIdx;
11400
          // Undef or out-of-range entries get control byte 0x80 (high bit set),
          // which yields a zero byte in the result.
11401     if (InputIdx < 0 || NumElts <= (unsigned)InputIdx)
11402       InputByteIdx = 0x80;
11403     else {
11404       // Cross lane is not allowed.
11405       if (ShuffleCrosses128bitLane(VT, InputIdx, OutputIdx))
11406         return SDValue();
11407       InputByteIdx = InputIdx * EltSizeInBytes;
11408       // Index is a byte offset within the 128-bit lane.
11409       InputByteIdx &= 0xf;
11410     }
11411
          // Emit the control bytes for every byte of this element: consecutive
          // source byte offsets, or 0x80 repeated for a zeroed element.
11412     for (unsigned j = 0; j < EltSizeInBytes; ++j) {
11413       PshufbMask.push_back(DAG.getConstant(InputByteIdx, MVT::i8));
11414       if (InputByteIdx != 0x80)
11415         ++InputByteIdx;
11416     }
11417   }
11418
11419   MVT ShufVT = MVT::getVectorVT(MVT::i8, PshufbMask.size());
11420   if (ShufVT != VT)
11421     V1 = DAG.getNode(ISD::BITCAST, dl, ShufVT, V1);
11422   return DAG.getNode(X86ISD::PSHUFB, dl, ShufVT, V1,
11423                      DAG.getNode(ISD::BUILD_VECTOR, dl, ShufVT, PshufbMask));
11424 }
11425
11426 // v8i16 shuffles - Prefer shuffles in the following order:
11427 // 1. [all]   pshuflw, pshufhw, optional move
11428 // 2. [ssse3] 1 x pshufb
11429 // 3. [ssse3] 2 x pshufb + 1 x por
11430 // 4. [all]   mov + pshuflw + pshufhw + N x (pextrw + pinsrw)
      //
      // Op must be a ShuffleVectorSDNode. The code below treats the mask as having
      // exactly 8 entries, with values 0-7 selecting words of the first operand
      // and 8-15 words of the second.
11431 static SDValue
11432 LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget,
11433                          SelectionDAG &DAG) {
11434   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
11435   SDValue V1 = SVOp->getOperand(0);
11436   SDValue V2 = SVOp->getOperand(1);
11437   SDLoc dl(SVOp);
        // Working copy of the mask; it is rewritten in place once the inputs have
        // been combined into a single vector.
11438   SmallVector<int, 8> MaskVals;
11439
11440   // Determine if more than 1 of the words in each of the low and high quadwords
11441   // of the result come from the same quadword of one of the two inputs.  Undef
11442   // mask values count as coming from any quadword, for better codegen.
11443   //
11444   // Lo/HiQuad[i] = j indicates how many words from the ith quad of the input
11445   // feeds this quad.  For i, 0 and 1 refer to V1, 2 and 3 refer to V2.
11446   unsigned LoQuad[] = { 0, 0, 0, 0 };
11447   unsigned HiQuad[] = { 0, 0, 0, 0 };
11448   // Indices of quads used.
11449   std::bitset<4> InputQuads;
11450   for (unsigned i = 0; i < 8; ++i) {
11451     unsigned *Quad = i < 4 ? LoQuad : HiQuad;
11452     int EltIdx = SVOp->getMaskElt(i);
11453     MaskVals.push_back(EltIdx);
11454     if (EltIdx < 0) {
11455       ++Quad[0];
11456       ++Quad[1];
11457       ++Quad[2];
11458       ++Quad[3];
11459       continue;
11460     }
11461     ++Quad[EltIdx / 4];
11462     InputQuads.set(EltIdx / 4);
11463   }
11464
        // Pick the input quad that feeds the most words of the low result quad.
        // MaxQuad starts at 1, so a quad is only chosen if it supplies at least two
        // words (undef entries included); otherwise BestLoQuad stays -1.
11465   int BestLoQuad = -1;
11466   unsigned MaxQuad = 1;
11467   for (unsigned i = 0; i < 4; ++i) {
11468     if (LoQuad[i] > MaxQuad) {
11469       BestLoQuad = i;
11470       MaxQuad = LoQuad[i];
11471     }
11472   }
11473
        // Same selection for the high result quad.
11474   int BestHiQuad = -1;
11475   MaxQuad = 1;
11476   for (unsigned i = 0; i < 4; ++i) {
11477     if (HiQuad[i] > MaxQuad) {
11478       BestHiQuad = i;
11479       MaxQuad = HiQuad[i];
11480     }
11481   }
11482
11483   // For SSSE3, If all 8 words of the result come from only 1 quadword of each
11484   // of the two input vectors, shuffle them into one input vector so only a
11485   // single pshufb instruction is necessary. If there are more than 2 input
11486   // quads, disable the next transformation since it does not help SSSE3.
11487   bool V1Used = InputQuads[0] || InputQuads[1];
11488   bool V2Used = InputQuads[2] || InputQuads[3];
11489   if (Subtarget->hasSSSE3()) {
11490     if (InputQuads.count() == 2 && V1Used && V2Used) {
11491       BestLoQuad = InputQuads[0] ? 0 : 1;
11492       BestHiQuad = InputQuads[2] ? 2 : 3;
11493     }
11494     if (InputQuads.count() > 2) {
11495       BestLoQuad = -1;
11496       BestHiQuad = -1;
11497     }
11498   }
11499
11500   // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update
11501   // the shuffle mask.  If a quad is scored as -1, that means that it contains
11502   // words from all 4 input quadwords.
11503   SDValue NewV;
11504   if (BestLoQuad >= 0 || BestHiQuad >= 0) {
          // v2i64 shuffle placing the chosen quads in the low and high halves; a
          // quad without a best candidate keeps V1's quad at that position.
11505     int MaskV[] = {
11506       BestLoQuad < 0 ? 0 : BestLoQuad,
11507       BestHiQuad < 0 ? 1 : BestHiQuad
11508     };
11509     NewV = DAG.getVectorShuffle(MVT::v2i64, dl,
11510                   DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1),
11511                   DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2), &MaskV[0]);
11512     NewV = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, NewV);
11513
11514     // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the
11515     // source words for the shuffle, to aid later transformations.
11516     bool AllWordsInNewV = true;
          // InOrder[q] records whether the entries of result quad q seen so far are
          // the identity. The loop stops at the first word that is not available in
          // NewV, so InOrder is only complete when AllWordsInNewV stays true.
11517     bool InOrder[2] = { true, true };
11518     for (unsigned i = 0; i != 8; ++i) {
11519       int idx = MaskVals[i];
11520       if (idx != (int)i)
11521         InOrder[i/4] = false;
11522       if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad)
11523         continue;
11524       AllWordsInNewV = false;
11525       break;
11526     }
11527
11528     bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV;
11529     if (AllWordsInNewV) {
            // Re-express every mask entry relative to NewV: words of BestLoQuad
            // become 0-3, all others 4-7. A moved word whose source lies in the
            // low half rules out pshufhw; one in the high half rules out pshuflw.
11530       for (int i = 0; i != 8; ++i) {
11531         int idx = MaskVals[i];
11532         if (idx < 0)
11533           continue;
11534         idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4;
11535         if ((idx != i) && idx < 4)
11536           pshufhw = false;
11537         if ((idx != i) && idx > 3)
11538           pshuflw = false;
11539       }
11540       V1 = NewV;
11541       V2Used = false;
11542       BestLoQuad = 0;
11543       BestHiQuad = 1;
11544     }
11545
11546     // If we've eliminated the use of V2, and the new mask is a pshuflw or
11547     // pshufhw, that's as cheap as it gets.  Return the new shuffle.
11548     if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) {
11549       unsigned Opc = pshufhw ? X86ISD::PSHUFHW : X86ISD::PSHUFLW;
11550       unsigned TargetMask = 0;
11551       NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV,
11552                                   DAG.getUNDEF(MVT::v8i16), &MaskVals[0]);
            // This SVOp shadows the function-level one and refers to the shuffle
            // that was just built.
11553       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
11554       TargetMask = pshufhw ? getShufflePSHUFHWImmediate(SVOp):
11555                              getShufflePSHUFLWImmediate(SVOp);
11556       V1 = NewV.getOperand(0);
11557       return getTargetShuffleNode(Opc, dl, MVT::v8i16, V1, TargetMask, DAG);
11558     }
11559   }
11560
11561   // Promote splats to a larger type which usually leads to more efficient code.
11562   // FIXME: Is this true if pshufb is available?
11563   if (SVOp->isSplat())
11564     return PromoteSplat(SVOp, DAG);
11565
11566   // If we have SSSE3, and all words of the result are from 1 input vector,
11567   // case 2 is generated, otherwise case 3 is generated.  If no SSSE3
11568   // is present, fall back to case 4.
11569   if (Subtarget->hasSSSE3()) {
          // NOTE(review): pshufbMask is declared but never used in this branch;
          // the control vectors are built inside getPSHUFB.
11570     SmallVector<SDValue,16> pshufbMask;
11571
11572     // If we have elements from both input vectors, set the high bit of the
11573     // shuffle mask element to zero out elements that come from V2 in the V1
11574     // mask, and elements that come from V1 in the V2 mask, so that the two
11575     // results can be OR'd together.
11576     bool TwoInputs = V1Used && V2Used;
          // getPSHUFB returns a v16i8 node for a 128-bit input, hence the bitcasts
          // back to v8i16 below.
11577     V1 = getPSHUFB(MaskVals, V1, dl, DAG);
11578     if (!TwoInputs)
11579       return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
11580
11581     // Calculate the shuffle mask for the second input, shuffle it, and
11582     // OR it with the first shuffled input.
11583     CommuteVectorShuffleMask(MaskVals, 8);
11584     V2 = getPSHUFB(MaskVals, V2, dl, DAG);
11585     V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
11586     return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
11587   }
11588
11589   // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order,
11590   // and update MaskVals with new element order.
        // InOrder (distinct from the bool array of the same name in the block
        // above) has bit i set once result word i needs no pextrw/pinsrw fix-up.
11591   std::bitset<8> InOrder;
11592   if (BestLoQuad >= 0) {
11593     int MaskV[] = { -1, -1, -1, -1, 4, 5, 6, 7 };
11594     for (int i = 0; i != 4; ++i) {
11595       int idx = MaskVals[i];
11596       if (idx < 0) {
11597         InOrder.set(i);
11598       } else if ((idx / 4) == BestLoQuad) {
11599         MaskV[i] = idx & 3;
11600         InOrder.set(i);
11601       }
11602     }
11603     NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
11604                                 &MaskV[0]);
11605
          // Only convert to the target node if getVectorShuffle really produced a
          // shuffle node and SSE2 is available.
11606     if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSE2()) {
11607       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
11608       NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16,
11609                                   NewV.getOperand(0),
11610                                   getShufflePSHUFLWImmediate(SVOp), DAG);
11611     }
11612   }
11613
11614   // If BestHi >= 0, generate a pshufhw to put the high elements in order,
11615   // and update MaskVals with the new element order.
11616   if (BestHiQuad >= 0) {
11617     int MaskV[] = { 0, 1, 2, 3, -1, -1, -1, -1 };
11618     for (unsigned i = 4; i != 8; ++i) {
11619       int idx = MaskVals[i];
11620       if (idx < 0) {
11621         InOrder.set(i);
11622       } else if ((idx / 4) == BestHiQuad) {
11623         MaskV[i] = (idx & 3) + 4;
11624         InOrder.set(i);
11625       }
11626     }
11627     NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
11628                                 &MaskV[0]);
11629
11630     if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSE2()) {
11631       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
11632       NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16,
11633                                   NewV.getOperand(0),
11634                                   getShufflePSHUFHWImmediate(SVOp), DAG);
11635     }
11636   }
11637
11638   // In case BestHi & BestLo were both -1, which means each quadword has a word
11639   // from each of the four input quadwords, calculate the InOrder bitvector now
11640   // before falling through to the insert/extract cleanup.
11641   if (BestLoQuad == -1 && BestHiQuad == -1) {
11642     NewV = V1;
11643     for (int i = 0; i != 8; ++i)
11644       if (MaskVals[i] < 0 || MaskVals[i] == i)
11645         InOrder.set(i);
11646   }
11647
11648   // The other elements are put in the right place using pextrw and pinsrw.
11649   for (unsigned i = 0; i != 8; ++i) {
11650     if (InOrder[i])
11651       continue;
11652     int EltIdx = MaskVals[i];
11653     if (EltIdx < 0)
11654       continue;
          // Mask values 0-7 address V1 directly; 8-15 address V2 after
          // subtracting 8.
11655     SDValue ExtOp = (EltIdx < 8) ?
11656       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1,
11657                   DAG.getIntPtrConstant(EltIdx)) :
11658       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2,
11659                   DAG.getIntPtrConstant(EltIdx - 8));
11660     NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp,
11661                        DAG.getIntPtrConstant(i));
11662   }
11663   return NewV;
11664 }
11665
11666 /// \brief v16i16 shuffles
11667 ///
11668 /// FIXME: We only support generation of a single pshufb currently.  We can
11669 /// generalize the other applicable cases from LowerVECTOR_SHUFFLEv8i16 as
11670 /// well (e.g 2 x pshufb + 1 x por).
      ///
      /// Returns an empty SDValue when the second operand is not undef, or when
      /// getPSHUFB rejects the mask (see getPSHUFB). A successful result is
      /// whatever getPSHUFB returns, i.e. an i8 vector node rather than v16i16.
11671 static SDValue
11672 LowerVECTOR_SHUFFLEv16i16(SDValue Op, SelectionDAG &DAG) {
11673   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
11674   SDValue V1 = SVOp->getOperand(0);
11675   SDValue V2 = SVOp->getOperand(1);
11676   SDLoc dl(SVOp);
11677
        // Only single-input shuffles are handled.
11678   if (V2.getOpcode() != ISD::UNDEF)
11679     return SDValue();
11680
11681   SmallVector<int, 16> MaskVals(SVOp->getMask().begin(), SVOp->getMask().end());
11682   return getPSHUFB(MaskVals, V1, dl, DAG);
11683 }
11684
11685 // v16i8 shuffles - Prefer shuffles in the following order:
11686 // 1. [ssse3] 1 x pshufb
11687 // 2. [ssse3] 2 x pshufb + 1 x por
11688 // 3. [all]   v8i16 shuffle + N x pextrw + rotate + pinsrw
      //
      // The code below treats the mask as having exactly 16 entries, with values
      // 0-15 selecting bytes of the first operand and 16-31 bytes of the second.
11689 static SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
11690                                         const X86Subtarget* Subtarget,
11691                                         SelectionDAG &DAG) {
11692   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
11693   SDValue V1 = SVOp->getOperand(0);
11694   SDValue V2 = SVOp->getOperand(1);
11695   SDLoc dl(SVOp);
11696   ArrayRef<int> MaskVals = SVOp->getMask();
11697
11698   // Promote splats to a larger type which usually leads to more efficient code.
11699   // FIXME: Is this true if pshufb is available?
11700   if (SVOp->isSplat())
11701     return PromoteSplat(SVOp, DAG);
11702
11703   // If we have SSSE3, case 1 is generated when all result bytes come from
11704   // one of the inputs.  Otherwise, case 2 is generated.  If no SSSE3 is
11705   // present, fall back to case 3.
11706
11707   // If SSSE3, use 1 pshufb instruction per vector with elements in the result.
11708   if (Subtarget->hasSSSE3()) {
11709     SmallVector<SDValue,16> pshufbMask;
11710
11711     // If all result elements are from one input vector, then only translate
11712     // undef mask values to 0x80 (zero out result) in the pshufb mask.
11713     //
11714     // Otherwise, we have elements from both input vectors, and must zero out
11715     // elements that come from V2 in the first mask, and V1 in the second mask
11716     // so that we can OR them together.
11717     for (unsigned i = 0; i != 16; ++i) {
11718       int EltIdx = MaskVals[i];
11719       if (EltIdx < 0 || EltIdx >= 16)
11720         EltIdx = 0x80;
11721       pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
11722     }
11723     V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
11724                      DAG.getNode(ISD::BUILD_VECTOR, dl,
11725                                  MVT::v16i8, pshufbMask));
11726
11727     // As PSHUFB will zero elements with negative indices, it's safe to ignore
11728     // the 2nd operand if it's undefined or zero.
11729     if (V2.getOpcode() == ISD::UNDEF ||
11730         ISD::isBuildVectorAllZeros(V2.getNode()))
11731       return V1;
11732
11733     // Calculate the shuffle mask for the second input, shuffle it, and
11734     // OR it with the first shuffled input.
11735     pshufbMask.clear();
11736     for (unsigned i = 0; i != 16; ++i) {
11737       int EltIdx = MaskVals[i];
            // Undef entries (negative) and bytes taken from V1 are zeroed here;
            // bytes taken from V2 are rebased to 0-15.
11738       EltIdx = (EltIdx < 16) ? 0x80 : EltIdx - 16;
11739       pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
11740     }
11741     V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
11742                      DAG.getNode(ISD::BUILD_VECTOR, dl,
11743                                  MVT::v16i8, pshufbMask));
11744     return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
11745   }
11746
11747   // No SSSE3 - Calculate in place words and then fix all out of place words
11748   // With 0-16 extracts & inserts.  Worst case is 16 bytes out of order from
11749   // the 16 different words that comprise the two doublequadword input vectors.
11750   V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
11751   V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2);
11752   SDValue NewV = V1;
        // Process the result one 16-bit word (two mask entries) at a time. Elt0
        // becomes the low byte of the word and Elt1 the high byte.
11753   for (int i = 0; i != 8; ++i) {
11754     int Elt0 = MaskVals[i*2];
11755     int Elt1 = MaskVals[i*2+1];
11756
11757     // This word of the result is all undef, skip it.
11758     if (Elt0 < 0 && Elt1 < 0)
11759       continue;
11760
11761     // This word of the result is already in the correct place, skip it.
11762     if ((Elt0 == i*2) && (Elt1 == i*2+1))
11763       continue;
11764
          // An undef entry (negative) also selects V1 here, but its source is
          // never read because every use below is guarded by an Elt >= 0 check.
11765     SDValue Elt0Src = Elt0 < 16 ? V1 : V2;
11766     SDValue Elt1Src = Elt1 < 16 ? V1 : V2;
11767     SDValue InsElt;
11768
11769     // If Elt0 and Elt1 are defined, are consecutive, and can be loaded
11770     // using a single extract together, extract the word and insert it.
11771     if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) {
11772       InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
11773                            DAG.getIntPtrConstant(Elt1 / 2));
11774       NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
11775                         DAG.getIntPtrConstant(i));
11776       continue;
11777     }
11778
11779     // If Elt1 is defined, extract it from the appropriate source.  If the
11780     // source byte is not also odd, shift the extracted word left 8 bits
11781     // otherwise clear the bottom 8 bits if we need to do an or.
11782     if (Elt1 >= 0) {
11783       InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
11784                            DAG.getIntPtrConstant(Elt1 / 2));
11785       if ((Elt1 & 1) == 0)
11786         InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt,
11787                              DAG.getConstant(8,
11788                                   TLI.getShiftAmountTy(InsElt.getValueType())));
11789       else if (Elt0 >= 0)
11790         InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt,
11791                              DAG.getConstant(0xFF00, MVT::i16));
11792     }
11793     // If Elt0 is defined, extract it from the appropriate source.  If the
11794     // source byte is not also even, shift the extracted word right 8 bits. If
11795     // Elt1 was also defined, OR the extracted values together before
11796     // inserting them in the result.
11797     if (Elt0 >= 0) {
11798       SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
11799                                     Elt0Src, DAG.getIntPtrConstant(Elt0 / 2));
11800       if ((Elt0 & 1) != 0)
11801         InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0,
11802                               DAG.getConstant(8,
11803                                  TLI.getShiftAmountTy(InsElt0.getValueType())));
11804       else if (Elt1 >= 0)
11805         InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0,
11806                              DAG.getConstant(0x00FF, MVT::i16));
11807       InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0)
11808                          : InsElt0;
11809     }
11810     NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
11811                        DAG.getIntPtrConstant(i));
11812   }
11813   return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, NewV);
11814 }
11815
11816 // v32i8 shuffles - Translate to VPSHUFB if possible.
11817 static
11818 SDValue LowerVECTOR_SHUFFLEv32i8(ShuffleVectorSDNode *SVOp,
11819                                  const X86Subtarget *Subtarget,
11820                                  SelectionDAG &DAG) {
11821   MVT VT = SVOp->getSimpleValueType(0);
11822   SDValue V1 = SVOp->getOperand(0);
11823   SDValue V2 = SVOp->getOperand(1);
11824   SDLoc dl(SVOp);
11825   SmallVector<int, 32> MaskVals(SVOp->getMask().begin(), SVOp->getMask().end());
11826
11827   bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
11828   bool V1IsAllZero = ISD::isBuildVectorAllZeros(V1.getNode());
11829   bool V2IsAllZero = ISD::isBuildVectorAllZeros(V2.getNode());
11830
11831   // VPSHUFB may be generated if
11832   // (1) one of input vector is undefined or zeroinitializer.
11833   // The mask value 0x80 puts 0 in the corresponding slot of the vector.
11834   // And (2) the mask indexes don't cross the 128-bit lane.
11835   if (VT != MVT::v32i8 || !Subtarget->hasInt256() ||
11836       (!V2IsUndef && !V2IsAllZero && !V1IsAllZero))
11837     return SDValue();
11838
11839   if (V1IsAllZero && !V2IsAllZero) {
11840     CommuteVectorShuffleMask(MaskVals, 32);
11841     V1 = V2;
11842   }
11843   return getPSHUFB(MaskVals, V1, dl, DAG);
11844 }
11845
11846 /// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide
11847 /// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be
11848 /// done when every pair / quad of shuffle mask elements point to elements in
11849 /// the right sequence. e.g.
11850 /// vector_shuffle X, Y, <2, 3, | 10, 11, | 0, 1, | 14, 15>
11851 static
11852 SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
11853                                  SelectionDAG &DAG) {
11854   MVT VT = SVOp->getSimpleValueType(0);
11855   SDLoc dl(SVOp);
11856   unsigned NumElems = VT.getVectorNumElements();
11857   MVT NewVT;
11858   unsigned Scale;
11859   switch (VT.SimpleTy) {
11860   default: llvm_unreachable("Unexpected!");
11861   case MVT::v2i64:
11862   case MVT::v2f64:
11863            return SDValue(SVOp, 0);
11864   case MVT::v4f32:  NewVT = MVT::v2f64; Scale = 2; break;
11865   case MVT::v4i32:  NewVT = MVT::v2i64; Scale = 2; break;
11866   case MVT::v8i16:  NewVT = MVT::v4i32; Scale = 2; break;
11867   case MVT::v16i8:  NewVT = MVT::v4i32; Scale = 4; break;
11868   case MVT::v16i16: NewVT = MVT::v8i32; Scale = 2; break;
11869   case MVT::v32i8:  NewVT = MVT::v8i32; Scale = 4; break;
11870   }
11871
11872   SmallVector<int, 8> MaskVec;
11873   for (unsigned i = 0; i != NumElems; i += Scale) {
11874     int StartIdx = -1;
11875     for (unsigned j = 0; j != Scale; ++j) {
11876       int EltIdx = SVOp->getMaskElt(i+j);
11877       if (EltIdx < 0)
11878         continue;
11879       if (StartIdx < 0)
11880         StartIdx = (EltIdx / Scale);
11881       if (EltIdx != (int)(StartIdx*Scale + j))
11882         return SDValue();
11883     }
11884     MaskVec.push_back(StartIdx);
11885   }
11886
11887   SDValue V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(0));
11888   SDValue V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(1));
11889   return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]);
11890 }
11891
11892 /// getVZextMovL - Return a zero-extending vector move low node.
11893 ///
11894 static SDValue getVZextMovL(MVT VT, MVT OpVT,
11895                             SDValue SrcOp, SelectionDAG &DAG,
11896                             const X86Subtarget *Subtarget, SDLoc dl) {
11897   if (VT == MVT::v2f64 || VT == MVT::v4f32) {
11898     LoadSDNode *LD = nullptr;
11899     if (!isScalarLoadToVector(SrcOp.getNode(), &LD))
11900       LD = dyn_cast<LoadSDNode>(SrcOp);
11901     if (!LD) {
11902       // movssrr and movsdrr do not clear top bits. Try to use movd, movq
11903       // instead.
11904       MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32;
11905       if ((ExtVT != MVT::i64 || Subtarget->is64Bit()) &&
11906           SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR &&
11907           SrcOp.getOperand(0).getOpcode() == ISD::BITCAST &&
11908           SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) {
11909         // PR2108
11910         OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32;
11911         return DAG.getNode(ISD::BITCAST, dl, VT,
11912                            DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
11913                                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
11914                                                    OpVT,
11915                                                    SrcOp.getOperand(0)
11916                                                           .getOperand(0))));
11917       }
11918     }
11919   }
11920
11921   return DAG.getNode(ISD::BITCAST, dl, VT,
11922                      DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
11923                                  DAG.getNode(ISD::BITCAST, dl,
11924                                              OpVT, SrcOp)));
11925 }
11926
11927 /// LowerVECTOR_SHUFFLE_256 - Handle all 256-bit wide vectors shuffles
11928 /// which could not be matched by any known target speficic shuffle
11929 static SDValue
11930 LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
11931
11932   SDValue NewOp = Compact8x32ShuffleNode(SVOp, DAG);
11933   if (NewOp.getNode())
11934     return NewOp;
11935
11936   MVT VT = SVOp->getSimpleValueType(0);
11937
11938   unsigned NumElems = VT.getVectorNumElements();
11939   unsigned NumLaneElems = NumElems / 2;
11940
11941   SDLoc dl(SVOp);
11942   MVT EltVT = VT.getVectorElementType();
11943   MVT NVT = MVT::getVectorVT(EltVT, NumLaneElems);
11944   SDValue Output[2];
11945
11946   SmallVector<int, 16> Mask;
11947   for (unsigned l = 0; l < 2; ++l) {
11948     // Build a shuffle mask for the output, discovering on the fly which
11949     // input vectors to use as shuffle operands (recorded in InputUsed).
11950     // If building a suitable shuffle vector proves too hard, then bail
11951     // out with UseBuildVector set.
11952     bool UseBuildVector = false;
11953     int InputUsed[2] = { -1, -1 }; // Not yet discovered.
11954     unsigned LaneStart = l * NumLaneElems;
11955     for (unsigned i = 0; i != NumLaneElems; ++i) {
11956       // The mask element.  This indexes into the input.
11957       int Idx = SVOp->getMaskElt(i+LaneStart);
11958       if (Idx < 0) {
11959         // the mask element does not index into any input vector.
11960         Mask.push_back(-1);
11961         continue;
11962       }
11963
11964       // The input vector this mask element indexes into.
11965       int Input = Idx / NumLaneElems;
11966
11967       // Turn the index into an offset from the start of the input vector.
11968       Idx -= Input * NumLaneElems;
11969
11970       // Find or create a shuffle vector operand to hold this input.
11971       unsigned OpNo;
11972       for (OpNo = 0; OpNo < array_lengthof(InputUsed); ++OpNo) {
11973         if (InputUsed[OpNo] == Input)
11974           // This input vector is already an operand.
11975           break;
11976         if (InputUsed[OpNo] < 0) {
11977           // Create a new operand for this input vector.
11978           InputUsed[OpNo] = Input;
11979           break;
11980         }
11981       }
11982
11983       if (OpNo >= array_lengthof(InputUsed)) {
11984         // More than two input vectors used!  Give up on trying to create a
11985         // shuffle vector.  Insert all elements into a BUILD_VECTOR instead.
11986         UseBuildVector = true;
11987         break;
11988       }
11989
11990       // Add the mask index for the new shuffle vector.
11991       Mask.push_back(Idx + OpNo * NumLaneElems);
11992     }
11993
11994     if (UseBuildVector) {
11995       SmallVector<SDValue, 16> SVOps;
11996       for (unsigned i = 0; i != NumLaneElems; ++i) {
11997         // The mask element.  This indexes into the input.
11998         int Idx = SVOp->getMaskElt(i+LaneStart);
11999         if (Idx < 0) {
12000           SVOps.push_back(DAG.getUNDEF(EltVT));
12001           continue;
12002         }
12003
12004         // The input vector this mask element indexes into.
12005         int Input = Idx / NumElems;
12006
12007         // Turn the index into an offset from the start of the input vector.
12008         Idx -= Input * NumElems;
12009
12010         // Extract the vector element by hand.
12011         SVOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
12012                                     SVOp->getOperand(Input),
12013                                     DAG.getIntPtrConstant(Idx)));
12014       }
12015
12016       // Construct the output using a BUILD_VECTOR.
12017       Output[l] = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, SVOps);
12018     } else if (InputUsed[0] < 0) {
12019       // No input vectors were used! The result is undefined.
12020       Output[l] = DAG.getUNDEF(NVT);
12021     } else {
12022       SDValue Op0 = Extract128BitVector(SVOp->getOperand(InputUsed[0] / 2),
12023                                         (InputUsed[0] % 2) * NumLaneElems,
12024                                         DAG, dl);
12025       // If only one input was used, use an undefined vector for the other.
12026       SDValue Op1 = (InputUsed[1] < 0) ? DAG.getUNDEF(NVT) :
12027         Extract128BitVector(SVOp->getOperand(InputUsed[1] / 2),
12028                             (InputUsed[1] % 2) * NumLaneElems, DAG, dl);
12029       // At least one input vector was used. Create a new shuffle vector.
12030       Output[l] = DAG.getVectorShuffle(NVT, dl, Op0, Op1, &Mask[0]);
12031     }
12032
12033     Mask.clear();
12034   }
12035
12036   // Concatenate the result back
12037   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Output[0], Output[1]);
12038 }
12039
12040 /// LowerVECTOR_SHUFFLE_128v4 - Handle all 128-bit wide vectors with
12041 /// 4 elements, and match them with several different shuffle types.
12042 static SDValue
12043 LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
12044   SDValue V1 = SVOp->getOperand(0);
12045   SDValue V2 = SVOp->getOperand(1);
12046   SDLoc dl(SVOp);
12047   MVT VT = SVOp->getSimpleValueType(0);
12048
12049   assert(VT.is128BitVector() && "Unsupported vector size");
12050
12051   std::pair<int, int> Locs[4];
12052   int Mask1[] = { -1, -1, -1, -1 };
12053   SmallVector<int, 8> PermMask(SVOp->getMask().begin(), SVOp->getMask().end());
12054
12055   unsigned NumHi = 0;
12056   unsigned NumLo = 0;
12057   for (unsigned i = 0; i != 4; ++i) {
12058     int Idx = PermMask[i];
12059     if (Idx < 0) {
12060       Locs[i] = std::make_pair(-1, -1);
12061     } else {
12062       assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!");
12063       if (Idx < 4) {
12064         Locs[i] = std::make_pair(0, NumLo);
12065         Mask1[NumLo] = Idx;
12066         NumLo++;
12067       } else {
12068         Locs[i] = std::make_pair(1, NumHi);
12069         if (2+NumHi < 4)
12070           Mask1[2+NumHi] = Idx;
12071         NumHi++;
12072       }
12073     }
12074   }
12075
12076   if (NumLo <= 2 && NumHi <= 2) {
12077     // If no more than two elements come from either vector. This can be
12078     // implemented with two shuffles. First shuffle gather the elements.
12079     // The second shuffle, which takes the first shuffle as both of its
12080     // vector operands, put the elements into the right order.
12081     V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
12082
12083     int Mask2[] = { -1, -1, -1, -1 };
12084
12085     for (unsigned i = 0; i != 4; ++i)
12086       if (Locs[i].first != -1) {
12087         unsigned Idx = (i < 2) ? 0 : 4;
12088         Idx += Locs[i].first * 2 + Locs[i].second;
12089         Mask2[i] = Idx;
12090       }
12091
12092     return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]);
12093   }
12094
12095   if (NumLo == 3 || NumHi == 3) {
12096     // Otherwise, we must have three elements from one vector, call it X, and
12097     // one element from the other, call it Y.  First, use a shufps to build an
12098     // intermediate vector with the one element from Y and the element from X
12099     // that will be in the same half in the final destination (the indexes don't
12100     // matter). Then, use a shufps to build the final vector, taking the half
12101     // containing the element from Y from the intermediate, and the other half
12102     // from X.
12103     if (NumHi == 3) {
12104       // Normalize it so the 3 elements come from V1.
12105       CommuteVectorShuffleMask(PermMask, 4);
12106       std::swap(V1, V2);
12107     }
12108
12109     // Find the element from V2.
12110     unsigned HiIndex;
12111     for (HiIndex = 0; HiIndex < 3; ++HiIndex) {
12112       int Val = PermMask[HiIndex];
12113       if (Val < 0)
12114         continue;
12115       if (Val >= 4)
12116         break;
12117     }
12118
12119     Mask1[0] = PermMask[HiIndex];
12120     Mask1[1] = -1;
12121     Mask1[2] = PermMask[HiIndex^1];
12122     Mask1[3] = -1;
12123     V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
12124
12125     if (HiIndex >= 2) {
12126       Mask1[0] = PermMask[0];
12127       Mask1[1] = PermMask[1];
12128       Mask1[2] = HiIndex & 1 ? 6 : 4;
12129       Mask1[3] = HiIndex & 1 ? 4 : 6;
12130       return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
12131     }
12132
12133     Mask1[0] = HiIndex & 1 ? 2 : 0;
12134     Mask1[1] = HiIndex & 1 ? 0 : 2;
12135     Mask1[2] = PermMask[2];
12136     Mask1[3] = PermMask[3];
12137     if (Mask1[2] >= 0)
12138       Mask1[2] += 4;
12139     if (Mask1[3] >= 0)
12140       Mask1[3] += 4;
12141     return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]);
12142   }
12143
12144   // Break it into (shuffle shuffle_hi, shuffle_lo).
12145   int LoMask[] = { -1, -1, -1, -1 };
12146   int HiMask[] = { -1, -1, -1, -1 };
12147
12148   int *MaskPtr = LoMask;
12149   unsigned MaskIdx = 0;
12150   unsigned LoIdx = 0;
12151   unsigned HiIdx = 2;
12152   for (unsigned i = 0; i != 4; ++i) {
12153     if (i == 2) {
12154       MaskPtr = HiMask;
12155       MaskIdx = 1;
12156       LoIdx = 0;
12157       HiIdx = 2;
12158     }
12159     int Idx = PermMask[i];
12160     if (Idx < 0) {
12161       Locs[i] = std::make_pair(-1, -1);
12162     } else if (Idx < 4) {
12163       Locs[i] = std::make_pair(MaskIdx, LoIdx);
12164       MaskPtr[LoIdx] = Idx;
12165       LoIdx++;
12166     } else {
12167       Locs[i] = std::make_pair(MaskIdx, HiIdx);
12168       MaskPtr[HiIdx] = Idx;
12169       HiIdx++;
12170     }
12171   }
12172
12173   SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]);
12174   SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]);
12175   int MaskOps[] = { -1, -1, -1, -1 };
12176   for (unsigned i = 0; i != 4; ++i)
12177     if (Locs[i].first != -1)
12178       MaskOps[i] = Locs[i].first * 4 + Locs[i].second;
12179   return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]);
12180 }
12181
12182 static bool MayFoldVectorLoad(SDValue V) {
12183   while (V.hasOneUse() && V.getOpcode() == ISD::BITCAST)
12184     V = V.getOperand(0);
12185
12186   if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR)
12187     V = V.getOperand(0);
12188   if (V.hasOneUse() && V.getOpcode() == ISD::BUILD_VECTOR &&
12189       V.getNumOperands() == 2 && V.getOperand(1).getOpcode() == ISD::UNDEF)
12190     // BUILD_VECTOR (load), undef
12191     V = V.getOperand(0);
12192
12193   return MayFoldLoad(V);
12194 }
12195
12196 static
12197 SDValue getMOVDDup(SDValue &Op, SDLoc &dl, SDValue V1, SelectionDAG &DAG) {
12198   MVT VT = Op.getSimpleValueType();
12199
12200   // Canonizalize to v2f64.
12201   V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1);
12202   return DAG.getNode(ISD::BITCAST, dl, VT,
12203                      getTargetShuffleNode(X86ISD::MOVDDUP, dl, MVT::v2f64,
12204                                           V1, DAG));
12205 }
12206
12207 static
12208 SDValue getMOVLowToHigh(SDValue &Op, SDLoc &dl, SelectionDAG &DAG,
12209                         bool HasSSE2) {
12210   SDValue V1 = Op.getOperand(0);
12211   SDValue V2 = Op.getOperand(1);
12212   MVT VT = Op.getSimpleValueType();
12213
12214   assert(VT != MVT::v2i64 && "unsupported shuffle type");
12215
12216   if (HasSSE2 && VT == MVT::v2f64)
12217     return getTargetShuffleNode(X86ISD::MOVLHPD, dl, VT, V1, V2, DAG);
12218
12219   // v4f32 or v4i32: canonizalized to v4f32 (which is legal for SSE1)
12220   return DAG.getNode(ISD::BITCAST, dl, VT,
12221                      getTargetShuffleNode(X86ISD::MOVLHPS, dl, MVT::v4f32,
12222                            DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V1),
12223                            DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V2), DAG));
12224 }
12225
12226 static
12227 SDValue getMOVHighToLow(SDValue &Op, SDLoc &dl, SelectionDAG &DAG) {
12228   SDValue V1 = Op.getOperand(0);
12229   SDValue V2 = Op.getOperand(1);
12230   MVT VT = Op.getSimpleValueType();
12231
12232   assert((VT == MVT::v4i32 || VT == MVT::v4f32) &&
12233          "unsupported shuffle type");
12234
12235   if (V2.getOpcode() == ISD::UNDEF)
12236     V2 = V1;
12237
12238   // v4i32 or v4f32
12239   return getTargetShuffleNode(X86ISD::MOVHLPS, dl, VT, V1, V2, DAG);
12240 }
12241
12242 static
12243 SDValue getMOVLP(SDValue &Op, SDLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
12244   SDValue V1 = Op.getOperand(0);
12245   SDValue V2 = Op.getOperand(1);
12246   MVT VT = Op.getSimpleValueType();
12247   unsigned NumElems = VT.getVectorNumElements();
12248
12249   // Use MOVLPS and MOVLPD in case V1 or V2 are loads. During isel, the second
12250   // operand of these instructions is only memory, so check if there's a
12251   // potencial load folding here, otherwise use SHUFPS or MOVSD to match the
12252   // same masks.
12253   bool CanFoldLoad = false;
12254
12255   // Trivial case, when V2 comes from a load.
12256   if (MayFoldVectorLoad(V2))
12257     CanFoldLoad = true;
12258
12259   // When V1 is a load, it can be folded later into a store in isel, example:
12260   //  (store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), addr:$src1)
12261   //    turns into:
12262   //  (MOVLPSmr addr:$src1, VR128:$src2)
12263   // So, recognize this potential and also use MOVLPS or MOVLPD
12264   else if (MayFoldVectorLoad(V1) && MayFoldIntoStore(Op))
12265     CanFoldLoad = true;
12266
12267   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
12268   if (CanFoldLoad) {
12269     if (HasSSE2 && NumElems == 2)
12270       return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG);
12271
12272     if (NumElems == 4)
12273       // If we don't care about the second element, proceed to use movss.
12274       if (SVOp->getMaskElt(1) != -1)
12275         return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG);
12276   }
12277
12278   // movl and movlp will both match v2i64, but v2i64 is never matched by
12279   // movl earlier because we make it strict to avoid messing with the movlp load
12280   // folding logic (see the code above getMOVLP call). Match it here then,
12281   // this is horrible, but will stay like this until we move all shuffle
12282   // matching to x86 specific nodes. Note that for the 1st condition all
12283   // types are matched with movsd.
12284   if (HasSSE2) {
12285     // FIXME: isMOVLMask should be checked and matched before getMOVLP,
12286     // as to remove this logic from here, as much as possible
12287     if (NumElems == 2 || !isMOVLMask(SVOp->getMask(), VT))
12288       return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
12289     return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
12290   }
12291
12292   assert(VT != MVT::v4i32 && "unsupported shuffle type");
12293
12294   // Invert the operand order and use SHUFPS to match it.
12295   return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V2, V1,
12296                               getShuffleSHUFImmediate(SVOp), DAG);
12297 }
12298
12299 static SDValue NarrowVectorLoadToElement(LoadSDNode *Load, unsigned Index,
12300                                          SelectionDAG &DAG) {
12301   SDLoc dl(Load);
12302   MVT VT = Load->getSimpleValueType(0);
12303   MVT EVT = VT.getVectorElementType();
12304   SDValue Addr = Load->getOperand(1);
12305   SDValue NewAddr = DAG.getNode(
12306       ISD::ADD, dl, Addr.getSimpleValueType(), Addr,
12307       DAG.getConstant(Index * EVT.getStoreSize(), Addr.getSimpleValueType()));
12308
12309   SDValue NewLoad =
12310       DAG.getLoad(EVT, dl, Load->getChain(), NewAddr,
12311                   DAG.getMachineFunction().getMachineMemOperand(
12312                       Load->getMemOperand(), 0, EVT.getStoreSize()));
12313   return NewLoad;
12314 }
12315
12316 // It is only safe to call this function if isINSERTPSMask is true for
12317 // this shufflevector mask.
12318 static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl,
12319                            SelectionDAG &DAG) {
12320   // Generate an insertps instruction when inserting an f32 from memory onto a
12321   // v4f32 or when copying a member from one v4f32 to another.
12322   // We also use it for transferring i32 from one register to another,
12323   // since it simply copies the same bits.
12324   // If we're transferring an i32 from memory to a specific element in a
12325   // register, we output a generic DAG that will match the PINSRD
12326   // instruction.
12327   MVT VT = SVOp->getSimpleValueType(0);
12328   MVT EVT = VT.getVectorElementType();
12329   SDValue V1 = SVOp->getOperand(0);
12330   SDValue V2 = SVOp->getOperand(1);
12331   auto Mask = SVOp->getMask();
12332   assert((VT == MVT::v4f32 || VT == MVT::v4i32) &&
12333          "unsupported vector type for insertps/pinsrd");
12334
12335   auto FromV1Predicate = [](const int &i) { return i < 4 && i > -1; };
12336   auto FromV2Predicate = [](const int &i) { return i >= 4; };
12337   int FromV1 = std::count_if(Mask.begin(), Mask.end(), FromV1Predicate);
12338
12339   SDValue From;
12340   SDValue To;
12341   unsigned DestIndex;
12342   if (FromV1 == 1) {
12343     From = V1;
12344     To = V2;
12345     DestIndex = std::find_if(Mask.begin(), Mask.end(), FromV1Predicate) -
12346                 Mask.begin();
12347
12348     // If we have 1 element from each vector, we have to check if we're
12349     // changing V1's element's place. If so, we're done. Otherwise, we
12350     // should assume we're changing V2's element's place and behave
12351     // accordingly.
12352     int FromV2 = std::count_if(Mask.begin(), Mask.end(), FromV2Predicate);
12353     assert(DestIndex <= INT32_MAX && "truncated destination index");
12354     if (FromV1 == FromV2 &&
12355         static_cast<int>(DestIndex) == Mask[DestIndex] % 4) {
12356       From = V2;
12357       To = V1;
12358       DestIndex =
12359           std::find_if(Mask.begin(), Mask.end(), FromV2Predicate) - Mask.begin();
12360     }
12361   } else {
12362     assert(std::count_if(Mask.begin(), Mask.end(), FromV2Predicate) == 1 &&
12363            "More than one element from V1 and from V2, or no elements from one "
12364            "of the vectors. This case should not have returned true from "
12365            "isINSERTPSMask");
12366     From = V2;
12367     To = V1;
12368     DestIndex =
12369         std::find_if(Mask.begin(), Mask.end(), FromV2Predicate) - Mask.begin();
12370   }
12371
12372   // Get an index into the source vector in the range [0,4) (the mask is
12373   // in the range [0,8) because it can address V1 and V2)
12374   unsigned SrcIndex = Mask[DestIndex] % 4;
12375   if (MayFoldLoad(From)) {
12376     // Trivial case, when From comes from a load and is only used by the
12377     // shuffle. Make it use insertps from the vector that we need from that
12378     // load.
12379     SDValue NewLoad =
12380         NarrowVectorLoadToElement(cast<LoadSDNode>(From), SrcIndex, DAG);
12381     if (!NewLoad.getNode())
12382       return SDValue();
12383
12384     if (EVT == MVT::f32) {
12385       // Create this as a scalar to vector to match the instruction pattern.
12386       SDValue LoadScalarToVector =
12387           DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, NewLoad);
12388       SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4);
12389       return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, LoadScalarToVector,
12390                          InsertpsMask);
12391     } else { // EVT == MVT::i32
12392       // If we're getting an i32 from memory, use an INSERT_VECTOR_ELT
12393       // instruction, to match the PINSRD instruction, which loads an i32 to a
12394       // certain vector element.
12395       return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, To, NewLoad,
12396                          DAG.getConstant(DestIndex, MVT::i32));
12397     }
12398   }
12399
12400   // Vector-element-to-vector
12401   SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4 | SrcIndex << 6);
12402   return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, From, InsertpsMask);
12403 }
12404
12405 // Reduce a vector shuffle to zext.
12406 static SDValue LowerVectorIntExtend(SDValue Op, const X86Subtarget *Subtarget,
12407                                     SelectionDAG &DAG) {
12408   // PMOVZX is only available from SSE41.
12409   if (!Subtarget->hasSSE41())
12410     return SDValue();
12411
12412   MVT VT = Op.getSimpleValueType();
12413
12414   // Only AVX2 support 256-bit vector integer extending.
12415   if (!Subtarget->hasInt256() && VT.is256BitVector())
12416     return SDValue();
12417
12418   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
12419   SDLoc DL(Op);
12420   SDValue V1 = Op.getOperand(0);
12421   SDValue V2 = Op.getOperand(1);
12422   unsigned NumElems = VT.getVectorNumElements();
12423
12424   // Extending is an unary operation and the element type of the source vector
12425   // won't be equal to or larger than i64.
12426   if (V2.getOpcode() != ISD::UNDEF || !VT.isInteger() ||
12427       VT.getVectorElementType() == MVT::i64)
12428     return SDValue();
12429
12430   // Find the expansion ratio, e.g. expanding from i8 to i32 has a ratio of 4.
12431   unsigned Shift = 1; // Start from 2, i.e. 1 << 1.
12432   while ((1U << Shift) < NumElems) {
12433     if (SVOp->getMaskElt(1U << Shift) == 1)
12434       break;
12435     Shift += 1;
12436     // The maximal ratio is 8, i.e. from i8 to i64.
12437     if (Shift > 3)
12438       return SDValue();
12439   }
12440
12441   // Check the shuffle mask.
12442   unsigned Mask = (1U << Shift) - 1;
12443   for (unsigned i = 0; i != NumElems; ++i) {
12444     int EltIdx = SVOp->getMaskElt(i);
12445     if ((i & Mask) != 0 && EltIdx != -1)
12446       return SDValue();
12447     if ((i & Mask) == 0 && (unsigned)EltIdx != (i >> Shift))
12448       return SDValue();
12449   }
12450
12451   unsigned NBits = VT.getVectorElementType().getSizeInBits() << Shift;
12452   MVT NeVT = MVT::getIntegerVT(NBits);
12453   MVT NVT = MVT::getVectorVT(NeVT, NumElems >> Shift);
12454
12455   if (!DAG.getTargetLoweringInfo().isTypeLegal(NVT))
12456     return SDValue();
12457
12458   return DAG.getNode(ISD::BITCAST, DL, VT,
12459                      DAG.getNode(X86ISD::VZEXT, DL, NVT, V1));
12460 }
12461
12462 static SDValue NormalizeVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
12463                                       SelectionDAG &DAG) {
12464   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
12465   MVT VT = Op.getSimpleValueType();
12466   SDLoc dl(Op);
12467   SDValue V1 = Op.getOperand(0);
12468   SDValue V2 = Op.getOperand(1);
12469
12470   if (isZeroShuffle(SVOp))
12471     return getZeroVector(VT, Subtarget, DAG, dl);
12472
12473   // Handle splat operations
12474   if (SVOp->isSplat()) {
12475     // Use vbroadcast whenever the splat comes from a foldable load
12476     SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG);
12477     if (Broadcast.getNode())
12478       return Broadcast;
12479   }
12480
12481   // Check integer expanding shuffles.
12482   SDValue NewOp = LowerVectorIntExtend(Op, Subtarget, DAG);
12483   if (NewOp.getNode())
12484     return NewOp;
12485
12486   // If the shuffle can be profitably rewritten as a narrower shuffle, then
12487   // do it!
12488   if (VT == MVT::v8i16 || VT == MVT::v16i8 || VT == MVT::v16i16 ||
12489       VT == MVT::v32i8) {
12490     SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
12491     if (NewOp.getNode())
12492       return DAG.getNode(ISD::BITCAST, dl, VT, NewOp);
12493   } else if (VT.is128BitVector() && Subtarget->hasSSE2()) {
12494     // FIXME: Figure out a cleaner way to do this.
12495     if (ISD::isBuildVectorAllZeros(V2.getNode())) {
12496       SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
12497       if (NewOp.getNode()) {
12498         MVT NewVT = NewOp.getSimpleValueType();
12499         if (isCommutedMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(),
12500                                NewVT, true, false))
12501           return getVZextMovL(VT, NewVT, NewOp.getOperand(0), DAG, Subtarget,
12502                               dl);
12503       }
12504     } else if (ISD::isBuildVectorAllZeros(V1.getNode())) {
12505       SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
12506       if (NewOp.getNode()) {
12507         MVT NewVT = NewOp.getSimpleValueType();
12508         if (isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), NewVT))
12509           return getVZextMovL(VT, NewVT, NewOp.getOperand(1), DAG, Subtarget,
12510                               dl);
12511       }
12512     }
12513   }
12514   return SDValue();
12515 }
12516
12517 SDValue
12518 X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
        // Custom lowering for a vector shuffle node. Unless the experimental
        // lowering is enabled (in which case everything is delegated to
        // lowerVectorShuffle), the mask predicates below are tried strictly from
        // top to bottom and the first one that matches decides the result, so the
        // order of the checks is significant.
        //
        // Returns a target specific node, in a few cases a commuted version of the
        // incoming shuffle (getCommutedVectorShuffle), or an empty SDValue when no
        // case applies.
12519   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
12520   SDValue V1 = Op.getOperand(0);
12521   SDValue V2 = Op.getOperand(1);
12522   MVT VT = Op.getSimpleValueType();
12523   SDLoc dl(Op);
12524   unsigned NumElems = VT.getVectorNumElements();
12525   bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
12526   bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
12527   bool V1IsSplat = false;
12528   bool V2IsSplat = false;
12529   bool HasSSE2 = Subtarget->hasSSE2();
12530   bool HasFp256    = Subtarget->hasFp256();
12531   bool HasInt256   = Subtarget->hasInt256();
12532   MachineFunction &MF = DAG.getMachineFunction();
        // True when the enclosing function carries the OptimizeForSize attribute.
12533   bool OptForSize = MF.getFunction()->getAttributes().
12534     hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
12535
12536   // Check if we should use the experimental vector shuffle lowering. If so,
12537   // delegate completely to that code path.
12538   if (ExperimentalVectorShuffleLowering)
12539     return lowerVectorShuffle(Op, Subtarget, DAG);
12540
12541   assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles");
12542
        // A shuffle of two undef inputs is itself undef.
12543   if (V1IsUndef && V2IsUndef)
12544     return DAG.getUNDEF(VT);
12545
12546   // When we create a shuffle node we put the UNDEF node to second operand,
12547   // but in some cases the first operand may be transformed to UNDEF.
12548   // In this case we should just commute the node.
12549   if (V1IsUndef)
12550     return DAG.getCommutedVectorShuffle(*SVOp);
12551
12552   // Vector shuffle lowering takes 3 steps:
12553   //
12554   // 1) Normalize the input vectors. Here splats, zeroed vectors, profitable
12555   //    narrowing and commutation of operands should be handled.
12556   // 2) Matching of shuffles with known shuffle masks to x86 target specific
12557   //    shuffle nodes.
12558   // 3) Rewriting of unmatched masks into new generic shuffle operations,
12559   //    so the shuffle can be broken into other shuffles and the legalizer can
12560   //    try the lowering again.
12561   //
12562   // The general idea is that no vector_shuffle operation should be left to
12563   // be matched during isel, all of them must be converted to a target specific
12564   // node here.
12565
12566   // Normalize the input vectors. Here splats, zeroed vectors, profitable
12567   // narrowing and commutation of operands should be handled. The actual code
12568   // doesn't include all of those, work in progress...
12569   SDValue NewOp = NormalizeVectorShuffle(Op, Subtarget, DAG);
12570   if (NewOp.getNode())
12571     return NewOp;
12572
        // Local, mutable copy of the shuffle mask. It may be commuted further
        // down, whereas SVOp keeps referring to the original node.
12573   SmallVector<int, 8> M(SVOp->getMask().begin(), SVOp->getMask().end());
12574
12575   // NOTE: isPSHUFDMask can also match both masks below (unpckl_undef and
12576   // unpckh_undef). Only use pshufd if speed is more important than size.
12577   if (OptForSize && isUNPCKL_v_undef_Mask(M, VT, HasInt256))
12578     return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
12579   if (OptForSize && isUNPCKH_v_undef_Mask(M, VT, HasInt256))
12580     return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
12581
12582   if (isMOVDDUPMask(M, VT) && Subtarget->hasSSE3() &&
12583       V2IsUndef && MayFoldVectorLoad(V1))
12584     return getMOVDDup(Op, dl, V1, DAG);
12585
12586   if (isMOVHLPS_v_undef_Mask(M, VT))
12587     return getMOVHighToLow(Op, dl, DAG);
12588
12589   // Used to match splats: a two-element vector with an undef V2 and an
        // unpckh mask is emitted as unpckh of V1 with itself.
12590   if (HasSSE2 && isUNPCKHMask(M, VT, HasInt256) && V2IsUndef &&
12591       (VT == MVT::v2f64 || VT == MVT::v2i64))
12592     return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
12593
12594   if (isPSHUFDMask(M, VT)) {
12595     // The actual implementation will match the mask in the if above and then
12596     // during isel it can match several different instructions, not only pshufd
12597     // as its name says, sad but true, emulate the behavior for now...
12598     if (isMOVDDUPMask(M, VT) && ((VT == MVT::v4f32 || VT == MVT::v2i64)))
12599       return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V1, DAG);
12600
12601     unsigned TargetMask = getShuffleSHUFImmediate(SVOp);
12602
12603     if (HasSSE2 && (VT == MVT::v4f32 || VT == MVT::v4i32))
12604       return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG);
12605
12606     if (HasFp256 && (VT == MVT::v4f32 || VT == MVT::v2f64))
12607       return getTargetShuffleNode(X86ISD::VPERMILPI, dl, VT, V1, TargetMask,
12608                                   DAG);
12609
          // Fallback: SHUFP with V1 supplied as both inputs.
12610     return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V1,
12611                                 TargetMask, DAG);
12612   }
12613
12614   if (isPALIGNRMask(M, VT, Subtarget))
12615     return getTargetShuffleNode(X86ISD::PALIGNR, dl, VT, V1, V2,
12616                                 getShufflePALIGNRImmediate(SVOp),
12617                                 DAG);
12618
12619   if (isVALIGNMask(M, VT, Subtarget))
12620     return getTargetShuffleNode(X86ISD::VALIGN, dl, VT, V1, V2,
12621                                 getShuffleVALIGNImmediate(SVOp),
12622                                 DAG);
12623
12624   // Check if this can be converted into a logical shift.
12625   bool isLeft = false;
12626   unsigned ShAmt = 0;
12627   SDValue ShVal;
12628   bool isShift = HasSSE2 && isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt);
12629   if (isShift && ShVal.hasOneUse()) {
12630     // If the shifted value has multiple uses, it may be cheaper to use
12631     // v_set0 + movlhps or movhlps, etc.
          // Scale ShAmt by the element width in bits (isVectorShift presumably
          // reports the amount in elements -- TODO confirm).
12632     MVT EltVT = VT.getVectorElementType();
12633     ShAmt *= EltVT.getSizeInBits();
12634     return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
12635   }
12636
12637   if (isMOVLMask(M, VT)) {
          // With an all-zeros V1 this is a zero-extending move of V2's low element.
12638     if (ISD::isBuildVectorAllZeros(V1.getNode()))
12639       return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl);
12640     if (!isMOVLPMask(M, VT)) {
12641       if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64))
12642         return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
12643
12644       if (VT == MVT::v4i32 || VT == MVT::v4f32)
12645         return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
12646     }
12647   }
12648
12649   // FIXME: fold these into legal mask.
12650   if (isMOVLHPSMask(M, VT) && !isUNPCKLMask(M, VT, HasInt256))
12651     return getMOVLowToHigh(Op, dl, DAG, HasSSE2);
12652
12653   if (isMOVHLPSMask(M, VT))
12654     return getMOVHighToLow(Op, dl, DAG);
12655
12656   if (V2IsUndef && isMOVSHDUPMask(M, VT, Subtarget))
12657     return getTargetShuffleNode(X86ISD::MOVSHDUP, dl, VT, V1, DAG);
12658
12659   if (V2IsUndef && isMOVSLDUPMask(M, VT, Subtarget))
12660     return getTargetShuffleNode(X86ISD::MOVSLDUP, dl, VT, V1, DAG);
12661
12662   if (isMOVLPMask(M, VT))
12663     return getMOVLP(Op, dl, DAG, HasSSE2);
12664
        // Hand back the commuted shuffle when either helper says the swapped form
        // should be used; nothing is lowered to a target node on this path.
12665   if (ShouldXformToMOVHLPS(M, VT) ||
12666       ShouldXformToMOVLP(V1.getNode(), V2.getNode(), M, VT))
12667     return DAG.getCommutedVectorShuffle(*SVOp);
12668
        // Second chance for the shift found above: reached when the shifted value
        // has more than one use and none of the intervening checks matched.
12669   if (isShift) {
12670     // No better options. Use a vshldq / vsrldq.
12671     MVT EltVT = VT.getVectorElementType();
12672     ShAmt *= EltVT.getSizeInBits();
12673     return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
12674   }
12675
12676   bool Commuted = false;
12677   // FIXME: This should also accept a bitcast of a splat?  Be careful, not
12678   // 1,1,1,1 -> v8i16 though.
        // An input counts as a splat only if it is a BUILD_VECTOR with a constant
        // splat node and no undef elements.
12679   BitVector UndefElements;
12680   if (auto *BVOp = dyn_cast<BuildVectorSDNode>(V1.getNode()))
12681     if (BVOp->getConstantSplatNode(&UndefElements) && UndefElements.none())
12682       V1IsSplat = true;
12683   if (auto *BVOp = dyn_cast<BuildVectorSDNode>(V2.getNode()))
12684     if (BVOp->getConstantSplatNode(&UndefElements) && UndefElements.none())
12685       V2IsSplat = true;
12686
12687   // Canonicalize the splat or undef, if present, to be on the RHS.
        // Only the local copies (M, V1, V2 and the splat flags) are swapped here;
        // SVOp still describes the original, uncommuted node.
12688   if (!V2IsUndef && V1IsSplat && !V2IsSplat) {
12689     CommuteVectorShuffleMask(M, NumElems);
12690     std::swap(V1, V2);
12691     std::swap(V1IsSplat, V2IsSplat);
12692     Commuted = true;
12693   }
12694
12695   if (isCommutedMOVLMask(M, VT, V2IsSplat, V2IsUndef)) {
12696     // Shuffling low element of v1 into undef, just return v1.
12697     if (V2IsUndef)
12698       return V1;
12699     // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which
12700     // the instruction selector will not match, so get a canonical MOVL with
12701     // swapped operands to undo the commute.
12702     return getMOVL(DAG, dl, VT, V2, V1);
12703   }
12704
12705   if (isUNPCKLMask(M, VT, HasInt256))
12706     return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
12707
12708   if (isUNPCKHMask(M, VT, HasInt256))
12709     return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
12710
12711   if (V2IsSplat) {
12712     // Normalize mask so all entries that point to V2 points to its first
12713     // element then try to match unpck{h|l} again. If it matches, emit the
12714     // corresponding unpck target node directly.
12715     SmallVector<int, 8> NewMask(M.begin(), M.end());
12716     NormalizeMask(NewMask, NumElems);
12717     if (isUNPCKLMask(NewMask, VT, HasInt256, true))
12718       return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
12719     if (isUNPCKHMask(NewMask, VT, HasInt256, true))
12720       return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
12721   }
12722
12723   if (Commuted) {
12724     // Commute is back and try unpck* again.
12725     // FIXME: this seems wrong.
          // NOTE(review): the checks further down compute immediates from SVOp
          // (the uncommuted node), so M/V1/V2 presumably have to be restored here
          // to stay consistent with it -- confirm.
12726     CommuteVectorShuffleMask(M, NumElems);
12727     std::swap(V1, V2);
12728     std::swap(V1IsSplat, V2IsSplat);
12729
12730     if (isUNPCKLMask(M, VT, HasInt256))
12731       return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
12732
12733     if (isUNPCKHMask(M, VT, HasInt256))
12734       return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
12735   }
12736
12737   // Normalize the node to match x86 shuffle ops if needed
12738   if (!V2IsUndef && (isSHUFPMask(M, VT, /* Commuted */ true)))
12739     return DAG.getCommutedVectorShuffle(*SVOp);
12740
12741   // The checks below are all present in isShuffleMaskLegal, but they are
12742   // inlined here right now to enable us to directly emit target specific
12743   // nodes, and remove one by one until they don't return Op anymore.
12744
        // Splat of element 0 of a two-element vector: unpckl of V1 with itself.
12745   if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) &&
12746       SVOp->getSplatIndex() == 0 && V2IsUndef) {
12747     if (VT == MVT::v2f64 || VT == MVT::v2i64)
12748       return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
12749   }
12750
12751   if (isPSHUFHWMask(M, VT, HasInt256))
12752     return getTargetShuffleNode(X86ISD::PSHUFHW, dl, VT, V1,
12753                                 getShufflePSHUFHWImmediate(SVOp),
12754                                 DAG);
12755
12756   if (isPSHUFLWMask(M, VT, HasInt256))
12757     return getTargetShuffleNode(X86ISD::PSHUFLW, dl, VT, V1,
12758                                 getShufflePSHUFLWImmediate(SVOp),
12759                                 DAG);
12760
        // MaskValue is an out-parameter: it is only read after isBlendMask has
        // returned true.
12761   unsigned MaskValue;
12762   if (isBlendMask(M, VT, Subtarget->hasSSE41(), Subtarget->hasInt256(),
12763                   &MaskValue))
12764     return LowerVECTOR_SHUFFLEtoBlend(SVOp, MaskValue, Subtarget, DAG);
12765
12766   if (isSHUFPMask(M, VT))
12767     return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V2,
12768                                 getShuffleSHUFImmediate(SVOp), DAG);
12769
        // Same unpck-with-self matches as at the top of the function, this time
        // without the OptForSize restriction.
12770   if (isUNPCKL_v_undef_Mask(M, VT, HasInt256))
12771     return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
12772   if (isUNPCKH_v_undef_Mask(M, VT, HasInt256))
12773     return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
12774
12775   //===--------------------------------------------------------------------===//
12776   // Generate target specific nodes for 128 or 256-bit shuffles only
12777   // supported in the AVX instruction set.
12778   //
12779
12780   // Handle VMOVDDUPY permutations
12781   if (V2IsUndef && isMOVDDUPYMask(M, VT, HasFp256))
12782     return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG);
12783
12784   // Handle VPERMILPS/D* permutations
        // v8i32 with Int256 and v16i32 are emitted as PSHUFD, every other type as
        // VPERMILPI.
12785   if (isVPERMILPMask(M, VT)) {
12786     if ((HasInt256 && VT == MVT::v8i32) || VT == MVT::v16i32)
12787       return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1,
12788                                   getShuffleSHUFImmediate(SVOp), DAG);
12789     return getTargetShuffleNode(X86ISD::VPERMILPI, dl, VT, V1,
12790                                 getShuffleSHUFImmediate(SVOp), DAG);
12791   }
12792
        // Idx is an out-parameter of isINSERT64x4Mask. On a match the low 256
        // bits of V2 are inserted into V1 at element offset Idx*(NumElems/2).
12793   unsigned Idx;
12794   if (VT.is512BitVector() && isINSERT64x4Mask(M, VT, &Idx))
12795     return Insert256BitVector(V1, Extract256BitVector(V2, 0, DAG, dl),
12796                               Idx*(NumElems/2), DAG, dl);
12797
12798   // Handle VPERM2F128/VPERM2I128 permutations
12799   if (isVPERM2X128Mask(M, VT, HasFp256))
12800     return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1,
12801                                 V2, getShuffleVPERM2X128Immediate(SVOp), DAG);
12802
12803   if (Subtarget->hasSSE41() && isINSERTPSMask(M, VT))
12804     return getINSERTPS(SVOp, dl, DAG);
12805
        // Imm8 is filled in by isPermImmMask when it matches.
12806   unsigned Imm8;
12807   if (V2IsUndef && HasInt256 && isPermImmMask(M, VT, Imm8))
12808     return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1, Imm8, DAG);
12809
        // Variable permute: taken for single-input 8-element 256-bit vectors when
        // Int256 is available, and for every 512-bit vector.
12810   if ((V2IsUndef && HasInt256 && VT.is256BitVector() && NumElems == 8) ||
12811       VT.is512BitVector()) {
12812     MVT MaskEltVT = MVT::getIntegerVT(VT.getVectorElementType().getSizeInBits());
12813     MVT MaskVectorVT = MVT::getVectorVT(MaskEltVT, NumElems);
          // Materialize the shuffle mask as a constant integer vector; undef
          // (negative) mask entries are replaced by index 0.
12814     SmallVector<SDValue, 16> permclMask;
12815     for (unsigned i = 0; i != NumElems; ++i) {
12816       permclMask.push_back(DAG.getConstant((M[i]>=0) ? M[i] : 0, MaskEltVT));
12817     }
12818
12819     SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVectorVT, permclMask);
12820     if (V2IsUndef)
12821       // Bitcast is for VPERMPS since mask is v8i32 but node takes v8f32
12822       return DAG.getNode(X86ISD::VPERMV, dl, VT,
12823                           DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1);
          // Two-input form: VPERMV3 takes V1, the index vector and V2.
12824     return DAG.getNode(X86ISD::VPERMV3, dl, VT, V1,
12825                        DAG.getNode(ISD::BITCAST, dl, VT, Mask), V2);
12826   }
12827
12828   //===--------------------------------------------------------------------===//
12829   // Since no target specific shuffle was selected for this generic one,
12830   // lower it into other known shuffles. FIXME: this isn't true yet, but
12831   // this is the plan.
12832   //
12833
        // The per-type helpers below may decline by returning an empty SDValue,
        // in which case the remaining fallbacks are still tried.
12834   // Handle v8i16 specifically since SSE can do byte extraction and insertion.
12835   if (VT == MVT::v8i16) {
12836     SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(Op, Subtarget, DAG);
12837     if (NewOp.getNode())
12838       return NewOp;
12839   }
12840
12841   if (VT == MVT::v16i16 && Subtarget->hasInt256()) {
12842     SDValue NewOp = LowerVECTOR_SHUFFLEv16i16(Op, DAG);
12843     if (NewOp.getNode())
12844       return NewOp;
12845   }
12846
12847   if (VT == MVT::v16i8) {
12848     SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, Subtarget, DAG);
12849     if (NewOp.getNode())
12850       return NewOp;
12851   }
12852
12853   if (VT == MVT::v32i8) {
12854     SDValue NewOp = LowerVECTOR_SHUFFLEv32i8(SVOp, Subtarget, DAG);
12855     if (NewOp.getNode())
12856       return NewOp;
12857   }
12858
12859   // Handle all 128-bit wide vectors with 4 elements, and match them with
12860   // several different shuffle types.
12861   if (NumElems == 4 && VT.is128BitVector())
12862     return LowerVECTOR_SHUFFLE_128v4(SVOp, DAG);
12863
12864   // Handle general 256-bit shuffles
12865   if (VT.is256BitVector())
12866     return LowerVECTOR_SHUFFLE_256(SVOp, DAG);
12867
        // Nothing matched: report that no custom lowering was produced.
12868   return SDValue();
12869 }
12870
12871 // This function assumes its argument is a BUILD_VECTOR of constants or
12872 // undef SDNodes. i.e: ISD::isBuildVectorOfConstantSDNodes(BuildVector) is
12873 // true.
      //
      // Converts the per-element select condition held in BuildVector into the
      // bit mask of an immediate-controlled blend and stores it in MaskValue
      // (which is reset to 0 first). Bit i is set when element i should come
      // from the second blend operand, i.e. when the condition is zero.
      //
      // Returns true on success. Returns false when the vector spans two lanes
      // and the two lanes disagree on a constant condition for the same
      // in-lane position, since one mask bit then cannot describe both.
12874 static bool BUILD_VECTORtoBlendMask(BuildVectorSDNode *BuildVector,
12875                                     unsigned &MaskValue) {
12876   MaskValue = 0;
12877   unsigned NumElems = BuildVector->getNumOperands();
12878   // There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
        // (As written the formula is (NumElems - 1) / 8 + 1, so 16 elements give
        // two lanes of 8.)
12879   unsigned NumLanes = (NumElems - 1) / 8 + 1;
12880   unsigned NumElemsInLane = NumElems / NumLanes;
12881
12882   // Blend for v16i16 should be symmetric for both lanes.
12883   for (unsigned i = 0; i < NumElemsInLane; ++i) {
12884     SDValue EltCond = BuildVector->getOperand(i);
          // With a single lane the "second lane" element is the element itself.
12885     SDValue SndLaneEltCond =
12886         (NumLanes == 2) ? BuildVector->getOperand(i + NumElemsInLane) : EltCond;
12887
          // -1 means "not a constant" (e.g. undef); otherwise 0 or 1 depending on
          // whether the constant is non-zero.
12888     int Lane1Cond = -1, Lane2Cond = -1;
12889     if (isa<ConstantSDNode>(EltCond))
12890       Lane1Cond = !isZero(EltCond);
12891     if (isa<ConstantSDNode>(SndLaneEltCond))
12892       Lane2Cond = !isZero(SndLaneEltCond);
12893
12894     if (Lane1Cond == Lane2Cond || Lane2Cond < 0)
12895       // Lane1Cond != 0, means we want the first argument.
12896       // Lane1Cond == 0, means we want the second argument.
12897       // The encoding of this argument is 0 for the first argument, 1
12898       // for the second. Therefore, invert the condition.
            // If Lane1Cond is -1 here (both lanes non-constant), !Lane1Cond is 0
            // and the bit is left clear.
12899       MaskValue |= !Lane1Cond << i;
12900     else if (Lane1Cond < 0)
            // Only the second lane has a constant condition: let it decide.
12901       MaskValue |= !Lane2Cond << i;
12902     else
            // Both lanes are constant but disagree: not expressible.
12903       return false;
12904   }
12905   return true;
12906 }
12907
12908 /// \brief Try to lower a VSELECT instruction to an immediate-controlled blend
12909 /// instruction.
12910 static SDValue lowerVSELECTtoBLENDI(SDValue Op, const X86Subtarget *Subtarget,
12911                                     SelectionDAG &DAG) {
12912   SDValue Cond = Op.getOperand(0);
12913   SDValue LHS = Op.getOperand(1);
12914   SDValue RHS = Op.getOperand(2);
12915   SDLoc dl(Op);
12916   MVT VT = Op.getSimpleValueType();
12917   MVT EltVT = VT.getVectorElementType();
12918   unsigned NumElems = VT.getVectorNumElements();
12919
12920   // There is no blend with immediate in AVX-512.
12921   if (VT.is512BitVector())
12922     return SDValue();
12923
12924   if (!Subtarget->hasSSE41() || EltVT == MVT::i8)
12925     return SDValue();
12926   if (!Subtarget->hasInt256() && VT == MVT::v16i16)
12927     return SDValue();
12928
12929   if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
12930     return SDValue();
12931
12932   // Check the mask for BLEND and build the value.
12933   unsigned MaskValue = 0;
12934   if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue))
12935     return SDValue();
12936
12937   // Convert i32 vectors to floating point if it is not AVX2.
12938   // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors.
12939   MVT BlendVT = VT;
12940   if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) {
12941     BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()),
12942                                NumElems);
12943     LHS = DAG.getNode(ISD::BITCAST, dl, VT, LHS);
12944     RHS = DAG.getNode(ISD::BITCAST, dl, VT, RHS);
12945   }
12946
12947   SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, LHS, RHS,
12948                             DAG.getConstant(MaskValue, MVT::i32));
12949   return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
12950 }
12951
12952 SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
        // Custom lowering for VSELECT. Three outcomes are possible:
        //  - an empty SDValue, which hands the node back to the generic legalizer;
        //  - an immediate-controlled blend built by lowerVSELECTtoBLENDI;
        //  - Op itself, meaning the node is kept as is.
12953   // A vselect where all conditions and data are constants can be optimized into
12954   // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
12955   if (ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(0).getNode()) &&
12956       ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(1).getNode()) &&
12957       ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode()))
12958     return SDValue();
12959
12960   SDValue BlendOp = lowerVSELECTtoBLENDI(Op, Subtarget, DAG);
12961   if (BlendOp.getNode())
12962     return BlendOp;
12963
12964   // Some types for vselect were previously set to Expand, not Legal or
12965   // Custom. Return an empty SDValue so we fall-through to Expand, after
12966   // the Custom lowering phase.
        // Concretely: v8i16 and v16i16 are only kept when both BWI and VLX are
        // available; every other type falls through to "return Op".
12967   MVT VT = Op.getSimpleValueType();
12968   switch (VT.SimpleTy) {
12969   default:
12970     break;
12971   case MVT::v8i16:
12972   case MVT::v16i16:
12973     if (Subtarget->hasBWI() && Subtarget->hasVLX())
12974       break;
12975     return SDValue();
12976   }
12977
12978   // We couldn't create a "Blend with immediate" node.
12979   // This node should still be legal, but we'll have to emit a blendv*
12980   // instruction.
12981   return Op;
12982 }
12983
12984 static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
        // Lowering of EXTRACT_VECTOR_ELT for the case where the caller has
        // established SSE4.1 support. Operand 0 of Op is the source vector,
        // operand 1 the index. Only 128-bit source vectors are handled. Returns
        // the lowered value, Op itself when the node can stay as it is, or an
        // empty SDValue when this routine has nothing to offer.
        //
        // The 16-bit path casts the index to ConstantSDNode unconditionally; the
        // caller visible in this file (LowerEXTRACT_VECTOR_ELT) only gets here
        // with a constant index.
12985   MVT VT = Op.getSimpleValueType();
12986   SDLoc dl(Op);
12987
12988   if (!Op.getOperand(0).getSimpleValueType().is128BitVector())
12989     return SDValue();
12990
        // 8-bit result: PEXTRB yields an i32; AssertZext records that it is
        // zero-extended from VT, and the value is then truncated back to VT.
12991   if (VT.getSizeInBits() == 8) {
12992     SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
12993                                   Op.getOperand(0), Op.getOperand(1));
12994     SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
12995                                   DAG.getValueType(VT));
12996     return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
12997   }
12998
12999   if (VT.getSizeInBits() == 16) {
13000     unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
13001     // If Idx is 0, it's cheaper to do a move instead of a pextrw.
          // The vector is viewed as v4i32, element 0 is extracted as i32 and then
          // truncated to i16.
13002     if (Idx == 0)
13003       return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
13004                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
13005                                      DAG.getNode(ISD::BITCAST, dl,
13006                                                  MVT::v4i32,
13007                                                  Op.getOperand(0)),
13008                                      Op.getOperand(1)));
          // Otherwise same scheme as the 8-bit case, using PEXTRW.
13009     SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
13010                                   Op.getOperand(0), Op.getOperand(1));
13011     SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
13012                                   DAG.getValueType(VT));
13013     return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
13014   }
13015
13016   if (VT == MVT::f32) {
13017     // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
13018     // the result back to FR32 register. It's only worth matching if the
13019     // result has a single use which is a store or a bitcast to i32.  And in
13020     // the case of a store, it's not worth it if the index is a constant 0,
13021     // because a MOVSSmr can be used instead, which is smaller and faster.
13022     if (!Op.hasOneUse())
13023       return SDValue();
13024     SDNode *User = *Op.getNode()->use_begin();
          // Give up unless the single user is either a store with an index that
          // is not the constant 0, or a bitcast to i32.
13025     if ((User->getOpcode() != ISD::STORE ||
13026          (isa<ConstantSDNode>(Op.getOperand(1)) &&
13027           cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) &&
13028         (User->getOpcode() != ISD::BITCAST ||
13029          User->getValueType(0) != MVT::i32))
13030       return SDValue();
          // Extract as an i32 from the v4i32 view of the vector and cast the
          // scalar back to f32.
13031     SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
13032                                   DAG.getNode(ISD::BITCAST, dl, MVT::v4i32,
13033                                               Op.getOperand(0)),
13034                                               Op.getOperand(1));
13035     return DAG.getNode(ISD::BITCAST, dl, MVT::f32, Extract);
13036   }
13037
13038   if (VT == MVT::i32 || VT == MVT::i64) {
13039     // ExtractPS/pextrq works with constant index.
          // Returning Op unchanged keeps the node as it is.
13040     if (isa<ConstantSDNode>(Op.getOperand(1)))
13041       return Op;
13042   }
13043   return SDValue();
13044 }
13045
13046 /// Extract one bit from mask vector, like v16i1 or v8i1.
13047 /// AVX-512 feature.
      ///
      /// \p Op is the extract node: operand 0 is the mask vector, operand 1 the
      /// index, and the result type must be i1. A variable index is handled by
      /// widening the mask to a regular vector; a constant index is handled with
      /// a shift-left / shift-right pair on the mask followed by an extract of
      /// element 0.
13048 SDValue
13049 X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const {
13050   SDValue Vec = Op.getOperand(0);
13051   SDLoc dl(Vec);
13052   MVT VecVT = Vec.getSimpleValueType();
13053   SDValue Idx = Op.getOperand(1);
13054   MVT EltVT = Op.getSimpleValueType();
13055
13056   assert((EltVT == MVT::i1) && "Unexpected operands in ExtractBitFromMaskVector");
13057   assert((VecVT.getVectorNumElements() <= 16 || Subtarget->hasBWI()) &&
13058          "Unexpected vector type in ExtractBitFromMaskVector");
13059
13060   // variable index can't be handled in mask registers,
13061   // extend vector to VR512
        // v8i1 is widened to v8i64, anything else to v16i32; the element is then
        // extracted from the wide vector and truncated back to i1.
13062   if (!isa<ConstantSDNode>(Idx)) {
13063     MVT ExtVT = (VecVT == MVT::v8i1 ?  MVT::v8i64 : MVT::v16i32);
13064     SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Vec);
13065     SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
13066                               ExtVT.getVectorElementType(), Ext, Idx);
13067     return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
13068   }
13069
13070   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
        // Pick the register class that determines the shift width. Without DQI,
        // masks of at most 8 elements use the register class of v16i1.
13071   const TargetRegisterClass* rc = getRegClassFor(VecVT);
13072   if (!Subtarget->hasDQI() && (VecVT.getVectorNumElements() <= 8))
13073     rc = getRegClassFor(MVT::v16i1);
        // "MaxSift" (sic, presumably "max shift") is the register width in bits
        // minus one, assuming rc->getSize() is in bytes -- TODO confirm.
13074   unsigned MaxSift = rc->getSize()*8 - 1;
        // Shift left by MaxSift - IdxVal, then right by MaxSift, so that the
        // requested bit ends up in position 0.
13075   Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
13076                     DAG.getConstant(MaxSift - IdxVal, MVT::i8));
13077   Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec,
13078                     DAG.getConstant(MaxSift, MVT::i8));
13079   return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i1, Vec,
13080                        DAG.getIntPtrConstant(0));
13081 }
13082
13083 SDValue
13084 X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
13085                                            SelectionDAG &DAG) const {
        // Custom lowering for EXTRACT_VECTOR_ELT. Operand 0 of Op is the source
        // vector and operand 1 the element index. Returns the lowered value, Op
        // itself when the node is left unchanged, or an empty SDValue when no
        // custom lowering is produced.
13086   SDLoc dl(Op);
13087   SDValue Vec = Op.getOperand(0);
13088   MVT VecVT = Vec.getSimpleValueType();
13089   SDValue Idx = Op.getOperand(1);
13090
        // i1 results come from mask vectors and have their own lowering.
13091   if (Op.getSimpleValueType() == MVT::i1)
13092     return ExtractBitFromMaskVector(Op, DAG);
13093
        // Variable index: only handled for 512-bit vectors, and for 256-bit
        // vectors with 32-bit elements when Int256 is available. Everything past
        // this block can rely on Idx being a ConstantSDNode.
13094   if (!isa<ConstantSDNode>(Idx)) {
13095     if (VecVT.is512BitVector() ||
13096         (VecVT.is256BitVector() && Subtarget->hasInt256() &&
13097          VecVT.getVectorElementType().getSizeInBits() == 32)) {
13098
13099       MVT MaskEltVT =
13100         MVT::getIntegerVT(VecVT.getVectorElementType().getSizeInBits());
13101       MVT MaskVT = MVT::getVectorVT(MaskEltVT, VecVT.getSizeInBits() /
13102                                     MaskEltVT.getSizeInBits());
13103
            // Build a permute mask that is zero except for Idx in element 0, let
            // VPERMV apply it to Vec, and read element 0 of the result.
13104       Idx = DAG.getZExtOrTrunc(Idx, dl, MaskEltVT);
13105       SDValue Mask = DAG.getNode(X86ISD::VINSERT, dl, MaskVT,
13106                                 getZeroVector(MaskVT, Subtarget, DAG, dl),
13107                                 Idx, DAG.getConstant(0, getPointerTy()));
13108       SDValue Perm = DAG.getNode(X86ISD::VPERMV, dl, VecVT, Mask, Vec);
13109       return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(),
13110                         Perm, DAG.getConstant(0, getPointerTy()));
13111     }
13112     return SDValue();
13113   }
13114
13115   // If the source vector is 256 or 512 bits wide, first extract a 128-bit
13116   // vector and then extract the element from that 128-bit vector.
13117   if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
13118
13119     unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
13120     // Get the 128-bit vector.
          // (Presumably the 128-bit chunk that contains element IdxVal -- the
          // helper is not visible here.)
13121     Vec = Extract128BitVector(Vec, IdxVal, DAG, dl);
13122     MVT EltVT = VecVT.getVectorElementType();
13123
13124     unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
13125
13126     //if (IdxVal >= NumElems/2)
13127     //  IdxVal -= NumElems/2;
          // Reduce the index to its position inside a 128-bit chunk (this is
          // IdxVal modulo ElemsPerChunk).
13128     IdxVal -= (IdxVal/ElemsPerChunk)*ElemsPerChunk;
13129     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
13130                        DAG.getConstant(IdxVal, MVT::i32));
13131   }
13132
13133   assert(VecVT.is128BitVector() && "Unexpected vector length");
13134
        // Prefer the SSE4.1 lowering; fall through when it declines.
13135   if (Subtarget->hasSSE41()) {
13136     SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG);
13137     if (Res.getNode())
13138       return Res;
13139   }
13140
13141   MVT VT = Op.getSimpleValueType();
13142   // TODO: handle v16i8.
13143   if (VT.getSizeInBits() == 16) {
          // Note: these locals shadow the function-level Vec and Idx.
13144     SDValue Vec = Op.getOperand(0);
13145     unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
          // Index 0: extract element 0 of the v4i32 view and truncate to i16.
13146     if (Idx == 0)
13147       return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
13148                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
13149                                      DAG.getNode(ISD::BITCAST, dl,
13150                                                  MVT::v4i32, Vec),
13151                                      Op.getOperand(1)));
13152     // Transform it so it match pextrw which produces a 32-bit result.
13153     MVT EltVT = MVT::i32;
13154     SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT,
13155                                   Op.getOperand(0), Op.getOperand(1));
13156     SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract,
13157                                   DAG.getValueType(VT));
13158     return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
13159   }
13160
13161   if (VT.getSizeInBits() == 32) {
13162     unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
          // Element 0 needs no shuffle; keep the node as is.
13163     if (Idx == 0)
13164       return Op;
13165
13166     // SHUFPS the element to the lowest double word, then movss.
          // Only mask entry 0 is defined; the remaining entries are undef (-1).
13167     int Mask[4] = { static_cast<int>(Idx), -1, -1, -1 };
13168     MVT VVT = Op.getOperand(0).getSimpleValueType();
13169     SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
13170                                        DAG.getUNDEF(VVT), Mask);
13171     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
13172                        DAG.getIntPtrConstant(0));
13173   }
13174
13175   if (VT.getSizeInBits() == 64) {
13176     // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
13177     // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
13178     //        to match extract_elt for f64.
13179     unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
13180     if (Idx == 0)
13181       return Op;
13182
13183     // UNPCKHPD the element to the lowest double word, then movsd.
13184     // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
13185     // to a f64mem, the whole operation is folded into a single MOVHPDmr.
          // The mask is hard-coded to select element 1, the only non-zero index a
          // two-element vector can have.
13186     int Mask[2] = { 1, -1 };
13187     MVT VVT = Op.getOperand(0).getSimpleValueType();
13188     SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
13189                                        DAG.getUNDEF(VVT), Mask);
13190     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
13191                        DAG.getIntPtrConstant(0));
13192   }
13193
        // Remaining result sizes (e.g. 8 bits without the SSE4.1 path) are not
        // handled here.
13194   return SDValue();
13195 }
13196
13197 /// Insert one bit to mask vector, like v16i1 or v8i1.
13198 /// AVX-512 feature.
      ///
      /// \p Op is the insert node: operand 0 is the mask vector, operand 1 the
      /// element to insert and operand 2 the index. A variable index is handled
      /// by widening to a regular vector, inserting there and truncating back; a
      /// constant index is handled with shifts on the mask type followed by an
      /// OR into the original vector.
13199 SDValue
13200 X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
13201   SDLoc dl(Op);
13202   SDValue Vec = Op.getOperand(0);
13203   SDValue Elt = Op.getOperand(1);
13204   SDValue Idx = Op.getOperand(2);
13205   MVT VecVT = Vec.getSimpleValueType();
13206
13207   if (!isa<ConstantSDNode>(Idx)) {
13208     // Non constant index. Extend source and destination,
13209     // insert element and then truncate the result.
          // v8i1 is widened to v8i64 (element i64), anything else to v16i32
          // (element i32).
13210     MVT ExtVecVT = (VecVT == MVT::v8i1 ?  MVT::v8i64 : MVT::v16i32);
13211     MVT ExtEltVT = (VecVT == MVT::v8i1 ?  MVT::i64 : MVT::i32);
13212     SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
13213       DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec),
13214       DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx);
13215     return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
13216   }
13217
13218   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
13219   SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt);
        // Inserting into undef: just move the new bit to position IdxVal.
13220   if (Vec.getOpcode() == ISD::UNDEF)
13221     return DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
13222                        DAG.getConstant(IdxVal, MVT::i8));
13223   const TargetRegisterClass* rc = getRegClassFor(VecVT);
        // "MaxSift" (sic, presumably "max shift") is the register width in bits
        // minus one, assuming rc->getSize() is in bytes -- TODO confirm.
13224   unsigned MaxSift = rc->getSize()*8 - 1;
        // Shift the new bit all the way up, then back down so that it lands at
        // position IdxVal; presumably this also clears every other bit of
        // EltInVec -- confirm against the VSHLI/VSRLI semantics for mask types.
13225   EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
13226                     DAG.getConstant(MaxSift, MVT::i8));
13227   EltInVec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, EltInVec,
13228                     DAG.getConstant(MaxSift - IdxVal, MVT::i8));
        // NOTE(review): OR can only set bits. If Vec already has a 1 at IdxVal
        // and Elt is 0, the bit presumably stays set -- verify whether callers
        // can reach this case.
13229   return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
13230 }
13231
13232 SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
13233                                                   SelectionDAG &DAG) const {
13234   MVT VT = Op.getSimpleValueType();
13235   MVT EltVT = VT.getVectorElementType();
13236
13237   if (EltVT == MVT::i1)
13238     return InsertBitToMaskVector(Op, DAG);
13239
13240   SDLoc dl(Op);
13241   SDValue N0 = Op.getOperand(0);
13242   SDValue N1 = Op.getOperand(1);
13243   SDValue N2 = Op.getOperand(2);
13244   if (!isa<ConstantSDNode>(N2))
13245     return SDValue();
13246   auto *N2C = cast<ConstantSDNode>(N2);
13247   unsigned IdxVal = N2C->getZExtValue();
13248
13249   // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
13250   // into that, and then insert the subvector back into the result.
13251   if (VT.is256BitVector() || VT.is512BitVector()) {
13252     // Get the desired 128-bit vector half.
13253     SDValue V = Extract128BitVector(N0, IdxVal, DAG, dl);
13254
13255     // Insert the element into the desired half.
13256     unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
13257     unsigned IdxIn128 = IdxVal - (IdxVal / NumEltsIn128) * NumEltsIn128;
13258
13259     V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
13260                     DAG.getConstant(IdxIn128, MVT::i32));
13261
13262     // Insert the changed part back to the 256-bit vector
13263     return Insert128BitVector(N0, V, IdxVal, DAG, dl);
13264   }
13265   assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
13266
13267   if (Subtarget->hasSSE41()) {
13268     if (EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) {
13269       unsigned Opc;
13270       if (VT == MVT::v8i16) {
13271         Opc = X86ISD::PINSRW;
13272       } else {
13273         assert(VT == MVT::v16i8);
13274         Opc = X86ISD::PINSRB;
13275       }
13276
13277       // Transform it so it match pinsr{b,w} which expects a GR32 as its second
13278       // argument.
13279       if (N1.getValueType() != MVT::i32)
13280         N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
13281       if (N2.getValueType() != MVT::i32)
13282         N2 = DAG.getIntPtrConstant(IdxVal);
13283       return DAG.getNode(Opc, dl, VT, N0, N1, N2);
13284     }
13285
13286     if (EltVT == MVT::f32) {
13287       // Bits [7:6] of the constant are the source select.  This will always be
13288       //  zero here.  The DAG Combiner may combine an extract_elt index into
13289       //  these
13290       //  bits.  For example (insert (extract, 3), 2) could be matched by
13291       //  putting
13292       //  the '3' into bits [7:6] of X86ISD::INSERTPS.
13293       // Bits [5:4] of the constant are the destination select.  This is the
13294       //  value of the incoming immediate.
13295       // Bits [3:0] of the constant are the zero mask.  The DAG Combiner may
13296       //   combine either bitwise AND or insert of float 0.0 to set these bits.
13297       N2 = DAG.getIntPtrConstant(IdxVal << 4);
13298       // Create this as a scalar to vector..
13299       N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
13300       return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
13301     }
13302
13303     if (EltVT == MVT::i32 || EltVT == MVT::i64) {
13304       // PINSR* works with constant index.
13305       return Op;
13306     }
13307   }
13308
13309   if (EltVT == MVT::i8)
13310     return SDValue();
13311
13312   if (EltVT.getSizeInBits() == 16) {
13313     // Transform it so it match pinsrw which expects a 16-bit value in a GR32
13314     // as its second argument.
13315     if (N1.getValueType() != MVT::i32)
13316       N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
13317     if (N2.getValueType() != MVT::i32)
13318       N2 = DAG.getIntPtrConstant(IdxVal);
13319     return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2);
13320   }
13321   return SDValue();
13322 }
13323
13324 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
13325   SDLoc dl(Op);
13326   MVT OpVT = Op.getSimpleValueType();
13327
13328   // If this is a 256-bit vector result, first insert into a 128-bit
13329   // vector and then insert into the 256-bit vector.
13330   if (!OpVT.is128BitVector()) {
13331     // Insert into a 128-bit vector.
13332     unsigned SizeFactor = OpVT.getSizeInBits()/128;
13333     MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
13334                                  OpVT.getVectorNumElements() / SizeFactor);
13335
13336     Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
13337
13338     // Insert the 128-bit vector.
13339     return Insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
13340   }
13341
13342   if (OpVT == MVT::v1i64 &&
13343       Op.getOperand(0).getValueType() == MVT::i64)
13344     return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0));
13345
13346   SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
13347   assert(OpVT.is128BitVector() && "Expected an SSE type!");
13348   return DAG.getNode(ISD::BITCAST, dl, OpVT,
13349                      DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,AnyExt));
13350 }
13351
13352 // Lower a node with an EXTRACT_SUBVECTOR opcode.  This may result in
13353 // a simple subregister reference or explicit instructions to grab
13354 // upper bits of a vector.
13355 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
13356                                       SelectionDAG &DAG) {
13357   SDLoc dl(Op);
13358   SDValue In =  Op.getOperand(0);
13359   SDValue Idx = Op.getOperand(1);
13360   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
13361   MVT ResVT   = Op.getSimpleValueType();
13362   MVT InVT    = In.getSimpleValueType();
13363
13364   if (Subtarget->hasFp256()) {
13365     if (ResVT.is128BitVector() &&
13366         (InVT.is256BitVector() || InVT.is512BitVector()) &&
13367         isa<ConstantSDNode>(Idx)) {
13368       return Extract128BitVector(In, IdxVal, DAG, dl);
13369     }
13370     if (ResVT.is256BitVector() && InVT.is512BitVector() &&
13371         isa<ConstantSDNode>(Idx)) {
13372       return Extract256BitVector(In, IdxVal, DAG, dl);
13373     }
13374   }
13375   return SDValue();
13376 }
13377
13378 // Lower a node with an INSERT_SUBVECTOR opcode.  This may result in a
13379 // simple superregister reference or explicit instructions to insert
13380 // the upper bits of a vector.
13381 static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
13382                                      SelectionDAG &DAG) {
13383   if (!Subtarget->hasAVX())
13384     return SDValue();
13385
13386   SDLoc dl(Op);
13387   SDValue Vec = Op.getOperand(0);
13388   SDValue SubVec = Op.getOperand(1);
13389   SDValue Idx = Op.getOperand(2);
13390
13391   if (!isa<ConstantSDNode>(Idx))
13392     return SDValue();
13393
13394   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
13395   MVT OpVT = Op.getSimpleValueType();
13396   MVT SubVecVT = SubVec.getSimpleValueType();
13397
13398   // Fold two 16-byte subvector loads into one 32-byte load:
13399   // (insert_subvector (insert_subvector undef, (load addr), 0),
13400   //                   (load addr + 16), Elts/2)
13401   // --> load32 addr
13402   if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
13403       Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
13404       OpVT.is256BitVector() && SubVecVT.is128BitVector() &&
13405       !Subtarget->isUnalignedMem32Slow()) {
13406     SDValue SubVec2 = Vec.getOperand(1);
13407     if (auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2))) {
13408       if (Idx2->getZExtValue() == 0) {
13409         SDValue Ops[] = { SubVec2, SubVec };
13410         SDValue LD = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false);
13411         if (LD.getNode())
13412           return LD;
13413       }
13414     }
13415   }
13416
13417   if ((OpVT.is256BitVector() || OpVT.is512BitVector()) &&
13418       SubVecVT.is128BitVector())
13419     return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
13420
13421   if (OpVT.is512BitVector() && SubVecVT.is256BitVector())
13422     return Insert256BitVector(Vec, SubVec, IdxVal, DAG, dl);
13423
13424   return SDValue();
13425 }
13426
13427 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
13428 // their target countpart wrapped in the X86ISD::Wrapper node. Suppose N is
13429 // one of the above mentioned nodes. It has to be wrapped because otherwise
13430 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
13431 // be used to form addressing mode. These wrapped nodes will be selected
13432 // into MOV32ri.
13433 SDValue
13434 X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
13435   ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
13436
13437   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
13438   // global base reg.
13439   unsigned char OpFlag = 0;
13440   unsigned WrapperKind = X86ISD::Wrapper;
13441   CodeModel::Model M = DAG.getTarget().getCodeModel();
13442
13443   if (Subtarget->isPICStyleRIPRel() &&
13444       (M == CodeModel::Small || M == CodeModel::Kernel))
13445     WrapperKind = X86ISD::WrapperRIP;
13446   else if (Subtarget->isPICStyleGOT())
13447     OpFlag = X86II::MO_GOTOFF;
13448   else if (Subtarget->isPICStyleStubPIC())
13449     OpFlag = X86II::MO_PIC_BASE_OFFSET;
13450
13451   SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(),
13452                                              CP->getAlignment(),
13453                                              CP->getOffset(), OpFlag);
13454   SDLoc DL(CP);
13455   Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
13456   // With PIC, the address is actually $g + Offset.
13457   if (OpFlag) {
13458     Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
13459                          DAG.getNode(X86ISD::GlobalBaseReg,
13460                                      SDLoc(), getPointerTy()),
13461                          Result);
13462   }
13463
13464   return Result;
13465 }
13466
13467 SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
13468   JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
13469
13470   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
13471   // global base reg.
13472   unsigned char OpFlag = 0;
13473   unsigned WrapperKind = X86ISD::Wrapper;
13474   CodeModel::Model M = DAG.getTarget().getCodeModel();
13475
13476   if (Subtarget->isPICStyleRIPRel() &&
13477       (M == CodeModel::Small || M == CodeModel::Kernel))
13478     WrapperKind = X86ISD::WrapperRIP;
13479   else if (Subtarget->isPICStyleGOT())
13480     OpFlag = X86II::MO_GOTOFF;
13481   else if (Subtarget->isPICStyleStubPIC())
13482     OpFlag = X86II::MO_PIC_BASE_OFFSET;
13483
13484   SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(),
13485                                           OpFlag);
13486   SDLoc DL(JT);
13487   Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
13488
13489   // With PIC, the address is actually $g + Offset.
13490   if (OpFlag)
13491     Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
13492                          DAG.getNode(X86ISD::GlobalBaseReg,
13493                                      SDLoc(), getPointerTy()),
13494                          Result);
13495
13496   return Result;
13497 }
13498
13499 SDValue
13500 X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
13501   const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
13502
13503   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
13504   // global base reg.
13505   unsigned char OpFlag = 0;
13506   unsigned WrapperKind = X86ISD::Wrapper;
13507   CodeModel::Model M = DAG.getTarget().getCodeModel();
13508
13509   if (Subtarget->isPICStyleRIPRel() &&
13510       (M == CodeModel::Small || M == CodeModel::Kernel)) {
13511     if (Subtarget->isTargetDarwin() || Subtarget->isTargetELF())
13512       OpFlag = X86II::MO_GOTPCREL;
13513     WrapperKind = X86ISD::WrapperRIP;
13514   } else if (Subtarget->isPICStyleGOT()) {
13515     OpFlag = X86II::MO_GOT;
13516   } else if (Subtarget->isPICStyleStubPIC()) {
13517     OpFlag = X86II::MO_DARWIN_NONLAZY_PIC_BASE;
13518   } else if (Subtarget->isPICStyleStubNoDynamic()) {
13519     OpFlag = X86II::MO_DARWIN_NONLAZY;
13520   }
13521
13522   SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag);
13523
13524   SDLoc DL(Op);
13525   Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
13526
13527   // With PIC, the address is actually $g + Offset.
13528   if (DAG.getTarget().getRelocationModel() == Reloc::PIC_ &&
13529       !Subtarget->is64Bit()) {
13530     Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
13531                          DAG.getNode(X86ISD::GlobalBaseReg,
13532                                      SDLoc(), getPointerTy()),
13533                          Result);
13534   }
13535
13536   // For symbols that require a load from a stub to get the address, emit the
13537   // load.
13538   if (isGlobalStubReference(OpFlag))
13539     Result = DAG.getLoad(getPointerTy(), DL, DAG.getEntryNode(), Result,
13540                          MachinePointerInfo::getGOT(), false, false, false, 0);
13541
13542   return Result;
13543 }
13544
13545 SDValue
13546 X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
13547   // Create the TargetBlockAddressAddress node.
13548   unsigned char OpFlags =
13549     Subtarget->ClassifyBlockAddressReference();
13550   CodeModel::Model M = DAG.getTarget().getCodeModel();
13551   const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
13552   int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
13553   SDLoc dl(Op);
13554   SDValue Result = DAG.getTargetBlockAddress(BA, getPointerTy(), Offset,
13555                                              OpFlags);
13556
13557   if (Subtarget->isPICStyleRIPRel() &&
13558       (M == CodeModel::Small || M == CodeModel::Kernel))
13559     Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
13560   else
13561     Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
13562
13563   // With PIC, the address is actually $g + Offset.
13564   if (isGlobalRelativeToPICBase(OpFlags)) {
13565     Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
13566                          DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
13567                          Result);
13568   }
13569
13570   return Result;
13571 }
13572
13573 SDValue
13574 X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, SDLoc dl,
13575                                       int64_t Offset, SelectionDAG &DAG) const {
13576   // Create the TargetGlobalAddress node, folding in the constant
13577   // offset if it is legal.
13578   unsigned char OpFlags =
13579       Subtarget->ClassifyGlobalReference(GV, DAG.getTarget());
13580   CodeModel::Model M = DAG.getTarget().getCodeModel();
13581   SDValue Result;
13582   if (OpFlags == X86II::MO_NO_FLAG &&
13583       X86::isOffsetSuitableForCodeModel(Offset, M)) {
13584     // A direct static reference to a global.
13585     Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset);
13586     Offset = 0;
13587   } else {
13588     Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags);
13589   }
13590
13591   if (Subtarget->isPICStyleRIPRel() &&
13592       (M == CodeModel::Small || M == CodeModel::Kernel))
13593     Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
13594   else
13595     Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
13596
13597   // With PIC, the address is actually $g + Offset.
13598   if (isGlobalRelativeToPICBase(OpFlags)) {
13599     Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
13600                          DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
13601                          Result);
13602   }
13603
13604   // For globals that require a load from a stub to get the address, emit the
13605   // load.
13606   if (isGlobalStubReference(OpFlags))
13607     Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result,
13608                          MachinePointerInfo::getGOT(), false, false, false, 0);
13609
13610   // If there was a non-zero offset that we didn't fold, create an explicit
13611   // addition for it.
13612   if (Offset != 0)
13613     Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result,
13614                          DAG.getConstant(Offset, getPointerTy()));
13615
13616   return Result;
13617 }
13618
13619 SDValue
13620 X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
13621   const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
13622   int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
13623   return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG);
13624 }
13625
13626 static SDValue
13627 GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
13628            SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
13629            unsigned char OperandFlags, bool LocalDynamic = false) {
13630   MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
13631   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
13632   SDLoc dl(GA);
13633   SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
13634                                            GA->getValueType(0),
13635                                            GA->getOffset(),
13636                                            OperandFlags);
13637
13638   X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
13639                                            : X86ISD::TLSADDR;
13640
13641   if (InFlag) {
13642     SDValue Ops[] = { Chain,  TGA, *InFlag };
13643     Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
13644   } else {
13645     SDValue Ops[]  = { Chain, TGA };
13646     Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
13647   }
13648
13649   // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
13650   MFI->setAdjustsStack(true);
13651   MFI->setHasCalls(true);
13652
13653   SDValue Flag = Chain.getValue(1);
13654   return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
13655 }
13656
13657 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
13658 static SDValue
13659 LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
13660                                 const EVT PtrVT) {
13661   SDValue InFlag;
13662   SDLoc dl(GA);  // ? function entry point might be better
13663   SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
13664                                    DAG.getNode(X86ISD::GlobalBaseReg,
13665                                                SDLoc(), PtrVT), InFlag);
13666   InFlag = Chain.getValue(1);
13667
13668   return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
13669 }
13670
13671 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
13672 static SDValue
13673 LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
13674                                 const EVT PtrVT) {
13675   return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
13676                     X86::RAX, X86II::MO_TLSGD);
13677 }
13678
13679 static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
13680                                            SelectionDAG &DAG,
13681                                            const EVT PtrVT,
13682                                            bool is64Bit) {
13683   SDLoc dl(GA);
13684
13685   // Get the start address of the TLS block for this module.
13686   X86MachineFunctionInfo* MFI = DAG.getMachineFunction()
13687       .getInfo<X86MachineFunctionInfo>();
13688   MFI->incNumLocalDynamicTLSAccesses();
13689
13690   SDValue Base;
13691   if (is64Bit) {
13692     Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
13693                       X86II::MO_TLSLD, /*LocalDynamic=*/true);
13694   } else {
13695     SDValue InFlag;
13696     SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
13697         DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
13698     InFlag = Chain.getValue(1);
13699     Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
13700                       X86II::MO_TLSLDM, /*LocalDynamic=*/true);
13701   }
13702
13703   // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
13704   // of Base.
13705
13706   // Build x@dtpoff.
13707   unsigned char OperandFlags = X86II::MO_DTPOFF;
13708   unsigned WrapperKind = X86ISD::Wrapper;
13709   SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
13710                                            GA->getValueType(0),
13711                                            GA->getOffset(), OperandFlags);
13712   SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
13713
13714   // Add x@dtpoff with the base.
13715   return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
13716 }
13717
13718 // Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
13719 static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
13720                                    const EVT PtrVT, TLSModel::Model model,
13721                                    bool is64Bit, bool isPIC) {
13722   SDLoc dl(GA);
13723
13724   // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
13725   Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
13726                                                          is64Bit ? 257 : 256));
13727
13728   SDValue ThreadPointer =
13729       DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0),
13730                   MachinePointerInfo(Ptr), false, false, false, 0);
13731
13732   unsigned char OperandFlags = 0;
13733   // Most TLS accesses are not RIP relative, even on x86-64.  One exception is
13734   // initialexec.
13735   unsigned WrapperKind = X86ISD::Wrapper;
13736   if (model == TLSModel::LocalExec) {
13737     OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
13738   } else if (model == TLSModel::InitialExec) {
13739     if (is64Bit) {
13740       OperandFlags = X86II::MO_GOTTPOFF;
13741       WrapperKind = X86ISD::WrapperRIP;
13742     } else {
13743       OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
13744     }
13745   } else {
13746     llvm_unreachable("Unexpected model");
13747   }
13748
13749   // emit "addl x@ntpoff,%eax" (local exec)
13750   // or "addl x@indntpoff,%eax" (initial exec)
13751   // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
13752   SDValue TGA =
13753       DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
13754                                  GA->getOffset(), OperandFlags);
13755   SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
13756
13757   if (model == TLSModel::InitialExec) {
13758     if (isPIC && !is64Bit) {
13759       Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
13760                            DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
13761                            Offset);
13762     }
13763
13764     Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
13765                          MachinePointerInfo::getGOT(), false, false, false, 0);
13766   }
13767
13768   // The address of the thread local variable is the add of the thread
13769   // pointer with the offset of the variable.
13770   return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
13771 }
13772
13773 SDValue
13774 X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
13775
13776   GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
13777   const GlobalValue *GV = GA->getGlobal();
13778
13779   if (Subtarget->isTargetELF()) {
13780     TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
13781
13782     switch (model) {
13783       case TLSModel::GeneralDynamic:
13784         if (Subtarget->is64Bit())
13785           return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy());
13786         return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy());
13787       case TLSModel::LocalDynamic:
13788         return LowerToTLSLocalDynamicModel(GA, DAG, getPointerTy(),
13789                                            Subtarget->is64Bit());
13790       case TLSModel::InitialExec:
13791       case TLSModel::LocalExec:
13792         return LowerToTLSExecModel(
13793             GA, DAG, getPointerTy(), model, Subtarget->is64Bit(),
13794             DAG.getTarget().getRelocationModel() == Reloc::PIC_);
13795     }
13796     llvm_unreachable("Unknown TLS model.");
13797   }
13798
13799   if (Subtarget->isTargetDarwin()) {
13800     // Darwin only has one model of TLS.  Lower to that.
13801     unsigned char OpFlag = 0;
13802     unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ?
13803                            X86ISD::WrapperRIP : X86ISD::Wrapper;
13804
13805     // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
13806     // global base reg.
13807     bool PIC32 = (DAG.getTarget().getRelocationModel() == Reloc::PIC_) &&
13808                  !Subtarget->is64Bit();
13809     if (PIC32)
13810       OpFlag = X86II::MO_TLVP_PIC_BASE;
13811     else
13812       OpFlag = X86II::MO_TLVP;
13813     SDLoc DL(Op);
13814     SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
13815                                                 GA->getValueType(0),
13816                                                 GA->getOffset(), OpFlag);
13817     SDValue Offset = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
13818
13819     // With PIC32, the address is actually $g + Offset.
13820     if (PIC32)
13821       Offset = DAG.getNode(ISD::ADD, DL, getPointerTy(),
13822                            DAG.getNode(X86ISD::GlobalBaseReg,
13823                                        SDLoc(), getPointerTy()),
13824                            Offset);
13825
13826     // Lowering the machine isd will make sure everything is in the right
13827     // location.
13828     SDValue Chain = DAG.getEntryNode();
13829     SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
13830     SDValue Args[] = { Chain, Offset };
13831     Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
13832
13833     // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
13834     MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
13835     MFI->setAdjustsStack(true);
13836
13837     // And our return value (tls address) is in the standard call return value
13838     // location.
13839     unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
13840     return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy(),
13841                               Chain.getValue(1));
13842   }
13843
13844   if (Subtarget->isTargetKnownWindowsMSVC() ||
13845       Subtarget->isTargetWindowsGNU()) {
13846     // Just use the implicit TLS architecture
13847     // Need to generate someting similar to:
13848     //   mov     rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
13849     //                                  ; from TEB
13850     //   mov     ecx, dword [rel _tls_index]: Load index (from C runtime)
13851     //   mov     rcx, qword [rdx+rcx*8]
13852     //   mov     eax, .tls$:tlsvar
13853     //   [rax+rcx] contains the address
13854     // Windows 64bit: gs:0x58
13855     // Windows 32bit: fs:__tls_array
13856
13857     SDLoc dl(GA);
13858     SDValue Chain = DAG.getEntryNode();
13859
13860     // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
13861     // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
13862     // use its literal value of 0x2C.
13863     Value *Ptr = Constant::getNullValue(Subtarget->is64Bit()
13864                                         ? Type::getInt8PtrTy(*DAG.getContext(),
13865                                                              256)
13866                                         : Type::getInt32PtrTy(*DAG.getContext(),
13867                                                               257));
13868
13869     SDValue TlsArray =
13870         Subtarget->is64Bit()
13871             ? DAG.getIntPtrConstant(0x58)
13872             : (Subtarget->isTargetWindowsGNU()
13873                    ? DAG.getIntPtrConstant(0x2C)
13874                    : DAG.getExternalSymbol("_tls_array", getPointerTy()));
13875
13876     SDValue ThreadPointer =
13877         DAG.getLoad(getPointerTy(), dl, Chain, TlsArray,
13878                     MachinePointerInfo(Ptr), false, false, false, 0);
13879
13880     // Load the _tls_index variable
13881     SDValue IDX = DAG.getExternalSymbol("_tls_index", getPointerTy());
13882     if (Subtarget->is64Bit())
13883       IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, getPointerTy(), Chain,
13884                            IDX, MachinePointerInfo(), MVT::i32,
13885                            false, false, false, 0);
13886     else
13887       IDX = DAG.getLoad(getPointerTy(), dl, Chain, IDX, MachinePointerInfo(),
13888                         false, false, false, 0);
13889
13890     SDValue Scale = DAG.getConstant(Log2_64_Ceil(TD->getPointerSize()),
13891                                     getPointerTy());
13892     IDX = DAG.getNode(ISD::SHL, dl, getPointerTy(), IDX, Scale);
13893
13894     SDValue res = DAG.getNode(ISD::ADD, dl, getPointerTy(), ThreadPointer, IDX);
13895     res = DAG.getLoad(getPointerTy(), dl, Chain, res, MachinePointerInfo(),
13896                       false, false, false, 0);
13897
13898     // Get the offset of start of .tls section
13899     SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
13900                                              GA->getValueType(0),
13901                                              GA->getOffset(), X86II::MO_SECREL);
13902     SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), TGA);
13903
13904     // The address of the thread local variable is the add of the thread
13905     // pointer with the offset of the variable.
13906     return DAG.getNode(ISD::ADD, dl, getPointerTy(), res, Offset);
13907   }
13908
13909   llvm_unreachable("TLS not implemented for this target.");
13910 }
13911
13912 /// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values
13913 /// and take a 2 x i32 value to shift plus a shift amount.
13914 static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
13915   assert(Op.getNumOperands() == 3 && "Not a double-shift!");
13916   MVT VT = Op.getSimpleValueType();
13917   unsigned VTBits = VT.getSizeInBits();
13918   SDLoc dl(Op);
13919   bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
13920   SDValue ShOpLo = Op.getOperand(0);
13921   SDValue ShOpHi = Op.getOperand(1);
13922   SDValue ShAmt  = Op.getOperand(2);
13923   // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the
13924   // generic ISD nodes haven't. Insert an AND to be safe, it's optimized away
13925   // during isel.
13926   SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
13927                                   DAG.getConstant(VTBits - 1, MVT::i8));
13928   SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
13929                                      DAG.getConstant(VTBits - 1, MVT::i8))
13930                        : DAG.getConstant(0, VT);
13931
13932   SDValue Tmp2, Tmp3;
13933   if (Op.getOpcode() == ISD::SHL_PARTS) {
13934     Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
13935     Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);
13936   } else {
13937     Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
13938     Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
13939   }
13940
13941   // If the shift amount is larger or equal than the width of a part we can't
13942   // rely on the results of shld/shrd. Insert a test and select the appropriate
13943   // values for large shift amounts.
13944   SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
13945                                 DAG.getConstant(VTBits, MVT::i8));
13946   SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
13947                              AndNode, DAG.getConstant(0, MVT::i8));
13948
13949   SDValue Hi, Lo;
13950   SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8);
13951   SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
13952   SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };
13953
13954   if (Op.getOpcode() == ISD::SHL_PARTS) {
13955     Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
13956     Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
13957   } else {
13958     Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
13959     Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
13960   }
13961
13962   SDValue Ops[2] = { Lo, Hi };
13963   return DAG.getMergeValues(Ops, dl);
13964 }
13965
13966 SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
13967                                            SelectionDAG &DAG) const {
13968   MVT SrcVT = Op.getOperand(0).getSimpleValueType();
13969   SDLoc dl(Op);
13970
13971   if (SrcVT.isVector()) {
13972     if (SrcVT.getVectorElementType() == MVT::i1) {
13973       MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
13974       return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
13975                          DAG.getNode(ISD::SIGN_EXTEND, dl, IntegerVT,
13976                                      Op.getOperand(0)));
13977     }
13978     return SDValue();
13979   }
13980
13981   assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
13982          "Unknown SINT_TO_FP to lower!");
13983
13984   // These are really Legal; return the operand so the caller accepts it as
13985   // Legal.
13986   if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
13987     return Op;
13988   if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
13989       Subtarget->is64Bit()) {
13990     return Op;
13991   }
13992
13993   unsigned Size = SrcVT.getSizeInBits()/8;
13994   MachineFunction &MF = DAG.getMachineFunction();
13995   int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false);
13996   SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
13997   SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
13998                                StackSlot,
13999                                MachinePointerInfo::getFixedStack(SSFI),
14000                                false, false, 0);
14001   return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
14002 }
14003
14004 SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
14005                                      SDValue StackSlot,
14006                                      SelectionDAG &DAG) const {
14007   // Build the FILD
14008   SDLoc DL(Op);
14009   SDVTList Tys;
14010   bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
14011   if (useSSE)
14012     Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
14013   else
14014     Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
14015
14016   unsigned ByteSize = SrcVT.getSizeInBits()/8;
14017
14018   FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
14019   MachineMemOperand *MMO;
14020   if (FI) {
14021     int SSFI = FI->getIndex();
14022     MMO =
14023       DAG.getMachineFunction()
14024       .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
14025                             MachineMemOperand::MOLoad, ByteSize, ByteSize);
14026   } else {
14027     MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
14028     StackSlot = StackSlot.getOperand(1);
14029   }
14030   SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
14031   SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
14032                                            X86ISD::FILD, DL,
14033                                            Tys, Ops, SrcVT, MMO);
14034
14035   if (useSSE) {
14036     Chain = Result.getValue(1);
14037     SDValue InFlag = Result.getValue(2);
14038
14039     // FIXME: Currently the FST is flagged to the FILD_FLAG. This
14040     // shouldn't be necessary except that RFP cannot be live across
14041     // multiple blocks. When stackifier is fixed, they can be uncoupled.
14042     MachineFunction &MF = DAG.getMachineFunction();
14043     unsigned SSFISize = Op.getValueType().getSizeInBits()/8;
14044     int SSFI = MF.getFrameInfo()->CreateStackObject(SSFISize, SSFISize, false);
14045     SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
14046     Tys = DAG.getVTList(MVT::Other);
14047     SDValue Ops[] = {
14048       Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
14049     };
14050     MachineMemOperand *MMO =
14051       DAG.getMachineFunction()
14052       .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
14053                             MachineMemOperand::MOStore, SSFISize, SSFISize);
14054
14055     Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
14056                                     Ops, Op.getValueType(), MMO);
14057     Result = DAG.getLoad(Op.getValueType(), DL, Chain, StackSlot,
14058                          MachinePointerInfo::getFixedStack(SSFI),
14059                          false, false, false, 0);
14060   }
14061
14062   return Result;
14063 }
14064
14065 // LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion.
14066 SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
14067                                                SelectionDAG &DAG) const {
14068   // This algorithm is not obvious. Here it is what we're trying to output:
14069   /*
14070      movq       %rax,  %xmm0
14071      punpckldq  (c0),  %xmm0  // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
14072      subpd      (c1),  %xmm0  // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
14073      #ifdef __SSE3__
14074        haddpd   %xmm0, %xmm0
14075      #else
14076        pshufd   $0x4e, %xmm0, %xmm1
14077        addpd    %xmm1, %xmm0
14078      #endif
14079   */
14080
14081   SDLoc dl(Op);
14082   LLVMContext *Context = DAG.getContext();
14083
14084   // Build some magic constants.
14085   static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
14086   Constant *C0 = ConstantDataVector::get(*Context, CV0);
14087   SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16);
14088
14089   SmallVector<Constant*,2> CV1;
14090   CV1.push_back(
14091     ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
14092                                       APInt(64, 0x4330000000000000ULL))));
14093   CV1.push_back(
14094     ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
14095                                       APInt(64, 0x4530000000000000ULL))));
14096   Constant *C1 = ConstantVector::get(CV1);
14097   SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16);
14098
14099   // Load the 64-bit value into an XMM register.
14100   SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
14101                             Op.getOperand(0));
14102   SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
14103                               MachinePointerInfo::getConstantPool(),
14104                               false, false, false, 16);
14105   SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32,
14106                               DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, XR1),
14107                               CLod0);
14108
14109   SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
14110                               MachinePointerInfo::getConstantPool(),
14111                               false, false, false, 16);
14112   SDValue XR2F = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Unpck1);
14113   SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
14114   SDValue Result;
14115
14116   if (Subtarget->hasSSE3()) {
14117     // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
14118     Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
14119   } else {
14120     SDValue S2F = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Sub);
14121     SDValue Shuffle = getTargetShuffleNode(X86ISD::PSHUFD, dl, MVT::v4i32,
14122                                            S2F, 0x4E, DAG);
14123     Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
14124                          DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Shuffle),
14125                          Sub);
14126   }
14127
14128   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
14129                      DAG.getIntPtrConstant(0));
14130 }
14131
14132 // LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion.
14133 SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
14134                                                SelectionDAG &DAG) const {
14135   SDLoc dl(Op);
14136   // FP constant to bias correct the final result.
14137   SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
14138                                    MVT::f64);
14139
14140   // Load the 32-bit value into an XMM register.
14141   SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
14142                              Op.getOperand(0));
14143
14144   // Zero out the upper parts of the register.
14145   Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
14146
14147   Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
14148                      DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Load),
14149                      DAG.getIntPtrConstant(0));
14150
14151   // Or the load with the bias.
14152   SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64,
14153                            DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
14154                                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
14155                                                    MVT::v2f64, Load)),
14156                            DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
14157                                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
14158                                                    MVT::v2f64, Bias)));
14159   Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
14160                    DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or),
14161                    DAG.getIntPtrConstant(0));
14162
14163   // Subtract the bias.
14164   SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
14165
14166   // Handle final rounding.
14167   EVT DestVT = Op.getValueType();
14168
14169   if (DestVT.bitsLT(MVT::f64))
14170     return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
14171                        DAG.getIntPtrConstant(0));
14172   if (DestVT.bitsGT(MVT::f64))
14173     return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
14174
14175   // Handle final rounding.
14176   return Sub;
14177 }
14178
14179 static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
14180                                      const X86Subtarget &Subtarget) {
14181   // The algorithm is the following:
14182   // #ifdef __SSE4_1__
14183   //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
14184   //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
14185   //                                 (uint4) 0x53000000, 0xaa);
14186   // #else
14187   //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
14188   //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
14189   // #endif
14190   //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
14191   //     return (float4) lo + fhi;
14192
14193   SDLoc DL(Op);
14194   SDValue V = Op->getOperand(0);
14195   EVT VecIntVT = V.getValueType();
14196   bool Is128 = VecIntVT == MVT::v4i32;
14197   EVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
14198   // If we convert to something else than the supported type, e.g., to v4f64,
14199   // abort early.
14200   if (VecFloatVT != Op->getValueType(0))
14201     return SDValue();
14202
14203   unsigned NumElts = VecIntVT.getVectorNumElements();
14204   assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
14205          "Unsupported custom type");
14206   assert(NumElts <= 8 && "The size of the constant array must be fixed");
14207
14208   // In the #idef/#else code, we have in common:
14209   // - The vector of constants:
14210   // -- 0x4b000000
14211   // -- 0x53000000
14212   // - A shift:
14213   // -- v >> 16
14214
14215   // Create the splat vector for 0x4b000000.
14216   SDValue CstLow = DAG.getConstant(0x4b000000, MVT::i32);
14217   SDValue CstLowArray[] = {CstLow, CstLow, CstLow, CstLow,
14218                            CstLow, CstLow, CstLow, CstLow};
14219   SDValue VecCstLow = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT,
14220                                   makeArrayRef(&CstLowArray[0], NumElts));
14221   // Create the splat vector for 0x53000000.
14222   SDValue CstHigh = DAG.getConstant(0x53000000, MVT::i32);
14223   SDValue CstHighArray[] = {CstHigh, CstHigh, CstHigh, CstHigh,
14224                             CstHigh, CstHigh, CstHigh, CstHigh};
14225   SDValue VecCstHigh = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT,
14226                                    makeArrayRef(&CstHighArray[0], NumElts));
14227
14228   // Create the right shift.
14229   SDValue CstShift = DAG.getConstant(16, MVT::i32);
14230   SDValue CstShiftArray[] = {CstShift, CstShift, CstShift, CstShift,
14231                              CstShift, CstShift, CstShift, CstShift};
14232   SDValue VecCstShift = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT,
14233                                     makeArrayRef(&CstShiftArray[0], NumElts));
14234   SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
14235
14236   SDValue Low, High;
14237   if (Subtarget.hasSSE41()) {
14238     EVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
14239     //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
14240     SDValue VecCstLowBitcast =
14241         DAG.getNode(ISD::BITCAST, DL, VecI16VT, VecCstLow);
14242     SDValue VecBitcast = DAG.getNode(ISD::BITCAST, DL, VecI16VT, V);
14243     // Low will be bitcasted right away, so do not bother bitcasting back to its
14244     // original type.
14245     Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
14246                       VecCstLowBitcast, DAG.getConstant(0xaa, MVT::i32));
14247     //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
14248     //                                 (uint4) 0x53000000, 0xaa);
14249     SDValue VecCstHighBitcast =
14250         DAG.getNode(ISD::BITCAST, DL, VecI16VT, VecCstHigh);
14251     SDValue VecShiftBitcast =
14252         DAG.getNode(ISD::BITCAST, DL, VecI16VT, HighShift);
14253     // High will be bitcasted right away, so do not bother bitcasting back to
14254     // its original type.
14255     High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
14256                        VecCstHighBitcast, DAG.getConstant(0xaa, MVT::i32));
14257   } else {
14258     SDValue CstMask = DAG.getConstant(0xffff, MVT::i32);
14259     SDValue VecCstMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT, CstMask,
14260                                      CstMask, CstMask, CstMask);
14261     //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
14262     SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
14263     Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
14264
14265     //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
14266     High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
14267   }
14268
14269   // Create the vector constant for -(0x1.0p39f + 0x1.0p23f).
14270   SDValue CstFAdd = DAG.getConstantFP(
14271       APFloat(APFloat::IEEEsingle, APInt(32, 0xD3000080)), MVT::f32);
14272   SDValue CstFAddArray[] = {CstFAdd, CstFAdd, CstFAdd, CstFAdd,
14273                             CstFAdd, CstFAdd, CstFAdd, CstFAdd};
14274   SDValue VecCstFAdd = DAG.getNode(ISD::BUILD_VECTOR, DL, VecFloatVT,
14275                                    makeArrayRef(&CstFAddArray[0], NumElts));
14276
14277   //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
14278   SDValue HighBitcast = DAG.getNode(ISD::BITCAST, DL, VecFloatVT, High);
14279   SDValue FHigh =
14280       DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd);
14281   //     return (float4) lo + fhi;
14282   SDValue LowBitcast = DAG.getNode(ISD::BITCAST, DL, VecFloatVT, Low);
14283   return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
14284 }
14285
14286 SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
14287                                                SelectionDAG &DAG) const {
14288   SDValue N0 = Op.getOperand(0);
14289   MVT SVT = N0.getSimpleValueType();
14290   SDLoc dl(Op);
14291
14292   switch (SVT.SimpleTy) {
14293   default:
14294     llvm_unreachable("Custom UINT_TO_FP is not supported!");
14295   case MVT::v4i8:
14296   case MVT::v4i16:
14297   case MVT::v8i8:
14298   case MVT::v8i16: {
14299     MVT NVT = MVT::getVectorVT(MVT::i32, SVT.getVectorNumElements());
14300     return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
14301                        DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0));
14302   }
14303   case MVT::v4i32:
14304   case MVT::v8i32:
14305     return lowerUINT_TO_FP_vXi32(Op, DAG, *Subtarget);
14306   }
14307   llvm_unreachable(nullptr);
14308 }
14309
14310 SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
14311                                            SelectionDAG &DAG) const {
14312   SDValue N0 = Op.getOperand(0);
14313   SDLoc dl(Op);
14314
14315   if (Op.getValueType().isVector())
14316     return lowerUINT_TO_FP_vec(Op, DAG);
14317
14318   // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
14319   // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
14320   // the optimization here.
14321   if (DAG.SignBitIsZero(N0))
14322     return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);
14323
14324   MVT SrcVT = N0.getSimpleValueType();
14325   MVT DstVT = Op.getSimpleValueType();
14326   if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
14327     return LowerUINT_TO_FP_i64(Op, DAG);
14328   if (SrcVT == MVT::i32 && X86ScalarSSEf64)
14329     return LowerUINT_TO_FP_i32(Op, DAG);
14330   if (Subtarget->is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
14331     return SDValue();
14332
14333   // Make a 64-bit buffer, and use it to build an FILD.
14334   SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
14335   if (SrcVT == MVT::i32) {
14336     SDValue WordOff = DAG.getConstant(4, getPointerTy());
14337     SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl,
14338                                      getPointerTy(), StackSlot, WordOff);
14339     SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
14340                                   StackSlot, MachinePointerInfo(),
14341                                   false, false, 0);
14342     SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32),
14343                                   OffsetSlot, MachinePointerInfo(),
14344                                   false, false, 0);
14345     SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
14346     return Fild;
14347   }
14348
14349   assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
14350   SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
14351                                StackSlot, MachinePointerInfo(),
14352                                false, false, 0);
14353   // For i64 source, we need to add the appropriate power of 2 if the input
14354   // was negative.  This is the same as the optimization in
14355   // DAGTypeLegalizer::ExpandIntOp_UNIT_TO_FP, and for it to be safe here,
14356   // we must be careful to do the computation in x87 extended precision, not
14357   // in SSE. (The generic code can't know it's OK to do this, or how to.)
14358   int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
14359   MachineMemOperand *MMO =
14360     DAG.getMachineFunction()
14361     .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
14362                           MachineMemOperand::MOLoad, 8, 8);
14363
14364   SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
14365   SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
14366   SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
14367                                          MVT::i64, MMO);
14368
14369   APInt FF(32, 0x5F800000ULL);
14370
14371   // Check whether the sign bit is set.
14372   SDValue SignSet = DAG.getSetCC(dl,
14373                                  getSetCCResultType(*DAG.getContext(), MVT::i64),
14374                                  Op.getOperand(0), DAG.getConstant(0, MVT::i64),
14375                                  ISD::SETLT);
14376
14377   // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
14378   SDValue FudgePtr = DAG.getConstantPool(
14379                              ConstantInt::get(*DAG.getContext(), FF.zext(64)),
14380                                          getPointerTy());
14381
14382   // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
14383   SDValue Zero = DAG.getIntPtrConstant(0);
14384   SDValue Four = DAG.getIntPtrConstant(4);
14385   SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet,
14386                                Zero, Four);
14387   FudgePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(), FudgePtr, Offset);
14388
14389   // Load the value out, extending it from f32 to f80.
14390   // FIXME: Avoid the extend by constructing the right constant pool?
14391   SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(),
14392                                  FudgePtr, MachinePointerInfo::getConstantPool(),
14393                                  MVT::f32, false, false, false, 4);
14394   // Extend everything to 80 bits to force it to be done on x87.
14395   SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
14396   return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0));
14397 }
14398
14399 std::pair<SDValue,SDValue>
14400 X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
14401                                     bool IsSigned, bool IsReplace) const {
14402   SDLoc DL(Op);
14403
14404   EVT DstTy = Op.getValueType();
14405
14406   if (!IsSigned && !isIntegerTypeFTOL(DstTy)) {
14407     assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
14408     DstTy = MVT::i64;
14409   }
14410
14411   assert(DstTy.getSimpleVT() <= MVT::i64 &&
14412          DstTy.getSimpleVT() >= MVT::i16 &&
14413          "Unknown FP_TO_INT to lower!");
14414
14415   // These are really Legal.
14416   if (DstTy == MVT::i32 &&
14417       isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
14418     return std::make_pair(SDValue(), SDValue());
14419   if (Subtarget->is64Bit() &&
14420       DstTy == MVT::i64 &&
14421       isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
14422     return std::make_pair(SDValue(), SDValue());
14423
14424   // We lower FP->int64 either into FISTP64 followed by a load from a temporary
14425   // stack slot, or into the FTOL runtime function.
14426   MachineFunction &MF = DAG.getMachineFunction();
14427   unsigned MemSize = DstTy.getSizeInBits()/8;
14428   int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
14429   SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
14430
14431   unsigned Opc;
14432   if (!IsSigned && isIntegerTypeFTOL(DstTy))
14433     Opc = X86ISD::WIN_FTOL;
14434   else
14435     switch (DstTy.getSimpleVT().SimpleTy) {
14436     default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
14437     case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
14438     case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
14439     case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
14440     }
14441
14442   SDValue Chain = DAG.getEntryNode();
14443   SDValue Value = Op.getOperand(0);
14444   EVT TheVT = Op.getOperand(0).getValueType();
14445   // FIXME This causes a redundant load/store if the SSE-class value is already
14446   // in memory, such as if it is on the callstack.
14447   if (isScalarFPTypeInSSEReg(TheVT)) {
14448     assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
14449     Chain = DAG.getStore(Chain, DL, Value, StackSlot,
14450                          MachinePointerInfo::getFixedStack(SSFI),
14451                          false, false, 0);
14452     SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
14453     SDValue Ops[] = {
14454       Chain, StackSlot, DAG.getValueType(TheVT)
14455     };
14456
14457     MachineMemOperand *MMO =
14458       MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
14459                               MachineMemOperand::MOLoad, MemSize, MemSize);
14460     Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO);
14461     Chain = Value.getValue(1);
14462     SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
14463     StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
14464   }
14465
14466   MachineMemOperand *MMO =
14467     MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
14468                             MachineMemOperand::MOStore, MemSize, MemSize);
14469
14470   if (Opc != X86ISD::WIN_FTOL) {
14471     // Build the FP_TO_INT*_IN_MEM
14472     SDValue Ops[] = { Chain, Value, StackSlot };
14473     SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
14474                                            Ops, DstTy, MMO);
14475     return std::make_pair(FIST, StackSlot);
14476   } else {
14477     SDValue ftol = DAG.getNode(X86ISD::WIN_FTOL, DL,
14478       DAG.getVTList(MVT::Other, MVT::Glue),
14479       Chain, Value);
14480     SDValue eax = DAG.getCopyFromReg(ftol, DL, X86::EAX,
14481       MVT::i32, ftol.getValue(1));
14482     SDValue edx = DAG.getCopyFromReg(eax.getValue(1), DL, X86::EDX,
14483       MVT::i32, eax.getValue(2));
14484     SDValue Ops[] = { eax, edx };
14485     SDValue pair = IsReplace
14486       ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops)
14487       : DAG.getMergeValues(Ops, DL);
14488     return std::make_pair(pair, SDValue());
14489   }
14490 }
14491
14492 static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
14493                               const X86Subtarget *Subtarget) {
14494   MVT VT = Op->getSimpleValueType(0);
14495   SDValue In = Op->getOperand(0);
14496   MVT InVT = In.getSimpleValueType();
14497   SDLoc dl(Op);
14498
14499   // Optimize vectors in AVX mode:
14500   //
14501   //   v8i16 -> v8i32
14502   //   Use vpunpcklwd for 4 lower elements  v8i16 -> v4i32.
14503   //   Use vpunpckhwd for 4 upper elements  v8i16 -> v4i32.
14504   //   Concat upper and lower parts.
14505   //
14506   //   v4i32 -> v4i64
14507   //   Use vpunpckldq for 4 lower elements  v4i32 -> v2i64.
14508   //   Use vpunpckhdq for 4 upper elements  v4i32 -> v2i64.
14509   //   Concat upper and lower parts.
14510   //
14511
14512   if (((VT != MVT::v16i16) || (InVT != MVT::v16i8)) &&
14513       ((VT != MVT::v8i32) || (InVT != MVT::v8i16)) &&
14514       ((VT != MVT::v4i64) || (InVT != MVT::v4i32)))
14515     return SDValue();
14516
14517   if (Subtarget->hasInt256())
14518     return DAG.getNode(X86ISD::VZEXT, dl, VT, In);
14519
14520   SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl);
14521   SDValue Undef = DAG.getUNDEF(InVT);
14522   bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND;
14523   SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
14524   SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
14525
14526   MVT HVT = MVT::getVectorVT(VT.getVectorElementType(),
14527                              VT.getVectorNumElements()/2);
14528
14529   OpLo = DAG.getNode(ISD::BITCAST, dl, HVT, OpLo);
14530   OpHi = DAG.getNode(ISD::BITCAST, dl, HVT, OpHi);
14531
14532   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
14533 }
14534
14535 static  SDValue LowerZERO_EXTEND_AVX512(SDValue Op,
14536                                         SelectionDAG &DAG) {
14537   MVT VT = Op->getSimpleValueType(0);
14538   SDValue In = Op->getOperand(0);
14539   MVT InVT = In.getSimpleValueType();
14540   SDLoc DL(Op);
14541   unsigned int NumElts = VT.getVectorNumElements();
14542   if (NumElts != 8 && NumElts != 16)
14543     return SDValue();
14544
14545   if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1)
14546     return DAG.getNode(X86ISD::VZEXT, DL, VT, In);
14547
14548   EVT ExtVT = (NumElts == 8)? MVT::v8i64 : MVT::v16i32;
14549   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14550   // Now we have only mask extension
14551   assert(InVT.getVectorElementType() == MVT::i1);
14552   SDValue Cst = DAG.getTargetConstant(1, ExtVT.getScalarType());
14553   const Constant *C = (dyn_cast<ConstantSDNode>(Cst))->getConstantIntValue();
14554   SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
14555   unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
14556   SDValue Ld = DAG.getLoad(Cst.getValueType(), DL, DAG.getEntryNode(), CP,
14557                            MachinePointerInfo::getConstantPool(),
14558                            false, false, false, Alignment);
14559
14560   SDValue Brcst = DAG.getNode(X86ISD::VBROADCASTM, DL, ExtVT, In, Ld);
14561   if (VT.is512BitVector())
14562     return Brcst;
14563   return DAG.getNode(X86ISD::VTRUNC, DL, VT, Brcst);
14564 }
14565
14566 static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
14567                                SelectionDAG &DAG) {
14568   if (Subtarget->hasFp256()) {
14569     SDValue Res = LowerAVXExtend(Op, DAG, Subtarget);
14570     if (Res.getNode())
14571       return Res;
14572   }
14573
14574   return SDValue();
14575 }
14576
14577 static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
14578                                 SelectionDAG &DAG) {
14579   SDLoc DL(Op);
14580   MVT VT = Op.getSimpleValueType();
14581   SDValue In = Op.getOperand(0);
14582   MVT SVT = In.getSimpleValueType();
14583
14584   if (VT.is512BitVector() || SVT.getVectorElementType() == MVT::i1)
14585     return LowerZERO_EXTEND_AVX512(Op, DAG);
14586
14587   if (Subtarget->hasFp256()) {
14588     SDValue Res = LowerAVXExtend(Op, DAG, Subtarget);
14589     if (Res.getNode())
14590       return Res;
14591   }
14592
14593   assert(!VT.is256BitVector() || !SVT.is128BitVector() ||
14594          VT.getVectorNumElements() != SVT.getVectorNumElements());
14595   return SDValue();
14596 }
14597
14598 SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
14599   SDLoc DL(Op);
14600   MVT VT = Op.getSimpleValueType();
14601   SDValue In = Op.getOperand(0);
14602   MVT InVT = In.getSimpleValueType();
14603
14604   if (VT == MVT::i1) {
14605     assert((InVT.isInteger() && (InVT.getSizeInBits() <= 64)) &&
14606            "Invalid scalar TRUNCATE operation");
14607     if (InVT.getSizeInBits() >= 32)
14608       return SDValue();
14609     In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In);
14610     return DAG.getNode(ISD::TRUNCATE, DL, VT, In);
14611   }
14612   assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
14613          "Invalid TRUNCATE operation");
14614
14615   if (InVT.is512BitVector() || VT.getVectorElementType() == MVT::i1) {
14616     if (VT.getVectorElementType().getSizeInBits() >=8)
14617       return DAG.getNode(X86ISD::VTRUNC, DL, VT, In);
14618
14619     assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type");
14620     unsigned NumElts = InVT.getVectorNumElements();
14621     assert ((NumElts == 8 || NumElts == 16) && "Unexpected vector type");
14622     if (InVT.getSizeInBits() < 512) {
14623       MVT ExtVT = (NumElts == 16)? MVT::v16i32 : MVT::v8i64;
14624       In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
14625       InVT = ExtVT;
14626     }
14627
14628     SDValue Cst = DAG.getTargetConstant(1, InVT.getVectorElementType());
14629     const Constant *C = (dyn_cast<ConstantSDNode>(Cst))->getConstantIntValue();
14630     SDValue CP = DAG.getConstantPool(C, getPointerTy());
14631     unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
14632     SDValue Ld = DAG.getLoad(Cst.getValueType(), DL, DAG.getEntryNode(), CP,
14633                            MachinePointerInfo::getConstantPool(),
14634                            false, false, false, Alignment);
14635     SDValue OneV = DAG.getNode(X86ISD::VBROADCAST, DL, InVT, Ld);
14636     SDValue And = DAG.getNode(ISD::AND, DL, InVT, OneV, In);
14637     return DAG.getNode(X86ISD::TESTM, DL, VT, And, And);
14638   }
14639
14640   if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
14641     // On AVX2, v4i64 -> v4i32 becomes VPERMD.
14642     if (Subtarget->hasInt256()) {
14643       static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
14644       In = DAG.getNode(ISD::BITCAST, DL, MVT::v8i32, In);
14645       In = DAG.getVectorShuffle(MVT::v8i32, DL, In, DAG.getUNDEF(MVT::v8i32),
14646                                 ShufMask);
14647       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
14648                          DAG.getIntPtrConstant(0));
14649     }
14650
14651     SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
14652                                DAG.getIntPtrConstant(0));
14653     SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
14654                                DAG.getIntPtrConstant(2));
14655     OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpLo);
14656     OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpHi);
14657     static const int ShufMask[] = {0, 2, 4, 6};
14658     return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
14659   }
14660
14661   if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
14662     // On AVX2, v8i32 -> v8i16 becomed PSHUFB.
14663     if (Subtarget->hasInt256()) {
14664       In = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, In);
14665
14666       SmallVector<SDValue,32> pshufbMask;
14667       for (unsigned i = 0; i < 2; ++i) {
14668         pshufbMask.push_back(DAG.getConstant(0x0, MVT::i8));
14669         pshufbMask.push_back(DAG.getConstant(0x1, MVT::i8));
14670         pshufbMask.push_back(DAG.getConstant(0x4, MVT::i8));
14671         pshufbMask.push_back(DAG.getConstant(0x5, MVT::i8));
14672         pshufbMask.push_back(DAG.getConstant(0x8, MVT::i8));
14673         pshufbMask.push_back(DAG.getConstant(0x9, MVT::i8));
14674         pshufbMask.push_back(DAG.getConstant(0xc, MVT::i8));
14675         pshufbMask.push_back(DAG.getConstant(0xd, MVT::i8));
14676         for (unsigned j = 0; j < 8; ++j)
14677           pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
14678       }
14679       SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, pshufbMask);
14680       In = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, In, BV);
14681       In = DAG.getNode(ISD::BITCAST, DL, MVT::v4i64, In);
14682
14683       static const int ShufMask[] = {0,  2,  -1,  -1};
14684       In = DAG.getVectorShuffle(MVT::v4i64, DL,  In, DAG.getUNDEF(MVT::v4i64),
14685                                 &ShufMask[0]);
14686       In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
14687                        DAG.getIntPtrConstant(0));
14688       return DAG.getNode(ISD::BITCAST, DL, VT, In);
14689     }
14690
14691     SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
14692                                DAG.getIntPtrConstant(0));
14693
14694     SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
14695                                DAG.getIntPtrConstant(4));
14696
14697     OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, OpLo);
14698     OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, OpHi);
14699
14700     // The PSHUFB mask:
14701     static const int ShufMask1[] = {0,  1,  4,  5,  8,  9, 12, 13,
14702                                    -1, -1, -1, -1, -1, -1, -1, -1};
14703
14704     SDValue Undef = DAG.getUNDEF(MVT::v16i8);
14705     OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, Undef, ShufMask1);
14706     OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, Undef, ShufMask1);
14707
14708     OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpLo);
14709     OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpHi);
14710
14711     // The MOVLHPS Mask:
14712     static const int ShufMask2[] = {0, 1, 4, 5};
14713     SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
14714     return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, res);
14715   }
14716
14717   // Handle truncation of V256 to V128 using shuffles.
14718   if (!VT.is128BitVector() || !InVT.is256BitVector())
14719     return SDValue();
14720
14721   assert(Subtarget->hasFp256() && "256-bit vector without AVX!");
14722
14723   unsigned NumElems = VT.getVectorNumElements();
14724   MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2);
14725
14726   SmallVector<int, 16> MaskVec(NumElems * 2, -1);
14727   // Prepare truncation shuffle mask
14728   for (unsigned i = 0; i != NumElems; ++i)
14729     MaskVec[i] = i * 2;
14730   SDValue V = DAG.getVectorShuffle(NVT, DL,
14731                                    DAG.getNode(ISD::BITCAST, DL, NVT, In),
14732                                    DAG.getUNDEF(NVT), &MaskVec[0]);
14733   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
14734                      DAG.getIntPtrConstant(0));
14735 }
14736
14737 SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op,
14738                                            SelectionDAG &DAG) const {
14739   assert(!Op.getSimpleValueType().isVector());
14740
14741   std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
14742     /*IsSigned=*/ true, /*IsReplace=*/ false);
14743   SDValue FIST = Vals.first, StackSlot = Vals.second;
14744   // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
14745   if (!FIST.getNode()) return Op;
14746
14747   if (StackSlot.getNode())
14748     // Load the result.
14749     return DAG.getLoad(Op.getValueType(), SDLoc(Op),
14750                        FIST, StackSlot, MachinePointerInfo(),
14751                        false, false, false, 0);
14752
14753   // The node is the result.
14754   return FIST;
14755 }
14756
14757 SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op,
14758                                            SelectionDAG &DAG) const {
14759   std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
14760     /*IsSigned=*/ false, /*IsReplace=*/ false);
14761   SDValue FIST = Vals.first, StackSlot = Vals.second;
14762   assert(FIST.getNode() && "Unexpected failure");
14763
14764   if (StackSlot.getNode())
14765     // Load the result.
14766     return DAG.getLoad(Op.getValueType(), SDLoc(Op),
14767                        FIST, StackSlot, MachinePointerInfo(),
14768                        false, false, false, 0);
14769
14770   // The node is the result.
14771   return FIST;
14772 }
14773
14774 static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
14775   SDLoc DL(Op);
14776   MVT VT = Op.getSimpleValueType();
14777   SDValue In = Op.getOperand(0);
14778   MVT SVT = In.getSimpleValueType();
14779
14780   assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
14781
14782   return DAG.getNode(X86ISD::VFPEXT, DL, VT,
14783                      DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
14784                                  In, DAG.getUNDEF(SVT)));
14785 }
14786
14787 /// The only differences between FABS and FNEG are the mask and the logic op.
14788 /// FNEG also has a folding opportunity for FNEG(FABS(x)).
14789 static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
14790   assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
14791          "Wrong opcode for lowering FABS or FNEG.");
14792
14793   bool IsFABS = (Op.getOpcode() == ISD::FABS);
14794
14795   // If this is a FABS and it has an FNEG user, bail out to fold the combination
14796   // into an FNABS. We'll lower the FABS after that if it is still in use.
14797   if (IsFABS)
14798     for (SDNode *User : Op->uses())
14799       if (User->getOpcode() == ISD::FNEG)
14800         return Op;
14801
14802   SDValue Op0 = Op.getOperand(0);
14803   bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
14804
14805   SDLoc dl(Op);
14806   MVT VT = Op.getSimpleValueType();
14807   // Assume scalar op for initialization; update for vector if needed.
14808   // Note that there are no scalar bitwise logical SSE/AVX instructions, so we
14809   // generate a 16-byte vector constant and logic op even for the scalar case.
14810   // Using a 16-byte mask allows folding the load of the mask with
14811   // the logic op, so it can save (~4 bytes) on code size.
14812   MVT EltVT = VT;
14813   unsigned NumElts = VT == MVT::f64 ? 2 : 4;
14814   // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
14815   // decide if we should generate a 16-byte constant mask when we only need 4 or
14816   // 8 bytes for the scalar case.
14817   if (VT.isVector()) {
14818     EltVT = VT.getVectorElementType();
14819     NumElts = VT.getVectorNumElements();
14820   }
14821
14822   unsigned EltBits = EltVT.getSizeInBits();
14823   LLVMContext *Context = DAG.getContext();
14824   // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
14825   APInt MaskElt =
14826     IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignBit(EltBits);
14827   Constant *C = ConstantInt::get(*Context, MaskElt);
14828   C = ConstantVector::getSplat(NumElts, C);
14829   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14830   SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy());
14831   unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
14832   SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
14833                              MachinePointerInfo::getConstantPool(),
14834                              false, false, false, Alignment);
14835
14836   if (VT.isVector()) {
14837     // For a vector, cast operands to a vector type, perform the logic op,
14838     // and cast the result back to the original value type.
14839     MVT VecVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
14840     SDValue MaskCasted = DAG.getNode(ISD::BITCAST, dl, VecVT, Mask);
14841     SDValue Operand = IsFNABS ?
14842       DAG.getNode(ISD::BITCAST, dl, VecVT, Op0.getOperand(0)) :
14843       DAG.getNode(ISD::BITCAST, dl, VecVT, Op0);
14844     unsigned BitOp = IsFABS ? ISD::AND : IsFNABS ? ISD::OR : ISD::XOR;
14845     return DAG.getNode(ISD::BITCAST, dl, VT,
14846                        DAG.getNode(BitOp, dl, VecVT, Operand, MaskCasted));
14847   }
14848
14849   // If not vector, then scalar.
14850   unsigned BitOp = IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
14851   SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
14852   return DAG.getNode(BitOp, dl, VT, Operand, Mask);
14853 }
14854
14855 static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
14856   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14857   LLVMContext *Context = DAG.getContext();
14858   SDValue Op0 = Op.getOperand(0);
14859   SDValue Op1 = Op.getOperand(1);
14860   SDLoc dl(Op);
14861   MVT VT = Op.getSimpleValueType();
14862   MVT SrcVT = Op1.getSimpleValueType();
14863
14864   // If second operand is smaller, extend it first.
14865   if (SrcVT.bitsLT(VT)) {
14866     Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1);
14867     SrcVT = VT;
14868   }
14869   // And if it is bigger, shrink it first.
14870   if (SrcVT.bitsGT(VT)) {
14871     Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1));
14872     SrcVT = VT;
14873   }
14874
14875   // At this point the operands and the result should have the same
14876   // type, and that won't be f80 since that is not custom lowered.
14877
14878   const fltSemantics &Sem =
14879       VT == MVT::f64 ? APFloat::IEEEdouble : APFloat::IEEEsingle;
14880   const unsigned SizeInBits = VT.getSizeInBits();
14881
14882   SmallVector<Constant *, 4> CV(
14883       VT == MVT::f64 ? 2 : 4,
14884       ConstantFP::get(*Context, APFloat(Sem, APInt(SizeInBits, 0))));
14885
14886   // First, clear all bits but the sign bit from the second operand (sign).
14887   CV[0] = ConstantFP::get(*Context,
14888                           APFloat(Sem, APInt::getHighBitsSet(SizeInBits, 1)));
14889   Constant *C = ConstantVector::get(CV);
14890   SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(), 16);
14891   SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx,
14892                               MachinePointerInfo::getConstantPool(),
14893                               false, false, false, 16);
14894   SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1);
14895
14896   // Next, clear the sign bit from the first operand (magnitude).
14897   // If it's a constant, we can clear it here.
14898   if (ConstantFPSDNode *Op0CN = dyn_cast<ConstantFPSDNode>(Op0)) {
14899     APFloat APF = Op0CN->getValueAPF();
14900     // If the magnitude is a positive zero, the sign bit alone is enough.
14901     if (APF.isPosZero())
14902       return SignBit;
14903     APF.clearSign();
14904     CV[0] = ConstantFP::get(*Context, APF);
14905   } else {
14906     CV[0] = ConstantFP::get(
14907         *Context,
14908         APFloat(Sem, APInt::getLowBitsSet(SizeInBits, SizeInBits - 1)));
14909   }
14910   C = ConstantVector::get(CV);
14911   CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(), 16);
14912   SDValue Val = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
14913                             MachinePointerInfo::getConstantPool(),
14914                             false, false, false, 16);
14915   // If the magnitude operand wasn't a constant, we need to AND out the sign.
14916   if (!isa<ConstantFPSDNode>(Op0))
14917     Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Val);
14918
14919   // OR the magnitude value with the sign bit.
14920   return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit);
14921 }
14922
14923 static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
14924   SDValue N0 = Op.getOperand(0);
14925   SDLoc dl(Op);
14926   MVT VT = Op.getSimpleValueType();
14927
14928   // Lower ISD::FGETSIGN to (AND (X86ISD::FGETSIGNx86 ...) 1).
14929   SDValue xFGETSIGN = DAG.getNode(X86ISD::FGETSIGNx86, dl, VT, N0,
14930                                   DAG.getConstant(1, VT));
14931   return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, VT));
14932 }
14933
14934 // Check whether an OR'd tree is PTEST-able.
14935 static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget *Subtarget,
14936                                       SelectionDAG &DAG) {
14937   assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
14938
14939   if (!Subtarget->hasSSE41())
14940     return SDValue();
14941
14942   if (!Op->hasOneUse())
14943     return SDValue();
14944
14945   SDNode *N = Op.getNode();
14946   SDLoc DL(N);
14947
14948   SmallVector<SDValue, 8> Opnds;
14949   DenseMap<SDValue, unsigned> VecInMap;
14950   SmallVector<SDValue, 8> VecIns;
14951   EVT VT = MVT::Other;
14952
14953   // Recognize a special case where a vector is casted into wide integer to
14954   // test all 0s.
14955   Opnds.push_back(N->getOperand(0));
14956   Opnds.push_back(N->getOperand(1));
14957
14958   for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
14959     SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
14960     // BFS traverse all OR'd operands.
14961     if (I->getOpcode() == ISD::OR) {
14962       Opnds.push_back(I->getOperand(0));
14963       Opnds.push_back(I->getOperand(1));
14964       // Re-evaluate the number of nodes to be traversed.
14965       e += 2; // 2 more nodes (LHS and RHS) are pushed.
14966       continue;
14967     }
14968
14969     // Quit if a non-EXTRACT_VECTOR_ELT
14970     if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
14971       return SDValue();
14972
14973     // Quit if without a constant index.
14974     SDValue Idx = I->getOperand(1);
14975     if (!isa<ConstantSDNode>(Idx))
14976       return SDValue();
14977
14978     SDValue ExtractedFromVec = I->getOperand(0);
14979     DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);
14980     if (M == VecInMap.end()) {
14981       VT = ExtractedFromVec.getValueType();
14982       // Quit if not 128/256-bit vector.
14983       if (!VT.is128BitVector() && !VT.is256BitVector())
14984         return SDValue();
14985       // Quit if not the same type.
14986       if (VecInMap.begin() != VecInMap.end() &&
14987           VT != VecInMap.begin()->first.getValueType())
14988         return SDValue();
14989       M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
14990       VecIns.push_back(ExtractedFromVec);
14991     }
14992     M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
14993   }
14994
14995   assert((VT.is128BitVector() || VT.is256BitVector()) &&
14996          "Not extracted from 128-/256-bit vector.");
14997
14998   unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;
14999
15000   for (DenseMap<SDValue, unsigned>::const_iterator
15001         I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
15002     // Quit if not all elements are used.
15003     if (I->second != FullMask)
15004       return SDValue();
15005   }
15006
15007   EVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
15008
15009   // Cast all vectors into TestVT for PTEST.
15010   for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
15011     VecIns[i] = DAG.getNode(ISD::BITCAST, DL, TestVT, VecIns[i]);
15012
15013   // If more than one full vectors are evaluated, OR them first before PTEST.
15014   for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
15015     // Each iteration will OR 2 nodes and append the result until there is only
15016     // 1 node left, i.e. the final OR'd value of all vectors.
15017     SDValue LHS = VecIns[Slot];
15018     SDValue RHS = VecIns[Slot + 1];
15019     VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
15020   }
15021
15022   return DAG.getNode(X86ISD::PTEST, DL, MVT::i32,
15023                      VecIns.back(), VecIns.back());
15024 }
15025
15026 /// \brief return true if \c Op has a use that doesn't just read flags.
15027 static bool hasNonFlagsUse(SDValue Op) {
15028   for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
15029        ++UI) {
15030     SDNode *User = *UI;
15031     unsigned UOpNo = UI.getOperandNo();
15032     if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
15033       // Look pass truncate.
15034       UOpNo = User->use_begin().getOperandNo();
15035       User = *User->use_begin();
15036     }
15037
15038     if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
15039         !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
15040       return true;
15041   }
15042   return false;
15043 }
15044
/// Emit nodes that will be selected as "test Op0,Op0", or something
/// equivalent.
///
/// \param Op    the value to be compared against zero.
/// \param X86CC the X86 condition code that will consume the flags; it
///              determines whether the carry and/or overflow flag must be
///              accurate.
/// \param dl    debug location used for every node created here.
/// \param DAG   the DAG being built.
/// \returns a flags-producing value: an X86ISD::CMP of \p Op against 0,
/// result #1 of an existing or newly created flag-setting X86ISD arithmetic
/// node, or the node produced by LowerVectorAllZeroTest.
SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl,
                                    SelectionDAG &DAG) const {
  if (Op.getValueType() == MVT::i1)
    // KORTEST instruction should be selected
    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
                       DAG.getConstant(0, Op.getValueType()));

  // CF and OF aren't always set the way we want. Determine which
  // of these we need.
  bool NeedCF = false;
  bool NeedOF = false;
  switch (X86CC) {
  default: break;
  case X86::COND_A: case X86::COND_AE:
  case X86::COND_B: case X86::COND_BE:
    NeedCF = true;
    break;
  case X86::COND_G: case X86::COND_GE:
  case X86::COND_L: case X86::COND_LE:
  case X86::COND_O: case X86::COND_NO: {
    // Check if we really need to set the
    // Overflow flag. If NoSignedWrap is present
    // that is not actually needed.
    switch (Op->getOpcode()) {
    case ISD::ADD:
    case ISD::SUB:
    case ISD::MUL:
    case ISD::SHL: {
      const BinaryWithFlagsSDNode *BinNode =
          cast<BinaryWithFlagsSDNode>(Op.getNode());
      if (BinNode->hasNoSignedWrap())
        break;
      // Otherwise deliberately fall through to the default case below.
    }
    default:
      NeedOF = true;
      break;
    }
    break;
  }
  }
  // See if we can use the EFLAGS value from the operand instead of
  // doing a separate TEST. TEST always sets OF and CF to 0, so unless
  // we prove that the arithmetic won't overflow, we can't use OF or CF.
  if (Op.getResNo() != 0 || NeedOF || NeedCF) {
    // Emit a CMP with 0, which is the TEST pattern.
    // (Disabled code kept from the original; i1 operands already take the
    // early return at the top of this function.)
    //if (Op.getValueType() == MVT::i1)
    //  return DAG.getNode(X86ISD::CMP, dl, MVT::i1, Op,
    //                     DAG.getConstant(0, MVT::i1));
    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
                       DAG.getConstant(0, Op.getValueType()));
  }
  // Opcode stays 0 when no flag-setting replacement node is chosen, in which
  // case a plain CMP against 0 is emitted at the end. NumOperands is the
  // operand count of the replacement node.
  unsigned Opcode = 0;
  unsigned NumOperands = 0;

  // Truncate operations may prevent the merge of the SETCC instruction
  // and the arithmetic instruction before it. Attempt to truncate the operands
  // of the arithmetic instruction and use a reduced bit-width instruction.
  bool NeedTruncation = false;
  SDValue ArithOp = Op;
  if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
    SDValue Arith = Op->getOperand(0);
    // Both the trunc and the arithmetic op need to have one user each.
    if (Arith->hasOneUse())
      switch (Arith.getOpcode()) {
        default: break;
        case ISD::ADD:
        case ISD::SUB:
        case ISD::AND:
        case ISD::OR:
        case ISD::XOR: {
          NeedTruncation = true;
          ArithOp = Arith;
        }
      }
  }

  // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
  // which may be the result of a CAST.  We use the variable 'Op', which is the
  // non-casted variable when we check for possible users.
  switch (ArithOp.getOpcode()) {
  case ISD::ADD:
    // Due to an isel shortcoming, be conservative if this add is likely to be
    // selected as part of a load-modify-store instruction. When the root node
    // in a match is a store, isel doesn't know how to remap non-chain non-flag
    // uses of other nodes in the match, such as the ADD in this case. This
    // leads to the ADD being left around and reselected, with the result being
    // two adds in the output.  Alas, even if none our users are stores, that
    // doesn't prove we're O.K.  Ergo, if we have any parents that aren't
    // CopyToReg or SETCC, eschew INC/DEC.  A better fix seems to require
    // climbing the DAG back to the root, and it doesn't seem to be worth the
    // effort.
    // NOTE(review): the loop below also tolerates STORE users, which the
    // comment above does not mention -- confirm which one is intended.
    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
         UE = Op.getNode()->use_end(); UI != UE; ++UI)
      if (UI->getOpcode() != ISD::CopyToReg &&
          UI->getOpcode() != ISD::SETCC &&
          UI->getOpcode() != ISD::STORE)
        goto default_case;

    if (ConstantSDNode *C =
        dyn_cast<ConstantSDNode>(ArithOp.getNode()->getOperand(1))) {
      // An add of one will be selected as an INC.
      if (C->getAPIntValue() == 1 && !Subtarget->slowIncDec()) {
        Opcode = X86ISD::INC;
        NumOperands = 1;
        break;
      }

      // An add of negative one (subtract of one) will be selected as a DEC.
      if (C->getAPIntValue().isAllOnesValue() && !Subtarget->slowIncDec()) {
        Opcode = X86ISD::DEC;
        NumOperands = 1;
        break;
      }
    }

    // Otherwise use a regular EFLAGS-setting add.
    Opcode = X86ISD::ADD;
    NumOperands = 2;
    break;
  case ISD::SHL:
  case ISD::SRL:
    // If we have a constant logical shift that's only used in a comparison
    // against zero turn it into an equivalent AND. This allows turning it into
    // a TEST instruction later.
    if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) && Op->hasOneUse() &&
        isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) {
      EVT VT = Op.getValueType();
      unsigned BitWidth = VT.getSizeInBits();
      unsigned ShAmt = Op->getConstantOperandVal(1);
      if (ShAmt >= BitWidth) // Avoid undefined shifts.
        break;
      // The shift result is zero exactly when the bits that survive the
      // shift are zero: the high (BitWidth - ShAmt) bits for SRL, the low
      // (BitWidth - ShAmt) bits for SHL.
      APInt Mask = ArithOp.getOpcode() == ISD::SRL
                       ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt)
                       : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);
      if (!Mask.isSignedIntN(32)) // Avoid large immediates.
        break;
      SDValue New = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0),
                                DAG.getConstant(Mask, VT));
      DAG.ReplaceAllUsesWith(Op, New);
      Op = New;
    }
    break;

  case ISD::AND:
    // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
    // because a TEST instruction will be better.
    if (!hasNonFlagsUse(Op))
      break;
    // FALL THROUGH
  case ISD::SUB:
  case ISD::OR:
  case ISD::XOR:
    // Due to the ISEL shortcoming noted above, be conservative if this op is
    // likely to be selected as part of a load-modify-store instruction.
    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
           UE = Op.getNode()->use_end(); UI != UE; ++UI)
      if (UI->getOpcode() == ISD::STORE)
        goto default_case;

    // Otherwise use a regular EFLAGS-setting instruction.
    switch (ArithOp.getOpcode()) {
    default: llvm_unreachable("unexpected operator!");
    case ISD::SUB: Opcode = X86ISD::SUB; break;
    case ISD::XOR: Opcode = X86ISD::XOR; break;
    case ISD::AND: Opcode = X86ISD::AND; break;
    case ISD::OR: {
      // For an equality test of an untruncated OR tree, first try to turn
      // the whole tree into a single vector all-zero test.
      if (!NeedTruncation && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
        SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG);
        if (EFLAGS.getNode())
          return EFLAGS;
      }
      Opcode = X86ISD::OR;
      break;
    }
    }

    NumOperands = 2;
    break;
  case X86ISD::ADD:
  case X86ISD::SUB:
  case X86ISD::INC:
  case X86ISD::DEC:
  case X86ISD::OR:
  case X86ISD::XOR:
  case X86ISD::AND:
    // Already a flag-setting node: reuse its second result.
    return SDValue(Op.getNode(), 1);
  default:
  default_case:
    break;
  }

  // If we found that truncation is beneficial, perform the truncation and
  // update 'Op'.
  if (NeedTruncation) {
    EVT VT = Op.getValueType();
    SDValue WideVal = Op->getOperand(0);
    EVT WideVT = WideVal.getValueType();
    unsigned ConvertedOp = 0;
    // Use a target machine opcode to prevent further DAGCombine
    // optimizations that may separate the arithmetic operations
    // from the setcc node.
    switch (WideVal.getOpcode()) {
      default: break;
      case ISD::ADD: ConvertedOp = X86ISD::ADD; break;
      case ISD::SUB: ConvertedOp = X86ISD::SUB; break;
      case ISD::AND: ConvertedOp = X86ISD::AND; break;
      case ISD::OR:  ConvertedOp = X86ISD::OR;  break;
      case ISD::XOR: ConvertedOp = X86ISD::XOR; break;
    }

    if (ConvertedOp) {
      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
      if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
        // Truncate both operands and redo the operation at the narrow type.
        SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
        SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
        Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1);
      }
    }
  }

  if (Opcode == 0)
    // Emit a CMP with 0, which is the TEST pattern.
    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
                       DAG.getConstant(0, Op.getValueType()));

  // Build the flag-setting node with two results (value, i32 flags), redirect
  // all users of the old node to it, and hand back the flags result.
  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
  SmallVector<SDValue, 4> Ops;
  for (unsigned i = 0; i != NumOperands; ++i)
    Ops.push_back(Op.getOperand(i));

  SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
  DAG.ReplaceAllUsesWith(Op, New);
  return SDValue(New.getNode(), 1);
}
15281
15282 /// Emit nodes that will be selected as "cmp Op0,Op1", or something
15283 /// equivalent.
15284 SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
15285                                    SDLoc dl, SelectionDAG &DAG) const {
15286   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1)) {
15287     if (C->getAPIntValue() == 0)
15288       return EmitTest(Op0, X86CC, dl, DAG);
15289
15290      if (Op0.getValueType() == MVT::i1)
15291        llvm_unreachable("Unexpected comparison operation for MVT::i1 operands");
15292   }
15293
15294   if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
15295        Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
15296     // Do the comparison at i32 if it's smaller, besides the Atom case.
15297     // This avoids subregister aliasing issues. Keep the smaller reference
15298     // if we're optimizing for size, however, as that'll allow better folding
15299     // of memory operations.
15300     if (Op0.getValueType() != MVT::i32 && Op0.getValueType() != MVT::i64 &&
15301         !DAG.getMachineFunction().getFunction()->getAttributes().hasAttribute(
15302              AttributeSet::FunctionIndex, Attribute::MinSize) &&
15303         !Subtarget->isAtom()) {
15304       unsigned ExtendOp =
15305           isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
15306       Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0);
15307       Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1);
15308     }
15309     // Use SUB instead of CMP to enable CSE between SUB and CMP.
15310     SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
15311     SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs,
15312                               Op0, Op1);
15313     return SDValue(Sub.getNode(), 1);
15314   }
15315   return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
15316 }
15317
15318 /// Convert a comparison if required by the subtarget.
15319 SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
15320                                                  SelectionDAG &DAG) const {
15321   // If the subtarget does not support the FUCOMI instruction, floating-point
15322   // comparisons have to be converted.
15323   if (Subtarget->hasCMov() ||
15324       Cmp.getOpcode() != X86ISD::CMP ||
15325       !Cmp.getOperand(0).getValueType().isFloatingPoint() ||
15326       !Cmp.getOperand(1).getValueType().isFloatingPoint())
15327     return Cmp;
15328
15329   // The instruction selector will select an FUCOM instruction instead of
15330   // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
15331   // build an SDNode sequence that transfers the result from FPSW into EFLAGS:
15332   // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
15333   SDLoc dl(Cmp);
15334   SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
15335   SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
15336   SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
15337                             DAG.getConstant(8, MVT::i8));
15338   SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
15339   return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
15340 }
15341
15342 /// The minimum architected relative accuracy is 2^-12. We need one
15343 /// Newton-Raphson step to have a good float result (24 bits of precision).
15344 SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op,
15345                                             DAGCombinerInfo &DCI,
15346                                             unsigned &RefinementSteps,
15347                                             bool &UseOneConstNR) const {
15348   // FIXME: We should use instruction latency models to calculate the cost of
15349   // each potential sequence, but this is very hard to do reliably because
15350   // at least Intel's Core* chips have variable timing based on the number of
15351   // significant digits in the divisor and/or sqrt operand.
15352   if (!Subtarget->useSqrtEst())
15353     return SDValue();
15354
15355   EVT VT = Op.getValueType();
15356
15357   // SSE1 has rsqrtss and rsqrtps.
15358   // TODO: Add support for AVX512 (v16f32).
15359   // It is likely not profitable to do this for f64 because a double-precision
15360   // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
15361   // instructions: convert to single, rsqrtss, convert back to double, refine
15362   // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
15363   // along with FMA, this could be a throughput win.
15364   if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
15365       (Subtarget->hasAVX() && VT == MVT::v8f32)) {
15366     RefinementSteps = 1;
15367     UseOneConstNR = false;
15368     return DCI.DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
15369   }
15370   return SDValue();
15371 }
15372
15373 /// The minimum architected relative accuracy is 2^-12. We need one
15374 /// Newton-Raphson step to have a good float result (24 bits of precision).
15375 SDValue X86TargetLowering::getRecipEstimate(SDValue Op,
15376                                             DAGCombinerInfo &DCI,
15377                                             unsigned &RefinementSteps) const {
15378   // FIXME: We should use instruction latency models to calculate the cost of
15379   // each potential sequence, but this is very hard to do reliably because
15380   // at least Intel's Core* chips have variable timing based on the number of
15381   // significant digits in the divisor.
15382   if (!Subtarget->useReciprocalEst())
15383     return SDValue();
15384
15385   EVT VT = Op.getValueType();
15386
15387   // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
15388   // TODO: Add support for AVX512 (v16f32).
15389   // It is likely not profitable to do this for f64 because a double-precision
15390   // reciprocal estimate with refinement on x86 prior to FMA requires
15391   // 15 instructions: convert to single, rcpss, convert back to double, refine
15392   // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
15393   // along with FMA, this could be a throughput win.
15394   if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
15395       (Subtarget->hasAVX() && VT == MVT::v8f32)) {
15396     RefinementSteps = ReciprocalEstimateRefinementSteps;
15397     return DCI.DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
15398   }
15399   return SDValue();
15400 }
15401
15402 static bool isAllOnes(SDValue V) {
15403   ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
15404   return C && C->isAllOnesValue();
15405 }
15406
15407 /// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node
15408 /// if it's possible.
15409 SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
15410                                      SDLoc dl, SelectionDAG &DAG) const {
15411   SDValue Op0 = And.getOperand(0);
15412   SDValue Op1 = And.getOperand(1);
15413   if (Op0.getOpcode() == ISD::TRUNCATE)
15414     Op0 = Op0.getOperand(0);
15415   if (Op1.getOpcode() == ISD::TRUNCATE)
15416     Op1 = Op1.getOperand(0);
15417
15418   SDValue LHS, RHS;
15419   if (Op1.getOpcode() == ISD::SHL)
15420     std::swap(Op0, Op1);
15421   if (Op0.getOpcode() == ISD::SHL) {
15422     if (ConstantSDNode *And00C = dyn_cast<ConstantSDNode>(Op0.getOperand(0)))
15423       if (And00C->getZExtValue() == 1) {
15424         // If we looked past a truncate, check that it's only truncating away
15425         // known zeros.
15426         unsigned BitWidth = Op0.getValueSizeInBits();
15427         unsigned AndBitWidth = And.getValueSizeInBits();
15428         if (BitWidth > AndBitWidth) {
15429           APInt Zeros, Ones;
15430           DAG.computeKnownBits(Op0, Zeros, Ones);
15431           if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth)
15432             return SDValue();
15433         }
15434         LHS = Op1;
15435         RHS = Op0.getOperand(1);
15436       }
15437   } else if (Op1.getOpcode() == ISD::Constant) {
15438     ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
15439     uint64_t AndRHSVal = AndRHS->getZExtValue();
15440     SDValue AndLHS = Op0;
15441
15442     if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
15443       LHS = AndLHS.getOperand(0);
15444       RHS = AndLHS.getOperand(1);
15445     }
15446
15447     // Use BT if the immediate can't be encoded in a TEST instruction.
15448     if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) {
15449       LHS = AndLHS;
15450       RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), LHS.getValueType());
15451     }
15452   }
15453
15454   if (LHS.getNode()) {
15455     // If LHS is i8, promote it to i32 with any_extend.  There is no i8 BT
15456     // instruction.  Since the shift amount is in-range-or-undefined, we know
15457     // that doing a bittest on the i32 value is ok.  We extend to i32 because
15458     // the encoding for the i16 version is larger than the i32 version.
15459     // Also promote i16 to i32 for performance / code size reason.
15460     if (LHS.getValueType() == MVT::i8 ||
15461         LHS.getValueType() == MVT::i16)
15462       LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);
15463
15464     // If the operand types disagree, extend the shift amount to match.  Since
15465     // BT ignores high bits (like shifts) we can use anyextend.
15466     if (LHS.getValueType() != RHS.getValueType())
15467       RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS);
15468
15469     SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS);
15470     X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
15471     return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
15472                        DAG.getConstant(Cond, MVT::i8), BT);
15473   }
15474
15475   return SDValue();
15476 }
15477
15478 /// \brief - Turns an ISD::CondCode into a value suitable for SSE floating point
15479 /// mask CMPs.
15480 static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
15481                               SDValue &Op1) {
15482   unsigned SSECC;
15483   bool Swap = false;
15484
15485   // SSE Condition code mapping:
15486   //  0 - EQ
15487   //  1 - LT
15488   //  2 - LE
15489   //  3 - UNORD
15490   //  4 - NEQ
15491   //  5 - NLT
15492   //  6 - NLE
15493   //  7 - ORD
15494   switch (SetCCOpcode) {
15495   default: llvm_unreachable("Unexpected SETCC condition");
15496   case ISD::SETOEQ:
15497   case ISD::SETEQ:  SSECC = 0; break;
15498   case ISD::SETOGT:
15499   case ISD::SETGT:  Swap = true; // Fallthrough
15500   case ISD::SETLT:
15501   case ISD::SETOLT: SSECC = 1; break;
15502   case ISD::SETOGE:
15503   case ISD::SETGE:  Swap = true; // Fallthrough
15504   case ISD::SETLE:
15505   case ISD::SETOLE: SSECC = 2; break;
15506   case ISD::SETUO:  SSECC = 3; break;
15507   case ISD::SETUNE:
15508   case ISD::SETNE:  SSECC = 4; break;
15509   case ISD::SETULE: Swap = true; // Fallthrough
15510   case ISD::SETUGE: SSECC = 5; break;
15511   case ISD::SETULT: Swap = true; // Fallthrough
15512   case ISD::SETUGT: SSECC = 6; break;
15513   case ISD::SETO:   SSECC = 7; break;
15514   case ISD::SETUEQ:
15515   case ISD::SETONE: SSECC = 8; break;
15516   }
15517   if (Swap)
15518     std::swap(Op0, Op1);
15519
15520   return SSECC;
15521 }
15522
15523 // Lower256IntVSETCC - Break a VSETCC 256-bit integer VSETCC into two new 128
15524 // ones, and then concatenate the result back.
15525 static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
15526   MVT VT = Op.getSimpleValueType();
15527
15528   assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
15529          "Unsupported value type for operation");
15530
15531   unsigned NumElems = VT.getVectorNumElements();
15532   SDLoc dl(Op);
15533   SDValue CC = Op.getOperand(2);
15534
15535   // Extract the LHS vectors
15536   SDValue LHS = Op.getOperand(0);
15537   SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
15538   SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
15539
15540   // Extract the RHS vectors
15541   SDValue RHS = Op.getOperand(1);
15542   SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl);
15543   SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl);
15544
15545   // Issue the operation on the smaller types and concatenate the result back
15546   MVT EltVT = VT.getVectorElementType();
15547   MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
15548   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
15549                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
15550                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
15551 }
15552
15553 static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG,
15554                                      const X86Subtarget *Subtarget) {
15555   SDValue Op0 = Op.getOperand(0);
15556   SDValue Op1 = Op.getOperand(1);
15557   SDValue CC = Op.getOperand(2);
15558   MVT VT = Op.getSimpleValueType();
15559   SDLoc dl(Op);
15560
15561   assert(Op0.getValueType().getVectorElementType().getSizeInBits() >= 8 &&
15562          Op.getValueType().getScalarType() == MVT::i1 &&
15563          "Cannot set masked compare for this operation");
15564
15565   ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
15566   unsigned  Opc = 0;
15567   bool Unsigned = false;
15568   bool Swap = false;
15569   unsigned SSECC;
15570   switch (SetCCOpcode) {
15571   default: llvm_unreachable("Unexpected SETCC condition");
15572   case ISD::SETNE:  SSECC = 4; break;
15573   case ISD::SETEQ:  Opc = X86ISD::PCMPEQM; break;
15574   case ISD::SETUGT: SSECC = 6; Unsigned = true; break;
15575   case ISD::SETLT:  Swap = true; //fall-through
15576   case ISD::SETGT:  Opc = X86ISD::PCMPGTM; break;
15577   case ISD::SETULT: SSECC = 1; Unsigned = true; break;
15578   case ISD::SETUGE: SSECC = 5; Unsigned = true; break; //NLT
15579   case ISD::SETGE:  Swap = true; SSECC = 2; break; // LE + swap
15580   case ISD::SETULE: Unsigned = true; //fall-through
15581   case ISD::SETLE:  SSECC = 2; break;
15582   }
15583
15584   if (Swap)
15585     std::swap(Op0, Op1);
15586   if (Opc)
15587     return DAG.getNode(Opc, dl, VT, Op0, Op1);
15588   Opc = Unsigned ? X86ISD::CMPMU: X86ISD::CMPM;
15589   return DAG.getNode(Opc, dl, VT, Op0, Op1,
15590                      DAG.getConstant(SSECC, MVT::i8));
15591 }
15592
15593 /// \brief Try to turn a VSETULT into a VSETULE by modifying its second
15594 /// operand \p Op1.  If non-trivial (for example because it's not constant)
15595 /// return an empty value.
15596 static SDValue ChangeVSETULTtoVSETULE(SDLoc dl, SDValue Op1, SelectionDAG &DAG)
15597 {
15598   BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1.getNode());
15599   if (!BV)
15600     return SDValue();
15601
15602   MVT VT = Op1.getSimpleValueType();
15603   MVT EVT = VT.getVectorElementType();
15604   unsigned n = VT.getVectorNumElements();
15605   SmallVector<SDValue, 8> ULTOp1;
15606
15607   for (unsigned i = 0; i < n; ++i) {
15608     ConstantSDNode *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
15609     if (!Elt || Elt->isOpaque() || Elt->getValueType(0) != EVT)
15610       return SDValue();
15611
15612     // Avoid underflow.
15613     APInt Val = Elt->getAPIntValue();
15614     if (Val == 0)
15615       return SDValue();
15616
15617     ULTOp1.push_back(DAG.getConstant(Val - 1, EVT));
15618   }
15619
15620   return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, ULTOp1);
15621 }
15622
/// Lower a vector SETCC node to X86-specific compare nodes.
///
/// \param Op        the node to lower; operands 0 and 1 are the values being
///                  compared and operand 2 is the CondCodeSDNode.
/// \param Subtarget queried for the available feature levels.
/// \param DAG       the DAG in which the replacement nodes are created.
/// \returns the lowered value. Floating-point compares become CMPP/CMPM nodes
/// (two of them for SETUEQ/SETONE). Integer compares are built from
/// PCMPEQ/PCMPGT, optionally combined with operand swapping, sign-bit
/// flipping, result inversion, UMIN/UMAX or SUBUS; 256-bit integer compares
/// without Int256 support are split in two, and AVX-512 cases are forwarded
/// to LowerIntVSETCC_AVX512.
static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
                           SelectionDAG &DAG) {
  SDValue Op0 = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  SDValue CC = Op.getOperand(2);
  MVT VT = Op.getSimpleValueType();
  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
  bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
  SDLoc dl(Op);

  if (isFP) {
#ifndef NDEBUG
    MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
    assert(EltVT == MVT::f32 || EltVT == MVT::f64);
#endif

    // Note: translateX86FSETCC may exchange Op0 and Op1.
    unsigned SSECC = translateX86FSETCC(SetCCOpcode, Op0, Op1);
    unsigned Opc = X86ISD::CMPP;
    // With AVX-512 and an i1-element result, produce a mask compare instead.
    if (Subtarget->hasAVX512() && VT.getVectorElementType() == MVT::i1) {
      assert(VT.getVectorNumElements() <= 16);
      Opc = X86ISD::CMPM;
    }
    // In the two special cases we can't handle, emit two comparisons.
    if (SSECC == 8) {
      unsigned CC0, CC1;
      unsigned CombineOpc;
      if (SetCCOpcode == ISD::SETUEQ) {
        // UNORD (3) or EQ (0).
        CC0 = 3; CC1 = 0; CombineOpc = ISD::OR;
      } else {
        assert(SetCCOpcode == ISD::SETONE);
        // ORD (7) and NEQ (4).
        CC0 = 7; CC1 = 4; CombineOpc = ISD::AND;
      }

      SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
                                 DAG.getConstant(CC0, MVT::i8));
      SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
                                 DAG.getConstant(CC1, MVT::i8));
      return DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
    }
    // Handle all other FP comparisons here.
    return DAG.getNode(Opc, dl, VT, Op0, Op1,
                       DAG.getConstant(SSECC, MVT::i8));
  }

  // Break 256-bit integer vector compare into smaller ones.
  if (VT.is256BitVector() && !Subtarget->hasInt256())
    return Lower256IntVSETCC(Op, DAG);

  // MaskResult is true when the result is a vector of i1 elements.
  bool MaskResult = (VT.getVectorElementType() == MVT::i1);
  EVT OpVT = Op1.getValueType();
  if (Subtarget->hasAVX512()) {
    if (Op1.getValueType().is512BitVector() ||
        (Subtarget->hasBWI() && Subtarget->hasVLX()) ||
        (MaskResult && OpVT.getVectorElementType().getSizeInBits() >= 32))
      return LowerIntVSETCC_AVX512(Op, DAG, Subtarget);

    // In AVX-512 architecture setcc returns mask with i1 elements,
    // But there is no compare instruction for i8 and i16 elements in KNL.
    // We are not talking about 512-bit operands in this case, these
    // types are illegal.
    // Compare at the operand type and truncate the result to the mask type.
    if (MaskResult &&
        (OpVT.getVectorElementType().getSizeInBits() < 32 &&
         OpVT.getVectorElementType().getSizeInBits() >= 8))
      return DAG.getNode(ISD::TRUNCATE, dl, VT,
                         DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC));
  }

  // We are handling one of the integer comparisons here.  Since SSE only has
  // GT and EQ comparisons for integer, swapping operands and multiple
  // operations may be required for some comparisons.
  //
  // The flags below describe how the final sequence is assembled:
  //   Swap      - exchange Op0 and Op1 before emitting the operation.
  //   Invert    - apply a logical NOT to the result of the operation.
  //   FlipSigns - XOR the sign bit into both operands so that the signed
  //               PCMPGT implements an unsigned comparison.
  //   MinMax    - emit UMIN/UMAX and compare its result with Op0 for equality.
  //   Subus     - emit SUBUS and compare its result with zero for equality.
  unsigned Opc;
  bool Swap = false, Invert = false, FlipSigns = false, MinMax = false;
  bool Subus = false;

  // Each case that only sets Swap or Invert intentionally falls through to
  // the case that follows it.
  switch (SetCCOpcode) {
  default: llvm_unreachable("Unexpected SETCC condition");
  case ISD::SETNE:  Invert = true;
  case ISD::SETEQ:  Opc = X86ISD::PCMPEQ; break;
  case ISD::SETLT:  Swap = true;
  case ISD::SETGT:  Opc = X86ISD::PCMPGT; break;
  case ISD::SETGE:  Swap = true;
  case ISD::SETLE:  Opc = X86ISD::PCMPGT;
                    Invert = true; break;
  case ISD::SETULT: Swap = true;
  case ISD::SETUGT: Opc = X86ISD::PCMPGT;
                    FlipSigns = true; break;
  case ISD::SETUGE: Swap = true;
  case ISD::SETULE: Opc = X86ISD::PCMPGT;
                    FlipSigns = true; Invert = true; break;
  }

  // Special case: Use min/max operations for SETULE/SETUGE
  MVT VET = VT.getVectorElementType();
  bool hasMinMax =
       (Subtarget->hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32))
    || (Subtarget->hasSSE2()  && (VET == MVT::i8));

  if (hasMinMax) {
    switch (SetCCOpcode) {
    default: break;
    case ISD::SETULE: Opc = X86ISD::UMIN; MinMax = true; break;
    case ISD::SETUGE: Opc = X86ISD::UMAX; MinMax = true; break;
    }

    // The min/max form replaces the swap / invert / sign-flip recipe chosen
    // by the switch above.
    if (MinMax) { Swap = false; Invert = false; FlipSigns = false; }
  }

  bool hasSubus = Subtarget->hasSSE2() && (VET == MVT::i8 || VET == MVT::i16);
  if (!MinMax && hasSubus) {
    // As another special case, use PSUBUS[BW] when it's profitable. E.g. for
    // Op0 u<= Op1:
    //   t = psubus Op0, Op1
    //   pcmpeq t, <0..0>
    switch (SetCCOpcode) {
    default: break;
    case ISD::SETULT: {
      // If the comparison is against a constant we can turn this into a
      // setule.  With psubus, setule does not require a swap.  This is
      // beneficial because the constant in the register is no longer
      // destructed as the destination so it can be hoisted out of a loop.
      // Only do this pre-AVX since vpcmp* is no longer destructive.
      if (Subtarget->hasAVX())
        break;
      SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG);
      if (ULEOp1.getNode()) {
        Op1 = ULEOp1;
        Subus = true; Invert = false; Swap = false;
      }
      break;
    }
    // Psubus is better than flip-sign because it requires no inversion.
    case ISD::SETUGE: Subus = true; Invert = false; Swap = true;  break;
    case ISD::SETULE: Subus = true; Invert = false; Swap = false; break;
    }

    if (Subus) {
      Opc = X86ISD::SUBUS;
      FlipSigns = false;
    }
  }

  if (Swap)
    std::swap(Op0, Op1);

  // Check that the operation in question is available (most are plain SSE2,
  // but PCMPGTQ and PCMPEQQ have different requirements).
  if (VT == MVT::v2i64) {
    if (Opc == X86ISD::PCMPGT && !Subtarget->hasSSE42()) {
      assert(Subtarget->hasSSE2() && "Don't know how to lower!");

      // First cast everything to the right type.
      Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0);
      Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1);

      // Since SSE has no unsigned integer comparisons, we need to flip the sign
      // bits of the inputs before performing those operations. The lower
      // compare is always unsigned.
      SDValue SB;
      if (FlipSigns) {
        // Unsigned 64-bit compare: flip the sign bit of every 32-bit element.
        SB = DAG.getConstant(0x80000000U, MVT::v4i32);
      } else {
        // Signed 64-bit compare: flip only elements 0 and 2 (the elements
        // selected by MaskLo below).
        SDValue Sign = DAG.getConstant(0x80000000U, MVT::i32);
        SDValue Zero = DAG.getConstant(0x00000000U, MVT::i32);
        SB = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
                         Sign, Zero, Sign, Zero);
      }
      Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB);
      Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB);

      // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
      SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
      SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);

      // Create masks for only the low parts/high parts of the 64 bit integers.
      static const int MaskHi[] = { 1, 1, 3, 3 };
      static const int MaskLo[] = { 0, 0, 2, 2 };
      SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
      SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
      SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);

      SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
      Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);

      if (Invert)
        Result = DAG.getNOT(dl, Result, MVT::v4i32);

      return DAG.getNode(ISD::BITCAST, dl, VT, Result);
    }

    if (Opc == X86ISD::PCMPEQ && !Subtarget->hasSSE41()) {
      // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
      // pcmpeqd + pshufd + pand.
      assert(Subtarget->hasSSE2() && !FlipSigns && "Don't know how to lower!");

      // First cast everything to the right type.
      Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0);
      Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1);

      // Do the compare.
      SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);

      // Make sure the lower and upper halves are both all-ones.
      // The shuffle exchanges the two 32-bit elements within each pair.
      static const int Mask[] = { 1, 0, 3, 2 };
      SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
      Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);

      if (Invert)
        Result = DAG.getNOT(dl, Result, MVT::v4i32);

      return DAG.getNode(ISD::BITCAST, dl, VT, Result);
    }
  }

  // Since SSE has no unsigned integer comparisons, we need to flip the sign
  // bits of the inputs before performing those operations.
  if (FlipSigns) {
    EVT EltVT = VT.getVectorElementType();
    SDValue SB = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), VT);
    Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SB);
    Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SB);
  }

  // Opc is PCMPEQ, PCMPGT, UMIN, UMAX or SUBUS at this point.
  SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);

  // If the logical-not of the result is required, perform that now.
  if (Invert)
    Result = DAG.getNOT(dl, Result, VT);

  // Min/max form: the comparison is the equality of Op0 and UMIN/UMAX(Op0, Op1).
  if (MinMax)
    Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);

  // Subus form: the comparison is the equality of SUBUS(Op0, Op1) and zero.
  if (Subus)
    Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
                         getZeroVector(VT, Subtarget, DAG, dl));

  return Result;
}
15860
15861 SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
15862
15863   MVT VT = Op.getSimpleValueType();
15864
15865   if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
15866
15867   assert(((!Subtarget->hasAVX512() && VT == MVT::i8) || (VT == MVT::i1))
15868          && "SetCC type must be 8-bit or 1-bit integer");
15869   SDValue Op0 = Op.getOperand(0);
15870   SDValue Op1 = Op.getOperand(1);
15871   SDLoc dl(Op);
15872   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
15873
15874   // Optimize to BT if possible.
15875   // Lower (X & (1 << N)) == 0 to BT(X, N).
15876   // Lower ((X >>u N) & 1) != 0 to BT(X, N).
15877   // Lower ((X >>s N) & 1) != 0 to BT(X, N).
15878   if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() &&
15879       Op1.getOpcode() == ISD::Constant &&
15880       cast<ConstantSDNode>(Op1)->isNullValue() &&
15881       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
15882     SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG);
15883     if (NewSetCC.getNode()) {
15884       if (VT == MVT::i1)
15885         return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewSetCC);
15886       return NewSetCC;
15887     }
15888   }
15889
15890   // Look for X == 0, X == 1, X != 0, or X != 1.  We can simplify some forms of
15891   // these.
15892   if (Op1.getOpcode() == ISD::Constant &&
15893       (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 ||
15894        cast<ConstantSDNode>(Op1)->isNullValue()) &&
15895       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
15896
15897     // If the input is a setcc, then reuse the input setcc or use a new one with
15898     // the inverted condition.
15899     if (Op0.getOpcode() == X86ISD::SETCC) {
15900       X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
15901       bool Invert = (CC == ISD::SETNE) ^
15902         cast<ConstantSDNode>(Op1)->isNullValue();
15903       if (!Invert)
15904         return Op0;
15905
15906       CCode = X86::GetOppositeBranchCondition(CCode);
15907       SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
15908                                   DAG.getConstant(CCode, MVT::i8),
15909                                   Op0.getOperand(1));
15910       if (VT == MVT::i1)
15911         return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
15912       return SetCC;
15913     }
15914   }
15915   if ((Op0.getValueType() == MVT::i1) && (Op1.getOpcode() == ISD::Constant) &&
15916       (cast<ConstantSDNode>(Op1)->getZExtValue() == 1) &&
15917       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
15918
15919     ISD::CondCode NewCC = ISD::getSetCCInverse(CC, true);
15920     return DAG.getSetCC(dl, VT, Op0, DAG.getConstant(0, MVT::i1), NewCC);
15921   }
15922
15923   bool isFP = Op1.getSimpleValueType().isFloatingPoint();
15924   unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG);
15925   if (X86CC == X86::COND_INVALID)
15926     return SDValue();
15927
15928   SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG);
15929   EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
15930   SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
15931                               DAG.getConstant(X86CC, MVT::i8), EFLAGS);
15932   if (VT == MVT::i1)
15933     return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
15934   return SetCC;
15935 }
15936
15937 // isX86LogicalCmp - Return true if opcode is a X86 logical comparison.
15938 static bool isX86LogicalCmp(SDValue Op) {
15939   unsigned Opc = Op.getNode()->getOpcode();
15940   if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
15941       Opc == X86ISD::SAHF)
15942     return true;
15943   if (Op.getResNo() == 1 &&
15944       (Opc == X86ISD::ADD ||
15945        Opc == X86ISD::SUB ||
15946        Opc == X86ISD::ADC ||
15947        Opc == X86ISD::SBB ||
15948        Opc == X86ISD::SMUL ||
15949        Opc == X86ISD::UMUL ||
15950        Opc == X86ISD::INC ||
15951        Opc == X86ISD::DEC ||
15952        Opc == X86ISD::OR ||
15953        Opc == X86ISD::XOR ||
15954        Opc == X86ISD::AND))
15955     return true;
15956
15957   if (Op.getResNo() == 2 && Opc == X86ISD::UMUL)
15958     return true;
15959
15960   return false;
15961 }
15962
15963 static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
15964   if (V.getOpcode() != ISD::TRUNCATE)
15965     return false;
15966
15967   SDValue VOp0 = V.getOperand(0);
15968   unsigned InBits = VOp0.getValueSizeInBits();
15969   unsigned Bits = V.getValueSizeInBits();
15970   return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
15971 }
15972
15973 SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
15974   bool addTest = true;
15975   SDValue Cond  = Op.getOperand(0);
15976   SDValue Op1 = Op.getOperand(1);
15977   SDValue Op2 = Op.getOperand(2);
15978   SDLoc DL(Op);
15979   EVT VT = Op1.getValueType();
15980   SDValue CC;
15981
15982   // Lower fp selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
15983   // are available. Otherwise fp cmovs get lowered into a less efficient branch
15984   // sequence later on.
15985   if (Cond.getOpcode() == ISD::SETCC &&
15986       ((Subtarget->hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) ||
15987        (Subtarget->hasSSE1() && VT == MVT::f32)) &&
15988       VT == Cond.getOperand(0).getValueType() && Cond->hasOneUse()) {
15989     SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
15990     int SSECC = translateX86FSETCC(
15991         cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);
15992
15993     if (SSECC != 8) {
15994       if (Subtarget->hasAVX512()) {
15995         SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CondOp0, CondOp1,
15996                                   DAG.getConstant(SSECC, MVT::i8));
15997         return DAG.getNode(X86ISD::SELECT, DL, VT, Cmp, Op1, Op2);
15998       }
15999       SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
16000                                 DAG.getConstant(SSECC, MVT::i8));
16001       SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
16002       SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
16003       return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
16004     }
16005   }
16006
16007   if (Cond.getOpcode() == ISD::SETCC) {
16008     SDValue NewCond = LowerSETCC(Cond, DAG);
16009     if (NewCond.getNode())
16010       Cond = NewCond;
16011   }
16012
16013   // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
16014   // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
16015   // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
16016   // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
16017   if (Cond.getOpcode() == X86ISD::SETCC &&
16018       Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
16019       isZero(Cond.getOperand(1).getOperand(1))) {
16020     SDValue Cmp = Cond.getOperand(1);
16021
16022     unsigned CondCode =cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
16023
16024     if ((isAllOnes(Op1) || isAllOnes(Op2)) &&
16025         (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
16026       SDValue Y = isAllOnes(Op2) ? Op1 : Op2;
16027
16028       SDValue CmpOp0 = Cmp.getOperand(0);
16029       // Apply further optimizations for special cases
16030       // (select (x != 0), -1, 0) -> neg & sbb
16031       // (select (x == 0), 0, -1) -> neg & sbb
16032       if (ConstantSDNode *YC = dyn_cast<ConstantSDNode>(Y))
16033         if (YC->isNullValue() &&
16034             (isAllOnes(Op1) == (CondCode == X86::COND_NE))) {
16035           SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
16036           SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs,
16037                                     DAG.getConstant(0, CmpOp0.getValueType()),
16038                                     CmpOp0);
16039           SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
16040                                     DAG.getConstant(X86::COND_B, MVT::i8),
16041                                     SDValue(Neg.getNode(), 1));
16042           return Res;
16043         }
16044
16045       Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
16046                         CmpOp0, DAG.getConstant(1, CmpOp0.getValueType()));
16047       Cmp = ConvertCmpIfNecessary(Cmp, DAG);
16048
16049       SDValue Res =   // Res = 0 or -1.
16050         DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
16051                     DAG.getConstant(X86::COND_B, MVT::i8), Cmp);
16052
16053       if (isAllOnes(Op1) != (CondCode == X86::COND_E))
16054         Res = DAG.getNOT(DL, Res, Res.getValueType());
16055
16056       ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2);
16057       if (!N2C || !N2C->isNullValue())
16058         Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
16059       return Res;
16060     }
16061   }
16062
16063   // Look past (and (setcc_carry (cmp ...)), 1).
16064   if (Cond.getOpcode() == ISD::AND &&
16065       Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
16066     ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
16067     if (C && C->getAPIntValue() == 1)
16068       Cond = Cond.getOperand(0);
16069   }
16070
16071   // If condition flag is set by a X86ISD::CMP, then use it as the condition
16072   // setting operand in place of the X86ISD::SETCC.
16073   unsigned CondOpcode = Cond.getOpcode();
16074   if (CondOpcode == X86ISD::SETCC ||
16075       CondOpcode == X86ISD::SETCC_CARRY) {
16076     CC = Cond.getOperand(0);
16077
16078     SDValue Cmp = Cond.getOperand(1);
16079     unsigned Opc = Cmp.getOpcode();
16080     MVT VT = Op.getSimpleValueType();
16081
16082     bool IllegalFPCMov = false;
16083     if (VT.isFloatingPoint() && !VT.isVector() &&
16084         !isScalarFPTypeInSSEReg(VT))  // FPStack?
16085       IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
16086
16087     if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
16088         Opc == X86ISD::BT) { // FIXME
16089       Cond = Cmp;
16090       addTest = false;
16091     }
16092   } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
16093              CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
16094              ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
16095               Cond.getOperand(0).getValueType() != MVT::i8)) {
16096     SDValue LHS = Cond.getOperand(0);
16097     SDValue RHS = Cond.getOperand(1);
16098     unsigned X86Opcode;
16099     unsigned X86Cond;
16100     SDVTList VTs;
16101     switch (CondOpcode) {
16102     case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
16103     case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
16104     case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
16105     case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
16106     case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
16107     case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
16108     default: llvm_unreachable("unexpected overflowing operator");
16109     }
16110     if (CondOpcode == ISD::UMULO)
16111       VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
16112                           MVT::i32);
16113     else
16114       VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
16115
16116     SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS);
16117
16118     if (CondOpcode == ISD::UMULO)
16119       Cond = X86Op.getValue(2);
16120     else
16121       Cond = X86Op.getValue(1);
16122
16123     CC = DAG.getConstant(X86Cond, MVT::i8);
16124     addTest = false;
16125   }
16126
16127   if (addTest) {
16128     // Look pass the truncate if the high bits are known zero.
16129     if (isTruncWithZeroHighBitsInput(Cond, DAG))
16130         Cond = Cond.getOperand(0);
16131
16132     // We know the result of AND is compared against zero. Try to match
16133     // it to BT.
16134     if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
16135       SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG);
16136       if (NewSetCC.getNode()) {
16137         CC = NewSetCC.getOperand(0);
16138         Cond = NewSetCC.getOperand(1);
16139         addTest = false;
16140       }
16141     }
16142   }
16143
16144   if (addTest) {
16145     CC = DAG.getConstant(X86::COND_NE, MVT::i8);
16146     Cond = EmitTest(Cond, X86::COND_NE, DL, DAG);
16147   }
16148
16149   // a <  b ? -1 :  0 -> RES = ~setcc_carry
16150   // a <  b ?  0 : -1 -> RES = setcc_carry
16151   // a >= b ? -1 :  0 -> RES = setcc_carry
16152   // a >= b ?  0 : -1 -> RES = ~setcc_carry
16153   if (Cond.getOpcode() == X86ISD::SUB) {
16154     Cond = ConvertCmpIfNecessary(Cond, DAG);
16155     unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
16156
16157     if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
16158         (isAllOnes(Op1) || isAllOnes(Op2)) && (isZero(Op1) || isZero(Op2))) {
16159       SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
16160                                 DAG.getConstant(X86::COND_B, MVT::i8), Cond);
16161       if (isAllOnes(Op1) != (CondCode == X86::COND_B))
16162         return DAG.getNOT(DL, Res, Res.getValueType());
16163       return Res;
16164     }
16165   }
16166
16167   // X86 doesn't have an i8 cmov. If both operands are the result of a truncate
16168   // widen the cmov and push the truncate through. This avoids introducing a new
16169   // branch during isel and doesn't add any extensions.
16170   if (Op.getValueType() == MVT::i8 &&
16171       Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
16172     SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
16173     if (T1.getValueType() == T2.getValueType() &&
16174         // Blacklist CopyFromReg to avoid partial register stalls.
16175         T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
16176       SDVTList VTs = DAG.getVTList(T1.getValueType(), MVT::Glue);
16177       SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VTs, T2, T1, CC, Cond);
16178       return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
16179     }
16180   }
16181
16182   // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
16183   // condition is true.
16184   SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
16185   SDValue Ops[] = { Op2, Op1, CC, Cond };
16186   return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops);
16187 }
16188
16189 static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, const X86Subtarget *Subtarget,
16190                                        SelectionDAG &DAG) {
16191   MVT VT = Op->getSimpleValueType(0);
16192   SDValue In = Op->getOperand(0);
16193   MVT InVT = In.getSimpleValueType();
16194   MVT VTElt = VT.getVectorElementType();
16195   MVT InVTElt = InVT.getVectorElementType();
16196   SDLoc dl(Op);
16197
16198   // SKX processor
16199   if ((InVTElt == MVT::i1) &&
16200       (((Subtarget->hasBWI() && Subtarget->hasVLX() &&
16201         VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() <= 16)) ||
16202
16203        ((Subtarget->hasBWI() && VT.is512BitVector() &&
16204         VTElt.getSizeInBits() <= 16)) ||
16205
16206        ((Subtarget->hasDQI() && Subtarget->hasVLX() &&
16207         VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() >= 32)) ||
16208
16209        ((Subtarget->hasDQI() && VT.is512BitVector() &&
16210         VTElt.getSizeInBits() >= 32))))
16211     return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
16212
16213   unsigned int NumElts = VT.getVectorNumElements();
16214
16215   if (NumElts != 8 && NumElts != 16)
16216     return SDValue();
16217
16218   if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1) {
16219     if (In.getOpcode() == X86ISD::VSEXT || In.getOpcode() == X86ISD::VZEXT)
16220       return DAG.getNode(In.getOpcode(), dl, VT, In.getOperand(0));
16221     return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
16222   }
16223
16224   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16225   assert (InVT.getVectorElementType() == MVT::i1 && "Unexpected vector type");
16226
16227   MVT ExtVT = (NumElts == 8) ? MVT::v8i64 : MVT::v16i32;
16228   Constant *C = ConstantInt::get(*DAG.getContext(),
16229     APInt::getAllOnesValue(ExtVT.getScalarType().getSizeInBits()));
16230
16231   SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
16232   unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
16233   SDValue Ld = DAG.getLoad(ExtVT.getScalarType(), dl, DAG.getEntryNode(), CP,
16234                           MachinePointerInfo::getConstantPool(),
16235                           false, false, false, Alignment);
16236   SDValue Brcst = DAG.getNode(X86ISD::VBROADCASTM, dl, ExtVT, In, Ld);
16237   if (VT.is512BitVector())
16238     return Brcst;
16239   return DAG.getNode(X86ISD::VTRUNC, dl, VT, Brcst);
16240 }
16241
16242 static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
16243                                 SelectionDAG &DAG) {
16244   MVT VT = Op->getSimpleValueType(0);
16245   SDValue In = Op->getOperand(0);
16246   MVT InVT = In.getSimpleValueType();
16247   SDLoc dl(Op);
16248
16249   if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
16250     return LowerSIGN_EXTEND_AVX512(Op, Subtarget, DAG);
16251
16252   if ((VT != MVT::v4i64 || InVT != MVT::v4i32) &&
16253       (VT != MVT::v8i32 || InVT != MVT::v8i16) &&
16254       (VT != MVT::v16i16 || InVT != MVT::v16i8))
16255     return SDValue();
16256
16257   if (Subtarget->hasInt256())
16258     return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
16259
16260   // Optimize vectors in AVX mode
16261   // Sign extend  v8i16 to v8i32 and
16262   //              v4i32 to v4i64
16263   //
16264   // Divide input vector into two parts
16265   // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1}
16266   // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
16267   // concat the vectors to original VT
16268
16269   unsigned NumElems = InVT.getVectorNumElements();
16270   SDValue Undef = DAG.getUNDEF(InVT);
16271
16272   SmallVector<int,8> ShufMask1(NumElems, -1);
16273   for (unsigned i = 0; i != NumElems/2; ++i)
16274     ShufMask1[i] = i;
16275
16276   SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask1[0]);
16277
16278   SmallVector<int,8> ShufMask2(NumElems, -1);
16279   for (unsigned i = 0; i != NumElems/2; ++i)
16280     ShufMask2[i] = i + NumElems/2;
16281
16282   SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask2[0]);
16283
16284   MVT HalfVT = MVT::getVectorVT(VT.getScalarType(),
16285                                 VT.getVectorNumElements()/2);
16286
16287   OpLo = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpLo);
16288   OpHi = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpHi);
16289
16290   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
16291 }
16292
16293 // Lower vector extended loads using a shuffle. If SSSE3 is not available we
16294 // may emit an illegal shuffle but the expansion is still better than scalar
16295 // code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise
16296 // we'll emit a shuffle and a arithmetic shift.
16297 // FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
16298 // TODO: It is possible to support ZExt by zeroing the undef values during
16299 // the shuffle phase or after the shuffle.
16300 static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget,
16301                                  SelectionDAG &DAG) {
16302   MVT RegVT = Op.getSimpleValueType();
16303   assert(RegVT.isVector() && "We only custom lower vector sext loads.");
16304   assert(RegVT.isInteger() &&
16305          "We only custom lower integer vector sext loads.");
16306
16307   // Nothing useful we can do without SSE2 shuffles.
16308   assert(Subtarget->hasSSE2() && "We only custom lower sext loads with SSE2.");
16309
16310   LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
16311   SDLoc dl(Ld);
16312   EVT MemVT = Ld->getMemoryVT();
16313   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16314   unsigned RegSz = RegVT.getSizeInBits();
16315
16316   ISD::LoadExtType Ext = Ld->getExtensionType();
16317
16318   assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)
16319          && "Only anyext and sext are currently implemented.");
16320   assert(MemVT != RegVT && "Cannot extend to the same type");
16321   assert(MemVT.isVector() && "Must load a vector from memory");
16322
16323   unsigned NumElems = RegVT.getVectorNumElements();
16324   unsigned MemSz = MemVT.getSizeInBits();
16325   assert(RegSz > MemSz && "Register size must be greater than the mem size");
16326
16327   if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget->hasInt256()) {
16328     // The only way in which we have a legal 256-bit vector result but not the
16329     // integer 256-bit operations needed to directly lower a sextload is if we
16330     // have AVX1 but not AVX2. In that case, we can always emit a sextload to
16331     // a 128-bit vector and a normal sign_extend to 256-bits that should get
16332     // correctly legalized. We do this late to allow the canonical form of
16333     // sextload to persist throughout the rest of the DAG combiner -- it wants
16334     // to fold together any extensions it can, and so will fuse a sign_extend
16335     // of an sextload into a sextload targeting a wider value.
16336     SDValue Load;
16337     if (MemSz == 128) {
16338       // Just switch this to a normal load.
16339       assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, "
16340                                        "it must be a legal 128-bit vector "
16341                                        "type!");
16342       Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(),
16343                   Ld->getPointerInfo(), Ld->isVolatile(), Ld->isNonTemporal(),
16344                   Ld->isInvariant(), Ld->getAlignment());
16345     } else {
16346       assert(MemSz < 128 &&
16347              "Can't extend a type wider than 128 bits to a 256 bit vector!");
16348       // Do an sext load to a 128-bit vector type. We want to use the same
16349       // number of elements, but elements half as wide. This will end up being
16350       // recursively lowered by this routine, but will succeed as we definitely
16351       // have all the necessary features if we're using AVX1.
16352       EVT HalfEltVT =
16353           EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2);
16354       EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems);
16355       Load =
16356           DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(),
16357                          Ld->getPointerInfo(), MemVT, Ld->isVolatile(),
16358                          Ld->isNonTemporal(), Ld->isInvariant(),
16359                          Ld->getAlignment());
16360     }
16361
16362     // Replace chain users with the new chain.
16363     assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
16364     DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
16365
16366     // Finally, do a normal sign-extend to the desired register.
16367     return DAG.getSExtOrTrunc(Load, dl, RegVT);
16368   }
16369
16370   // All sizes must be a power of two.
16371   assert(isPowerOf2_32(RegSz * MemSz * NumElems) &&
16372          "Non-power-of-two elements are not custom lowered!");
16373
16374   // Attempt to load the original value using scalar loads.
16375   // Find the largest scalar type that divides the total loaded size.
16376   MVT SclrLoadTy = MVT::i8;
16377   for (MVT Tp : MVT::integer_valuetypes()) {
16378     if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
16379       SclrLoadTy = Tp;
16380     }
16381   }
16382
16383   // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
16384   if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
16385       (64 <= MemSz))
16386     SclrLoadTy = MVT::f64;
16387
16388   // Calculate the number of scalar loads that we need to perform
16389   // in order to load our vector from memory.
16390   unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
16391
16392   assert((Ext != ISD::SEXTLOAD || NumLoads == 1) &&
16393          "Can only lower sext loads with a single scalar load!");
16394
16395   unsigned loadRegZize = RegSz;
16396   if (Ext == ISD::SEXTLOAD && RegSz == 256)
16397     loadRegZize /= 2;
16398
16399   // Represent our vector as a sequence of elements which are the
16400   // largest scalar that we can load.
16401   EVT LoadUnitVecVT = EVT::getVectorVT(
16402       *DAG.getContext(), SclrLoadTy, loadRegZize / SclrLoadTy.getSizeInBits());
16403
16404   // Represent the data using the same element type that is stored in
16405   // memory. In practice, we ''widen'' MemVT.
16406   EVT WideVecVT =
16407       EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
16408                        loadRegZize / MemVT.getScalarType().getSizeInBits());
16409
16410   assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
16411          "Invalid vector type");
16412
16413   // We can't shuffle using an illegal type.
16414   assert(TLI.isTypeLegal(WideVecVT) &&
16415          "We only lower types that form legal widened vector types");
16416
16417   SmallVector<SDValue, 8> Chains;
16418   SDValue Ptr = Ld->getBasePtr();
16419   SDValue Increment =
16420       DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, TLI.getPointerTy());
16421   SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
16422
16423   for (unsigned i = 0; i < NumLoads; ++i) {
16424     // Perform a single load.
16425     SDValue ScalarLoad =
16426         DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
16427                     Ld->isVolatile(), Ld->isNonTemporal(), Ld->isInvariant(),
16428                     Ld->getAlignment());
16429     Chains.push_back(ScalarLoad.getValue(1));
16430     // Create the first element type using SCALAR_TO_VECTOR in order to avoid
16431     // another round of DAGCombining.
16432     if (i == 0)
16433       Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
16434     else
16435       Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
16436                         ScalarLoad, DAG.getIntPtrConstant(i));
16437
16438     Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
16439   }
16440
16441   SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
16442
16443   // Bitcast the loaded value to a vector of the original element type, in
16444   // the size of the target vector type.
16445   SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Res);
16446   unsigned SizeRatio = RegSz / MemSz;
16447
16448   if (Ext == ISD::SEXTLOAD) {
16449     // If we have SSE4.1, we can directly emit a VSEXT node.
16450     if (Subtarget->hasSSE41()) {
16451       SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec);
16452       DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
16453       return Sext;
16454     }
16455
16456     // Otherwise we'll shuffle the small elements in the high bits of the
16457     // larger type and perform an arithmetic shift. If the shift is not legal
16458     // it's better to scalarize.
16459     assert(TLI.isOperationLegalOrCustom(ISD::SRA, RegVT) &&
16460            "We can't implement a sext load without an arithmetic right shift!");
16461
16462     // Redistribute the loaded elements into the different locations.
16463     SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
16464     for (unsigned i = 0; i != NumElems; ++i)
16465       ShuffleVec[i * SizeRatio + SizeRatio - 1] = i;
16466
16467     SDValue Shuff = DAG.getVectorShuffle(
16468         WideVecVT, dl, SlicedVec, DAG.getUNDEF(WideVecVT), &ShuffleVec[0]);
16469
16470     Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
16471
16472     // Build the arithmetic shift.
16473     unsigned Amt = RegVT.getVectorElementType().getSizeInBits() -
16474                    MemVT.getVectorElementType().getSizeInBits();
16475     Shuff =
16476         DAG.getNode(ISD::SRA, dl, RegVT, Shuff, DAG.getConstant(Amt, RegVT));
16477
16478     DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
16479     return Shuff;
16480   }
16481
16482   // Redistribute the loaded elements into the different locations.
16483   SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
16484   for (unsigned i = 0; i != NumElems; ++i)
16485     ShuffleVec[i * SizeRatio] = i;
16486
16487   SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
16488                                        DAG.getUNDEF(WideVecVT), &ShuffleVec[0]);
16489
16490   // Bitcast to the requested type.
16491   Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
16492   DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
16493   return Shuff;
16494 }
16495
16496 // isAndOrOfSingleUseSetCCs - Return true if node is an ISD::AND or
16497 // ISD::OR of two X86ISD::SETCC nodes each of which has no other use apart
16498 // from the AND / OR.
16499 static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
16500   Opc = Op.getOpcode();
16501   if (Opc != ISD::OR && Opc != ISD::AND)
16502     return false;
16503   return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
16504           Op.getOperand(0).hasOneUse() &&
16505           Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
16506           Op.getOperand(1).hasOneUse());
16507 }
16508
16509 // isXor1OfSetCC - Return true if node is an ISD::XOR of a X86ISD::SETCC and
16510 // 1 and that the SETCC node has a single use.
16511 static bool isXor1OfSetCC(SDValue Op) {
16512   if (Op.getOpcode() != ISD::XOR)
16513     return false;
16514   ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
16515   if (N1C && N1C->getAPIntValue() == 1) {
16516     return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
16517       Op.getOperand(0).hasOneUse();
16518   }
16519   return false;
16520 }
16521
16522 SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
16523   bool addTest = true;
16524   SDValue Chain = Op.getOperand(0);
16525   SDValue Cond  = Op.getOperand(1);
16526   SDValue Dest  = Op.getOperand(2);
16527   SDLoc dl(Op);
16528   SDValue CC;
16529   bool Inverted = false;
16530
16531   if (Cond.getOpcode() == ISD::SETCC) {
16532     // Check for setcc([su]{add,sub,mul}o == 0).
16533     if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
16534         isa<ConstantSDNode>(Cond.getOperand(1)) &&
16535         cast<ConstantSDNode>(Cond.getOperand(1))->isNullValue() &&
16536         Cond.getOperand(0).getResNo() == 1 &&
16537         (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
16538          Cond.getOperand(0).getOpcode() == ISD::UADDO ||
16539          Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
16540          Cond.getOperand(0).getOpcode() == ISD::USUBO ||
16541          Cond.getOperand(0).getOpcode() == ISD::SMULO ||
16542          Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
16543       Inverted = true;
16544       Cond = Cond.getOperand(0);
16545     } else {
16546       SDValue NewCond = LowerSETCC(Cond, DAG);
16547       if (NewCond.getNode())
16548         Cond = NewCond;
16549     }
16550   }
16551 #if 0
16552   // FIXME: LowerXALUO doesn't handle these!!
16553   else if (Cond.getOpcode() == X86ISD::ADD  ||
16554            Cond.getOpcode() == X86ISD::SUB  ||
16555            Cond.getOpcode() == X86ISD::SMUL ||
16556            Cond.getOpcode() == X86ISD::UMUL)
16557     Cond = LowerXALUO(Cond, DAG);
16558 #endif
16559
16560   // Look pass (and (setcc_carry (cmp ...)), 1).
16561   if (Cond.getOpcode() == ISD::AND &&
16562       Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
16563     ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
16564     if (C && C->getAPIntValue() == 1)
16565       Cond = Cond.getOperand(0);
16566   }
16567
16568   // If condition flag is set by a X86ISD::CMP, then use it as the condition
16569   // setting operand in place of the X86ISD::SETCC.
16570   unsigned CondOpcode = Cond.getOpcode();
16571   if (CondOpcode == X86ISD::SETCC ||
16572       CondOpcode == X86ISD::SETCC_CARRY) {
16573     CC = Cond.getOperand(0);
16574
16575     SDValue Cmp = Cond.getOperand(1);
16576     unsigned Opc = Cmp.getOpcode();
16577     // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
16578     if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
16579       Cond = Cmp;
16580       addTest = false;
16581     } else {
16582       switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
16583       default: break;
16584       case X86::COND_O:
16585       case X86::COND_B:
16586         // These can only come from an arithmetic instruction with overflow,
16587         // e.g. SADDO, UADDO.
16588         Cond = Cond.getNode()->getOperand(1);
16589         addTest = false;
16590         break;
16591       }
16592     }
16593   }
16594   CondOpcode = Cond.getOpcode();
16595   if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
16596       CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
16597       ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
16598        Cond.getOperand(0).getValueType() != MVT::i8)) {
16599     SDValue LHS = Cond.getOperand(0);
16600     SDValue RHS = Cond.getOperand(1);
16601     unsigned X86Opcode;
16602     unsigned X86Cond;
16603     SDVTList VTs;
16604     // Keep this in sync with LowerXALUO, otherwise we might create redundant
16605     // instructions that can't be removed afterwards (i.e. X86ISD::ADD and
16606     // X86ISD::INC).
16607     switch (CondOpcode) {
16608     case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
16609     case ISD::SADDO:
16610       if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
16611         if (C->isOne()) {
16612           X86Opcode = X86ISD::INC; X86Cond = X86::COND_O;
16613           break;
16614         }
16615       X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
16616     case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
16617     case ISD::SSUBO:
16618       if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
16619         if (C->isOne()) {
16620           X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O;
16621           break;
16622         }
16623       X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
16624     case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
16625     case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
16626     default: llvm_unreachable("unexpected overflowing operator");
16627     }
16628     if (Inverted)
16629       X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond);
16630     if (CondOpcode == ISD::UMULO)
16631       VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
16632                           MVT::i32);
16633     else
16634       VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
16635
16636     SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS);
16637
16638     if (CondOpcode == ISD::UMULO)
16639       Cond = X86Op.getValue(2);
16640     else
16641       Cond = X86Op.getValue(1);
16642
16643     CC = DAG.getConstant(X86Cond, MVT::i8);
16644     addTest = false;
16645   } else {
16646     unsigned CondOpc;
16647     if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
16648       SDValue Cmp = Cond.getOperand(0).getOperand(1);
16649       if (CondOpc == ISD::OR) {
16650         // Also, recognize the pattern generated by an FCMP_UNE. We can emit
16651         // two branches instead of an explicit OR instruction with a
16652         // separate test.
16653         if (Cmp == Cond.getOperand(1).getOperand(1) &&
16654             isX86LogicalCmp(Cmp)) {
16655           CC = Cond.getOperand(0).getOperand(0);
16656           Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16657                               Chain, Dest, CC, Cmp);
16658           CC = Cond.getOperand(1).getOperand(0);
16659           Cond = Cmp;
16660           addTest = false;
16661         }
16662       } else { // ISD::AND
16663         // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
16664         // two branches instead of an explicit AND instruction with a
16665         // separate test. However, we only do this if this block doesn't
16666         // have a fall-through edge, because this requires an explicit
16667         // jmp when the condition is false.
16668         if (Cmp == Cond.getOperand(1).getOperand(1) &&
16669             isX86LogicalCmp(Cmp) &&
16670             Op.getNode()->hasOneUse()) {
16671           X86::CondCode CCode =
16672             (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
16673           CCode = X86::GetOppositeBranchCondition(CCode);
16674           CC = DAG.getConstant(CCode, MVT::i8);
16675           SDNode *User = *Op.getNode()->use_begin();
16676           // Look for an unconditional branch following this conditional branch.
16677           // We need this because we need to reverse the successors in order
16678           // to implement FCMP_OEQ.
16679           if (User->getOpcode() == ISD::BR) {
16680             SDValue FalseBB = User->getOperand(1);
16681             SDNode *NewBR =
16682               DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
16683             assert(NewBR == User);
16684             (void)NewBR;
16685             Dest = FalseBB;
16686
16687             Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16688                                 Chain, Dest, CC, Cmp);
16689             X86::CondCode CCode =
16690               (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
16691             CCode = X86::GetOppositeBranchCondition(CCode);
16692             CC = DAG.getConstant(CCode, MVT::i8);
16693             Cond = Cmp;
16694             addTest = false;
16695           }
16696         }
16697       }
16698     } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
16699       // Recognize for xorb (setcc), 1 patterns. The xor inverts the condition.
16700       // It should be transformed during dag combiner except when the condition
16701       // is set by a arithmetics with overflow node.
16702       X86::CondCode CCode =
16703         (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
16704       CCode = X86::GetOppositeBranchCondition(CCode);
16705       CC = DAG.getConstant(CCode, MVT::i8);
16706       Cond = Cond.getOperand(0).getOperand(1);
16707       addTest = false;
16708     } else if (Cond.getOpcode() == ISD::SETCC &&
16709                cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
16710       // For FCMP_OEQ, we can emit
16711       // two branches instead of an explicit AND instruction with a
16712       // separate test. However, we only do this if this block doesn't
16713       // have a fall-through edge, because this requires an explicit
16714       // jmp when the condition is false.
16715       if (Op.getNode()->hasOneUse()) {
16716         SDNode *User = *Op.getNode()->use_begin();
16717         // Look for an unconditional branch following this conditional branch.
16718         // We need this because we need to reverse the successors in order
16719         // to implement FCMP_OEQ.
16720         if (User->getOpcode() == ISD::BR) {
16721           SDValue FalseBB = User->getOperand(1);
16722           SDNode *NewBR =
16723             DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
16724           assert(NewBR == User);
16725           (void)NewBR;
16726           Dest = FalseBB;
16727
16728           SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
16729                                     Cond.getOperand(0), Cond.getOperand(1));
16730           Cmp = ConvertCmpIfNecessary(Cmp, DAG);
16731           CC = DAG.getConstant(X86::COND_NE, MVT::i8);
16732           Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16733                               Chain, Dest, CC, Cmp);
16734           CC = DAG.getConstant(X86::COND_P, MVT::i8);
16735           Cond = Cmp;
16736           addTest = false;
16737         }
16738       }
16739     } else if (Cond.getOpcode() == ISD::SETCC &&
16740                cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
16741       // For FCMP_UNE, we can emit
16742       // two branches instead of an explicit AND instruction with a
16743       // separate test. However, we only do this if this block doesn't
16744       // have a fall-through edge, because this requires an explicit
16745       // jmp when the condition is false.
16746       if (Op.getNode()->hasOneUse()) {
16747         SDNode *User = *Op.getNode()->use_begin();
16748         // Look for an unconditional branch following this conditional branch.
16749         // We need this because we need to reverse the successors in order
16750         // to implement FCMP_UNE.
16751         if (User->getOpcode() == ISD::BR) {
16752           SDValue FalseBB = User->getOperand(1);
16753           SDNode *NewBR =
16754             DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
16755           assert(NewBR == User);
16756           (void)NewBR;
16757
16758           SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
16759                                     Cond.getOperand(0), Cond.getOperand(1));
16760           Cmp = ConvertCmpIfNecessary(Cmp, DAG);
16761           CC = DAG.getConstant(X86::COND_NE, MVT::i8);
16762           Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16763                               Chain, Dest, CC, Cmp);
16764           CC = DAG.getConstant(X86::COND_NP, MVT::i8);
16765           Cond = Cmp;
16766           addTest = false;
16767           Dest = FalseBB;
16768         }
16769       }
16770     }
16771   }
16772
16773   if (addTest) {
16774     // Look pass the truncate if the high bits are known zero.
16775     if (isTruncWithZeroHighBitsInput(Cond, DAG))
16776         Cond = Cond.getOperand(0);
16777
16778     // We know the result of AND is compared against zero. Try to match
16779     // it to BT.
16780     if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
16781       SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG);
16782       if (NewSetCC.getNode()) {
16783         CC = NewSetCC.getOperand(0);
16784         Cond = NewSetCC.getOperand(1);
16785         addTest = false;
16786       }
16787     }
16788   }
16789
16790   if (addTest) {
16791     X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
16792     CC = DAG.getConstant(X86Cond, MVT::i8);
16793     Cond = EmitTest(Cond, X86Cond, dl, DAG);
16794   }
16795   Cond = ConvertCmpIfNecessary(Cond, DAG);
16796   return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16797                      Chain, Dest, CC, Cond);
16798 }
16799
16800 // Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
16801 // Calls to _alloca are needed to probe the stack when allocating more than 4k
16802 // bytes in one go. Touching the stack at 4K increments is necessary to ensure
16803 // that the guard pages used by the OS virtual memory manager are allocated in
16804 // correct sequence.
      //
      // Three paths are taken, selected from the subtarget and the function:
      //  * Neither Windows (non-MachO) nor split stacks: expand inline by
      //    subtracting the size from the stack pointer between CALLSEQ_START and
      //    CALLSEQ_END.
      //  * Split stacks: emit X86ISD::SEG_ALLOCA with the size in a fresh virtual
      //    register.
      //  * Otherwise: copy the size into RAX/EAX, emit X86ISD::WIN_ALLOCA and read
      //    the stack pointer back as the result.
      // Every path returns the merged pair { allocated address, output chain }.
16805 SDValue
16806 X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
16807                                            SelectionDAG &DAG) const {
16808   MachineFunction &MF = DAG.getMachineFunction();
16809   bool SplitStack = MF.shouldSplitStack();
        // Custom lowering is only needed on Windows (non-MachO) targets or when
        // split stacks are enabled; everything else takes the inline expansion.
16810   bool Lower = (Subtarget->isOSWindows() && !Subtarget->isTargetMachO()) ||
16811                SplitStack;
16812   SDLoc dl(Op);
16813
16814   if (!Lower) {
16815     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16816     SDNode* Node = Op.getNode();
16817
16818     unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
16819     assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
16820         " not tell us which reg is the stack pointer!");
16821     EVT VT = Node->getValueType(0);
          // Tmp1 and Tmp2 both wrap Node, so the operand reads below fetch Node's
          // own operands: 0 = chain, 1 = size, 2 = alignment. Tmp1 and Tmp2 are
          // later reused for the result value and the output chain.
16822     SDValue Tmp1 = SDValue(Node, 0);
16823     SDValue Tmp2 = SDValue(Node, 1);
16824     SDValue Tmp3 = Node->getOperand(2);
16825     SDValue Chain = Tmp1.getOperand(0);
16826
16827     // Chain the dynamic stack allocation so that it doesn't modify the stack
16828     // pointer when other instructions are using the stack.
16829     Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, true),
16830         SDLoc(Node));
16831
16832     SDValue Size = Tmp2.getOperand(1);
16833     SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
16834     Chain = SP.getValue(1);
16835     unsigned Align = cast<ConstantSDNode>(Tmp3)->getZExtValue();
16836     const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
16837     unsigned StackAlign = TFI.getStackAlignment();
16838     Tmp1 = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
          // Realign only when the request exceeds the frame's stack alignment.
          // ANDing with -Align clears the low bits of the new stack pointer
          // (assumes Align is a power of two -- TODO confirm).
16839     if (Align > StackAlign)
16840       Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1,
16841           DAG.getConstant(-(uint64_t)Align, VT));
16842     Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain
16843
16844     Tmp2 = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, true),
16845         DAG.getIntPtrConstant(0, true), SDValue(),
16846         SDLoc(Node));
16847
          // Results: the new stack pointer value and the CALLSEQ_END chain.
16848     SDValue Ops[2] = { Tmp1, Tmp2 };
16849     return DAG.getMergeValues(Ops, dl);
16850   }
16851
16852   // Get the inputs.
16853   SDValue Chain = Op.getOperand(0);
16854   SDValue Size  = Op.getOperand(1);
16855   unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
16856   EVT VT = Op.getNode()->getValueType(0);
16857
16858   bool Is64Bit = Subtarget->is64Bit();
16859   EVT SPTy = getPointerTy();
16860
16861   if (SplitStack) {
16862     MachineRegisterInfo &MRI = MF.getRegInfo();
16863
16864     if (Is64Bit) {
16865       // The 64 bit implementation of segmented stacks needs to clobber both r10
16866       // r11. This makes it impossible to use it along with nested parameters.
16867       const Function *F = MF.getFunction();
16868
16869       for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
16870            I != E; ++I)
16871         if (I->hasNestAttr())
16872           report_fatal_error("Cannot use segmented stacks with functions that "
16873                              "have nested arguments.");
16874     }
16875
          // Hand the size to SEG_ALLOCA through a fresh virtual register of the
          // pointer register class.
16876     const TargetRegisterClass *AddrRegClass =
16877       getRegClassFor(getPointerTy());
16878     unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
16879     Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
16880     SDValue Value = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
16881                                 DAG.getRegister(Vreg, SPTy));
          // The chain returned here is the CopyToReg chain; SEG_ALLOCA is created
          // with the single result type SPTy.
16882     SDValue Ops1[2] = { Value, Chain };
16883     return DAG.getMergeValues(Ops1, dl);
16884   } else {
16885     SDValue Flag;
          // The size is passed in RAX on LP64 targets and in EAX otherwise; the
          // glue result of the copy is fed straight into the WIN_ALLOCA node.
16886     const unsigned Reg = (Subtarget->isTarget64BitLP64() ? X86::RAX : X86::EAX);
16887
16888     Chain = DAG.getCopyToReg(Chain, dl, Reg, Size, Flag);
16889     Flag = Chain.getValue(1);
16890     SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
16891
16892     Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag);
16893
          // The allocation result is the stack pointer as read after WIN_ALLOCA.
16894     const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
16895     unsigned SPReg = RegInfo->getStackRegister();
16896     SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
16897     Chain = SP.getValue(1);
16898
          // Any non-zero alignment realigns SP here and writes it back; unlike the
          // inline path there is no comparison against the stack alignment.
          // NOTE(review): the AND is built with VT while SP was read as SPTy;
          // presumably both are the pointer type -- confirm.
16899     if (Align) {
16900       SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
16901                        DAG.getConstant(-(uint64_t)Align, VT));
16902       Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
16903     }
16904
16905     SDValue Ops1[2] = { SP, Chain };
16906     return DAG.getMergeValues(Ops1, dl);
16907   }
16908 }
16909
16910 SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
        // Lowers a VASTART node. Operand 0 is the chain, operand 1 the address of
        // the va_list object being initialized and operand 2 its source value.
        // On 32-bit and Win64 targets a single pointer is stored; otherwise the
        // four fields of the __va_list_tag structure are written.
16911   MachineFunction &MF = DAG.getMachineFunction();
16912   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
16913
16914   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
16915   SDLoc DL(Op);
16916
16917   if (!Subtarget->is64Bit() || Subtarget->isTargetWin64()) {
16918     // vastart just stores the address of the VarArgsFrameIndex slot into the
16919     // memory location argument.
16920     SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
16921                                    getPointerTy());
16922     return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
16923                         MachinePointerInfo(SV), false, false, 0);
16924   }
16925
16926   // __va_list_tag:
16927   //   gp_offset         (0 - 6 * 8)
16928   //   fp_offset         (48 - 48 + 8 * 16)
16929   //   overflow_arg_area (point to parameters coming in memory).
16930   //   reg_save_area
        //
        // FIN walks through the structure: it starts at the va_list address and is
        // advanced by 4, 4 and 8 bytes, so the four stores land at byte offsets
        // 0, 4, 8 and 16. Every store uses the incoming chain (operand 0); the
        // stores are only joined by the TokenFactor at the end.
16931   SmallVector<SDValue, 8> MemOps;
16932   SDValue FIN = Op.getOperand(1);
16933   // Store gp_offset (i32 at byte offset 0).
16934   SDValue Store = DAG.getStore(Op.getOperand(0), DL,
16935                                DAG.getConstant(FuncInfo->getVarArgsGPOffset(),
16936                                                MVT::i32),
16937                                FIN, MachinePointerInfo(SV), false, false, 0);
16938   MemOps.push_back(Store);
16939
16940   // Store fp_offset (i32 at byte offset 4).
16941   FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
16942                     FIN, DAG.getIntPtrConstant(4));
16943   Store = DAG.getStore(Op.getOperand(0), DL,
16944                        DAG.getConstant(FuncInfo->getVarArgsFPOffset(),
16945                                        MVT::i32),
16946                        FIN, MachinePointerInfo(SV, 4), false, false, 0);
16947   MemOps.push_back(Store);
16948
16949   // Store ptr to overflow_arg_area (the VarArgsFrameIndex slot, at byte
        // offset 8).
16950   FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
16951                     FIN, DAG.getIntPtrConstant(4));
16952   SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
16953                                     getPointerTy());
16954   Store = DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN,
16955                        MachinePointerInfo(SV, 8),
16956                        false, false, 0);
16957   MemOps.push_back(Store);
16958
16959   // Store ptr to reg_save_area (the RegSaveFrameIndex slot, at byte
        // offset 16).
16960   FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
16961                     FIN, DAG.getIntPtrConstant(8));
16962   SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
16963                                     getPointerTy());
16964   Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN,
16965                        MachinePointerInfo(SV, 16), false, false, 0);
16966   MemOps.push_back(Store);
        // The result chain depends on all four stores.
16967   return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
16968 }
16969
16970 SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
        // Lowers a 64-bit VAARG node into an X86ISD::VAARG_64 node, which yields
        // the address of the next argument, followed by a load from that address.
        // Operands of Op: 0 = chain, 1 = va_list pointer, 2 = source value,
        // 3 = alignment.
16971   assert(Subtarget->is64Bit() &&
16972          "LowerVAARG only handles 64-bit va_arg!");
16973   assert((Subtarget->isTargetLinux() ||
16974           Subtarget->isTargetDarwin()) &&
16975           "Unhandled target in LowerVAARG");
16976   assert(Op.getNode()->getNumOperands() == 4);
16977   SDValue Chain = Op.getOperand(0);
16978   SDValue SrcPtr = Op.getOperand(1);
16979   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
16980   unsigned Align = Op.getConstantOperandVal(3);
16981   SDLoc dl(Op);
16982
16983   EVT ArgVT = Op.getNode()->getValueType(0);
16984   Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
16985   uint32_t ArgSize = getDataLayout()->getTypeAllocSize(ArgTy);
        // ArgMode is assigned on every path that does not hit llvm_unreachable:
        // 1 selects gp_offset, 2 selects fp_offset.
16986   uint8_t ArgMode;
16987
16988   // Decide which area this value should be read from.
16989   // TODO: Implement the AMD64 ABI in its entirety. This simple
16990   // selection mechanism works only for the basic types.
16991   if (ArgVT == MVT::f80) {
16992     llvm_unreachable("va_arg for f80 not yet implemented");
16993   } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
16994     ArgMode = 2;  // Argument passed in XMM register. Use fp_offset.
16995   } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
16996     ArgMode = 1;  // Argument passed in GPR64 register(s). Use gp_offset.
16997   } else {
16998     llvm_unreachable("Unhandled argument type in LowerVAARG");
16999   }
17000
17001   if (ArgMode == 2) {
17002     // Sanity Check: Make sure using fp_offset makes sense.
          // That is: hardware floating point is in use, the function is not
          // marked NoImplicitFloat, and the subtarget has SSE1.
17003     assert(!DAG.getTarget().Options.UseSoftFloat &&
17004            !(DAG.getMachineFunction()
17005                 .getFunction()->getAttributes()
17006                 .hasAttribute(AttributeSet::FunctionIndex,
17007                               Attribute::NoImplicitFloat)) &&
17008            Subtarget->hasSSE1());
17009   }
17010
17011   // Insert VAARG_64 node into the DAG
17012   // VAARG_64 returns two values: Variable Argument Address, Chain
        // Its operands are, in order: chain, va_list pointer, argument size (i32),
        // argument mode (i8) and alignment (i32).
17013   SmallVector<SDValue, 11> InstOps;
17014   InstOps.push_back(Chain);
17015   InstOps.push_back(SrcPtr);
17016   InstOps.push_back(DAG.getConstant(ArgSize, MVT::i32));
17017   InstOps.push_back(DAG.getConstant(ArgMode, MVT::i8));
17018   InstOps.push_back(DAG.getConstant(Align, MVT::i32));
17019   SDVTList VTs = DAG.getVTList(getPointerTy(), MVT::Other);
        // The node is flagged as both reading and writing memory at SV (the
        // va_list); the memory VT passed here is i64.
17020   SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl,
17021                                           VTs, InstOps, MVT::i64,
17022                                           MachinePointerInfo(SV),
17023                                           /*Align=*/0,
17024                                           /*Volatile=*/false,
17025                                           /*ReadMem=*/true,
17026                                           /*WriteMem=*/true);
17027   Chain = VAARG.getValue(1);
17028
17029   // Load the next argument and return it
        // The load is chained after VAARG_64 and carries no pointer info.
17030   return DAG.getLoad(ArgVT, dl,
17031                      Chain,
17032                      VAARG,
17033                      MachinePointerInfo(),
17034                      false, false, false, 0);
17035 }
17036
17037 static SDValue LowerVACOPY(SDValue Op, const X86Subtarget *Subtarget,
17038                            SelectionDAG &DAG) {
17039   // X86-64 va_list is a struct { i32, i32, i8*, i8* }.
17040   assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!");
17041   SDValue Chain = Op.getOperand(0);
17042   SDValue DstPtr = Op.getOperand(1);
17043   SDValue SrcPtr = Op.getOperand(2);
17044   const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
17045   const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
17046   SDLoc DL(Op);
17047
17048   return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
17049                        DAG.getIntPtrConstant(24), 8, /*isVolatile*/false,
17050                        false,
17051                        MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
17052 }
17053
17054 // getTargetVShiftByConstNode - Handle vector element shifts where the shift
17055 // amount is a constant. Takes immediate version of shift as input.
17056 static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, MVT VT,
17057                                           SDValue SrcOp, uint64_t ShiftAmt,
17058                                           SelectionDAG &DAG) {
17059   MVT ElementType = VT.getVectorElementType();
17060
17061   // Fold this packed shift into its first operand if ShiftAmt is 0.
17062   if (ShiftAmt == 0)
17063     return SrcOp;
17064
17065   // Check for ShiftAmt >= element width
17066   if (ShiftAmt >= ElementType.getSizeInBits()) {
17067     if (Opc == X86ISD::VSRAI)
17068       ShiftAmt = ElementType.getSizeInBits() - 1;
17069     else
17070       return DAG.getConstant(0, VT);
17071   }
17072
17073   assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
17074          && "Unknown target vector shift-by-constant node");
17075
17076   // Fold this packed vector shift into a build vector if SrcOp is a
17077   // vector of Constants or UNDEFs, and SrcOp valuetype is the same as VT.
17078   if (VT == SrcOp.getSimpleValueType() &&
17079       ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
17080     SmallVector<SDValue, 8> Elts;
17081     unsigned NumElts = SrcOp->getNumOperands();
17082     ConstantSDNode *ND;
17083
17084     switch(Opc) {
17085     default: llvm_unreachable(nullptr);
17086     case X86ISD::VSHLI:
17087       for (unsigned i=0; i!=NumElts; ++i) {
17088         SDValue CurrentOp = SrcOp->getOperand(i);
17089         if (CurrentOp->getOpcode() == ISD::UNDEF) {
17090           Elts.push_back(CurrentOp);
17091           continue;
17092         }
17093         ND = cast<ConstantSDNode>(CurrentOp);
17094         const APInt &C = ND->getAPIntValue();
17095         Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), ElementType));
17096       }
17097       break;
17098     case X86ISD::VSRLI:
17099       for (unsigned i=0; i!=NumElts; ++i) {
17100         SDValue CurrentOp = SrcOp->getOperand(i);
17101         if (CurrentOp->getOpcode() == ISD::UNDEF) {
17102           Elts.push_back(CurrentOp);
17103           continue;
17104         }
17105         ND = cast<ConstantSDNode>(CurrentOp);
17106         const APInt &C = ND->getAPIntValue();
17107         Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), ElementType));
17108       }
17109       break;
17110     case X86ISD::VSRAI:
17111       for (unsigned i=0; i!=NumElts; ++i) {
17112         SDValue CurrentOp = SrcOp->getOperand(i);
17113         if (CurrentOp->getOpcode() == ISD::UNDEF) {
17114           Elts.push_back(CurrentOp);
17115           continue;
17116         }
17117         ND = cast<ConstantSDNode>(CurrentOp);
17118         const APInt &C = ND->getAPIntValue();
17119         Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), ElementType));
17120       }
17121       break;
17122     }
17123
17124     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Elts);
17125   }
17126
17127   return DAG.getNode(Opc, dl, VT, SrcOp, DAG.getConstant(ShiftAmt, MVT::i8));
17128 }
17129
17130 // getTargetVShiftNode - Handle vector element shifts where the shift amount
17131 // may or may not be a constant. Takes immediate version of shift as input.
      // A constant amount is forwarded to getTargetVShiftByConstNode. Otherwise
      // Opc is switched to its non-immediate counterpart and the scalar amount
      // (i32 or i64) is placed in a vector that is bitcast to a 128-bit vector
      // with VT's element type and used as the shift-amount operand.
17132 static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT,
17133                                    SDValue SrcOp, SDValue ShAmt,
17134                                    SelectionDAG &DAG) {
17135   MVT SVT = ShAmt.getSimpleValueType();
17136   assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");
17137
17138   // Catch shift-by-constant.
17139   if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
17140     return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
17141                                       CShAmt->getZExtValue(), DAG);
17142
17143   // Change opcode to non-immediate version
17144   switch (Opc) {
17145     default: llvm_unreachable("Unknown target vector shift node");
17146     case X86ISD::VSHLI: Opc = X86ISD::VSHL; break;
17147     case X86ISD::VSRLI: Opc = X86ISD::VSRL; break;
17148     case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;
17149   }
17150
17151   const X86Subtarget &Subtarget =
17152       static_cast<const X86Subtarget &>(DAG.getSubtarget());
        // Special case: with SSE4.1, an amount that is a zero-extension of an i16
        // is rebuilt from the original i16 placed in element 0 of a v8i16.
17153   if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
17154       ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) {
17155     // Let the shuffle legalizer expand this shift amount node.
          // NOTE(review): getShuffleVectorZeroOrUndef is defined elsewhere; with
          // these arguments it presumably zeroes the elements other than index 0
          // -- confirm against its definition.
17156     SDValue Op0 = ShAmt.getOperand(0);
17157     Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(Op0), MVT::v8i16, Op0);
17158     ShAmt = getShuffleVectorZeroOrUndef(Op0, 0, true, &Subtarget, DAG);
17159   } else {
17160     // Need to build a vector containing shift amount.
17161     // SSE/AVX packed shifts only use the lower 64-bit of the shift count.
          // For i32 the vector is v4i32 { ShAmt, 0, undef, undef }; for i64 it is
          // v2i64 { ShAmt, undef }.
17162     SmallVector<SDValue, 4> ShOps;
17163     ShOps.push_back(ShAmt);
17164     if (SVT == MVT::i32) {
17165       ShOps.push_back(DAG.getConstant(0, SVT));
17166       ShOps.push_back(DAG.getUNDEF(SVT));
17167     }
17168     ShOps.push_back(DAG.getUNDEF(SVT));
17169
17170     MVT BVT = SVT == MVT::i32 ? MVT::v4i32 : MVT::v2i64;
17171     ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, BVT, ShOps);
17172   }
17173
17174   // The shift-amount operand has to be a 128-bit vector type with the same
17175   // element type as VT, the type of the shifted value and of the result.
17176   MVT EltVT = VT.getVectorElementType();
17177   EVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());
17178
17179   ShAmt = DAG.getNode(ISD::BITCAST, dl, ShVT, ShAmt);
17180   return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
17181 }
17182
17183 /// \brief Return (and \p Op, \p Mask) for compare instructions or
17184 /// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
17185 /// necessary casting for \p Mask when lowering masking intrinsics.
17186 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
17187                                     SDValue PreservedSrc,
17188                                     const X86Subtarget *Subtarget,
17189                                     SelectionDAG &DAG) {
17190     EVT VT = Op.getValueType();
17191     EVT MaskVT = EVT::getVectorVT(*DAG.getContext(),
17192                                   MVT::i1, VT.getVectorNumElements());
17193     EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17194                                      Mask.getValueType().getSizeInBits());
17195     SDLoc dl(Op);
17196
17197     assert(MaskVT.isSimple() && "invalid mask type");
17198
17199     if (isAllOnes(Mask))
17200       return Op;
17201
17202     // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements
17203     // are extracted by EXTRACT_SUBVECTOR.
17204     SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
17205                               DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
17206                               DAG.getIntPtrConstant(0));
17207
17208     switch (Op.getOpcode()) {
17209       default: break;
17210       case X86ISD::PCMPEQM:
17211       case X86ISD::PCMPGTM:
17212       case X86ISD::CMPM:
17213       case X86ISD::CMPMU:
17214         return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
17215     }
17216     if (PreservedSrc.getOpcode() == ISD::UNDEF)
17217       PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
17218     return DAG.getNode(ISD::VSELECT, dl, VT, VMask, Op, PreservedSrc);
17219 }
17220
17221 /// \brief Creates an SDNode for a predicated scalar operation.
17222 /// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
17223 /// The mask is comming as MVT::i8 and it should be truncated
17224 /// to MVT::i1 while lowering masking intrinsics.
17225 /// The main difference between ScalarMaskingNode and VectorMaskingNode is using
17226 /// "X86select" instead of "vselect". We just can't create the "vselect" node for
17227 /// a scalar instruction.
17228 static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
17229                                     SDValue PreservedSrc,
17230                                     const X86Subtarget *Subtarget,
17231                                     SelectionDAG &DAG) {
17232     if (isAllOnes(Mask))
17233       return Op;
17234
17235     EVT VT = Op.getValueType();
17236     SDLoc dl(Op);
17237     // The mask should be of type MVT::i1
17238     SDValue IMask = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Mask);
17239
17240     if (PreservedSrc.getOpcode() == ISD::UNDEF)
17241       PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
17242     return DAG.getNode(X86ISD::SELECT, dl, VT, IMask, Op, PreservedSrc);
17243 }
17244
17245 static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
17246                                        SelectionDAG &DAG) {
17247   SDLoc dl(Op);
17248   unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
17249   EVT VT = Op.getValueType();
17250   const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
17251   if (IntrData) {
17252     switch(IntrData->Type) {
17253     case INTR_TYPE_1OP:
17254       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1));
17255     case INTR_TYPE_2OP:
17256       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
17257         Op.getOperand(2));
17258     case INTR_TYPE_3OP:
17259       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
17260         Op.getOperand(2), Op.getOperand(3));
17261     case INTR_TYPE_1OP_MASK_RM: {
17262       SDValue Src = Op.getOperand(1);
17263       SDValue Src0 = Op.getOperand(2);
17264       SDValue Mask = Op.getOperand(3);
17265       SDValue RoundingMode = Op.getOperand(4);
17266       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
17267                                               RoundingMode),
17268                                   Mask, Src0, Subtarget, DAG);
17269     }
17270     case INTR_TYPE_SCALAR_MASK_RM: {
17271       SDValue Src1 = Op.getOperand(1);
17272       SDValue Src2 = Op.getOperand(2);
17273       SDValue Src0 = Op.getOperand(3);
17274       SDValue Mask = Op.getOperand(4);
17275       SDValue RoundingMode = Op.getOperand(5);
17276       return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
17277                                               RoundingMode),
17278                                   Mask, Src0, Subtarget, DAG);
17279     }
17280     case INTR_TYPE_2OP_MASK: {
17281       SDValue Mask = Op.getOperand(4);
17282       SDValue PassThru = Op.getOperand(3);
17283       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
17284       if (IntrWithRoundingModeOpcode != 0) {
17285         unsigned Round = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue();
17286         if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) {
17287           return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
17288                                       dl, Op.getValueType(),
17289                                       Op.getOperand(1), Op.getOperand(2),
17290                                       Op.getOperand(3), Op.getOperand(5)),
17291                                       Mask, PassThru, Subtarget, DAG);
17292         }
17293       }
17294       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
17295                                               Op.getOperand(1),
17296                                               Op.getOperand(2)),
17297                                   Mask, PassThru, Subtarget, DAG);
17298     }
17299     case FMA_OP_MASK: {
17300       SDValue Src1 = Op.getOperand(1);
17301       SDValue Src2 = Op.getOperand(2);
17302       SDValue Src3 = Op.getOperand(3);
17303       SDValue Mask = Op.getOperand(4);
17304       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
17305       if (IntrWithRoundingModeOpcode != 0) {
17306         SDValue Rnd = Op.getOperand(5);
17307         if (cast<ConstantSDNode>(Rnd)->getZExtValue() !=
17308             X86::STATIC_ROUNDING::CUR_DIRECTION)
17309           return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
17310                                                   dl, Op.getValueType(),
17311                                                   Src1, Src2, Src3, Rnd),
17312                                       Mask, Src1, Subtarget, DAG);
17313       }
17314       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
17315                                               dl, Op.getValueType(),
17316                                               Src1, Src2, Src3),
17317                                   Mask, Src1, Subtarget, DAG);
17318     }
17319     case CMP_MASK:
17320     case CMP_MASK_CC: {
17321       // Comparison intrinsics with masks.
17322       // Example of transformation:
17323       // (i8 (int_x86_avx512_mask_pcmpeq_q_128
17324       //             (v2i64 %a), (v2i64 %b), (i8 %mask))) ->
17325       // (i8 (bitcast
17326       //   (v8i1 (insert_subvector undef,
17327       //           (v2i1 (and (PCMPEQM %a, %b),
17328       //                      (extract_subvector
17329       //                         (v8i1 (bitcast %mask)), 0))), 0))))
17330       EVT VT = Op.getOperand(1).getValueType();
17331       EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17332                                     VT.getVectorNumElements());
17333       SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3);
17334       EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17335                                        Mask.getValueType().getSizeInBits());
17336       SDValue Cmp;
17337       if (IntrData->Type == CMP_MASK_CC) {
17338         Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
17339                     Op.getOperand(2), Op.getOperand(3));
17340       } else {
17341         assert(IntrData->Type == CMP_MASK && "Unexpected intrinsic type!");
17342         Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
17343                     Op.getOperand(2));
17344       }
17345       SDValue CmpMask = getVectorMaskingNode(Cmp, Mask,
17346                                              DAG.getTargetConstant(0, MaskVT),
17347                                              Subtarget, DAG);
17348       SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
17349                                 DAG.getUNDEF(BitcastVT), CmpMask,
17350                                 DAG.getIntPtrConstant(0));
17351       return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
17352     }
17353     case COMI: { // Comparison intrinsics
17354       ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
17355       SDValue LHS = Op.getOperand(1);
17356       SDValue RHS = Op.getOperand(2);
17357       unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG);
17358       assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!");
17359       SDValue Cond = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
17360       SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17361                                   DAG.getConstant(X86CC, MVT::i8), Cond);
17362       return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
17363     }
17364     case VSHIFT:
17365       return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
17366                                  Op.getOperand(1), Op.getOperand(2), DAG);
17367     case VSHIFT_MASK:
17368       return getVectorMaskingNode(getTargetVShiftNode(IntrData->Opc0, dl,
17369                                                       Op.getSimpleValueType(),
17370                                                       Op.getOperand(1),
17371                                                       Op.getOperand(2), DAG),
17372                                   Op.getOperand(4), Op.getOperand(3), Subtarget,
17373                                   DAG);
17374     case COMPRESS_EXPAND_IN_REG: {
17375       SDValue Mask = Op.getOperand(3);
17376       SDValue DataToCompress = Op.getOperand(1);
17377       SDValue PassThru = Op.getOperand(2);
17378       if (isAllOnes(Mask)) // return data as is
17379         return Op.getOperand(1);
17380       EVT VT = Op.getValueType();
17381       EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17382                                     VT.getVectorNumElements());
17383       EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17384                                        Mask.getValueType().getSizeInBits());
17385       SDLoc dl(Op);
17386       SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
17387                                   DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
17388                                   DAG.getIntPtrConstant(0));
17389
17390       return DAG.getNode(IntrData->Opc0, dl, VT, VMask, DataToCompress,
17391                          PassThru);
17392     }
17393     case BLEND: {
17394       SDValue Mask = Op.getOperand(3);
17395       EVT VT = Op.getValueType();
17396       EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17397                                     VT.getVectorNumElements());
17398       EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17399                                        Mask.getValueType().getSizeInBits());
17400       SDLoc dl(Op);
17401       SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
17402                                   DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
17403                                   DAG.getIntPtrConstant(0));
17404       return DAG.getNode(IntrData->Opc0, dl, VT, VMask, Op.getOperand(1),
17405                          Op.getOperand(2));
17406     }
17407     default:
17408       break;
17409     }
17410   }
17411
17412   switch (IntNo) {
17413   default: return SDValue();    // Don't custom lower most intrinsics.
17414
17415   case Intrinsic::x86_avx512_mask_valign_q_512:
17416   case Intrinsic::x86_avx512_mask_valign_d_512:
17417     // Vector source operands are swapped.
17418     return getVectorMaskingNode(DAG.getNode(X86ISD::VALIGN, dl,
17419                                             Op.getValueType(), Op.getOperand(2),
17420                                             Op.getOperand(1),
17421                                             Op.getOperand(3)),
17422                                 Op.getOperand(5), Op.getOperand(4),
17423                                 Subtarget, DAG);
17424
17425   // ptest and testp intrinsics. The intrinsic these come from are designed to
17426   // return an integer value, not just an instruction so lower it to the ptest
17427   // or testp pattern and a setcc for the result.
17428   case Intrinsic::x86_sse41_ptestz:
17429   case Intrinsic::x86_sse41_ptestc:
17430   case Intrinsic::x86_sse41_ptestnzc:
17431   case Intrinsic::x86_avx_ptestz_256:
17432   case Intrinsic::x86_avx_ptestc_256:
17433   case Intrinsic::x86_avx_ptestnzc_256:
17434   case Intrinsic::x86_avx_vtestz_ps:
17435   case Intrinsic::x86_avx_vtestc_ps:
17436   case Intrinsic::x86_avx_vtestnzc_ps:
17437   case Intrinsic::x86_avx_vtestz_pd:
17438   case Intrinsic::x86_avx_vtestc_pd:
17439   case Intrinsic::x86_avx_vtestnzc_pd:
17440   case Intrinsic::x86_avx_vtestz_ps_256:
17441   case Intrinsic::x86_avx_vtestc_ps_256:
17442   case Intrinsic::x86_avx_vtestnzc_ps_256:
17443   case Intrinsic::x86_avx_vtestz_pd_256:
17444   case Intrinsic::x86_avx_vtestc_pd_256:
17445   case Intrinsic::x86_avx_vtestnzc_pd_256: {
17446     bool IsTestPacked = false;
17447     unsigned X86CC;
17448     switch (IntNo) {
17449     default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
17450     case Intrinsic::x86_avx_vtestz_ps:
17451     case Intrinsic::x86_avx_vtestz_pd:
17452     case Intrinsic::x86_avx_vtestz_ps_256:
17453     case Intrinsic::x86_avx_vtestz_pd_256:
17454       IsTestPacked = true; // Fallthrough
17455     case Intrinsic::x86_sse41_ptestz:
17456     case Intrinsic::x86_avx_ptestz_256:
17457       // ZF = 1
17458       X86CC = X86::COND_E;
17459       break;
17460     case Intrinsic::x86_avx_vtestc_ps:
17461     case Intrinsic::x86_avx_vtestc_pd:
17462     case Intrinsic::x86_avx_vtestc_ps_256:
17463     case Intrinsic::x86_avx_vtestc_pd_256:
17464       IsTestPacked = true; // Fallthrough
17465     case Intrinsic::x86_sse41_ptestc:
17466     case Intrinsic::x86_avx_ptestc_256:
17467       // CF = 1
17468       X86CC = X86::COND_B;
17469       break;
17470     case Intrinsic::x86_avx_vtestnzc_ps:
17471     case Intrinsic::x86_avx_vtestnzc_pd:
17472     case Intrinsic::x86_avx_vtestnzc_ps_256:
17473     case Intrinsic::x86_avx_vtestnzc_pd_256:
17474       IsTestPacked = true; // Fallthrough
17475     case Intrinsic::x86_sse41_ptestnzc:
17476     case Intrinsic::x86_avx_ptestnzc_256:
17477       // ZF and CF = 0
17478       X86CC = X86::COND_A;
17479       break;
17480     }
17481
17482     SDValue LHS = Op.getOperand(1);
17483     SDValue RHS = Op.getOperand(2);
17484     unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST;
17485     SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
17486     SDValue CC = DAG.getConstant(X86CC, MVT::i8);
17487     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test);
17488     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
17489   }
17490   case Intrinsic::x86_avx512_kortestz_w:
17491   case Intrinsic::x86_avx512_kortestc_w: {
17492     unsigned X86CC = (IntNo == Intrinsic::x86_avx512_kortestz_w)? X86::COND_E: X86::COND_B;
17493     SDValue LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, Op.getOperand(1));
17494     SDValue RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, Op.getOperand(2));
17495     SDValue CC = DAG.getConstant(X86CC, MVT::i8);
17496     SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
17497     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i1, CC, Test);
17498     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
17499   }
17500
17501   case Intrinsic::x86_sse42_pcmpistria128:
17502   case Intrinsic::x86_sse42_pcmpestria128:
17503   case Intrinsic::x86_sse42_pcmpistric128:
17504   case Intrinsic::x86_sse42_pcmpestric128:
17505   case Intrinsic::x86_sse42_pcmpistrio128:
17506   case Intrinsic::x86_sse42_pcmpestrio128:
17507   case Intrinsic::x86_sse42_pcmpistris128:
17508   case Intrinsic::x86_sse42_pcmpestris128:
17509   case Intrinsic::x86_sse42_pcmpistriz128:
17510   case Intrinsic::x86_sse42_pcmpestriz128: {
17511     unsigned Opcode;
17512     unsigned X86CC;
17513     switch (IntNo) {
17514     default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
17515     case Intrinsic::x86_sse42_pcmpistria128:
17516       Opcode = X86ISD::PCMPISTRI;
17517       X86CC = X86::COND_A;
17518       break;
17519     case Intrinsic::x86_sse42_pcmpestria128:
17520       Opcode = X86ISD::PCMPESTRI;
17521       X86CC = X86::COND_A;
17522       break;
17523     case Intrinsic::x86_sse42_pcmpistric128:
17524       Opcode = X86ISD::PCMPISTRI;
17525       X86CC = X86::COND_B;
17526       break;
17527     case Intrinsic::x86_sse42_pcmpestric128:
17528       Opcode = X86ISD::PCMPESTRI;
17529       X86CC = X86::COND_B;
17530       break;
17531     case Intrinsic::x86_sse42_pcmpistrio128:
17532       Opcode = X86ISD::PCMPISTRI;
17533       X86CC = X86::COND_O;
17534       break;
17535     case Intrinsic::x86_sse42_pcmpestrio128:
17536       Opcode = X86ISD::PCMPESTRI;
17537       X86CC = X86::COND_O;
17538       break;
17539     case Intrinsic::x86_sse42_pcmpistris128:
17540       Opcode = X86ISD::PCMPISTRI;
17541       X86CC = X86::COND_S;
17542       break;
17543     case Intrinsic::x86_sse42_pcmpestris128:
17544       Opcode = X86ISD::PCMPESTRI;
17545       X86CC = X86::COND_S;
17546       break;
17547     case Intrinsic::x86_sse42_pcmpistriz128:
17548       Opcode = X86ISD::PCMPISTRI;
17549       X86CC = X86::COND_E;
17550       break;
17551     case Intrinsic::x86_sse42_pcmpestriz128:
17552       Opcode = X86ISD::PCMPESTRI;
17553       X86CC = X86::COND_E;
17554       break;
17555     }
17556     SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
17557     SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
17558     SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps);
17559     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17560                                 DAG.getConstant(X86CC, MVT::i8),
17561                                 SDValue(PCMP.getNode(), 1));
17562     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
17563   }
17564
17565   case Intrinsic::x86_sse42_pcmpistri128:
17566   case Intrinsic::x86_sse42_pcmpestri128: {
17567     unsigned Opcode;
17568     if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
17569       Opcode = X86ISD::PCMPISTRI;
17570     else
17571       Opcode = X86ISD::PCMPESTRI;
17572
17573     SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
17574     SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
17575     return DAG.getNode(Opcode, dl, VTs, NewOps);
17576   }
17577   }
17578 }
17579
17580 static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
17581                               SDValue Src, SDValue Mask, SDValue Base,
17582                               SDValue Index, SDValue ScaleOp, SDValue Chain,
17583                               const X86Subtarget * Subtarget) {
17584   SDLoc dl(Op);
17585   ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
17586   assert(C && "Invalid scale type");
17587   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
17588   EVT MaskVT = MVT::getVectorVT(MVT::i1,
17589                              Index.getSimpleValueType().getVectorNumElements());
17590   SDValue MaskInReg;
17591   ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
17592   if (MaskC)
17593     MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT);
17594   else
17595     MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
17596   SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
17597   SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
17598   SDValue Segment = DAG.getRegister(0, MVT::i32);
17599   if (Src.getOpcode() == ISD::UNDEF)
17600     Src = getZeroVector(Op.getValueType(), Subtarget, DAG, dl);
17601   SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain};
17602   SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
17603   SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
17604   return DAG.getMergeValues(RetOps, dl);
17605 }
17606
17607 static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
17608                                SDValue Src, SDValue Mask, SDValue Base,
17609                                SDValue Index, SDValue ScaleOp, SDValue Chain) {
17610   SDLoc dl(Op);
17611   ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
17612   assert(C && "Invalid scale type");
17613   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
17614   SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
17615   SDValue Segment = DAG.getRegister(0, MVT::i32);
17616   EVT MaskVT = MVT::getVectorVT(MVT::i1,
17617                              Index.getSimpleValueType().getVectorNumElements());
17618   SDValue MaskInReg;
17619   ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
17620   if (MaskC)
17621     MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT);
17622   else
17623     MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
17624   SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
17625   SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain};
17626   SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
17627   return SDValue(Res, 1);
17628 }
17629
17630 static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
17631                                SDValue Mask, SDValue Base, SDValue Index,
17632                                SDValue ScaleOp, SDValue Chain) {
17633   SDLoc dl(Op);
17634   ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
17635   assert(C && "Invalid scale type");
17636   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
17637   SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
17638   SDValue Segment = DAG.getRegister(0, MVT::i32);
17639   EVT MaskVT =
17640     MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
17641   SDValue MaskInReg;
17642   ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
17643   if (MaskC)
17644     MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT);
17645   else
17646     MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
17647   //SDVTList VTs = DAG.getVTList(MVT::Other);
17648   SDValue Ops[] = {MaskInReg, Base, Scale, Index, Disp, Segment, Chain};
17649   SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
17650   return SDValue(Res, 0);
17651 }
17652
17653 // getReadPerformanceCounter - Handles the lowering of builtin intrinsics that
17654 // read performance monitor counters (x86_rdpmc).
17655 static void getReadPerformanceCounter(SDNode *N, SDLoc DL,
17656                               SelectionDAG &DAG, const X86Subtarget *Subtarget,
17657                               SmallVectorImpl<SDValue> &Results) {
17658   assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
17659   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
17660   SDValue LO, HI;
17661
17662   // The ECX register is used to select the index of the performance counter
17663   // to read.
17664   SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX,
17665                                    N->getOperand(2));
17666   SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain);
17667
17668   // Reads the content of a 64-bit performance counter and returns it in the
17669   // registers EDX:EAX.
17670   if (Subtarget->is64Bit()) {
17671     LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
17672     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
17673                             LO.getValue(2));
17674   } else {
17675     LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
17676     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
17677                             LO.getValue(2));
17678   }
17679   Chain = HI.getValue(1);
17680
17681   if (Subtarget->is64Bit()) {
17682     // The EAX register is loaded with the low-order 32 bits. The EDX register
17683     // is loaded with the supported high-order bits of the counter.
17684     SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
17685                               DAG.getConstant(32, MVT::i8));
17686     Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
17687     Results.push_back(Chain);
17688     return;
17689   }
17690
17691   // Use a buildpair to merge the two 32-bit values into a 64-bit one.
17692   SDValue Ops[] = { LO, HI };
17693   SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
17694   Results.push_back(Pair);
17695   Results.push_back(Chain);
17696 }
17697
17698 // getReadTimeStampCounter - Handles the lowering of builtin intrinsics that
17699 // read the time stamp counter (x86_rdtsc and x86_rdtscp). This function is
17700 // also used to custom lower READCYCLECOUNTER nodes.
17701 static void getReadTimeStampCounter(SDNode *N, SDLoc DL, unsigned Opcode,
17702                               SelectionDAG &DAG, const X86Subtarget *Subtarget,
17703                               SmallVectorImpl<SDValue> &Results) {
17704   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
17705   SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0));
17706   SDValue LO, HI;
17707
17708   // The processor's time-stamp counter (a 64-bit MSR) is stored into the
17709   // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
17710   // and the EAX register is loaded with the low-order 32 bits.
17711   if (Subtarget->is64Bit()) {
17712     LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
17713     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
17714                             LO.getValue(2));
17715   } else {
17716     LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
17717     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
17718                             LO.getValue(2));
17719   }
17720   SDValue Chain = HI.getValue(1);
17721
17722   if (Opcode == X86ISD::RDTSCP_DAG) {
17723     assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
17724
17725     // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
17726     // the ECX register. Add 'ecx' explicitly to the chain.
17727     SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32,
17728                                      HI.getValue(2));
17729     // Explicitly store the content of ECX at the location passed in input
17730     // to the 'rdtscp' intrinsic.
17731     Chain = DAG.getStore(ecx.getValue(1), DL, ecx, N->getOperand(2),
17732                          MachinePointerInfo(), false, false, 0);
17733   }
17734
17735   if (Subtarget->is64Bit()) {
17736     // The EDX register is loaded with the high-order 32 bits of the MSR, and
17737     // the EAX register is loaded with the low-order 32 bits.
17738     SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
17739                               DAG.getConstant(32, MVT::i8));
17740     Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
17741     Results.push_back(Chain);
17742     return;
17743   }
17744
17745   // Use a buildpair to merge the two 32-bit values into a 64-bit one.
17746   SDValue Ops[] = { LO, HI };
17747   SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
17748   Results.push_back(Pair);
17749   Results.push_back(Chain);
17750 }
17751
17752 static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget,
17753                                      SelectionDAG &DAG) {
17754   SmallVector<SDValue, 2> Results;
17755   SDLoc DL(Op);
17756   getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget,
17757                           Results);
17758   return DAG.getMergeValues(Results, DL);
17759 }
17760
17761
17762 static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
17763                                       SelectionDAG &DAG) {
17764   unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
17765
17766   const IntrinsicData* IntrData = getIntrinsicWithChain(IntNo);
17767   if (!IntrData)
17768     return SDValue();
17769
17770   SDLoc dl(Op);
17771   switch(IntrData->Type) {
17772   default:
17773     llvm_unreachable("Unknown Intrinsic Type");
17774     break;
17775   case RDSEED:
17776   case RDRAND: {
17777     // Emit the node with the right value type.
17778     SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other);
17779     SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
17780
17781     // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
17782     // Otherwise return the value from Rand, which is always 0, casted to i32.
17783     SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
17784                       DAG.getConstant(1, Op->getValueType(1)),
17785                       DAG.getConstant(X86::COND_B, MVT::i32),
17786                       SDValue(Result.getNode(), 1) };
17787     SDValue isValid = DAG.getNode(X86ISD::CMOV, dl,
17788                                   DAG.getVTList(Op->getValueType(1), MVT::Glue),
17789                                   Ops);
17790
17791     // Return { result, isValid, chain }.
17792     return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
17793                        SDValue(Result.getNode(), 2));
17794   }
17795   case GATHER: {
17796   //gather(v1, mask, index, base, scale);
17797     SDValue Chain = Op.getOperand(0);
17798     SDValue Src   = Op.getOperand(2);
17799     SDValue Base  = Op.getOperand(3);
17800     SDValue Index = Op.getOperand(4);
17801     SDValue Mask  = Op.getOperand(5);
17802     SDValue Scale = Op.getOperand(6);
17803     return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain,
17804                           Subtarget);
17805   }
17806   case SCATTER: {
17807   //scatter(base, mask, index, v1, scale);
17808     SDValue Chain = Op.getOperand(0);
17809     SDValue Base  = Op.getOperand(2);
17810     SDValue Mask  = Op.getOperand(3);
17811     SDValue Index = Op.getOperand(4);
17812     SDValue Src   = Op.getOperand(5);
17813     SDValue Scale = Op.getOperand(6);
17814     return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain);
17815   }
17816   case PREFETCH: {
17817     SDValue Hint = Op.getOperand(6);
17818     unsigned HintVal;
17819     if (dyn_cast<ConstantSDNode> (Hint) == nullptr ||
17820         (HintVal = dyn_cast<ConstantSDNode> (Hint)->getZExtValue()) > 1)
17821       llvm_unreachable("Wrong prefetch hint in intrinsic: should be 0 or 1");
17822     unsigned Opcode = (HintVal ? IntrData->Opc1 : IntrData->Opc0);
17823     SDValue Chain = Op.getOperand(0);
17824     SDValue Mask  = Op.getOperand(2);
17825     SDValue Index = Op.getOperand(3);
17826     SDValue Base  = Op.getOperand(4);
17827     SDValue Scale = Op.getOperand(5);
17828     return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain);
17829   }
17830   // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
17831   case RDTSC: {
17832     SmallVector<SDValue, 2> Results;
17833     getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget, Results);
17834     return DAG.getMergeValues(Results, dl);
17835   }
17836   // Read Performance Monitoring Counters.
17837   case RDPMC: {
17838     SmallVector<SDValue, 2> Results;
17839     getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results);
17840     return DAG.getMergeValues(Results, dl);
17841   }
17842   // XTEST intrinsics.
17843   case XTEST: {
17844     SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
17845     SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
17846     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17847                                 DAG.getConstant(X86::COND_NE, MVT::i8),
17848                                 InTrans);
17849     SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
17850     return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
17851                        Ret, SDValue(InTrans.getNode(), 1));
17852   }
17853   // ADC/ADCX/SBB
17854   case ADX: {
17855     SmallVector<SDValue, 2> Results;
17856     SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
17857     SDVTList VTs = DAG.getVTList(Op.getOperand(3)->getValueType(0), MVT::Other);
17858     SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2),
17859                                 DAG.getConstant(-1, MVT::i8));
17860     SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3),
17861                               Op.getOperand(4), GenCF.getValue(1));
17862     SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0),
17863                                  Op.getOperand(5), MachinePointerInfo(),
17864                                  false, false, 0);
17865     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17866                                 DAG.getConstant(X86::COND_B, MVT::i8),
17867                                 Res.getValue(1));
17868     Results.push_back(SetCC);
17869     Results.push_back(Store);
17870     return DAG.getMergeValues(Results, dl);
17871   }
17872   case COMPRESS_TO_MEM: {
17873     SDLoc dl(Op);
17874     SDValue Mask = Op.getOperand(4);
17875     SDValue DataToCompress = Op.getOperand(3);
17876     SDValue Addr = Op.getOperand(2);
17877     SDValue Chain = Op.getOperand(0);
17878
17879     if (isAllOnes(Mask)) // return just a store
17880       return DAG.getStore(Chain, dl, DataToCompress, Addr,
17881                           MachinePointerInfo(), false, false, 0);
17882
17883     EVT VT = DataToCompress.getValueType();
17884     EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17885                                   VT.getVectorNumElements());
17886     EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17887                                      Mask.getValueType().getSizeInBits());
17888     SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
17889                                 DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
17890                                 DAG.getIntPtrConstant(0));
17891
17892     SDValue Compressed =  DAG.getNode(IntrData->Opc0, dl, VT, VMask,
17893                                       DataToCompress, DAG.getUNDEF(VT));
17894     return DAG.getStore(Chain, dl, Compressed, Addr,
17895                         MachinePointerInfo(), false, false, 0);
17896   }
17897   case EXPAND_FROM_MEM: {
17898     SDLoc dl(Op);
17899     SDValue Mask = Op.getOperand(4);
17900     SDValue PathThru = Op.getOperand(3);
17901     SDValue Addr = Op.getOperand(2);
17902     SDValue Chain = Op.getOperand(0);
17903     EVT VT = Op.getValueType();
17904
17905     if (isAllOnes(Mask)) // return just a load
17906       return DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(), false, false,
17907                          false, 0);
17908     EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17909                                   VT.getVectorNumElements());
17910     EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17911                                      Mask.getValueType().getSizeInBits());
17912     SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
17913                                 DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
17914                                 DAG.getIntPtrConstant(0));
17915
17916     SDValue DataToExpand = DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(),
17917                                    false, false, false, 0);
17918
17919     SmallVector<SDValue, 2> Results;
17920     Results.push_back(DAG.getNode(IntrData->Opc0, dl, VT, VMask, DataToExpand,
17921                                   PathThru));
17922     Results.push_back(Chain);
17923     return DAG.getMergeValues(Results, dl);
17924   }
17925   }
17926 }
17927
17928 SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
17929                                            SelectionDAG &DAG) const {
17930   MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
17931   MFI->setReturnAddressIsTaken(true);
17932
17933   if (verifyReturnAddressArgumentIsConstant(Op, DAG))
17934     return SDValue();
17935
17936   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
17937   SDLoc dl(Op);
17938   EVT PtrVT = getPointerTy();
17939
17940   if (Depth > 0) {
17941     SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
17942     const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
17943     SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), PtrVT);
17944     return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
17945                        DAG.getNode(ISD::ADD, dl, PtrVT,
17946                                    FrameAddr, Offset),
17947                        MachinePointerInfo(), false, false, false, 0);
17948   }
17949
17950   // Just load the return address.
17951   SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
17952   return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
17953                      RetAddrFI, MachinePointerInfo(), false, false, false, 0);
17954 }
17955
17956 SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
17957   MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
17958   MFI->setFrameAddressIsTaken(true);
17959
17960   EVT VT = Op.getValueType();
17961   SDLoc dl(Op);  // FIXME probably not meaningful
17962   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
17963   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
17964   unsigned FrameReg = RegInfo->getPtrSizedFrameRegister(
17965       DAG.getMachineFunction());
17966   assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
17967           (FrameReg == X86::EBP && VT == MVT::i32)) &&
17968          "Invalid Frame Register!");
17969   SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
17970   while (Depth--)
17971     FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
17972                             MachinePointerInfo(),
17973                             false, false, false, 0);
17974   return FrameAddr;
17975 }
17976
17977 // FIXME? Maybe this could be a TableGen attribute on some registers and
17978 // this table could be generated automatically from RegInfo.
17979 unsigned X86TargetLowering::getRegisterByName(const char* RegName,
17980                                               EVT VT) const {
17981   unsigned Reg = StringSwitch<unsigned>(RegName)
17982                        .Case("esp", X86::ESP)
17983                        .Case("rsp", X86::RSP)
17984                        .Default(0);
17985   if (Reg)
17986     return Reg;
17987   report_fatal_error("Invalid register name global variable");
17988 }
17989
17990 SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
17991                                                      SelectionDAG &DAG) const {
17992   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
17993   return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize());
17994 }
17995
17996 SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
17997   SDValue Chain     = Op.getOperand(0);
17998   SDValue Offset    = Op.getOperand(1);
17999   SDValue Handler   = Op.getOperand(2);
18000   SDLoc dl      (Op);
18001
18002   EVT PtrVT = getPointerTy();
18003   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
18004   unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
18005   assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
18006           (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
18007          "Invalid Frame Register!");
18008   SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
18009   unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
18010
18011   SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
18012                                  DAG.getIntPtrConstant(RegInfo->getSlotSize()));
18013   StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
18014   Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(),
18015                        false, false, 0);
18016   Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
18017
18018   return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
18019                      DAG.getRegister(StoreAddrReg, PtrVT));
18020 }
18021
18022 SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
18023                                                SelectionDAG &DAG) const {
18024   SDLoc DL(Op);
18025   return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
18026                      DAG.getVTList(MVT::i32, MVT::Other),
18027                      Op.getOperand(0), Op.getOperand(1));
18028 }
18029
18030 SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
18031                                                 SelectionDAG &DAG) const {
18032   SDLoc DL(Op);
18033   return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
18034                      Op.getOperand(0), Op.getOperand(1));
18035 }
18036
18037 static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
18038   return Op.getOperand(0);
18039 }
18040
18041 SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
18042                                                 SelectionDAG &DAG) const {
18043   SDValue Root = Op.getOperand(0);
18044   SDValue Trmp = Op.getOperand(1); // trampoline
18045   SDValue FPtr = Op.getOperand(2); // nested function
18046   SDValue Nest = Op.getOperand(3); // 'nest' parameter value
18047   SDLoc dl (Op);
18048
18049   const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
18050   const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
18051
18052   if (Subtarget->is64Bit()) {
18053     SDValue OutChains[6];
18054
18055     // Large code-model.
18056     const unsigned char JMP64r  = 0xFF; // 64-bit jmp through register opcode.
18057     const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
18058
18059     const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
18060     const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
18061
18062     const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
18063
18064     // Load the pointer to the nested function into R11.
18065     unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
18066     SDValue Addr = Trmp;
18067     OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
18068                                 Addr, MachinePointerInfo(TrmpAddr),
18069                                 false, false, 0);
18070
18071     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
18072                        DAG.getConstant(2, MVT::i64));
18073     OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
18074                                 MachinePointerInfo(TrmpAddr, 2),
18075                                 false, false, 2);
18076
18077     // Load the 'nest' parameter value into R10.
18078     // R10 is specified in X86CallingConv.td
18079     OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
18080     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
18081                        DAG.getConstant(10, MVT::i64));
18082     OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
18083                                 Addr, MachinePointerInfo(TrmpAddr, 10),
18084                                 false, false, 0);
18085
18086     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
18087                        DAG.getConstant(12, MVT::i64));
18088     OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
18089                                 MachinePointerInfo(TrmpAddr, 12),
18090                                 false, false, 2);
18091
18092     // Jump to the nested function.
18093     OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
18094     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
18095                        DAG.getConstant(20, MVT::i64));
18096     OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
18097                                 Addr, MachinePointerInfo(TrmpAddr, 20),
18098                                 false, false, 0);
18099
18100     unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
18101     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
18102                        DAG.getConstant(22, MVT::i64));
18103     OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr,
18104                                 MachinePointerInfo(TrmpAddr, 22),
18105                                 false, false, 0);
18106
18107     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
18108   } else {
18109     const Function *Func =
18110       cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
18111     CallingConv::ID CC = Func->getCallingConv();
18112     unsigned NestReg;
18113
18114     switch (CC) {
18115     default:
18116       llvm_unreachable("Unsupported calling convention");
18117     case CallingConv::C:
18118     case CallingConv::X86_StdCall: {
18119       // Pass 'nest' parameter in ECX.
18120       // Must be kept in sync with X86CallingConv.td
18121       NestReg = X86::ECX;
18122
18123       // Check that ECX wasn't needed by an 'inreg' parameter.
18124       FunctionType *FTy = Func->getFunctionType();
18125       const AttributeSet &Attrs = Func->getAttributes();
18126
18127       if (!Attrs.isEmpty() && !Func->isVarArg()) {
18128         unsigned InRegCount = 0;
18129         unsigned Idx = 1;
18130
18131         for (FunctionType::param_iterator I = FTy->param_begin(),
18132              E = FTy->param_end(); I != E; ++I, ++Idx)
18133           if (Attrs.hasAttribute(Idx, Attribute::InReg))
18134             // FIXME: should only count parameters that are lowered to integers.
18135             InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32;
18136
18137         if (InRegCount > 2) {
18138           report_fatal_error("Nest register in use - reduce number of inreg"
18139                              " parameters!");
18140         }
18141       }
18142       break;
18143     }
18144     case CallingConv::X86_FastCall:
18145     case CallingConv::X86_ThisCall:
18146     case CallingConv::Fast:
18147       // Pass 'nest' parameter in EAX.
18148       // Must be kept in sync with X86CallingConv.td
18149       NestReg = X86::EAX;
18150       break;
18151     }
18152
18153     SDValue OutChains[4];
18154     SDValue Addr, Disp;
18155
18156     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
18157                        DAG.getConstant(10, MVT::i32));
18158     Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
18159
18160     // This is storing the opcode for MOV32ri.
18161     const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
18162     const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
18163     OutChains[0] = DAG.getStore(Root, dl,
18164                                 DAG.getConstant(MOV32ri|N86Reg, MVT::i8),
18165                                 Trmp, MachinePointerInfo(TrmpAddr),
18166                                 false, false, 0);
18167
18168     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
18169                        DAG.getConstant(1, MVT::i32));
18170     OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
18171                                 MachinePointerInfo(TrmpAddr, 1),
18172                                 false, false, 1);
18173
18174     const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
18175     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
18176                        DAG.getConstant(5, MVT::i32));
18177     OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr,
18178                                 MachinePointerInfo(TrmpAddr, 5),
18179                                 false, false, 1);
18180
18181     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
18182                        DAG.getConstant(6, MVT::i32));
18183     OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
18184                                 MachinePointerInfo(TrmpAddr, 6),
18185                                 false, false, 1);
18186
18187     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
18188   }
18189 }
18190
18191 SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
18192                                             SelectionDAG &DAG) const {
18193   /*
18194    The rounding mode is in bits 11:10 of FPSR, and has the following
18195    settings:
18196      00 Round to nearest
18197      01 Round to -inf
18198      10 Round to +inf
18199      11 Round to 0
18200
18201   FLT_ROUNDS, on the other hand, expects the following:
18202     -1 Undefined
18203      0 Round to 0
18204      1 Round to nearest
18205      2 Round to +inf
18206      3 Round to -inf
18207
18208   To perform the conversion, we do:
18209     (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3)
18210   */
18211
18212   MachineFunction &MF = DAG.getMachineFunction();
18213   const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
18214   unsigned StackAlignment = TFI.getStackAlignment();
18215   MVT VT = Op.getSimpleValueType();
18216   SDLoc DL(Op);
18217
18218   // Save FP Control Word to stack slot
18219   int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false);
18220   SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
18221
18222   MachineMemOperand *MMO =
18223    MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
18224                            MachineMemOperand::MOStore, 2, 2);
18225
18226   SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
18227   SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
18228                                           DAG.getVTList(MVT::Other),
18229                                           Ops, MVT::i16, MMO);
18230
18231   // Load FP Control Word from stack slot
18232   SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot,
18233                             MachinePointerInfo(), false, false, false, 0);
18234
18235   // Transform as necessary
18236   SDValue CWD1 =
18237     DAG.getNode(ISD::SRL, DL, MVT::i16,
18238                 DAG.getNode(ISD::AND, DL, MVT::i16,
18239                             CWD, DAG.getConstant(0x800, MVT::i16)),
18240                 DAG.getConstant(11, MVT::i8));
18241   SDValue CWD2 =
18242     DAG.getNode(ISD::SRL, DL, MVT::i16,
18243                 DAG.getNode(ISD::AND, DL, MVT::i16,
18244                             CWD, DAG.getConstant(0x400, MVT::i16)),
18245                 DAG.getConstant(9, MVT::i8));
18246
18247   SDValue RetVal =
18248     DAG.getNode(ISD::AND, DL, MVT::i16,
18249                 DAG.getNode(ISD::ADD, DL, MVT::i16,
18250                             DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),
18251                             DAG.getConstant(1, MVT::i16)),
18252                 DAG.getConstant(3, MVT::i16));
18253
18254   return DAG.getNode((VT.getSizeInBits() < 16 ?
18255                       ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
18256 }
18257
18258 static SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) {
18259   MVT VT = Op.getSimpleValueType();
18260   EVT OpVT = VT;
18261   unsigned NumBits = VT.getSizeInBits();
18262   SDLoc dl(Op);
18263
18264   Op = Op.getOperand(0);
18265   if (VT == MVT::i8) {
18266     // Zero extend to i32 since there is not an i8 bsr.
18267     OpVT = MVT::i32;
18268     Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
18269   }
18270
18271   // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
18272   SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
18273   Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
18274
18275   // If src is zero (i.e. bsr sets ZF), returns NumBits.
18276   SDValue Ops[] = {
18277     Op,
18278     DAG.getConstant(NumBits+NumBits-1, OpVT),
18279     DAG.getConstant(X86::COND_E, MVT::i8),
18280     Op.getValue(1)
18281   };
18282   Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
18283
18284   // Finally xor with NumBits-1.
18285   Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));
18286
18287   if (VT == MVT::i8)
18288     Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
18289   return Op;
18290 }
18291
18292 static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) {
18293   MVT VT = Op.getSimpleValueType();
18294   EVT OpVT = VT;
18295   unsigned NumBits = VT.getSizeInBits();
18296   SDLoc dl(Op);
18297
18298   Op = Op.getOperand(0);
18299   if (VT == MVT::i8) {
18300     // Zero extend to i32 since there is not an i8 bsr.
18301     OpVT = MVT::i32;
18302     Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
18303   }
18304
18305   // Issue a bsr (scan bits in reverse).
18306   SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
18307   Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
18308
18309   // And xor with NumBits-1.
18310   Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));
18311
18312   if (VT == MVT::i8)
18313     Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
18314   return Op;
18315 }
18316
18317 static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
18318   MVT VT = Op.getSimpleValueType();
18319   unsigned NumBits = VT.getSizeInBits();
18320   SDLoc dl(Op);
18321   Op = Op.getOperand(0);
18322
18323   // Issue a bsf (scan bits forward) which also sets EFLAGS.
18324   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
18325   Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op);
18326
18327   // If src is zero (i.e. bsf sets ZF), returns NumBits.
18328   SDValue Ops[] = {
18329     Op,
18330     DAG.getConstant(NumBits, VT),
18331     DAG.getConstant(X86::COND_E, MVT::i8),
18332     Op.getValue(1)
18333   };
18334   return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
18335 }
18336
18337 // Lower256IntArith - Break a 256-bit integer operation into two new 128-bit
18338 // ones, and then concatenate the result back.
18339 static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
18340   MVT VT = Op.getSimpleValueType();
18341
18342   assert(VT.is256BitVector() && VT.isInteger() &&
18343          "Unsupported value type for operation");
18344
18345   unsigned NumElems = VT.getVectorNumElements();
18346   SDLoc dl(Op);
18347
18348   // Extract the LHS vectors
18349   SDValue LHS = Op.getOperand(0);
18350   SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
18351   SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
18352
18353   // Extract the RHS vectors
18354   SDValue RHS = Op.getOperand(1);
18355   SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl);
18356   SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl);
18357
18358   MVT EltVT = VT.getVectorElementType();
18359   MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
18360
18361   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
18362                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
18363                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
18364 }
18365
18366 static SDValue LowerADD(SDValue Op, SelectionDAG &DAG) {
18367   assert(Op.getSimpleValueType().is256BitVector() &&
18368          Op.getSimpleValueType().isInteger() &&
18369          "Only handle AVX 256-bit vector integer operation");
18370   return Lower256IntArith(Op, DAG);
18371 }
18372
18373 static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) {
18374   assert(Op.getSimpleValueType().is256BitVector() &&
18375          Op.getSimpleValueType().isInteger() &&
18376          "Only handle AVX 256-bit vector integer operation");
18377   return Lower256IntArith(Op, DAG);
18378 }
18379
18380 static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
18381                         SelectionDAG &DAG) {
18382   SDLoc dl(Op);
18383   MVT VT = Op.getSimpleValueType();
18384
18385   // Decompose 256-bit ops into smaller 128-bit ops.
18386   if (VT.is256BitVector() && !Subtarget->hasInt256())
18387     return Lower256IntArith(Op, DAG);
18388
18389   SDValue A = Op.getOperand(0);
18390   SDValue B = Op.getOperand(1);
18391
18392   // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
18393   if (VT == MVT::v4i32) {
18394     assert(Subtarget->hasSSE2() && !Subtarget->hasSSE41() &&
18395            "Should not custom lower when pmuldq is available!");
18396
18397     // Extract the odd parts.
18398     static const int UnpackMask[] = { 1, -1, 3, -1 };
18399     SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
18400     SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
18401
18402     // Multiply the even parts.
18403     SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B);
18404     // Now multiply odd parts.
18405     SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds);
18406
18407     Evens = DAG.getNode(ISD::BITCAST, dl, VT, Evens);
18408     Odds = DAG.getNode(ISD::BITCAST, dl, VT, Odds);
18409
18410     // Merge the two vectors back together with a shuffle. This expands into 2
18411     // shuffles.
18412     static const int ShufMask[] = { 0, 4, 2, 6 };
18413     return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
18414   }
18415
18416   assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
18417          "Only know how to lower V2I64/V4I64/V8I64 multiply");
18418
18419   //  Ahi = psrlqi(a, 32);
18420   //  Bhi = psrlqi(b, 32);
18421   //
18422   //  AloBlo = pmuludq(a, b);
18423   //  AloBhi = pmuludq(a, Bhi);
18424   //  AhiBlo = pmuludq(Ahi, b);
18425
18426   //  AloBhi = psllqi(AloBhi, 32);
18427   //  AhiBlo = psllqi(AhiBlo, 32);
18428   //  return AloBlo + AloBhi + AhiBlo;
18429
18430   SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
18431   SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
18432
18433   // Bit cast to 32-bit vectors for MULUDQ
18434   EVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 :
18435                                   (VT == MVT::v4i64) ? MVT::v8i32 : MVT::v16i32;
18436   A = DAG.getNode(ISD::BITCAST, dl, MulVT, A);
18437   B = DAG.getNode(ISD::BITCAST, dl, MulVT, B);
18438   Ahi = DAG.getNode(ISD::BITCAST, dl, MulVT, Ahi);
18439   Bhi = DAG.getNode(ISD::BITCAST, dl, MulVT, Bhi);
18440
18441   SDValue AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
18442   SDValue AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
18443   SDValue AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
18444
18445   AloBhi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AloBhi, 32, DAG);
18446   AhiBlo = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AhiBlo, 32, DAG);
18447
18448   SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi);
18449   return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
18450 }
18451
18452 SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
18453   assert(Subtarget->isTargetWin64() && "Unexpected target");
18454   EVT VT = Op.getValueType();
18455   assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
18456          "Unexpected return type for lowering");
18457
18458   RTLIB::Libcall LC;
18459   bool isSigned;
18460   switch (Op->getOpcode()) {
18461   default: llvm_unreachable("Unexpected request for libcall!");
18462   case ISD::SDIV:      isSigned = true;  LC = RTLIB::SDIV_I128;    break;
18463   case ISD::UDIV:      isSigned = false; LC = RTLIB::UDIV_I128;    break;
18464   case ISD::SREM:      isSigned = true;  LC = RTLIB::SREM_I128;    break;
18465   case ISD::UREM:      isSigned = false; LC = RTLIB::UREM_I128;    break;
18466   case ISD::SDIVREM:   isSigned = true;  LC = RTLIB::SDIVREM_I128; break;
18467   case ISD::UDIVREM:   isSigned = false; LC = RTLIB::UDIVREM_I128; break;
18468   }
18469
18470   SDLoc dl(Op);
18471   SDValue InChain = DAG.getEntryNode();
18472
18473   TargetLowering::ArgListTy Args;
18474   TargetLowering::ArgListEntry Entry;
18475   for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
18476     EVT ArgVT = Op->getOperand(i).getValueType();
18477     assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
18478            "Unexpected argument type for lowering");
18479     SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
18480     Entry.Node = StackPtr;
18481     InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MachinePointerInfo(),
18482                            false, false, 16);
18483     Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
18484     Entry.Ty = PointerType::get(ArgTy,0);
18485     Entry.isSExt = false;
18486     Entry.isZExt = false;
18487     Args.push_back(Entry);
18488   }
18489
18490   SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
18491                                          getPointerTy());
18492
18493   TargetLowering::CallLoweringInfo CLI(DAG);
18494   CLI.setDebugLoc(dl).setChain(InChain)
18495     .setCallee(getLibcallCallingConv(LC),
18496                static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()),
18497                Callee, std::move(Args), 0)
18498     .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);
18499
18500   std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
18501   return DAG.getNode(ISD::BITCAST, dl, VT, CallInfo.first);
18502 }
18503
18504 static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget,
18505                              SelectionDAG &DAG) {
18506   SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
18507   EVT VT = Op0.getValueType();
18508   SDLoc dl(Op);
18509
18510   assert((VT == MVT::v4i32 && Subtarget->hasSSE2()) ||
18511          (VT == MVT::v8i32 && Subtarget->hasInt256()));
18512
18513   // PMULxD operations multiply each even value (starting at 0) of LHS with
18514   // the related value of RHS and produce a widen result.
18515   // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
18516   // => <2 x i64> <ae|cg>
18517   //
18518   // In other word, to have all the results, we need to perform two PMULxD:
18519   // 1. one with the even values.
18520   // 2. one with the odd values.
18521   // To achieve #2, with need to place the odd values at an even position.
18522   //
18523   // Place the odd value at an even position (basically, shift all values 1
18524   // step to the left):
18525   const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1};
18526   // <a|b|c|d> => <b|undef|d|undef>
18527   SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0, Mask);
18528   // <e|f|g|h> => <f|undef|h|undef>
18529   SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1, Mask);
18530
18531   // Emit two multiplies, one for the lower 2 ints and one for the higher 2
18532   // ints.
18533   MVT MulVT = VT == MVT::v4i32 ? MVT::v2i64 : MVT::v4i64;
18534   bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
18535   unsigned Opcode =
18536       (!IsSigned || !Subtarget->hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
18537   // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
18538   // => <2 x i64> <ae|cg>
18539   SDValue Mul1 = DAG.getNode(ISD::BITCAST, dl, VT,
18540                              DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
18541   // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
18542   // => <2 x i64> <bf|dh>
18543   SDValue Mul2 = DAG.getNode(ISD::BITCAST, dl, VT,
18544                              DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1));
18545
18546   // Shuffle it back into the right order.
18547   SDValue Highs, Lows;
18548   if (VT == MVT::v8i32) {
18549     const int HighMask[] = {1, 9, 3, 11, 5, 13, 7, 15};
18550     Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
18551     const int LowMask[] = {0, 8, 2, 10, 4, 12, 6, 14};
18552     Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
18553   } else {
18554     const int HighMask[] = {1, 5, 3, 7};
18555     Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
18556     const int LowMask[] = {0, 4, 2, 6};
18557     Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
18558   }
18559
18560   // If we have a signed multiply but no PMULDQ fix up the high parts of a
18561   // unsigned multiply.
18562   if (IsSigned && !Subtarget->hasSSE41()) {
18563     SDValue ShAmt =
18564         DAG.getConstant(31, DAG.getTargetLoweringInfo().getShiftAmountTy(VT));
18565     SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
18566                              DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
18567     SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
18568                              DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0);
18569
18570     SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
18571     Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
18572   }
18573
18574   // The first result of MUL_LOHI is actually the low value, followed by the
18575   // high value.
18576   SDValue Ops[] = {Lows, Highs};
18577   return DAG.getMergeValues(Ops, dl);
18578 }
18579
18580 static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
18581                                          const X86Subtarget *Subtarget) {
18582   MVT VT = Op.getSimpleValueType();
18583   SDLoc dl(Op);
18584   SDValue R = Op.getOperand(0);
18585   SDValue Amt = Op.getOperand(1);
18586
18587   // Optimize shl/srl/sra with constant shift amount.
18588   if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
18589     if (auto *ShiftConst = BVAmt->getConstantSplatNode()) {
18590       uint64_t ShiftAmt = ShiftConst->getZExtValue();
18591
18592       if (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 ||
18593           (Subtarget->hasInt256() &&
18594            (VT == MVT::v4i64 || VT == MVT::v8i32 || VT == MVT::v16i16)) ||
18595           (Subtarget->hasAVX512() &&
18596            (VT == MVT::v8i64 || VT == MVT::v16i32))) {
18597         if (Op.getOpcode() == ISD::SHL)
18598           return getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, R, ShiftAmt,
18599                                             DAG);
18600         if (Op.getOpcode() == ISD::SRL)
18601           return getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt,
18602                                             DAG);
18603         if (Op.getOpcode() == ISD::SRA && VT != MVT::v2i64 && VT != MVT::v4i64)
18604           return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, ShiftAmt,
18605                                             DAG);
18606       }
18607
18608       if (VT == MVT::v16i8) {
18609         if (Op.getOpcode() == ISD::SHL) {
18610           // Make a large shift.
18611           SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl,
18612                                                    MVT::v8i16, R, ShiftAmt,
18613                                                    DAG);
18614           SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL);
18615           // Zero out the rightmost bits.
18616           SmallVector<SDValue, 16> V(16,
18617                                      DAG.getConstant(uint8_t(-1U << ShiftAmt),
18618                                                      MVT::i8));
18619           return DAG.getNode(ISD::AND, dl, VT, SHL,
18620                              DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
18621         }
18622         if (Op.getOpcode() == ISD::SRL) {
18623           // Make a large shift.
18624           SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl,
18625                                                    MVT::v8i16, R, ShiftAmt,
18626                                                    DAG);
18627           SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL);
18628           // Zero out the leftmost bits.
18629           SmallVector<SDValue, 16> V(16,
18630                                      DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
18631                                                      MVT::i8));
18632           return DAG.getNode(ISD::AND, dl, VT, SRL,
18633                              DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
18634         }
18635         if (Op.getOpcode() == ISD::SRA) {
18636           if (ShiftAmt == 7) {
18637             // R s>> 7  ===  R s< 0
18638             SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
18639             return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
18640           }
18641
18642           // R s>> a === ((R u>> a) ^ m) - m
18643           SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
18644           SmallVector<SDValue, 16> V(16, DAG.getConstant(128 >> ShiftAmt,
18645                                                          MVT::i8));
18646           SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V);
18647           Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
18648           Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
18649           return Res;
18650         }
18651         llvm_unreachable("Unknown shift opcode.");
18652       }
18653
18654       if (Subtarget->hasInt256() && VT == MVT::v32i8) {
18655         if (Op.getOpcode() == ISD::SHL) {
18656           // Make a large shift.
18657           SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl,
18658                                                    MVT::v16i16, R, ShiftAmt,
18659                                                    DAG);
18660           SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL);
18661           // Zero out the rightmost bits.
18662           SmallVector<SDValue, 32> V(32,
18663                                      DAG.getConstant(uint8_t(-1U << ShiftAmt),
18664                                                      MVT::i8));
18665           return DAG.getNode(ISD::AND, dl, VT, SHL,
18666                              DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
18667         }
18668         if (Op.getOpcode() == ISD::SRL) {
18669           // Make a large shift.
18670           SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl,
18671                                                    MVT::v16i16, R, ShiftAmt,
18672                                                    DAG);
18673           SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL);
18674           // Zero out the leftmost bits.
18675           SmallVector<SDValue, 32> V(32,
18676                                      DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
18677                                                      MVT::i8));
18678           return DAG.getNode(ISD::AND, dl, VT, SRL,
18679                              DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
18680         }
18681         if (Op.getOpcode() == ISD::SRA) {
18682           if (ShiftAmt == 7) {
18683             // R s>> 7  ===  R s< 0
18684             SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
18685             return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
18686           }
18687
18688           // R s>> a === ((R u>> a) ^ m) - m
18689           SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
18690           SmallVector<SDValue, 32> V(32, DAG.getConstant(128 >> ShiftAmt,
18691                                                          MVT::i8));
18692           SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V);
18693           Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
18694           Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
18695           return Res;
18696         }
18697         llvm_unreachable("Unknown shift opcode.");
18698       }
18699     }
18700   }
18701
18702   // Special case in 32-bit mode, where i64 is expanded into high and low parts.
18703   if (!Subtarget->is64Bit() &&
18704       (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64)) &&
18705       Amt.getOpcode() == ISD::BITCAST &&
18706       Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
18707     Amt = Amt.getOperand(0);
18708     unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
18709                      VT.getVectorNumElements();
18710     unsigned RatioInLog2 = Log2_32_Ceil(Ratio);
18711     uint64_t ShiftAmt = 0;
18712     for (unsigned i = 0; i != Ratio; ++i) {
18713       ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i));
18714       if (!C)
18715         return SDValue();
18716       // 6 == Log2(64)
18717       ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2)));
18718     }
18719     // Check remaining shift amounts.
18720     for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
18721       uint64_t ShAmt = 0;
18722       for (unsigned j = 0; j != Ratio; ++j) {
18723         ConstantSDNode *C =
18724           dyn_cast<ConstantSDNode>(Amt.getOperand(i + j));
18725         if (!C)
18726           return SDValue();
18727         // 6 == Log2(64)
18728         ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2)));
18729       }
18730       if (ShAmt != ShiftAmt)
18731         return SDValue();
18732     }
18733     switch (Op.getOpcode()) {
18734     default:
18735       llvm_unreachable("Unknown shift opcode!");
18736     case ISD::SHL:
18737       return getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, R, ShiftAmt,
18738                                         DAG);
18739     case ISD::SRL:
18740       return getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt,
18741                                         DAG);
18742     case ISD::SRA:
18743       return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, ShiftAmt,
18744                                         DAG);
18745     }
18746   }
18747
18748   return SDValue();
18749 }
18750
/// Try to lower a vector shift in which every lane is shifted by the same
/// scalar amount (a splat) to a single target shift node. Returns an empty
/// SDValue if no splat amount is recognized or the type/opcode combination is
/// not handled here.
18751 static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
18752                                         const X86Subtarget* Subtarget) {
18753   MVT VT = Op.getSimpleValueType();
18754   SDLoc dl(Op);
18755   SDValue R = Op.getOperand(0);
18756   SDValue Amt = Op.getOperand(1);
18757
        // Handled here: v2i64 (no SRA), v4i32, v8i16; with hasInt256() also v4i64
        // (no SRA), v8i32, v16i16; with hasAVX512() also v8i64 and v16i32.
18758   if ((VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) ||
18759       VT == MVT::v4i32 || VT == MVT::v8i16 ||
18760       (Subtarget->hasInt256() &&
18761        ((VT == MVT::v4i64 && Op.getOpcode() != ISD::SRA) ||
18762         VT == MVT::v8i32 || VT == MVT::v16i16)) ||
18763        (Subtarget->hasAVX512() && (VT == MVT::v8i64 || VT == MVT::v16i32))) {
          // The scalar shift amount shared by all lanes; stays empty if the
          // amount is not recognized as a splat.
18764     SDValue BaseShAmt;
18765     EVT EltVT = VT.getVectorElementType();
18766
18767     if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Amt)) {
18768       // Check if this build_vector node is doing a splat.
18769       // If so, then set BaseShAmt equal to the splat value.
18770       BaseShAmt = BV->getSplatValue();
            // A splat of UNDEF is treated as "no splat amount found".
18771       if (BaseShAmt && BaseShAmt.getOpcode() == ISD::UNDEF)
18772         BaseShAmt = SDValue();
18773     } else {
            // Look through an EXTRACT_SUBVECTOR to the vector it extracts from.
18774       if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR)
18775         Amt = Amt.getOperand(0);
18776
18777       ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt);
18778       if (SVN && SVN->isSplat()) {
18779         unsigned SplatIdx = (unsigned)SVN->getSplatIndex();
18780         SDValue InVec = Amt.getOperand(0);
              // Prefer taking the scalar directly from the node that defined the
              // splatted element (a BUILD_VECTOR operand, or the value inserted
              // by an INSERT_VECTOR_ELT at exactly the splat index).
18781         if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
18782           assert((SplatIdx < InVec.getValueType().getVectorNumElements()) &&
18783                  "Unexpected shuffle index found!");
18784           BaseShAmt = InVec.getOperand(SplatIdx);
18785         } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
18786            if (ConstantSDNode *C =
18787                dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
18788              if (C->getZExtValue() == SplatIdx)
18789                BaseShAmt = InVec.getOperand(1);
18790            }
18791         }
18792
18793         if (!BaseShAmt)
18794           // Avoid introducing an extract element from a shuffle.
18795           BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InVec,
18796                                     DAG.getIntPtrConstant(SplatIdx));
18797       }
18798     }
18799
18800     if (BaseShAmt.getNode()) {
18801       assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
            // Widen the scalar amount: element types wider than i32 (other than
            // i64 itself) are zero-extended to i64, element types narrower than
            // i32 are zero-extended to i32.
18802       if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
18803         BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
18804       else if (EltVT.bitsLT(MVT::i32))
18805         BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
18806
            // Pick the target opcode; vector types not listed for an opcode
            // return an empty SDValue from this function.
18807       switch (Op.getOpcode()) {
18808       default:
18809         llvm_unreachable("Unknown shift opcode!");
18810       case ISD::SHL:
18811         switch (VT.SimpleTy) {
18812         default: return SDValue();
18813         case MVT::v2i64:
18814         case MVT::v4i32:
18815         case MVT::v8i16:
18816         case MVT::v4i64:
18817         case MVT::v8i32:
18818         case MVT::v16i16:
18819         case MVT::v16i32:
18820         case MVT::v8i64:
18821           return getTargetVShiftNode(X86ISD::VSHLI, dl, VT, R, BaseShAmt, DAG);
18822         }
18823       case ISD::SRA:
18824         switch (VT.SimpleTy) {
18825         default: return SDValue();
18826         case MVT::v4i32:
18827         case MVT::v8i16:
18828         case MVT::v8i32:
18829         case MVT::v16i16:
18830         case MVT::v16i32:
18831         case MVT::v8i64:
18832           return getTargetVShiftNode(X86ISD::VSRAI, dl, VT, R, BaseShAmt, DAG);
18833         }
18834       case ISD::SRL:
18835         switch (VT.SimpleTy) {
18836         default: return SDValue();
18837         case MVT::v2i64:
18838         case MVT::v4i32:
18839         case MVT::v8i16:
18840         case MVT::v4i64:
18841         case MVT::v8i32:
18842         case MVT::v16i16:
18843         case MVT::v16i32:
18844         case MVT::v8i64:
18845           return getTargetVShiftNode(X86ISD::VSRLI, dl, VT, R, BaseShAmt, DAG);
18846         }
18847       }
18848     }
18849   }
18850
18851   // Special case in 32-bit mode, where i64 is expanded into high and low parts.
        // Here the amount is a BITCAST of a BUILD_VECTOR with narrower elements.
18852   if (!Subtarget->is64Bit() &&
18853       (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64) ||
18854       (Subtarget->hasAVX512() && VT == MVT::v8i64)) &&
18855       Amt.getOpcode() == ISD::BITCAST &&
18856       Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
18857     Amt = Amt.getOperand(0);
          // Ratio: how many build_vector operands make up one element of VT.
18858     unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
18859                      VT.getVectorNumElements();
          // Vals holds the operands forming the first element; every later group
          // of Ratio operands must repeat them, i.e. the amount must be a splat.
18860     std::vector<SDValue> Vals(Ratio);
18861     for (unsigned i = 0; i != Ratio; ++i)
18862       Vals[i] = Amt.getOperand(i);
18863     for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
18864       for (unsigned j = 0; j != Ratio; ++j)
18865         if (Vals[j] != Amt.getOperand(i + j))
18866           return SDValue();
18867     }
          // The shift node takes the original amount operand (the BITCAST), not
          // the build_vector that was peeled off above.
18868     switch (Op.getOpcode()) {
18869     default:
18870       llvm_unreachable("Unknown shift opcode!");
18871     case ISD::SHL:
18872       return DAG.getNode(X86ISD::VSHL, dl, VT, R, Op.getOperand(1));
18873     case ISD::SRL:
18874       return DAG.getNode(X86ISD::VSRL, dl, VT, R, Op.getOperand(1));
18875     case ISD::SRA:
18876       return DAG.getNode(X86ISD::VSRA, dl, VT, R, Op.getOperand(1));
18877     }
18878   }
18879
18880   return SDValue();
18881 }
18882
18883 static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
18884                           SelectionDAG &DAG) {
18885   MVT VT = Op.getSimpleValueType();
18886   SDLoc dl(Op);
18887   SDValue R = Op.getOperand(0);
18888   SDValue Amt = Op.getOperand(1);
18889   SDValue V;
18890
18891   assert(VT.isVector() && "Custom lowering only for vector shifts!");
18892   assert(Subtarget->hasSSE2() && "Only custom lower when we have SSE2!");
18893
18894   V = LowerScalarImmediateShift(Op, DAG, Subtarget);
18895   if (V.getNode())
18896     return V;
18897
18898   V = LowerScalarVariableShift(Op, DAG, Subtarget);
18899   if (V.getNode())
18900       return V;
18901
18902   if (Subtarget->hasAVX512() && (VT == MVT::v16i32 || VT == MVT::v8i64))
18903     return Op;
18904   // AVX2 has VPSLLV/VPSRAV/VPSRLV.
18905   if (Subtarget->hasInt256()) {
18906     if (Op.getOpcode() == ISD::SRL &&
18907         (VT == MVT::v2i64 || VT == MVT::v4i32 ||
18908          VT == MVT::v4i64 || VT == MVT::v8i32))
18909       return Op;
18910     if (Op.getOpcode() == ISD::SHL &&
18911         (VT == MVT::v2i64 || VT == MVT::v4i32 ||
18912          VT == MVT::v4i64 || VT == MVT::v8i32))
18913       return Op;
18914     if (Op.getOpcode() == ISD::SRA && (VT == MVT::v4i32 || VT == MVT::v8i32))
18915       return Op;
18916   }
18917
18918   // If possible, lower this packed shift into a vector multiply instead of
18919   // expanding it into a sequence of scalar shifts.
18920   // Do this only if the vector shift count is a constant build_vector.
18921   if (Op.getOpcode() == ISD::SHL &&
18922       (VT == MVT::v8i16 || VT == MVT::v4i32 ||
18923        (Subtarget->hasInt256() && VT == MVT::v16i16)) &&
18924       ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
18925     SmallVector<SDValue, 8> Elts;
18926     EVT SVT = VT.getScalarType();
18927     unsigned SVTBits = SVT.getSizeInBits();
18928     const APInt &One = APInt(SVTBits, 1);
18929     unsigned NumElems = VT.getVectorNumElements();
18930
18931     for (unsigned i=0; i !=NumElems; ++i) {
18932       SDValue Op = Amt->getOperand(i);
18933       if (Op->getOpcode() == ISD::UNDEF) {
18934         Elts.push_back(Op);
18935         continue;
18936       }
18937
18938       ConstantSDNode *ND = cast<ConstantSDNode>(Op);
18939       const APInt &C = APInt(SVTBits, ND->getAPIntValue().getZExtValue());
18940       uint64_t ShAmt = C.getZExtValue();
18941       if (ShAmt >= SVTBits) {
18942         Elts.push_back(DAG.getUNDEF(SVT));
18943         continue;
18944       }
18945       Elts.push_back(DAG.getConstant(One.shl(ShAmt), SVT));
18946     }
18947     SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Elts);
18948     return DAG.getNode(ISD::MUL, dl, VT, R, BV);
18949   }
18950
18951   // Lower SHL with variable shift amount.
18952   if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
18953     Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, VT));
18954
18955     Op = DAG.getNode(ISD::ADD, dl, VT, Op, DAG.getConstant(0x3f800000U, VT));
18956     Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, Op);
18957     Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
18958     return DAG.getNode(ISD::MUL, dl, VT, Op, R);
18959   }
18960
18961   // If possible, lower this shift as a sequence of two shifts by
18962   // constant plus a MOVSS/MOVSD instead of scalarizing it.
18963   // Example:
18964   //   (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
18965   //
18966   // Could be rewritten as:
18967   //   (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
18968   //
18969   // The advantage is that the two shifts from the example would be
18970   // lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing
18971   // the vector shift into four scalar shifts plus four pairs of vector
18972   // insert/extract.
18973   if ((VT == MVT::v8i16 || VT == MVT::v4i32) &&
18974       ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
18975     unsigned TargetOpcode = X86ISD::MOVSS;
18976     bool CanBeSimplified;
18977     // The splat value for the first packed shift (the 'X' from the example).
18978     SDValue Amt1 = Amt->getOperand(0);
18979     // The splat value for the second packed shift (the 'Y' from the example).
18980     SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) :
18981                                         Amt->getOperand(2);
18982
18983     // See if it is possible to replace this node with a sequence of
18984     // two shifts followed by a MOVSS/MOVSD
18985     if (VT == MVT::v4i32) {
18986       // Check if it is legal to use a MOVSS.
18987       CanBeSimplified = Amt2 == Amt->getOperand(2) &&
18988                         Amt2 == Amt->getOperand(3);
18989       if (!CanBeSimplified) {
18990         // Otherwise, check if we can still simplify this node using a MOVSD.
18991         CanBeSimplified = Amt1 == Amt->getOperand(1) &&
18992                           Amt->getOperand(2) == Amt->getOperand(3);
18993         TargetOpcode = X86ISD::MOVSD;
18994         Amt2 = Amt->getOperand(2);
18995       }
18996     } else {
18997       // Do similar checks for the case where the machine value type
18998       // is MVT::v8i16.
18999       CanBeSimplified = Amt1 == Amt->getOperand(1);
19000       for (unsigned i=3; i != 8 && CanBeSimplified; ++i)
19001         CanBeSimplified = Amt2 == Amt->getOperand(i);
19002
19003       if (!CanBeSimplified) {
19004         TargetOpcode = X86ISD::MOVSD;
19005         CanBeSimplified = true;
19006         Amt2 = Amt->getOperand(4);
19007         for (unsigned i=0; i != 4 && CanBeSimplified; ++i)
19008           CanBeSimplified = Amt1 == Amt->getOperand(i);
19009         for (unsigned j=4; j != 8 && CanBeSimplified; ++j)
19010           CanBeSimplified = Amt2 == Amt->getOperand(j);
19011       }
19012     }
19013
19014     if (CanBeSimplified && isa<ConstantSDNode>(Amt1) &&
19015         isa<ConstantSDNode>(Amt2)) {
19016       // Replace this node with two shifts followed by a MOVSS/MOVSD.
19017       EVT CastVT = MVT::v4i32;
19018       SDValue Splat1 =
19019         DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), VT);
19020       SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
19021       SDValue Splat2 =
19022         DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), VT);
19023       SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
19024       if (TargetOpcode == X86ISD::MOVSD)
19025         CastVT = MVT::v2i64;
19026       SDValue BitCast1 = DAG.getNode(ISD::BITCAST, dl, CastVT, Shift1);
19027       SDValue BitCast2 = DAG.getNode(ISD::BITCAST, dl, CastVT, Shift2);
19028       SDValue Result = getTargetShuffleNode(TargetOpcode, dl, CastVT, BitCast2,
19029                                             BitCast1, DAG);
19030       return DAG.getNode(ISD::BITCAST, dl, VT, Result);
19031     }
19032   }
19033
19034   if (VT == MVT::v16i8 && Op->getOpcode() == ISD::SHL) {
19035     assert(Subtarget->hasSSE2() && "Need SSE2 for pslli/pcmpeq.");
19036
19037     // a = a << 5;
19038     Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(5, VT));
19039     Op = DAG.getNode(ISD::BITCAST, dl, VT, Op);
19040
19041     // Turn 'a' into a mask suitable for VSELECT
19042     SDValue VSelM = DAG.getConstant(0x80, VT);
19043     SDValue OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
19044     OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
19045
19046     SDValue CM1 = DAG.getConstant(0x0f, VT);
19047     SDValue CM2 = DAG.getConstant(0x3f, VT);
19048
19049     // r = VSELECT(r, psllw(r & (char16)15, 4), a);
19050     SDValue M = DAG.getNode(ISD::AND, dl, VT, R, CM1);
19051     M = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, MVT::v8i16, M, 4, DAG);
19052     M = DAG.getNode(ISD::BITCAST, dl, VT, M);
19053     R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R);
19054
19055     // a += a
19056     Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);
19057     OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
19058     OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
19059
19060     // r = VSELECT(r, psllw(r & (char16)63, 2), a);
19061     M = DAG.getNode(ISD::AND, dl, VT, R, CM2);
19062     M = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, MVT::v8i16, M, 2, DAG);
19063     M = DAG.getNode(ISD::BITCAST, dl, VT, M);
19064     R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R);
19065
19066     // a += a
19067     Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);
19068     OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
19069     OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
19070
19071     // return VSELECT(r, r+r, a);
19072     R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel,
19073                     DAG.getNode(ISD::ADD, dl, VT, R, R), R);
19074     return R;
19075   }
19076
19077   // It's worth extending once and using the v8i32 shifts for 16-bit types, but
19078   // the extra overheads to get from v16i8 to v8i32 make the existing SSE
19079   // solution better.
19080   if (Subtarget->hasInt256() && VT == MVT::v8i16) {
19081     MVT NewVT = VT == MVT::v8i16 ? MVT::v8i32 : MVT::v16i16;
19082     unsigned ExtOpc =
19083         Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
19084     R = DAG.getNode(ExtOpc, dl, NewVT, R);
19085     Amt = DAG.getNode(ISD::ANY_EXTEND, dl, NewVT, Amt);
19086     return DAG.getNode(ISD::TRUNCATE, dl, VT,
19087                        DAG.getNode(Op.getOpcode(), dl, NewVT, R, Amt));
19088     }
19089
19090   // Decompose 256-bit shifts into smaller 128-bit shifts.
19091   if (VT.is256BitVector()) {
19092     unsigned NumElems = VT.getVectorNumElements();
19093     MVT EltVT = VT.getVectorElementType();
19094     EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
19095
19096     // Extract the two vectors
19097     SDValue V1 = Extract128BitVector(R, 0, DAG, dl);
19098     SDValue V2 = Extract128BitVector(R, NumElems/2, DAG, dl);
19099
19100     // Recreate the shift amount vectors
19101     SDValue Amt1, Amt2;
19102     if (Amt.getOpcode() == ISD::BUILD_VECTOR) {
19103       // Constant shift amount
19104       SmallVector<SDValue, 4> Amt1Csts;
19105       SmallVector<SDValue, 4> Amt2Csts;
19106       for (unsigned i = 0; i != NumElems/2; ++i)
19107         Amt1Csts.push_back(Amt->getOperand(i));
19108       for (unsigned i = NumElems/2; i != NumElems; ++i)
19109         Amt2Csts.push_back(Amt->getOperand(i));
19110
19111       Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt1Csts);
19112       Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt2Csts);
19113     } else {
19114       // Variable shift amount
19115       Amt1 = Extract128BitVector(Amt, 0, DAG, dl);
19116       Amt2 = Extract128BitVector(Amt, NumElems/2, DAG, dl);
19117     }
19118
19119     // Issue new vector shifts for the smaller types
19120     V1 = DAG.getNode(Op.getOpcode(), dl, NewVT, V1, Amt1);
19121     V2 = DAG.getNode(Op.getOpcode(), dl, NewVT, V2, Amt2);
19122
19123     // Concatenate the result back
19124     return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, V1, V2);
19125   }
19126
19127   return SDValue();
19128 }
19129
/// Lower an arithmetic-with-overflow node (SADDO/UADDO/SSUBO/USUBO/SMULO/
/// UMULO) to an X86ISD arithmetic node that also produces flags, plus an
/// X86ISD::SETCC reading those flags. Both values are returned together as a
/// MERGE_VALUES node.
19130 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
19131   // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
19132   // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
19133   // looks for this combo and may remove the "setcc" instruction if the "setcc"
19134   // has only one use.
19135   SDNode *N = Op.getNode();
19136   SDValue LHS = N->getOperand(0);
19137   SDValue RHS = N->getOperand(1);
        // BaseOp is the X86ISD arithmetic opcode to emit; Cond is the X86
        // condition code that the SETCC tests on its flags result.
19138   unsigned BaseOp = 0;
19139   unsigned Cond = 0;
19140   SDLoc DL(Op);
19141   switch (Op.getOpcode()) {
19142   default: llvm_unreachable("Unknown ovf instruction!");
19143   case ISD::SADDO:
19144     // An add of one will be selected as an INC. Note that INC doesn't
19145     // set CF, so we can't do this for UADDO.
19146     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
19147       if (C->isOne()) {
19148         BaseOp = X86ISD::INC;
19149         Cond = X86::COND_O;
19150         break;
19151       }
19152     BaseOp = X86ISD::ADD;
19153     Cond = X86::COND_O;
19154     break;
19155   case ISD::UADDO:
19156     BaseOp = X86ISD::ADD;
19157     Cond = X86::COND_B;
19158     break;
19159   case ISD::SSUBO:
19160     // A subtract of one will be selected as a DEC. Note that DEC doesn't
19161     // set CF, so we can't do this for USUBO.
19162     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
19163       if (C->isOne()) {
19164         BaseOp = X86ISD::DEC;
19165         Cond = X86::COND_O;
19166         break;
19167       }
19168     BaseOp = X86ISD::SUB;
19169     Cond = X86::COND_O;
19170     break;
19171   case ISD::USUBO:
19172     BaseOp = X86ISD::SUB;
19173     Cond = X86::COND_B;
19174     break;
19175   case ISD::SMULO:
          // i8 multiplies use the dedicated 8-bit node.
19176     BaseOp = N->getValueType(0) == MVT::i8 ? X86ISD::SMUL8 : X86ISD::SMUL;
19177     Cond = X86::COND_O;
19178     break;
19179   case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
19180     if (N->getValueType(0) == MVT::i8) {
19181       BaseOp = X86ISD::UMUL8;
19182       Cond = X86::COND_O;
19183       break;
19184     }
          // The non-i8 UMUL node has three results (two of the value type, then
          // the i32 flags), so it is built here rather than by the common code
          // below; the SETCC reads result number 2.
19185     SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0),
19186                                  MVT::i32);
19187     SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS);
19188
19189     SDValue SetCC =
19190       DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
19191                   DAG.getConstant(X86::COND_O, MVT::i32),
19192                   SDValue(Sum.getNode(), 2));
19193
19194     return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
19195   }
19196   }
19197
19198   // Also sets EFLAGS.
19199   SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
19200   SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
19201
        // Result 1 of the arithmetic node is its flags output.
19202   SDValue SetCC =
19203     DAG.getNode(X86ISD::SETCC, DL, N->getValueType(1),
19204                 DAG.getConstant(Cond, MVT::i32),
19205                 SDValue(Sum.getNode(), 1));
19206
19207   return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
19208 }
19209
19210 // Sign extension of the low part of vector elements. This may be used either
19211 // when sign extend instructions are not available or if the vector element
19212 // sizes already match the sign-extended size. If the vector elements are in
19213 // their pre-extended size and sign extend instructions are available, that will
19214 // be handled by LowerSIGN_EXTEND.
      // Returns an empty SDValue without SSE2, for non-vector types, and for
      // vector types other than v4i32, v8i16, v8i32 and v16i16.
19215 SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
19216                                                   SelectionDAG &DAG) const {
19217   SDLoc dl(Op);
        // Operand 1 carries the type whose width is being sign-extended from.
19218   EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
19219   MVT VT = Op.getSimpleValueType();
19220
19221   if (!Subtarget->hasSSE2() || !VT.isVector())
19222     return SDValue();
19223
        // Number of high bits in each element that must become copies of the
        // sign bit of the low part; used as the shift count below.
19224   unsigned BitsDiff = VT.getScalarType().getSizeInBits() -
19225                       ExtraVT.getScalarType().getSizeInBits();
19226
19227   switch (VT.SimpleTy) {
19228     default: return SDValue();
19229     case MVT::v8i32:
19230     case MVT::v16i16:
19231       if (!Subtarget->hasFp256())
19232         return SDValue();
19233       if (!Subtarget->hasInt256()) {
19234         // needs to be split
              // The operand is cut into two 128-bit halves, the same operation is
              // re-emitted on each half with a correspondingly halved ExtraVT,
              // and the two results are concatenated.
19235         unsigned NumElems = VT.getVectorNumElements();
19236
19237         // Extract the LHS vectors
19238         SDValue LHS = Op.getOperand(0);
19239         SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
19240         SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
19241
19242         MVT EltVT = VT.getVectorElementType();
19243         EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
19244
19245         EVT ExtraEltVT = ExtraVT.getVectorElementType();
19246         unsigned ExtraNumElems = ExtraVT.getVectorNumElements();
19247         ExtraVT = EVT::getVectorVT(*DAG.getContext(), ExtraEltVT,
19248                                    ExtraNumElems/2);
19249         SDValue Extra = DAG.getValueType(ExtraVT);
19250
19251         LHS1 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, Extra);
19252         LHS2 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, Extra);
19253
19254         return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LHS1, LHS2);
19255       }
19256       // fall through
19257     case MVT::v4i32:
19258     case MVT::v8i16: {
19259       SDValue Op0 = Op.getOperand(0);
19260
19261       // This is a sign extension of some low part of vector elements without
19262       // changing the size of the vector elements themselves:
19263       // Shift-Left + Shift-Right-Algebraic.
19264       SDValue Shl = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Op0,
19265                                                BitsDiff, DAG);
19266       return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Shl, BitsDiff,
19267                                         DAG);
19268     }
19269   }
19270 }
19271
19272 /// Reports whether an atomic access of type \p MemType must be expanded with
19273 /// a double-width compare-exchange: a 64-bit type when not in 64-bit mode
19274 /// (cmpxchg8b), or a 128-bit type when cmpxchg16b is available. Any other
19275 /// width yields false, leaving the operation to become a __sync_* call.
19276 bool X86TargetLowering::needsCmpXchgNb(const Type *MemType) const {
19277   switch (MemType->getPrimitiveSizeInBits()) {
19278   case 64:
19279     return !Subtarget->is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b
19280   case 128:
19281     return Subtarget->hasCmpxchg16b();
19282   default:
19283     return false;
19284   }
19285 }
19286
// An atomic store is expanded at the IR level exactly when the type of the
// value being stored needs a double-width compare-exchange.
19287 bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
        const Type *StoredTy = SI->getValueOperand()->getType();
19288   return needsCmpXchgNb(StoredTy);
19289 }
19290
19291 // Atomic loads that need it end up as lock cmpxchg8b/16b.
19292 // FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b.
19293 bool X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
19294   Type *PtrTy = LI->getPointerOperand()->getType();
19295   return needsCmpXchgNb(cast<PointerType>(PtrTy)->getElementType());
19296 }
19297
19298 bool X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
19299   unsigned NativeWidth = Subtarget->is64Bit() ? 64 : 32;
19300   const Type *MemType = AI->getType();
19301
19302   // If the operand is too big, we must see if cmpxchg8/16b is available
19303   // and default to library calls otherwise.
19304   if (MemType->getPrimitiveSizeInBits() > NativeWidth)
19305     return needsCmpXchgNb(MemType);
19306
19307   AtomicRMWInst::BinOp Op = AI->getOperation();
19308   switch (Op) {
19309   default:
19310     llvm_unreachable("Unknown atomic operation");
19311   case AtomicRMWInst::Xchg:
19312   case AtomicRMWInst::Add:
19313   case AtomicRMWInst::Sub:
19314     // It's better to use xadd, xsub or xchg for these in all cases.
19315     return false;
19316   case AtomicRMWInst::Or:
19317   case AtomicRMWInst::And:
19318   case AtomicRMWInst::Xor:
19319     // If the atomicrmw's result isn't actually used, we can just add a "lock"
19320     // prefix to a normal instruction for these operations.
19321     return !AI->use_empty();
19322   case AtomicRMWInst::Nand:
19323   case AtomicRMWInst::Max:
19324   case AtomicRMWInst::Min:
19325   case AtomicRMWInst::UMax:
19326   case AtomicRMWInst::UMin:
19327     // These always require a non-trivial set of data operations on x86. We must
19328     // use a cmpxchg loop.
19329     return true;
19330   }
19331 }
19332
19333 static bool hasMFENCE(const X86Subtarget& Subtarget) {
19334   // mfence is used with SSE2, and on x86-64 even if no-sse2 was requested:
19335   // there is no reason to disable it when the target processor supports it.
19336   if (Subtarget.hasSSE2())
19337     return true;
19338   return Subtarget.is64Bit();
      }
19339
19340 LoadInst *
19341 X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
19342   unsigned NativeWidth = Subtarget->is64Bit() ? 64 : 32;
19343   const Type *MemType = AI->getType();
19344   // Accesses larger than the native width are turned into cmpxchg/libcalls, so
19345   // there is no benefit in turning such RMWs into loads, and it is actually
19346   // harmful as it introduces a mfence.
19347   if (MemType->getPrimitiveSizeInBits() > NativeWidth)
19348     return nullptr;
19349
19350   auto Builder = IRBuilder<>(AI);
19351   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
19352   auto SynchScope = AI->getSynchScope();
19353   // We must restrict the ordering to avoid generating loads with Release or
19354   // ReleaseAcquire orderings.
19355   auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
19356   auto Ptr = AI->getPointerOperand();
19357
19358   // Before the load we need a fence. Here is an example lifted from
19359   // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
19360   // is required:
19361   // Thread 0:
19362   //   x.store(1, relaxed);
19363   //   r1 = y.fetch_add(0, release);
19364   // Thread 1:
19365   //   y.fetch_add(42, acquire);
19366   //   r2 = x.load(relaxed);
19367   // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
19368   // lowered to just a load without a fence. A mfence flushes the store buffer,
19369   // making the optimization clearly correct.
19370   // FIXME: it is required if isAtLeastRelease(Order) but it is not clear
19371   // otherwise, we might be able to be more agressive on relaxed idempotent
19372   // rmw. In practice, they do not look useful, so we don't try to be
19373   // especially clever.
19374   if (SynchScope == SingleThread) {
19375     // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
19376     // the IR level, so we must wrap it in an intrinsic.
19377     return nullptr;
19378   } else if (hasMFENCE(*Subtarget)) {
19379     Function *MFence = llvm::Intrinsic::getDeclaration(M,
19380             Intrinsic::x86_sse2_mfence);
19381     Builder.CreateCall(MFence);
19382   } else {
19383     // FIXME: it might make sense to use a locked operation here but on a
19384     // different cache-line to prevent cache-line bouncing. In practice it
19385     // is probably a small win, and x86 processors without mfence are rare
19386     // enough that we do not bother.
19387     return nullptr;
19388   }
19389
19390   // Finally we can emit the atomic load.
19391   LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr,
19392           AI->getType()->getPrimitiveSizeInBits());
19393   Loaded->setAtomic(Order, SynchScope);
19394   AI->replaceAllUsesWith(Loaded);
19395   AI->eraseFromParent();
19396   return Loaded;
19397 }
19398
19399 static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget,
19400                                  SelectionDAG &DAG) {
19401   SDLoc dl(Op);
19402   AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
19403     cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
19404   SynchronizationScope FenceScope = static_cast<SynchronizationScope>(
19405     cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
19406
19407   // The only fence that needs an instruction is a sequentially-consistent
19408   // cross-thread fence.
19409   if (FenceOrdering == SequentiallyConsistent && FenceScope == CrossThread) {
19410     if (hasMFENCE(*Subtarget))
19411       return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
19412
19413     SDValue Chain = Op.getOperand(0);
19414     SDValue Zero = DAG.getConstant(0, MVT::i32);
19415     SDValue Ops[] = {
19416       DAG.getRegister(X86::ESP, MVT::i32), // Base
19417       DAG.getTargetConstant(1, MVT::i8),   // Scale
19418       DAG.getRegister(0, MVT::i32),        // Index
19419       DAG.getTargetConstant(0, MVT::i32),  // Disp
19420       DAG.getRegister(0, MVT::i32),        // Segment.
19421       Zero,
19422       Chain
19423     };
19424     SDNode *Res = DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops);
19425     return SDValue(Res, 0);
19426   }
19427
19428   // MEMBARRIER is a compiler barrier; it codegens to a no-op.
19429   return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
19430 }
19431
/// Lower an atomic compare-and-swap node to X86ISD::LCMPXCHG_DAG. Operand 2 of
/// \p Op is copied into the accumulator register matching the value width
/// (AL/AX/EAX/RAX) before the instruction and the result is copied back out of
/// it afterwards. The three results of \p Op are replaced in place through
/// ReplaceAllUsesOfValueWith, so the function itself returns an empty SDValue.
19432 static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget,
19433                              SelectionDAG &DAG) {
19434   MVT T = Op.getSimpleValueType();
19435   SDLoc DL(Op);
        // Reg: accumulator register for this width; size: operand size in bytes.
19436   unsigned Reg = 0;
19437   unsigned size = 0;
19438   switch(T.SimpleTy) {
19439   default: llvm_unreachable("Invalid value type!");
19440   case MVT::i8:  Reg = X86::AL;  size = 1; break;
19441   case MVT::i16: Reg = X86::AX;  size = 2; break;
19442   case MVT::i32: Reg = X86::EAX; size = 4; break;
19443   case MVT::i64:
19444     assert(Subtarget->is64Bit() && "Node not type legal!");
19445     Reg = X86::RAX; size = 8;
19446     break;
19447   }
        // Copy operand 2 into Reg; the copy produces a chain (value 0) and glue
        // (value 1) that tie it to the LCMPXCHG_DAG node below.
19448   SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
19449                                   Op.getOperand(2), SDValue());
19450   SDValue Ops[] = { cpIn.getValue(0),
19451                     Op.getOperand(1),
19452                     Op.getOperand(3),
19453                     DAG.getTargetConstant(size, MVT::i8),
19454                     cpIn.getValue(1) };
19455   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
19456   MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
19457   SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
19458                                            Ops, T, MMO);
19459
        // Read the value back from Reg, then read EFLAGS; the success result is
        // a SETCC on COND_E of those flags.
19460   SDValue cpOut =
19461     DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
19462   SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
19463                                       MVT::i32, cpOut.getValue(2));
19464   SDValue Success = DAG.getNode(X86ISD::SETCC, DL, Op->getValueType(1),
19465                                 DAG.getConstant(X86::COND_E, MVT::i8), EFLAGS);
19466
        // Rewire the original node's results: value 0 -> value read from Reg,
        // value 1 -> success flag, value 2 -> chain of the EFLAGS copy.
19467   DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut);
19468   DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
19469   DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1));
19470   return SDValue();
19471 }
19472
/// Custom-lower a BITCAST.
///
/// For a v2i32/v4i16/v8i8 source only the cast to f64 is handled: the source is
/// rebuilt as a vector with twice as many elements (upper half undef), bitcast
/// to v2f64, and element 0 is extracted. Any other destination type returns an
/// empty SDValue.
///
/// Otherwise i64 <-> vector and vector <-> vector casts are returned as-is,
/// and everything else returns an empty SDValue.
19473 static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget,
19474                             SelectionDAG &DAG) {
19475   MVT SrcVT = Op.getOperand(0).getSimpleValueType();
19476   MVT DstVT = Op.getSimpleValueType();
19477
19478   if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) {
19479     assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
19480     if (DstVT != MVT::f64)
19481       // This conversion needs to be expanded.
19482       return SDValue();
19483
19484     SDValue InVec = Op->getOperand(0);
19485     SDLoc dl(Op);
19486     unsigned NumElts = SrcVT.getVectorNumElements();
19487     EVT SVT = SrcVT.getVectorElementType();
19488
19489     // Widen the vector in input in the case of MVT::v2i32.
19490     // Example: from MVT::v2i32 to MVT::v4i32.
          // The same element-by-element widening is applied to v4i16 and v8i8.
19491     SmallVector<SDValue, 16> Elts;
19492     for (unsigned i = 0, e = NumElts; i != e; ++i)
19493       Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, InVec,
19494                                  DAG.getIntPtrConstant(i)));
19495
19496     // Explicitly mark the extra elements as Undef.
19497     SDValue Undef = DAG.getUNDEF(SVT);
19498     for (unsigned i = NumElts, e = NumElts * 2; i != e; ++i)
19499       Elts.push_back(Undef);
19500
19501     EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
19502     SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Elts);
19503     SDValue ToV2F64 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, BV);
          // The original 64 bits sit in the low half, i.e. element 0 of v2f64.
19504     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64,
19505                        DAG.getIntPtrConstant(0));
19506   }
19507
        // The remaining custom case is only expected on 64-bit targets that have
        // MMX but not SSE2, with an i64 or 64-bit vector destination.
19508   assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() &&
19509          Subtarget->hasMMX() && "Unexpected custom BITCAST");
19510   assert((DstVT == MVT::i64 ||
19511           (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
19512          "Unexpected custom BITCAST");
19513   // i64 <=> MMX conversions are Legal.
19514   if (SrcVT==MVT::i64 && DstVT.isVector())
19515     return Op;
19516   if (DstVT==MVT::i64 && SrcVT.isVector())
19517     return Op;
19518   // MMX <=> MMX conversions are Legal.
19519   if (SrcVT.isVector() && DstVT.isVector())
19520     return Op;
19521   // All other conversions need to be expanded.
19522   return SDValue();
19523 }
19524
/// Custom lowering for a vector ISD::CTPOP with 32- or 64-bit integer elements
/// in a 128- or 256-bit vector (both conditions are asserted). The population
/// count is built from vector shifts, ANDs, a SUB and ADDs; for i32 elements
/// each AND is performed on the v2i64/v4i64 bitcast of its operands and the
/// result is bitcast back. Subtarget is not read by this function.
19525 static SDValue LowerCTPOP(SDValue Op, const X86Subtarget *Subtarget,
19526                           SelectionDAG &DAG) {
19527   SDNode *Node = Op.getNode();
19528   SDLoc dl(Node);
19529
        // From here on Op is the vector being counted, not the CTPOP node.
19530   Op = Op.getOperand(0);
19531   EVT VT = Op.getValueType();
19532   assert((VT.is128BitVector() || VT.is256BitVector()) &&
19533          "CTPOP lowering only implemented for 128/256-bit wide vector types");
19534
19535   unsigned NumElts = VT.getVectorNumElements();
19536   EVT EltVT = VT.getVectorElementType();
19537   unsigned Len = EltVT.getSizeInBits();
19538
19539   // This is the vectorized version of the "best" algorithm from
19540   // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
19541   // with a minor tweak to use a series of adds + shifts instead of vector
19542   // multiplications. Implemented for the v2i64, v4i64, v4i32, v8i32 types:
19543   //
19544   //  v2i64, v4i64, v4i32 => Only profitable w/ popcnt disabled
19545   //  v8i32 => Always profitable
19546   //
19547   // FIXME: There are a couple of possible improvements:
19548   //
19549   // 1) Support for i8 and i16 vectors (needs measurements if popcnt enabled).
19550   // 2) Use strategies from http://wm.ite.pl/articles/sse-popcount.html
19551   //
        // NOTE(review): "Len % 8 == 0" is implied by Len being 32 or 64.
19552   assert(EltVT.isInteger() && (Len == 32 || Len == 64) && Len % 8 == 0 &&
19553          "CTPOP not implemented for this vector element type.");
19554
19555   // X86 canonicalizes ANDs to vXi64, generate the appropriate bitcasts to avoid
19556   // extra legalization.
19557   bool NeedsBitcast = EltVT == MVT::i32;
19558   MVT BitcastVT = VT.is256BitVector() ? MVT::v4i64 : MVT::v2i64;
19559
        // Per-element masks: the byte 0x55 / 0x33 / 0x0F repeated across all Len
        // bits of one element.
19560   SDValue Cst55 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x55)), EltVT);
19561   SDValue Cst33 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x33)), EltVT);
19562   SDValue Cst0F = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x0F)), EltVT);
19563
19564   // v = v - ((v >> 1) & 0x55555555...)
19565   SmallVector<SDValue, 8> Ones(NumElts, DAG.getConstant(1, EltVT));
19566   SDValue OnesV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ones);
19567   SDValue Srl = DAG.getNode(ISD::SRL, dl, VT, Op, OnesV);
19568   if (NeedsBitcast)
19569     Srl = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Srl);
19570
19571   SmallVector<SDValue, 8> Mask55(NumElts, Cst55);
19572   SDValue M55 = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask55);
19573   if (NeedsBitcast)
19574     M55 = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M55);
19575
        // The AND runs in whatever type Srl currently has (BitcastVT for i32
        // elements); the result is brought back to VT before the SUB.
19576   SDValue And = DAG.getNode(ISD::AND, dl, Srl.getValueType(), Srl, M55);
19577   if (VT != And.getValueType())
19578     And = DAG.getNode(ISD::BITCAST, dl, VT, And);
19579   SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, Op, And);
19580
19581   // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
19582   SmallVector<SDValue, 8> Mask33(NumElts, Cst33);
19583   SDValue M33 = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask33);
19584   SmallVector<SDValue, 8> Twos(NumElts, DAG.getConstant(2, EltVT));
19585   SDValue TwosV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Twos);
19586
19587   Srl = DAG.getNode(ISD::SRL, dl, VT, Sub, TwosV);
19588   if (NeedsBitcast) {
19589     Srl = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Srl);
19590     M33 = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M33);
19591     Sub = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Sub);
19592   }
19593
19594   SDValue AndRHS = DAG.getNode(ISD::AND, dl, M33.getValueType(), Srl, M33);
19595   SDValue AndLHS = DAG.getNode(ISD::AND, dl, M33.getValueType(), Sub, M33);
19596   if (VT != AndRHS.getValueType()) {
19597     AndRHS = DAG.getNode(ISD::BITCAST, dl, VT, AndRHS);
19598     AndLHS = DAG.getNode(ISD::BITCAST, dl, VT, AndLHS);
19599   }
19600   SDValue Add = DAG.getNode(ISD::ADD, dl, VT, AndLHS, AndRHS);
19601
19602   // v = (v + (v >> 4)) & 0x0F0F0F0F...
19603   SmallVector<SDValue, 8> Fours(NumElts, DAG.getConstant(4, EltVT));
19604   SDValue FoursV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Fours);
19605   Srl = DAG.getNode(ISD::SRL, dl, VT, Add, FoursV);
19606   Add = DAG.getNode(ISD::ADD, dl, VT, Add, Srl);
19607
19608   SmallVector<SDValue, 8> Mask0F(NumElts, Cst0F);
19609   SDValue M0F = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask0F);
19610   if (NeedsBitcast) {
19611     Add = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Add);
19612     M0F = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M0F);
19613   }
19614   And = DAG.getNode(ISD::AND, dl, M0F.getValueType(), Add, M0F);
19615   if (VT != And.getValueType())
19616     And = DAG.getNode(ISD::BITCAST, dl, VT, And);
19617
19618   // The algorithm mentioned above uses:
19619   //    v = (v * 0x01010101...) >> (Len - 8)
19620   //
19621   // Change it to use vector adds + vector shifts which yield faster results on
19622   // Haswell than using vector integer multiplication.
19623   //
19624   // For i32 elements:
19625   //    v = v + (v >> 8)
19626   //    v = v + (v >> 16)
19627   //
19628   // For i64 elements:
19629   //    v = v + (v >> 8)
19630   //    v = v + (v >> 16)
19631   //    v = v + (v >> 32)
19632   //
19633   Add = And;
19634   SmallVector<SDValue, 8> Csts;
        // Shift amounts 8, 16, ... up to and including Len/2.
19635   for (unsigned i = 8; i <= Len/2; i *= 2) {
19636     Csts.assign(NumElts, DAG.getConstant(i, EltVT));
19637     SDValue CstsV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Csts);
19638     Srl = DAG.getNode(ISD::SRL, dl, VT, Add, CstsV);
19639     Add = DAG.getNode(ISD::ADD, dl, VT, Add, Srl);
          // NOTE(review): looks redundant - the assign() at the top of the next
          // iteration presumably replaces the contents anyway; confirm.
19640     Csts.clear();
19641   }
19642
19643   // The result is on the least significant 6-bits on i32 and 7-bits on i64.
        // The final AND keeps only those bits (mask 0x3F or 0x7F per element).
19644   SDValue Cst3F = DAG.getConstant(APInt(Len, Len == 32 ? 0x3F : 0x7F), EltVT);
19645   SmallVector<SDValue, 8> Cst3FV(NumElts, Cst3F);
19646   SDValue M3F = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Cst3FV);
19647   if (NeedsBitcast) {
19648     Add = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Add);
19649     M3F = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M3F);
19650   }
19651   And = DAG.getNode(ISD::AND, dl, M3F.getValueType(), Add, M3F);
19652   if (VT != And.getValueType())
19653     And = DAG.getNode(ISD::BITCAST, dl, VT, And);
19654
19655   return And;
19656 }
19657
19658 static SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) {
19659   SDNode *Node = Op.getNode();
19660   SDLoc dl(Node);
19661   EVT T = Node->getValueType(0);
19662   SDValue negOp = DAG.getNode(ISD::SUB, dl, T,
19663                               DAG.getConstant(0, T), Node->getOperand(2));
19664   return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl,
19665                        cast<AtomicSDNode>(Node)->getMemoryVT(),
19666                        Node->getOperand(0),
19667                        Node->getOperand(1), negOp,
19668                        cast<AtomicSDNode>(Node)->getMemOperand(),
19669                        cast<AtomicSDNode>(Node)->getOrdering(),
19670                        cast<AtomicSDNode>(Node)->getSynchScope());
19671 }
19672
19673 static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
19674   SDNode *Node = Op.getNode();
19675   SDLoc dl(Node);
19676   EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
19677
19678   // Convert seq_cst store -> xchg
19679   // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
19680   // FIXME: On 32-bit, store -> fist or movq would be more efficient
19681   //        (The only way to get a 16-byte store is cmpxchg16b)
19682   // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
19683   if (cast<AtomicSDNode>(Node)->getOrdering() == SequentiallyConsistent ||
19684       !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
19685     SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
19686                                  cast<AtomicSDNode>(Node)->getMemoryVT(),
19687                                  Node->getOperand(0),
19688                                  Node->getOperand(1), Node->getOperand(2),
19689                                  cast<AtomicSDNode>(Node)->getMemOperand(),
19690                                  cast<AtomicSDNode>(Node)->getOrdering(),
19691                                  cast<AtomicSDNode>(Node)->getSynchScope());
19692     return Swap.getValue(1);
19693   }
19694   // Other atomic stores have a simple pattern.
19695   return Op;
19696 }
19697
19698 static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
19699   EVT VT = Op.getNode()->getSimpleValueType(0);
19700
19701   // Let legalize expand this if it isn't a legal type yet.
19702   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
19703     return SDValue();
19704
19705   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
19706
19707   unsigned Opc;
19708   bool ExtraOp = false;
19709   switch (Op.getOpcode()) {
19710   default: llvm_unreachable("Invalid code");
19711   case ISD::ADDC: Opc = X86ISD::ADD; break;
19712   case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break;
19713   case ISD::SUBC: Opc = X86ISD::SUB; break;
19714   case ISD::SUBE: Opc = X86ISD::SBB; ExtraOp = true; break;
19715   }
19716
19717   if (!ExtraOp)
19718     return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
19719                        Op.getOperand(1));
19720   return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
19721                      Op.getOperand(1), Op.getOperand(2));
19722 }
19723
19724 static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget *Subtarget,
19725                             SelectionDAG &DAG) {
19726   assert(Subtarget->isTargetDarwin() && Subtarget->is64Bit());
19727
19728   // For MacOSX, we want to call an alternative entry point: __sincos_stret,
19729   // which returns the values as { float, float } (in XMM0) or
19730   // { double, double } (which is returned in XMM0, XMM1).
19731   SDLoc dl(Op);
19732   SDValue Arg = Op.getOperand(0);
19733   EVT ArgVT = Arg.getValueType();
19734   Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
19735
19736   TargetLowering::ArgListTy Args;
19737   TargetLowering::ArgListEntry Entry;
19738
19739   Entry.Node = Arg;
19740   Entry.Ty = ArgTy;
19741   Entry.isSExt = false;
19742   Entry.isZExt = false;
19743   Args.push_back(Entry);
19744
19745   bool isF64 = ArgVT == MVT::f64;
19746   // Only optimize x86_64 for now. i386 is a bit messy. For f32,
19747   // the small struct {f32, f32} is returned in (eax, edx). For f64,
19748   // the results are returned via SRet in memory.
19749   const char *LibcallName =  isF64 ? "__sincos_stret" : "__sincosf_stret";
19750   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19751   SDValue Callee = DAG.getExternalSymbol(LibcallName, TLI.getPointerTy());
19752
19753   Type *RetTy = isF64
19754     ? (Type*)StructType::get(ArgTy, ArgTy, nullptr)
19755     : (Type*)VectorType::get(ArgTy, 4);
19756
19757   TargetLowering::CallLoweringInfo CLI(DAG);
19758   CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
19759     .setCallee(CallingConv::C, RetTy, Callee, std::move(Args), 0);
19760
19761   std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
19762
19763   if (isF64)
19764     // Returned in xmm0 and xmm1.
19765     return CallResult.first;
19766
19767   // Returned in bits 0:31 and 32:64 xmm0.
19768   SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
19769                                CallResult.first, DAG.getIntPtrConstant(0));
19770   SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
19771                                CallResult.first, DAG.getIntPtrConstant(1));
19772   SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
19773   return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
19774 }
19775
19776 /// LowerOperation - Provide custom lowering hooks for some operations.
19777 ///
/// Op is dispatched on Op.getOpcode() to the matching Lower* routine and that
/// routine's result is returned unchanged. Some routines additionally receive
/// Subtarget. An opcode with no case below reaches
/// llvm_unreachable("Should not custom lower this!").
19778 SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
19779   switch (Op.getOpcode()) {
19780   default: llvm_unreachable("Should not custom lower this!");
19781   case ISD::SIGN_EXTEND_INREG:  return LowerSIGN_EXTEND_INREG(Op,DAG);
19782   case ISD::ATOMIC_FENCE:       return LowerATOMIC_FENCE(Op, Subtarget, DAG);
19783   case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
19784     return LowerCMP_SWAP(Op, Subtarget, DAG);
19785   case ISD::CTPOP:              return LowerCTPOP(Op, Subtarget, DAG);
19786   case ISD::ATOMIC_LOAD_SUB:    return LowerLOAD_SUB(Op,DAG);
19787   case ISD::ATOMIC_STORE:       return LowerATOMIC_STORE(Op,DAG);
19788   case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
19789   case ISD::CONCAT_VECTORS:     return LowerCONCAT_VECTORS(Op, DAG);
19790   case ISD::VECTOR_SHUFFLE:     return LowerVECTOR_SHUFFLE(Op, DAG);
19791   case ISD::VSELECT:            return LowerVSELECT(Op, DAG);
19792   case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
19793   case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
19794   case ISD::EXTRACT_SUBVECTOR:  return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
19795   case ISD::INSERT_SUBVECTOR:   return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
19796   case ISD::SCALAR_TO_VECTOR:   return LowerSCALAR_TO_VECTOR(Op, DAG);
19797   case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
19798   case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
19799   case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
19800   case ISD::ExternalSymbol:     return LowerExternalSymbol(Op, DAG);
19801   case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
        // All three *_PARTS shifts share one routine.
19802   case ISD::SHL_PARTS:
19803   case ISD::SRA_PARTS:
19804   case ISD::SRL_PARTS:          return LowerShiftParts(Op, DAG);
19805   case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG);
19806   case ISD::UINT_TO_FP:         return LowerUINT_TO_FP(Op, DAG);
19807   case ISD::TRUNCATE:           return LowerTRUNCATE(Op, DAG);
19808   case ISD::ZERO_EXTEND:        return LowerZERO_EXTEND(Op, Subtarget, DAG);
19809   case ISD::SIGN_EXTEND:        return LowerSIGN_EXTEND(Op, Subtarget, DAG);
19810   case ISD::ANY_EXTEND:         return LowerANY_EXTEND(Op, Subtarget, DAG);
19811   case ISD::FP_TO_SINT:         return LowerFP_TO_SINT(Op, DAG);
19812   case ISD::FP_TO_UINT:         return LowerFP_TO_UINT(Op, DAG);
19813   case ISD::FP_EXTEND:          return LowerFP_EXTEND(Op, DAG);
19814   case ISD::LOAD:               return LowerExtendedLoad(Op, Subtarget, DAG);
19815   case ISD::FABS:
19816   case ISD::FNEG:               return LowerFABSorFNEG(Op, DAG);
19817   case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
19818   case ISD::FGETSIGN:           return LowerFGETSIGN(Op, DAG);
19819   case ISD::SETCC:              return LowerSETCC(Op, DAG);
19820   case ISD::SELECT:             return LowerSELECT(Op, DAG);
19821   case ISD::BRCOND:             return LowerBRCOND(Op, DAG);
19822   case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
19823   case ISD::VASTART:            return LowerVASTART(Op, DAG);
19824   case ISD::VAARG:              return LowerVAARG(Op, DAG);
19825   case ISD::VACOPY:             return LowerVACOPY(Op, Subtarget, DAG);
19826   case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, Subtarget, DAG);
        // INTRINSIC_VOID and INTRINSIC_W_CHAIN go through the same routine.
19827   case ISD::INTRINSIC_VOID:
19828   case ISD::INTRINSIC_W_CHAIN:  return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
19829   case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
19830   case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
19831   case ISD::FRAME_TO_ARGS_OFFSET:
19832                                 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
19833   case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
19834   case ISD::EH_RETURN:          return LowerEH_RETURN(Op, DAG);
19835   case ISD::EH_SJLJ_SETJMP:     return lowerEH_SJLJ_SETJMP(Op, DAG);
19836   case ISD::EH_SJLJ_LONGJMP:    return lowerEH_SJLJ_LONGJMP(Op, DAG);
19837   case ISD::INIT_TRAMPOLINE:    return LowerINIT_TRAMPOLINE(Op, DAG);
19838   case ISD::ADJUST_TRAMPOLINE:  return LowerADJUST_TRAMPOLINE(Op, DAG);
19839   case ISD::FLT_ROUNDS_:        return LowerFLT_ROUNDS_(Op, DAG);
19840   case ISD::CTLZ:               return LowerCTLZ(Op, DAG);
19841   case ISD::CTLZ_ZERO_UNDEF:    return LowerCTLZ_ZERO_UNDEF(Op, DAG);
19842   case ISD::CTTZ:               return LowerCTTZ(Op, DAG);
19843   case ISD::MUL:                return LowerMUL(Op, Subtarget, DAG);
19844   case ISD::UMUL_LOHI:
19845   case ISD::SMUL_LOHI:          return LowerMUL_LOHI(Op, Subtarget, DAG);
19846   case ISD::SRA:
19847   case ISD::SRL:
19848   case ISD::SHL:                return LowerShift(Op, Subtarget, DAG);
        // Every arithmetic-with-overflow opcode is handled by LowerXALUO.
19849   case ISD::SADDO:
19850   case ISD::UADDO:
19851   case ISD::SSUBO:
19852   case ISD::USUBO:
19853   case ISD::SMULO:
19854   case ISD::UMULO:              return LowerXALUO(Op, DAG);
19855   case ISD::READCYCLECOUNTER:   return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
19856   case ISD::BITCAST:            return LowerBITCAST(Op, Subtarget, DAG);
19857   case ISD::ADDC:
19858   case ISD::ADDE:
19859   case ISD::SUBC:
19860   case ISD::SUBE:               return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
19861   case ISD::ADD:                return LowerADD(Op, DAG);
19862   case ISD::SUB:                return LowerSUB(Op, DAG);
19863   case ISD::FSINCOS:            return LowerFSINCOS(Op, Subtarget, DAG);
19864   }
19865 }
19866
19867 /// ReplaceNodeResults - Replace a node with an illegal result type
19868 /// with a new node built out of custom code.
///
/// \param N        the node to replace; handling is selected by its opcode.
/// \param Results  receives the replacement value(s). Several paths return
///                 without pushing anything.
/// \param DAG      the DAG in which the replacement nodes are created.
///
/// An opcode with no case below reaches llvm_unreachable.
19869 void X86TargetLowering::ReplaceNodeResults(SDNode *N,
19870                                            SmallVectorImpl<SDValue>&Results,
19871                                            SelectionDAG &DAG) const {
19872   SDLoc dl(N);
19873   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19874   switch (N->getOpcode()) {
19875   default:
19876     llvm_unreachable("Do not know how to custom type legalize this operation!");
19877   // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
19878   case X86ISD::FMINC:
19879   case X86ISD::FMIN:
19880   case X86ISD::FMAXC:
19881   case X86ISD::FMAX: {
19882     EVT VT = N->getValueType(0);
19883     if (VT != MVT::v2f32)
19884       llvm_unreachable("Unexpected type (!= v2f32) on FMIN/FMAX.");
          // Each operand is concatenated with an undef v2f32, and the same
          // opcode is re-issued on the resulting v4f32 values.
19885     SDValue UNDEF = DAG.getUNDEF(VT);
19886     SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
19887                               N->getOperand(0), UNDEF);
19888     SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
19889                               N->getOperand(1), UNDEF);
19890     Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
19891     return;
19892   }
19893   case ISD::SIGN_EXTEND_INREG:
19894   case ISD::ADDC:
19895   case ISD::ADDE:
19896   case ISD::SUBC:
19897   case ISD::SUBE:
19898     // We don't want to expand or promote these.
19899     return;
19900   case ISD::SDIV:
19901   case ISD::UDIV:
19902   case ISD::SREM:
19903   case ISD::UREM:
19904   case ISD::SDIVREM:
19905   case ISD::UDIVREM: {
          // Delegated to LowerWin64_i128OP; its result is the single
          // replacement value.
19906     SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
19907     Results.push_back(V);
19908     return;
19909   }
19910   case ISD::FP_TO_SINT:
19911   case ISD::FP_TO_UINT: {
19912     bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
19913
          // An unsigned conversion is only handled when isIntegerTypeFTOL()
          // accepts the result type; otherwise nothing is pushed.
19914     if (!IsSigned && !isIntegerTypeFTOL(SDValue(N, 0).getValueType()))
19915       return;
19916
19917     std::pair<SDValue,SDValue> Vals =
19918         FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
19919     SDValue FIST = Vals.first, StackSlot = Vals.second;
          // Nothing is pushed when the helper produced no node.
19920     if (FIST.getNode()) {
19921       EVT VT = N->getValueType(0);
19922       // Return a load from the stack slot.
19923       if (StackSlot.getNode())
19924         Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot,
19925                                       MachinePointerInfo(),
19926                                       false, false, false, 0));
19927       else
19928         Results.push_back(FIST);
19929     }
19930     return;
19931   }
19932   case ISD::UINT_TO_FP: {
19933     assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
          // Only v2i32 -> v2f32 is handled; anything else pushes nothing.
19934     if (N->getOperand(0).getValueType() != MVT::v2i32 ||
19935         N->getValueType(0) != MVT::v2f32)
19936       return;
19937     SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64,
19938                                  N->getOperand(0));
          // 0x4330000000000000 is the bit pattern of the double 2^52. OR-ing
          // the zero-extended 32-bit value into it yields 2^52 + value as a
          // double, and subtracting 2^52 leaves the value itself.
19939     SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
19940                                      MVT::f64);
19941     SDValue VBias = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2f64, Bias, Bias);
19942     SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
19943                              DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, VBias));
19944     Or = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or);
19945     SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
          // Note that the pushed value has type v4f32, not N's v2f32.
19946     Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
19947     return;
19948   }
19949   case ISD::FP_ROUND: {
          // Only handled once the operand type is legal; the pushed value is
          // an X86ISD::VFPROUND of type v4f32.
19950     if (!TLI.isTypeLegal(N->getOperand(0).getValueType()))
19951         return;
19952     SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
19953     Results.push_back(V);
19954     return;
19955   }
19956   case ISD::INTRINSIC_W_CHAIN: {
          // Operand 1 holds the intrinsic ID. Every case of the inner switch
          // returns (or is unreachable), so control never continues into the
          // READCYCLECOUNTER case that follows.
19957     unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
19958     switch (IntNo) {
19959     default : llvm_unreachable("Do not know how to custom type "
19960                                "legalize this intrinsic operation!");
19961     case Intrinsic::x86_rdtsc:
19962       return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
19963                                      Results);
19964     case Intrinsic::x86_rdtscp:
19965       return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget,
19966                                      Results);
19967     case Intrinsic::x86_rdpmc:
19968       return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results);
19969     }
19970   }
19971   case ISD::READCYCLECOUNTER: {
19972     return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
19973                                    Results);
19974   }
19975   case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
19976     EVT T = N->getValueType(0);
19977     assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
          // i128 is split into two i64 halves held in 64-bit registers, i64
          // into two i32 halves held in 32-bit registers.
19978     bool Regs64bit = T == MVT::i128;
19979     EVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
          // Operand 2: low half -> RAX/EAX, high half -> RDX/EDX. The copies
          // are linked through their chain and glue results.
19980     SDValue cpInL, cpInH;
19981     cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
19982                         DAG.getConstant(0, HalfT));
19983     cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
19984                         DAG.getConstant(1, HalfT));
19985     cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
19986                              Regs64bit ? X86::RAX : X86::EAX,
19987                              cpInL, SDValue());
19988     cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
19989                              Regs64bit ? X86::RDX : X86::EDX,
19990                              cpInH, cpInL.getValue(1));
          // Operand 3: low half -> RBX/EBX, high half -> RCX/ECX.
19991     SDValue swapInL, swapInH;
19992     swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
19993                           DAG.getConstant(0, HalfT));
19994     swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
19995                           DAG.getConstant(1, HalfT));
19996     swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl,
19997                                Regs64bit ? X86::RBX : X86::EBX,
19998                                swapInL, cpInH.getValue(1));
19999     swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl,
20000                                Regs64bit ? X86::RCX : X86::ECX,
20001                                swapInH, swapInL.getValue(1));
          // The compare-exchange node takes the last copy's chain, operand 1
          // of N, and the last copy's glue.
20002     SDValue Ops[] = { swapInH.getValue(0),
20003                       N->getOperand(1),
20004                       swapInH.getValue(1) };
20005     SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
20006     MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
20007     unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_DAG :
20008                                   X86ISD::LCMPXCHG8_DAG;
20009     SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
          // Read the two result halves back from RAX/EAX and RDX/EDX.
20010     SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
20011                                         Regs64bit ? X86::RAX : X86::EAX,
20012                                         HalfT, Result.getValue(1));
20013     SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
20014                                         Regs64bit ? X86::RDX : X86::EDX,
20015                                         HalfT, cpOutL.getValue(2));
20016     SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
20017
          // The success flag is SETCC(COND_E) on EFLAGS, produced as i8 and
          // then zero-extended or truncated to the type of N's result 1.
20018     SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
20019                                         MVT::i32, cpOutH.getValue(2));
20020     SDValue Success =
20021         DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
20022                     DAG.getConstant(X86::COND_E, MVT::i8), EFLAGS);
20023     Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
20024
          // Three replacement values: the rebuilt pair, the success flag, and
          // result 1 of the EFLAGS copy.
20025     Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
20026     Results.push_back(Success);
20027     Results.push_back(EFLAGS.getValue(1));
20028     return;
20029   }
20030   case ISD::ATOMIC_SWAP:
20031   case ISD::ATOMIC_LOAD_ADD:
20032   case ISD::ATOMIC_LOAD_SUB:
20033   case ISD::ATOMIC_LOAD_AND:
20034   case ISD::ATOMIC_LOAD_OR:
20035   case ISD::ATOMIC_LOAD_XOR:
20036   case ISD::ATOMIC_LOAD_NAND:
20037   case ISD::ATOMIC_LOAD_MIN:
20038   case ISD::ATOMIC_LOAD_MAX:
20039   case ISD::ATOMIC_LOAD_UMIN:
20040   case ISD::ATOMIC_LOAD_UMAX:
20041   case ISD::ATOMIC_LOAD: {
20042     // Delegate to generic TypeLegalization. Situations we can really handle
20043     // should have already been dealt with by AtomicExpandPass.cpp.
          // The break leaves the switch with Results untouched.
20044     break;
20045   }
20046   case ISD::BITCAST: {
20047     assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
20048     EVT DstVT = N->getValueType(0);
20049     EVT SrcVT = N->getOperand(0)->getValueType(0);
20050
          // Only f64 -> v2i32 / v4i16 / v8i8 is handled here.
20051     if (SrcVT != MVT::f64 ||
20052         (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8))
20053       return;
20054
          // Put the f64 into a v2f64 and bitcast that to an integer vector
          // with twice as many elements as DstVT.
20055     unsigned NumElts = DstVT.getVectorNumElements();
20056     EVT SVT = DstVT.getVectorElementType();
20057     EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
20058     SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
20059                                    MVT::v2f64, N->getOperand(0));
20060     SDValue ToVecInt = DAG.getNode(ISD::BITCAST, dl, WiderVT, Expanded);
20061
20062     if (ExperimentalVectorWideningLegalization) {
20063       // If we are legalizing vectors by widening, we already have the desired
20064       // legal vector type, just return it.
20065       Results.push_back(ToVecInt);
20066       return;
20067     }
20068
          // Otherwise rebuild a DstVT vector from the first NumElts elements
          // of the wide vector.
20069     SmallVector<SDValue, 8> Elts;
20070     for (unsigned i = 0, e = NumElts; i != e; ++i)
20071       Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT,
20072                                    ToVecInt, DAG.getIntPtrConstant(i)));
20073
20074     Results.push_back(DAG.getNode(ISD::BUILD_VECTOR, dl, DstVT, Elts));
20075   }
20076   }
20077 }
20078
20079 const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
20080   switch (Opcode) {
20081   default: return nullptr;
20082   case X86ISD::BSF:                return "X86ISD::BSF";
20083   case X86ISD::BSR:                return "X86ISD::BSR";
20084   case X86ISD::SHLD:               return "X86ISD::SHLD";
20085   case X86ISD::SHRD:               return "X86ISD::SHRD";
20086   case X86ISD::FAND:               return "X86ISD::FAND";
20087   case X86ISD::FANDN:              return "X86ISD::FANDN";
20088   case X86ISD::FOR:                return "X86ISD::FOR";
20089   case X86ISD::FXOR:               return "X86ISD::FXOR";
20090   case X86ISD::FSRL:               return "X86ISD::FSRL";
20091   case X86ISD::FILD:               return "X86ISD::FILD";
20092   case X86ISD::FILD_FLAG:          return "X86ISD::FILD_FLAG";
20093   case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
20094   case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
20095   case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
20096   case X86ISD::FLD:                return "X86ISD::FLD";
20097   case X86ISD::FST:                return "X86ISD::FST";
20098   case X86ISD::CALL:               return "X86ISD::CALL";
20099   case X86ISD::RDTSC_DAG:          return "X86ISD::RDTSC_DAG";
20100   case X86ISD::RDTSCP_DAG:         return "X86ISD::RDTSCP_DAG";
20101   case X86ISD::RDPMC_DAG:          return "X86ISD::RDPMC_DAG";
20102   case X86ISD::BT:                 return "X86ISD::BT";
20103   case X86ISD::CMP:                return "X86ISD::CMP";
20104   case X86ISD::COMI:               return "X86ISD::COMI";
20105   case X86ISD::UCOMI:              return "X86ISD::UCOMI";
20106   case X86ISD::CMPM:               return "X86ISD::CMPM";
20107   case X86ISD::CMPMU:              return "X86ISD::CMPMU";
20108   case X86ISD::SETCC:              return "X86ISD::SETCC";
20109   case X86ISD::SETCC_CARRY:        return "X86ISD::SETCC_CARRY";
20110   case X86ISD::FSETCC:             return "X86ISD::FSETCC";
20111   case X86ISD::CMOV:               return "X86ISD::CMOV";
20112   case X86ISD::BRCOND:             return "X86ISD::BRCOND";
20113   case X86ISD::RET_FLAG:           return "X86ISD::RET_FLAG";
20114   case X86ISD::REP_STOS:           return "X86ISD::REP_STOS";
20115   case X86ISD::REP_MOVS:           return "X86ISD::REP_MOVS";
20116   case X86ISD::GlobalBaseReg:      return "X86ISD::GlobalBaseReg";
20117   case X86ISD::Wrapper:            return "X86ISD::Wrapper";
20118   case X86ISD::WrapperRIP:         return "X86ISD::WrapperRIP";
20119   case X86ISD::PEXTRB:             return "X86ISD::PEXTRB";
20120   case X86ISD::PEXTRW:             return "X86ISD::PEXTRW";
20121   case X86ISD::INSERTPS:           return "X86ISD::INSERTPS";
20122   case X86ISD::PINSRB:             return "X86ISD::PINSRB";
20123   case X86ISD::PINSRW:             return "X86ISD::PINSRW";
20124   case X86ISD::PSHUFB:             return "X86ISD::PSHUFB";
20125   case X86ISD::ANDNP:              return "X86ISD::ANDNP";
20126   case X86ISD::PSIGN:              return "X86ISD::PSIGN";
20127   case X86ISD::BLENDI:             return "X86ISD::BLENDI";
20128   case X86ISD::SHRUNKBLEND:        return "X86ISD::SHRUNKBLEND";
20129   case X86ISD::SUBUS:              return "X86ISD::SUBUS";
20130   case X86ISD::HADD:               return "X86ISD::HADD";
20131   case X86ISD::HSUB:               return "X86ISD::HSUB";
20132   case X86ISD::FHADD:              return "X86ISD::FHADD";
20133   case X86ISD::FHSUB:              return "X86ISD::FHSUB";
20134   case X86ISD::UMAX:               return "X86ISD::UMAX";
20135   case X86ISD::UMIN:               return "X86ISD::UMIN";
20136   case X86ISD::SMAX:               return "X86ISD::SMAX";
20137   case X86ISD::SMIN:               return "X86ISD::SMIN";
20138   case X86ISD::FMAX:               return "X86ISD::FMAX";
20139   case X86ISD::FMIN:               return "X86ISD::FMIN";
20140   case X86ISD::FMAXC:              return "X86ISD::FMAXC";
20141   case X86ISD::FMINC:              return "X86ISD::FMINC";
20142   case X86ISD::FRSQRT:             return "X86ISD::FRSQRT";
20143   case X86ISD::FRCP:               return "X86ISD::FRCP";
20144   case X86ISD::TLSADDR:            return "X86ISD::TLSADDR";
20145   case X86ISD::TLSBASEADDR:        return "X86ISD::TLSBASEADDR";
20146   case X86ISD::TLSCALL:            return "X86ISD::TLSCALL";
20147   case X86ISD::EH_SJLJ_SETJMP:     return "X86ISD::EH_SJLJ_SETJMP";
20148   case X86ISD::EH_SJLJ_LONGJMP:    return "X86ISD::EH_SJLJ_LONGJMP";
20149   case X86ISD::EH_RETURN:          return "X86ISD::EH_RETURN";
20150   case X86ISD::TC_RETURN:          return "X86ISD::TC_RETURN";
20151   case X86ISD::FNSTCW16m:          return "X86ISD::FNSTCW16m";
20152   case X86ISD::FNSTSW16r:          return "X86ISD::FNSTSW16r";
20153   case X86ISD::LCMPXCHG_DAG:       return "X86ISD::LCMPXCHG_DAG";
20154   case X86ISD::LCMPXCHG8_DAG:      return "X86ISD::LCMPXCHG8_DAG";
20155   case X86ISD::LCMPXCHG16_DAG:     return "X86ISD::LCMPXCHG16_DAG";
20156   case X86ISD::VZEXT_MOVL:         return "X86ISD::VZEXT_MOVL";
20157   case X86ISD::VZEXT_LOAD:         return "X86ISD::VZEXT_LOAD";
20158   case X86ISD::VZEXT:              return "X86ISD::VZEXT";
20159   case X86ISD::VSEXT:              return "X86ISD::VSEXT";
20160   case X86ISD::VTRUNC:             return "X86ISD::VTRUNC";
20161   case X86ISD::VTRUNCM:            return "X86ISD::VTRUNCM";
20162   case X86ISD::VINSERT:            return "X86ISD::VINSERT";
20163   case X86ISD::VFPEXT:             return "X86ISD::VFPEXT";
20164   case X86ISD::VFPROUND:           return "X86ISD::VFPROUND";
20165   case X86ISD::VSHLDQ:             return "X86ISD::VSHLDQ";
20166   case X86ISD::VSRLDQ:             return "X86ISD::VSRLDQ";
20167   case X86ISD::VSHL:               return "X86ISD::VSHL";
20168   case X86ISD::VSRL:               return "X86ISD::VSRL";
20169   case X86ISD::VSRA:               return "X86ISD::VSRA";
20170   case X86ISD::VSHLI:              return "X86ISD::VSHLI";
20171   case X86ISD::VSRLI:              return "X86ISD::VSRLI";
20172   case X86ISD::VSRAI:              return "X86ISD::VSRAI";
20173   case X86ISD::CMPP:               return "X86ISD::CMPP";
20174   case X86ISD::PCMPEQ:             return "X86ISD::PCMPEQ";
20175   case X86ISD::PCMPGT:             return "X86ISD::PCMPGT";
20176   case X86ISD::PCMPEQM:            return "X86ISD::PCMPEQM";
20177   case X86ISD::PCMPGTM:            return "X86ISD::PCMPGTM";
20178   case X86ISD::ADD:                return "X86ISD::ADD";
20179   case X86ISD::SUB:                return "X86ISD::SUB";
20180   case X86ISD::ADC:                return "X86ISD::ADC";
20181   case X86ISD::SBB:                return "X86ISD::SBB";
20182   case X86ISD::SMUL:               return "X86ISD::SMUL";
20183   case X86ISD::UMUL:               return "X86ISD::UMUL";
20184   case X86ISD::SMUL8:              return "X86ISD::SMUL8";
20185   case X86ISD::UMUL8:              return "X86ISD::UMUL8";
20186   case X86ISD::SDIVREM8_SEXT_HREG: return "X86ISD::SDIVREM8_SEXT_HREG";
20187   case X86ISD::UDIVREM8_ZEXT_HREG: return "X86ISD::UDIVREM8_ZEXT_HREG";
20188   case X86ISD::INC:                return "X86ISD::INC";
20189   case X86ISD::DEC:                return "X86ISD::DEC";
20190   case X86ISD::OR:                 return "X86ISD::OR";
20191   case X86ISD::XOR:                return "X86ISD::XOR";
20192   case X86ISD::AND:                return "X86ISD::AND";
20193   case X86ISD::BEXTR:              return "X86ISD::BEXTR";
20194   case X86ISD::MUL_IMM:            return "X86ISD::MUL_IMM";
20195   case X86ISD::PTEST:              return "X86ISD::PTEST";
20196   case X86ISD::TESTP:              return "X86ISD::TESTP";
20197   case X86ISD::TESTM:              return "X86ISD::TESTM";
20198   case X86ISD::TESTNM:             return "X86ISD::TESTNM";
20199   case X86ISD::KORTEST:            return "X86ISD::KORTEST";
20200   case X86ISD::PACKSS:             return "X86ISD::PACKSS";
20201   case X86ISD::PACKUS:             return "X86ISD::PACKUS";
20202   case X86ISD::PALIGNR:            return "X86ISD::PALIGNR";
20203   case X86ISD::VALIGN:             return "X86ISD::VALIGN";
20204   case X86ISD::PSHUFD:             return "X86ISD::PSHUFD";
20205   case X86ISD::PSHUFHW:            return "X86ISD::PSHUFHW";
20206   case X86ISD::PSHUFLW:            return "X86ISD::PSHUFLW";
20207   case X86ISD::SHUFP:              return "X86ISD::SHUFP";
20208   case X86ISD::MOVLHPS:            return "X86ISD::MOVLHPS";
20209   case X86ISD::MOVLHPD:            return "X86ISD::MOVLHPD";
20210   case X86ISD::MOVHLPS:            return "X86ISD::MOVHLPS";
20211   case X86ISD::MOVLPS:             return "X86ISD::MOVLPS";
20212   case X86ISD::MOVLPD:             return "X86ISD::MOVLPD";
20213   case X86ISD::MOVDDUP:            return "X86ISD::MOVDDUP";
20214   case X86ISD::MOVSHDUP:           return "X86ISD::MOVSHDUP";
20215   case X86ISD::MOVSLDUP:           return "X86ISD::MOVSLDUP";
20216   case X86ISD::MOVSD:              return "X86ISD::MOVSD";
20217   case X86ISD::MOVSS:              return "X86ISD::MOVSS";
20218   case X86ISD::UNPCKL:             return "X86ISD::UNPCKL";
20219   case X86ISD::UNPCKH:             return "X86ISD::UNPCKH";
20220   case X86ISD::VBROADCAST:         return "X86ISD::VBROADCAST";
20221   case X86ISD::VBROADCASTM:        return "X86ISD::VBROADCASTM";
20222   case X86ISD::VEXTRACT:           return "X86ISD::VEXTRACT";
20223   case X86ISD::VPERMILPI:          return "X86ISD::VPERMILPI";
20224   case X86ISD::VPERM2X128:         return "X86ISD::VPERM2X128";
20225   case X86ISD::VPERMV:             return "X86ISD::VPERMV";
20226   case X86ISD::VPERMV3:            return "X86ISD::VPERMV3";
20227   case X86ISD::VPERMIV3:           return "X86ISD::VPERMIV3";
20228   case X86ISD::VPERMI:             return "X86ISD::VPERMI";
20229   case X86ISD::PMULUDQ:            return "X86ISD::PMULUDQ";
20230   case X86ISD::PMULDQ:             return "X86ISD::PMULDQ";
20231   case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
20232   case X86ISD::VAARG_64:           return "X86ISD::VAARG_64";
20233   case X86ISD::WIN_ALLOCA:         return "X86ISD::WIN_ALLOCA";
20234   case X86ISD::MEMBARRIER:         return "X86ISD::MEMBARRIER";
20235   case X86ISD::SEG_ALLOCA:         return "X86ISD::SEG_ALLOCA";
20236   case X86ISD::WIN_FTOL:           return "X86ISD::WIN_FTOL";
20237   case X86ISD::SAHF:               return "X86ISD::SAHF";
20238   case X86ISD::RDRAND:             return "X86ISD::RDRAND";
20239   case X86ISD::RDSEED:             return "X86ISD::RDSEED";
20240   case X86ISD::FMADD:              return "X86ISD::FMADD";
20241   case X86ISD::FMSUB:              return "X86ISD::FMSUB";
20242   case X86ISD::FNMADD:             return "X86ISD::FNMADD";
20243   case X86ISD::FNMSUB:             return "X86ISD::FNMSUB";
20244   case X86ISD::FMADDSUB:           return "X86ISD::FMADDSUB";
20245   case X86ISD::FMSUBADD:           return "X86ISD::FMSUBADD";
20246   case X86ISD::PCMPESTRI:          return "X86ISD::PCMPESTRI";
20247   case X86ISD::PCMPISTRI:          return "X86ISD::PCMPISTRI";
20248   case X86ISD::XTEST:              return "X86ISD::XTEST";
20249   case X86ISD::COMPRESS:           return "X86ISD::COMPRESS";
20250   case X86ISD::EXPAND:             return "X86ISD::EXPAND";
20251   case X86ISD::SELECT:             return "X86ISD::SELECT";
20252   case X86ISD::ADDSUB:             return "X86ISD::ADDSUB";
20253   case X86ISD::RCP28:              return "X86ISD::RCP28";
20254   case X86ISD::RSQRT28:            return "X86ISD::RSQRT28";
20255   }
20256 }
20257
20258 // isLegalAddressingMode - Return true if the addressing mode represented
20259 // by AM is legal for this target, for a load/store of the specified type.
20260 bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM,
20261                                               Type *Ty) const {
20262   // X86 supports extremely general addressing modes.
20263   CodeModel::Model M = getTargetMachine().getCodeModel();
20264   Reloc::Model R = getTargetMachine().getRelocationModel();
20265
20266   // X86 allows a sign-extended 32-bit immediate field as a displacement.
20267   if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
20268     return false;
20269
20270   if (AM.BaseGV) {
20271     unsigned GVFlags =
20272       Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine());
20273
20274     // If a reference to this global requires an extra load, we can't fold it.
20275     if (isGlobalStubReference(GVFlags))
20276       return false;
20277
20278     // If BaseGV requires a register for the PIC base, we cannot also have a
20279     // BaseReg specified.
20280     if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
20281       return false;
20282
20283     // If lower 4G is not available, then we must use rip-relative addressing.
20284     if ((M != CodeModel::Small || R != Reloc::Static) &&
20285         Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1))
20286       return false;
20287   }
20288
20289   switch (AM.Scale) {
20290   case 0:
20291   case 1:
20292   case 2:
20293   case 4:
20294   case 8:
20295     // These scales always work.
20296     break;
20297   case 3:
20298   case 5:
20299   case 9:
20300     // These scales are formed with basereg+scalereg.  Only accept if there is
20301     // no basereg yet.
20302     if (AM.HasBaseReg)
20303       return false;
20304     break;
20305   default:  // Other stuff never works.
20306     return false;
20307   }
20308
20309   return true;
20310 }
20311
20312 bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
20313   unsigned Bits = Ty->getScalarSizeInBits();
20314
20315   // 8-bit shifts are always expensive, but versions with a scalar amount aren't
20316   // particularly cheaper than those without.
20317   if (Bits == 8)
20318     return false;
20319
20320   // On AVX2 there are new vpsllv[dq] instructions (and other shifts), that make
20321   // variable shifts just as cheap as scalar ones.
20322   if (Subtarget->hasInt256() && (Bits == 32 || Bits == 64))
20323     return false;
20324
20325   // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
20326   // fully general vector.
20327   return true;
20328 }
20329
20330 bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
20331   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
20332     return false;
20333   unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
20334   unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
20335   return NumBits1 > NumBits2;
20336 }
20337
20338 bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
20339   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
20340     return false;
20341
20342   if (!isTypeLegal(EVT::getEVT(Ty1)))
20343     return false;
20344
20345   assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
20346
20347   // Assuming the caller doesn't have a zeroext or signext return parameter,
20348   // truncation all the way down to i1 is valid.
20349   return true;
20350 }
20351
20352 bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
20353   return isInt<32>(Imm);
20354 }
20355
20356 bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
20357   // Can also use sub to handle negated immediates.
20358   return isInt<32>(Imm);
20359 }
20360
20361 bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
20362   if (!VT1.isInteger() || !VT2.isInteger())
20363     return false;
20364   unsigned NumBits1 = VT1.getSizeInBits();
20365   unsigned NumBits2 = VT2.getSizeInBits();
20366   return NumBits1 > NumBits2;
20367 }
20368
20369 bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
20370   // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
20371   return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit();
20372 }
20373
20374 bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
20375   // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
20376   return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit();
20377 }
20378
20379 bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
20380   EVT VT1 = Val.getValueType();
20381   if (isZExtFree(VT1, VT2))
20382     return true;
20383
20384   if (Val.getOpcode() != ISD::LOAD)
20385     return false;
20386
20387   if (!VT1.isSimple() || !VT1.isInteger() ||
20388       !VT2.isSimple() || !VT2.isInteger())
20389     return false;
20390
20391   switch (VT1.getSimpleVT().SimpleTy) {
20392   default: break;
20393   case MVT::i8:
20394   case MVT::i16:
20395   case MVT::i32:
20396     // X86 has 8, 16, and 32-bit zero-extending loads.
20397     return true;
20398   }
20399
20400   return false;
20401 }
20402
20403 bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const { return true; }
20404
20405 bool
20406 X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
20407   if (!(Subtarget->hasFMA() || Subtarget->hasFMA4()))
20408     return false;
20409
20410   VT = VT.getScalarType();
20411
20412   if (!VT.isSimple())
20413     return false;
20414
20415   switch (VT.getSimpleVT().SimpleTy) {
20416   case MVT::f32:
20417   case MVT::f64:
20418     return true;
20419   default:
20420     break;
20421   }
20422
20423   return false;
20424 }
20425
20426 bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
20427   // i16 instructions are longer (0x66 prefix) and potentially slower.
20428   return !(VT1 == MVT::i32 && VT2 == MVT::i16);
20429 }
20430
20431 /// isShuffleMaskLegal - Targets can use this to indicate that they only
20432 /// support *some* VECTOR_SHUFFLE operations, those with specific masks.
20433 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
20434 /// are assumed to be legal.
20435 bool
20436 X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
20437                                       EVT VT) const {
20438   if (!VT.isSimple())
20439     return false;
20440
20441   MVT SVT = VT.getSimpleVT();
20442
20443   // Very little shuffling can be done for 64-bit vectors right now.
20444   if (VT.getSizeInBits() == 64)
20445     return false;
20446
20447   // This is an experimental legality test that is tailored to match the
20448   // legality test of the experimental lowering more closely. They are gated
20449   // separately to ease testing of performance differences.
20450   if (ExperimentalVectorShuffleLegality)
20451     // We only care that the types being shuffled are legal. The lowering can
20452     // handle any possible shuffle mask that results.
20453     return isTypeLegal(SVT);
20454
20455   // If this is a single-input shuffle with no 128 bit lane crossings we can
20456   // lower it into pshufb.
20457   if ((SVT.is128BitVector() && Subtarget->hasSSSE3()) ||
20458       (SVT.is256BitVector() && Subtarget->hasInt256())) {
20459     bool isLegal = true;
20460     for (unsigned I = 0, E = M.size(); I != E; ++I) {
20461       if (M[I] >= (int)SVT.getVectorNumElements() ||
20462           ShuffleCrosses128bitLane(SVT, I, M[I])) {
20463         isLegal = false;
20464         break;
20465       }
20466     }
20467     if (isLegal)
20468       return true;
20469   }
20470
20471   // FIXME: blends, shifts.
20472   return (SVT.getVectorNumElements() == 2 ||
20473           ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
20474           isMOVLMask(M, SVT) ||
20475           isCommutedMOVLMask(M, SVT) ||
20476           isMOVHLPSMask(M, SVT) ||
20477           isSHUFPMask(M, SVT) ||
20478           isSHUFPMask(M, SVT, /* Commuted */ true) ||
20479           isPSHUFDMask(M, SVT) ||
20480           isPSHUFDMask(M, SVT, /* SecondOperand */ true) ||
20481           isPSHUFHWMask(M, SVT, Subtarget->hasInt256()) ||
20482           isPSHUFLWMask(M, SVT, Subtarget->hasInt256()) ||
20483           isPALIGNRMask(M, SVT, Subtarget) ||
20484           isUNPCKLMask(M, SVT, Subtarget->hasInt256()) ||
20485           isUNPCKHMask(M, SVT, Subtarget->hasInt256()) ||
20486           isUNPCKL_v_undef_Mask(M, SVT, Subtarget->hasInt256()) ||
20487           isUNPCKH_v_undef_Mask(M, SVT, Subtarget->hasInt256()) ||
20488           isBlendMask(M, SVT, Subtarget->hasSSE41(), Subtarget->hasInt256()) ||
20489           (Subtarget->hasSSE41() && isINSERTPSMask(M, SVT)));
20490 }
20491
20492 bool
20493 X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
20494                                           EVT VT) const {
20495   if (!VT.isSimple())
20496     return false;
20497
20498   MVT SVT = VT.getSimpleVT();
20499
20500   // This is an experimental legality test that is tailored to match the
20501   // legality test of the experimental lowering more closely. They are gated
20502   // separately to ease testing of performance differences.
20503   if (ExperimentalVectorShuffleLegality)
20504     // The new vector shuffle lowering is very good at managing zero-inputs.
20505     return isShuffleMaskLegal(Mask, VT);
20506
20507   unsigned NumElts = SVT.getVectorNumElements();
20508   // FIXME: This collection of masks seems suspect.
20509   if (NumElts == 2)
20510     return true;
20511   if (NumElts == 4 && SVT.is128BitVector()) {
20512     return (isMOVLMask(Mask, SVT)  ||
20513             isCommutedMOVLMask(Mask, SVT, true) ||
20514             isSHUFPMask(Mask, SVT) ||
20515             isSHUFPMask(Mask, SVT, /* Commuted */ true) ||
20516             isBlendMask(Mask, SVT, Subtarget->hasSSE41(),
20517                         Subtarget->hasInt256()));
20518   }
20519   return false;
20520 }
20521
20522 //===----------------------------------------------------------------------===//
20523 //                           X86 Scheduler Hooks
20524 //===----------------------------------------------------------------------===//
20525
20526 /// Utility function to emit xbegin specifying the start of an RTM region.
20527 static MachineBasicBlock *EmitXBegin(MachineInstr *MI, MachineBasicBlock *MBB,
20528                                      const TargetInstrInfo *TII) {
20529   DebugLoc DL = MI->getDebugLoc();
20530
20531   const BasicBlock *BB = MBB->getBasicBlock();
20532   MachineFunction::iterator I = MBB;
20533   ++I;
20534
20535   // For the v = xbegin(), we generate
20536   //
20537   // thisMBB:
20538   //  xbegin sinkMBB
20539   //
20540   // mainMBB:
20541   //  eax = -1
20542   //
20543   // sinkMBB:
20544   //  v = eax
20545
20546   MachineBasicBlock *thisMBB = MBB;
20547   MachineFunction *MF = MBB->getParent();
20548   MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
20549   MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
20550   MF->insert(I, mainMBB);
20551   MF->insert(I, sinkMBB);
20552
20553   // Transfer the remainder of BB and its successor edges to sinkMBB.
20554   sinkMBB->splice(sinkMBB->begin(), MBB,
20555                   std::next(MachineBasicBlock::iterator(MI)), MBB->end());
20556   sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
20557
20558   // thisMBB:
20559   //  xbegin sinkMBB
20560   //  # fallthrough to mainMBB
20561   //  # abortion to sinkMBB
20562   BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(sinkMBB);
20563   thisMBB->addSuccessor(mainMBB);
20564   thisMBB->addSuccessor(sinkMBB);
20565
20566   // mainMBB:
20567   //  EAX = -1
20568   BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), X86::EAX).addImm(-1);
20569   mainMBB->addSuccessor(sinkMBB);
20570
20571   // sinkMBB:
20572   // EAX is live into the sinkMBB
20573   sinkMBB->addLiveIn(X86::EAX);
20574   BuildMI(*sinkMBB, sinkMBB->begin(), DL,
20575           TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
20576     .addReg(X86::EAX);
20577
20578   MI->eraseFromParent();
20579   return sinkMBB;
20580 }
20581
20582 // FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
20583 // or XMM0_V32I8 in AVX all of this code can be replaced with that
20584 // in the .td file.
20585 static MachineBasicBlock *EmitPCMPSTRM(MachineInstr *MI, MachineBasicBlock *BB,
20586                                        const TargetInstrInfo *TII) {
20587   unsigned Opc;
20588   switch (MI->getOpcode()) {
20589   default: llvm_unreachable("illegal opcode!");
20590   case X86::PCMPISTRM128REG:  Opc = X86::PCMPISTRM128rr;  break;
20591   case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break;
20592   case X86::PCMPISTRM128MEM:  Opc = X86::PCMPISTRM128rm;  break;
20593   case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break;
20594   case X86::PCMPESTRM128REG:  Opc = X86::PCMPESTRM128rr;  break;
20595   case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break;
20596   case X86::PCMPESTRM128MEM:  Opc = X86::PCMPESTRM128rm;  break;
20597   case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break;
20598   }
20599
20600   DebugLoc dl = MI->getDebugLoc();
20601   MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
20602
20603   unsigned NumArgs = MI->getNumOperands();
20604   for (unsigned i = 1; i < NumArgs; ++i) {
20605     MachineOperand &Op = MI->getOperand(i);
20606     if (!(Op.isReg() && Op.isImplicit()))
20607       MIB.addOperand(Op);
20608   }
20609   if (MI->hasOneMemOperand())
20610     MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
20611
20612   BuildMI(*BB, MI, dl,
20613     TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
20614     .addReg(X86::XMM0);
20615
20616   MI->eraseFromParent();
20617   return BB;
20618 }
20619
20620 // FIXME: Custom handling because TableGen doesn't support multiple implicit
20621 // defs in an instruction pattern
20622 static MachineBasicBlock *EmitPCMPSTRI(MachineInstr *MI, MachineBasicBlock *BB,
20623                                        const TargetInstrInfo *TII) {
20624   unsigned Opc;
20625   switch (MI->getOpcode()) {
20626   default: llvm_unreachable("illegal opcode!");
20627   case X86::PCMPISTRIREG:  Opc = X86::PCMPISTRIrr;  break;
20628   case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break;
20629   case X86::PCMPISTRIMEM:  Opc = X86::PCMPISTRIrm;  break;
20630   case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break;
20631   case X86::PCMPESTRIREG:  Opc = X86::PCMPESTRIrr;  break;
20632   case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break;
20633   case X86::PCMPESTRIMEM:  Opc = X86::PCMPESTRIrm;  break;
20634   case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break;
20635   }
20636
20637   DebugLoc dl = MI->getDebugLoc();
20638   MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
20639
20640   unsigned NumArgs = MI->getNumOperands(); // remove the results
20641   for (unsigned i = 1; i < NumArgs; ++i) {
20642     MachineOperand &Op = MI->getOperand(i);
20643     if (!(Op.isReg() && Op.isImplicit()))
20644       MIB.addOperand(Op);
20645   }
20646   if (MI->hasOneMemOperand())
20647     MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
20648
20649   BuildMI(*BB, MI, dl,
20650     TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
20651     .addReg(X86::ECX);
20652
20653   MI->eraseFromParent();
20654   return BB;
20655 }
20656
20657 static MachineBasicBlock *EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB,
20658                                       const X86Subtarget *Subtarget) {
20659   DebugLoc dl = MI->getDebugLoc();
20660   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
20661   // Address into RAX/EAX, other two args into ECX, EDX.
20662   unsigned MemOpc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r;
20663   unsigned MemReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
20664   MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
20665   for (int i = 0; i < X86::AddrNumOperands; ++i)
20666     MIB.addOperand(MI->getOperand(i));
20667
20668   unsigned ValOps = X86::AddrNumOperands;
20669   BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
20670     .addReg(MI->getOperand(ValOps).getReg());
20671   BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
20672     .addReg(MI->getOperand(ValOps+1).getReg());
20673
20674   // The instruction doesn't actually take any operands though.
20675   BuildMI(*BB, MI, dl, TII->get(X86::MONITORrrr));
20676
20677   MI->eraseFromParent(); // The pseudo is gone now.
20678   return BB;
20679 }
20680
20681 MachineBasicBlock *
20682 X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr *MI,
20683                                                  MachineBasicBlock *MBB) const {
20684   // Emit va_arg instruction on X86-64.
20685
20686   // Operands to this pseudo-instruction:
20687   // 0  ) Output        : destination address (reg)
20688   // 1-5) Input         : va_list address (addr, i64mem)
20689   // 6  ) ArgSize       : Size (in bytes) of vararg type
20690   // 7  ) ArgMode       : 0=overflow only, 1=use gp_offset, 2=use fp_offset
20691   // 8  ) Align         : Alignment of type
20692   // 9  ) EFLAGS (implicit-def)
20693
20694   assert(MI->getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
20695   assert(X86::AddrNumOperands == 5 && "VAARG_64 assumes 5 address operands");
20696
20697   unsigned DestReg = MI->getOperand(0).getReg();
20698   MachineOperand &Base = MI->getOperand(1);
20699   MachineOperand &Scale = MI->getOperand(2);
20700   MachineOperand &Index = MI->getOperand(3);
20701   MachineOperand &Disp = MI->getOperand(4);
20702   MachineOperand &Segment = MI->getOperand(5);
20703   unsigned ArgSize = MI->getOperand(6).getImm();
20704   unsigned ArgMode = MI->getOperand(7).getImm();
20705   unsigned Align = MI->getOperand(8).getImm();
20706
20707   // Memory Reference
20708   assert(MI->hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
20709   MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
20710   MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
20711
20712   // Machine Information
20713   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
20714   MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
20715   const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
20716   const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
20717   DebugLoc DL = MI->getDebugLoc();
20718
20719   // struct va_list {
20720   //   i32   gp_offset
20721   //   i32   fp_offset
20722   //   i64   overflow_area (address)
20723   //   i64   reg_save_area (address)
20724   // }
20725   // sizeof(va_list) = 24
20726   // alignment(va_list) = 8
20727
20728   unsigned TotalNumIntRegs = 6;
20729   unsigned TotalNumXMMRegs = 8;
20730   bool UseGPOffset = (ArgMode == 1);
20731   bool UseFPOffset = (ArgMode == 2);
20732   unsigned MaxOffset = TotalNumIntRegs * 8 +
20733                        (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
20734
20735   /* Align ArgSize to a multiple of 8 */
20736   unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
20737   bool NeedsAlign = (Align > 8);
20738
20739   MachineBasicBlock *thisMBB = MBB;
20740   MachineBasicBlock *overflowMBB;
20741   MachineBasicBlock *offsetMBB;
20742   MachineBasicBlock *endMBB;
20743
20744   unsigned OffsetDestReg = 0;    // Argument address computed by offsetMBB
20745   unsigned OverflowDestReg = 0;  // Argument address computed by overflowMBB
20746   unsigned OffsetReg = 0;
20747
20748   if (!UseGPOffset && !UseFPOffset) {
20749     // If we only pull from the overflow region, we don't create a branch.
20750     // We don't need to alter control flow.
20751     OffsetDestReg = 0; // unused
20752     OverflowDestReg = DestReg;
20753
20754     offsetMBB = nullptr;
20755     overflowMBB = thisMBB;
20756     endMBB = thisMBB;
20757   } else {
20758     // First emit code to check if gp_offset (or fp_offset) is below the bound.
20759     // If so, pull the argument from reg_save_area. (branch to offsetMBB)
20760     // If not, pull from overflow_area. (branch to overflowMBB)
20761     //
20762     //       thisMBB
20763     //         |     .
20764     //         |        .
20765     //     offsetMBB   overflowMBB
20766     //         |        .
20767     //         |     .
20768     //        endMBB
20769
20770     // Registers for the PHI in endMBB
20771     OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
20772     OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
20773
20774     const BasicBlock *LLVM_BB = MBB->getBasicBlock();
20775     MachineFunction *MF = MBB->getParent();
20776     overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
20777     offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
20778     endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
20779
20780     MachineFunction::iterator MBBIter = MBB;
20781     ++MBBIter;
20782
20783     // Insert the new basic blocks
20784     MF->insert(MBBIter, offsetMBB);
20785     MF->insert(MBBIter, overflowMBB);
20786     MF->insert(MBBIter, endMBB);
20787
20788     // Transfer the remainder of MBB and its successor edges to endMBB.
20789     endMBB->splice(endMBB->begin(), thisMBB,
20790                    std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
20791     endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
20792
20793     // Make offsetMBB and overflowMBB successors of thisMBB
20794     thisMBB->addSuccessor(offsetMBB);
20795     thisMBB->addSuccessor(overflowMBB);
20796
20797     // endMBB is a successor of both offsetMBB and overflowMBB
20798     offsetMBB->addSuccessor(endMBB);
20799     overflowMBB->addSuccessor(endMBB);
20800
20801     // Load the offset value into a register
20802     OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
20803     BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
20804       .addOperand(Base)
20805       .addOperand(Scale)
20806       .addOperand(Index)
20807       .addDisp(Disp, UseFPOffset ? 4 : 0)
20808       .addOperand(Segment)
20809       .setMemRefs(MMOBegin, MMOEnd);
20810
20811     // Check if there is enough room left to pull this argument.
20812     BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
20813       .addReg(OffsetReg)
20814       .addImm(MaxOffset + 8 - ArgSizeA8);
20815
20816     // Branch to "overflowMBB" if offset >= max
20817     // Fall through to "offsetMBB" otherwise
20818     BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
20819       .addMBB(overflowMBB);
20820   }
20821
20822   // In offsetMBB, emit code to use the reg_save_area.
20823   if (offsetMBB) {
20824     assert(OffsetReg != 0);
20825
20826     // Read the reg_save_area address.
20827     unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
20828     BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
20829       .addOperand(Base)
20830       .addOperand(Scale)
20831       .addOperand(Index)
20832       .addDisp(Disp, 16)
20833       .addOperand(Segment)
20834       .setMemRefs(MMOBegin, MMOEnd);
20835
20836     // Zero-extend the offset
20837     unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
20838       BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
20839         .addImm(0)
20840         .addReg(OffsetReg)
20841         .addImm(X86::sub_32bit);
20842
20843     // Add the offset to the reg_save_area to get the final address.
20844     BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
20845       .addReg(OffsetReg64)
20846       .addReg(RegSaveReg);
20847
20848     // Compute the offset for the next argument
20849     unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
20850     BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
20851       .addReg(OffsetReg)
20852       .addImm(UseFPOffset ? 16 : 8);
20853
20854     // Store it back into the va_list.
20855     BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
20856       .addOperand(Base)
20857       .addOperand(Scale)
20858       .addOperand(Index)
20859       .addDisp(Disp, UseFPOffset ? 4 : 0)
20860       .addOperand(Segment)
20861       .addReg(NextOffsetReg)
20862       .setMemRefs(MMOBegin, MMOEnd);
20863
20864     // Jump to endMBB
20865     BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
20866       .addMBB(endMBB);
20867   }
20868
20869   //
20870   // Emit code to use overflow area
20871   //
20872
20873   // Load the overflow_area address into a register.
20874   unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
20875   BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
20876     .addOperand(Base)
20877     .addOperand(Scale)
20878     .addOperand(Index)
20879     .addDisp(Disp, 8)
20880     .addOperand(Segment)
20881     .setMemRefs(MMOBegin, MMOEnd);
20882
20883   // If we need to align it, do so. Otherwise, just copy the address
20884   // to OverflowDestReg.
20885   if (NeedsAlign) {
20886     // Align the overflow address
20887     assert((Align & (Align-1)) == 0 && "Alignment must be a power of 2");
20888     unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);
20889
20890     // aligned_addr = (addr + (align-1)) & ~(align-1)
20891     BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
20892       .addReg(OverflowAddrReg)
20893       .addImm(Align-1);
20894
20895     BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
20896       .addReg(TmpReg)
20897       .addImm(~(uint64_t)(Align-1));
20898   } else {
20899     BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
20900       .addReg(OverflowAddrReg);
20901   }
20902
20903   // Compute the next overflow address after this argument.
20904   // (the overflow address should be kept 8-byte aligned)
20905   unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
20906   BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
20907     .addReg(OverflowDestReg)
20908     .addImm(ArgSizeA8);
20909
20910   // Store the new overflow address.
20911   BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
20912     .addOperand(Base)
20913     .addOperand(Scale)
20914     .addOperand(Index)
20915     .addDisp(Disp, 8)
20916     .addOperand(Segment)
20917     .addReg(NextAddrReg)
20918     .setMemRefs(MMOBegin, MMOEnd);
20919
20920   // If we branched, emit the PHI to the front of endMBB.
20921   if (offsetMBB) {
20922     BuildMI(*endMBB, endMBB->begin(), DL,
20923             TII->get(X86::PHI), DestReg)
20924       .addReg(OffsetDestReg).addMBB(offsetMBB)
20925       .addReg(OverflowDestReg).addMBB(overflowMBB);
20926   }
20927
20928   // Erase the pseudo instruction
20929   MI->eraseFromParent();
20930
20931   return endMBB;
20932 }
20933
20934 MachineBasicBlock *
20935 X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
20936                                                  MachineInstr *MI,
20937                                                  MachineBasicBlock *MBB) const {
20938   // Emit code to save XMM registers to the stack. The ABI says that the
20939   // number of registers to save is given in %al, so it's theoretically
20940   // possible to do an indirect jump trick to avoid saving all of them,
20941   // however this code takes a simpler approach and just executes all
20942   // of the stores if %al is non-zero. It's less code, and it's probably
20943   // easier on the hardware branch predictor, and stores aren't all that
20944   // expensive anyway.
20945
20946   // Create the new basic blocks. One block contains all the XMM stores,
20947   // and one block is the final destination regardless of whether any
20948   // stores were performed.
20949   const BasicBlock *LLVM_BB = MBB->getBasicBlock();
20950   MachineFunction *F = MBB->getParent();
20951   MachineFunction::iterator MBBIter = MBB;
20952   ++MBBIter;
20953   MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
20954   MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
20955   F->insert(MBBIter, XMMSaveMBB);
20956   F->insert(MBBIter, EndMBB);
20957
20958   // Transfer the remainder of MBB and its successor edges to EndMBB.
20959   EndMBB->splice(EndMBB->begin(), MBB,
20960                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
20961   EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
20962
20963   // The original block will now fall through to the XMM save block.
20964   MBB->addSuccessor(XMMSaveMBB);
20965   // The XMMSaveMBB will fall through to the end block.
20966   XMMSaveMBB->addSuccessor(EndMBB);
20967
20968   // Now add the instructions.
20969   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
20970   DebugLoc DL = MI->getDebugLoc();
20971
20972   unsigned CountReg = MI->getOperand(0).getReg();
20973   int64_t RegSaveFrameIndex = MI->getOperand(1).getImm();
20974   int64_t VarArgsFPOffset = MI->getOperand(2).getImm();
20975
20976   if (!Subtarget->isTargetWin64()) {
20977     // If %al is 0, branch around the XMM save block.
20978     BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
20979     BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB);
20980     MBB->addSuccessor(EndMBB);
20981   }
20982
20983   // Make sure the last operand is EFLAGS, which gets clobbered by the branch
20984   // that was just emitted, but clearly shouldn't be "saved".
20985   assert((MI->getNumOperands() <= 3 ||
20986           !MI->getOperand(MI->getNumOperands() - 1).isReg() ||
20987           MI->getOperand(MI->getNumOperands() - 1).getReg() == X86::EFLAGS)
20988          && "Expected last argument to be EFLAGS");
20989   unsigned MOVOpc = Subtarget->hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr;
20990   // In the XMM save block, save all the XMM argument registers.
20991   for (int i = 3, e = MI->getNumOperands() - 1; i != e; ++i) {
20992     int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
20993     MachineMemOperand *MMO =
20994       F->getMachineMemOperand(
20995           MachinePointerInfo::getFixedStack(RegSaveFrameIndex, Offset),
20996         MachineMemOperand::MOStore,
20997         /*Size=*/16, /*Align=*/16);
20998     BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
20999       .addFrameIndex(RegSaveFrameIndex)
21000       .addImm(/*Scale=*/1)
21001       .addReg(/*IndexReg=*/0)
21002       .addImm(/*Disp=*/Offset)
21003       .addReg(/*Segment=*/0)
21004       .addReg(MI->getOperand(i).getReg())
21005       .addMemOperand(MMO);
21006   }
21007
21008   MI->eraseFromParent();   // The pseudo instruction is gone now.
21009
21010   return EndMBB;
21011 }
21012
21013 // The EFLAGS operand of SelectItr might be missing a kill marker
21014 // because there were multiple uses of EFLAGS, and ISel didn't know
21015 // which to mark. Figure out whether SelectItr should have had a
21016 // kill marker, and set it if it should. Returns the correct kill
21017 // marker value.
21018 static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
21019                                      MachineBasicBlock* BB,
21020                                      const TargetRegisterInfo* TRI) {
21021   // Scan forward through BB for a use/def of EFLAGS.
21022   MachineBasicBlock::iterator miI(std::next(SelectItr));
21023   for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
21024     const MachineInstr& mi = *miI;
21025     if (mi.readsRegister(X86::EFLAGS))
21026       return false;
21027     if (mi.definesRegister(X86::EFLAGS))
21028       break; // Should have kill-flag - update below.
21029   }
21030
21031   // If we hit the end of the block, check whether EFLAGS is live into a
21032   // successor.
21033   if (miI == BB->end()) {
21034     for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
21035                                           sEnd = BB->succ_end();
21036          sItr != sEnd; ++sItr) {
21037       MachineBasicBlock* succ = *sItr;
21038       if (succ->isLiveIn(X86::EFLAGS))
21039         return false;
21040     }
21041   }
21042
21043   // We found a def, or hit the end of the basic block and EFLAGS wasn't live
21044   // out. SelectMI should have a kill flag on EFLAGS.
21045   SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
21046   return true;
21047 }
21048
21049 MachineBasicBlock *
21050 X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
21051                                      MachineBasicBlock *BB) const {
21052   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
21053   DebugLoc DL = MI->getDebugLoc();
21054
21055   // To "insert" a SELECT_CC instruction, we actually have to insert the
21056   // diamond control-flow pattern.  The incoming instruction knows the
21057   // destination vreg to set, the condition code register to branch on, the
21058   // true/false values to select between, and a branch opcode to use.
21059   const BasicBlock *LLVM_BB = BB->getBasicBlock();
21060   MachineFunction::iterator It = BB;
21061   ++It;
21062
21063   //  thisMBB:
21064   //  ...
21065   //   TrueVal = ...
21066   //   cmpTY ccX, r1, r2
21067   //   bCC copy1MBB
21068   //   fallthrough --> copy0MBB
21069   MachineBasicBlock *thisMBB = BB;
21070   MachineFunction *F = BB->getParent();
21071   MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
21072   MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
21073   F->insert(It, copy0MBB);
21074   F->insert(It, sinkMBB);
21075
21076   // If the EFLAGS register isn't dead in the terminator, then claim that it's
21077   // live into the sink and copy blocks.
21078   const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
21079   if (!MI->killsRegister(X86::EFLAGS) &&
21080       !checkAndUpdateEFLAGSKill(MI, BB, TRI)) {
21081     copy0MBB->addLiveIn(X86::EFLAGS);
21082     sinkMBB->addLiveIn(X86::EFLAGS);
21083   }
21084
21085   // Transfer the remainder of BB and its successor edges to sinkMBB.
21086   sinkMBB->splice(sinkMBB->begin(), BB,
21087                   std::next(MachineBasicBlock::iterator(MI)), BB->end());
21088   sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
21089
21090   // Add the true and fallthrough blocks as its successors.
21091   BB->addSuccessor(copy0MBB);
21092   BB->addSuccessor(sinkMBB);
21093
21094   // Create the conditional branch instruction.
21095   unsigned Opc =
21096     X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm());
21097   BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);
21098
21099   //  copy0MBB:
21100   //   %FalseValue = ...
21101   //   # fallthrough to sinkMBB
21102   copy0MBB->addSuccessor(sinkMBB);
21103
21104   //  sinkMBB:
21105   //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
21106   //  ...
21107   BuildMI(*sinkMBB, sinkMBB->begin(), DL,
21108           TII->get(X86::PHI), MI->getOperand(0).getReg())
21109     .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
21110     .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
21111
21112   MI->eraseFromParent();   // The pseudo instruction is gone now.
21113   return sinkMBB;
21114 }
21115
21116 MachineBasicBlock *
21117 X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI,
21118                                         MachineBasicBlock *BB) const {
21119   MachineFunction *MF = BB->getParent();
21120   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
21121   DebugLoc DL = MI->getDebugLoc();
21122   const BasicBlock *LLVM_BB = BB->getBasicBlock();
21123
21124   assert(MF->shouldSplitStack());
21125
21126   const bool Is64Bit = Subtarget->is64Bit();
21127   const bool IsLP64 = Subtarget->isTarget64BitLP64();
21128
21129   const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
21130   const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
21131
21132   // BB:
21133   //  ... [Till the alloca]
21134   // If stacklet is not large enough, jump to mallocMBB
21135   //
21136   // bumpMBB:
21137   //  Allocate by subtracting from RSP
21138   //  Jump to continueMBB
21139   //
21140   // mallocMBB:
21141   //  Allocate by call to runtime
21142   //
21143   // continueMBB:
21144   //  ...
21145   //  [rest of original BB]
21146   //
21147
21148   MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
21149   MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
21150   MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
21151
21152   MachineRegisterInfo &MRI = MF->getRegInfo();
21153   const TargetRegisterClass *AddrRegClass =
21154     getRegClassFor(getPointerTy());
21155
21156   unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
21157     bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
21158     tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
21159     SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
21160     sizeVReg = MI->getOperand(1).getReg(),
21161     physSPReg = IsLP64 || Subtarget->isTargetNaCl64() ? X86::RSP : X86::ESP;
21162
21163   MachineFunction::iterator MBBIter = BB;
21164   ++MBBIter;
21165
21166   MF->insert(MBBIter, bumpMBB);
21167   MF->insert(MBBIter, mallocMBB);
21168   MF->insert(MBBIter, continueMBB);
21169
21170   continueMBB->splice(continueMBB->begin(), BB,
21171                       std::next(MachineBasicBlock::iterator(MI)), BB->end());
21172   continueMBB->transferSuccessorsAndUpdatePHIs(BB);
21173
21174   // Add code to the main basic block to check if the stack limit has been hit,
21175   // and if so, jump to mallocMBB otherwise to bumpMBB.
21176   BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
21177   BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
21178     .addReg(tmpSPVReg).addReg(sizeVReg);
21179   BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
21180     .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
21181     .addReg(SPLimitVReg);
21182   BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB);
21183
21184   // bumpMBB simply decreases the stack pointer, since we know the current
21185   // stacklet has enough space.
21186   BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
21187     .addReg(SPLimitVReg);
21188   BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
21189     .addReg(SPLimitVReg);
21190   BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
21191
21192   // Calls into a routine in libgcc to allocate more space from the heap.
21193   const uint32_t *RegMask =
21194       Subtarget->getRegisterInfo()->getCallPreservedMask(CallingConv::C);
21195   if (IsLP64) {
21196     BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
21197       .addReg(sizeVReg);
21198     BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
21199       .addExternalSymbol("__morestack_allocate_stack_space")
21200       .addRegMask(RegMask)
21201       .addReg(X86::RDI, RegState::Implicit)
21202       .addReg(X86::RAX, RegState::ImplicitDefine);
21203   } else if (Is64Bit) {
21204     BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
21205       .addReg(sizeVReg);
21206     BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
21207       .addExternalSymbol("__morestack_allocate_stack_space")
21208       .addRegMask(RegMask)
21209       .addReg(X86::EDI, RegState::Implicit)
21210       .addReg(X86::EAX, RegState::ImplicitDefine);
21211   } else {
21212     BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
21213       .addImm(12);
21214     BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
21215     BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
21216       .addExternalSymbol("__morestack_allocate_stack_space")
21217       .addRegMask(RegMask)
21218       .addReg(X86::EAX, RegState::ImplicitDefine);
21219   }
21220
21221   if (!Is64Bit)
21222     BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
21223       .addImm(16);
21224
21225   BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
21226     .addReg(IsLP64 ? X86::RAX : X86::EAX);
21227   BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
21228
21229   // Set up the CFG correctly.
21230   BB->addSuccessor(bumpMBB);
21231   BB->addSuccessor(mallocMBB);
21232   mallocMBB->addSuccessor(continueMBB);
21233   bumpMBB->addSuccessor(continueMBB);
21234
21235   // Take care of the PHI nodes.
21236   BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
21237           MI->getOperand(0).getReg())
21238     .addReg(mallocPtrVReg).addMBB(mallocMBB)
21239     .addReg(bumpSPPtrVReg).addMBB(bumpMBB);
21240
21241   // Delete the original pseudo instruction.
21242   MI->eraseFromParent();
21243
21244   // And we're done.
21245   return continueMBB;
21246 }
21247
21248 MachineBasicBlock *
21249 X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI,
21250                                         MachineBasicBlock *BB) const {
21251   DebugLoc DL = MI->getDebugLoc();
21252
21253   assert(!Subtarget->isTargetMachO());
21254
21255   X86FrameLowering::emitStackProbeCall(*BB->getParent(), *BB, MI, DL);
21256
21257   MI->eraseFromParent();   // The pseudo instruction is gone now.
21258   return BB;
21259 }
21260
21261 MachineBasicBlock *
21262 X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI,
21263                                       MachineBasicBlock *BB) const {
21264   // This is pretty easy.  We're taking the value that we received from
21265   // our load from the relocation, sticking it in either RDI (x86-64)
21266   // or EAX and doing an indirect call.  The return value will then
21267   // be in the normal return register.
21268   MachineFunction *F = BB->getParent();
21269   const X86InstrInfo *TII = Subtarget->getInstrInfo();
21270   DebugLoc DL = MI->getDebugLoc();
21271
21272   assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?");
21273   assert(MI->getOperand(3).isGlobal() && "This should be a global");
21274
21275   // Get a register mask for the lowered call.
21276   // FIXME: The 32-bit calls have non-standard calling conventions. Use a
21277   // proper register mask.
21278   const uint32_t *RegMask =
21279       Subtarget->getRegisterInfo()->getCallPreservedMask(CallingConv::C);
21280   if (Subtarget->is64Bit()) {
21281     MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
21282                                       TII->get(X86::MOV64rm), X86::RDI)
21283     .addReg(X86::RIP)
21284     .addImm(0).addReg(0)
21285     .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
21286                       MI->getOperand(3).getTargetFlags())
21287     .addReg(0);
21288     MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
21289     addDirectMem(MIB, X86::RDI);
21290     MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
21291   } else if (F->getTarget().getRelocationModel() != Reloc::PIC_) {
21292     MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
21293                                       TII->get(X86::MOV32rm), X86::EAX)
21294     .addReg(0)
21295     .addImm(0).addReg(0)
21296     .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
21297                       MI->getOperand(3).getTargetFlags())
21298     .addReg(0);
21299     MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
21300     addDirectMem(MIB, X86::EAX);
21301     MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
21302   } else {
21303     MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
21304                                       TII->get(X86::MOV32rm), X86::EAX)
21305     .addReg(TII->getGlobalBaseReg(F))
21306     .addImm(0).addReg(0)
21307     .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
21308                       MI->getOperand(3).getTargetFlags())
21309     .addReg(0);
21310     MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
21311     addDirectMem(MIB, X86::EAX);
21312     MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
21313   }
21314
21315   MI->eraseFromParent(); // The pseudo instruction is gone now.
21316   return BB;
21317 }
21318
21319 MachineBasicBlock *
21320 X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
21321                                     MachineBasicBlock *MBB) const {
21322   DebugLoc DL = MI->getDebugLoc();
21323   MachineFunction *MF = MBB->getParent();
21324   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
21325   MachineRegisterInfo &MRI = MF->getRegInfo();
21326
21327   const BasicBlock *BB = MBB->getBasicBlock();
21328   MachineFunction::iterator I = MBB;
21329   ++I;
21330
21331   // Memory Reference
21332   MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
21333   MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
21334
21335   unsigned DstReg;
21336   unsigned MemOpndSlot = 0;
21337
21338   unsigned CurOp = 0;
21339
21340   DstReg = MI->getOperand(CurOp++).getReg();
21341   const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
21342   assert(RC->hasType(MVT::i32) && "Invalid destination!");
21343   unsigned mainDstReg = MRI.createVirtualRegister(RC);
21344   unsigned restoreDstReg = MRI.createVirtualRegister(RC);
21345
21346   MemOpndSlot = CurOp;
21347
21348   MVT PVT = getPointerTy();
21349   assert((PVT == MVT::i64 || PVT == MVT::i32) &&
21350          "Invalid Pointer Size!");
21351
21352   // For v = setjmp(buf), we generate
21353   //
21354   // thisMBB:
21355   //  buf[LabelOffset] = restoreMBB
21356   //  SjLjSetup restoreMBB
21357   //
21358   // mainMBB:
21359   //  v_main = 0
21360   //
21361   // sinkMBB:
21362   //  v = phi(main, restore)
21363   //
21364   // restoreMBB:
21365   //  if base pointer being used, load it from frame
21366   //  v_restore = 1
21367
21368   MachineBasicBlock *thisMBB = MBB;
21369   MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
21370   MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
21371   MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
21372   MF->insert(I, mainMBB);
21373   MF->insert(I, sinkMBB);
21374   MF->push_back(restoreMBB);
21375
21376   MachineInstrBuilder MIB;
21377
21378   // Transfer the remainder of BB and its successor edges to sinkMBB.
21379   sinkMBB->splice(sinkMBB->begin(), MBB,
21380                   std::next(MachineBasicBlock::iterator(MI)), MBB->end());
21381   sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
21382
21383   // thisMBB:
21384   unsigned PtrStoreOpc = 0;
21385   unsigned LabelReg = 0;
21386   const int64_t LabelOffset = 1 * PVT.getStoreSize();
21387   Reloc::Model RM = MF->getTarget().getRelocationModel();
21388   bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
21389                      (RM == Reloc::Static || RM == Reloc::DynamicNoPIC);
21390
21391   // Prepare IP either in reg or imm.
21392   if (!UseImmLabel) {
21393     PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
21394     const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
21395     LabelReg = MRI.createVirtualRegister(PtrRC);
21396     if (Subtarget->is64Bit()) {
21397       MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
21398               .addReg(X86::RIP)
21399               .addImm(0)
21400               .addReg(0)
21401               .addMBB(restoreMBB)
21402               .addReg(0);
21403     } else {
21404       const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
21405       MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
21406               .addReg(XII->getGlobalBaseReg(MF))
21407               .addImm(0)
21408               .addReg(0)
21409               .addMBB(restoreMBB, Subtarget->ClassifyBlockAddressReference())
21410               .addReg(0);
21411     }
21412   } else
21413     PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
21414   // Store IP
21415   MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
21416   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
21417     if (i == X86::AddrDisp)
21418       MIB.addDisp(MI->getOperand(MemOpndSlot + i), LabelOffset);
21419     else
21420       MIB.addOperand(MI->getOperand(MemOpndSlot + i));
21421   }
21422   if (!UseImmLabel)
21423     MIB.addReg(LabelReg);
21424   else
21425     MIB.addMBB(restoreMBB);
21426   MIB.setMemRefs(MMOBegin, MMOEnd);
21427   // Setup
21428   MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
21429           .addMBB(restoreMBB);
21430
21431   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
21432   MIB.addRegMask(RegInfo->getNoPreservedMask());
21433   thisMBB->addSuccessor(mainMBB);
21434   thisMBB->addSuccessor(restoreMBB);
21435
21436   // mainMBB:
21437   //  EAX = 0
21438   BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
21439   mainMBB->addSuccessor(sinkMBB);
21440
21441   // sinkMBB:
21442   BuildMI(*sinkMBB, sinkMBB->begin(), DL,
21443           TII->get(X86::PHI), DstReg)
21444     .addReg(mainDstReg).addMBB(mainMBB)
21445     .addReg(restoreDstReg).addMBB(restoreMBB);
21446
21447   // restoreMBB:
21448   if (RegInfo->hasBasePointer(*MF)) {
21449     const bool Uses64BitFramePtr =
21450         Subtarget->isTarget64BitLP64() || Subtarget->isTargetNaCl64();
21451     X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
21452     X86FI->setRestoreBasePointer(MF);
21453     unsigned FramePtr = RegInfo->getFrameRegister(*MF);
21454     unsigned BasePtr = RegInfo->getBaseRegister();
21455     unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
21456     addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
21457                  FramePtr, true, X86FI->getRestoreBasePointerOffset())
21458       .setMIFlag(MachineInstr::FrameSetup);
21459   }
21460   BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
21461   BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
21462   restoreMBB->addSuccessor(sinkMBB);
21463
21464   MI->eraseFromParent();
21465   return sinkMBB;
21466 }
21467
21468 MachineBasicBlock *
21469 X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
21470                                      MachineBasicBlock *MBB) const {
21471   DebugLoc DL = MI->getDebugLoc();
21472   MachineFunction *MF = MBB->getParent();
21473   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
21474   MachineRegisterInfo &MRI = MF->getRegInfo();
21475
21476   // Memory Reference
21477   MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
21478   MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
21479
21480   MVT PVT = getPointerTy();
21481   assert((PVT == MVT::i64 || PVT == MVT::i32) &&
21482          "Invalid Pointer Size!");
21483
21484   const TargetRegisterClass *RC =
21485     (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
21486   unsigned Tmp = MRI.createVirtualRegister(RC);
21487   // Since FP is only updated here but NOT referenced, it's treated as GPR.
21488   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
21489   unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
21490   unsigned SP = RegInfo->getStackRegister();
21491
21492   MachineInstrBuilder MIB;
21493
21494   const int64_t LabelOffset = 1 * PVT.getStoreSize();
21495   const int64_t SPOffset = 2 * PVT.getStoreSize();
21496
21497   unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
21498   unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
21499
21500   // Reload FP
21501   MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP);
21502   for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
21503     MIB.addOperand(MI->getOperand(i));
21504   MIB.setMemRefs(MMOBegin, MMOEnd);
21505   // Reload IP
21506   MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
21507   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
21508     if (i == X86::AddrDisp)
21509       MIB.addDisp(MI->getOperand(i), LabelOffset);
21510     else
21511       MIB.addOperand(MI->getOperand(i));
21512   }
21513   MIB.setMemRefs(MMOBegin, MMOEnd);
21514   // Reload SP
21515   MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP);
21516   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
21517     if (i == X86::AddrDisp)
21518       MIB.addDisp(MI->getOperand(i), SPOffset);
21519     else
21520       MIB.addOperand(MI->getOperand(i));
21521   }
21522   MIB.setMemRefs(MMOBegin, MMOEnd);
21523   // Jump
21524   BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
21525
21526   MI->eraseFromParent();
21527   return MBB;
21528 }
21529
21530 // Replace 213-type (isel default) FMA3 instructions with 231-type for
21531 // accumulator loops. Writing back to the accumulator allows the coalescer
21532 // to remove extra copies in the loop.
21533 MachineBasicBlock *
21534 X86TargetLowering::emitFMA3Instr(MachineInstr *MI,
21535                                  MachineBasicBlock *MBB) const {
21536   MachineOperand &AddendOp = MI->getOperand(3);
21537
21538   // Bail out early if the addend isn't a register - we can't switch these.
21539   if (!AddendOp.isReg())
21540     return MBB;
21541
21542   MachineFunction &MF = *MBB->getParent();
21543   MachineRegisterInfo &MRI = MF.getRegInfo();
21544
21545   // Check whether the addend is defined by a PHI:
21546   assert(MRI.hasOneDef(AddendOp.getReg()) && "Multiple defs in SSA?");
21547   MachineInstr &AddendDef = *MRI.def_instr_begin(AddendOp.getReg());
21548   if (!AddendDef.isPHI())
21549     return MBB;
21550
21551   // Look for the following pattern:
21552   // loop:
21553   //   %addend = phi [%entry, 0], [%loop, %result]
21554   //   ...
21555   //   %result<tied1> = FMA213 %m2<tied0>, %m1, %addend
21556
21557   // Replace with:
21558   //   loop:
21559   //   %addend = phi [%entry, 0], [%loop, %result]
21560   //   ...
21561   //   %result<tied1> = FMA231 %addend<tied0>, %m1, %m2
21562
21563   for (unsigned i = 1, e = AddendDef.getNumOperands(); i < e; i += 2) {
21564     assert(AddendDef.getOperand(i).isReg());
21565     MachineOperand PHISrcOp = AddendDef.getOperand(i);
21566     MachineInstr &PHISrcInst = *MRI.def_instr_begin(PHISrcOp.getReg());
21567     if (&PHISrcInst == MI) {
21568       // Found a matching instruction.
21569       unsigned NewFMAOpc = 0;
21570       switch (MI->getOpcode()) {
21571         case X86::VFMADDPDr213r: NewFMAOpc = X86::VFMADDPDr231r; break;
21572         case X86::VFMADDPSr213r: NewFMAOpc = X86::VFMADDPSr231r; break;
21573         case X86::VFMADDSDr213r: NewFMAOpc = X86::VFMADDSDr231r; break;
21574         case X86::VFMADDSSr213r: NewFMAOpc = X86::VFMADDSSr231r; break;
21575         case X86::VFMSUBPDr213r: NewFMAOpc = X86::VFMSUBPDr231r; break;
21576         case X86::VFMSUBPSr213r: NewFMAOpc = X86::VFMSUBPSr231r; break;
21577         case X86::VFMSUBSDr213r: NewFMAOpc = X86::VFMSUBSDr231r; break;
21578         case X86::VFMSUBSSr213r: NewFMAOpc = X86::VFMSUBSSr231r; break;
21579         case X86::VFNMADDPDr213r: NewFMAOpc = X86::VFNMADDPDr231r; break;
21580         case X86::VFNMADDPSr213r: NewFMAOpc = X86::VFNMADDPSr231r; break;
21581         case X86::VFNMADDSDr213r: NewFMAOpc = X86::VFNMADDSDr231r; break;
21582         case X86::VFNMADDSSr213r: NewFMAOpc = X86::VFNMADDSSr231r; break;
21583         case X86::VFNMSUBPDr213r: NewFMAOpc = X86::VFNMSUBPDr231r; break;
21584         case X86::VFNMSUBPSr213r: NewFMAOpc = X86::VFNMSUBPSr231r; break;
21585         case X86::VFNMSUBSDr213r: NewFMAOpc = X86::VFNMSUBSDr231r; break;
21586         case X86::VFNMSUBSSr213r: NewFMAOpc = X86::VFNMSUBSSr231r; break;
21587         case X86::VFMADDSUBPDr213r: NewFMAOpc = X86::VFMADDSUBPDr231r; break;
21588         case X86::VFMADDSUBPSr213r: NewFMAOpc = X86::VFMADDSUBPSr231r; break;
21589         case X86::VFMSUBADDPDr213r: NewFMAOpc = X86::VFMSUBADDPDr231r; break;
21590         case X86::VFMSUBADDPSr213r: NewFMAOpc = X86::VFMSUBADDPSr231r; break;
21591
21592         case X86::VFMADDPDr213rY: NewFMAOpc = X86::VFMADDPDr231rY; break;
21593         case X86::VFMADDPSr213rY: NewFMAOpc = X86::VFMADDPSr231rY; break;
21594         case X86::VFMSUBPDr213rY: NewFMAOpc = X86::VFMSUBPDr231rY; break;
21595         case X86::VFMSUBPSr213rY: NewFMAOpc = X86::VFMSUBPSr231rY; break;
21596         case X86::VFNMADDPDr213rY: NewFMAOpc = X86::VFNMADDPDr231rY; break;
21597         case X86::VFNMADDPSr213rY: NewFMAOpc = X86::VFNMADDPSr231rY; break;
21598         case X86::VFNMSUBPDr213rY: NewFMAOpc = X86::VFNMSUBPDr231rY; break;
21599         case X86::VFNMSUBPSr213rY: NewFMAOpc = X86::VFNMSUBPSr231rY; break;
21600         case X86::VFMADDSUBPDr213rY: NewFMAOpc = X86::VFMADDSUBPDr231rY; break;
21601         case X86::VFMADDSUBPSr213rY: NewFMAOpc = X86::VFMADDSUBPSr231rY; break;
21602         case X86::VFMSUBADDPDr213rY: NewFMAOpc = X86::VFMSUBADDPDr231rY; break;
21603         case X86::VFMSUBADDPSr213rY: NewFMAOpc = X86::VFMSUBADDPSr231rY; break;
21604         default: llvm_unreachable("Unrecognized FMA variant.");
21605       }
21606
21607       const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
21608       MachineInstrBuilder MIB =
21609         BuildMI(MF, MI->getDebugLoc(), TII.get(NewFMAOpc))
21610         .addOperand(MI->getOperand(0))
21611         .addOperand(MI->getOperand(3))
21612         .addOperand(MI->getOperand(2))
21613         .addOperand(MI->getOperand(1));
21614       MBB->insert(MachineBasicBlock::iterator(MI), MIB);
21615       MI->eraseFromParent();
21616     }
21617   }
21618
21619   return MBB;
21620 }
21621
21622 MachineBasicBlock *
21623 X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
21624                                                MachineBasicBlock *BB) const {
21625   switch (MI->getOpcode()) {
21626   default: llvm_unreachable("Unexpected instr type to insert");
21627   case X86::TAILJMPd64:
21628   case X86::TAILJMPr64:
21629   case X86::TAILJMPm64:
21630   case X86::TAILJMPd64_REX:
21631   case X86::TAILJMPr64_REX:
21632   case X86::TAILJMPm64_REX:
21633     llvm_unreachable("TAILJMP64 would not be touched here.");
21634   case X86::TCRETURNdi64:
21635   case X86::TCRETURNri64:
21636   case X86::TCRETURNmi64:
21637     return BB;
21638   case X86::WIN_ALLOCA:
21639     return EmitLoweredWinAlloca(MI, BB);
21640   case X86::SEG_ALLOCA_32:
21641   case X86::SEG_ALLOCA_64:
21642     return EmitLoweredSegAlloca(MI, BB);
21643   case X86::TLSCall_32:
21644   case X86::TLSCall_64:
21645     return EmitLoweredTLSCall(MI, BB);
21646   case X86::CMOV_GR8:
21647   case X86::CMOV_FR32:
21648   case X86::CMOV_FR64:
21649   case X86::CMOV_V4F32:
21650   case X86::CMOV_V2F64:
21651   case X86::CMOV_V2I64:
21652   case X86::CMOV_V8F32:
21653   case X86::CMOV_V4F64:
21654   case X86::CMOV_V4I64:
21655   case X86::CMOV_V16F32:
21656   case X86::CMOV_V8F64:
21657   case X86::CMOV_V8I64:
21658   case X86::CMOV_GR16:
21659   case X86::CMOV_GR32:
21660   case X86::CMOV_RFP32:
21661   case X86::CMOV_RFP64:
21662   case X86::CMOV_RFP80:
21663     return EmitLoweredSelect(MI, BB);
21664
21665   case X86::FP32_TO_INT16_IN_MEM:
21666   case X86::FP32_TO_INT32_IN_MEM:
21667   case X86::FP32_TO_INT64_IN_MEM:
21668   case X86::FP64_TO_INT16_IN_MEM:
21669   case X86::FP64_TO_INT32_IN_MEM:
21670   case X86::FP64_TO_INT64_IN_MEM:
21671   case X86::FP80_TO_INT16_IN_MEM:
21672   case X86::FP80_TO_INT32_IN_MEM:
21673   case X86::FP80_TO_INT64_IN_MEM: {
21674     MachineFunction *F = BB->getParent();
21675     const TargetInstrInfo *TII = Subtarget->getInstrInfo();
21676     DebugLoc DL = MI->getDebugLoc();
21677
21678     // Change the floating point control register to use "round towards zero"
21679     // mode when truncating to an integer value.
21680     int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false);
21681     addFrameReference(BuildMI(*BB, MI, DL,
21682                               TII->get(X86::FNSTCW16m)), CWFrameIdx);
21683
21684     // Load the old value of the high byte of the control word...
21685     unsigned OldCW =
21686       F->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
21687     addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
21688                       CWFrameIdx);
21689
21690     // Set the high part to be round to zero...
21691     addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
21692       .addImm(0xC7F);
21693
21694     // Reload the modified control word now...
21695     addFrameReference(BuildMI(*BB, MI, DL,
21696                               TII->get(X86::FLDCW16m)), CWFrameIdx);
21697
21698     // Restore the memory image of control word to original value
21699     addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
21700       .addReg(OldCW);
21701
21702     // Get the X86 opcode to use.
21703     unsigned Opc;
21704     switch (MI->getOpcode()) {
21705     default: llvm_unreachable("illegal opcode!");
21706     case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
21707     case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
21708     case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
21709     case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
21710     case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
21711     case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
21712     case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
21713     case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
21714     case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
21715     }
21716
21717     X86AddressMode AM;
21718     MachineOperand &Op = MI->getOperand(0);
21719     if (Op.isReg()) {
21720       AM.BaseType = X86AddressMode::RegBase;
21721       AM.Base.Reg = Op.getReg();
21722     } else {
21723       AM.BaseType = X86AddressMode::FrameIndexBase;
21724       AM.Base.FrameIndex = Op.getIndex();
21725     }
21726     Op = MI->getOperand(1);
21727     if (Op.isImm())
21728       AM.Scale = Op.getImm();
21729     Op = MI->getOperand(2);
21730     if (Op.isImm())
21731       AM.IndexReg = Op.getImm();
21732     Op = MI->getOperand(3);
21733     if (Op.isGlobal()) {
21734       AM.GV = Op.getGlobal();
21735     } else {
21736       AM.Disp = Op.getImm();
21737     }
21738     addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
21739                       .addReg(MI->getOperand(X86::AddrNumOperands).getReg());
21740
21741     // Reload the original control word now.
21742     addFrameReference(BuildMI(*BB, MI, DL,
21743                               TII->get(X86::FLDCW16m)), CWFrameIdx);
21744
21745     MI->eraseFromParent();   // The pseudo instruction is gone now.
21746     return BB;
21747   }
21748     // String/text processing lowering.
21749   case X86::PCMPISTRM128REG:
21750   case X86::VPCMPISTRM128REG:
21751   case X86::PCMPISTRM128MEM:
21752   case X86::VPCMPISTRM128MEM:
21753   case X86::PCMPESTRM128REG:
21754   case X86::VPCMPESTRM128REG:
21755   case X86::PCMPESTRM128MEM:
21756   case X86::VPCMPESTRM128MEM:
21757     assert(Subtarget->hasSSE42() &&
21758            "Target must have SSE4.2 or AVX features enabled");
21759     return EmitPCMPSTRM(MI, BB, Subtarget->getInstrInfo());
21760
21761   // String/text processing lowering.
21762   case X86::PCMPISTRIREG:
21763   case X86::VPCMPISTRIREG:
21764   case X86::PCMPISTRIMEM:
21765   case X86::VPCMPISTRIMEM:
21766   case X86::PCMPESTRIREG:
21767   case X86::VPCMPESTRIREG:
21768   case X86::PCMPESTRIMEM:
21769   case X86::VPCMPESTRIMEM:
21770     assert(Subtarget->hasSSE42() &&
21771            "Target must have SSE4.2 or AVX features enabled");
21772     return EmitPCMPSTRI(MI, BB, Subtarget->getInstrInfo());
21773
21774   // Thread synchronization.
21775   case X86::MONITOR:
21776     return EmitMonitor(MI, BB, Subtarget);
21777
21778   // xbegin
21779   case X86::XBEGIN:
21780     return EmitXBegin(MI, BB, Subtarget->getInstrInfo());
21781
21782   case X86::VASTART_SAVE_XMM_REGS:
21783     return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
21784
21785   case X86::VAARG_64:
21786     return EmitVAARG64WithCustomInserter(MI, BB);
21787
21788   case X86::EH_SjLj_SetJmp32:
21789   case X86::EH_SjLj_SetJmp64:
21790     return emitEHSjLjSetJmp(MI, BB);
21791
21792   case X86::EH_SjLj_LongJmp32:
21793   case X86::EH_SjLj_LongJmp64:
21794     return emitEHSjLjLongJmp(MI, BB);
21795
21796   case TargetOpcode::STATEPOINT:
21797     // As an implementation detail, STATEPOINT shares the STACKMAP format at
21798     // this point in the process.  We diverge later.
21799     return emitPatchPoint(MI, BB);
21800
21801   case TargetOpcode::STACKMAP:
21802   case TargetOpcode::PATCHPOINT:
21803     return emitPatchPoint(MI, BB);
21804
21805   case X86::VFMADDPDr213r:
21806   case X86::VFMADDPSr213r:
21807   case X86::VFMADDSDr213r:
21808   case X86::VFMADDSSr213r:
21809   case X86::VFMSUBPDr213r:
21810   case X86::VFMSUBPSr213r:
21811   case X86::VFMSUBSDr213r:
21812   case X86::VFMSUBSSr213r:
21813   case X86::VFNMADDPDr213r:
21814   case X86::VFNMADDPSr213r:
21815   case X86::VFNMADDSDr213r:
21816   case X86::VFNMADDSSr213r:
21817   case X86::VFNMSUBPDr213r:
21818   case X86::VFNMSUBPSr213r:
21819   case X86::VFNMSUBSDr213r:
21820   case X86::VFNMSUBSSr213r:
21821   case X86::VFMADDSUBPDr213r:
21822   case X86::VFMADDSUBPSr213r:
21823   case X86::VFMSUBADDPDr213r:
21824   case X86::VFMSUBADDPSr213r:
21825   case X86::VFMADDPDr213rY:
21826   case X86::VFMADDPSr213rY:
21827   case X86::VFMSUBPDr213rY:
21828   case X86::VFMSUBPSr213rY:
21829   case X86::VFNMADDPDr213rY:
21830   case X86::VFNMADDPSr213rY:
21831   case X86::VFNMSUBPDr213rY:
21832   case X86::VFNMSUBPSr213rY:
21833   case X86::VFMADDSUBPDr213rY:
21834   case X86::VFMADDSUBPSr213rY:
21835   case X86::VFMSUBADDPDr213rY:
21836   case X86::VFMSUBADDPSr213rY:
21837     return emitFMA3Instr(MI, BB);
21838   }
21839 }
21840
21841 //===----------------------------------------------------------------------===//
21842 //                           X86 Optimization Hooks
21843 //===----------------------------------------------------------------------===//
21844
21845 void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
21846                                                       APInt &KnownZero,
21847                                                       APInt &KnownOne,
21848                                                       const SelectionDAG &DAG,
21849                                                       unsigned Depth) const {
21850   unsigned BitWidth = KnownZero.getBitWidth();
21851   unsigned Opc = Op.getOpcode();
21852   assert((Opc >= ISD::BUILTIN_OP_END ||
21853           Opc == ISD::INTRINSIC_WO_CHAIN ||
21854           Opc == ISD::INTRINSIC_W_CHAIN ||
21855           Opc == ISD::INTRINSIC_VOID) &&
21856          "Should use MaskedValueIsZero if you don't know whether Op"
21857          " is a target node!");
21858
21859   KnownZero = KnownOne = APInt(BitWidth, 0);   // Don't know anything.
21860   switch (Opc) {
21861   default: break;
21862   case X86ISD::ADD:
21863   case X86ISD::SUB:
21864   case X86ISD::ADC:
21865   case X86ISD::SBB:
21866   case X86ISD::SMUL:
21867   case X86ISD::UMUL:
21868   case X86ISD::INC:
21869   case X86ISD::DEC:
21870   case X86ISD::OR:
21871   case X86ISD::XOR:
21872   case X86ISD::AND:
21873     // These nodes' second result is a boolean.
21874     if (Op.getResNo() == 0)
21875       break;
21876     // Fallthrough
21877   case X86ISD::SETCC:
21878     KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
21879     break;
21880   case ISD::INTRINSIC_WO_CHAIN: {
21881     unsigned IntId = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
21882     unsigned NumLoBits = 0;
21883     switch (IntId) {
21884     default: break;
21885     case Intrinsic::x86_sse_movmsk_ps:
21886     case Intrinsic::x86_avx_movmsk_ps_256:
21887     case Intrinsic::x86_sse2_movmsk_pd:
21888     case Intrinsic::x86_avx_movmsk_pd_256:
21889     case Intrinsic::x86_mmx_pmovmskb:
21890     case Intrinsic::x86_sse2_pmovmskb_128:
21891     case Intrinsic::x86_avx2_pmovmskb: {
21892       // High bits of movmskp{s|d}, pmovmskb are known zero.
21893       switch (IntId) {
21894         default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
21895         case Intrinsic::x86_sse_movmsk_ps:      NumLoBits = 4; break;
21896         case Intrinsic::x86_avx_movmsk_ps_256:  NumLoBits = 8; break;
21897         case Intrinsic::x86_sse2_movmsk_pd:     NumLoBits = 2; break;
21898         case Intrinsic::x86_avx_movmsk_pd_256:  NumLoBits = 4; break;
21899         case Intrinsic::x86_mmx_pmovmskb:       NumLoBits = 8; break;
21900         case Intrinsic::x86_sse2_pmovmskb_128:  NumLoBits = 16; break;
21901         case Intrinsic::x86_avx2_pmovmskb:      NumLoBits = 32; break;
21902       }
21903       KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits);
21904       break;
21905     }
21906     }
21907     break;
21908   }
21909   }
21910 }
21911
21912 unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
21913   SDValue Op,
21914   const SelectionDAG &,
21915   unsigned Depth) const {
21916   // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
21917   if (Op.getOpcode() == X86ISD::SETCC_CARRY)
21918     return Op.getValueType().getScalarType().getSizeInBits();
21919
21920   // Fallback case.
21921   return 1;
21922 }
21923
21924 /// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the
21925 /// node is a GlobalAddress + offset.
21926 bool X86TargetLowering::isGAPlusOffset(SDNode *N,
21927                                        const GlobalValue* &GA,
21928                                        int64_t &Offset) const {
21929   if (N->getOpcode() == X86ISD::Wrapper) {
21930     if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
21931       GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
21932       Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
21933       return true;
21934     }
21935   }
21936   return TargetLowering::isGAPlusOffset(N, GA, Offset);
21937 }
21938
21939 /// isShuffleHigh128VectorInsertLow - Checks whether the shuffle node is the
21940 /// same as extracting the high 128-bit part of 256-bit vector and then
21941 /// inserting the result into the low part of a new 256-bit vector
21942 static bool isShuffleHigh128VectorInsertLow(ShuffleVectorSDNode *SVOp) {
21943   EVT VT = SVOp->getValueType(0);
21944   unsigned NumElems = VT.getVectorNumElements();
21945
21946   // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
21947   for (unsigned i = 0, j = NumElems/2; i != NumElems/2; ++i, ++j)
21948     if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
21949         SVOp->getMaskElt(j) >= 0)
21950       return false;
21951
21952   return true;
21953 }
21954
21955 /// isShuffleLow128VectorInsertHigh - Checks whether the shuffle node is the
21956 /// same as extracting the low 128-bit part of 256-bit vector and then
21957 /// inserting the result into the high part of a new 256-bit vector
21958 static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) {
21959   EVT VT = SVOp->getValueType(0);
21960   unsigned NumElems = VT.getVectorNumElements();
21961
21962   // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
21963   for (unsigned i = NumElems/2, j = 0; i != NumElems; ++i, ++j)
21964     if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
21965         SVOp->getMaskElt(j) >= 0)
21966       return false;
21967
21968   return true;
21969 }
21970
21971 /// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors.
21972 static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
21973                                         TargetLowering::DAGCombinerInfo &DCI,
21974                                         const X86Subtarget* Subtarget) {
21975   SDLoc dl(N);
21976   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
21977   SDValue V1 = SVOp->getOperand(0);
21978   SDValue V2 = SVOp->getOperand(1);
21979   EVT VT = SVOp->getValueType(0);
21980   unsigned NumElems = VT.getVectorNumElements();
21981
21982   if (V1.getOpcode() == ISD::CONCAT_VECTORS &&
21983       V2.getOpcode() == ISD::CONCAT_VECTORS) {
21984     //
21985     //                   0,0,0,...
21986     //                      |
21987     //    V      UNDEF    BUILD_VECTOR    UNDEF
21988     //     \      /           \           /
21989     //  CONCAT_VECTOR         CONCAT_VECTOR
21990     //         \                  /
21991     //          \                /
21992     //          RESULT: V + zero extended
21993     //
21994     if (V2.getOperand(0).getOpcode() != ISD::BUILD_VECTOR ||
21995         V2.getOperand(1).getOpcode() != ISD::UNDEF ||
21996         V1.getOperand(1).getOpcode() != ISD::UNDEF)
21997       return SDValue();
21998
21999     if (!ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode()))
22000       return SDValue();
22001
22002     // To match the shuffle mask, the first half of the mask should
22003     // be exactly the first vector, and all the rest a splat with the
22004     // first element of the second one.
22005     for (unsigned i = 0; i != NumElems/2; ++i)
22006       if (!isUndefOrEqual(SVOp->getMaskElt(i), i) ||
22007           !isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems))
22008         return SDValue();
22009
22010     // If V1 is coming from a vector load then just fold to a VZEXT_LOAD.
22011     if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(V1.getOperand(0))) {
22012       if (Ld->hasNUsesOfValue(1, 0)) {
22013         SDVTList Tys = DAG.getVTList(MVT::v4i64, MVT::Other);
22014         SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() };
22015         SDValue ResNode =
22016           DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
22017                                   Ld->getMemoryVT(),
22018                                   Ld->getPointerInfo(),
22019                                   Ld->getAlignment(),
22020                                   false/*isVolatile*/, true/*ReadMem*/,
22021                                   false/*WriteMem*/);
22022
22023         // Make sure the newly-created LOAD is in the same position as Ld in
22024         // terms of dependency. We create a TokenFactor for Ld and ResNode,
22025         // and update uses of Ld's output chain to use the TokenFactor.
22026         if (Ld->hasAnyUseOfValue(1)) {
22027           SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
22028                              SDValue(Ld, 1), SDValue(ResNode.getNode(), 1));
22029           DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
22030           DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(Ld, 1),
22031                                  SDValue(ResNode.getNode(), 1));
22032         }
22033
22034         return DAG.getNode(ISD::BITCAST, dl, VT, ResNode);
22035       }
22036     }
22037
22038     // Emit a zeroed vector and insert the desired subvector on its
22039     // first half.
22040     SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
22041     SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0), 0, DAG, dl);
22042     return DCI.CombineTo(N, InsV);
22043   }
22044
22045   //===--------------------------------------------------------------------===//
22046   // Combine some shuffles into subvector extracts and inserts:
22047   //
22048
22049   // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
22050   if (isShuffleHigh128VectorInsertLow(SVOp)) {
22051     SDValue V = Extract128BitVector(V1, NumElems/2, DAG, dl);
22052     SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, 0, DAG, dl);
22053     return DCI.CombineTo(N, InsV);
22054   }
22055
22056   // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
22057   if (isShuffleLow128VectorInsertHigh(SVOp)) {
22058     SDValue V = Extract128BitVector(V1, 0, DAG, dl);
22059     SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, NumElems/2, DAG, dl);
22060     return DCI.CombineTo(N, InsV);
22061   }
22062
22063   return SDValue();
22064 }
22065
22066 /// \brief Combine an arbitrary chain of shuffles into a single instruction if
22067 /// possible.
22068 ///
22069 /// This is the leaf of the recursive combinine below. When we have found some
22070 /// chain of single-use x86 shuffle instructions and accumulated the combined
22071 /// shuffle mask represented by them, this will try to pattern match that mask
22072 /// into either a single instruction if there is a special purpose instruction
22073 /// for this operation, or into a PSHUFB instruction which is a fully general
22074 /// instruction but should only be used to replace chains over a certain depth.
22075 static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
22076                                    int Depth, bool HasPSHUFB, SelectionDAG &DAG,
22077                                    TargetLowering::DAGCombinerInfo &DCI,
22078                                    const X86Subtarget *Subtarget) {
22079   assert(!Mask.empty() && "Cannot combine an empty shuffle mask!");
22080
22081   // Find the operand that enters the chain. Note that multiple uses are OK
22082   // here, we're not going to remove the operand we find.
22083   SDValue Input = Op.getOperand(0);
22084   while (Input.getOpcode() == ISD::BITCAST)
22085     Input = Input.getOperand(0);
22086
22087   MVT VT = Input.getSimpleValueType();
22088   MVT RootVT = Root.getSimpleValueType();
22089   SDLoc DL(Root);
22090
22091   // Just remove no-op shuffle masks.
22092   if (Mask.size() == 1) {
22093     DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Input),
22094                   /*AddTo*/ true);
22095     return true;
22096   }
22097
22098   // Use the float domain if the operand type is a floating point type.
22099   bool FloatDomain = VT.isFloatingPoint();
22100
22101   // For floating point shuffles, we don't have free copies in the shuffle
22102   // instructions or the ability to load as part of the instruction, so
22103   // canonicalize their shuffles to UNPCK or MOV variants.
22104   //
22105   // Note that even with AVX we prefer the PSHUFD form of shuffle for integer
22106   // vectors because it can have a load folded into it that UNPCK cannot. This
22107   // doesn't preclude something switching to the shorter encoding post-RA.
22108   if (FloatDomain) {
22109     if (Mask.equals(0, 0) || Mask.equals(1, 1)) {
22110       bool Lo = Mask.equals(0, 0);
22111       unsigned Shuffle;
22112       MVT ShuffleVT;
22113       // Check if we have SSE3 which will let us use MOVDDUP. That instruction
22114       // is no slower than UNPCKLPD but has the option to fold the input operand
22115       // into even an unaligned memory load.
22116       if (Lo && Subtarget->hasSSE3()) {
22117         Shuffle = X86ISD::MOVDDUP;
22118         ShuffleVT = MVT::v2f64;
22119       } else {
22120         // We have MOVLHPS and MOVHLPS throughout SSE and they encode smaller
22121         // than the UNPCK variants.
22122         Shuffle = Lo ? X86ISD::MOVLHPS : X86ISD::MOVHLPS;
22123         ShuffleVT = MVT::v4f32;
22124       }
22125       if (Depth == 1 && Root->getOpcode() == Shuffle)
22126         return false; // Nothing to do!
22127       Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
22128       DCI.AddToWorklist(Op.getNode());
22129       if (Shuffle == X86ISD::MOVDDUP)
22130         Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op);
22131       else
22132         Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
22133       DCI.AddToWorklist(Op.getNode());
22134       DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
22135                     /*AddTo*/ true);
22136       return true;
22137     }
22138     if (Subtarget->hasSSE3() &&
22139         (Mask.equals(0, 0, 2, 2) || Mask.equals(1, 1, 3, 3))) {
22140       bool Lo = Mask.equals(0, 0, 2, 2);
22141       unsigned Shuffle = Lo ? X86ISD::MOVSLDUP : X86ISD::MOVSHDUP;
22142       MVT ShuffleVT = MVT::v4f32;
22143       if (Depth == 1 && Root->getOpcode() == Shuffle)
22144         return false; // Nothing to do!
22145       Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
22146       DCI.AddToWorklist(Op.getNode());
22147       Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op);
22148       DCI.AddToWorklist(Op.getNode());
22149       DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
22150                     /*AddTo*/ true);
22151       return true;
22152     }
22153     if (Mask.equals(0, 0, 1, 1) || Mask.equals(2, 2, 3, 3)) {
22154       bool Lo = Mask.equals(0, 0, 1, 1);
22155       unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
22156       MVT ShuffleVT = MVT::v4f32;
22157       if (Depth == 1 && Root->getOpcode() == Shuffle)
22158         return false; // Nothing to do!
22159       Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
22160       DCI.AddToWorklist(Op.getNode());
22161       Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
22162       DCI.AddToWorklist(Op.getNode());
22163       DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
22164                     /*AddTo*/ true);
22165       return true;
22166     }
22167   }
22168
22169   // We always canonicalize the 8 x i16 and 16 x i8 shuffles into their UNPCK
22170   // variants as none of these have single-instruction variants that are
22171   // superior to the UNPCK formulation.
22172   if (!FloatDomain &&
22173       (Mask.equals(0, 0, 1, 1, 2, 2, 3, 3) ||
22174        Mask.equals(4, 4, 5, 5, 6, 6, 7, 7) ||
22175        Mask.equals(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7) ||
22176        Mask.equals(8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15,
22177                    15))) {
22178     bool Lo = Mask[0] == 0;
22179     unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
22180     if (Depth == 1 && Root->getOpcode() == Shuffle)
22181       return false; // Nothing to do!
22182     MVT ShuffleVT;
22183     switch (Mask.size()) {
22184     case 8:
22185       ShuffleVT = MVT::v8i16;
22186       break;
22187     case 16:
22188       ShuffleVT = MVT::v16i8;
22189       break;
22190     default:
22191       llvm_unreachable("Impossible mask size!");
22192     };
22193     Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
22194     DCI.AddToWorklist(Op.getNode());
22195     Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
22196     DCI.AddToWorklist(Op.getNode());
22197     DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
22198                   /*AddTo*/ true);
22199     return true;
22200   }
22201
22202   // Don't try to re-form single instruction chains under any circumstances now
22203   // that we've done encoding canonicalization for them.
22204   if (Depth < 2)
22205     return false;
22206
22207   // If we have 3 or more shuffle instructions or a chain involving PSHUFB, we
22208   // can replace them with a single PSHUFB instruction profitably. Intel's
22209   // manuals suggest only using PSHUFB if doing so replacing 5 instructions, but
22210   // in practice PSHUFB tends to be *very* fast so we're more aggressive.
22211   if ((Depth >= 3 || HasPSHUFB) && Subtarget->hasSSSE3()) {
22212     SmallVector<SDValue, 16> PSHUFBMask;
22213     assert(Mask.size() <= 16 && "Can't shuffle elements smaller than bytes!");
22214     int Ratio = 16 / Mask.size();
22215     for (unsigned i = 0; i < 16; ++i) {
22216       if (Mask[i / Ratio] == SM_SentinelUndef) {
22217         PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
22218         continue;
22219       }
22220       int M = Mask[i / Ratio] != SM_SentinelZero
22221                   ? Ratio * Mask[i / Ratio] + i % Ratio
22222                   : 255;
22223       PSHUFBMask.push_back(DAG.getConstant(M, MVT::i8));
22224     }
22225     Op = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Input);
22226     DCI.AddToWorklist(Op.getNode());
22227     SDValue PSHUFBMaskOp =
22228         DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, PSHUFBMask);
22229     DCI.AddToWorklist(PSHUFBMaskOp.getNode());
22230     Op = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, Op, PSHUFBMaskOp);
22231     DCI.AddToWorklist(Op.getNode());
22232     DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
22233                   /*AddTo*/ true);
22234     return true;
22235   }
22236
22237   // Failed to find any combines.
22238   return false;
22239 }
22240
22241 /// \brief Fully generic combining of x86 shuffle instructions.
22242 ///
22243 /// This should be the last combine run over the x86 shuffle instructions. Once
22244 /// they have been fully optimized, this will recursively consider all chains
22245 /// of single-use shuffle instructions, build a generic model of the cumulative
22246 /// shuffle operation, and check for simpler instructions which implement this
22247 /// operation. We use this primarily for two purposes:
22248 ///
22249 /// 1) Collapse generic shuffles to specialized single instructions when
22250 ///    equivalent. In most cases, this is just an encoding size win, but
22251 ///    sometimes we will collapse multiple generic shuffles into a single
22252 ///    special-purpose shuffle.
22253 /// 2) Look for sequences of shuffle instructions with 3 or more total
22254 ///    instructions, and replace them with the slightly more expensive SSSE3
22255 ///    PSHUFB instruction if available. We do this as the last combining step
22256 ///    to ensure we avoid using PSHUFB if we can implement the shuffle with
22257 ///    a suitable short sequence of other instructions. The PSHUFB will either
22258 ///    use a register or have to read from memory and so is slightly (but only
22259 ///    slightly) more expensive than the other shuffle instructions.
22260 ///
22261 /// Because this is inherently a quadratic operation (for each shuffle in
22262 /// a chain, we recurse up the chain), the depth is limited to 8 instructions.
22263 /// This should never be an issue in practice as the shuffle lowering doesn't
22264 /// produce sequences of more than 8 instructions.
22265 ///
22266 /// FIXME: We will currently miss some cases where the redundant shuffling
22267 /// would simplify under the threshold for PSHUFB formation because of
22268 /// combine-ordering. To fix this, we should do the redundant instruction
22269 /// combining in this recursive walk.
22270 static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,
22271                                           ArrayRef<int> RootMask,
22272                                           int Depth, bool HasPSHUFB,
22273                                           SelectionDAG &DAG,
22274                                           TargetLowering::DAGCombinerInfo &DCI,
22275                                           const X86Subtarget *Subtarget) {
22276   // Bound the depth of our recursive combine because this is ultimately
22277   // quadratic in nature.
22278   if (Depth > 8)
22279     return false;
22280
22281   // Directly rip through bitcasts to find the underlying operand.
22282   while (Op.getOpcode() == ISD::BITCAST && Op.getOperand(0).hasOneUse())
22283     Op = Op.getOperand(0);
22284
22285   MVT VT = Op.getSimpleValueType();
22286   if (!VT.isVector())
22287     return false; // Bail if we hit a non-vector.
22288   // FIXME: This routine should be taught about 256-bit shuffles, or a 256-bit
22289   // version should be added.
22290   if (VT.getSizeInBits() != 128)
22291     return false;
22292
22293   assert(Root.getSimpleValueType().isVector() &&
22294          "Shuffles operate on vector types!");
22295   assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() &&
22296          "Can only combine shuffles of the same vector register size.");
22297
22298   if (!isTargetShuffle(Op.getOpcode()))
22299     return false;
22300   SmallVector<int, 16> OpMask;
22301   bool IsUnary;
22302   bool HaveMask = getTargetShuffleMask(Op.getNode(), VT, OpMask, IsUnary);
22303   // We only can combine unary shuffles which we can decode the mask for.
22304   if (!HaveMask || !IsUnary)
22305     return false;
22306
22307   assert(VT.getVectorNumElements() == OpMask.size() &&
22308          "Different mask size from vector size!");
22309   assert(((RootMask.size() > OpMask.size() &&
22310            RootMask.size() % OpMask.size() == 0) ||
22311           (OpMask.size() > RootMask.size() &&
22312            OpMask.size() % RootMask.size() == 0) ||
22313           OpMask.size() == RootMask.size()) &&
22314          "The smaller number of elements must divide the larger.");
22315   int RootRatio = std::max<int>(1, OpMask.size() / RootMask.size());
22316   int OpRatio = std::max<int>(1, RootMask.size() / OpMask.size());
22317   assert(((RootRatio == 1 && OpRatio == 1) ||
22318           (RootRatio == 1) != (OpRatio == 1)) &&
22319          "Must not have a ratio for both incoming and op masks!");
22320
22321   SmallVector<int, 16> Mask;
22322   Mask.reserve(std::max(OpMask.size(), RootMask.size()));
22323
22324   // Merge this shuffle operation's mask into our accumulated mask. Note that
22325   // this shuffle's mask will be the first applied to the input, followed by the
22326   // root mask to get us all the way to the root value arrangement. The reason
22327   // for this order is that we are recursing up the operation chain.
22328   for (int i = 0, e = std::max(OpMask.size(), RootMask.size()); i < e; ++i) {
22329     int RootIdx = i / RootRatio;
22330     if (RootMask[RootIdx] < 0) {
22331       // This is a zero or undef lane, we're done.
22332       Mask.push_back(RootMask[RootIdx]);
22333       continue;
22334     }
22335
22336     int RootMaskedIdx = RootMask[RootIdx] * RootRatio + i % RootRatio;
22337     int OpIdx = RootMaskedIdx / OpRatio;
22338     if (OpMask[OpIdx] < 0) {
22339       // The incoming lanes are zero or undef, it doesn't matter which ones we
22340       // are using.
22341       Mask.push_back(OpMask[OpIdx]);
22342       continue;
22343     }
22344
22345     // Ok, we have non-zero lanes, map them through.
22346     Mask.push_back(OpMask[OpIdx] * OpRatio +
22347                    RootMaskedIdx % OpRatio);
22348   }
22349
22350   // See if we can recurse into the operand to combine more things.
22351   switch (Op.getOpcode()) {
22352     case X86ISD::PSHUFB:
22353       HasPSHUFB = true;
22354     case X86ISD::PSHUFD:
22355     case X86ISD::PSHUFHW:
22356     case X86ISD::PSHUFLW:
22357       if (Op.getOperand(0).hasOneUse() &&
22358           combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1,
22359                                         HasPSHUFB, DAG, DCI, Subtarget))
22360         return true;
22361       break;
22362
22363     case X86ISD::UNPCKL:
22364     case X86ISD::UNPCKH:
22365       assert(Op.getOperand(0) == Op.getOperand(1) && "We only combine unary shuffles!");
22366       // We can't check for single use, we have to check that this shuffle is the only user.
22367       if (Op->isOnlyUserOf(Op.getOperand(0).getNode()) &&
22368           combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1,
22369                                         HasPSHUFB, DAG, DCI, Subtarget))
22370           return true;
22371       break;
22372   }
22373
22374   // Minor canonicalization of the accumulated shuffle mask to make it easier
22375   // to match below. All this does is detect masks with squential pairs of
22376   // elements, and shrink them to the half-width mask. It does this in a loop
22377   // so it will reduce the size of the mask to the minimal width mask which
22378   // performs an equivalent shuffle.
22379   SmallVector<int, 16> WidenedMask;
22380   while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
22381     Mask = std::move(WidenedMask);
22382     WidenedMask.clear();
22383   }
22384
22385   return combineX86ShuffleChain(Op, Root, Mask, Depth, HasPSHUFB, DAG, DCI,
22386                                 Subtarget);
22387 }
22388
22389 /// \brief Get the PSHUF-style mask from PSHUF node.
22390 ///
22391 /// This is a very minor wrapper around getTargetShuffleMask to easy forming v4
22392 /// PSHUF-style masks that can be reused with such instructions.
22393 static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
22394   SmallVector<int, 4> Mask;
22395   bool IsUnary;
22396   bool HaveMask = getTargetShuffleMask(N.getNode(), N.getSimpleValueType(), Mask, IsUnary);
22397   (void)HaveMask;
22398   assert(HaveMask);
22399
22400   switch (N.getOpcode()) {
22401   case X86ISD::PSHUFD:
22402     return Mask;
22403   case X86ISD::PSHUFLW:
22404     Mask.resize(4);
22405     return Mask;
22406   case X86ISD::PSHUFHW:
22407     Mask.erase(Mask.begin(), Mask.begin() + 4);
22408     for (int &M : Mask)
22409       M -= 4;
22410     return Mask;
22411   default:
22412     llvm_unreachable("No valid shuffle instruction found!");
22413   }
22414 }
22415
22416 /// \brief Search for a combinable shuffle across a chain ending in pshufd.
22417 ///
22418 /// We walk up the chain and look for a combinable shuffle, skipping over
22419 /// shuffles that we could hoist this shuffle's transformation past without
22420 /// altering anything.
22421 static SDValue
22422 combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
22423                              SelectionDAG &DAG,
22424                              TargetLowering::DAGCombinerInfo &DCI) {
22425   assert(N.getOpcode() == X86ISD::PSHUFD &&
22426          "Called with something other than an x86 128-bit half shuffle!");
22427   SDLoc DL(N);
22428
22429   // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
22430   // of the shuffles in the chain so that we can form a fresh chain to replace
22431   // this one.
22432   SmallVector<SDValue, 8> Chain;
22433   SDValue V = N.getOperand(0);
22434   for (; V.hasOneUse(); V = V.getOperand(0)) {
22435     switch (V.getOpcode()) {
22436     default:
22437       return SDValue(); // Nothing combined!
22438
22439     case ISD::BITCAST:
22440       // Skip bitcasts as we always know the type for the target specific
22441       // instructions.
22442       continue;
22443
22444     case X86ISD::PSHUFD:
22445       // Found another dword shuffle.
22446       break;
22447
22448     case X86ISD::PSHUFLW:
22449       // Check that the low words (being shuffled) are the identity in the
22450       // dword shuffle, and the high words are self-contained.
22451       if (Mask[0] != 0 || Mask[1] != 1 ||
22452           !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
22453         return SDValue();
22454
22455       Chain.push_back(V);
22456       continue;
22457
22458     case X86ISD::PSHUFHW:
22459       // Check that the high words (being shuffled) are the identity in the
22460       // dword shuffle, and the low words are self-contained.
22461       if (Mask[2] != 2 || Mask[3] != 3 ||
22462           !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
22463         return SDValue();
22464
22465       Chain.push_back(V);
22466       continue;
22467
22468     case X86ISD::UNPCKL:
22469     case X86ISD::UNPCKH:
22470       // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
22471       // shuffle into a preceding word shuffle.
22472       if (V.getValueType() != MVT::v16i8 && V.getValueType() != MVT::v8i16)
22473         return SDValue();
22474
22475       // Search for a half-shuffle which we can combine with.
22476       unsigned CombineOp =
22477           V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
22478       if (V.getOperand(0) != V.getOperand(1) ||
22479           !V->isOnlyUserOf(V.getOperand(0).getNode()))
22480         return SDValue();
22481       Chain.push_back(V);
22482       V = V.getOperand(0);
22483       do {
22484         switch (V.getOpcode()) {
22485         default:
22486           return SDValue(); // Nothing to combine.
22487
22488         case X86ISD::PSHUFLW:
22489         case X86ISD::PSHUFHW:
22490           if (V.getOpcode() == CombineOp)
22491             break;
22492
22493           Chain.push_back(V);
22494
22495           // Fallthrough!
22496         case ISD::BITCAST:
22497           V = V.getOperand(0);
22498           continue;
22499         }
22500         break;
22501       } while (V.hasOneUse());
22502       break;
22503     }
22504     // Break out of the loop if we break out of the switch.
22505     break;
22506   }
22507
22508   if (!V.hasOneUse())
22509     // We fell out of the loop without finding a viable combining instruction.
22510     return SDValue();
22511
22512   // Merge this node's mask and our incoming mask.
22513   SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
22514   for (int &M : Mask)
22515     M = VMask[M];
22516   V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
22517                   getV4X86ShuffleImm8ForMask(Mask, DAG));
22518
22519   // Rebuild the chain around this new shuffle.
22520   while (!Chain.empty()) {
22521     SDValue W = Chain.pop_back_val();
22522
22523     if (V.getValueType() != W.getOperand(0).getValueType())
22524       V = DAG.getNode(ISD::BITCAST, DL, W.getOperand(0).getValueType(), V);
22525
22526     switch (W.getOpcode()) {
22527     default:
22528       llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
22529
22530     case X86ISD::UNPCKL:
22531     case X86ISD::UNPCKH:
22532       V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
22533       break;
22534
22535     case X86ISD::PSHUFD:
22536     case X86ISD::PSHUFLW:
22537     case X86ISD::PSHUFHW:
22538       V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
22539       break;
22540     }
22541   }
22542   if (V.getValueType() != N.getValueType())
22543     V = DAG.getNode(ISD::BITCAST, DL, N.getValueType(), V);
22544
22545   // Return the new chain to replace N.
22546   return V;
22547 }
22548
22549 /// \brief Search for a combinable shuffle across a chain ending in pshuflw or pshufhw.
22550 ///
22551 /// We walk up the chain, skipping shuffles of the other half and looking
22552 /// through shuffles which switch halves trying to find a shuffle of the same
22553 /// pair of dwords.
22554 static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask,
22555                                         SelectionDAG &DAG,
22556                                         TargetLowering::DAGCombinerInfo &DCI) {
22557   assert(
22558       (N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) &&
22559       "Called with something other than an x86 128-bit half shuffle!");
22560   SDLoc DL(N);
22561   unsigned CombineOpcode = N.getOpcode();
22562
22563   // Walk up a single-use chain looking for a combinable shuffle.
22564   SDValue V = N.getOperand(0);
22565   for (; V.hasOneUse(); V = V.getOperand(0)) {
22566     switch (V.getOpcode()) {
22567     default:
22568       return false; // Nothing combined!
22569
22570     case ISD::BITCAST:
22571       // Skip bitcasts as we always know the type for the target specific
22572       // instructions.
22573       continue;
22574
22575     case X86ISD::PSHUFLW:
22576     case X86ISD::PSHUFHW:
22577       if (V.getOpcode() == CombineOpcode)
22578         break;
22579
22580       // Other-half shuffles are no-ops.
22581       continue;
22582     }
22583     // Break out of the loop if we break out of the switch.
22584     break;
22585   }
22586
22587   if (!V.hasOneUse())
22588     // We fell out of the loop without finding a viable combining instruction.
22589     return false;
22590
22591   // Combine away the bottom node as its shuffle will be accumulated into
22592   // a preceding shuffle.
22593   DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
22594
22595   // Record the old value.
22596   SDValue Old = V;
22597
22598   // Merge this node's mask and our incoming mask (adjusted to account for all
22599   // the pshufd instructions encountered).
22600   SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
22601   for (int &M : Mask)
22602     M = VMask[M];
22603   V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0),
22604                   getV4X86ShuffleImm8ForMask(Mask, DAG));
22605
22606   // Check that the shuffles didn't cancel each other out. If not, we need to
22607   // combine to the new one.
22608   if (Old != V)
22609     // Replace the combinable shuffle with the combined one, updating all users
22610     // so that we re-evaluate the chain here.
22611     DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true);
22612
22613   return true;
22614 }
22615
22616 /// \brief Try to combine x86 target specific shuffles.
22617 static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,
22618                                            TargetLowering::DAGCombinerInfo &DCI,
22619                                            const X86Subtarget *Subtarget) {
22620   SDLoc DL(N);
22621   MVT VT = N.getSimpleValueType();
22622   SmallVector<int, 4> Mask;
22623
22624   switch (N.getOpcode()) {
22625   case X86ISD::PSHUFD:
22626   case X86ISD::PSHUFLW:
22627   case X86ISD::PSHUFHW:
22628     Mask = getPSHUFShuffleMask(N);
22629     assert(Mask.size() == 4);
22630     break;
22631   default:
22632     return SDValue();
22633   }
22634
22635   // Nuke no-op shuffles that show up after combining.
22636   if (isNoopShuffleMask(Mask))
22637     return DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
22638
22639   // Look for simplifications involving one or two shuffle instructions.
22640   SDValue V = N.getOperand(0);
22641   switch (N.getOpcode()) {
22642   default:
22643     break;
22644   case X86ISD::PSHUFLW:
22645   case X86ISD::PSHUFHW:
22646     assert(VT == MVT::v8i16);
22647     (void)VT;
22648
22649     if (combineRedundantHalfShuffle(N, Mask, DAG, DCI))
22650       return SDValue(); // We combined away this shuffle, so we're done.
22651
22652     // See if this reduces to a PSHUFD which is no more expensive and can
22653     // combine with more operations. Note that it has to at least flip the
22654     // dwords as otherwise it would have been removed as a no-op.
22655     if (Mask[0] == 2 && Mask[1] == 3 && Mask[2] == 0 && Mask[3] == 1) {
22656       int DMask[] = {0, 1, 2, 3};
22657       int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
22658       DMask[DOffset + 0] = DOffset + 1;
22659       DMask[DOffset + 1] = DOffset + 0;
22660       V = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V);
22661       DCI.AddToWorklist(V.getNode());
22662       V = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V,
22663                       getV4X86ShuffleImm8ForMask(DMask, DAG));
22664       DCI.AddToWorklist(V.getNode());
22665       return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V);
22666     }
22667
22668     // Look for shuffle patterns which can be implemented as a single unpack.
22669     // FIXME: This doesn't handle the location of the PSHUFD generically, and
22670     // only works when we have a PSHUFD followed by two half-shuffles.
22671     if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
22672         (V.getOpcode() == X86ISD::PSHUFLW ||
22673          V.getOpcode() == X86ISD::PSHUFHW) &&
22674         V.getOpcode() != N.getOpcode() &&
22675         V.hasOneUse()) {
22676       SDValue D = V.getOperand(0);
22677       while (D.getOpcode() == ISD::BITCAST && D.hasOneUse())
22678         D = D.getOperand(0);
22679       if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {
22680         SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
22681         SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
22682         int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
22683         int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
22684         int WordMask[8];
22685         for (int i = 0; i < 4; ++i) {
22686           WordMask[i + NOffset] = Mask[i] + NOffset;
22687           WordMask[i + VOffset] = VMask[i] + VOffset;
22688         }
22689         // Map the word mask through the DWord mask.
22690         int MappedMask[8];
22691         for (int i = 0; i < 8; ++i)
22692           MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
22693         const int UnpackLoMask[] = {0, 0, 1, 1, 2, 2, 3, 3};
22694         const int UnpackHiMask[] = {4, 4, 5, 5, 6, 6, 7, 7};
22695         if (std::equal(std::begin(MappedMask), std::end(MappedMask),
22696                        std::begin(UnpackLoMask)) ||
22697             std::equal(std::begin(MappedMask), std::end(MappedMask),
22698                        std::begin(UnpackHiMask))) {
22699           // We can replace all three shuffles with an unpack.
22700           V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, D.getOperand(0));
22701           DCI.AddToWorklist(V.getNode());
22702           return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
22703                                                 : X86ISD::UNPCKH,
22704                              DL, MVT::v8i16, V, V);
22705         }
22706       }
22707     }
22708
22709     break;
22710
22711   case X86ISD::PSHUFD:
22712     if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG, DCI))
22713       return NewN;
22714
22715     break;
22716   }
22717
22718   return SDValue();
22719 }
22720
22721 /// \brief Try to combine a shuffle into a target-specific add-sub node.
22722 ///
22723 /// We combine this directly on the abstract vector shuffle nodes so it is
22724 /// easier to generically match. We also insert dummy vector shuffle nodes for
22725 /// the operands which explicitly discard the lanes which are unused by this
22726 /// operation to try to flow through the rest of the combiner the fact that
22727 /// they're unused.
22728 static SDValue combineShuffleToAddSub(SDNode *N, SelectionDAG &DAG) {
22729   SDLoc DL(N);
22730   EVT VT = N->getValueType(0);
22731
22732   // We only handle target-independent shuffles.
22733   // FIXME: It would be easy and harmless to use the target shuffle mask
22734   // extraction tool to support more.
22735   if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
22736     return SDValue();
22737
22738   auto *SVN = cast<ShuffleVectorSDNode>(N);
22739   ArrayRef<int> Mask = SVN->getMask();
22740   SDValue V1 = N->getOperand(0);
22741   SDValue V2 = N->getOperand(1);
22742
22743   // We require the first shuffle operand to be the SUB node, and the second to
22744   // be the ADD node.
22745   // FIXME: We should support the commuted patterns.
22746   if (V1->getOpcode() != ISD::FSUB || V2->getOpcode() != ISD::FADD)
22747     return SDValue();
22748
22749   // If there are other uses of these operations we can't fold them.
22750   if (!V1->hasOneUse() || !V2->hasOneUse())
22751     return SDValue();
22752
22753   // Ensure that both operations have the same operands. Note that we can
22754   // commute the FADD operands.
22755   SDValue LHS = V1->getOperand(0), RHS = V1->getOperand(1);
22756   if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
22757       (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
22758     return SDValue();
22759
22760   // We're looking for blends between FADD and FSUB nodes. We insist on these
22761   // nodes being lined up in a specific expected pattern.
22762   if (!(isShuffleEquivalent(Mask, 0, 3) ||
22763         isShuffleEquivalent(Mask, 0, 5, 2, 7) ||
22764         isShuffleEquivalent(Mask, 0, 9, 2, 11, 4, 13, 6, 15)))
22765     return SDValue();
22766
22767   // Only specific types are legal at this point, assert so we notice if and
22768   // when these change.
22769   assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v8f32 ||
22770           VT == MVT::v4f64) &&
22771          "Unknown vector type encountered!");
22772
22773   return DAG.getNode(X86ISD::ADDSUB, DL, VT, LHS, RHS);
22774 }
22775
22776 /// PerformShuffleCombine - Performs several different shuffle combines.
22777 static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
22778                                      TargetLowering::DAGCombinerInfo &DCI,
22779                                      const X86Subtarget *Subtarget) {
22780   SDLoc dl(N);
22781   SDValue N0 = N->getOperand(0);
22782   SDValue N1 = N->getOperand(1);
22783   EVT VT = N->getValueType(0);
22784
22785   // Don't create instructions with illegal types after legalize types has run.
22786   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
22787   if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType()))
22788     return SDValue();
22789
22790   // If we have legalized the vector types, look for blends of FADD and FSUB
22791   // nodes that we can fuse into an ADDSUB node.
22792   if (TLI.isTypeLegal(VT) && Subtarget->hasSSE3())
22793     if (SDValue AddSub = combineShuffleToAddSub(N, DAG))
22794       return AddSub;
22795
22796   // Combine 256-bit vector shuffles. This is only profitable when in AVX mode
22797   if (Subtarget->hasFp256() && VT.is256BitVector() &&
22798       N->getOpcode() == ISD::VECTOR_SHUFFLE)
22799     return PerformShuffleCombine256(N, DAG, DCI, Subtarget);
22800
22801   // During Type Legalization, when promoting illegal vector types,
22802   // the backend might introduce new shuffle dag nodes and bitcasts.
22803   //
22804   // This code performs the following transformation:
22805   // fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) ->
22806   //       (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>)
22807   //
22808   // We do this only if both the bitcast and the BINOP dag nodes have
22809   // one use. Also, perform this transformation only if the new binary
22810   // operation is legal. This is to avoid introducing dag nodes that
22811   // potentially need to be further expanded (or custom lowered) into a
22812   // less optimal sequence of dag nodes.
22813   if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() &&
22814       N1.getOpcode() == ISD::UNDEF && N0.hasOneUse() &&
22815       N0.getOpcode() == ISD::BITCAST) {
22816     SDValue BC0 = N0.getOperand(0);
22817     EVT SVT = BC0.getValueType();
22818     unsigned Opcode = BC0.getOpcode();
22819     unsigned NumElts = VT.getVectorNumElements();
22820
22821     if (BC0.hasOneUse() && SVT.isVector() &&
22822         SVT.getVectorNumElements() * 2 == NumElts &&
22823         TLI.isOperationLegal(Opcode, VT)) {
22824       bool CanFold = false;
22825       switch (Opcode) {
22826       default : break;
22827       case ISD::ADD :
22828       case ISD::FADD :
22829       case ISD::SUB :
22830       case ISD::FSUB :
22831       case ISD::MUL :
22832       case ISD::FMUL :
22833         CanFold = true;
22834       }
22835
22836       unsigned SVTNumElts = SVT.getVectorNumElements();
22837       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
22838       for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i)
22839         CanFold = SVOp->getMaskElt(i) == (int)(i * 2);
22840       for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i)
22841         CanFold = SVOp->getMaskElt(i) < 0;
22842
22843       if (CanFold) {
22844         SDValue BC00 = DAG.getNode(ISD::BITCAST, dl, VT, BC0.getOperand(0));
22845         SDValue BC01 = DAG.getNode(ISD::BITCAST, dl, VT, BC0.getOperand(1));
22846         SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01);
22847         return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, &SVOp->getMask()[0]);
22848       }
22849     }
22850   }
22851
22852   // Only handle 128 wide vector from here on.
22853   if (!VT.is128BitVector())
22854     return SDValue();
22855
22856   // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
22857   // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
22858   // consecutive, non-overlapping, and in the right order.
22859   SmallVector<SDValue, 16> Elts;
22860   for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
22861     Elts.push_back(getShuffleScalarElt(N, i, DAG, 0));
22862
22863   SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true);
22864   if (LD.getNode())
22865     return LD;
22866
22867   if (isTargetShuffle(N->getOpcode())) {
22868     SDValue Shuffle =
22869         PerformTargetShuffleCombine(SDValue(N, 0), DAG, DCI, Subtarget);
22870     if (Shuffle.getNode())
22871       return Shuffle;
22872
22873     // Try recursively combining arbitrary sequences of x86 shuffle
22874     // instructions into higher-order shuffles. We do this after combining
22875     // specific PSHUF instruction sequences into their minimal form so that we
22876     // can evaluate how many specialized shuffle instructions are involved in
22877     // a particular chain.
22878     SmallVector<int, 1> NonceMask; // Just a placeholder.
22879     NonceMask.push_back(0);
22880     if (combineX86ShufflesRecursively(SDValue(N, 0), SDValue(N, 0), NonceMask,
22881                                       /*Depth*/ 1, /*HasPSHUFB*/ false, DAG,
22882                                       DCI, Subtarget))
22883       return SDValue(); // This routine will use CombineTo to replace N.
22884   }
22885
22886   return SDValue();
22887 }
22888
22889 /// PerformTruncateCombine - Converts truncate operation to
22890 /// a sequence of vector shuffle operations.
22891 /// It is possible when we truncate 256-bit vector to 128-bit vector
22892 static SDValue PerformTruncateCombine(SDNode *N, SelectionDAG &DAG,
22893                                       TargetLowering::DAGCombinerInfo &DCI,
22894                                       const X86Subtarget *Subtarget)  {
22895   return SDValue();
22896 }
22897
22898 /// XFormVExtractWithShuffleIntoLoad - Check if a vector extract from a target
22899 /// specific shuffle of a load can be folded into a single element load.
22900 /// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
22901 /// shuffles have been custom lowered so we need to handle those here.
22902 static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
22903                                          TargetLowering::DAGCombinerInfo &DCI) {
22904   if (DCI.isBeforeLegalizeOps())
22905     return SDValue();
22906
22907   SDValue InVec = N->getOperand(0);
22908   SDValue EltNo = N->getOperand(1);
22909
22910   if (!isa<ConstantSDNode>(EltNo))
22911     return SDValue();
22912
22913   EVT OriginalVT = InVec.getValueType();
22914
22915   if (InVec.getOpcode() == ISD::BITCAST) {
22916     // Don't duplicate a load with other uses.
22917     if (!InVec.hasOneUse())
22918       return SDValue();
22919     EVT BCVT = InVec.getOperand(0).getValueType();
22920     if (BCVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
22921       return SDValue();
22922     InVec = InVec.getOperand(0);
22923   }
22924
22925   EVT CurrentVT = InVec.getValueType();
22926
22927   if (!isTargetShuffle(InVec.getOpcode()))
22928     return SDValue();
22929
22930   // Don't duplicate a load with other uses.
22931   if (!InVec.hasOneUse())
22932     return SDValue();
22933
22934   SmallVector<int, 16> ShuffleMask;
22935   bool UnaryShuffle;
22936   if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(),
22937                             ShuffleMask, UnaryShuffle))
22938     return SDValue();
22939
22940   // Select the input vector, guarding against out of range extract vector.
22941   unsigned NumElems = CurrentVT.getVectorNumElements();
22942   int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
22943   int Idx = (Elt > (int)NumElems) ? -1 : ShuffleMask[Elt];
22944   SDValue LdNode = (Idx < (int)NumElems) ? InVec.getOperand(0)
22945                                          : InVec.getOperand(1);
22946
22947   // If inputs to shuffle are the same for both ops, then allow 2 uses
22948   unsigned AllowedUses = InVec.getNumOperands() > 1 &&
22949                          InVec.getOperand(0) == InVec.getOperand(1) ? 2 : 1;
22950
22951   if (LdNode.getOpcode() == ISD::BITCAST) {
22952     // Don't duplicate a load with other uses.
22953     if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
22954       return SDValue();
22955
22956     AllowedUses = 1; // only allow 1 load use if we have a bitcast
22957     LdNode = LdNode.getOperand(0);
22958   }
22959
22960   if (!ISD::isNormalLoad(LdNode.getNode()))
22961     return SDValue();
22962
22963   LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);
22964
22965   if (!LN0 ||!LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
22966     return SDValue();
22967
22968   EVT EltVT = N->getValueType(0);
22969   // If there's a bitcast before the shuffle, check if the load type and
22970   // alignment is valid.
22971   unsigned Align = LN0->getAlignment();
22972   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
22973   unsigned NewAlign = TLI.getDataLayout()->getABITypeAlignment(
22974       EltVT.getTypeForEVT(*DAG.getContext()));
22975
22976   if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT))
22977     return SDValue();
22978
22979   // All checks match so transform back to vector_shuffle so that DAG combiner
22980   // can finish the job
22981   SDLoc dl(N);
22982
22983   // Create shuffle node taking into account the case that its a unary shuffle
22984   SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT)
22985                                    : InVec.getOperand(1);
22986   Shuffle = DAG.getVectorShuffle(CurrentVT, dl,
22987                                  InVec.getOperand(0), Shuffle,
22988                                  &ShuffleMask[0]);
22989   Shuffle = DAG.getNode(ISD::BITCAST, dl, OriginalVT, Shuffle);
22990   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
22991                      EltNo);
22992 }
22993
22994 /// \brief Detect bitcasts between i32 to x86mmx low word. Since MMX types are
22995 /// special and don't usually play with other vector types, it's better to
22996 /// handle them early to be sure we emit efficient code by avoiding
22997 /// store-load conversions.
22998 static SDValue PerformBITCASTCombine(SDNode *N, SelectionDAG &DAG) {
22999   if (N->getValueType(0) != MVT::x86mmx ||
23000       N->getOperand(0)->getOpcode() != ISD::BUILD_VECTOR ||
23001       N->getOperand(0)->getValueType(0) != MVT::v2i32)
23002     return SDValue();
23003
23004   SDValue V = N->getOperand(0);
23005   ConstantSDNode *C = dyn_cast<ConstantSDNode>(V.getOperand(1));
23006   if (C && C->getZExtValue() == 0 && V.getOperand(0).getValueType() == MVT::i32)
23007     return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(V.getOperand(0)),
23008                        N->getValueType(0), V.getOperand(0));
23009
23010   return SDValue();
23011 }
23012
23013 /// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index
23014 /// generation and convert it from being a bunch of shuffles and extracts
23015 /// into a somewhat faster sequence. For i686, the best sequence is apparently
23016 /// storing the value and loading scalars back, while for x64 we should
23017 /// use 64-bit extracts and shifts.
23018 static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
23019                                          TargetLowering::DAGCombinerInfo &DCI) {
23020   SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI);
23021   if (NewOp.getNode())
23022     return NewOp;
23023
23024   SDValue InputVector = N->getOperand(0);
23025
23026   // Detect mmx to i32 conversion through a v2i32 elt extract.
23027   if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
23028       N->getValueType(0) == MVT::i32 &&
23029       InputVector.getValueType() == MVT::v2i32) {
23030
23031     // The bitcast source is a direct mmx result.
23032     SDValue MMXSrc = InputVector.getNode()->getOperand(0);
23033     if (MMXSrc.getValueType() == MVT::x86mmx)
23034       return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector),
23035                          N->getValueType(0),
23036                          InputVector.getNode()->getOperand(0));
23037
23038     // The mmx is indirect: (i64 extract_elt (v1i64 bitcast (x86mmx ...))).
23039     SDValue MMXSrcOp = MMXSrc.getOperand(0);
23040     if (MMXSrc.getOpcode() == ISD::EXTRACT_VECTOR_ELT && MMXSrc.hasOneUse() &&
23041         MMXSrc.getValueType() == MVT::i64 && MMXSrcOp.hasOneUse() &&
23042         MMXSrcOp.getOpcode() == ISD::BITCAST &&
23043         MMXSrcOp.getValueType() == MVT::v1i64 &&
23044         MMXSrcOp.getOperand(0).getValueType() == MVT::x86mmx)
23045       return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector),
23046                          N->getValueType(0),
23047                          MMXSrcOp.getOperand(0));
23048   }
23049
23050   // Only operate on vectors of 4 elements, where the alternative shuffling
23051   // gets to be more expensive.
23052   if (InputVector.getValueType() != MVT::v4i32)
23053     return SDValue();
23054
23055   // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
23056   // single use which is a sign-extend or zero-extend, and all elements are
23057   // used.
23058   SmallVector<SDNode *, 4> Uses;
23059   unsigned ExtractedElements = 0;
23060   for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
23061        UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
23062     if (UI.getUse().getResNo() != InputVector.getResNo())
23063       return SDValue();
23064
23065     SDNode *Extract = *UI;
23066     if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
23067       return SDValue();
23068
23069     if (Extract->getValueType(0) != MVT::i32)
23070       return SDValue();
23071     if (!Extract->hasOneUse())
23072       return SDValue();
23073     if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
23074         Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
23075       return SDValue();
23076     if (!isa<ConstantSDNode>(Extract->getOperand(1)))
23077       return SDValue();
23078
23079     // Record which element was extracted.
23080     ExtractedElements |=
23081       1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue();
23082
23083     Uses.push_back(Extract);
23084   }
23085
23086   // If not all the elements were used, this may not be worthwhile.
23087   if (ExtractedElements != 15)
23088     return SDValue();
23089
23090   // Ok, we've now decided to do the transformation.
23091   // If 64-bit shifts are legal, use the extract-shift sequence,
23092   // otherwise bounce the vector off the cache.
23093   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23094   SDValue Vals[4];
23095   SDLoc dl(InputVector);
23096
23097   if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) {
23098     SDValue Cst = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, InputVector);
23099     EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy();
23100     SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
23101       DAG.getConstant(0, VecIdxTy));
23102     SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
23103       DAG.getConstant(1, VecIdxTy));
23104
23105     SDValue ShAmt = DAG.getConstant(32,
23106       DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64));
23107     Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf);
23108     Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
23109       DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt));
23110     Vals[2] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, TopHalf);
23111     Vals[3] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
23112       DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt));
23113   } else {
23114     // Store the value to a temporary stack slot.
23115     SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
23116     SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
23117       MachinePointerInfo(), false, false, 0);
23118
23119     EVT ElementType = InputVector.getValueType().getVectorElementType();
23120     unsigned EltSize = ElementType.getSizeInBits() / 8;
23121
23122     // Replace each use (extract) with a load of the appropriate element.
23123     for (unsigned i = 0; i < 4; ++i) {
23124       uint64_t Offset = EltSize * i;
23125       SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy());
23126
23127       SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(),
23128                                        StackPtr, OffsetVal);
23129
23130       // Load the scalar.
23131       Vals[i] = DAG.getLoad(ElementType, dl, Ch,
23132                             ScalarAddr, MachinePointerInfo(),
23133                             false, false, false, 0);
23134
23135     }
23136   }
23137
23138   // Replace the extracts
23139   for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
23140     UE = Uses.end(); UI != UE; ++UI) {
23141     SDNode *Extract = *UI;
23142
23143     SDValue Idx = Extract->getOperand(1);
23144     uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
23145     DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]);
23146   }
23147
23148   // The replacement was made in place; don't return anything.
23149   return SDValue();
23150 }
23151
23152 /// \brief Matches a VSELECT onto min/max or return 0 if the node doesn't match.
23153 static std::pair<unsigned, bool>
23154 matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, SDValue RHS,
23155                    SelectionDAG &DAG, const X86Subtarget *Subtarget) {
23156   if (!VT.isVector())
23157     return std::make_pair(0, false);
23158
23159   bool NeedSplit = false;
23160   switch (VT.getSimpleVT().SimpleTy) {
23161   default: return std::make_pair(0, false);
23162   case MVT::v4i64:
23163   case MVT::v2i64:
23164     if (!Subtarget->hasVLX())
23165       return std::make_pair(0, false);
23166     break;
23167   case MVT::v64i8:
23168   case MVT::v32i16:
23169     if (!Subtarget->hasBWI())
23170       return std::make_pair(0, false);
23171     break;
23172   case MVT::v16i32:
23173   case MVT::v8i64:
23174     if (!Subtarget->hasAVX512())
23175       return std::make_pair(0, false);
23176     break;
23177   case MVT::v32i8:
23178   case MVT::v16i16:
23179   case MVT::v8i32:
23180     if (!Subtarget->hasAVX2())
23181       NeedSplit = true;
23182     if (!Subtarget->hasAVX())
23183       return std::make_pair(0, false);
23184     break;
23185   case MVT::v16i8:
23186   case MVT::v8i16:
23187   case MVT::v4i32:
23188     if (!Subtarget->hasSSE2())
23189       return std::make_pair(0, false);
23190   }
23191
23192   // SSE2 has only a small subset of the operations.
23193   bool hasUnsigned = Subtarget->hasSSE41() ||
23194                      (Subtarget->hasSSE2() && VT == MVT::v16i8);
23195   bool hasSigned = Subtarget->hasSSE41() ||
23196                    (Subtarget->hasSSE2() && VT == MVT::v8i16);
23197
23198   ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
23199
23200   unsigned Opc = 0;
23201   // Check for x CC y ? x : y.
23202   if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
23203       DAG.isEqualTo(RHS, Cond.getOperand(1))) {
23204     switch (CC) {
23205     default: break;
23206     case ISD::SETULT:
23207     case ISD::SETULE:
23208       Opc = hasUnsigned ? X86ISD::UMIN : 0; break;
23209     case ISD::SETUGT:
23210     case ISD::SETUGE:
23211       Opc = hasUnsigned ? X86ISD::UMAX : 0; break;
23212     case ISD::SETLT:
23213     case ISD::SETLE:
23214       Opc = hasSigned ? X86ISD::SMIN : 0; break;
23215     case ISD::SETGT:
23216     case ISD::SETGE:
23217       Opc = hasSigned ? X86ISD::SMAX : 0; break;
23218     }
23219   // Check for x CC y ? y : x -- a min/max with reversed arms.
23220   } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
23221              DAG.isEqualTo(RHS, Cond.getOperand(0))) {
23222     switch (CC) {
23223     default: break;
23224     case ISD::SETULT:
23225     case ISD::SETULE:
23226       Opc = hasUnsigned ? X86ISD::UMAX : 0; break;
23227     case ISD::SETUGT:
23228     case ISD::SETUGE:
23229       Opc = hasUnsigned ? X86ISD::UMIN : 0; break;
23230     case ISD::SETLT:
23231     case ISD::SETLE:
23232       Opc = hasSigned ? X86ISD::SMAX : 0; break;
23233     case ISD::SETGT:
23234     case ISD::SETGE:
23235       Opc = hasSigned ? X86ISD::SMIN : 0; break;
23236     }
23237   }
23238
23239   return std::make_pair(Opc, NeedSplit);
23240 }
23241
23242 static SDValue
23243 transformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG,
23244                                       const X86Subtarget *Subtarget) {
23245   SDLoc dl(N);
23246   SDValue Cond = N->getOperand(0);
23247   SDValue LHS = N->getOperand(1);
23248   SDValue RHS = N->getOperand(2);
23249
23250   if (Cond.getOpcode() == ISD::SIGN_EXTEND) {
23251     SDValue CondSrc = Cond->getOperand(0);
23252     if (CondSrc->getOpcode() == ISD::SIGN_EXTEND_INREG)
23253       Cond = CondSrc->getOperand(0);
23254   }
23255
23256   if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
23257     return SDValue();
23258
23259   // A vselect where all conditions and data are constants can be optimized into
23260   // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
23261   if (ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
23262       ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
23263     return SDValue();
23264
23265   unsigned MaskValue = 0;
23266   if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue))
23267     return SDValue();
23268
23269   MVT VT = N->getSimpleValueType(0);
23270   unsigned NumElems = VT.getVectorNumElements();
23271   SmallVector<int, 8> ShuffleMask(NumElems, -1);
23272   for (unsigned i = 0; i < NumElems; ++i) {
23273     // Be sure we emit undef where we can.
23274     if (Cond.getOperand(i)->getOpcode() == ISD::UNDEF)
23275       ShuffleMask[i] = -1;
23276     else
23277       ShuffleMask[i] = i + NumElems * ((MaskValue >> i) & 1);
23278   }
23279
23280   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23281   if (!TLI.isShuffleMaskLegal(ShuffleMask, VT))
23282     return SDValue();
23283   return DAG.getVectorShuffle(VT, dl, LHS, RHS, &ShuffleMask[0]);
23284 }
23285
23286 /// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT
23287 /// nodes.
23288 static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
23289                                     TargetLowering::DAGCombinerInfo &DCI,
23290                                     const X86Subtarget *Subtarget) {
23291   SDLoc DL(N);
23292   SDValue Cond = N->getOperand(0);
23293   // Get the LHS/RHS of the select.
23294   SDValue LHS = N->getOperand(1);
23295   SDValue RHS = N->getOperand(2);
23296   EVT VT = LHS.getValueType();
23297   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23298
23299   // If we have SSE[12] support, try to form min/max nodes. SSE min/max
23300   // instructions match the semantics of the common C idiom x<y?x:y but not
23301   // x<=y?x:y, because of how they handle negative zero (which can be
23302   // ignored in unsafe-math mode).
23303   // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
23304   if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
23305       VT != MVT::f80 && (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
23306       (Subtarget->hasSSE2() ||
23307        (Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) {
23308     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
23309
23310     unsigned Opcode = 0;
23311     // Check for x CC y ? x : y.
23312     if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
23313         DAG.isEqualTo(RHS, Cond.getOperand(1))) {
23314       switch (CC) {
23315       default: break;
23316       case ISD::SETULT:
23317         // Converting this to a min would handle NaNs incorrectly, and swapping
23318         // the operands would cause it to handle comparisons between positive
23319         // and negative zero incorrectly.
23320         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
23321           if (!DAG.getTarget().Options.UnsafeFPMath &&
23322               !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
23323             break;
23324           std::swap(LHS, RHS);
23325         }
23326         Opcode = X86ISD::FMIN;
23327         break;
23328       case ISD::SETOLE:
23329         // Converting this to a min would handle comparisons between positive
23330         // and negative zero incorrectly.
23331         if (!DAG.getTarget().Options.UnsafeFPMath &&
23332             !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
23333           break;
23334         Opcode = X86ISD::FMIN;
23335         break;
23336       case ISD::SETULE:
23337         // Converting this to a min would handle both negative zeros and NaNs
23338         // incorrectly, but we can swap the operands to fix both.
23339         std::swap(LHS, RHS);
23340       case ISD::SETOLT:
23341       case ISD::SETLT:
23342       case ISD::SETLE:
23343         Opcode = X86ISD::FMIN;
23344         break;
23345
23346       case ISD::SETOGE:
23347         // Converting this to a max would handle comparisons between positive
23348         // and negative zero incorrectly.
23349         if (!DAG.getTarget().Options.UnsafeFPMath &&
23350             !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
23351           break;
23352         Opcode = X86ISD::FMAX;
23353         break;
23354       case ISD::SETUGT:
23355         // Converting this to a max would handle NaNs incorrectly, and swapping
23356         // the operands would cause it to handle comparisons between positive
23357         // and negative zero incorrectly.
23358         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
23359           if (!DAG.getTarget().Options.UnsafeFPMath &&
23360               !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
23361             break;
23362           std::swap(LHS, RHS);
23363         }
23364         Opcode = X86ISD::FMAX;
23365         break;
23366       case ISD::SETUGE:
23367         // Converting this to a max would handle both negative zeros and NaNs
23368         // incorrectly, but we can swap the operands to fix both.
23369         std::swap(LHS, RHS);
23370       case ISD::SETOGT:
23371       case ISD::SETGT:
23372       case ISD::SETGE:
23373         Opcode = X86ISD::FMAX;
23374         break;
23375       }
23376     // Check for x CC y ? y : x -- a min/max with reversed arms.
23377     } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
23378                DAG.isEqualTo(RHS, Cond.getOperand(0))) {
23379       switch (CC) {
23380       default: break;
23381       case ISD::SETOGE:
23382         // Converting this to a min would handle comparisons between positive
23383         // and negative zero incorrectly, and swapping the operands would
23384         // cause it to handle NaNs incorrectly.
23385         if (!DAG.getTarget().Options.UnsafeFPMath &&
23386             !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
23387           if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
23388             break;
23389           std::swap(LHS, RHS);
23390         }
23391         Opcode = X86ISD::FMIN;
23392         break;
23393       case ISD::SETUGT:
23394         // Converting this to a min would handle NaNs incorrectly.
23395         if (!DAG.getTarget().Options.UnsafeFPMath &&
23396             (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
23397           break;
23398         Opcode = X86ISD::FMIN;
23399         break;
23400       case ISD::SETUGE:
23401         // Converting this to a min would handle both negative zeros and NaNs
23402         // incorrectly, but we can swap the operands to fix both.
23403         std::swap(LHS, RHS);
23404       case ISD::SETOGT:
23405       case ISD::SETGT:
23406       case ISD::SETGE:
23407         Opcode = X86ISD::FMIN;
23408         break;
23409
23410       case ISD::SETULT:
23411         // Converting this to a max would handle NaNs incorrectly.
23412         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
23413           break;
23414         Opcode = X86ISD::FMAX;
23415         break;
23416       case ISD::SETOLE:
23417         // Converting this to a max would handle comparisons between positive
23418         // and negative zero incorrectly, and swapping the operands would
23419         // cause it to handle NaNs incorrectly.
23420         if (!DAG.getTarget().Options.UnsafeFPMath &&
23421             !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
23422           if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
23423             break;
23424           std::swap(LHS, RHS);
23425         }
23426         Opcode = X86ISD::FMAX;
23427         break;
23428       case ISD::SETULE:
23429         // Converting this to a max would handle both negative zeros and NaNs
23430         // incorrectly, but we can swap the operands to fix both.
23431         std::swap(LHS, RHS);
23432       case ISD::SETOLT:
23433       case ISD::SETLT:
23434       case ISD::SETLE:
23435         Opcode = X86ISD::FMAX;
23436         break;
23437       }
23438     }
23439
23440     if (Opcode)
23441       return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
23442   }
23443
23444   EVT CondVT = Cond.getValueType();
23445   if (Subtarget->hasAVX512() && VT.isVector() && CondVT.isVector() &&
23446       CondVT.getVectorElementType() == MVT::i1) {
23447     // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
23448     // lowering on KNL. In this case we convert it to
23449     // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
23450     // The same situation for all 128 and 256-bit vectors of i8 and i16.
23451     // Since SKX these selects have a proper lowering.
23452     EVT OpVT = LHS.getValueType();
23453     if ((OpVT.is128BitVector() || OpVT.is256BitVector()) &&
23454         (OpVT.getVectorElementType() == MVT::i8 ||
23455          OpVT.getVectorElementType() == MVT::i16) &&
23456         !(Subtarget->hasBWI() && Subtarget->hasVLX())) {
23457       Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, OpVT, Cond);
23458       DCI.AddToWorklist(Cond.getNode());
23459       return DAG.getNode(N->getOpcode(), DL, OpVT, Cond, LHS, RHS);
23460     }
23461   }
23462   // If this is a select between two integer constants, try to do some
23463   // optimizations.
23464   if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) {
23465     if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS))
23466       // Don't do this for crazy integer types.
23467       if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) {
23468         // If this is efficiently invertible, canonicalize the LHSC/RHSC values
23469         // so that TrueC (the true value) is larger than FalseC.
23470         bool NeedsCondInvert = false;
23471
23472         if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
23473             // Efficiently invertible.
23474             (Cond.getOpcode() == ISD::SETCC ||  // setcc -> invertible.
23475              (Cond.getOpcode() == ISD::XOR &&   // xor(X, C) -> invertible.
23476               isa<ConstantSDNode>(Cond.getOperand(1))))) {
23477           NeedsCondInvert = true;
23478           std::swap(TrueC, FalseC);
23479         }
23480
23481         // Optimize C ? 8 : 0 -> zext(C) << 3.  Likewise for any pow2/0.
23482         if (FalseC->getAPIntValue() == 0 &&
23483             TrueC->getAPIntValue().isPowerOf2()) {
23484           if (NeedsCondInvert) // Invert the condition if needed.
23485             Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
23486                                DAG.getConstant(1, Cond.getValueType()));
23487
23488           // Zero extend the condition if needed.
23489           Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);
23490
23491           unsigned ShAmt = TrueC->getAPIntValue().logBase2();
23492           return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
23493                              DAG.getConstant(ShAmt, MVT::i8));
23494         }
23495
23496         // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst.
23497         if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
23498           if (NeedsCondInvert) // Invert the condition if needed.
23499             Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
23500                                DAG.getConstant(1, Cond.getValueType()));
23501
23502           // Zero extend the condition if needed.
23503           Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
23504                              FalseC->getValueType(0), Cond);
23505           return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
23506                              SDValue(FalseC, 0));
23507         }
23508
23509         // Optimize cases that will turn into an LEA instruction.  This requires
23510         // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
23511         if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
23512           uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
23513           if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
23514
23515           bool isFastMultiplier = false;
23516           if (Diff < 10) {
23517             switch ((unsigned char)Diff) {
23518               default: break;
23519               case 1:  // result = add base, cond
23520               case 2:  // result = lea base(    , cond*2)
23521               case 3:  // result = lea base(cond, cond*2)
23522               case 4:  // result = lea base(    , cond*4)
23523               case 5:  // result = lea base(cond, cond*4)
23524               case 8:  // result = lea base(    , cond*8)
23525               case 9:  // result = lea base(cond, cond*8)
23526                 isFastMultiplier = true;
23527                 break;
23528             }
23529           }
23530
23531           if (isFastMultiplier) {
23532             APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
23533             if (NeedsCondInvert) // Invert the condition if needed.
23534               Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
23535                                  DAG.getConstant(1, Cond.getValueType()));
23536
23537             // Zero extend the condition if needed.
23538             Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
23539                                Cond);
23540             // Scale the condition by the difference.
23541             if (Diff != 1)
23542               Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
23543                                  DAG.getConstant(Diff, Cond.getValueType()));
23544
23545             // Add the base if non-zero.
23546             if (FalseC->getAPIntValue() != 0)
23547               Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
23548                                  SDValue(FalseC, 0));
23549             return Cond;
23550           }
23551         }
23552       }
23553   }
23554
23555   // Canonicalize max and min:
23556   // (x > y) ? x : y -> (x >= y) ? x : y
23557   // (x < y) ? x : y -> (x <= y) ? x : y
23558   // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
23559   // the need for an extra compare
23560   // against zero. e.g.
23561   // (x - y) > 0 : (x - y) ? 0 -> (x - y) >= 0 : (x - y) ? 0
23562   // subl   %esi, %edi
23563   // testl  %edi, %edi
23564   // movl   $0, %eax
23565   // cmovgl %edi, %eax
23566   // =>
23567   // xorl   %eax, %eax
23568   // subl   %esi, $edi
23569   // cmovsl %eax, %edi
23570   if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
23571       DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
23572       DAG.isEqualTo(RHS, Cond.getOperand(1))) {
23573     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
23574     switch (CC) {
23575     default: break;
23576     case ISD::SETLT:
23577     case ISD::SETGT: {
23578       ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
23579       Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
23580                           Cond.getOperand(0), Cond.getOperand(1), NewCC);
23581       return DAG.getNode(ISD::SELECT, DL, VT, Cond, LHS, RHS);
23582     }
23583     }
23584   }
23585
23586   // Early exit check
23587   if (!TLI.isTypeLegal(VT))
23588     return SDValue();
23589
23590   // Match VSELECTs into subs with unsigned saturation.
23591   if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
23592       // psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
23593       ((Subtarget->hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
23594        (Subtarget->hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
23595     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
23596
23597     // Check if one of the arms of the VSELECT is a zero vector. If it's on the
23598     // left side invert the predicate to simplify logic below.
23599     SDValue Other;
23600     if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
23601       Other = RHS;
23602       CC = ISD::getSetCCInverse(CC, true);
23603     } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
23604       Other = LHS;
23605     }
23606
23607     if (Other.getNode() && Other->getNumOperands() == 2 &&
23608         DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) {
23609       SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
23610       SDValue CondRHS = Cond->getOperand(1);
23611
23612       // Look for a general sub with unsigned saturation first.
23613       // x >= y ? x-y : 0 --> subus x, y
23614       // x >  y ? x-y : 0 --> subus x, y
23615       if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
23616           Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))
23617         return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
23618
23619       if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS))
23620         if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) {
23621           if (auto *CondRHSBV = dyn_cast<BuildVectorSDNode>(CondRHS))
23622             if (auto *CondRHSConst = CondRHSBV->getConstantSplatNode())
23623               // If the RHS is a constant we have to reverse the const
23624               // canonicalization.
23625               // x > C-1 ? x+-C : 0 --> subus x, C
23626               if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
23627                   CondRHSConst->getAPIntValue() ==
23628                       (-OpRHSConst->getAPIntValue() - 1))
23629                 return DAG.getNode(
23630                     X86ISD::SUBUS, DL, VT, OpLHS,
23631                     DAG.getConstant(-OpRHSConst->getAPIntValue(), VT));
23632
23633           // Another special case: If C was a sign bit, the sub has been
23634           // canonicalized into a xor.
23635           // FIXME: Would it be better to use computeKnownBits to determine
23636           //        whether it's safe to decanonicalize the xor?
23637           // x s< 0 ? x^C : 0 --> subus x, C
23638           if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
23639               ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
23640               OpRHSConst->getAPIntValue().isSignBit())
23641             // Note that we have to rebuild the RHS constant here to ensure we
23642             // don't rely on particular values of undef lanes.
23643             return DAG.getNode(
23644                 X86ISD::SUBUS, DL, VT, OpLHS,
23645                 DAG.getConstant(OpRHSConst->getAPIntValue(), VT));
23646         }
23647     }
23648   }
23649
23650   // Try to match a min/max vector operation.
23651   if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC) {
23652     std::pair<unsigned, bool> ret = matchIntegerMINMAX(Cond, VT, LHS, RHS, DAG, Subtarget);
23653     unsigned Opc = ret.first;
23654     bool NeedSplit = ret.second;
23655
23656     if (Opc && NeedSplit) {
23657       unsigned NumElems = VT.getVectorNumElements();
23658       // Extract the LHS vectors
23659       SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, DL);
23660       SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, DL);
23661
23662       // Extract the RHS vectors
23663       SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, DL);
23664       SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, DL);
23665
23666       // Create min/max for each subvector
23667       LHS = DAG.getNode(Opc, DL, LHS1.getValueType(), LHS1, RHS1);
23668       RHS = DAG.getNode(Opc, DL, LHS2.getValueType(), LHS2, RHS2);
23669
23670       // Merge the result
23671       return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LHS, RHS);
23672     } else if (Opc)
23673       return DAG.getNode(Opc, DL, VT, LHS, RHS);
23674   }
23675
23676   // Simplify vector selection if condition value type matches vselect
23677   // operand type
23678   if (N->getOpcode() == ISD::VSELECT && CondVT == VT) {
23679     assert(Cond.getValueType().isVector() &&
23680            "vector select expects a vector selector!");
23681
23682     bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
23683     bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
23684
23685     // Try invert the condition if true value is not all 1s and false value
23686     // is not all 0s.
23687     if (!TValIsAllOnes && !FValIsAllZeros &&
23688         // Check if the selector will be produced by CMPP*/PCMP*
23689         Cond.getOpcode() == ISD::SETCC &&
23690         // Check if SETCC has already been promoted
23691         TLI.getSetCCResultType(*DAG.getContext(), VT) == CondVT) {
23692       bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
23693       bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
23694
23695       if (TValIsAllZeros || FValIsAllOnes) {
23696         SDValue CC = Cond.getOperand(2);
23697         ISD::CondCode NewCC =
23698           ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
23699                                Cond.getOperand(0).getValueType().isInteger());
23700         Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1), NewCC);
23701         std::swap(LHS, RHS);
23702         TValIsAllOnes = FValIsAllOnes;
23703         FValIsAllZeros = TValIsAllZeros;
23704       }
23705     }
23706
23707     if (TValIsAllOnes || FValIsAllZeros) {
23708       SDValue Ret;
23709
23710       if (TValIsAllOnes && FValIsAllZeros)
23711         Ret = Cond;
23712       else if (TValIsAllOnes)
23713         Ret = DAG.getNode(ISD::OR, DL, CondVT, Cond,
23714                           DAG.getNode(ISD::BITCAST, DL, CondVT, RHS));
23715       else if (FValIsAllZeros)
23716         Ret = DAG.getNode(ISD::AND, DL, CondVT, Cond,
23717                           DAG.getNode(ISD::BITCAST, DL, CondVT, LHS));
23718
23719       return DAG.getNode(ISD::BITCAST, DL, VT, Ret);
23720     }
23721   }
23722
23723   // If we know that this node is legal then we know that it is going to be
23724   // matched by one of the SSE/AVX BLEND instructions. These instructions only
23725   // depend on the highest bit in each word. Try to use SimplifyDemandedBits
23726   // to simplify previous instructions.
23727   if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
23728       !DCI.isBeforeLegalize() &&
23729       // We explicitly check against v8i16 and v16i16 because, although
23730       // they're marked as Custom, they might only be legal when Cond is a
23731       // build_vector of constants. This will be taken care in a later
23732       // condition.
23733       (TLI.isOperationLegalOrCustom(ISD::VSELECT, VT) && VT != MVT::v16i16 &&
23734        VT != MVT::v8i16) &&
23735       // Don't optimize vector of constants. Those are handled by
23736       // the generic code and all the bits must be properly set for
23737       // the generic optimizer.
23738       !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
23739     unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits();
23740
23741     // Don't optimize vector selects that map to mask-registers.
23742     if (BitWidth == 1)
23743       return SDValue();
23744
23745     assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
23746     APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1);
23747
23748     APInt KnownZero, KnownOne;
23749     TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
23750                                           DCI.isBeforeLegalizeOps());
23751     if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) ||
23752         TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne,
23753                                  TLO)) {
23754       // If we changed the computation somewhere in the DAG, this change
23755       // will affect all users of Cond.
23756       // Make sure it is fine and update all the nodes so that we do not
23757       // use the generic VSELECT anymore. Otherwise, we may perform
23758       // wrong optimizations as we messed up with the actual expectation
23759       // for the vector boolean values.
23760       if (Cond != TLO.Old) {
23761         // Check all uses of that condition operand to check whether it will be
23762         // consumed by non-BLEND instructions, which may depend on all bits are
23763         // set properly.
23764         for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
23765              I != E; ++I)
23766           if (I->getOpcode() != ISD::VSELECT)
23767             // TODO: Add other opcodes eventually lowered into BLEND.
23768             return SDValue();
23769
23770         // Update all the users of the condition, before committing the change,
23771         // so that the VSELECT optimizations that expect the correct vector
23772         // boolean value will not be triggered.
23773         for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
23774              I != E; ++I)
23775           DAG.ReplaceAllUsesOfValueWith(
23776               SDValue(*I, 0),
23777               DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(*I), I->getValueType(0),
23778                           Cond, I->getOperand(1), I->getOperand(2)));
23779         DCI.CommitTargetLoweringOpt(TLO);
23780         return SDValue();
23781       }
23782       // At this point, only Cond is changed. Change the condition
23783       // just for N to keep the opportunity to optimize all other
23784       // users their own way.
23785       DAG.ReplaceAllUsesOfValueWith(
23786           SDValue(N, 0),
23787           DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(N), N->getValueType(0),
23788                       TLO.New, N->getOperand(1), N->getOperand(2)));
23789       return SDValue();
23790     }
23791   }
23792
23793   // We should generate an X86ISD::BLENDI from a vselect if its argument
23794   // is a sign_extend_inreg of an any_extend of a BUILD_VECTOR of
23795   // constants. This specific pattern gets generated when we split a
23796   // selector for a 512 bit vector in a machine without AVX512 (but with
23797   // 256-bit vectors), during legalization:
23798   //
23799   // (vselect (sign_extend (any_extend (BUILD_VECTOR)) i1) LHS RHS)
23800   //
23801   // Iff we find this pattern and the build_vectors are built from
23802   // constants, we translate the vselect into a shuffle_vector that we
23803   // know will be matched by LowerVECTOR_SHUFFLEtoBlend.
23804   if ((N->getOpcode() == ISD::VSELECT ||
23805        N->getOpcode() == X86ISD::SHRUNKBLEND) &&
23806       !DCI.isBeforeLegalize()) {
23807     SDValue Shuffle = transformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget);
23808     if (Shuffle.getNode())
23809       return Shuffle;
23810   }
23811
23812   return SDValue();
23813 }
23814
23815 // Check whether a boolean test is testing a boolean value generated by
23816 // X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
23817 // code.
23818 //
23819 // Simplify the following patterns:
23820 // (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
23821 // (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
23822 // to (Op EFLAGS Cond)
23823 //
23824 // (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
23825 // (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
23826 // to (Op EFLAGS !Cond)
23827 //
23828 // where Op could be BRCOND or CMOV.
23829 //
23830 static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
23831   // Quit if not CMP and SUB with its value result used.
23832   if (Cmp.getOpcode() != X86ISD::CMP &&
23833       (Cmp.getOpcode() != X86ISD::SUB || Cmp.getNode()->hasAnyUseOfValue(0)))
23834       return SDValue();
23835
23836   // Quit if not used as a boolean value.
23837   if (CC != X86::COND_E && CC != X86::COND_NE)
23838     return SDValue();
23839
23840   // Check CMP operands. One of them should be 0 or 1 and the other should be
23841   // an SetCC or extended from it.
23842   SDValue Op1 = Cmp.getOperand(0);
23843   SDValue Op2 = Cmp.getOperand(1);
23844
23845   SDValue SetCC;
23846   const ConstantSDNode* C = nullptr;
23847   bool needOppositeCond = (CC == X86::COND_E);
23848   bool checkAgainstTrue = false; // Is it a comparison against 1?
23849
23850   if ((C = dyn_cast<ConstantSDNode>(Op1)))
23851     SetCC = Op2;
23852   else if ((C = dyn_cast<ConstantSDNode>(Op2)))
23853     SetCC = Op1;
23854   else // Quit if all operands are not constants.
23855     return SDValue();
23856
23857   if (C->getZExtValue() == 1) {
23858     needOppositeCond = !needOppositeCond;
23859     checkAgainstTrue = true;
23860   } else if (C->getZExtValue() != 0)
23861     // Quit if the constant is neither 0 or 1.
23862     return SDValue();
23863
23864   bool truncatedToBoolWithAnd = false;
23865   // Skip (zext $x), (trunc $x), or (and $x, 1) node.
23866   while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
23867          SetCC.getOpcode() == ISD::TRUNCATE ||
23868          SetCC.getOpcode() == ISD::AND) {
23869     if (SetCC.getOpcode() == ISD::AND) {
23870       int OpIdx = -1;
23871       ConstantSDNode *CS;
23872       if ((CS = dyn_cast<ConstantSDNode>(SetCC.getOperand(0))) &&
23873           CS->getZExtValue() == 1)
23874         OpIdx = 1;
23875       if ((CS = dyn_cast<ConstantSDNode>(SetCC.getOperand(1))) &&
23876           CS->getZExtValue() == 1)
23877         OpIdx = 0;
23878       if (OpIdx == -1)
23879         break;
23880       SetCC = SetCC.getOperand(OpIdx);
23881       truncatedToBoolWithAnd = true;
23882     } else
23883       SetCC = SetCC.getOperand(0);
23884   }
23885
23886   switch (SetCC.getOpcode()) {
23887   case X86ISD::SETCC_CARRY:
23888     // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
23889     // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
23890     // i.e. it's a comparison against true but the result of SETCC_CARRY is not
23891     // truncated to i1 using 'and'.
23892     if (checkAgainstTrue && !truncatedToBoolWithAnd)
23893       break;
23894     assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
23895            "Invalid use of SETCC_CARRY!");
23896     // FALL THROUGH
23897   case X86ISD::SETCC:
23898     // Set the condition code or opposite one if necessary.
23899     CC = X86::CondCode(SetCC.getConstantOperandVal(0));
23900     if (needOppositeCond)
23901       CC = X86::GetOppositeBranchCondition(CC);
23902     return SetCC.getOperand(1);
23903   case X86ISD::CMOV: {
23904     // Check whether false/true value has canonical one, i.e. 0 or 1.
23905     ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
23906     ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
23907     // Quit if true value is not a constant.
23908     if (!TVal)
23909       return SDValue();
23910     // Quit if false value is not a constant.
23911     if (!FVal) {
23912       SDValue Op = SetCC.getOperand(0);
23913       // Skip 'zext' or 'trunc' node.
23914       if (Op.getOpcode() == ISD::ZERO_EXTEND ||
23915           Op.getOpcode() == ISD::TRUNCATE)
23916         Op = Op.getOperand(0);
23917       // A special case for rdrand/rdseed, where 0 is set if false cond is
23918       // found.
23919       if ((Op.getOpcode() != X86ISD::RDRAND &&
23920            Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
23921         return SDValue();
23922     }
23923     // Quit if false value is not the constant 0 or 1.
23924     bool FValIsFalse = true;
23925     if (FVal && FVal->getZExtValue() != 0) {
23926       if (FVal->getZExtValue() != 1)
23927         return SDValue();
23928       // If FVal is 1, opposite cond is needed.
23929       needOppositeCond = !needOppositeCond;
23930       FValIsFalse = false;
23931     }
23932     // Quit if TVal is not the constant opposite of FVal.
23933     if (FValIsFalse && TVal->getZExtValue() != 1)
23934       return SDValue();
23935     if (!FValIsFalse && TVal->getZExtValue() != 0)
23936       return SDValue();
23937     CC = X86::CondCode(SetCC.getConstantOperandVal(2));
23938     if (needOppositeCond)
23939       CC = X86::GetOppositeBranchCondition(CC);
23940     return SetCC.getOperand(3);
23941   }
23942   }
23943
23944   return SDValue();
23945 }
23946
23947 /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
23948 static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
23949                                   TargetLowering::DAGCombinerInfo &DCI,
23950                                   const X86Subtarget *Subtarget) {
23951   SDLoc DL(N);
23952
23953   // If the flag operand isn't dead, don't touch this CMOV.
23954   if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
23955     return SDValue();
23956
23957   SDValue FalseOp = N->getOperand(0);
23958   SDValue TrueOp = N->getOperand(1);
23959   X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
23960   SDValue Cond = N->getOperand(3);
23961
23962   if (CC == X86::COND_E || CC == X86::COND_NE) {
23963     switch (Cond.getOpcode()) {
23964     default: break;
23965     case X86ISD::BSR:
23966     case X86ISD::BSF:
23967       // If operand of BSR / BSF are proven never zero, then ZF cannot be set.
23968       if (DAG.isKnownNeverZero(Cond.getOperand(0)))
23969         return (CC == X86::COND_E) ? FalseOp : TrueOp;
23970     }
23971   }
23972
23973   SDValue Flags;
23974
23975   Flags = checkBoolTestSetCCCombine(Cond, CC);
23976   if (Flags.getNode() &&
23977       // Extra check as FCMOV only supports a subset of X86 cond.
23978       (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC))) {
23979     SDValue Ops[] = { FalseOp, TrueOp,
23980                       DAG.getConstant(CC, MVT::i8), Flags };
23981     return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
23982   }
23983
23984   // If this is a select between two integer constants, try to do some
23985   // optimizations.  Note that the operands are ordered the opposite of SELECT
23986   // operands.
23987   if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
23988     if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
23989       // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
23990       // larger than FalseC (the false value).
23991       if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
23992         CC = X86::GetOppositeBranchCondition(CC);
23993         std::swap(TrueC, FalseC);
23994         std::swap(TrueOp, FalseOp);
23995       }
23996
23997       // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3.  Likewise for any pow2/0.
23998       // This is efficient for any integer data type (including i8/i16) and
23999       // shift amount.
24000       if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
24001         Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
24002                            DAG.getConstant(CC, MVT::i8), Cond);
24003
24004         // Zero extend the condition if needed.
24005         Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
24006
24007         unsigned ShAmt = TrueC->getAPIntValue().logBase2();
24008         Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
24009                            DAG.getConstant(ShAmt, MVT::i8));
24010         if (N->getNumValues() == 2)  // Dead flag value?
24011           return DCI.CombineTo(N, Cond, SDValue());
24012         return Cond;
24013       }
24014
24015       // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst.  This is efficient
24016       // for any integer data type, including i8/i16.
24017       if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
24018         Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
24019                            DAG.getConstant(CC, MVT::i8), Cond);
24020
24021         // Zero extend the condition if needed.
24022         Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
24023                            FalseC->getValueType(0), Cond);
24024         Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
24025                            SDValue(FalseC, 0));
24026
24027         if (N->getNumValues() == 2)  // Dead flag value?
24028           return DCI.CombineTo(N, Cond, SDValue());
24029         return Cond;
24030       }
24031
24032       // Optimize cases that will turn into an LEA instruction.  This requires
24033       // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
24034       if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
24035         uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
24036         if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
24037
24038         bool isFastMultiplier = false;
24039         if (Diff < 10) {
24040           switch ((unsigned char)Diff) {
24041           default: break;
24042           case 1:  // result = add base, cond
24043           case 2:  // result = lea base(    , cond*2)
24044           case 3:  // result = lea base(cond, cond*2)
24045           case 4:  // result = lea base(    , cond*4)
24046           case 5:  // result = lea base(cond, cond*4)
24047           case 8:  // result = lea base(    , cond*8)
24048           case 9:  // result = lea base(cond, cond*8)
24049             isFastMultiplier = true;
24050             break;
24051           }
24052         }
24053
24054         if (isFastMultiplier) {
24055           APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
24056           Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
24057                              DAG.getConstant(CC, MVT::i8), Cond);
24058           // Zero extend the condition if needed.
24059           Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
24060                              Cond);
24061           // Scale the condition by the difference.
24062           if (Diff != 1)
24063             Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
24064                                DAG.getConstant(Diff, Cond.getValueType()));
24065
24066           // Add the base if non-zero.
24067           if (FalseC->getAPIntValue() != 0)
24068             Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
24069                                SDValue(FalseC, 0));
24070           if (N->getNumValues() == 2)  // Dead flag value?
24071             return DCI.CombineTo(N, Cond, SDValue());
24072           return Cond;
24073         }
24074       }
24075     }
24076   }
24077
24078   // Handle these cases:
24079   //   (select (x != c), e, c) -> select (x != c), e, x),
24080   //   (select (x == c), c, e) -> select (x == c), x, e)
24081   // where the c is an integer constant, and the "select" is the combination
24082   // of CMOV and CMP.
24083   //
24084   // The rationale for this change is that the conditional-move from a constant
24085   // needs two instructions, however, conditional-move from a register needs
24086   // only one instruction.
24087   //
24088   // CAVEAT: By replacing a constant with a symbolic value, it may obscure
24089   //  some instruction-combining opportunities. This opt needs to be
24090   //  postponed as late as possible.
24091   //
24092   if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
24093     // the DCI.xxxx conditions are provided to postpone the optimization as
24094     // late as possible.
24095
24096     ConstantSDNode *CmpAgainst = nullptr;
24097     if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
24098         (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
24099         !isa<ConstantSDNode>(Cond.getOperand(0))) {
24100
24101       if (CC == X86::COND_NE &&
24102           CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
24103         CC = X86::GetOppositeBranchCondition(CC);
24104         std::swap(TrueOp, FalseOp);
24105       }
24106
24107       if (CC == X86::COND_E &&
24108           CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
24109         SDValue Ops[] = { FalseOp, Cond.getOperand(0),
24110                           DAG.getConstant(CC, MVT::i8), Cond };
24111         return DAG.getNode(X86ISD::CMOV, DL, N->getVTList (), Ops);
24112       }
24113     }
24114   }
24115
24116   return SDValue();
24117 }
24118
24119 static SDValue PerformINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG,
24120                                                 const X86Subtarget *Subtarget) {
24121   unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
24122   switch (IntNo) {
24123   default: return SDValue();
24124   // SSE/AVX/AVX2 blend intrinsics.
24125   case Intrinsic::x86_avx2_pblendvb:
24126   case Intrinsic::x86_avx2_pblendw:
24127   case Intrinsic::x86_avx2_pblendd_128:
24128   case Intrinsic::x86_avx2_pblendd_256:
24129     // Don't try to simplify this intrinsic if we don't have AVX2.
24130     if (!Subtarget->hasAVX2())
24131       return SDValue();
24132     // FALL-THROUGH
24133   case Intrinsic::x86_avx_blend_pd_256:
24134   case Intrinsic::x86_avx_blend_ps_256:
24135   case Intrinsic::x86_avx_blendv_pd_256:
24136   case Intrinsic::x86_avx_blendv_ps_256:
24137     // Don't try to simplify this intrinsic if we don't have AVX.
24138     if (!Subtarget->hasAVX())
24139       return SDValue();
24140     // FALL-THROUGH
24141   case Intrinsic::x86_sse41_pblendw:
24142   case Intrinsic::x86_sse41_blendpd:
24143   case Intrinsic::x86_sse41_blendps:
24144   case Intrinsic::x86_sse41_blendvps:
24145   case Intrinsic::x86_sse41_blendvpd:
24146   case Intrinsic::x86_sse41_pblendvb: {
24147     SDValue Op0 = N->getOperand(1);
24148     SDValue Op1 = N->getOperand(2);
24149     SDValue Mask = N->getOperand(3);
24150
24151     // Don't try to simplify this intrinsic if we don't have SSE4.1.
24152     if (!Subtarget->hasSSE41())
24153       return SDValue();
24154
24155     // fold (blend A, A, Mask) -> A
24156     if (Op0 == Op1)
24157       return Op0;
24158     // fold (blend A, B, allZeros) -> A
24159     if (ISD::isBuildVectorAllZeros(Mask.getNode()))
24160       return Op0;
24161     // fold (blend A, B, allOnes) -> B
24162     if (ISD::isBuildVectorAllOnes(Mask.getNode()))
24163       return Op1;
24164
24165     // Simplify the case where the mask is a constant i32 value.
24166     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Mask)) {
24167       if (C->isNullValue())
24168         return Op0;
24169       if (C->isAllOnesValue())
24170         return Op1;
24171     }
24172
24173     return SDValue();
24174   }
24175
24176   // Packed SSE2/AVX2 arithmetic shift immediate intrinsics.
24177   case Intrinsic::x86_sse2_psrai_w:
24178   case Intrinsic::x86_sse2_psrai_d:
24179   case Intrinsic::x86_avx2_psrai_w:
24180   case Intrinsic::x86_avx2_psrai_d:
24181   case Intrinsic::x86_sse2_psra_w:
24182   case Intrinsic::x86_sse2_psra_d:
24183   case Intrinsic::x86_avx2_psra_w:
24184   case Intrinsic::x86_avx2_psra_d: {
24185     SDValue Op0 = N->getOperand(1);
24186     SDValue Op1 = N->getOperand(2);
24187     EVT VT = Op0.getValueType();
24188     assert(VT.isVector() && "Expected a vector type!");
24189
24190     if (isa<BuildVectorSDNode>(Op1))
24191       Op1 = Op1.getOperand(0);
24192
24193     if (!isa<ConstantSDNode>(Op1))
24194       return SDValue();
24195
24196     EVT SVT = VT.getVectorElementType();
24197     unsigned SVTBits = SVT.getSizeInBits();
24198
24199     ConstantSDNode *CND = cast<ConstantSDNode>(Op1);
24200     const APInt &C = APInt(SVTBits, CND->getAPIntValue().getZExtValue());
24201     uint64_t ShAmt = C.getZExtValue();
24202
24203     // Don't try to convert this shift into a ISD::SRA if the shift
24204     // count is bigger than or equal to the element size.
24205     if (ShAmt >= SVTBits)
24206       return SDValue();
24207
24208     // Trivial case: if the shift count is zero, then fold this
24209     // into the first operand.
24210     if (ShAmt == 0)
24211       return Op0;
24212
24213     // Replace this packed shift intrinsic with a target independent
24214     // shift dag node.
24215     SDValue Splat = DAG.getConstant(C, VT);
24216     return DAG.getNode(ISD::SRA, SDLoc(N), VT, Op0, Splat);
24217   }
24218   }
24219 }
24220
24221 /// PerformMulCombine - Optimize a single multiply with constant into two
24222 /// in order to implement it with two cheaper instructions, e.g.
24223 /// LEA + SHL, LEA + LEA.
24224 static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
24225                                  TargetLowering::DAGCombinerInfo &DCI) {
24226   if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
24227     return SDValue();
24228
24229   EVT VT = N->getValueType(0);
24230   if (VT != MVT::i64 && VT != MVT::i32)
24231     return SDValue();
24232
24233   ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
24234   if (!C)
24235     return SDValue();
24236   uint64_t MulAmt = C->getZExtValue();
24237   if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
24238     return SDValue();
24239
24240   uint64_t MulAmt1 = 0;
24241   uint64_t MulAmt2 = 0;
24242   if ((MulAmt % 9) == 0) {
24243     MulAmt1 = 9;
24244     MulAmt2 = MulAmt / 9;
24245   } else if ((MulAmt % 5) == 0) {
24246     MulAmt1 = 5;
24247     MulAmt2 = MulAmt / 5;
24248   } else if ((MulAmt % 3) == 0) {
24249     MulAmt1 = 3;
24250     MulAmt2 = MulAmt / 3;
24251   }
24252   if (MulAmt2 &&
24253       (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
24254     SDLoc DL(N);
24255
24256     if (isPowerOf2_64(MulAmt2) &&
24257         !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
24258       // If second multiplifer is pow2, issue it first. We want the multiply by
24259       // 3, 5, or 9 to be folded into the addressing mode unless the lone use
24260       // is an add.
24261       std::swap(MulAmt1, MulAmt2);
24262
24263     SDValue NewMul;
24264     if (isPowerOf2_64(MulAmt1))
24265       NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
24266                            DAG.getConstant(Log2_64(MulAmt1), MVT::i8));
24267     else
24268       NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
24269                            DAG.getConstant(MulAmt1, VT));
24270
24271     if (isPowerOf2_64(MulAmt2))
24272       NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
24273                            DAG.getConstant(Log2_64(MulAmt2), MVT::i8));
24274     else
24275       NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
24276                            DAG.getConstant(MulAmt2, VT));
24277
24278     // Do not add new nodes to DAG combiner worklist.
24279     DCI.CombineTo(N, NewMul, false);
24280   }
24281   return SDValue();
24282 }
24283
24284 static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
24285   SDValue N0 = N->getOperand(0);
24286   SDValue N1 = N->getOperand(1);
24287   ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
24288   EVT VT = N0.getValueType();
24289
24290   // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
24291   // since the result of setcc_c is all zero's or all ones.
24292   if (VT.isInteger() && !VT.isVector() &&
24293       N1C && N0.getOpcode() == ISD::AND &&
24294       N0.getOperand(1).getOpcode() == ISD::Constant) {
24295     SDValue N00 = N0.getOperand(0);
24296     if (N00.getOpcode() == X86ISD::SETCC_CARRY ||
24297         ((N00.getOpcode() == ISD::ANY_EXTEND ||
24298           N00.getOpcode() == ISD::ZERO_EXTEND) &&
24299          N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) {
24300       APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
24301       APInt ShAmt = N1C->getAPIntValue();
24302       Mask = Mask.shl(ShAmt);
24303       if (Mask != 0)
24304         return DAG.getNode(ISD::AND, SDLoc(N), VT,
24305                            N00, DAG.getConstant(Mask, VT));
24306     }
24307   }
24308
24309   // Hardware support for vector shifts is sparse which makes us scalarize the
24310   // vector operations in many cases. Also, on sandybridge ADD is faster than
24311   // shl.
24312   // (shl V, 1) -> add V,V
24313   if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
24314     if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
24315       assert(N0.getValueType().isVector() && "Invalid vector shift type");
24316       // We shift all of the values by one. In many cases we do not have
24317       // hardware support for this operation. This is better expressed as an ADD
24318       // of two values.
24319       if (N1SplatC->getZExtValue() == 1)
24320         return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
24321     }
24322
24323   return SDValue();
24324 }
24325
24326 /// \brief Returns a vector of 0s if the node in input is a vector logical
24327 /// shift by a constant amount which is known to be bigger than or equal
24328 /// to the vector element size in bits.
24329 static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
24330                                       const X86Subtarget *Subtarget) {
24331   EVT VT = N->getValueType(0);
24332
24333   if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 &&
24334       (!Subtarget->hasInt256() ||
24335        (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16)))
24336     return SDValue();
24337
24338   SDValue Amt = N->getOperand(1);
24339   SDLoc DL(N);
24340   if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Amt))
24341     if (auto *AmtSplat = AmtBV->getConstantSplatNode()) {
24342       APInt ShiftAmt = AmtSplat->getAPIntValue();
24343       unsigned MaxAmount = VT.getVectorElementType().getSizeInBits();
24344
24345       // SSE2/AVX2 logical shifts always return a vector of 0s
24346       // if the shift amount is bigger than or equal to
24347       // the element size. The constant shift amount will be
24348       // encoded as a 8-bit immediate.
24349       if (ShiftAmt.trunc(8).uge(MaxAmount))
24350         return getZeroVector(VT, Subtarget, DAG, DL);
24351     }
24352
24353   return SDValue();
24354 }
24355
24356 /// PerformShiftCombine - Combine shifts.
24357 static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
24358                                    TargetLowering::DAGCombinerInfo &DCI,
24359                                    const X86Subtarget *Subtarget) {
24360   if (N->getOpcode() == ISD::SHL) {
24361     SDValue V = PerformSHLCombine(N, DAG);
24362     if (V.getNode()) return V;
24363   }
24364
24365   if (N->getOpcode() != ISD::SRA) {
24366     // Try to fold this logical shift into a zero vector.
24367     SDValue V = performShiftToAllZeros(N, DAG, Subtarget);
24368     if (V.getNode()) return V;
24369   }
24370
24371   return SDValue();
24372 }
24373
24374 // CMPEQCombine - Recognize the distinctive  (AND (setcc ...) (setcc ..))
24375 // where both setccs reference the same FP CMP, and rewrite for CMPEQSS
24376 // and friends.  Likewise for OR -> CMPNEQSS.
24377 static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG,
24378                             TargetLowering::DAGCombinerInfo &DCI,
24379                             const X86Subtarget *Subtarget) {
24380   unsigned opcode;
24381
24382   // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
24383   // we're requiring SSE2 for both.
24384   if (Subtarget->hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
24385     SDValue N0 = N->getOperand(0);
24386     SDValue N1 = N->getOperand(1);
24387     SDValue CMP0 = N0->getOperand(1);
24388     SDValue CMP1 = N1->getOperand(1);
24389     SDLoc DL(N);
24390
24391     // The SETCCs should both refer to the same CMP.
24392     if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
24393       return SDValue();
24394
24395     SDValue CMP00 = CMP0->getOperand(0);
24396     SDValue CMP01 = CMP0->getOperand(1);
24397     EVT     VT    = CMP00.getValueType();
24398
24399     if (VT == MVT::f32 || VT == MVT::f64) {
24400       bool ExpectingFlags = false;
24401       // Check for any users that want flags:
24402       for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
24403            !ExpectingFlags && UI != UE; ++UI)
24404         switch (UI->getOpcode()) {
24405         default:
24406         case ISD::BR_CC:
24407         case ISD::BRCOND:
24408         case ISD::SELECT:
24409           ExpectingFlags = true;
24410           break;
24411         case ISD::CopyToReg:
24412         case ISD::SIGN_EXTEND:
24413         case ISD::ZERO_EXTEND:
24414         case ISD::ANY_EXTEND:
24415           break;
24416         }
24417
24418       if (!ExpectingFlags) {
24419         enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
24420         enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
24421
24422         if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
24423           X86::CondCode tmp = cc0;
24424           cc0 = cc1;
24425           cc1 = tmp;
24426         }
24427
24428         if ((cc0 == X86::COND_E  && cc1 == X86::COND_NP) ||
24429             (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
24430           // FIXME: need symbolic constants for these magic numbers.
24431           // See X86ATTInstPrinter.cpp:printSSECC().
24432           unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
24433           if (Subtarget->hasAVX512()) {
24434             SDValue FSetCC = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CMP00,
24435                                          CMP01, DAG.getConstant(x86cc, MVT::i8));
24436             if (N->getValueType(0) != MVT::i1)
24437               return DAG.getNode(ISD::ZERO_EXTEND, DL, N->getValueType(0),
24438                                  FSetCC);
24439             return FSetCC;
24440           }
24441           SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
24442                                               CMP00.getValueType(), CMP00, CMP01,
24443                                               DAG.getConstant(x86cc, MVT::i8));
24444
24445           bool is64BitFP = (CMP00.getValueType() == MVT::f64);
24446           MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
24447
24448           if (is64BitFP && !Subtarget->is64Bit()) {
24449             // On a 32-bit target, we cannot bitcast the 64-bit float to a
24450             // 64-bit integer, since that's not a legal type. Since
24451             // OnesOrZeroesF is all ones of all zeroes, we don't need all the
24452             // bits, but can do this little dance to extract the lowest 32 bits
24453             // and work with those going forward.
24454             SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
24455                                            OnesOrZeroesF);
24456             SDValue Vector32 = DAG.getNode(ISD::BITCAST, DL, MVT::v4f32,
24457                                            Vector64);
24458             OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
24459                                         Vector32, DAG.getIntPtrConstant(0));
24460             IntVT = MVT::i32;
24461           }
24462
24463           SDValue OnesOrZeroesI = DAG.getNode(ISD::BITCAST, DL, IntVT, OnesOrZeroesF);
24464           SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
24465                                       DAG.getConstant(1, IntVT));
24466           SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ANDed);
24467           return OneBitOfTruth;
24468         }
24469       }
24470     }
24471   }
24472   return SDValue();
24473 }
24474
24475 /// CanFoldXORWithAllOnes - Test whether the XOR operand is a AllOnes vector
24476 /// so it can be folded inside ANDNP.
24477 static bool CanFoldXORWithAllOnes(const SDNode *N) {
24478   EVT VT = N->getValueType(0);
24479
24480   // Match direct AllOnes for 128 and 256-bit vectors
24481   if (ISD::isBuildVectorAllOnes(N))
24482     return true;
24483
24484   // Look through a bit convert.
24485   if (N->getOpcode() == ISD::BITCAST)
24486     N = N->getOperand(0).getNode();
24487
24488   // Sometimes the operand may come from a insert_subvector building a 256-bit
24489   // allones vector
24490   if (VT.is256BitVector() &&
24491       N->getOpcode() == ISD::INSERT_SUBVECTOR) {
24492     SDValue V1 = N->getOperand(0);
24493     SDValue V2 = N->getOperand(1);
24494
24495     if (V1.getOpcode() == ISD::INSERT_SUBVECTOR &&
24496         V1.getOperand(0).getOpcode() == ISD::UNDEF &&
24497         ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) &&
24498         ISD::isBuildVectorAllOnes(V2.getNode()))
24499       return true;
24500   }
24501
24502   return false;
24503 }
24504
24505 // On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
24506 // register. In most cases we actually compare or select YMM-sized registers
24507 // and mixing the two types creates horrible code. This method optimizes
24508 // some of the transition sequences.
24509 static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
24510                                  TargetLowering::DAGCombinerInfo &DCI,
24511                                  const X86Subtarget *Subtarget) {
24512   EVT VT = N->getValueType(0);
24513   if (!VT.is256BitVector())
24514     return SDValue();
24515
24516   assert((N->getOpcode() == ISD::ANY_EXTEND ||
24517           N->getOpcode() == ISD::ZERO_EXTEND ||
24518           N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
24519
24520   SDValue Narrow = N->getOperand(0);
24521   EVT NarrowVT = Narrow->getValueType(0);
24522   if (!NarrowVT.is128BitVector())
24523     return SDValue();
24524
24525   if (Narrow->getOpcode() != ISD::XOR &&
24526       Narrow->getOpcode() != ISD::AND &&
24527       Narrow->getOpcode() != ISD::OR)
24528     return SDValue();
24529
24530   SDValue N0  = Narrow->getOperand(0);
24531   SDValue N1  = Narrow->getOperand(1);
24532   SDLoc DL(Narrow);
24533
24534   // The Left side has to be a trunc.
24535   if (N0.getOpcode() != ISD::TRUNCATE)
24536     return SDValue();
24537
24538   // The type of the truncated inputs.
24539   EVT WideVT = N0->getOperand(0)->getValueType(0);
24540   if (WideVT != VT)
24541     return SDValue();
24542
24543   // The right side has to be a 'trunc' or a constant vector.
24544   bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE;
24545   ConstantSDNode *RHSConstSplat = nullptr;
24546   if (auto *RHSBV = dyn_cast<BuildVectorSDNode>(N1))
24547     RHSConstSplat = RHSBV->getConstantSplatNode();
24548   if (!RHSTrunc && !RHSConstSplat)
24549     return SDValue();
24550
24551   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24552
24553   if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), WideVT))
24554     return SDValue();
24555
24556   // Set N0 and N1 to hold the inputs to the new wide operation.
24557   N0 = N0->getOperand(0);
24558   if (RHSConstSplat) {
24559     N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getScalarType(),
24560                      SDValue(RHSConstSplat, 0));
24561     SmallVector<SDValue, 8> C(WideVT.getVectorNumElements(), N1);
24562     N1 = DAG.getNode(ISD::BUILD_VECTOR, DL, WideVT, C);
24563   } else if (RHSTrunc) {
24564     N1 = N1->getOperand(0);
24565   }
24566
24567   // Generate the wide operation.
24568   SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, WideVT, N0, N1);
24569   unsigned Opcode = N->getOpcode();
24570   switch (Opcode) {
24571   case ISD::ANY_EXTEND:
24572     return Op;
24573   case ISD::ZERO_EXTEND: {
24574     unsigned InBits = NarrowVT.getScalarType().getSizeInBits();
24575     APInt Mask = APInt::getAllOnesValue(InBits);
24576     Mask = Mask.zext(VT.getScalarType().getSizeInBits());
24577     return DAG.getNode(ISD::AND, DL, VT,
24578                        Op, DAG.getConstant(Mask, VT));
24579   }
24580   case ISD::SIGN_EXTEND:
24581     return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
24582                        Op, DAG.getValueType(NarrowVT));
24583   default:
24584     llvm_unreachable("Unexpected opcode");
24585   }
24586 }
24587
24588 static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
24589                                  TargetLowering::DAGCombinerInfo &DCI,
24590                                  const X86Subtarget *Subtarget) {
24591   EVT VT = N->getValueType(0);
24592   if (DCI.isBeforeLegalizeOps())
24593     return SDValue();
24594
24595   SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget);
24596   if (R.getNode())
24597     return R;
24598
24599   // Create BEXTR instructions
24600   // BEXTR is ((X >> imm) & (2**size-1))
24601   if (VT == MVT::i32 || VT == MVT::i64) {
24602     SDValue N0 = N->getOperand(0);
24603     SDValue N1 = N->getOperand(1);
24604     SDLoc DL(N);
24605
24606     // Check for BEXTR.
24607     if ((Subtarget->hasBMI() || Subtarget->hasTBM()) &&
24608         (N0.getOpcode() == ISD::SRA || N0.getOpcode() == ISD::SRL)) {
24609       ConstantSDNode *MaskNode = dyn_cast<ConstantSDNode>(N1);
24610       ConstantSDNode *ShiftNode = dyn_cast<ConstantSDNode>(N0.getOperand(1));
24611       if (MaskNode && ShiftNode) {
24612         uint64_t Mask = MaskNode->getZExtValue();
24613         uint64_t Shift = ShiftNode->getZExtValue();
24614         if (isMask_64(Mask)) {
24615           uint64_t MaskSize = CountPopulation_64(Mask);
24616           if (Shift + MaskSize <= VT.getSizeInBits())
24617             return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0),
24618                                DAG.getConstant(Shift | (MaskSize << 8), VT));
24619         }
24620       }
24621     } // BEXTR
24622
24623     return SDValue();
24624   }
24625
24626   // Want to form ANDNP nodes:
24627   // 1) In the hopes of then easily combining them with OR and AND nodes
24628   //    to form PBLEND/PSIGN.
24629   // 2) To match ANDN packed intrinsics
24630   if (VT != MVT::v2i64 && VT != MVT::v4i64)
24631     return SDValue();
24632
24633   SDValue N0 = N->getOperand(0);
24634   SDValue N1 = N->getOperand(1);
24635   SDLoc DL(N);
24636
24637   // Check LHS for vnot
24638   if (N0.getOpcode() == ISD::XOR &&
24639       //ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
24640       CanFoldXORWithAllOnes(N0.getOperand(1).getNode()))
24641     return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1);
24642
24643   // Check RHS for vnot
24644   if (N1.getOpcode() == ISD::XOR &&
24645       //ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
24646       CanFoldXORWithAllOnes(N1.getOperand(1).getNode()))
24647     return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0);
24648
24649   return SDValue();
24650 }
24651
24652 static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
24653                                 TargetLowering::DAGCombinerInfo &DCI,
24654                                 const X86Subtarget *Subtarget) {
24655   if (DCI.isBeforeLegalizeOps())
24656     return SDValue();
24657
24658   SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget);
24659   if (R.getNode())
24660     return R;
24661
24662   SDValue N0 = N->getOperand(0);
24663   SDValue N1 = N->getOperand(1);
24664   EVT VT = N->getValueType(0);
24665
24666   // look for psign/blend
24667   if (VT == MVT::v2i64 || VT == MVT::v4i64) {
24668     if (!Subtarget->hasSSSE3() ||
24669         (VT == MVT::v4i64 && !Subtarget->hasInt256()))
24670       return SDValue();
24671
24672     // Canonicalize pandn to RHS
24673     if (N0.getOpcode() == X86ISD::ANDNP)
24674       std::swap(N0, N1);
24675     // or (and (m, y), (pandn m, x))
24676     if (N0.getOpcode() == ISD::AND && N1.getOpcode() == X86ISD::ANDNP) {
24677       SDValue Mask = N1.getOperand(0);
24678       SDValue X    = N1.getOperand(1);
24679       SDValue Y;
24680       if (N0.getOperand(0) == Mask)
24681         Y = N0.getOperand(1);
24682       if (N0.getOperand(1) == Mask)
24683         Y = N0.getOperand(0);
24684
24685       // Check to see if the mask appeared in both the AND and ANDNP and
24686       if (!Y.getNode())
24687         return SDValue();
24688
24689       // Validate that X, Y, and Mask are BIT_CONVERTS, and see through them.
24690       // Look through mask bitcast.
24691       if (Mask.getOpcode() == ISD::BITCAST)
24692         Mask = Mask.getOperand(0);
24693       if (X.getOpcode() == ISD::BITCAST)
24694         X = X.getOperand(0);
24695       if (Y.getOpcode() == ISD::BITCAST)
24696         Y = Y.getOperand(0);
24697
24698       EVT MaskVT = Mask.getValueType();
24699
24700       // Validate that the Mask operand is a vector sra node.
24701       // FIXME: what to do for bytes, since there is a psignb/pblendvb, but
24702       // there is no psrai.b
24703       unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits();
24704       unsigned SraAmt = ~0;
24705       if (Mask.getOpcode() == ISD::SRA) {
24706         if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Mask.getOperand(1)))
24707           if (auto *AmtConst = AmtBV->getConstantSplatNode())
24708             SraAmt = AmtConst->getZExtValue();
24709       } else if (Mask.getOpcode() == X86ISD::VSRAI) {
24710         SDValue SraC = Mask.getOperand(1);
24711         SraAmt  = cast<ConstantSDNode>(SraC)->getZExtValue();
24712       }
24713       if ((SraAmt + 1) != EltBits)
24714         return SDValue();
24715
24716       SDLoc DL(N);
24717
24718       // Now we know we at least have a plendvb with the mask val.  See if
24719       // we can form a psignb/w/d.
24720       // psign = x.type == y.type == mask.type && y = sub(0, x);
24721       if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X &&
24722           ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) &&
24723           X.getValueType() == MaskVT && Y.getValueType() == MaskVT) {
24724         assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
24725                "Unsupported VT for PSIGN");
24726         Mask = DAG.getNode(X86ISD::PSIGN, DL, MaskVT, X, Mask.getOperand(0));
24727         return DAG.getNode(ISD::BITCAST, DL, VT, Mask);
24728       }
24729       // PBLENDVB only available on SSE 4.1
24730       if (!Subtarget->hasSSE41())
24731         return SDValue();
24732
24733       EVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;
24734
24735       X = DAG.getNode(ISD::BITCAST, DL, BlendVT, X);
24736       Y = DAG.getNode(ISD::BITCAST, DL, BlendVT, Y);
24737       Mask = DAG.getNode(ISD::BITCAST, DL, BlendVT, Mask);
24738       Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X);
24739       return DAG.getNode(ISD::BITCAST, DL, VT, Mask);
24740     }
24741   }
24742
24743   if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
24744     return SDValue();
24745
24746   // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
24747   MachineFunction &MF = DAG.getMachineFunction();
24748   bool OptForSize = MF.getFunction()->getAttributes().
24749     hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
24750
24751   // SHLD/SHRD instructions have lower register pressure, but on some
24752   // platforms they have higher latency than the equivalent
24753   // series of shifts/or that would otherwise be generated.
24754   // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
24755   // have higher latencies and we are not optimizing for size.
24756   if (!OptForSize && Subtarget->isSHLDSlow())
24757     return SDValue();
24758
24759   if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
24760     std::swap(N0, N1);
24761   if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
24762     return SDValue();
24763   if (!N0.hasOneUse() || !N1.hasOneUse())
24764     return SDValue();
24765
24766   SDValue ShAmt0 = N0.getOperand(1);
24767   if (ShAmt0.getValueType() != MVT::i8)
24768     return SDValue();
24769   SDValue ShAmt1 = N1.getOperand(1);
24770   if (ShAmt1.getValueType() != MVT::i8)
24771     return SDValue();
24772   if (ShAmt0.getOpcode() == ISD::TRUNCATE)
24773     ShAmt0 = ShAmt0.getOperand(0);
24774   if (ShAmt1.getOpcode() == ISD::TRUNCATE)
24775     ShAmt1 = ShAmt1.getOperand(0);
24776
24777   SDLoc DL(N);
24778   unsigned Opc = X86ISD::SHLD;
24779   SDValue Op0 = N0.getOperand(0);
24780   SDValue Op1 = N1.getOperand(0);
24781   if (ShAmt0.getOpcode() == ISD::SUB) {
24782     Opc = X86ISD::SHRD;
24783     std::swap(Op0, Op1);
24784     std::swap(ShAmt0, ShAmt1);
24785   }
24786
24787   unsigned Bits = VT.getSizeInBits();
24788   if (ShAmt1.getOpcode() == ISD::SUB) {
24789     SDValue Sum = ShAmt1.getOperand(0);
24790     if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
24791       SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
24792       if (ShAmt1Op1.getNode()->getOpcode() == ISD::TRUNCATE)
24793         ShAmt1Op1 = ShAmt1Op1.getOperand(0);
24794       if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
24795         return DAG.getNode(Opc, DL, VT,
24796                            Op0, Op1,
24797                            DAG.getNode(ISD::TRUNCATE, DL,
24798                                        MVT::i8, ShAmt0));
24799     }
24800   } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
24801     ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
24802     if (ShAmt0C &&
24803         ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == Bits)
24804       return DAG.getNode(Opc, DL, VT,
24805                          N0.getOperand(0), N1.getOperand(0),
24806                          DAG.getNode(ISD::TRUNCATE, DL,
24807                                        MVT::i8, ShAmt0));
24808   }
24809
24810   return SDValue();
24811 }
24812
24813 // Generate NEG and CMOV for integer abs.
24814 static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
24815   EVT VT = N->getValueType(0);
24816
24817   // Since X86 does not have CMOV for 8-bit integer, we don't convert
24818   // 8-bit integer abs to NEG and CMOV.
24819   if (VT.isInteger() && VT.getSizeInBits() == 8)
24820     return SDValue();
24821
24822   SDValue N0 = N->getOperand(0);
24823   SDValue N1 = N->getOperand(1);
24824   SDLoc DL(N);
24825
24826   // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
24827   // and change it to SUB and CMOV.
24828   if (VT.isInteger() && N->getOpcode() == ISD::XOR &&
24829       N0.getOpcode() == ISD::ADD &&
24830       N0.getOperand(1) == N1 &&
24831       N1.getOpcode() == ISD::SRA &&
24832       N1.getOperand(0) == N0.getOperand(0))
24833     if (ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1)))
24834       if (Y1C->getAPIntValue() == VT.getSizeInBits()-1) {
24835         // Generate SUB & CMOV.
24836         SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
24837                                   DAG.getConstant(0, VT), N0.getOperand(0));
24838
24839         SDValue Ops[] = { N0.getOperand(0), Neg,
24840                           DAG.getConstant(X86::COND_GE, MVT::i8),
24841                           SDValue(Neg.getNode(), 1) };
24842         return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), Ops);
24843       }
24844   return SDValue();
24845 }
24846
24847 // PerformXorCombine - Attempts to turn XOR nodes into BLSMSK nodes
24848 static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG,
24849                                  TargetLowering::DAGCombinerInfo &DCI,
24850                                  const X86Subtarget *Subtarget) {
24851   if (DCI.isBeforeLegalizeOps())
24852     return SDValue();
24853
24854   if (Subtarget->hasCMov()) {
24855     SDValue RV = performIntegerAbsCombine(N, DAG);
24856     if (RV.getNode())
24857       return RV;
24858   }
24859
24860   return SDValue();
24861 }
24862
24863 /// PerformLOADCombine - Do target-specific dag combines on LOAD nodes.
24864 static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
24865                                   TargetLowering::DAGCombinerInfo &DCI,
24866                                   const X86Subtarget *Subtarget) {
24867   LoadSDNode *Ld = cast<LoadSDNode>(N);
24868   EVT RegVT = Ld->getValueType(0);
24869   EVT MemVT = Ld->getMemoryVT();
24870   SDLoc dl(Ld);
24871   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24872
24873   // For chips with slow 32-byte unaligned loads, break the 32-byte operation
24874   // into two 16-byte operations.
24875   ISD::LoadExtType Ext = Ld->getExtensionType();
24876   unsigned Alignment = Ld->getAlignment();
24877   bool IsAligned = Alignment == 0 || Alignment >= MemVT.getSizeInBits()/8;
24878   if (RegVT.is256BitVector() && Subtarget->isUnalignedMem32Slow() &&
24879       !DCI.isBeforeLegalizeOps() && !IsAligned && Ext == ISD::NON_EXTLOAD) {
24880     unsigned NumElems = RegVT.getVectorNumElements();
24881     if (NumElems < 2)
24882       return SDValue();
24883
24884     SDValue Ptr = Ld->getBasePtr();
24885     SDValue Increment = DAG.getConstant(16, TLI.getPointerTy());
24886
24887     EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
24888                                   NumElems/2);
24889     SDValue Load1 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
24890                                 Ld->getPointerInfo(), Ld->isVolatile(),
24891                                 Ld->isNonTemporal(), Ld->isInvariant(),
24892                                 Alignment);
24893     Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
24894     SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
24895                                 Ld->getPointerInfo(), Ld->isVolatile(),
24896                                 Ld->isNonTemporal(), Ld->isInvariant(),
24897                                 std::min(16U, Alignment));
24898     SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
24899                              Load1.getValue(1),
24900                              Load2.getValue(1));
24901
24902     SDValue NewVec = DAG.getUNDEF(RegVT);
24903     NewVec = Insert128BitVector(NewVec, Load1, 0, DAG, dl);
24904     NewVec = Insert128BitVector(NewVec, Load2, NumElems/2, DAG, dl);
24905     return DCI.CombineTo(N, NewVec, TF, true);
24906   }
24907
24908   return SDValue();
24909 }
24910
24911 /// PerformMLOADCombine - Resolve extending loads
24912 static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG,
24913                                    TargetLowering::DAGCombinerInfo &DCI,
24914                                    const X86Subtarget *Subtarget) {
24915   MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N);
24916   if (Mld->getExtensionType() != ISD::SEXTLOAD)
24917     return SDValue();
24918
24919   EVT VT = Mld->getValueType(0);
24920   unsigned NumElems = VT.getVectorNumElements();
24921   EVT LdVT = Mld->getMemoryVT();
24922   SDLoc dl(Mld);
24923
24924   assert(LdVT != VT && "Cannot extend to the same type");
24925   unsigned ToSz = VT.getVectorElementType().getSizeInBits();
24926   unsigned FromSz = LdVT.getVectorElementType().getSizeInBits();
24927   // From, To sizes and ElemCount must be pow of two
24928   assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
24929     "Unexpected size for extending masked load");
24930
24931   unsigned SizeRatio  = ToSz / FromSz;
24932   assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits());
24933
24934   // Create a type on which we perform the shuffle
24935   EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
24936           LdVT.getScalarType(), NumElems*SizeRatio);
24937   assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
24938
24939   // Convert Src0 value
24940   SDValue WideSrc0 = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mld->getSrc0());
24941   if (Mld->getSrc0().getOpcode() != ISD::UNDEF) {
24942     SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
24943     for (unsigned i = 0; i != NumElems; ++i)
24944       ShuffleVec[i] = i * SizeRatio;
24945
24946     // Can't shuffle using an illegal type.
24947     assert (DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT)
24948             && "WideVecVT should be legal");
24949     WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0,
24950                                     DAG.getUNDEF(WideVecVT), &ShuffleVec[0]);
24951   }
24952   // Prepare the new mask
24953   SDValue NewMask;
24954   SDValue Mask = Mld->getMask();
24955   if (Mask.getValueType() == VT) {
24956     // Mask and original value have the same type
24957     NewMask = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mask);
24958     SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
24959     for (unsigned i = 0; i != NumElems; ++i)
24960       ShuffleVec[i] = i * SizeRatio;
24961     for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
24962       ShuffleVec[i] = NumElems*SizeRatio;
24963     NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
24964                                    DAG.getConstant(0, WideVecVT),
24965                                    &ShuffleVec[0]);
24966   }
24967   else {
24968     assert(Mask.getValueType().getVectorElementType() == MVT::i1);
24969     unsigned WidenNumElts = NumElems*SizeRatio;
24970     unsigned MaskNumElts = VT.getVectorNumElements();
24971     EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(),  MVT::i1,
24972                                      WidenNumElts);
24973
24974     unsigned NumConcat = WidenNumElts / MaskNumElts;
24975     SmallVector<SDValue, 16> Ops(NumConcat);
24976     SDValue ZeroVal = DAG.getConstant(0, Mask.getValueType());
24977     Ops[0] = Mask;
24978     for (unsigned i = 1; i != NumConcat; ++i)
24979       Ops[i] = ZeroVal;
24980
24981     NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
24982   }
24983
24984   SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(),
24985                                      Mld->getBasePtr(), NewMask, WideSrc0,
24986                                      Mld->getMemoryVT(), Mld->getMemOperand(),
24987                                      ISD::NON_EXTLOAD);
24988   SDValue NewVec = DAG.getNode(X86ISD::VSEXT, dl, VT, WideLd);
24989   return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
24990
24991 }
24992 /// PerformMSTORECombine - Resolve truncating stores
24993 static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG,
24994                                     const X86Subtarget *Subtarget) {
24995   MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
24996   if (!Mst->isTruncatingStore())
24997     return SDValue();
24998
24999   EVT VT = Mst->getValue().getValueType();
25000   unsigned NumElems = VT.getVectorNumElements();
25001   EVT StVT = Mst->getMemoryVT();
25002   SDLoc dl(Mst);
25003
25004   assert(StVT != VT && "Cannot truncate to the same type");
25005   unsigned FromSz = VT.getVectorElementType().getSizeInBits();
25006   unsigned ToSz = StVT.getVectorElementType().getSizeInBits();
25007
25008   // From, To sizes and ElemCount must be pow of two
25009   assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
25010     "Unexpected size for truncating masked store");
25011   // We are going to use the original vector elt for storing.
25012   // Accumulated smaller vector elements must be a multiple of the store size.
25013   assert (((NumElems * FromSz) % ToSz) == 0 &&
25014           "Unexpected ratio for truncating masked store");
25015
25016   unsigned SizeRatio  = FromSz / ToSz;
25017   assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
25018
25019   // Create a type on which we perform the shuffle
25020   EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
25021           StVT.getScalarType(), NumElems*SizeRatio);
25022
25023   assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
25024
25025   SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mst->getValue());
25026   SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
25027   for (unsigned i = 0; i != NumElems; ++i)
25028     ShuffleVec[i] = i * SizeRatio;
25029
25030   // Can't shuffle using an illegal type.
25031   assert (DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT)
25032           && "WideVecVT should be legal");
25033
25034   SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
25035                                         DAG.getUNDEF(WideVecVT),
25036                                         &ShuffleVec[0]);
25037
25038   SDValue NewMask;
25039   SDValue Mask = Mst->getMask();
25040   if (Mask.getValueType() == VT) {
25041     // Mask and original value have the same type
25042     NewMask = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mask);
25043     for (unsigned i = 0; i != NumElems; ++i)
25044       ShuffleVec[i] = i * SizeRatio;
25045     for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
25046       ShuffleVec[i] = NumElems*SizeRatio;
25047     NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
25048                                    DAG.getConstant(0, WideVecVT),
25049                                    &ShuffleVec[0]);
25050   }
25051   else {
25052     assert(Mask.getValueType().getVectorElementType() == MVT::i1);
25053     unsigned WidenNumElts = NumElems*SizeRatio;
25054     unsigned MaskNumElts = VT.getVectorNumElements();
25055     EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(),  MVT::i1,
25056                                      WidenNumElts);
25057
25058     unsigned NumConcat = WidenNumElts / MaskNumElts;
25059     SmallVector<SDValue, 16> Ops(NumConcat);
25060     SDValue ZeroVal = DAG.getConstant(0, Mask.getValueType());
25061     Ops[0] = Mask;
25062     for (unsigned i = 1; i != NumConcat; ++i)
25063       Ops[i] = ZeroVal;
25064
25065     NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
25066   }
25067
25068   return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal, Mst->getBasePtr(),
25069                             NewMask, StVT, Mst->getMemOperand(), false);
25070 }
25071 /// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
25072 static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
25073                                    const X86Subtarget *Subtarget) {
25074   StoreSDNode *St = cast<StoreSDNode>(N);
25075   EVT VT = St->getValue().getValueType();
25076   EVT StVT = St->getMemoryVT();
25077   SDLoc dl(St);
25078   SDValue StoredVal = St->getOperand(1);
25079   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25080
25081   // If we are saving a concatenation of two XMM registers and 32-byte stores
25082   // are slow, such as on Sandy Bridge, perform two 16-byte stores.
25083   unsigned Alignment = St->getAlignment();
25084   bool IsAligned = Alignment == 0 || Alignment >= VT.getSizeInBits()/8;
25085   if (VT.is256BitVector() && Subtarget->isUnalignedMem32Slow() &&
25086       StVT == VT && !IsAligned) {
25087     unsigned NumElems = VT.getVectorNumElements();
25088     if (NumElems < 2)
25089       return SDValue();
25090
25091     SDValue Value0 = Extract128BitVector(StoredVal, 0, DAG, dl);
25092     SDValue Value1 = Extract128BitVector(StoredVal, NumElems/2, DAG, dl);
25093
25094     SDValue Stride = DAG.getConstant(16, TLI.getPointerTy());
25095     SDValue Ptr0 = St->getBasePtr();
25096     SDValue Ptr1 = DAG.getNode(ISD::ADD, dl, Ptr0.getValueType(), Ptr0, Stride);
25097
25098     SDValue Ch0 = DAG.getStore(St->getChain(), dl, Value0, Ptr0,
25099                                 St->getPointerInfo(), St->isVolatile(),
25100                                 St->isNonTemporal(), Alignment);
25101     SDValue Ch1 = DAG.getStore(St->getChain(), dl, Value1, Ptr1,
25102                                 St->getPointerInfo(), St->isVolatile(),
25103                                 St->isNonTemporal(),
25104                                 std::min(16U, Alignment));
25105     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
25106   }
25107
25108   // Optimize trunc store (of multiple scalars) to shuffle and store.
25109   // First, pack all of the elements in one place. Next, store to memory
25110   // in fewer chunks.
25111   if (St->isTruncatingStore() && VT.isVector()) {
25112     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25113     unsigned NumElems = VT.getVectorNumElements();
25114     assert(StVT != VT && "Cannot truncate to the same type");
25115     unsigned FromSz = VT.getVectorElementType().getSizeInBits();
25116     unsigned ToSz = StVT.getVectorElementType().getSizeInBits();
25117
25118     // From, To sizes and ElemCount must be pow of two
25119     if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
25120     // We are going to use the original vector elt for storing.
25121     // Accumulated smaller vector elements must be a multiple of the store size.
25122     if (0 != (NumElems * FromSz) % ToSz) return SDValue();
25123
25124     unsigned SizeRatio  = FromSz / ToSz;
25125
25126     assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
25127
25128     // Create a type on which we perform the shuffle
25129     EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
25130             StVT.getScalarType(), NumElems*SizeRatio);
25131
25132     assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
25133
25134     SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, St->getValue());
25135     SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
25136     for (unsigned i = 0; i != NumElems; ++i)
25137       ShuffleVec[i] = i * SizeRatio;
25138
25139     // Can't shuffle using an illegal type.
25140     if (!TLI.isTypeLegal(WideVecVT))
25141       return SDValue();
25142
25143     SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
25144                                          DAG.getUNDEF(WideVecVT),
25145                                          &ShuffleVec[0]);
25146     // At this point all of the data is stored at the bottom of the
25147     // register. We now need to save it to mem.
25148
25149     // Find the largest store unit
25150     MVT StoreType = MVT::i8;
25151     for (MVT Tp : MVT::integer_valuetypes()) {
25152       if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
25153         StoreType = Tp;
25154     }
25155
25156     // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
25157     if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
25158         (64 <= NumElems * ToSz))
25159       StoreType = MVT::f64;
25160
25161     // Bitcast the original vector into a vector of store-size units
25162     EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
25163             StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
25164     assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
25165     SDValue ShuffWide = DAG.getNode(ISD::BITCAST, dl, StoreVecVT, Shuff);
25166     SmallVector<SDValue, 8> Chains;
25167     SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8,
25168                                         TLI.getPointerTy());
25169     SDValue Ptr = St->getBasePtr();
25170
25171     // Perform one or more big stores into memory.
25172     for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) {
25173       SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
25174                                    StoreType, ShuffWide,
25175                                    DAG.getIntPtrConstant(i));
25176       SDValue Ch = DAG.getStore(St->getChain(), dl, SubVec, Ptr,
25177                                 St->getPointerInfo(), St->isVolatile(),
25178                                 St->isNonTemporal(), St->getAlignment());
25179       Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
25180       Chains.push_back(Ch);
25181     }
25182
25183     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
25184   }
25185
25186   // Turn load->store of MMX types into GPR load/stores.  This avoids clobbering
25187   // the FP state in cases where an emms may be missing.
25188   // A preferable solution to the general problem is to figure out the right
25189   // places to insert EMMS.  This qualifies as a quick hack.
25190
25191   // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
25192   if (VT.getSizeInBits() != 64)
25193     return SDValue();
25194
25195   const Function *F = DAG.getMachineFunction().getFunction();
25196   bool NoImplicitFloatOps = F->getAttributes().
25197     hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat);
25198   bool F64IsLegal = !DAG.getTarget().Options.UseSoftFloat && !NoImplicitFloatOps
25199                      && Subtarget->hasSSE2();
25200   if ((VT.isVector() ||
25201        (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) &&
25202       isa<LoadSDNode>(St->getValue()) &&
25203       !cast<LoadSDNode>(St->getValue())->isVolatile() &&
25204       St->getChain().hasOneUse() && !St->isVolatile()) {
25205     SDNode* LdVal = St->getValue().getNode();
25206     LoadSDNode *Ld = nullptr;
25207     int TokenFactorIndex = -1;
25208     SmallVector<SDValue, 8> Ops;
25209     SDNode* ChainVal = St->getChain().getNode();
25210     // Must be a store of a load.  We currently handle two cases:  the load
25211     // is a direct child, and it's under an intervening TokenFactor.  It is
25212     // possible to dig deeper under nested TokenFactors.
25213     if (ChainVal == LdVal)
25214       Ld = cast<LoadSDNode>(St->getChain());
25215     else if (St->getValue().hasOneUse() &&
25216              ChainVal->getOpcode() == ISD::TokenFactor) {
25217       for (unsigned i = 0, e = ChainVal->getNumOperands(); i != e; ++i) {
25218         if (ChainVal->getOperand(i).getNode() == LdVal) {
25219           TokenFactorIndex = i;
25220           Ld = cast<LoadSDNode>(St->getValue());
25221         } else
25222           Ops.push_back(ChainVal->getOperand(i));
25223       }
25224     }
25225
25226     if (!Ld || !ISD::isNormalLoad(Ld))
25227       return SDValue();
25228
25229     // If this is not the MMX case, i.e. we are just turning i64 load/store
25230     // into f64 load/store, avoid the transformation if there are multiple
25231     // uses of the loaded value.
25232     if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
25233       return SDValue();
25234
25235     SDLoc LdDL(Ld);
25236     SDLoc StDL(N);
25237     // If we are a 64-bit capable x86, lower to a single movq load/store pair.
25238     // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
25239     // pair instead.
25240     if (Subtarget->is64Bit() || F64IsLegal) {
25241       EVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64;
25242       SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
25243                                   Ld->getPointerInfo(), Ld->isVolatile(),
25244                                   Ld->isNonTemporal(), Ld->isInvariant(),
25245                                   Ld->getAlignment());
25246       SDValue NewChain = NewLd.getValue(1);
25247       if (TokenFactorIndex != -1) {
25248         Ops.push_back(NewChain);
25249         NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
25250       }
25251       return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
25252                           St->getPointerInfo(),
25253                           St->isVolatile(), St->isNonTemporal(),
25254                           St->getAlignment());
25255     }
25256
25257     // Otherwise, lower to two pairs of 32-bit loads / stores.
25258     SDValue LoAddr = Ld->getBasePtr();
25259     SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr,
25260                                  DAG.getConstant(4, MVT::i32));
25261
25262     SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
25263                                Ld->getPointerInfo(),
25264                                Ld->isVolatile(), Ld->isNonTemporal(),
25265                                Ld->isInvariant(), Ld->getAlignment());
25266     SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
25267                                Ld->getPointerInfo().getWithOffset(4),
25268                                Ld->isVolatile(), Ld->isNonTemporal(),
25269                                Ld->isInvariant(),
25270                                MinAlign(Ld->getAlignment(), 4));
25271
25272     SDValue NewChain = LoLd.getValue(1);
25273     if (TokenFactorIndex != -1) {
25274       Ops.push_back(LoLd);
25275       Ops.push_back(HiLd);
25276       NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
25277     }
25278
25279     LoAddr = St->getBasePtr();
25280     HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr,
25281                          DAG.getConstant(4, MVT::i32));
25282
25283     SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr,
25284                                 St->getPointerInfo(),
25285                                 St->isVolatile(), St->isNonTemporal(),
25286                                 St->getAlignment());
25287     SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr,
25288                                 St->getPointerInfo().getWithOffset(4),
25289                                 St->isVolatile(),
25290                                 St->isNonTemporal(),
25291                                 MinAlign(St->getAlignment(), 4));
25292     return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
25293   }
25294   return SDValue();
25295 }
25296
25297 /// Return 'true' if this vector operation is "horizontal"
25298 /// and return the operands for the horizontal operation in LHS and RHS.  A
25299 /// horizontal operation performs the binary operation on successive elements
25300 /// of its first operand, then on successive elements of its second operand,
25301 /// returning the resulting values in a vector.  For example, if
25302 ///   A = < float a0, float a1, float a2, float a3 >
25303 /// and
25304 ///   B = < float b0, float b1, float b2, float b3 >
25305 /// then the result of doing a horizontal operation on A and B is
25306 ///   A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
25307 /// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
25308 /// A horizontal-op B, for some already available A and B, and if so then LHS is
25309 /// set to A, RHS to B, and the routine returns 'true'.
25310 /// Note that the binary operation should have the property that if one of the
25311 /// operands is UNDEF then the result is UNDEF.
25312 static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
25313   // Look for the following pattern: if
25314   //   A = < float a0, float a1, float a2, float a3 >
25315   //   B = < float b0, float b1, float b2, float b3 >
25316   // and
25317   //   LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
25318   //   RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
25319   // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
25320   // which is A horizontal-op B.
25321
25322   // At least one of the operands should be a vector shuffle.
25323   if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
25324       RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
25325     return false;
25326
25327   MVT VT = LHS.getSimpleValueType();
25328
25329   assert((VT.is128BitVector() || VT.is256BitVector()) &&
25330          "Unsupported vector type for horizontal add/sub");
25331
25332   // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to
25333   // operate independently on 128-bit lanes.
25334   unsigned NumElts = VT.getVectorNumElements();
25335   unsigned NumLanes = VT.getSizeInBits()/128;
25336   unsigned NumLaneElts = NumElts / NumLanes;
25337   assert((NumLaneElts % 2 == 0) &&
25338          "Vector type should have an even number of elements in each lane");
25339   unsigned HalfLaneElts = NumLaneElts/2;
25340
25341   // View LHS in the form
25342   //   LHS = VECTOR_SHUFFLE A, B, LMask
25343   // If LHS is not a shuffle then pretend it is the shuffle
25344   //   LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
25345   // NOTE: in what follows a default initialized SDValue represents an UNDEF of
25346   // type VT.
25347   SDValue A, B;
25348   SmallVector<int, 16> LMask(NumElts);
25349   if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
25350     if (LHS.getOperand(0).getOpcode() != ISD::UNDEF)
25351       A = LHS.getOperand(0);
25352     if (LHS.getOperand(1).getOpcode() != ISD::UNDEF)
25353       B = LHS.getOperand(1);
25354     ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
25355     std::copy(Mask.begin(), Mask.end(), LMask.begin());
25356   } else {
25357     if (LHS.getOpcode() != ISD::UNDEF)
25358       A = LHS;
25359     for (unsigned i = 0; i != NumElts; ++i)
25360       LMask[i] = i;
25361   }
25362
25363   // Likewise, view RHS in the form
25364   //   RHS = VECTOR_SHUFFLE C, D, RMask
25365   SDValue C, D;
25366   SmallVector<int, 16> RMask(NumElts);
25367   if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
25368     if (RHS.getOperand(0).getOpcode() != ISD::UNDEF)
25369       C = RHS.getOperand(0);
25370     if (RHS.getOperand(1).getOpcode() != ISD::UNDEF)
25371       D = RHS.getOperand(1);
25372     ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
25373     std::copy(Mask.begin(), Mask.end(), RMask.begin());
25374   } else {
25375     if (RHS.getOpcode() != ISD::UNDEF)
25376       C = RHS;
25377     for (unsigned i = 0; i != NumElts; ++i)
25378       RMask[i] = i;
25379   }
25380
25381   // Check that the shuffles are both shuffling the same vectors.
25382   if (!(A == C && B == D) && !(A == D && B == C))
25383     return false;
25384
25385   // If everything is UNDEF then bail out: it would be better to fold to UNDEF.
25386   if (!A.getNode() && !B.getNode())
25387     return false;
25388
25389   // If A and B occur in reverse order in RHS, then "swap" them (which means
25390   // rewriting the mask).
25391   if (A != C)
25392     CommuteVectorShuffleMask(RMask, NumElts);
25393
25394   // At this point LHS and RHS are equivalent to
25395   //   LHS = VECTOR_SHUFFLE A, B, LMask
25396   //   RHS = VECTOR_SHUFFLE A, B, RMask
25397   // Check that the masks correspond to performing a horizontal operation.
25398   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
25399     for (unsigned i = 0; i != NumLaneElts; ++i) {
25400       int LIdx = LMask[i+l], RIdx = RMask[i+l];
25401
25402       // Ignore any UNDEF components.
25403       if (LIdx < 0 || RIdx < 0 ||
25404           (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
25405           (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
25406         continue;
25407
25408       // Check that successive elements are being operated on.  If not, this is
25409       // not a horizontal operation.
25410       unsigned Src = (i/HalfLaneElts); // each lane is split between srcs
25411       int Index = 2*(i%HalfLaneElts) + NumElts*Src + l;
25412       if (!(LIdx == Index && RIdx == Index + 1) &&
25413           !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
25414         return false;
25415     }
25416   }
25417
25418   LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
25419   RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
25420   return true;
25421 }
25422
25423 /// Do target-specific dag combines on floating point adds.
25424 static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG,
25425                                   const X86Subtarget *Subtarget) {
25426   EVT VT = N->getValueType(0);
25427   SDValue LHS = N->getOperand(0);
25428   SDValue RHS = N->getOperand(1);
25429
25430   // Try to synthesize horizontal adds from adds of shuffles.
25431   if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
25432        (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
25433       isHorizontalBinOp(LHS, RHS, true))
25434     return DAG.getNode(X86ISD::FHADD, SDLoc(N), VT, LHS, RHS);
25435   return SDValue();
25436 }
25437
25438 /// Do target-specific dag combines on floating point subs.
25439 static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG,
25440                                   const X86Subtarget *Subtarget) {
25441   EVT VT = N->getValueType(0);
25442   SDValue LHS = N->getOperand(0);
25443   SDValue RHS = N->getOperand(1);
25444
25445   // Try to synthesize horizontal subs from subs of shuffles.
25446   if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
25447        (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
25448       isHorizontalBinOp(LHS, RHS, false))
25449     return DAG.getNode(X86ISD::FHSUB, SDLoc(N), VT, LHS, RHS);
25450   return SDValue();
25451 }
25452
25453 /// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
25454 static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) {
25455   assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
25456
25457   // F[X]OR(0.0, x) -> x
25458   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
25459     if (C->getValueAPF().isPosZero())
25460       return N->getOperand(1);
25461
25462   // F[X]OR(x, 0.0) -> x
25463   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
25464     if (C->getValueAPF().isPosZero())
25465       return N->getOperand(0);
25466   return SDValue();
25467 }
25468
25469 /// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
25470 static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) {
25471   assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
25472
25473   // Only perform optimizations if UnsafeMath is used.
25474   if (!DAG.getTarget().Options.UnsafeFPMath)
25475     return SDValue();
25476
25477   // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
25478   // into FMINC and FMAXC, which are Commutative operations.
25479   unsigned NewOp = 0;
25480   switch (N->getOpcode()) {
25481     default: llvm_unreachable("unknown opcode");
25482     case X86ISD::FMIN:  NewOp = X86ISD::FMINC; break;
25483     case X86ISD::FMAX:  NewOp = X86ISD::FMAXC; break;
25484   }
25485
25486   return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
25487                      N->getOperand(0), N->getOperand(1));
25488 }
25489
25490 /// Do target-specific dag combines on X86ISD::FAND nodes.
25491 static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
25492   // FAND(0.0, x) -> 0.0
25493   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
25494     if (C->getValueAPF().isPosZero())
25495       return N->getOperand(0);
25496
25497   // FAND(x, 0.0) -> 0.0
25498   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
25499     if (C->getValueAPF().isPosZero())
25500       return N->getOperand(1);
25501
25502   return SDValue();
25503 }
25504
25505 /// Do target-specific dag combines on X86ISD::FANDN nodes
25506 static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG) {
25507   // FANDN(0.0, x) -> x
25508   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
25509     if (C->getValueAPF().isPosZero())
25510       return N->getOperand(1);
25511
25512   // FANDN(x, 0.0) -> 0.0
25513   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
25514     if (C->getValueAPF().isPosZero())
25515       return N->getOperand(1);
25516
25517   return SDValue();
25518 }
25519
25520 static SDValue PerformBTCombine(SDNode *N,
25521                                 SelectionDAG &DAG,
25522                                 TargetLowering::DAGCombinerInfo &DCI) {
25523   // BT ignores high bits in the bit index operand.
25524   SDValue Op1 = N->getOperand(1);
25525   if (Op1.hasOneUse()) {
25526     unsigned BitWidth = Op1.getValueSizeInBits();
25527     APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
25528     APInt KnownZero, KnownOne;
25529     TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
25530                                           !DCI.isBeforeLegalizeOps());
25531     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25532     if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) ||
25533         TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO))
25534       DCI.CommitTargetLoweringOpt(TLO);
25535   }
25536   return SDValue();
25537 }
25538
25539 static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) {
25540   SDValue Op = N->getOperand(0);
25541   if (Op.getOpcode() == ISD::BITCAST)
25542     Op = Op.getOperand(0);
25543   EVT VT = N->getValueType(0), OpVT = Op.getValueType();
25544   if (Op.getOpcode() == X86ISD::VZEXT_LOAD &&
25545       VT.getVectorElementType().getSizeInBits() ==
25546       OpVT.getVectorElementType().getSizeInBits()) {
25547     return DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
25548   }
25549   return SDValue();
25550 }
25551
25552 static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG,
25553                                                const X86Subtarget *Subtarget) {
25554   EVT VT = N->getValueType(0);
25555   if (!VT.isVector())
25556     return SDValue();
25557
25558   SDValue N0 = N->getOperand(0);
25559   SDValue N1 = N->getOperand(1);
25560   EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
25561   SDLoc dl(N);
25562
25563   // The SIGN_EXTEND_INREG to v4i64 is expensive operation on the
25564   // both SSE and AVX2 since there is no sign-extended shift right
25565   // operation on a vector with 64-bit elements.
25566   //(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) ->
25567   // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT)))
25568   if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
25569       N0.getOpcode() == ISD::SIGN_EXTEND)) {
25570     SDValue N00 = N0.getOperand(0);
25571
25572     // EXTLOAD has a better solution on AVX2,
25573     // it may be replaced with X86ISD::VSEXT node.
25574     if (N00.getOpcode() == ISD::LOAD && Subtarget->hasInt256())
25575       if (!ISD::isNormalLoad(N00.getNode()))
25576         return SDValue();
25577
25578     if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
25579         SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
25580                                   N00, N1);
25581       return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
25582     }
25583   }
25584   return SDValue();
25585 }
25586
25587 static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
25588                                   TargetLowering::DAGCombinerInfo &DCI,
25589                                   const X86Subtarget *Subtarget) {
25590   SDValue N0 = N->getOperand(0);
25591   EVT VT = N->getValueType(0);
25592
25593   // (i8,i32 sext (sdivrem (i8 x, i8 y)) ->
25594   // (i8,i32 (sdivrem_sext_hreg (i8 x, i8 y)
25595   // This exposes the sext to the sdivrem lowering, so that it directly extends
25596   // from AH (which we otherwise need to do contortions to access).
25597   if (N0.getOpcode() == ISD::SDIVREM && N0.getResNo() == 1 &&
25598       N0.getValueType() == MVT::i8 && VT == MVT::i32) {
25599     SDLoc dl(N);
25600     SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
25601     SDValue R = DAG.getNode(X86ISD::SDIVREM8_SEXT_HREG, dl, NodeTys,
25602                             N0.getOperand(0), N0.getOperand(1));
25603     DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
25604     return R.getValue(1);
25605   }
25606
25607   if (!DCI.isBeforeLegalizeOps())
25608     return SDValue();
25609
25610   if (!Subtarget->hasFp256())
25611     return SDValue();
25612
25613   if (VT.isVector() && VT.getSizeInBits() == 256) {
25614     SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget);
25615     if (R.getNode())
25616       return R;
25617   }
25618
25619   return SDValue();
25620 }
25621
25622 static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG,
25623                                  const X86Subtarget* Subtarget) {
25624   SDLoc dl(N);
25625   EVT VT = N->getValueType(0);
25626
25627   // Let legalize expand this if it isn't a legal type yet.
25628   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
25629     return SDValue();
25630
25631   EVT ScalarVT = VT.getScalarType();
25632   if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
25633       (!Subtarget->hasFMA() && !Subtarget->hasFMA4()))
25634     return SDValue();
25635
25636   SDValue A = N->getOperand(0);
25637   SDValue B = N->getOperand(1);
25638   SDValue C = N->getOperand(2);
25639
25640   bool NegA = (A.getOpcode() == ISD::FNEG);
25641   bool NegB = (B.getOpcode() == ISD::FNEG);
25642   bool NegC = (C.getOpcode() == ISD::FNEG);
25643
25644   // Negative multiplication when NegA xor NegB
25645   bool NegMul = (NegA != NegB);
25646   if (NegA)
25647     A = A.getOperand(0);
25648   if (NegB)
25649     B = B.getOperand(0);
25650   if (NegC)
25651     C = C.getOperand(0);
25652
25653   unsigned Opcode;
25654   if (!NegMul)
25655     Opcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB;
25656   else
25657     Opcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;
25658
25659   return DAG.getNode(Opcode, dl, VT, A, B, C);
25660 }
25661
25662 static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG,
25663                                   TargetLowering::DAGCombinerInfo &DCI,
25664                                   const X86Subtarget *Subtarget) {
25665   // (i32 zext (and (i8  x86isd::setcc_carry), 1)) ->
25666   //           (and (i32 x86isd::setcc_carry), 1)
25667   // This eliminates the zext. This transformation is necessary because
25668   // ISD::SETCC is always legalized to i8.
25669   SDLoc dl(N);
25670   SDValue N0 = N->getOperand(0);
25671   EVT VT = N->getValueType(0);
25672
25673   if (N0.getOpcode() == ISD::AND &&
25674       N0.hasOneUse() &&
25675       N0.getOperand(0).hasOneUse()) {
25676     SDValue N00 = N0.getOperand(0);
25677     if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
25678       ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
25679       if (!C || C->getZExtValue() != 1)
25680         return SDValue();
25681       return DAG.getNode(ISD::AND, dl, VT,
25682                          DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
25683                                      N00.getOperand(0), N00.getOperand(1)),
25684                          DAG.getConstant(1, VT));
25685     }
25686   }
25687
25688   if (N0.getOpcode() == ISD::TRUNCATE &&
25689       N0.hasOneUse() &&
25690       N0.getOperand(0).hasOneUse()) {
25691     SDValue N00 = N0.getOperand(0);
25692     if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
25693       return DAG.getNode(ISD::AND, dl, VT,
25694                          DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
25695                                      N00.getOperand(0), N00.getOperand(1)),
25696                          DAG.getConstant(1, VT));
25697     }
25698   }
25699   if (VT.is256BitVector()) {
25700     SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget);
25701     if (R.getNode())
25702       return R;
25703   }
25704
25705   // (i8,i32 zext (udivrem (i8 x, i8 y)) ->
25706   // (i8,i32 (udivrem_zext_hreg (i8 x, i8 y)
25707   // This exposes the zext to the udivrem lowering, so that it directly extends
25708   // from AH (which we otherwise need to do contortions to access).
25709   if (N0.getOpcode() == ISD::UDIVREM &&
25710       N0.getResNo() == 1 && N0.getValueType() == MVT::i8 &&
25711       (VT == MVT::i32 || VT == MVT::i64)) {
25712     SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
25713     SDValue R = DAG.getNode(X86ISD::UDIVREM8_ZEXT_HREG, dl, NodeTys,
25714                             N0.getOperand(0), N0.getOperand(1));
25715     DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
25716     return R.getValue(1);
25717   }
25718
25719   return SDValue();
25720 }
25721
25722 // Optimize x == -y --> x+y == 0
25723 //          x != -y --> x+y != 0
25724 static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG,
25725                                       const X86Subtarget* Subtarget) {
25726   ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
25727   SDValue LHS = N->getOperand(0);
25728   SDValue RHS = N->getOperand(1);
25729   EVT VT = N->getValueType(0);
25730   SDLoc DL(N);
25731
25732   if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB)
25733     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(LHS.getOperand(0)))
25734       if (C->getAPIntValue() == 0 && LHS.hasOneUse()) {
25735         SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N),
25736                                    LHS.getValueType(), RHS, LHS.getOperand(1));
25737         return DAG.getSetCC(SDLoc(N), N->getValueType(0),
25738                             addV, DAG.getConstant(0, addV.getValueType()), CC);
25739       }
25740   if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB)
25741     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS.getOperand(0)))
25742       if (C->getAPIntValue() == 0 && RHS.hasOneUse()) {
25743         SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N),
25744                                    RHS.getValueType(), LHS, RHS.getOperand(1));
25745         return DAG.getSetCC(SDLoc(N), N->getValueType(0),
25746                             addV, DAG.getConstant(0, addV.getValueType()), CC);
25747       }
25748
25749   if (VT.getScalarType() == MVT::i1) {
25750     bool IsSEXT0 = (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
25751       (LHS.getOperand(0).getValueType().getScalarType() ==  MVT::i1);
25752     bool IsVZero0 = ISD::isBuildVectorAllZeros(LHS.getNode());
25753     if (!IsSEXT0 && !IsVZero0)
25754       return SDValue();
25755     bool IsSEXT1 = (RHS.getOpcode() == ISD::SIGN_EXTEND) &&
25756       (RHS.getOperand(0).getValueType().getScalarType() ==  MVT::i1);
25757     bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
25758
25759     if (!IsSEXT1 && !IsVZero1)
25760       return SDValue();
25761
25762     if (IsSEXT0 && IsVZero1) {
25763       assert(VT == LHS.getOperand(0).getValueType() && "Uexpected operand type");
25764       if (CC == ISD::SETEQ)
25765         return DAG.getNOT(DL, LHS.getOperand(0), VT);
25766       return LHS.getOperand(0);
25767     }
25768     if (IsSEXT1 && IsVZero0) {
25769       assert(VT == RHS.getOperand(0).getValueType() && "Uexpected operand type");
25770       if (CC == ISD::SETEQ)
25771         return DAG.getNOT(DL, RHS.getOperand(0), VT);
25772       return RHS.getOperand(0);
25773     }
25774   }
25775
25776   return SDValue();
25777 }
25778
25779 static SDValue PerformINSERTPSCombine(SDNode *N, SelectionDAG &DAG,
25780                                       const X86Subtarget *Subtarget) {
25781   SDLoc dl(N);
25782   MVT VT = N->getOperand(1)->getSimpleValueType(0);
25783   assert((VT == MVT::v4f32 || VT == MVT::v4i32) &&
25784          "X86insertps is only defined for v4x32");
25785
25786   SDValue Ld = N->getOperand(1);
25787   if (MayFoldLoad(Ld)) {
25788     // Extract the countS bits from the immediate so we can get the proper
25789     // address when narrowing the vector load to a specific element.
25790     // When the second source op is a memory address, interps doesn't use
25791     // countS and just gets an f32 from that address.
25792     unsigned DestIndex =
25793         cast<ConstantSDNode>(N->getOperand(2))->getZExtValue() >> 6;
25794     Ld = NarrowVectorLoadToElement(cast<LoadSDNode>(Ld), DestIndex, DAG);
25795   } else
25796     return SDValue();
25797
25798   // Create this as a scalar to vector to match the instruction pattern.
25799   SDValue LoadScalarToVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Ld);
25800   // countS bits are ignored when loading from memory on insertps, which
25801   // means we don't need to explicitly set them to 0.
25802   return DAG.getNode(X86ISD::INSERTPS, dl, VT, N->getOperand(0),
25803                      LoadScalarToVector, N->getOperand(2));
25804 }
25805
25806 // Helper function of PerformSETCCCombine. It is to materialize "setb reg"
25807 // as "sbb reg,reg", since it can be extended without zext and produces
25808 // an all-ones bit which is more useful than 0/1 in some cases.
25809 static SDValue MaterializeSETB(SDLoc DL, SDValue EFLAGS, SelectionDAG &DAG,
25810                                MVT VT) {
25811   if (VT == MVT::i8)
25812     return DAG.getNode(ISD::AND, DL, VT,
25813                        DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
25814                                    DAG.getConstant(X86::COND_B, MVT::i8), EFLAGS),
25815                        DAG.getConstant(1, VT));
25816   assert (VT == MVT::i1 && "Unexpected type for SECCC node");
25817   return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1,
25818                      DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
25819                                  DAG.getConstant(X86::COND_B, MVT::i8), EFLAGS));
25820 }
25821
25822 // Optimize  RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
25823 static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG,
25824                                    TargetLowering::DAGCombinerInfo &DCI,
25825                                    const X86Subtarget *Subtarget) {
25826   SDLoc DL(N);
25827   X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
25828   SDValue EFLAGS = N->getOperand(1);
25829
25830   if (CC == X86::COND_A) {
25831     // Try to convert COND_A into COND_B in an attempt to facilitate
25832     // materializing "setb reg".
25833     //
25834     // Do not flip "e > c", where "c" is a constant, because Cmp instruction
25835     // cannot take an immediate as its first operand.
25836     //
25837     if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
25838         EFLAGS.getValueType().isInteger() &&
25839         !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
25840       SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
25841                                    EFLAGS.getNode()->getVTList(),
25842                                    EFLAGS.getOperand(1), EFLAGS.getOperand(0));
25843       SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
25844       return MaterializeSETB(DL, NewEFLAGS, DAG, N->getSimpleValueType(0));
25845     }
25846   }
25847
25848   // Materialize "setb reg" as "sbb reg,reg", since it can be extended without
25849   // a zext and produces an all-ones bit which is more useful than 0/1 in some
25850   // cases.
25851   if (CC == X86::COND_B)
25852     return MaterializeSETB(DL, EFLAGS, DAG, N->getSimpleValueType(0));
25853
25854   SDValue Flags;
25855
25856   Flags = checkBoolTestSetCCCombine(EFLAGS, CC);
25857   if (Flags.getNode()) {
25858     SDValue Cond = DAG.getConstant(CC, MVT::i8);
25859     return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags);
25860   }
25861
25862   return SDValue();
25863 }
25864
25865 // Optimize branch condition evaluation.
25866 //
25867 static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG,
25868                                     TargetLowering::DAGCombinerInfo &DCI,
25869                                     const X86Subtarget *Subtarget) {
25870   SDLoc DL(N);
25871   SDValue Chain = N->getOperand(0);
25872   SDValue Dest = N->getOperand(1);
25873   SDValue EFLAGS = N->getOperand(3);
25874   X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
25875
25876   SDValue Flags;
25877
25878   Flags = checkBoolTestSetCCCombine(EFLAGS, CC);
25879   if (Flags.getNode()) {
25880     SDValue Cond = DAG.getConstant(CC, MVT::i8);
25881     return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), Chain, Dest, Cond,
25882                        Flags);
25883   }
25884
25885   return SDValue();
25886 }
25887
25888 static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
25889                                                          SelectionDAG &DAG) {
25890   // Take advantage of vector comparisons producing 0 or -1 in each lane to
25891   // optimize away operation when it's from a constant.
25892   //
25893   // The general transformation is:
25894   //    UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
25895   //       AND(VECTOR_CMP(x,y), constant2)
25896   //    constant2 = UNARYOP(constant)
25897
25898   // Early exit if this isn't a vector operation, the operand of the
25899   // unary operation isn't a bitwise AND, or if the sizes of the operations
25900   // aren't the same.
25901   EVT VT = N->getValueType(0);
25902   if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
25903       N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
25904       VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
25905     return SDValue();
25906
25907   // Now check that the other operand of the AND is a constant. We could
25908   // make the transformation for non-constant splats as well, but it's unclear
25909   // that would be a benefit as it would not eliminate any operations, just
25910   // perform one more step in scalar code before moving to the vector unit.
25911   if (BuildVectorSDNode *BV =
25912           dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
25913     // Bail out if the vector isn't a constant.
25914     if (!BV->isConstant())
25915       return SDValue();
25916
25917     // Everything checks out. Build up the new and improved node.
25918     SDLoc DL(N);
25919     EVT IntVT = BV->getValueType(0);
25920     // Create a new constant of the appropriate type for the transformed
25921     // DAG.
25922     SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
25923     // The AND node needs bitcasts to/from an integer vector type around it.
25924     SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
25925     SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
25926                                  N->getOperand(0)->getOperand(0), MaskConst);
25927     SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
25928     return Res;
25929   }
25930
25931   return SDValue();
25932 }
25933
25934 static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
25935                                         const X86Subtarget *Subtarget) {
25936   // First try to optimize away the conversion entirely when it's
25937   // conditionally from a constant. Vectors only.
25938   SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG);
25939   if (Res != SDValue())
25940     return Res;
25941
25942   // Now move on to more general possibilities.
25943   SDValue Op0 = N->getOperand(0);
25944   EVT InVT = Op0->getValueType(0);
25945
25946   // SINT_TO_FP(v4i8) -> SINT_TO_FP(SEXT(v4i8 to v4i32))
25947   if (InVT == MVT::v8i8 || InVT == MVT::v4i8) {
25948     SDLoc dl(N);
25949     MVT DstVT = InVT == MVT::v4i8 ? MVT::v4i32 : MVT::v8i32;
25950     SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
25951     return DAG.getNode(ISD::SINT_TO_FP, dl, N->getValueType(0), P);
25952   }
25953
25954   // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
25955   // a 32-bit target where SSE doesn't support i64->FP operations.
25956   if (Op0.getOpcode() == ISD::LOAD) {
25957     LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
25958     EVT VT = Ld->getValueType(0);
25959     if (!Ld->isVolatile() && !N->getValueType(0).isVector() &&
25960         ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
25961         !Subtarget->is64Bit() && VT == MVT::i64) {
25962       SDValue FILDChain = Subtarget->getTargetLowering()->BuildFILD(
25963           SDValue(N, 0), Ld->getValueType(0), Ld->getChain(), Op0, DAG);
25964       DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
25965       return FILDChain;
25966     }
25967   }
25968   return SDValue();
25969 }
25970
25971 // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
25972 static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG,
25973                                  X86TargetLowering::DAGCombinerInfo &DCI) {
25974   // If the LHS and RHS of the ADC node are zero, then it can't overflow and
25975   // the result is either zero or one (depending on the input carry bit).
25976   // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
25977   if (X86::isZeroNode(N->getOperand(0)) &&
25978       X86::isZeroNode(N->getOperand(1)) &&
25979       // We don't have a good way to replace an EFLAGS use, so only do this when
25980       // dead right now.
25981       SDValue(N, 1).use_empty()) {
25982     SDLoc DL(N);
25983     EVT VT = N->getValueType(0);
25984     SDValue CarryOut = DAG.getConstant(0, N->getValueType(1));
25985     SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
25986                                DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
25987                                            DAG.getConstant(X86::COND_B,MVT::i8),
25988                                            N->getOperand(2)),
25989                                DAG.getConstant(1, VT));
25990     return DCI.CombineTo(N, Res1, CarryOut);
25991   }
25992
25993   return SDValue();
25994 }
25995
25996 // fold (add Y, (sete  X, 0)) -> adc  0, Y
25997 //      (add Y, (setne X, 0)) -> sbb -1, Y
25998 //      (sub (sete  X, 0), Y) -> sbb  0, Y
25999 //      (sub (setne X, 0), Y) -> adc -1, Y
26000 static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) {
26001   SDLoc DL(N);
26002
26003   // Look through ZExts.
26004   SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 1 : 0);
26005   if (Ext.getOpcode() != ISD::ZERO_EXTEND || !Ext.hasOneUse())
26006     return SDValue();
26007
26008   SDValue SetCC = Ext.getOperand(0);
26009   if (SetCC.getOpcode() != X86ISD::SETCC || !SetCC.hasOneUse())
26010     return SDValue();
26011
26012   X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0);
26013   if (CC != X86::COND_E && CC != X86::COND_NE)
26014     return SDValue();
26015
26016   SDValue Cmp = SetCC.getOperand(1);
26017   if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
26018       !X86::isZeroNode(Cmp.getOperand(1)) ||
26019       !Cmp.getOperand(0).getValueType().isInteger())
26020     return SDValue();
26021
26022   SDValue CmpOp0 = Cmp.getOperand(0);
26023   SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0,
26024                                DAG.getConstant(1, CmpOp0.getValueType()));
26025
26026   SDValue OtherVal = N->getOperand(N->getOpcode() == ISD::SUB ? 0 : 1);
26027   if (CC == X86::COND_NE)
26028     return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::ADC : X86ISD::SBB,
26029                        DL, OtherVal.getValueType(), OtherVal,
26030                        DAG.getConstant(-1ULL, OtherVal.getValueType()), NewCmp);
26031   return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::SBB : X86ISD::ADC,
26032                      DL, OtherVal.getValueType(), OtherVal,
26033                      DAG.getConstant(0, OtherVal.getValueType()), NewCmp);
26034 }
26035
26036 /// PerformADDCombine - Do target-specific dag combines on integer adds.
26037 static SDValue PerformAddCombine(SDNode *N, SelectionDAG &DAG,
26038                                  const X86Subtarget *Subtarget) {
26039   EVT VT = N->getValueType(0);
26040   SDValue Op0 = N->getOperand(0);
26041   SDValue Op1 = N->getOperand(1);
26042
26043   // Try to synthesize horizontal adds from adds of shuffles.
26044   if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
26045        (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
26046       isHorizontalBinOp(Op0, Op1, true))
26047     return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);
26048
26049   return OptimizeConditionalInDecrement(N, DAG);
26050 }
26051
26052 static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG,
26053                                  const X86Subtarget *Subtarget) {
26054   SDValue Op0 = N->getOperand(0);
26055   SDValue Op1 = N->getOperand(1);
26056
26057   // X86 can't encode an immediate LHS of a sub. See if we can push the
26058   // negation into a preceding instruction.
26059   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
26060     // If the RHS of the sub is a XOR with one use and a constant, invert the
26061     // immediate. Then add one to the LHS of the sub so we can turn
26062     // X-Y -> X+~Y+1, saving one register.
26063     if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
26064         isa<ConstantSDNode>(Op1.getOperand(1))) {
26065       APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
26066       EVT VT = Op0.getValueType();
26067       SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
26068                                    Op1.getOperand(0),
26069                                    DAG.getConstant(~XorC, VT));
26070       return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
26071                          DAG.getConstant(C->getAPIntValue()+1, VT));
26072     }
26073   }
26074
26075   // Try to synthesize horizontal adds from adds of shuffles.
26076   EVT VT = N->getValueType(0);
26077   if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
26078        (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
26079       isHorizontalBinOp(Op0, Op1, true))
26080     return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);
26081
26082   return OptimizeConditionalInDecrement(N, DAG);
26083 }
26084
26085 /// performVZEXTCombine - Performs build vector combines
26086 static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG,
26087                                    TargetLowering::DAGCombinerInfo &DCI,
26088                                    const X86Subtarget *Subtarget) {
26089   SDLoc DL(N);
26090   MVT VT = N->getSimpleValueType(0);
26091   SDValue Op = N->getOperand(0);
26092   MVT OpVT = Op.getSimpleValueType();
26093   MVT OpEltVT = OpVT.getVectorElementType();
26094   unsigned InputBits = OpEltVT.getSizeInBits() * VT.getVectorNumElements();
26095
26096   // (vzext (bitcast (vzext (x)) -> (vzext x)
26097   SDValue V = Op;
26098   while (V.getOpcode() == ISD::BITCAST)
26099     V = V.getOperand(0);
26100
26101   if (V != Op && V.getOpcode() == X86ISD::VZEXT) {
26102     MVT InnerVT = V.getSimpleValueType();
26103     MVT InnerEltVT = InnerVT.getVectorElementType();
26104
26105     // If the element sizes match exactly, we can just do one larger vzext. This
26106     // is always an exact type match as vzext operates on integer types.
26107     if (OpEltVT == InnerEltVT) {
26108       assert(OpVT == InnerVT && "Types must match for vzext!");
26109       return DAG.getNode(X86ISD::VZEXT, DL, VT, V.getOperand(0));
26110     }
26111
26112     // The only other way we can combine them is if only a single element of the
26113     // inner vzext is used in the input to the outer vzext.
26114     if (InnerEltVT.getSizeInBits() < InputBits)
26115       return SDValue();
26116
26117     // In this case, the inner vzext is completely dead because we're going to
26118     // only look at bits inside of the low element. Just do the outer vzext on
26119     // a bitcast of the input to the inner.
26120     return DAG.getNode(X86ISD::VZEXT, DL, VT,
26121                        DAG.getNode(ISD::BITCAST, DL, OpVT, V));
26122   }
26123
26124   // Check if we can bypass extracting and re-inserting an element of an input
26125   // vector. Essentialy:
26126   // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
26127   if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
26128       V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
26129       V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) {
26130     SDValue ExtractedV = V.getOperand(0);
26131     SDValue OrigV = ExtractedV.getOperand(0);
26132     if (auto *ExtractIdx = dyn_cast<ConstantSDNode>(ExtractedV.getOperand(1)))
26133       if (ExtractIdx->getZExtValue() == 0) {
26134         MVT OrigVT = OrigV.getSimpleValueType();
26135         // Extract a subvector if necessary...
26136         if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) {
26137           int Ratio = OrigVT.getSizeInBits() / OpVT.getSizeInBits();
26138           OrigVT = MVT::getVectorVT(OrigVT.getVectorElementType(),
26139                                     OrigVT.getVectorNumElements() / Ratio);
26140           OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV,
26141                               DAG.getIntPtrConstant(0));
26142         }
26143         Op = DAG.getNode(ISD::BITCAST, DL, OpVT, OrigV);
26144         return DAG.getNode(X86ISD::VZEXT, DL, VT, Op);
26145       }
26146   }
26147
26148   return SDValue();
26149 }
26150
26151 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
26152                                              DAGCombinerInfo &DCI) const {
26153   SelectionDAG &DAG = DCI.DAG;
26154   switch (N->getOpcode()) {
26155   default: break;
26156   case ISD::EXTRACT_VECTOR_ELT:
26157     return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, DCI);
26158   case ISD::VSELECT:
26159   case ISD::SELECT:
26160   case X86ISD::SHRUNKBLEND:
26161     return PerformSELECTCombine(N, DAG, DCI, Subtarget);
26162   case ISD::BITCAST:        return PerformBITCASTCombine(N, DAG);
26163   case X86ISD::CMOV:        return PerformCMOVCombine(N, DAG, DCI, Subtarget);
26164   case ISD::ADD:            return PerformAddCombine(N, DAG, Subtarget);
26165   case ISD::SUB:            return PerformSubCombine(N, DAG, Subtarget);
26166   case X86ISD::ADC:         return PerformADCCombine(N, DAG, DCI);
26167   case ISD::MUL:            return PerformMulCombine(N, DAG, DCI);
26168   case ISD::SHL:
26169   case ISD::SRA:
26170   case ISD::SRL:            return PerformShiftCombine(N, DAG, DCI, Subtarget);
26171   case ISD::AND:            return PerformAndCombine(N, DAG, DCI, Subtarget);
26172   case ISD::OR:             return PerformOrCombine(N, DAG, DCI, Subtarget);
26173   case ISD::XOR:            return PerformXorCombine(N, DAG, DCI, Subtarget);
26174   case ISD::LOAD:           return PerformLOADCombine(N, DAG, DCI, Subtarget);
26175   case ISD::MLOAD:          return PerformMLOADCombine(N, DAG, DCI, Subtarget);
26176   case ISD::STORE:          return PerformSTORECombine(N, DAG, Subtarget);
26177   case ISD::MSTORE:         return PerformMSTORECombine(N, DAG, Subtarget);
26178   case ISD::SINT_TO_FP:     return PerformSINT_TO_FPCombine(N, DAG, Subtarget);
26179   case ISD::FADD:           return PerformFADDCombine(N, DAG, Subtarget);
26180   case ISD::FSUB:           return PerformFSUBCombine(N, DAG, Subtarget);
26181   case X86ISD::FXOR:
26182   case X86ISD::FOR:         return PerformFORCombine(N, DAG);
26183   case X86ISD::FMIN:
26184   case X86ISD::FMAX:        return PerformFMinFMaxCombine(N, DAG);
26185   case X86ISD::FAND:        return PerformFANDCombine(N, DAG);
26186   case X86ISD::FANDN:       return PerformFANDNCombine(N, DAG);
26187   case X86ISD::BT:          return PerformBTCombine(N, DAG, DCI);
26188   case X86ISD::VZEXT_MOVL:  return PerformVZEXT_MOVLCombine(N, DAG);
26189   case ISD::ANY_EXTEND:
26190   case ISD::ZERO_EXTEND:    return PerformZExtCombine(N, DAG, DCI, Subtarget);
26191   case ISD::SIGN_EXTEND:    return PerformSExtCombine(N, DAG, DCI, Subtarget);
26192   case ISD::SIGN_EXTEND_INREG:
26193     return PerformSIGN_EXTEND_INREGCombine(N, DAG, Subtarget);
26194   case ISD::TRUNCATE:       return PerformTruncateCombine(N, DAG,DCI,Subtarget);
26195   case ISD::SETCC:          return PerformISDSETCCCombine(N, DAG, Subtarget);
26196   case X86ISD::SETCC:       return PerformSETCCCombine(N, DAG, DCI, Subtarget);
26197   case X86ISD::BRCOND:      return PerformBrCondCombine(N, DAG, DCI, Subtarget);
26198   case X86ISD::VZEXT:       return performVZEXTCombine(N, DAG, DCI, Subtarget);
26199   case X86ISD::SHUFP:       // Handle all target specific shuffles
26200   case X86ISD::PALIGNR:
26201   case X86ISD::UNPCKH:
26202   case X86ISD::UNPCKL:
26203   case X86ISD::MOVHLPS:
26204   case X86ISD::MOVLHPS:
26205   case X86ISD::PSHUFB:
26206   case X86ISD::PSHUFD:
26207   case X86ISD::PSHUFHW:
26208   case X86ISD::PSHUFLW:
26209   case X86ISD::MOVSS:
26210   case X86ISD::MOVSD:
26211   case X86ISD::VPERMILPI:
26212   case X86ISD::VPERM2X128:
26213   case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget);
26214   case ISD::FMA:            return PerformFMACombine(N, DAG, Subtarget);
26215   case ISD::INTRINSIC_WO_CHAIN:
26216     return PerformINTRINSIC_WO_CHAINCombine(N, DAG, Subtarget);
26217   case X86ISD::INSERTPS: {
26218     if (getTargetMachine().getOptLevel() > CodeGenOpt::None)
26219       return PerformINSERTPSCombine(N, DAG, Subtarget);
26220     break;
26221   }
26222   case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DAG, Subtarget);
26223   }
26224
26225   return SDValue();
26226 }
26227
26228 /// isTypeDesirableForOp - Return true if the target has native support for
26229 /// the specified value type and it is 'desirable' to use the type for the
26230 /// given node type. e.g. On x86 i16 is legal, but undesirable since i16
26231 /// instruction encodings are longer and some i16 instructions are slow.
26232 bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
26233   if (!isTypeLegal(VT))
26234     return false;
26235   if (VT != MVT::i16)
26236     return true;
26237
26238   switch (Opc) {
26239   default:
26240     return true;
26241   case ISD::LOAD:
26242   case ISD::SIGN_EXTEND:
26243   case ISD::ZERO_EXTEND:
26244   case ISD::ANY_EXTEND:
26245   case ISD::SHL:
26246   case ISD::SRL:
26247   case ISD::SUB:
26248   case ISD::ADD:
26249   case ISD::MUL:
26250   case ISD::AND:
26251   case ISD::OR:
26252   case ISD::XOR:
26253     return false;
26254   }
26255 }
26256
26257 /// IsDesirableToPromoteOp - This method query the target whether it is
26258 /// beneficial for dag combiner to promote the specified node. If true, it
26259 /// should return the desired promotion type by reference.
26260 bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
26261   EVT VT = Op.getValueType();
26262   if (VT != MVT::i16)
26263     return false;
26264
26265   bool Promote = false;
26266   bool Commute = false;
26267   switch (Op.getOpcode()) {
26268   default: break;
26269   case ISD::LOAD: {
26270     LoadSDNode *LD = cast<LoadSDNode>(Op);
26271     // If the non-extending load has a single use and it's not live out, then it
26272     // might be folded.
26273     if (LD->getExtensionType() == ISD::NON_EXTLOAD /*&&
26274                                                      Op.hasOneUse()*/) {
26275       for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
26276              UE = Op.getNode()->use_end(); UI != UE; ++UI) {
26277         // The only case where we'd want to promote LOAD (rather then it being
26278         // promoted as an operand is when it's only use is liveout.
26279         if (UI->getOpcode() != ISD::CopyToReg)
26280           return false;
26281       }
26282     }
26283     Promote = true;
26284     break;
26285   }
26286   case ISD::SIGN_EXTEND:
26287   case ISD::ZERO_EXTEND:
26288   case ISD::ANY_EXTEND:
26289     Promote = true;
26290     break;
26291   case ISD::SHL:
26292   case ISD::SRL: {
26293     SDValue N0 = Op.getOperand(0);
26294     // Look out for (store (shl (load), x)).
26295     if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
26296       return false;
26297     Promote = true;
26298     break;
26299   }
26300   case ISD::ADD:
26301   case ISD::MUL:
26302   case ISD::AND:
26303   case ISD::OR:
26304   case ISD::XOR:
26305     Commute = true;
26306     // fallthrough
26307   case ISD::SUB: {
26308     SDValue N0 = Op.getOperand(0);
26309     SDValue N1 = Op.getOperand(1);
26310     if (!Commute && MayFoldLoad(N1))
26311       return false;
26312     // Avoid disabling potential load folding opportunities.
26313     if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
26314       return false;
26315     if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
26316       return false;
26317     Promote = true;
26318   }
26319   }
26320
26321   PVT = MVT::i32;
26322   return Promote;
26323 }
26324
26325 //===----------------------------------------------------------------------===//
26326 //                           X86 Inline Assembly Support
26327 //===----------------------------------------------------------------------===//
26328
26329 namespace {
26330   // Helper to match a string separated by whitespace.
26331   bool matchAsmImpl(StringRef s, ArrayRef<const StringRef *> args) {
26332     s = s.substr(s.find_first_not_of(" \t")); // Skip leading whitespace.
26333
26334     for (unsigned i = 0, e = args.size(); i != e; ++i) {
26335       StringRef piece(*args[i]);
26336       if (!s.startswith(piece)) // Check if the piece matches.
26337         return false;
26338
26339       s = s.substr(piece.size());
26340       StringRef::size_type pos = s.find_first_not_of(" \t");
26341       if (pos == 0) // We matched a prefix.
26342         return false;
26343
26344       s = s.substr(pos);
26345     }
26346
26347     return s.empty();
26348   }
26349   const VariadicFunction1<bool, StringRef, StringRef, matchAsmImpl> matchAsm={};
26350 }
26351
26352 static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
26353
26354   if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
26355     if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
26356         std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
26357         std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {
26358
26359       if (AsmPieces.size() == 3)
26360         return true;
26361       else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
26362         return true;
26363     }
26364   }
26365   return false;
26366 }
26367
26368 bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
26369   InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
26370
26371   std::string AsmStr = IA->getAsmString();
26372
26373   IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
26374   if (!Ty || Ty->getBitWidth() % 16 != 0)
26375     return false;
26376
26377   // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
26378   SmallVector<StringRef, 4> AsmPieces;
26379   SplitString(AsmStr, AsmPieces, ";\n");
26380
26381   switch (AsmPieces.size()) {
26382   default: return false;
26383   case 1:
26384     // FIXME: this should verify that we are targeting a 486 or better.  If not,
26385     // we will turn this bswap into something that will be lowered to logical
26386     // ops instead of emitting the bswap asm.  For now, we don't support 486 or
26387     // lower so don't worry about this.
26388     // bswap $0
26389     if (matchAsm(AsmPieces[0], "bswap", "$0") ||
26390         matchAsm(AsmPieces[0], "bswapl", "$0") ||
26391         matchAsm(AsmPieces[0], "bswapq", "$0") ||
26392         matchAsm(AsmPieces[0], "bswap", "${0:q}") ||
26393         matchAsm(AsmPieces[0], "bswapl", "${0:q}") ||
26394         matchAsm(AsmPieces[0], "bswapq", "${0:q}")) {
26395       // No need to check constraints, nothing other than the equivalent of
26396       // "=r,0" would be valid here.
26397       return IntrinsicLowering::LowerToByteSwap(CI);
26398     }
26399
26400     // rorw $$8, ${0:w}  -->  llvm.bswap.i16
26401     if (CI->getType()->isIntegerTy(16) &&
26402         IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
26403         (matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") ||
26404          matchAsm(AsmPieces[0], "rolw", "$$8,", "${0:w}"))) {
26405       AsmPieces.clear();
26406       const std::string &ConstraintsStr = IA->getConstraintString();
26407       SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
26408       array_pod_sort(AsmPieces.begin(), AsmPieces.end());
26409       if (clobbersFlagRegisters(AsmPieces))
26410         return IntrinsicLowering::LowerToByteSwap(CI);
26411     }
26412     break;
26413   case 3:
26414     if (CI->getType()->isIntegerTy(32) &&
26415         IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
26416         matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") &&
26417         matchAsm(AsmPieces[1], "rorl", "$$16,", "$0") &&
26418         matchAsm(AsmPieces[2], "rorw", "$$8,", "${0:w}")) {
26419       AsmPieces.clear();
26420       const std::string &ConstraintsStr = IA->getConstraintString();
26421       SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
26422       array_pod_sort(AsmPieces.begin(), AsmPieces.end());
26423       if (clobbersFlagRegisters(AsmPieces))
26424         return IntrinsicLowering::LowerToByteSwap(CI);
26425     }
26426
26427     if (CI->getType()->isIntegerTy(64)) {
26428       InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
26429       if (Constraints.size() >= 2 &&
26430           Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
26431           Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
26432         // bswap %eax / bswap %edx / xchgl %eax, %edx  -> llvm.bswap.i64
26433         if (matchAsm(AsmPieces[0], "bswap", "%eax") &&
26434             matchAsm(AsmPieces[1], "bswap", "%edx") &&
26435             matchAsm(AsmPieces[2], "xchgl", "%eax,", "%edx"))
26436           return IntrinsicLowering::LowerToByteSwap(CI);
26437       }
26438     }
26439     break;
26440   }
26441   return false;
26442 }
26443
26444 /// getConstraintType - Given a constraint letter, return the type of
26445 /// constraint it is for this target.
26446 X86TargetLowering::ConstraintType
26447 X86TargetLowering::getConstraintType(const std::string &Constraint) const {
26448   if (Constraint.size() == 1) {
26449     switch (Constraint[0]) {
26450     case 'R':
26451     case 'q':
26452     case 'Q':
26453     case 'f':
26454     case 't':
26455     case 'u':
26456     case 'y':
26457     case 'x':
26458     case 'Y':
26459     case 'l':
26460       return C_RegisterClass;
26461     case 'a':
26462     case 'b':
26463     case 'c':
26464     case 'd':
26465     case 'S':
26466     case 'D':
26467     case 'A':
26468       return C_Register;
26469     case 'I':
26470     case 'J':
26471     case 'K':
26472     case 'L':
26473     case 'M':
26474     case 'N':
26475     case 'G':
26476     case 'C':
26477     case 'e':
26478     case 'Z':
26479       return C_Other;
26480     default:
26481       break;
26482     }
26483   }
26484   return TargetLowering::getConstraintType(Constraint);
26485 }
26486
26487 /// Examine constraint type and operand type and determine a weight value.
26488 /// This object must already have been set up with the operand type
26489 /// and the current alternative constraint selected.
26490 TargetLowering::ConstraintWeight
26491   X86TargetLowering::getSingleConstraintMatchWeight(
26492     AsmOperandInfo &info, const char *constraint) const {
26493   ConstraintWeight weight = CW_Invalid;
26494   Value *CallOperandVal = info.CallOperandVal;
26495     // If we don't have a value, we can't do a match,
26496     // but allow it at the lowest weight.
26497   if (!CallOperandVal)
26498     return CW_Default;
26499   Type *type = CallOperandVal->getType();
26500   // Look at the constraint type.
26501   switch (*constraint) {
26502   default:
26503     weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
26504   case 'R':
26505   case 'q':
26506   case 'Q':
26507   case 'a':
26508   case 'b':
26509   case 'c':
26510   case 'd':
26511   case 'S':
26512   case 'D':
26513   case 'A':
26514     if (CallOperandVal->getType()->isIntegerTy())
26515       weight = CW_SpecificReg;
26516     break;
26517   case 'f':
26518   case 't':
26519   case 'u':
26520     if (type->isFloatingPointTy())
26521       weight = CW_SpecificReg;
26522     break;
26523   case 'y':
26524     if (type->isX86_MMXTy() && Subtarget->hasMMX())
26525       weight = CW_SpecificReg;
26526     break;
26527   case 'x':
26528   case 'Y':
26529     if (((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasSSE1()) ||
26530         ((type->getPrimitiveSizeInBits() == 256) && Subtarget->hasFp256()))
26531       weight = CW_Register;
26532     break;
26533   case 'I':
26534     if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
26535       if (C->getZExtValue() <= 31)
26536         weight = CW_Constant;
26537     }
26538     break;
26539   case 'J':
26540     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26541       if (C->getZExtValue() <= 63)
26542         weight = CW_Constant;
26543     }
26544     break;
26545   case 'K':
26546     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26547       if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
26548         weight = CW_Constant;
26549     }
26550     break;
26551   case 'L':
26552     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26553       if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
26554         weight = CW_Constant;
26555     }
26556     break;
26557   case 'M':
26558     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26559       if (C->getZExtValue() <= 3)
26560         weight = CW_Constant;
26561     }
26562     break;
26563   case 'N':
26564     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26565       if (C->getZExtValue() <= 0xff)
26566         weight = CW_Constant;
26567     }
26568     break;
26569   case 'G':
26570   case 'C':
26571     if (dyn_cast<ConstantFP>(CallOperandVal)) {
26572       weight = CW_Constant;
26573     }
26574     break;
26575   case 'e':
26576     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26577       if ((C->getSExtValue() >= -0x80000000LL) &&
26578           (C->getSExtValue() <= 0x7fffffffLL))
26579         weight = CW_Constant;
26580     }
26581     break;
26582   case 'Z':
26583     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26584       if (C->getZExtValue() <= 0xffffffff)
26585         weight = CW_Constant;
26586     }
26587     break;
26588   }
26589   return weight;
26590 }
26591
26592 /// LowerXConstraint - try to replace an X constraint, which matches anything,
26593 /// with another that has more specific requirements based on the type of the
26594 /// corresponding operand.
26595 const char *X86TargetLowering::
26596 LowerXConstraint(EVT ConstraintVT) const {
26597   // FP X constraints get lowered to SSE1/2 registers if available, otherwise
26598   // 'f' like normal targets.
26599   if (ConstraintVT.isFloatingPoint()) {
26600     if (Subtarget->hasSSE2())
26601       return "Y";
26602     if (Subtarget->hasSSE1())
26603       return "x";
26604   }
26605
26606   return TargetLowering::LowerXConstraint(ConstraintVT);
26607 }
26608
26609 /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
26610 /// vector.  If it is invalid, don't add anything to Ops.
26611 void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
26612                                                      std::string &Constraint,
26613                                                      std::vector<SDValue>&Ops,
26614                                                      SelectionDAG &DAG) const {
26615   SDValue Result;
26616
26617   // Only support length 1 constraints for now.
26618   if (Constraint.length() > 1) return;
26619
26620   char ConstraintLetter = Constraint[0];
26621   switch (ConstraintLetter) {
26622   default: break;
26623   case 'I':
26624     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26625       if (C->getZExtValue() <= 31) {
26626         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26627         break;
26628       }
26629     }
26630     return;
26631   case 'J':
26632     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26633       if (C->getZExtValue() <= 63) {
26634         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26635         break;
26636       }
26637     }
26638     return;
26639   case 'K':
26640     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26641       if (isInt<8>(C->getSExtValue())) {
26642         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26643         break;
26644       }
26645     }
26646     return;
26647   case 'L':
26648     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26649       if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
26650           (Subtarget->is64Bit() && C->getZExtValue() == 0xffffffff)) {
26651         Result = DAG.getTargetConstant(C->getSExtValue(), Op.getValueType());
26652         break;
26653       }
26654     }
26655     return;
26656   case 'M':
26657     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26658       if (C->getZExtValue() <= 3) {
26659         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26660         break;
26661       }
26662     }
26663     return;
26664   case 'N':
26665     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26666       if (C->getZExtValue() <= 255) {
26667         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26668         break;
26669       }
26670     }
26671     return;
26672   case 'O':
26673     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26674       if (C->getZExtValue() <= 127) {
26675         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26676         break;
26677       }
26678     }
26679     return;
26680   case 'e': {
26681     // 32-bit signed value
26682     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26683       if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
26684                                            C->getSExtValue())) {
26685         // Widen to 64 bits here to get it sign extended.
26686         Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64);
26687         break;
26688       }
26689     // FIXME gcc accepts some relocatable values here too, but only in certain
26690     // memory models; it's complicated.
26691     }
26692     return;
26693   }
26694   case 'Z': {
26695     // 32-bit unsigned value
26696     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26697       if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
26698                                            C->getZExtValue())) {
26699         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26700         break;
26701       }
26702     }
26703     // FIXME gcc accepts some relocatable values here too, but only in certain
26704     // memory models; it's complicated.
26705     return;
26706   }
26707   case 'i': {
26708     // Literal immediates are always ok.
26709     if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
26710       // Widen to 64 bits here to get it sign extended.
26711       Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64);
26712       break;
26713     }
26714
26715     // In any sort of PIC mode addresses need to be computed at runtime by
26716     // adding in a register or some sort of table lookup.  These can't
26717     // be used as immediates.
26718     if (Subtarget->isPICStyleGOT() || Subtarget->isPICStyleStubPIC())
26719       return;
26720
26721     // If we are in non-pic codegen mode, we allow the address of a global (with
26722     // an optional displacement) to be used with 'i'.
26723     GlobalAddressSDNode *GA = nullptr;
26724     int64_t Offset = 0;
26725
26726     // Match either (GA), (GA+C), (GA+C1+C2), etc.
26727     while (1) {
26728       if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
26729         Offset += GA->getOffset();
26730         break;
26731       } else if (Op.getOpcode() == ISD::ADD) {
26732         if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
26733           Offset += C->getZExtValue();
26734           Op = Op.getOperand(0);
26735           continue;
26736         }
26737       } else if (Op.getOpcode() == ISD::SUB) {
26738         if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
26739           Offset += -C->getZExtValue();
26740           Op = Op.getOperand(0);
26741           continue;
26742         }
26743       }
26744
26745       // Otherwise, this isn't something we can handle, reject it.
26746       return;
26747     }
26748
26749     const GlobalValue *GV = GA->getGlobal();
26750     // If we require an extra load to get this address, as in PIC mode, we
26751     // can't accept it.
26752     if (isGlobalStubReference(
26753             Subtarget->ClassifyGlobalReference(GV, DAG.getTarget())))
26754       return;
26755
26756     Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
26757                                         GA->getValueType(0), Offset);
26758     break;
26759   }
26760   }
26761
26762   if (Result.getNode()) {
26763     Ops.push_back(Result);
26764     return;
26765   }
26766   return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
26767 }
26768
26769 std::pair<unsigned, const TargetRegisterClass*>
26770 X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
26771                                                 MVT VT) const {
26772   // First, see if this is a constraint that directly corresponds to an LLVM
26773   // register class.
26774   if (Constraint.size() == 1) {
26775     // GCC Constraint Letters
26776     switch (Constraint[0]) {
26777     default: break;
26778       // TODO: Slight differences here in allocation order and leaving
26779       // RIP in the class. Do they matter any more here than they do
26780       // in the normal allocation?
26781     case 'q':   // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
26782       if (Subtarget->is64Bit()) {
26783         if (VT == MVT::i32 || VT == MVT::f32)
26784           return std::make_pair(0U, &X86::GR32RegClass);
26785         if (VT == MVT::i16)
26786           return std::make_pair(0U, &X86::GR16RegClass);
26787         if (VT == MVT::i8 || VT == MVT::i1)
26788           return std::make_pair(0U, &X86::GR8RegClass);
26789         if (VT == MVT::i64 || VT == MVT::f64)
26790           return std::make_pair(0U, &X86::GR64RegClass);
26791         break;
26792       }
26793       // 32-bit fallthrough
26794     case 'Q':   // Q_REGS
26795       if (VT == MVT::i32 || VT == MVT::f32)
26796         return std::make_pair(0U, &X86::GR32_ABCDRegClass);
26797       if (VT == MVT::i16)
26798         return std::make_pair(0U, &X86::GR16_ABCDRegClass);
26799       if (VT == MVT::i8 || VT == MVT::i1)
26800         return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
26801       if (VT == MVT::i64)
26802         return std::make_pair(0U, &X86::GR64_ABCDRegClass);
26803       break;
26804     case 'r':   // GENERAL_REGS
26805     case 'l':   // INDEX_REGS
26806       if (VT == MVT::i8 || VT == MVT::i1)
26807         return std::make_pair(0U, &X86::GR8RegClass);
26808       if (VT == MVT::i16)
26809         return std::make_pair(0U, &X86::GR16RegClass);
26810       if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget->is64Bit())
26811         return std::make_pair(0U, &X86::GR32RegClass);
26812       return std::make_pair(0U, &X86::GR64RegClass);
26813     case 'R':   // LEGACY_REGS
26814       if (VT == MVT::i8 || VT == MVT::i1)
26815         return std::make_pair(0U, &X86::GR8_NOREXRegClass);
26816       if (VT == MVT::i16)
26817         return std::make_pair(0U, &X86::GR16_NOREXRegClass);
26818       if (VT == MVT::i32 || !Subtarget->is64Bit())
26819         return std::make_pair(0U, &X86::GR32_NOREXRegClass);
26820       return std::make_pair(0U, &X86::GR64_NOREXRegClass);
26821     case 'f':  // FP Stack registers.
26822       // If SSE is enabled for this VT, use f80 to ensure the isel moves the
26823       // value to the correct fpstack register class.
26824       if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
26825         return std::make_pair(0U, &X86::RFP32RegClass);
26826       if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
26827         return std::make_pair(0U, &X86::RFP64RegClass);
26828       return std::make_pair(0U, &X86::RFP80RegClass);
26829     case 'y':   // MMX_REGS if MMX allowed.
26830       if (!Subtarget->hasMMX()) break;
26831       return std::make_pair(0U, &X86::VR64RegClass);
26832     case 'Y':   // SSE_REGS if SSE2 allowed
26833       if (!Subtarget->hasSSE2()) break;
26834       // FALL THROUGH.
26835     case 'x':   // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
26836       if (!Subtarget->hasSSE1()) break;
26837
26838       switch (VT.SimpleTy) {
26839       default: break;
26840       // Scalar SSE types.
26841       case MVT::f32:
26842       case MVT::i32:
26843         return std::make_pair(0U, &X86::FR32RegClass);
26844       case MVT::f64:
26845       case MVT::i64:
26846         return std::make_pair(0U, &X86::FR64RegClass);
26847       // Vector types.
26848       case MVT::v16i8:
26849       case MVT::v8i16:
26850       case MVT::v4i32:
26851       case MVT::v2i64:
26852       case MVT::v4f32:
26853       case MVT::v2f64:
26854         return std::make_pair(0U, &X86::VR128RegClass);
26855       // AVX types.
26856       case MVT::v32i8:
26857       case MVT::v16i16:
26858       case MVT::v8i32:
26859       case MVT::v4i64:
26860       case MVT::v8f32:
26861       case MVT::v4f64:
26862         return std::make_pair(0U, &X86::VR256RegClass);
26863       case MVT::v8f64:
26864       case MVT::v16f32:
26865       case MVT::v16i32:
26866       case MVT::v8i64:
26867         return std::make_pair(0U, &X86::VR512RegClass);
26868       }
26869       break;
26870     }
26871   }
26872
26873   // Use the default implementation in TargetLowering to convert the register
26874   // constraint into a member of a register class.
26875   std::pair<unsigned, const TargetRegisterClass*> Res;
26876   Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
26877
26878   // Not found as a standard register?
26879   if (!Res.second) {
26880     // Map st(0) -> st(7) -> ST0
26881     if (Constraint.size() == 7 && Constraint[0] == '{' &&
26882         tolower(Constraint[1]) == 's' &&
26883         tolower(Constraint[2]) == 't' &&
26884         Constraint[3] == '(' &&
26885         (Constraint[4] >= '0' && Constraint[4] <= '7') &&
26886         Constraint[5] == ')' &&
26887         Constraint[6] == '}') {
26888
26889       Res.first = X86::FP0+Constraint[4]-'0';
26890       Res.second = &X86::RFP80RegClass;
26891       return Res;
26892     }
26893
26894     // GCC allows "st(0)" to be called just plain "st".
26895     if (StringRef("{st}").equals_lower(Constraint)) {
26896       Res.first = X86::FP0;
26897       Res.second = &X86::RFP80RegClass;
26898       return Res;
26899     }
26900
26901     // flags -> EFLAGS
26902     if (StringRef("{flags}").equals_lower(Constraint)) {
26903       Res.first = X86::EFLAGS;
26904       Res.second = &X86::CCRRegClass;
26905       return Res;
26906     }
26907
26908     // 'A' means EAX + EDX.
26909     if (Constraint == "A") {
26910       Res.first = X86::EAX;
26911       Res.second = &X86::GR32_ADRegClass;
26912       return Res;
26913     }
26914     return Res;
26915   }
26916
26917   // Otherwise, check to see if this is a register class of the wrong value
26918   // type.  For example, we want to map "{ax},i32" -> {eax}, we don't want it to
26919   // turn into {ax},{dx}.
26920   if (Res.second->hasType(VT))
26921     return Res;   // Correct type already, nothing to do.
26922
26923   // All of the single-register GCC register classes map their values onto
26924   // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp".  If we
26925   // really want an 8-bit or 32-bit register, map to the appropriate register
26926   // class and return the appropriate register.
26927   if (Res.second == &X86::GR16RegClass) {
26928     if (VT == MVT::i8 || VT == MVT::i1) {
26929       unsigned DestReg = 0;
26930       switch (Res.first) {
26931       default: break;
26932       case X86::AX: DestReg = X86::AL; break;
26933       case X86::DX: DestReg = X86::DL; break;
26934       case X86::CX: DestReg = X86::CL; break;
26935       case X86::BX: DestReg = X86::BL; break;
26936       }
26937       if (DestReg) {
26938         Res.first = DestReg;
26939         Res.second = &X86::GR8RegClass;
26940       }
26941     } else if (VT == MVT::i32 || VT == MVT::f32) {
26942       unsigned DestReg = 0;
26943       switch (Res.first) {
26944       default: break;
26945       case X86::AX: DestReg = X86::EAX; break;
26946       case X86::DX: DestReg = X86::EDX; break;
26947       case X86::CX: DestReg = X86::ECX; break;
26948       case X86::BX: DestReg = X86::EBX; break;
26949       case X86::SI: DestReg = X86::ESI; break;
26950       case X86::DI: DestReg = X86::EDI; break;
26951       case X86::BP: DestReg = X86::EBP; break;
26952       case X86::SP: DestReg = X86::ESP; break;
26953       }
26954       if (DestReg) {
26955         Res.first = DestReg;
26956         Res.second = &X86::GR32RegClass;
26957       }
26958     } else if (VT == MVT::i64 || VT == MVT::f64) {
26959       unsigned DestReg = 0;
26960       switch (Res.first) {
26961       default: break;
26962       case X86::AX: DestReg = X86::RAX; break;
26963       case X86::DX: DestReg = X86::RDX; break;
26964       case X86::CX: DestReg = X86::RCX; break;
26965       case X86::BX: DestReg = X86::RBX; break;
26966       case X86::SI: DestReg = X86::RSI; break;
26967       case X86::DI: DestReg = X86::RDI; break;
26968       case X86::BP: DestReg = X86::RBP; break;
26969       case X86::SP: DestReg = X86::RSP; break;
26970       }
26971       if (DestReg) {
26972         Res.first = DestReg;
26973         Res.second = &X86::GR64RegClass;
26974       }
26975     }
26976   } else if (Res.second == &X86::FR32RegClass ||
26977              Res.second == &X86::FR64RegClass ||
26978              Res.second == &X86::VR128RegClass ||
26979              Res.second == &X86::VR256RegClass ||
26980              Res.second == &X86::FR32XRegClass ||
26981              Res.second == &X86::FR64XRegClass ||
26982              Res.second == &X86::VR128XRegClass ||
26983              Res.second == &X86::VR256XRegClass ||
26984              Res.second == &X86::VR512RegClass) {
26985     // Handle references to XMM physical registers that got mapped into the
26986     // wrong class.  This can happen with constraints like {xmm0} where the
26987     // target independent register mapper will just pick the first match it can
26988     // find, ignoring the required type.
26989
26990     if (VT == MVT::f32 || VT == MVT::i32)
26991       Res.second = &X86::FR32RegClass;
26992     else if (VT == MVT::f64 || VT == MVT::i64)
26993       Res.second = &X86::FR64RegClass;
26994     else if (X86::VR128RegClass.hasType(VT))
26995       Res.second = &X86::VR128RegClass;
26996     else if (X86::VR256RegClass.hasType(VT))
26997       Res.second = &X86::VR256RegClass;
26998     else if (X86::VR512RegClass.hasType(VT))
26999       Res.second = &X86::VR512RegClass;
27000   }
27001
27002   return Res;
27003 }
27004
27005 int X86TargetLowering::getScalingFactorCost(const AddrMode &AM,
27006                                             Type *Ty) const {
27007   // Scaling factors are not free at all.
27008   // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
27009   // will take 2 allocations in the out of order engine instead of 1
27010   // for plain addressing mode, i.e. inst (reg1).
27011   // E.g.,
27012   // vaddps (%rsi,%drx), %ymm0, %ymm1
27013   // Requires two allocations (one for the load, one for the computation)
27014   // whereas:
27015   // vaddps (%rsi), %ymm0, %ymm1
27016   // Requires just 1 allocation, i.e., freeing allocations for other operations
27017   // and having less micro operations to execute.
27018   //
27019   // For some X86 architectures, this is even worse because for instance for
27020   // stores, the complex addressing mode forces the instruction to use the
27021   // "load" ports instead of the dedicated "store" port.
27022   // E.g., on Haswell:
27023   // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
27024   // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
27025   if (isLegalAddressingMode(AM, Ty))
27026     // Scale represents reg2 * scale, thus account for 1
27027     // as soon as we use a second register.
27028     return AM.Scale != 0;
27029   return -1;
27030 }
27031
27032 bool X86TargetLowering::isTargetFTOL() const {
27033   return Subtarget->isTargetKnownWindowsMSVC() && !Subtarget->is64Bit();
27034 }