lib/Target/X86/X86ISelLowering.cpp

   1 //===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 // This file defines the interfaces that X86 uses to lower LLVM code into a
  11 // selection DAG.
  12 //
  13 //===----------------------------------------------------------------------===//
  14
  15 #include "X86ISelLowering.h"
  16 #include "Utils/X86ShuffleDecode.h"
  17 #include "X86CallingConv.h"
  18 #include "X86FrameLowering.h"
  19 #include "X86InstrBuilder.h"
  20 #include "X86MachineFunctionInfo.h"
  21 #include "X86TargetMachine.h"
  22 #include "X86TargetObjectFile.h"
  23 #include "llvm/ADT/SmallBitVector.h"
  24 #include "llvm/ADT/SmallSet.h"
  25 #include "llvm/ADT/Statistic.h"
  26 #include "llvm/ADT/StringExtras.h"
  27 #include "llvm/ADT/StringSwitch.h"
  28 #include "llvm/ADT/VariadicFunction.h"
  29 #include "llvm/CodeGen/IntrinsicLowering.h"
  30 #include "llvm/CodeGen/MachineFrameInfo.h"
  31 #include "llvm/CodeGen/MachineFunction.h"
  32 #include "llvm/CodeGen/MachineInstrBuilder.h"
  33 #include "llvm/CodeGen/MachineJumpTableInfo.h"
  34 #include "llvm/CodeGen/MachineModuleInfo.h"
  35 #include "llvm/CodeGen/MachineRegisterInfo.h"
  36 #include "llvm/IR/CallSite.h"
  37 #include "llvm/IR/CallingConv.h"
  38 #include "llvm/IR/Constants.h"
  39 #include "llvm/IR/DerivedTypes.h"
  40 #include "llvm/IR/Function.h"
  41 #include "llvm/IR/GlobalAlias.h"
  42 #include "llvm/IR/GlobalVariable.h"
  43 #include "llvm/IR/Instructions.h"
  44 #include "llvm/IR/Intrinsics.h"
  45 #include "llvm/MC/MCAsmInfo.h"
  46 #include "llvm/MC/MCContext.h"
  47 #include "llvm/MC/MCExpr.h"
  48 #include "llvm/MC/MCSymbol.h"
  49 #include "llvm/Support/CommandLine.h"
  50 #include "llvm/Support/Debug.h"
  51 #include "llvm/Support/ErrorHandling.h"
  52 #include "llvm/Support/MathExtras.h"
  53 #include "llvm/Target/TargetOptions.h"
  54 #include "X86IntrinsicsInfo.h"
  55 #include <bitset>
  56 #include <numeric>
  57 #include <cctype>
  58 using namespace llvm;
  59
// Debug category string for this file. Presumably picked up by the LLVM
// debug/statistic macros used below -- confirm against llvm/Support/Debug.h
// and llvm/ADT/Statistic.h.
#define DEBUG_TYPE "x86-isel"

// Statistic counter described as "Number of tail calls". The code that
// increments it is not part of this section of the file.
STATISTIC(NumTailCalls, "Number of tail calls");
  63
  64 static cl::opt<bool> ExperimentalVectorWideningLegalization(
  65     "x86-experimental-vector-widening-legalization", cl::init(false),
  66     cl::desc("Enable an experimental vector type legalization through widening "
  67              "rather than promotion."),
  68     cl::Hidden);
  69
  70 static cl::opt<bool> ExperimentalVectorShuffleLowering(
  71     "x86-experimental-vector-shuffle-lowering", cl::init(true),
  72     cl::desc("Enable an experimental vector shuffle lowering code path."),
  73     cl::Hidden);
  74
  75 static cl::opt<bool> ExperimentalVectorShuffleLegality(
  76     "x86-experimental-vector-shuffle-legality", cl::init(false),
  77     cl::desc("Enable experimental shuffle legality based on the experimental "
  78              "shuffle lowering. Should only be used with the experimental "
  79              "shuffle lowering."),
  80     cl::Hidden);
  81
  82 static cl::opt<int> ReciprocalEstimateRefinementSteps(
  83     "x86-recip-refinement-steps", cl::init(1),
  84     cl::desc("Specify the number of Newton-Raphson iterations applied to the "
  85              "result of the hardware reciprocal estimate instruction."),
  86     cl::NotHidden);
  87
  88 // Forward declarations.
  89 static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
  90                        SDValue V2);
  91
  92 static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal,
  93                                 SelectionDAG &DAG, SDLoc dl,
  94                                 unsigned vectorWidth) {
  95   assert((vectorWidth == 128 || vectorWidth == 256) &&
  96          "Unsupported vector width");
  97   EVT VT = Vec.getValueType();
  98   EVT ElVT = VT.getVectorElementType();
  99   unsigned Factor = VT.getSizeInBits()/vectorWidth;
 100   EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
 101                                   VT.getVectorNumElements()/Factor);
 102
 103   // Extract from UNDEF is UNDEF.
 104   if (Vec.getOpcode() == ISD::UNDEF)
 105     return DAG.getUNDEF(ResultVT);
 106
 107   // Extract the relevant vectorWidth bits.  Generate an EXTRACT_SUBVECTOR
 108   unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
 109
 110   // This is the index of the first element of the vectorWidth-bit chunk
 111   // we want.
 112   unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / vectorWidth)
 113                                * ElemsPerChunk);
 114
 115   // If the input is a buildvector just emit a smaller one.
 116   if (Vec.getOpcode() == ISD::BUILD_VECTOR)
 117     return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
 118                        makeArrayRef(Vec->op_begin() + NormalizedIdxVal,
 119                                     ElemsPerChunk));
 120
 121   SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
 122   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
 123 }
 124
 125 /// Generate a DAG to grab 128-bits from a vector > 128 bits.  This
 126 /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
 127 /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
 128 /// instructions or a simple subregister reference. Idx is an index in the
 129 /// 128 bits we want.  It need not be aligned to a 128-bit boundary.  That makes
 130 /// lowering EXTRACT_VECTOR_ELT operations easier.
 131 static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
 132                                    SelectionDAG &DAG, SDLoc dl) {
 133   assert((Vec.getValueType().is256BitVector() ||
 134           Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
 135   return ExtractSubVector(Vec, IdxVal, DAG, dl, 128);
 136 }
 137
 138 /// Generate a DAG to grab 256-bits from a 512-bit vector.
 139 static SDValue Extract256BitVector(SDValue Vec, unsigned IdxVal,
 140                                    SelectionDAG &DAG, SDLoc dl) {
 141   assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
 142   return ExtractSubVector(Vec, IdxVal, DAG, dl, 256);
 143 }
 144
 145 static SDValue InsertSubVector(SDValue Result, SDValue Vec,
 146                                unsigned IdxVal, SelectionDAG &DAG,
 147                                SDLoc dl, unsigned vectorWidth) {
 148   assert((vectorWidth == 128 || vectorWidth == 256) &&
 149          "Unsupported vector width");
 150   // Inserting UNDEF is Result
 151   if (Vec.getOpcode() == ISD::UNDEF)
 152     return Result;
 153   EVT VT = Vec.getValueType();
 154   EVT ElVT = VT.getVectorElementType();
 155   EVT ResultVT = Result.getValueType();
 156
 157   // Insert the relevant vectorWidth bits.
 158   unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
 159
 160   // This is the index of the first element of the vectorWidth-bit chunk
 161   // we want.
 162   unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/vectorWidth)
 163                                * ElemsPerChunk);
 164
 165   SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
 166   return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
 167 }
 168
 169 /// Generate a DAG to put 128-bits into a vector > 128 bits.  This
 170 /// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
 171 /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
 172 /// simple superregister reference.  Idx is an index in the 128 bits
 173 /// we want.  It need not be aligned to a 128-bit boundary.  That makes
 174 /// lowering INSERT_VECTOR_ELT operations easier.
 175 static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
 176                                   SelectionDAG &DAG,SDLoc dl) {
 177   assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
 178   return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
 179 }
 180
 181 static SDValue Insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
 182                                   SelectionDAG &DAG, SDLoc dl) {
 183   assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
 184   return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
 185 }
 186
 187 /// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128
 188 /// instructions. This is used because creating CONCAT_VECTOR nodes of
 189 /// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
 190 /// large BUILD_VECTORS.
 191 static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
 192                                    unsigned NumElems, SelectionDAG &DAG,
 193                                    SDLoc dl) {
 194   SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
 195   return Insert128BitVector(V, V2, NumElems/2, DAG, dl);
 196 }
 197
 198 static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
 199                                    unsigned NumElems, SelectionDAG &DAG,
 200                                    SDLoc dl) {
 201   SDValue V = Insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
 202   return Insert256BitVector(V, V2, NumElems/2, DAG, dl);
 203 }
 204
 205 X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
 206                                      const X86Subtarget &STI)
 207     : TargetLowering(TM), Subtarget(&STI) {
 208   X86ScalarSSEf64 = Subtarget->hasSSE2();
 209   X86ScalarSSEf32 = Subtarget->hasSSE1();
 210   TD = getDataLayout();
 211
 212   // Set up the TargetLowering object.
 213   static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };
 214
 215   // X86 is weird. It always uses i8 for shift amounts and setcc results.
 216   setBooleanContents(ZeroOrOneBooleanContent);
 217   // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
 218   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
 219
 220   // For 64-bit, since we have so many registers, use the ILP scheduler.
 221   // For 32-bit, use the register pressure specific scheduling.
 222   // For Atom, always use ILP scheduling.
 223   if (Subtarget->isAtom())
 224     setSchedulingPreference(Sched::ILP);
 225   else if (Subtarget->is64Bit())
 226     setSchedulingPreference(Sched::ILP);
 227   else
 228     setSchedulingPreference(Sched::RegPressure);
 229   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
 230   setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
 231
 232   // Bypass expensive divides on Atom when compiling with O2.
 233   if (TM.getOptLevel() >= CodeGenOpt::Default) {
 234     if (Subtarget->hasSlowDivide32())
 235       addBypassSlowDiv(32, 8);
 236     if (Subtarget->hasSlowDivide64() && Subtarget->is64Bit())
 237       addBypassSlowDiv(64, 16);
 238   }
 239
 240   if (Subtarget->isTargetKnownWindowsMSVC()) {
 241     // Setup Windows compiler runtime calls.
 242     setLibcallName(RTLIB::SDIV_I64, "_alldiv");
 243     setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
 244     setLibcallName(RTLIB::SREM_I64, "_allrem");
 245     setLibcallName(RTLIB::UREM_I64, "_aullrem");
 246     setLibcallName(RTLIB::MUL_I64, "_allmul");
 247     setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
 248     setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
 249     setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
 250     setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
 251     setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
 252
 253     // The _ftol2 runtime function has an unusual calling conv, which
 254     // is modeled by a special pseudo-instruction.
 255     setLibcallName(RTLIB::FPTOUINT_F64_I64, nullptr);
 256     setLibcallName(RTLIB::FPTOUINT_F32_I64, nullptr);
 257     setLibcallName(RTLIB::FPTOUINT_F64_I32, nullptr);
 258     setLibcallName(RTLIB::FPTOUINT_F32_I32, nullptr);
 259   }
 260
 261   if (Subtarget->isTargetDarwin()) {
 262     // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
 263     setUseUnderscoreSetJmp(false);
 264     setUseUnderscoreLongJmp(false);
 265   } else if (Subtarget->isTargetWindowsGNU()) {
 266     // MS runtime is weird: it exports _setjmp, but longjmp!
 267     setUseUnderscoreSetJmp(true);
 268     setUseUnderscoreLongJmp(false);
 269   } else {
 270     setUseUnderscoreSetJmp(true);
 271     setUseUnderscoreLongJmp(true);
 272   }
 273
 274   // Set up the register classes.
 275   addRegisterClass(MVT::i8, &X86::GR8RegClass);
 276   addRegisterClass(MVT::i16, &X86::GR16RegClass);
 277   addRegisterClass(MVT::i32, &X86::GR32RegClass);
 278   if (Subtarget->is64Bit())
 279     addRegisterClass(MVT::i64, &X86::GR64RegClass);
 280
 281   for (MVT VT : MVT::integer_valuetypes())
 282     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
 283
 284   // We don't accept any truncstore of integer registers.
 285   setTruncStoreAction(MVT::i64, MVT::i32, Expand);
 286   setTruncStoreAction(MVT::i64, MVT::i16, Expand);
 287   setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
 288   setTruncStoreAction(MVT::i32, MVT::i16, Expand);
 289   setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
 290   setTruncStoreAction(MVT::i16, MVT::i8,  Expand);
 291
 292   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
 293
 294   // SETOEQ and SETUNE require checking two conditions.
 295   setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
 296   setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
 297   setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
 298   setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
 299   setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
 300   setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
 301
 302   // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
 303   // operation.
 304   setOperationAction(ISD::UINT_TO_FP       , MVT::i1   , Promote);
 305   setOperationAction(ISD::UINT_TO_FP       , MVT::i8   , Promote);
 306   setOperationAction(ISD::UINT_TO_FP       , MVT::i16  , Promote);
 307
 308   if (Subtarget->is64Bit()) {
 309     setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Promote);
 310     setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
 311   } else if (!TM.Options.UseSoftFloat) {
 312     // We have an algorithm for SSE2->double, and we turn this into a
 313     // 64-bit FILD followed by conditional FADD for other targets.
 314     setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
 315     // We have an algorithm for SSE2, and we turn this into a 64-bit
 316     // FILD for other targets.
 317     setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Custom);
 318   }
 319
 320   // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
 321   // this operation.
 322   setOperationAction(ISD::SINT_TO_FP       , MVT::i1   , Promote);
 323   setOperationAction(ISD::SINT_TO_FP       , MVT::i8   , Promote);
 324
 325   if (!TM.Options.UseSoftFloat) {
 326     // SSE has no i16 to fp conversion, only i32
 327     if (X86ScalarSSEf32) {
 328       setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
 329       // f32 and f64 cases are Legal, f80 case is not
 330       setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
 331     } else {
 332       setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Custom);
 333       setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
 334     }
 335   } else {
 336     setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
 337     setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Promote);
 338   }
 339
 340   // In 32-bit mode these are custom lowered.  In 64-bit mode F32 and F64
 341   // are Legal, f80 is custom lowered.
 342   setOperationAction(ISD::FP_TO_SINT     , MVT::i64  , Custom);
 343   setOperationAction(ISD::SINT_TO_FP     , MVT::i64  , Custom);
 344
 345   // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
 346   // this operation.
 347   setOperationAction(ISD::FP_TO_SINT       , MVT::i1   , Promote);
 348   setOperationAction(ISD::FP_TO_SINT       , MVT::i8   , Promote);
 349
 350   if (X86ScalarSSEf32) {
 351     setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
 352     // f32 and f64 cases are Legal, f80 case is not
 353     setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
 354   } else {
 355     setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Custom);
 356     setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
 357   }
 358
 359   // Handle FP_TO_UINT by promoting the destination to a larger signed
 360   // conversion.
 361   setOperationAction(ISD::FP_TO_UINT       , MVT::i1   , Promote);
 362   setOperationAction(ISD::FP_TO_UINT       , MVT::i8   , Promote);
 363   setOperationAction(ISD::FP_TO_UINT       , MVT::i16  , Promote);
 364
 365   if (Subtarget->is64Bit()) {
 366     setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Expand);
 367     setOperationAction(ISD::FP_TO_UINT     , MVT::i32  , Promote);
 368   } else if (!TM.Options.UseSoftFloat) {
 369     // Since AVX is a superset of SSE3, only check for SSE here.
 370     if (Subtarget->hasSSE1() && !Subtarget->hasSSE3())
 371       // Expand FP_TO_UINT into a select.
 372       // FIXME: We would like to use a Custom expander here eventually to do
 373       // the optimal thing for SSE vs. the default expansion in the legalizer.
 374       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Expand);
 375     else
 376       // With SSE3 we can use fisttpll to convert to a signed i64; without
 377       // SSE, we're stuck with a fistpll.
 378       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Custom);
 379   }
 380
 381   if (isTargetFTOL()) {
 382     // Use the _ftol2 runtime function, which has a pseudo-instruction
 383     // to handle its weird calling convention.
 384     setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Custom);
 385   }
 386
 387   // TODO: when we have SSE, these could be more efficient, by using movd/movq.
 388   if (!X86ScalarSSEf64) {
 389     setOperationAction(ISD::BITCAST        , MVT::f32  , Expand);
 390     setOperationAction(ISD::BITCAST        , MVT::i32  , Expand);
 391     if (Subtarget->is64Bit()) {
 392       setOperationAction(ISD::BITCAST      , MVT::f64  , Expand);
 393       // Without SSE, i64->f64 goes through memory.
 394       setOperationAction(ISD::BITCAST      , MVT::i64  , Expand);
 395     }
 396   }
 397
 398   // Scalar integer divide and remainder are lowered to use operations that
 399   // produce two results, to match the available instructions. This exposes
 400   // the two-result form to trivial CSE, which is able to combine x/y and x%y
 401   // into a single instruction.
 402   //
 403   // Scalar integer multiply-high is also lowered to use two-result
 404   // operations, to match the available instructions. However, plain multiply
 405   // (low) operations are left as Legal, as there are single-result
 406   // instructions for this in x86. Using the two-result multiply instructions
 407   // when both high and low results are needed must be arranged by dagcombine.
 408   for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
 409     MVT VT = IntVTs[i];
 410     setOperationAction(ISD::MULHS, VT, Expand);
 411     setOperationAction(ISD::MULHU, VT, Expand);
 412     setOperationAction(ISD::SDIV, VT, Expand);
 413     setOperationAction(ISD::UDIV, VT, Expand);
 414     setOperationAction(ISD::SREM, VT, Expand);
 415     setOperationAction(ISD::UREM, VT, Expand);
 416
 417     // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences.
 418     setOperationAction(ISD::ADDC, VT, Custom);
 419     setOperationAction(ISD::ADDE, VT, Custom);
 420     setOperationAction(ISD::SUBC, VT, Custom);
 421     setOperationAction(ISD::SUBE, VT, Custom);
 422   }
 423
 424   setOperationAction(ISD::BR_JT            , MVT::Other, Expand);
 425   setOperationAction(ISD::BRCOND           , MVT::Other, Custom);
 426   setOperationAction(ISD::BR_CC            , MVT::f32,   Expand);
 427   setOperationAction(ISD::BR_CC            , MVT::f64,   Expand);
 428   setOperationAction(ISD::BR_CC            , MVT::f80,   Expand);
 429   setOperationAction(ISD::BR_CC            , MVT::i8,    Expand);
 430   setOperationAction(ISD::BR_CC            , MVT::i16,   Expand);
 431   setOperationAction(ISD::BR_CC            , MVT::i32,   Expand);
 432   setOperationAction(ISD::BR_CC            , MVT::i64,   Expand);
 433   setOperationAction(ISD::SELECT_CC        , MVT::f32,   Expand);
 434   setOperationAction(ISD::SELECT_CC        , MVT::f64,   Expand);
 435   setOperationAction(ISD::SELECT_CC        , MVT::f80,   Expand);
 436   setOperationAction(ISD::SELECT_CC        , MVT::i8,    Expand);
 437   setOperationAction(ISD::SELECT_CC        , MVT::i16,   Expand);
 438   setOperationAction(ISD::SELECT_CC        , MVT::i32,   Expand);
 439   setOperationAction(ISD::SELECT_CC        , MVT::i64,   Expand);
 440   if (Subtarget->is64Bit())
 441     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
 442   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16  , Legal);
 443   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8   , Legal);
 444   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1   , Expand);
 445   setOperationAction(ISD::FP_ROUND_INREG   , MVT::f32  , Expand);
 446   setOperationAction(ISD::FREM             , MVT::f32  , Expand);
 447   setOperationAction(ISD::FREM             , MVT::f64  , Expand);
 448   setOperationAction(ISD::FREM             , MVT::f80  , Expand);
 449   setOperationAction(ISD::FLT_ROUNDS_      , MVT::i32  , Custom);
 450
 451   // Promote the i8 variants and force them on up to i32 which has a shorter
 452   // encoding.
 453   setOperationAction(ISD::CTTZ             , MVT::i8   , Promote);
 454   AddPromotedToType (ISD::CTTZ             , MVT::i8   , MVT::i32);
 455   setOperationAction(ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , Promote);
 456   AddPromotedToType (ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , MVT::i32);
 457   if (Subtarget->hasBMI()) {
 458     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16  , Expand);
 459     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32  , Expand);
 460     if (Subtarget->is64Bit())
 461       setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
 462   } else {
 463     setOperationAction(ISD::CTTZ           , MVT::i16  , Custom);
 464     setOperationAction(ISD::CTTZ           , MVT::i32  , Custom);
 465     if (Subtarget->is64Bit())
 466       setOperationAction(ISD::CTTZ         , MVT::i64  , Custom);
 467   }
 468
 469   if (Subtarget->hasLZCNT()) {
 470     // When promoting the i8 variants, force them to i32 for a shorter
 471     // encoding.
 472     setOperationAction(ISD::CTLZ           , MVT::i8   , Promote);
 473     AddPromotedToType (ISD::CTLZ           , MVT::i8   , MVT::i32);
 474     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Promote);
 475     AddPromotedToType (ISD::CTLZ_ZERO_UNDEF, MVT::i8   , MVT::i32);
 476     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Expand);
 477     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Expand);
 478     if (Subtarget->is64Bit())
 479       setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
 480   } else {
 481     setOperationAction(ISD::CTLZ           , MVT::i8   , Custom);
 482     setOperationAction(ISD::CTLZ           , MVT::i16  , Custom);
 483     setOperationAction(ISD::CTLZ           , MVT::i32  , Custom);
 484     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Custom);
 485     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Custom);
 486     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Custom);
 487     if (Subtarget->is64Bit()) {
 488       setOperationAction(ISD::CTLZ         , MVT::i64  , Custom);
 489       setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
 490     }
 491   }
 492
 493   // Special handling for half-precision floating point conversions.
 494   // If we don't have F16C support, then lower half float conversions
 495   // into library calls.
 496   if (TM.Options.UseSoftFloat || !Subtarget->hasF16C()) {
 497     setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
 498     setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
 499   }
 500
 501   // There's never any support for operations beyond MVT::f32.
 502   setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
 503   setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
 504   setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
 505   setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);
 506
 507   setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
 508   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
 509   setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
 510   setTruncStoreAction(MVT::f32, MVT::f16, Expand);
 511   setTruncStoreAction(MVT::f64, MVT::f16, Expand);
 512   setTruncStoreAction(MVT::f80, MVT::f16, Expand);
 513
 514   if (Subtarget->hasPOPCNT()) {
 515     setOperationAction(ISD::CTPOP          , MVT::i8   , Promote);
 516   } else {
 517     setOperationAction(ISD::CTPOP          , MVT::i8   , Expand);
 518     setOperationAction(ISD::CTPOP          , MVT::i16  , Expand);
 519     setOperationAction(ISD::CTPOP          , MVT::i32  , Expand);
 520     if (Subtarget->is64Bit())
 521       setOperationAction(ISD::CTPOP        , MVT::i64  , Expand);
 522   }
 523
 524   setOperationAction(ISD::READCYCLECOUNTER , MVT::i64  , Custom);
 525
 526   if (!Subtarget->hasMOVBE())
 527     setOperationAction(ISD::BSWAP          , MVT::i16  , Expand);
 528
 529   // These should be promoted to a larger select which is supported.
 530   setOperationAction(ISD::SELECT          , MVT::i1   , Promote);
 531   // X86 wants to expand cmov itself.
 532   setOperationAction(ISD::SELECT          , MVT::i8   , Custom);
 533   setOperationAction(ISD::SELECT          , MVT::i16  , Custom);
 534   setOperationAction(ISD::SELECT          , MVT::i32  , Custom);
 535   setOperationAction(ISD::SELECT          , MVT::f32  , Custom);
 536   setOperationAction(ISD::SELECT          , MVT::f64  , Custom);
 537   setOperationAction(ISD::SELECT          , MVT::f80  , Custom);
 538   setOperationAction(ISD::SETCC           , MVT::i8   , Custom);
 539   setOperationAction(ISD::SETCC           , MVT::i16  , Custom);
 540   setOperationAction(ISD::SETCC           , MVT::i32  , Custom);
 541   setOperationAction(ISD::SETCC           , MVT::f32  , Custom);
 542   setOperationAction(ISD::SETCC           , MVT::f64  , Custom);
 543   setOperationAction(ISD::SETCC           , MVT::f80  , Custom);
 544   if (Subtarget->is64Bit()) {
 545     setOperationAction(ISD::SELECT        , MVT::i64  , Custom);
 546     setOperationAction(ISD::SETCC         , MVT::i64  , Custom);
 547   }
 548   setOperationAction(ISD::EH_RETURN       , MVT::Other, Custom);
 549   // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support
 550   // SjLj exception handling but a light-weight setjmp/longjmp replacement to
 551   // support continuation, user-level threading, and etc.. As a result, no
 552   // other SjLj exception interfaces are implemented and please don't build
 553   // your own exception handling based on them.
 554   // LLVM/Clang supports zero-cost DWARF exception handling.
 555   setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
 556   setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
 557
 558   // Darwin ABI issue.
 559   setOperationAction(ISD::ConstantPool    , MVT::i32  , Custom);
 560   setOperationAction(ISD::JumpTable       , MVT::i32  , Custom);
 561   setOperationAction(ISD::GlobalAddress   , MVT::i32  , Custom);
 562   setOperationAction(ISD::GlobalTLSAddress, MVT::i32  , Custom);
 563   if (Subtarget->is64Bit())
 564     setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
 565   setOperationAction(ISD::ExternalSymbol  , MVT::i32  , Custom);
 566   setOperationAction(ISD::BlockAddress    , MVT::i32  , Custom);
 567   if (Subtarget->is64Bit()) {
 568     setOperationAction(ISD::ConstantPool  , MVT::i64  , Custom);
 569     setOperationAction(ISD::JumpTable     , MVT::i64  , Custom);
 570     setOperationAction(ISD::GlobalAddress , MVT::i64  , Custom);
 571     setOperationAction(ISD::ExternalSymbol, MVT::i64  , Custom);
 572     setOperationAction(ISD::BlockAddress  , MVT::i64  , Custom);
 573   }
  574   // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
 575   setOperationAction(ISD::SHL_PARTS       , MVT::i32  , Custom);
 576   setOperationAction(ISD::SRA_PARTS       , MVT::i32  , Custom);
 577   setOperationAction(ISD::SRL_PARTS       , MVT::i32  , Custom);
 578   if (Subtarget->is64Bit()) {
 579     setOperationAction(ISD::SHL_PARTS     , MVT::i64  , Custom);
 580     setOperationAction(ISD::SRA_PARTS     , MVT::i64  , Custom);
 581     setOperationAction(ISD::SRL_PARTS     , MVT::i64  , Custom);
 582   }
 583
 584   if (Subtarget->hasSSE1())
 585     setOperationAction(ISD::PREFETCH      , MVT::Other, Legal);
 586
 587   setOperationAction(ISD::ATOMIC_FENCE  , MVT::Other, Custom);
 588
 589   // Expand certain atomics
 590   for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
 591     MVT VT = IntVTs[i];
 592     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
 593     setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
 594     setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
 595   }
 596
 597   if (Subtarget->hasCmpxchg16b()) {
 598     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
 599   }
 600
 601   // FIXME - use subtarget debug flags
 602   if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetELF() &&
 603       !Subtarget->isTargetCygMing() && !Subtarget->isTargetWin64()) {
 604     setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
 605   }
 606
 607   if (Subtarget->is64Bit()) {
 608     setExceptionPointerRegister(X86::RAX);
 609     setExceptionSelectorRegister(X86::RDX);
 610   } else {
 611     setExceptionPointerRegister(X86::EAX);
 612     setExceptionSelectorRegister(X86::EDX);
 613   }
 614   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
 615   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
 616
 617   setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
 618   setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
 619
 620   setOperationAction(ISD::TRAP, MVT::Other, Legal);
 621   setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
 622
 623   // VASTART needs to be custom lowered to use the VarArgsFrameIndex
 624   setOperationAction(ISD::VASTART           , MVT::Other, Custom);
 625   setOperationAction(ISD::VAEND             , MVT::Other, Expand);
 626   if (Subtarget->is64Bit() && !Subtarget->isTargetWin64()) {
 627     // TargetInfo::X86_64ABIBuiltinVaList
 628     setOperationAction(ISD::VAARG           , MVT::Other, Custom);
 629     setOperationAction(ISD::VACOPY          , MVT::Other, Custom);
 630   } else {
 631     // TargetInfo::CharPtrBuiltinVaList
 632     setOperationAction(ISD::VAARG           , MVT::Other, Expand);
 633     setOperationAction(ISD::VACOPY          , MVT::Other, Expand);
 634   }
 635
 636   setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
 637   setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
 638
 639   setOperationAction(ISD::DYNAMIC_STACKALLOC, getPointerTy(), Custom);
 640
 641   if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) {
 642     // f32 and f64 use SSE.
 643     // Set up the FP register classes.
 644     addRegisterClass(MVT::f32, &X86::FR32RegClass);
 645     addRegisterClass(MVT::f64, &X86::FR64RegClass);
 646
 647     // Use ANDPD to simulate FABS.
 648     setOperationAction(ISD::FABS , MVT::f64, Custom);
 649     setOperationAction(ISD::FABS , MVT::f32, Custom);
 650
 651     // Use XORP to simulate FNEG.
 652     setOperationAction(ISD::FNEG , MVT::f64, Custom);
 653     setOperationAction(ISD::FNEG , MVT::f32, Custom);
 654
 655     // Use ANDPD and ORPD to simulate FCOPYSIGN.
 656     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
 657     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
 658
 659     // Lower this to FGETSIGNx86 plus an AND.
 660     setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
 661     setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
 662
 663     // We don't support sin/cos/fmod
 664     setOperationAction(ISD::FSIN   , MVT::f64, Expand);
 665     setOperationAction(ISD::FCOS   , MVT::f64, Expand);
 666     setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
 667     setOperationAction(ISD::FSIN   , MVT::f32, Expand);
 668     setOperationAction(ISD::FCOS   , MVT::f32, Expand);
 669     setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
 670
 671     // Expand FP immediates into loads from the stack, except for the special
 672     // cases we handle.
 673     addLegalFPImmediate(APFloat(+0.0)); // xorpd
 674     addLegalFPImmediate(APFloat(+0.0f)); // xorps
 675   } else if (!TM.Options.UseSoftFloat && X86ScalarSSEf32) {
 676     // Use SSE for f32, x87 for f64.
 677     // Set up the FP register classes.
 678     addRegisterClass(MVT::f32, &X86::FR32RegClass);
 679     addRegisterClass(MVT::f64, &X86::RFP64RegClass);
 680
 681     // Use ANDPS to simulate FABS.
 682     setOperationAction(ISD::FABS , MVT::f32, Custom);
 683
 684     // Use XORP to simulate FNEG.
 685     setOperationAction(ISD::FNEG , MVT::f32, Custom);
 686
 687     setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
 688
 689     // Use ANDPS and ORPS to simulate FCOPYSIGN.
 690     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
 691     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
 692
 693     // We don't support sin/cos/fmod
 694     setOperationAction(ISD::FSIN   , MVT::f32, Expand);
 695     setOperationAction(ISD::FCOS   , MVT::f32, Expand);
 696     setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
 697
 698     // Special cases we handle for FP constants.
 699     addLegalFPImmediate(APFloat(+0.0f)); // xorps
 700     addLegalFPImmediate(APFloat(+0.0)); // FLD0
 701     addLegalFPImmediate(APFloat(+1.0)); // FLD1
 702     addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
 703     addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
 704
 705     if (!TM.Options.UnsafeFPMath) {
 706       setOperationAction(ISD::FSIN   , MVT::f64, Expand);
 707       setOperationAction(ISD::FCOS   , MVT::f64, Expand);
 708       setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
 709     }
 710   } else if (!TM.Options.UseSoftFloat) {
 711     // f32 and f64 in x87.
 712     // Set up the FP register classes.
 713     addRegisterClass(MVT::f64, &X86::RFP64RegClass);
 714     addRegisterClass(MVT::f32, &X86::RFP32RegClass);
 715
 716     setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
 717     setOperationAction(ISD::UNDEF,     MVT::f32, Expand);
 718     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
 719     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
 720
 721     if (!TM.Options.UnsafeFPMath) {
 722       setOperationAction(ISD::FSIN   , MVT::f64, Expand);
 723       setOperationAction(ISD::FSIN   , MVT::f32, Expand);
 724       setOperationAction(ISD::FCOS   , MVT::f64, Expand);
 725       setOperationAction(ISD::FCOS   , MVT::f32, Expand);
 726       setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
 727       setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
 728     }
 729     addLegalFPImmediate(APFloat(+0.0)); // FLD0
 730     addLegalFPImmediate(APFloat(+1.0)); // FLD1
 731     addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
 732     addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
 733     addLegalFPImmediate(APFloat(+0.0f)); // FLD0
 734     addLegalFPImmediate(APFloat(+1.0f)); // FLD1
 735     addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
 736     addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
 737   }
 738
 739   // We don't support FMA.
 740   setOperationAction(ISD::FMA, MVT::f64, Expand);
 741   setOperationAction(ISD::FMA, MVT::f32, Expand);
 742
 743   // Long double always uses X87.
 744   if (!TM.Options.UseSoftFloat) {
 745     addRegisterClass(MVT::f80, &X86::RFP80RegClass);
 746     setOperationAction(ISD::UNDEF,     MVT::f80, Expand);
 747     setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
 748     {
 749       APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended);
 750       addLegalFPImmediate(TmpFlt);  // FLD0
 751       TmpFlt.changeSign();
 752       addLegalFPImmediate(TmpFlt);  // FLD0/FCHS
 753
 754       bool ignored;
 755       APFloat TmpFlt2(+1.0);
 756       TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
 757                       &ignored);
 758       addLegalFPImmediate(TmpFlt2);  // FLD1
 759       TmpFlt2.changeSign();
 760       addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
 761     }
 762
 763     if (!TM.Options.UnsafeFPMath) {
 764       setOperationAction(ISD::FSIN   , MVT::f80, Expand);
 765       setOperationAction(ISD::FCOS   , MVT::f80, Expand);
 766       setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
 767     }
 768
 769     setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
 770     setOperationAction(ISD::FCEIL,  MVT::f80, Expand);
 771     setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
 772     setOperationAction(ISD::FRINT,  MVT::f80, Expand);
 773     setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
 774     setOperationAction(ISD::FMA, MVT::f80, Expand);
 775   }
 776
 777   // Always use a library call for pow.
 778   setOperationAction(ISD::FPOW             , MVT::f32  , Expand);
 779   setOperationAction(ISD::FPOW             , MVT::f64  , Expand);
 780   setOperationAction(ISD::FPOW             , MVT::f80  , Expand);
 781
 782   setOperationAction(ISD::FLOG, MVT::f80, Expand);
 783   setOperationAction(ISD::FLOG2, MVT::f80, Expand);
 784   setOperationAction(ISD::FLOG10, MVT::f80, Expand);
 785   setOperationAction(ISD::FEXP, MVT::f80, Expand);
 786   setOperationAction(ISD::FEXP2, MVT::f80, Expand);
 787   setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
 788   setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
 789
 790   // First set operation action for all vector types to either promote
 791   // (for widening) or expand (for scalarization). Then we will selectively
 792   // turn on ones that can be effectively codegen'd.
 793   for (MVT VT : MVT::vector_valuetypes()) {
 794     setOperationAction(ISD::ADD , VT, Expand);
 795     setOperationAction(ISD::SUB , VT, Expand);
 796     setOperationAction(ISD::FADD, VT, Expand);
 797     setOperationAction(ISD::FNEG, VT, Expand);
 798     setOperationAction(ISD::FSUB, VT, Expand);
 799     setOperationAction(ISD::MUL , VT, Expand);
 800     setOperationAction(ISD::FMUL, VT, Expand);
 801     setOperationAction(ISD::SDIV, VT, Expand);
 802     setOperationAction(ISD::UDIV, VT, Expand);
 803     setOperationAction(ISD::FDIV, VT, Expand);
 804     setOperationAction(ISD::SREM, VT, Expand);
 805     setOperationAction(ISD::UREM, VT, Expand);
 806     setOperationAction(ISD::LOAD, VT, Expand);
 807     setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
 808     setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
 809     setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
 810     setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
 811     setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
 812     setOperationAction(ISD::FABS, VT, Expand);
 813     setOperationAction(ISD::FSIN, VT, Expand);
 814     setOperationAction(ISD::FSINCOS, VT, Expand);
 815     setOperationAction(ISD::FCOS, VT, Expand);
 816     setOperationAction(ISD::FSINCOS, VT, Expand);
 817     setOperationAction(ISD::FREM, VT, Expand);
 818     setOperationAction(ISD::FMA,  VT, Expand);
 819     setOperationAction(ISD::FPOWI, VT, Expand);
 820     setOperationAction(ISD::FSQRT, VT, Expand);
 821     setOperationAction(ISD::FCOPYSIGN, VT, Expand);
 822     setOperationAction(ISD::FFLOOR, VT, Expand);
 823     setOperationAction(ISD::FCEIL, VT, Expand);
 824     setOperationAction(ISD::FTRUNC, VT, Expand);
 825     setOperationAction(ISD::FRINT, VT, Expand);
 826     setOperationAction(ISD::FNEARBYINT, VT, Expand);
 827     setOperationAction(ISD::SMUL_LOHI, VT, Expand);
 828     setOperationAction(ISD::MULHS, VT, Expand);
 829     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
 830     setOperationAction(ISD::MULHU, VT, Expand);
 831     setOperationAction(ISD::SDIVREM, VT, Expand);
 832     setOperationAction(ISD::UDIVREM, VT, Expand);
 833     setOperationAction(ISD::FPOW, VT, Expand);
 834     setOperationAction(ISD::CTPOP, VT, Expand);
 835     setOperationAction(ISD::CTTZ, VT, Expand);
 836     setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
 837     setOperationAction(ISD::CTLZ, VT, Expand);
 838     setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
 839     setOperationAction(ISD::SHL, VT, Expand);
 840     setOperationAction(ISD::SRA, VT, Expand);
 841     setOperationAction(ISD::SRL, VT, Expand);
 842     setOperationAction(ISD::ROTL, VT, Expand);
 843     setOperationAction(ISD::ROTR, VT, Expand);
 844     setOperationAction(ISD::BSWAP, VT, Expand);
 845     setOperationAction(ISD::SETCC, VT, Expand);
 846     setOperationAction(ISD::FLOG, VT, Expand);
 847     setOperationAction(ISD::FLOG2, VT, Expand);
 848     setOperationAction(ISD::FLOG10, VT, Expand);
 849     setOperationAction(ISD::FEXP, VT, Expand);
 850     setOperationAction(ISD::FEXP2, VT, Expand);
 851     setOperationAction(ISD::FP_TO_UINT, VT, Expand);
 852     setOperationAction(ISD::FP_TO_SINT, VT, Expand);
 853     setOperationAction(ISD::UINT_TO_FP, VT, Expand);
 854     setOperationAction(ISD::SINT_TO_FP, VT, Expand);
 855     setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
 856     setOperationAction(ISD::TRUNCATE, VT, Expand);
 857     setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
 858     setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
 859     setOperationAction(ISD::ANY_EXTEND, VT, Expand);
 860     setOperationAction(ISD::VSELECT, VT, Expand);
 861     setOperationAction(ISD::SELECT_CC, VT, Expand);
 862     for (MVT InnerVT : MVT::vector_valuetypes()) {
 863       setTruncStoreAction(InnerVT, VT, Expand);
 864
 865       setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
 866       setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
 867
 868       // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
 869       // types, we have to deal with them whether we ask for Expansion or not.
 870       // Setting Expand causes its own optimisation problems though, so leave
 871       // them legal.
 872       if (VT.getVectorElementType() == MVT::i1)
 873         setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
 874     }
 875   }
 876
 877   // FIXME: In order to prevent SSE instructions being expanded to MMX ones
 878   // with -msoft-float, disable use of MMX as well.
 879   if (!TM.Options.UseSoftFloat && Subtarget->hasMMX()) {
 880     addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
 881     // No operations on x86mmx supported, everything uses intrinsics.
 882   }
 883
 884   // MMX-sized vectors (other than x86mmx) are expected to be expanded
 885   // into smaller operations.
 886   setOperationAction(ISD::MULHS,              MVT::v8i8,  Expand);
 887   setOperationAction(ISD::MULHS,              MVT::v4i16, Expand);
 888   setOperationAction(ISD::MULHS,              MVT::v2i32, Expand);
 889   setOperationAction(ISD::MULHS,              MVT::v1i64, Expand);
 890   setOperationAction(ISD::AND,                MVT::v8i8,  Expand);
 891   setOperationAction(ISD::AND,                MVT::v4i16, Expand);
 892   setOperationAction(ISD::AND,                MVT::v2i32, Expand);
 893   setOperationAction(ISD::AND,                MVT::v1i64, Expand);
 894   setOperationAction(ISD::OR,                 MVT::v8i8,  Expand);
 895   setOperationAction(ISD::OR,                 MVT::v4i16, Expand);
 896   setOperationAction(ISD::OR,                 MVT::v2i32, Expand);
 897   setOperationAction(ISD::OR,                 MVT::v1i64, Expand);
 898   setOperationAction(ISD::XOR,                MVT::v8i8,  Expand);
 899   setOperationAction(ISD::XOR,                MVT::v4i16, Expand);
 900   setOperationAction(ISD::XOR,                MVT::v2i32, Expand);
 901   setOperationAction(ISD::XOR,                MVT::v1i64, Expand);
 902   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i8,  Expand);
 903   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v4i16, Expand);
 904   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v2i32, Expand);
 905   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v1i64, Expand);
 906   setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v1i64, Expand);
 907   setOperationAction(ISD::SELECT,             MVT::v8i8,  Expand);
 908   setOperationAction(ISD::SELECT,             MVT::v4i16, Expand);
 909   setOperationAction(ISD::SELECT,             MVT::v2i32, Expand);
 910   setOperationAction(ISD::SELECT,             MVT::v1i64, Expand);
 911   setOperationAction(ISD::BITCAST,            MVT::v8i8,  Expand);
 912   setOperationAction(ISD::BITCAST,            MVT::v4i16, Expand);
 913   setOperationAction(ISD::BITCAST,            MVT::v2i32, Expand);
 914   setOperationAction(ISD::BITCAST,            MVT::v1i64, Expand);
 915
 916   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE1()) {
 917     addRegisterClass(MVT::v4f32, &X86::VR128RegClass);
 918
 919     setOperationAction(ISD::FADD,               MVT::v4f32, Legal);
 920     setOperationAction(ISD::FSUB,               MVT::v4f32, Legal);
 921     setOperationAction(ISD::FMUL,               MVT::v4f32, Legal);
 922     setOperationAction(ISD::FDIV,               MVT::v4f32, Legal);
 923     setOperationAction(ISD::FSQRT,              MVT::v4f32, Legal);
 924     setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
 925     setOperationAction(ISD::FABS,               MVT::v4f32, Custom);
 926     setOperationAction(ISD::LOAD,               MVT::v4f32, Legal);
 927     setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
 928     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
 929     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
 930     setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
 931     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Custom);
 932   }
 933
 934   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE2()) {
 935     addRegisterClass(MVT::v2f64, &X86::VR128RegClass);
 936
 937     // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
 938     // registers cannot be used even for integer operations.
 939     addRegisterClass(MVT::v16i8, &X86::VR128RegClass);
 940     addRegisterClass(MVT::v8i16, &X86::VR128RegClass);
 941     addRegisterClass(MVT::v4i32, &X86::VR128RegClass);
 942     addRegisterClass(MVT::v2i64, &X86::VR128RegClass);
 943
 944     setOperationAction(ISD::ADD,                MVT::v16i8, Legal);
 945     setOperationAction(ISD::ADD,                MVT::v8i16, Legal);
 946     setOperationAction(ISD::ADD,                MVT::v4i32, Legal);
 947     setOperationAction(ISD::ADD,                MVT::v2i64, Legal);
 948     setOperationAction(ISD::MUL,                MVT::v4i32, Custom);
 949     setOperationAction(ISD::MUL,                MVT::v2i64, Custom);
 950     setOperationAction(ISD::UMUL_LOHI,          MVT::v4i32, Custom);
 951     setOperationAction(ISD::SMUL_LOHI,          MVT::v4i32, Custom);
 952     setOperationAction(ISD::MULHU,              MVT::v8i16, Legal);
 953     setOperationAction(ISD::MULHS,              MVT::v8i16, Legal);
 954     setOperationAction(ISD::SUB,                MVT::v16i8, Legal);
 955     setOperationAction(ISD::SUB,                MVT::v8i16, Legal);
 956     setOperationAction(ISD::SUB,                MVT::v4i32, Legal);
 957     setOperationAction(ISD::SUB,                MVT::v2i64, Legal);
 958     setOperationAction(ISD::MUL,                MVT::v8i16, Legal);
 959     setOperationAction(ISD::FADD,               MVT::v2f64, Legal);
 960     setOperationAction(ISD::FSUB,               MVT::v2f64, Legal);
 961     setOperationAction(ISD::FMUL,               MVT::v2f64, Legal);
 962     setOperationAction(ISD::FDIV,               MVT::v2f64, Legal);
 963     setOperationAction(ISD::FSQRT,              MVT::v2f64, Legal);
 964     setOperationAction(ISD::FNEG,               MVT::v2f64, Custom);
 965     setOperationAction(ISD::FABS,               MVT::v2f64, Custom);
 966
 967     setOperationAction(ISD::SETCC,              MVT::v2i64, Custom);
 968     setOperationAction(ISD::SETCC,              MVT::v16i8, Custom);
 969     setOperationAction(ISD::SETCC,              MVT::v8i16, Custom);
 970     setOperationAction(ISD::SETCC,              MVT::v4i32, Custom);
 971
 972     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v16i8, Custom);
 973     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i16, Custom);
 974     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
 975     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
 976     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
 977
 978     // Only provide customized ctpop vector bit twiddling for vector types we
 979     // know to perform better than using the popcnt instructions on each vector
 980     // element. If popcnt isn't supported, always provide the custom version.
 981     if (!Subtarget->hasPOPCNT()) {
 982       setOperationAction(ISD::CTPOP,            MVT::v4i32, Custom);
 983       setOperationAction(ISD::CTPOP,            MVT::v2i64, Custom);
 984     }
 985
 986     // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
 987     for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
 988       MVT VT = (MVT::SimpleValueType)i;
 989       // Do not attempt to custom lower non-power-of-2 vectors
 990       if (!isPowerOf2_32(VT.getVectorNumElements()))
 991         continue;
 992       // Do not attempt to custom lower non-128-bit vectors
 993       if (!VT.is128BitVector())
 994         continue;
 995       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
 996       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
 997       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
 998     }
 999
1000     // We support custom legalizing of sext and anyext loads for specific
1001     // memory vector types which we can load as a scalar (or sequence of
1002     // scalars) and extend in-register to a legal 128-bit vector type. For sext
1003     // loads these must work with a single scalar load.
1004     for (MVT VT : MVT::integer_vector_valuetypes()) {
1005       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom);
1006       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom);
1007       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom);
1008       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom);
1009       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom);
1010       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom);
1011       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom);
1012       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom);
1013       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom);
1014     }
1015
1016     setOperationAction(ISD::BUILD_VECTOR,       MVT::v2f64, Custom);
1017     setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i64, Custom);
1018     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2f64, Custom);
1019     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i64, Custom);
1020     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2f64, Custom);
1021     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
1022
1023     if (Subtarget->is64Bit()) {
1024       setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
1025       setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
1026     }
1027
1028     // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
1029     for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
1030       MVT VT = (MVT::SimpleValueType)i;
1031
1032       // Do not attempt to promote non-128-bit vectors
1033       if (!VT.is128BitVector())
1034         continue;
1035
1036       setOperationAction(ISD::AND,    VT, Promote);
1037       AddPromotedToType (ISD::AND,    VT, MVT::v2i64);
1038       setOperationAction(ISD::OR,     VT, Promote);
1039       AddPromotedToType (ISD::OR,     VT, MVT::v2i64);
1040       setOperationAction(ISD::XOR,    VT, Promote);
1041       AddPromotedToType (ISD::XOR,    VT, MVT::v2i64);
1042       setOperationAction(ISD::LOAD,   VT, Promote);
1043       AddPromotedToType (ISD::LOAD,   VT, MVT::v2i64);
1044       setOperationAction(ISD::SELECT, VT, Promote);
1045       AddPromotedToType (ISD::SELECT, VT, MVT::v2i64);
1046     }
1047
1048     // Custom lower v2i64 and v2f64 selects.
1049     setOperationAction(ISD::LOAD,               MVT::v2f64, Legal);
1050     setOperationAction(ISD::LOAD,               MVT::v2i64, Legal);
1051     setOperationAction(ISD::SELECT,             MVT::v2f64, Custom);
1052     setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);
1053
1054     setOperationAction(ISD::FP_TO_SINT,         MVT::v4i32, Legal);
1055     setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Legal);
1056
1057     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i8,  Custom);
1058     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i16, Custom);
1059     // As there is no 64-bit GPR available, we need to build a special custom
1060     // sequence to convert from v2i32 to v2f32.
1061     if (!Subtarget->is64Bit())
1062       setOperationAction(ISD::UINT_TO_FP,       MVT::v2f32, Custom);
1063
1064     setOperationAction(ISD::FP_EXTEND,          MVT::v2f32, Custom);
1065     setOperationAction(ISD::FP_ROUND,           MVT::v2f32, Custom);
1066
1067     for (MVT VT : MVT::fp_vector_valuetypes())
1068       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);
1069
1070     setOperationAction(ISD::BITCAST,            MVT::v2i32, Custom);
1071     setOperationAction(ISD::BITCAST,            MVT::v4i16, Custom);
1072     setOperationAction(ISD::BITCAST,            MVT::v8i8,  Custom);
1073   }
1074
1075   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE41()) {
1076     setOperationAction(ISD::FFLOOR,             MVT::f32,   Legal);
1077     setOperationAction(ISD::FCEIL,              MVT::f32,   Legal);
1078     setOperationAction(ISD::FTRUNC,             MVT::f32,   Legal);
1079     setOperationAction(ISD::FRINT,              MVT::f32,   Legal);
1080     setOperationAction(ISD::FNEARBYINT,         MVT::f32,   Legal);
1081     setOperationAction(ISD::FFLOOR,             MVT::f64,   Legal);
1082     setOperationAction(ISD::FCEIL,              MVT::f64,   Legal);
1083     setOperationAction(ISD::FTRUNC,             MVT::f64,   Legal);
1084     setOperationAction(ISD::FRINT,              MVT::f64,   Legal);
1085     setOperationAction(ISD::FNEARBYINT,         MVT::f64,   Legal);
1086
1087     setOperationAction(ISD::FFLOOR,             MVT::v4f32, Legal);
1088     setOperationAction(ISD::FCEIL,              MVT::v4f32, Legal);
1089     setOperationAction(ISD::FTRUNC,             MVT::v4f32, Legal);
1090     setOperationAction(ISD::FRINT,              MVT::v4f32, Legal);
1091     setOperationAction(ISD::FNEARBYINT,         MVT::v4f32, Legal);
1092     setOperationAction(ISD::FFLOOR,             MVT::v2f64, Legal);
1093     setOperationAction(ISD::FCEIL,              MVT::v2f64, Legal);
1094     setOperationAction(ISD::FTRUNC,             MVT::v2f64, Legal);
1095     setOperationAction(ISD::FRINT,              MVT::v2f64, Legal);
1096     setOperationAction(ISD::FNEARBYINT,         MVT::v2f64, Legal);
1097
1098     // FIXME: Do we need to handle scalar-to-vector here?
1099     setOperationAction(ISD::MUL,                MVT::v4i32, Legal);
1100
1101     setOperationAction(ISD::VSELECT,            MVT::v2f64, Custom);
1102     setOperationAction(ISD::VSELECT,            MVT::v2i64, Custom);
1103     setOperationAction(ISD::VSELECT,            MVT::v4i32, Custom);
1104     setOperationAction(ISD::VSELECT,            MVT::v4f32, Custom);
1105     setOperationAction(ISD::VSELECT,            MVT::v8i16, Custom);
1106     // There is no BLENDI for byte vectors. We don't need to custom lower
1107     // some vselects for now.
1108     setOperationAction(ISD::VSELECT,            MVT::v16i8, Legal);
1109
1110     // SSE41 brings specific instructions for doing vector sign extend even in
1111     // cases where we don't have SRA.
1112     for (MVT VT : MVT::integer_vector_valuetypes()) {
1113       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
1114       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
1115       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
1116     }
1117
1118     // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
1119     setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, MVT::v8i8,  Legal);
1120     setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8,  Legal);
1121     setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i8,  Legal);
1122     setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
1123     setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
1124     setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
1125
1126     setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i16, MVT::v8i8,  Legal);
1127     setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8,  Legal);
1128     setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i8,  Legal);
1129     setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
1130     setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
1131     setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
1132
1133     // i8 and i16 vectors are custom because the source register and
1134     // source memory operand types are not the same width.  f32 vectors are
1135     // custom since the immediate controlling the insert encodes additional
1136     // information.
1137     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i8, Custom);
1138     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
1139     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
1140     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
1141
1142     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
1143     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
1144     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
1145     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
1146
1147     // FIXME: these should be Legal, but that's only for the case where
1148     // the index is constant.  For now custom expand to deal with that.
1149     if (Subtarget->is64Bit()) {
1150       setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
1151       setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
1152     }
1153   }
1154
1155   if (Subtarget->hasSSE2()) {
1156     setOperationAction(ISD::SRL,               MVT::v8i16, Custom);
1157     setOperationAction(ISD::SRL,               MVT::v16i8, Custom);
1158
1159     setOperationAction(ISD::SHL,               MVT::v8i16, Custom);
1160     setOperationAction(ISD::SHL,               MVT::v16i8, Custom);
1161
1162     setOperationAction(ISD::SRA,               MVT::v8i16, Custom);
1163     setOperationAction(ISD::SRA,               MVT::v16i8, Custom);
1164
1165     // In the customized shift lowering, the legal cases in AVX2 will be
1166     // recognized.
1167     setOperationAction(ISD::SRL,               MVT::v2i64, Custom);
1168     setOperationAction(ISD::SRL,               MVT::v4i32, Custom);
1169
1170     setOperationAction(ISD::SHL,               MVT::v2i64, Custom);
1171     setOperationAction(ISD::SHL,               MVT::v4i32, Custom);
1172
1173     setOperationAction(ISD::SRA,               MVT::v4i32, Custom);
1174   }
1175
1176   if (!TM.Options.UseSoftFloat && Subtarget->hasFp256()) {
1177     addRegisterClass(MVT::v32i8,  &X86::VR256RegClass);
1178     addRegisterClass(MVT::v16i16, &X86::VR256RegClass);
1179     addRegisterClass(MVT::v8i32,  &X86::VR256RegClass);
1180     addRegisterClass(MVT::v8f32,  &X86::VR256RegClass);
1181     addRegisterClass(MVT::v4i64,  &X86::VR256RegClass);
1182     addRegisterClass(MVT::v4f64,  &X86::VR256RegClass);
1183
1184     setOperationAction(ISD::LOAD,               MVT::v8f32, Legal);
1185     setOperationAction(ISD::LOAD,               MVT::v4f64, Legal);
1186     setOperationAction(ISD::LOAD,               MVT::v4i64, Legal);
1187
1188     setOperationAction(ISD::FADD,               MVT::v8f32, Legal);
1189     setOperationAction(ISD::FSUB,               MVT::v8f32, Legal);
1190     setOperationAction(ISD::FMUL,               MVT::v8f32, Legal);
1191     setOperationAction(ISD::FDIV,               MVT::v8f32, Legal);
1192     setOperationAction(ISD::FSQRT,              MVT::v8f32, Legal);
1193     setOperationAction(ISD::FFLOOR,             MVT::v8f32, Legal);
1194     setOperationAction(ISD::FCEIL,              MVT::v8f32, Legal);
1195     setOperationAction(ISD::FTRUNC,             MVT::v8f32, Legal);
1196     setOperationAction(ISD::FRINT,              MVT::v8f32, Legal);
1197     setOperationAction(ISD::FNEARBYINT,         MVT::v8f32, Legal);
1198     setOperationAction(ISD::FNEG,               MVT::v8f32, Custom);
1199     setOperationAction(ISD::FABS,               MVT::v8f32, Custom);
1200
1201     setOperationAction(ISD::FADD,               MVT::v4f64, Legal);
1202     setOperationAction(ISD::FSUB,               MVT::v4f64, Legal);
1203     setOperationAction(ISD::FMUL,               MVT::v4f64, Legal);
1204     setOperationAction(ISD::FDIV,               MVT::v4f64, Legal);
1205     setOperationAction(ISD::FSQRT,              MVT::v4f64, Legal);
1206     setOperationAction(ISD::FFLOOR,             MVT::v4f64, Legal);
1207     setOperationAction(ISD::FCEIL,              MVT::v4f64, Legal);
1208     setOperationAction(ISD::FTRUNC,             MVT::v4f64, Legal);
1209     setOperationAction(ISD::FRINT,              MVT::v4f64, Legal);
1210     setOperationAction(ISD::FNEARBYINT,         MVT::v4f64, Legal);
1211     setOperationAction(ISD::FNEG,               MVT::v4f64, Custom);
1212     setOperationAction(ISD::FABS,               MVT::v4f64, Custom);
1213
1214     // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1215     // even though v8i16 is a legal type.
1216     setOperationAction(ISD::FP_TO_SINT,         MVT::v8i16, Promote);
1217     setOperationAction(ISD::FP_TO_UINT,         MVT::v8i16, Promote);
1218     setOperationAction(ISD::FP_TO_SINT,         MVT::v8i32, Legal);
1219
1220     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i16, Promote);
1221     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i32, Legal);
1222     setOperationAction(ISD::FP_ROUND,           MVT::v4f32, Legal);
1223
1224     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i8,  Custom);
1225     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i16, Custom);
1226
1227     for (MVT VT : MVT::fp_vector_valuetypes())
1228       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);
1229
1230     setOperationAction(ISD::SRL,               MVT::v16i16, Custom);
1231     setOperationAction(ISD::SRL,               MVT::v32i8, Custom);
1232
1233     setOperationAction(ISD::SHL,               MVT::v16i16, Custom);
1234     setOperationAction(ISD::SHL,               MVT::v32i8, Custom);
1235
1236     setOperationAction(ISD::SRA,               MVT::v16i16, Custom);
1237     setOperationAction(ISD::SRA,               MVT::v32i8, Custom);
1238
1239     setOperationAction(ISD::SETCC,             MVT::v32i8, Custom);
1240     setOperationAction(ISD::SETCC,             MVT::v16i16, Custom);
1241     setOperationAction(ISD::SETCC,             MVT::v8i32, Custom);
1242     setOperationAction(ISD::SETCC,             MVT::v4i64, Custom);
1243
1244     setOperationAction(ISD::SELECT,            MVT::v4f64, Custom);
1245     setOperationAction(ISD::SELECT,            MVT::v4i64, Custom);
1246     setOperationAction(ISD::SELECT,            MVT::v8f32, Custom);
1247
1248     setOperationAction(ISD::VSELECT,           MVT::v4f64, Custom);
1249     setOperationAction(ISD::VSELECT,           MVT::v4i64, Custom);
1250     setOperationAction(ISD::VSELECT,           MVT::v8i32, Custom);
1251     setOperationAction(ISD::VSELECT,           MVT::v8f32, Custom);
1252
1253     setOperationAction(ISD::SIGN_EXTEND,       MVT::v4i64, Custom);
1254     setOperationAction(ISD::SIGN_EXTEND,       MVT::v8i32, Custom);
1255     setOperationAction(ISD::SIGN_EXTEND,       MVT::v16i16, Custom);
1256     setOperationAction(ISD::ZERO_EXTEND,       MVT::v4i64, Custom);
1257     setOperationAction(ISD::ZERO_EXTEND,       MVT::v8i32, Custom);
1258     setOperationAction(ISD::ZERO_EXTEND,       MVT::v16i16, Custom);
1259     setOperationAction(ISD::ANY_EXTEND,        MVT::v4i64, Custom);
1260     setOperationAction(ISD::ANY_EXTEND,        MVT::v8i32, Custom);
1261     setOperationAction(ISD::ANY_EXTEND,        MVT::v16i16, Custom);
1262     setOperationAction(ISD::TRUNCATE,          MVT::v16i8, Custom);
1263     setOperationAction(ISD::TRUNCATE,          MVT::v8i16, Custom);
1264     setOperationAction(ISD::TRUNCATE,          MVT::v4i32, Custom);
1265
1266     if (Subtarget->hasFMA() || Subtarget->hasFMA4()) {
1267       setOperationAction(ISD::FMA,             MVT::v8f32, Legal);
1268       setOperationAction(ISD::FMA,             MVT::v4f64, Legal);
1269       setOperationAction(ISD::FMA,             MVT::v4f32, Legal);
1270       setOperationAction(ISD::FMA,             MVT::v2f64, Legal);
1271       setOperationAction(ISD::FMA,             MVT::f32, Legal);
1272       setOperationAction(ISD::FMA,             MVT::f64, Legal);
1273     }
1274
1275     if (Subtarget->hasInt256()) {
1276       setOperationAction(ISD::ADD,             MVT::v4i64, Legal);
1277       setOperationAction(ISD::ADD,             MVT::v8i32, Legal);
1278       setOperationAction(ISD::ADD,             MVT::v16i16, Legal);
1279       setOperationAction(ISD::ADD,             MVT::v32i8, Legal);
1280
1281       setOperationAction(ISD::SUB,             MVT::v4i64, Legal);
1282       setOperationAction(ISD::SUB,             MVT::v8i32, Legal);
1283       setOperationAction(ISD::SUB,             MVT::v16i16, Legal);
1284       setOperationAction(ISD::SUB,             MVT::v32i8, Legal);
1285
1286       setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
1287       setOperationAction(ISD::MUL,             MVT::v8i32, Legal);
1288       setOperationAction(ISD::MUL,             MVT::v16i16, Legal);
1289       // Don't lower v32i8 because there is no 128-bit byte mul
1290
1291       setOperationAction(ISD::UMUL_LOHI,       MVT::v8i32, Custom);
1292       setOperationAction(ISD::SMUL_LOHI,       MVT::v8i32, Custom);
1293       setOperationAction(ISD::MULHU,           MVT::v16i16, Legal);
1294       setOperationAction(ISD::MULHS,           MVT::v16i16, Legal);
1295
1296       setOperationAction(ISD::VSELECT,         MVT::v16i16, Custom);
1297       setOperationAction(ISD::VSELECT,         MVT::v32i8, Legal);
1298
1299       // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1300       // when we have a 256bit-wide blend with immediate.
1301       setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
1302
1303       // Only provide customized ctpop vector bit twiddling for vector types we
1304       // know to perform better than using the popcnt instructions on each
1305       // vector element. If popcnt isn't supported, always provide the custom
1306       // version.
1307       if (!Subtarget->hasPOPCNT())
1308         setOperationAction(ISD::CTPOP,           MVT::v4i64, Custom);
1309
1310       // Custom CTPOP always performs better on natively supported v8i32
1311       setOperationAction(ISD::CTPOP,             MVT::v8i32, Custom);
1312
1313       // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1314       setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
1315       setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32,  MVT::v8i8,  Legal);
1316       setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64,  MVT::v4i8,  Legal);
1317       setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32,  MVT::v8i16, Legal);
1318       setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64,  MVT::v4i16, Legal);
1319       setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64,  MVT::v4i32, Legal);
1320
1321       setLoadExtAction(ISD::ZEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
1322       setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32,  MVT::v8i8,  Legal);
1323       setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64,  MVT::v4i8,  Legal);
1324       setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32,  MVT::v8i16, Legal);
1325       setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64,  MVT::v4i16, Legal);
1326       setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64,  MVT::v4i32, Legal);
1327     } else {
1328       setOperationAction(ISD::ADD,             MVT::v4i64, Custom);
1329       setOperationAction(ISD::ADD,             MVT::v8i32, Custom);
1330       setOperationAction(ISD::ADD,             MVT::v16i16, Custom);
1331       setOperationAction(ISD::ADD,             MVT::v32i8, Custom);
1332
1333       setOperationAction(ISD::SUB,             MVT::v4i64, Custom);
1334       setOperationAction(ISD::SUB,             MVT::v8i32, Custom);
1335       setOperationAction(ISD::SUB,             MVT::v16i16, Custom);
1336       setOperationAction(ISD::SUB,             MVT::v32i8, Custom);
1337
1338       setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
1339       setOperationAction(ISD::MUL,             MVT::v8i32, Custom);
1340       setOperationAction(ISD::MUL,             MVT::v16i16, Custom);
1341       // Don't lower v32i8 because there is no 128-bit byte mul
1342     }
1343
1344     // In the customized shift lowering, the legal cases in AVX2 will be
1345     // recognized.
1346     setOperationAction(ISD::SRL,               MVT::v4i64, Custom);
1347     setOperationAction(ISD::SRL,               MVT::v8i32, Custom);
1348
1349     setOperationAction(ISD::SHL,               MVT::v4i64, Custom);
1350     setOperationAction(ISD::SHL,               MVT::v8i32, Custom);
1351
1352     setOperationAction(ISD::SRA,               MVT::v8i32, Custom);
1353
1354     // Custom lower several nodes for 256-bit types.
1355     for (MVT VT : MVT::vector_valuetypes()) {
1356       if (VT.getScalarSizeInBits() >= 32) {
1357         setOperationAction(ISD::MLOAD,  VT, Legal);
1358         setOperationAction(ISD::MSTORE, VT, Legal);
1359       }
1360       // Extract subvector is special because the value type
1361       // (result) is 128-bit but the source is 256-bit wide.
1362       if (VT.is128BitVector()) {
1363         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1364       }
1365       // Do not attempt to custom lower other non-256-bit vectors
1366       if (!VT.is256BitVector())
1367         continue;
1368
1369       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
1370       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
1371       setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
1372       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1373       setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
1374       setOperationAction(ISD::INSERT_SUBVECTOR,   VT, Custom);
1375       setOperationAction(ISD::CONCAT_VECTORS,     VT, Custom);
1376     }
1377
1378     // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
1379     for (int i = MVT::v32i8; i != MVT::v4i64; ++i) {
1380       MVT VT = (MVT::SimpleValueType)i;
1381
1382       // Do not attempt to promote non-256-bit vectors
1383       if (!VT.is256BitVector())
1384         continue;
1385
1386       setOperationAction(ISD::AND,    VT, Promote);
1387       AddPromotedToType (ISD::AND,    VT, MVT::v4i64);
1388       setOperationAction(ISD::OR,     VT, Promote);
1389       AddPromotedToType (ISD::OR,     VT, MVT::v4i64);
1390       setOperationAction(ISD::XOR,    VT, Promote);
1391       AddPromotedToType (ISD::XOR,    VT, MVT::v4i64);
1392       setOperationAction(ISD::LOAD,   VT, Promote);
1393       AddPromotedToType (ISD::LOAD,   VT, MVT::v4i64);
1394       setOperationAction(ISD::SELECT, VT, Promote);
1395       AddPromotedToType (ISD::SELECT, VT, MVT::v4i64);
1396     }
1397   }
1398
1399   if (!TM.Options.UseSoftFloat && Subtarget->hasAVX512()) {
1400     addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1401     addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1402     addRegisterClass(MVT::v8i64,  &X86::VR512RegClass);
1403     addRegisterClass(MVT::v8f64,  &X86::VR512RegClass);
1404
1405     addRegisterClass(MVT::i1,     &X86::VK1RegClass);
1406     addRegisterClass(MVT::v8i1,   &X86::VK8RegClass);
1407     addRegisterClass(MVT::v16i1,  &X86::VK16RegClass);
1408
1409     for (MVT VT : MVT::fp_vector_valuetypes())
1410       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);
1411
1412     setOperationAction(ISD::BR_CC,              MVT::i1,    Expand);
1413     setOperationAction(ISD::SETCC,              MVT::i1,    Custom);
1414     setOperationAction(ISD::XOR,                MVT::i1,    Legal);
1415     setOperationAction(ISD::OR,                 MVT::i1,    Legal);
1416     setOperationAction(ISD::AND,                MVT::i1,    Legal);
1417     setOperationAction(ISD::LOAD,               MVT::v16f32, Legal);
1418     setOperationAction(ISD::LOAD,               MVT::v8f64, Legal);
1419     setOperationAction(ISD::LOAD,               MVT::v8i64, Legal);
1420     setOperationAction(ISD::LOAD,               MVT::v16i32, Legal);
1421     setOperationAction(ISD::LOAD,               MVT::v16i1, Legal);
1422
1423     setOperationAction(ISD::FADD,               MVT::v16f32, Legal);
1424     setOperationAction(ISD::FSUB,               MVT::v16f32, Legal);
1425     setOperationAction(ISD::FMUL,               MVT::v16f32, Legal);
1426     setOperationAction(ISD::FDIV,               MVT::v16f32, Legal);
1427     setOperationAction(ISD::FSQRT,              MVT::v16f32, Legal);
1428     setOperationAction(ISD::FNEG,               MVT::v16f32, Custom);
1429
1430     setOperationAction(ISD::FADD,               MVT::v8f64, Legal);
1431     setOperationAction(ISD::FSUB,               MVT::v8f64, Legal);
1432     setOperationAction(ISD::FMUL,               MVT::v8f64, Legal);
1433     setOperationAction(ISD::FDIV,               MVT::v8f64, Legal);
1434     setOperationAction(ISD::FSQRT,              MVT::v8f64, Legal);
1435     setOperationAction(ISD::FNEG,               MVT::v8f64, Custom);
1436     setOperationAction(ISD::FMA,                MVT::v8f64, Legal);
1437     setOperationAction(ISD::FMA,                MVT::v16f32, Legal);
1438
1439     setOperationAction(ISD::FP_TO_SINT,         MVT::i32, Legal);
1440     setOperationAction(ISD::FP_TO_UINT,         MVT::i32, Legal);
1441     setOperationAction(ISD::SINT_TO_FP,         MVT::i32, Legal);
1442     setOperationAction(ISD::UINT_TO_FP,         MVT::i32, Legal);
1443     if (Subtarget->is64Bit()) {
1444       setOperationAction(ISD::FP_TO_UINT,       MVT::i64, Legal);
1445       setOperationAction(ISD::FP_TO_SINT,       MVT::i64, Legal);
1446       setOperationAction(ISD::SINT_TO_FP,       MVT::i64, Legal);
1447       setOperationAction(ISD::UINT_TO_FP,       MVT::i64, Legal);
1448     }
1449     setOperationAction(ISD::FP_TO_SINT,         MVT::v16i32, Legal);
1450     setOperationAction(ISD::FP_TO_UINT,         MVT::v16i32, Legal);
1451     setOperationAction(ISD::FP_TO_UINT,         MVT::v8i32, Legal);
1452     setOperationAction(ISD::FP_TO_UINT,         MVT::v4i32, Legal);
1453     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i32, Legal);
1454     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i1,   Custom);
1455     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i1,  Custom);
1456     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i8,  Promote);
1457     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i16, Promote);
1458     setOperationAction(ISD::UINT_TO_FP,         MVT::v16i32, Legal);
1459     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i32, Legal);
1460     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Legal);
1461     setOperationAction(ISD::FP_ROUND,           MVT::v8f32, Legal);
1462     setOperationAction(ISD::FP_EXTEND,          MVT::v8f32, Legal);
1463
1464     setOperationAction(ISD::TRUNCATE,           MVT::i1, Custom);
1465     setOperationAction(ISD::TRUNCATE,           MVT::v16i8, Custom);
1466     setOperationAction(ISD::TRUNCATE,           MVT::v8i32, Custom);
1467     setOperationAction(ISD::TRUNCATE,           MVT::v8i1, Custom);
1468     setOperationAction(ISD::TRUNCATE,           MVT::v16i1, Custom);
1469     setOperationAction(ISD::TRUNCATE,           MVT::v16i16, Custom);
1470     setOperationAction(ISD::ZERO_EXTEND,        MVT::v16i32, Custom);
1471     setOperationAction(ISD::ZERO_EXTEND,        MVT::v8i64, Custom);
1472     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i32, Custom);
1473     setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i64, Custom);
1474     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i8, Custom);
1475     setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i16, Custom);
1476     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i16, Custom);
1477
1478     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8f64,  Custom);
1479     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i64,  Custom);
1480     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16f32,  Custom);
1481     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i32,  Custom);
1482     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i1,    Custom);
1483     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i1, Legal);
1484
1485     setOperationAction(ISD::SETCC,              MVT::v16i1, Custom);
1486     setOperationAction(ISD::SETCC,              MVT::v8i1, Custom);
1487
1488     setOperationAction(ISD::MUL,              MVT::v8i64, Custom);
1489
1490     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1,  Custom);
1491     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom);
1492     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i1, Custom);
1493     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i1, Custom);
1494     setOperationAction(ISD::BUILD_VECTOR,       MVT::v8i1, Custom);
1495     setOperationAction(ISD::BUILD_VECTOR,       MVT::v16i1, Custom);
1496     setOperationAction(ISD::SELECT,             MVT::v8f64, Custom);
1497     setOperationAction(ISD::SELECT,             MVT::v8i64, Custom);
1498     setOperationAction(ISD::SELECT,             MVT::v16f32, Custom);
1499
1500     setOperationAction(ISD::ADD,                MVT::v8i64, Legal);
1501     setOperationAction(ISD::ADD,                MVT::v16i32, Legal);
1502
1503     setOperationAction(ISD::SUB,                MVT::v8i64, Legal);
1504     setOperationAction(ISD::SUB,                MVT::v16i32, Legal);
1505
1506     setOperationAction(ISD::MUL,                MVT::v16i32, Legal);
1507
1508     setOperationAction(ISD::SRL,                MVT::v8i64, Custom);
1509     setOperationAction(ISD::SRL,                MVT::v16i32, Custom);
1510
1511     setOperationAction(ISD::SHL,                MVT::v8i64, Custom);
1512     setOperationAction(ISD::SHL,                MVT::v16i32, Custom);
1513
1514     setOperationAction(ISD::SRA,                MVT::v8i64, Custom);
1515     setOperationAction(ISD::SRA,                MVT::v16i32, Custom);
1516
1517     setOperationAction(ISD::AND,                MVT::v8i64, Legal);
1518     setOperationAction(ISD::OR,                 MVT::v8i64, Legal);
1519     setOperationAction(ISD::XOR,                MVT::v8i64, Legal);
1520     setOperationAction(ISD::AND,                MVT::v16i32, Legal);
1521     setOperationAction(ISD::OR,                 MVT::v16i32, Legal);
1522     setOperationAction(ISD::XOR,                MVT::v16i32, Legal);
1523
1524     if (Subtarget->hasCDI()) {
1525       setOperationAction(ISD::CTLZ,             MVT::v8i64, Legal);
1526       setOperationAction(ISD::CTLZ,             MVT::v16i32, Legal);
1527     }
1528
1529     // Custom lower several nodes.
1530     for (MVT VT : MVT::vector_valuetypes()) {
1531       unsigned EltSize = VT.getVectorElementType().getSizeInBits();
1532       // Extract subvector is special because the value type
1533       // (result) is 256/128-bit but the source is 512-bit wide.
1534       if (VT.is128BitVector() || VT.is256BitVector()) {
1535         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1536       }
1537       if (VT.getVectorElementType() == MVT::i1)
1538         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1539
1540       // Do not attempt to custom lower other non-512-bit vectors
1541       if (!VT.is512BitVector())
1542         continue;
1543
1544       if ( EltSize >= 32) {
1545         setOperationAction(ISD::VECTOR_SHUFFLE,      VT, Custom);
1546         setOperationAction(ISD::INSERT_VECTOR_ELT,   VT, Custom);
1547         setOperationAction(ISD::BUILD_VECTOR,        VT, Custom);
1548         setOperationAction(ISD::VSELECT,             VT, Legal);
1549         setOperationAction(ISD::EXTRACT_VECTOR_ELT,  VT, Custom);
1550         setOperationAction(ISD::SCALAR_TO_VECTOR,    VT, Custom);
1551         setOperationAction(ISD::INSERT_SUBVECTOR,    VT, Custom);
1552         setOperationAction(ISD::MLOAD,               VT, Legal);
1553         setOperationAction(ISD::MSTORE,              VT, Legal);
1554       }
1555     }
1556     for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
1557       MVT VT = (MVT::SimpleValueType)i;
1558
1559       // Do not attempt to promote non-512-bit vectors.
1560       if (!VT.is512BitVector())
1561         continue;
1562
1563       setOperationAction(ISD::SELECT, VT, Promote);
1564       AddPromotedToType (ISD::SELECT, VT, MVT::v8i64);
1565     }
1566   }// has  AVX-512
1567
1568   if (!TM.Options.UseSoftFloat && Subtarget->hasBWI()) {
1569     addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1570     addRegisterClass(MVT::v64i8,  &X86::VR512RegClass);
1571
1572     addRegisterClass(MVT::v32i1,  &X86::VK32RegClass);
1573     addRegisterClass(MVT::v64i1,  &X86::VK64RegClass);
1574
1575     setOperationAction(ISD::LOAD,               MVT::v32i16, Legal);
1576     setOperationAction(ISD::LOAD,               MVT::v64i8, Legal);
1577     setOperationAction(ISD::SETCC,              MVT::v32i1, Custom);
1578     setOperationAction(ISD::SETCC,              MVT::v64i1, Custom);
1579     setOperationAction(ISD::ADD,                MVT::v32i16, Legal);
1580     setOperationAction(ISD::ADD,                MVT::v64i8, Legal);
1581     setOperationAction(ISD::SUB,                MVT::v32i16, Legal);
1582     setOperationAction(ISD::SUB,                MVT::v64i8, Legal);
1583     setOperationAction(ISD::MUL,                MVT::v32i16, Legal);
1584
1585     for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
1586       const MVT VT = (MVT::SimpleValueType)i;
1587
1588       const unsigned EltSize = VT.getVectorElementType().getSizeInBits();
1589
1590       // Do not attempt to promote non-512-bit vectors.
1591       if (!VT.is512BitVector())
1592         continue;
1593
1594       if (EltSize < 32) {
1595         setOperationAction(ISD::BUILD_VECTOR,        VT, Custom);
1596         setOperationAction(ISD::VSELECT,             VT, Legal);
1597       }
1598     }
1599   }
1600
1601   if (!TM.Options.UseSoftFloat && Subtarget->hasVLX()) {
1602     addRegisterClass(MVT::v4i1,   &X86::VK4RegClass);
1603     addRegisterClass(MVT::v2i1,   &X86::VK2RegClass);
1604
1605     setOperationAction(ISD::SETCC,              MVT::v4i1, Custom);
1606     setOperationAction(ISD::SETCC,              MVT::v2i1, Custom);
1607     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v8i1, Legal);
1608
1609     setOperationAction(ISD::AND,                MVT::v8i32, Legal);
1610     setOperationAction(ISD::OR,                 MVT::v8i32, Legal);
1611     setOperationAction(ISD::XOR,                MVT::v8i32, Legal);
1612     setOperationAction(ISD::AND,                MVT::v4i32, Legal);
1613     setOperationAction(ISD::OR,                 MVT::v4i32, Legal);
1614     setOperationAction(ISD::XOR,                MVT::v4i32, Legal);
1615   }
1616
1617   // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion
1618   // of this type with custom code.
1619   for (MVT VT : MVT::vector_valuetypes())
1620     setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Custom);
1621
1622   // We want to custom lower some of our intrinsics.
1623   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
1624   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
1625   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
1626   if (!Subtarget->is64Bit())
1627     setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
1628
1629   // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
1630   // handle type legalization for these operations here.
1631   //
1632   // FIXME: We really should do custom legalization for addition and
1633   // subtraction on x86-32 once PR3203 is fixed.  We really can't do much better
1634   // than generic legalization for 64-bit multiplication-with-overflow, though.
1635   for (unsigned i = 0, e = 3+Subtarget->is64Bit(); i != e; ++i) {
1636     // Add/Sub/Mul with overflow operations are custom lowered.
1637     MVT VT = IntVTs[i];
1638     setOperationAction(ISD::SADDO, VT, Custom);
1639     setOperationAction(ISD::UADDO, VT, Custom);
1640     setOperationAction(ISD::SSUBO, VT, Custom);
1641     setOperationAction(ISD::USUBO, VT, Custom);
1642     setOperationAction(ISD::SMULO, VT, Custom);
1643     setOperationAction(ISD::UMULO, VT, Custom);
1644   }
1645
1646
1647   if (!Subtarget->is64Bit()) {
1648     // These libcalls are not available in 32-bit.
1649     setLibcallName(RTLIB::SHL_I128, nullptr);
1650     setLibcallName(RTLIB::SRL_I128, nullptr);
1651     setLibcallName(RTLIB::SRA_I128, nullptr);
1652   }
1653
1654   // Combine sin / cos into one node or libcall if possible.
1655   if (Subtarget->hasSinCos()) {
1656     setLibcallName(RTLIB::SINCOS_F32, "sincosf");
1657     setLibcallName(RTLIB::SINCOS_F64, "sincos");
1658     if (Subtarget->isTargetDarwin()) {
1659       // For MacOSX, we don't want the normal expansion of a libcall to sincos.
1660       // We want to issue a libcall to __sincos_stret to avoid memory traffic.
1661       setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
1662       setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
1663     }
1664   }
1665
1666   if (Subtarget->isTargetWin64()) {
1667     setOperationAction(ISD::SDIV, MVT::i128, Custom);
1668     setOperationAction(ISD::UDIV, MVT::i128, Custom);
1669     setOperationAction(ISD::SREM, MVT::i128, Custom);
1670     setOperationAction(ISD::UREM, MVT::i128, Custom);
1671     setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
1672     setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
1673   }
1674
1675   // We have target-specific dag combine patterns for the following nodes:
1676   setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
1677   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
1678   setTargetDAGCombine(ISD::BITCAST);
1679   setTargetDAGCombine(ISD::VSELECT);
1680   setTargetDAGCombine(ISD::SELECT);
1681   setTargetDAGCombine(ISD::SHL);
1682   setTargetDAGCombine(ISD::SRA);
1683   setTargetDAGCombine(ISD::SRL);
1684   setTargetDAGCombine(ISD::OR);
1685   setTargetDAGCombine(ISD::AND);
1686   setTargetDAGCombine(ISD::ADD);
1687   setTargetDAGCombine(ISD::FADD);
1688   setTargetDAGCombine(ISD::FSUB);
1689   setTargetDAGCombine(ISD::FMA);
1690   setTargetDAGCombine(ISD::SUB);
1691   setTargetDAGCombine(ISD::LOAD);
1692   setTargetDAGCombine(ISD::MLOAD);
1693   setTargetDAGCombine(ISD::STORE);
1694   setTargetDAGCombine(ISD::MSTORE);
1695   setTargetDAGCombine(ISD::ZERO_EXTEND);
1696   setTargetDAGCombine(ISD::ANY_EXTEND);
1697   setTargetDAGCombine(ISD::SIGN_EXTEND);
1698   setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
1699   setTargetDAGCombine(ISD::TRUNCATE);
1700   setTargetDAGCombine(ISD::SINT_TO_FP);
1701   setTargetDAGCombine(ISD::SETCC);
1702   setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
1703   setTargetDAGCombine(ISD::BUILD_VECTOR);
1704   setTargetDAGCombine(ISD::MUL);
1705   setTargetDAGCombine(ISD::XOR);
1706
1707   computeRegisterProperties();
1708
1709   // On Darwin, -Os means optimize for size without hurting performance,
1710   // do not reduce the limit.
1711   MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
1712   MaxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8;
1713   MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
1714   MaxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
1715   MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
1716   MaxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
1717   setPrefLoopAlignment(4); // 2^4 bytes.
1718
1719   // Predictable cmov don't hurt on atom because it's in-order.
1720   PredictableSelectIsExpensive = !Subtarget->isAtom();
1721   EnableExtLdPromotion = true;
1722   setPrefFunctionAlignment(4); // 2^4 bytes.
1723
1724   verifyIntrinsicTables();
1725 }
1726
1727 // This has so far only been implemented for 64-bit MachO.
1728 bool X86TargetLowering::useLoadStackGuardNode() const {
1729   return Subtarget->isTargetMachO() && Subtarget->is64Bit();
1730 }
1731
1732 TargetLoweringBase::LegalizeTypeAction
1733 X86TargetLowering::getPreferredVectorAction(EVT VT) const {
1734   if (ExperimentalVectorWideningLegalization &&
1735       VT.getVectorNumElements() != 1 &&
1736       VT.getVectorElementType().getSimpleVT() != MVT::i1)
1737     return TypeWidenVector;
1738
1739   return TargetLoweringBase::getPreferredVectorAction(VT);
1740 }
1741
1742 EVT X86TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
1743   if (!VT.isVector())
1744     return Subtarget->hasAVX512() ? MVT::i1: MVT::i8;
1745
1746   const unsigned NumElts = VT.getVectorNumElements();
1747   const EVT EltVT = VT.getVectorElementType();
1748   if (VT.is512BitVector()) {
1749     if (Subtarget->hasAVX512())
1750       if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
1751           EltVT == MVT::f32 || EltVT == MVT::f64)
1752         switch(NumElts) {
1753         case  8: return MVT::v8i1;
1754         case 16: return MVT::v16i1;
1755       }
1756     if (Subtarget->hasBWI())
1757       if (EltVT == MVT::i8 || EltVT == MVT::i16)
1758         switch(NumElts) {
1759         case 32: return MVT::v32i1;
1760         case 64: return MVT::v64i1;
1761       }
1762   }
1763
1764   if (VT.is256BitVector() || VT.is128BitVector()) {
1765     if (Subtarget->hasVLX())
1766       if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
1767           EltVT == MVT::f32 || EltVT == MVT::f64)
1768         switch(NumElts) {
1769         case 2: return MVT::v2i1;
1770         case 4: return MVT::v4i1;
1771         case 8: return MVT::v8i1;
1772       }
1773     if (Subtarget->hasBWI() && Subtarget->hasVLX())
1774       if (EltVT == MVT::i8 || EltVT == MVT::i16)
1775         switch(NumElts) {
1776         case  8: return MVT::v8i1;
1777         case 16: return MVT::v16i1;
1778         case 32: return MVT::v32i1;
1779       }
1780   }
1781
1782   return VT.changeVectorElementTypeToInteger();
1783 }
1784
1785 /// Helper for getByValTypeAlignment to determine
1786 /// the desired ByVal argument alignment.
1787 static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
1788   if (MaxAlign == 16)
1789     return;
1790   if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
1791     if (VTy->getBitWidth() == 128)
1792       MaxAlign = 16;
1793   } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
1794     unsigned EltAlign = 0;
1795     getMaxByValAlign(ATy->getElementType(), EltAlign);
1796     if (EltAlign > MaxAlign)
1797       MaxAlign = EltAlign;
1798   } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
1799     for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
1800       unsigned EltAlign = 0;
1801       getMaxByValAlign(STy->getElementType(i), EltAlign);
1802       if (EltAlign > MaxAlign)
1803         MaxAlign = EltAlign;
1804       if (MaxAlign == 16)
1805         break;
1806     }
1807   }
1808 }
1809
1810 /// Return the desired alignment for ByVal aggregate
1811 /// function arguments in the caller parameter area. For X86, aggregates
1812 /// that contain SSE vectors are placed at 16-byte boundaries while the rest
1813 /// are at 4-byte boundaries.
1814 unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const {
1815   if (Subtarget->is64Bit()) {
1816     // Max of 8 and alignment of type.
1817     unsigned TyAlign = TD->getABITypeAlignment(Ty);
1818     if (TyAlign > 8)
1819       return TyAlign;
1820     return 8;
1821   }
1822
1823   unsigned Align = 4;
1824   if (Subtarget->hasSSE1())
1825     getMaxByValAlign(Ty, Align);
1826   return Align;
1827 }
1828
1829 /// Returns the target specific optimal type for load
1830 /// and store operations as a result of memset, memcpy, and memmove
1831 /// lowering. If DstAlign is zero that means it's safe to destination
1832 /// alignment can satisfy any constraint. Similarly if SrcAlign is zero it
1833 /// means there isn't a need to check it against alignment requirement,
1834 /// probably because the source does not need to be loaded. If 'IsMemset' is
1835 /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
1836 /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
1837 /// source is constant so it does not need to be loaded.
1838 /// It returns EVT::Other if the type should be determined using generic
1839 /// target-independent logic.
1840 EVT
1841 X86TargetLowering::getOptimalMemOpType(uint64_t Size,
1842                                        unsigned DstAlign, unsigned SrcAlign,
1843                                        bool IsMemset, bool ZeroMemset,
1844                                        bool MemcpyStrSrc,
1845                                        MachineFunction &MF) const {
1846   const Function *F = MF.getFunction();
1847   if ((!IsMemset || ZeroMemset) &&
1848       !F->hasFnAttribute(Attribute::NoImplicitFloat)) {
1849     if (Size >= 16 &&
1850         (Subtarget->isUnalignedMemAccessFast() ||
1851          ((DstAlign == 0 || DstAlign >= 16) &&
1852           (SrcAlign == 0 || SrcAlign >= 16)))) {
1853       if (Size >= 32) {
1854         if (Subtarget->hasInt256())
1855           return MVT::v8i32;
1856         if (Subtarget->hasFp256())
1857           return MVT::v8f32;
1858       }
1859       if (Subtarget->hasSSE2())
1860         return MVT::v4i32;
1861       if (Subtarget->hasSSE1())
1862         return MVT::v4f32;
1863     } else if (!MemcpyStrSrc && Size >= 8 &&
1864                !Subtarget->is64Bit() &&
1865                Subtarget->hasSSE2()) {
1866       // Do not use f64 to lower memcpy if source is string constant. It's
1867       // better to use i32 to avoid the loads.
1868       return MVT::f64;
1869     }
1870   }
1871   if (Subtarget->is64Bit() && Size >= 8)
1872     return MVT::i64;
1873   return MVT::i32;
1874 }
1875
1876 bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
1877   if (VT == MVT::f32)
1878     return X86ScalarSSEf32;
1879   else if (VT == MVT::f64)
1880     return X86ScalarSSEf64;
1881   return true;
1882 }
1883
1884 bool
1885 X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
1886                                                   unsigned,
1887                                                   unsigned,
1888                                                   bool *Fast) const {
1889   if (Fast)
1890     *Fast = Subtarget->isUnalignedMemAccessFast();
1891   return true;
1892 }
1893
1894 /// Return the entry encoding for a jump table in the
1895 /// current function.  The returned value is a member of the
1896 /// MachineJumpTableInfo::JTEntryKind enum.
1897 unsigned X86TargetLowering::getJumpTableEncoding() const {
1898   // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
1899   // symbol.
1900   if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
1901       Subtarget->isPICStyleGOT())
1902     return MachineJumpTableInfo::EK_Custom32;
1903
1904   // Otherwise, use the normal jump table encoding heuristics.
1905   return TargetLowering::getJumpTableEncoding();
1906 }
1907
1908 const MCExpr *
1909 X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
1910                                              const MachineBasicBlock *MBB,
1911                                              unsigned uid,MCContext &Ctx) const{
1912   assert(MBB->getParent()->getTarget().getRelocationModel() == Reloc::PIC_ &&
1913          Subtarget->isPICStyleGOT());
1914   // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
1915   // entries.
1916   return MCSymbolRefExpr::Create(MBB->getSymbol(),
1917                                  MCSymbolRefExpr::VK_GOTOFF, Ctx);
1918 }
1919
1920 /// Returns relocation base for the given PIC jumptable.
1921 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
1922                                                     SelectionDAG &DAG) const {
1923   if (!Subtarget->is64Bit())
1924     // This doesn't have SDLoc associated with it, but is not really the
1925     // same as a Register.
1926     return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy());
1927   return Table;
1928 }
1929
1930 /// This returns the relocation base for the given PIC jumptable,
1931 /// the same as getPICJumpTableRelocBase, but as an MCExpr.
1932 const MCExpr *X86TargetLowering::
1933 getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
1934                              MCContext &Ctx) const {
1935   // X86-64 uses RIP relative addressing based on the jump table label.
1936   if (Subtarget->isPICStyleRIPRel())
1937     return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
1938
1939   // Otherwise, the reference is relative to the PIC base.
1940   return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx);
1941 }
1942
1943 // FIXME: Why this routine is here? Move to RegInfo!
1944 std::pair<const TargetRegisterClass*, uint8_t>
1945 X86TargetLowering::findRepresentativeClass(MVT VT) const{
1946   const TargetRegisterClass *RRC = nullptr;
1947   uint8_t Cost = 1;
1948   switch (VT.SimpleTy) {
1949   default:
1950     return TargetLowering::findRepresentativeClass(VT);
1951   case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
1952     RRC = Subtarget->is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
1953     break;
1954   case MVT::x86mmx:
1955     RRC = &X86::VR64RegClass;
1956     break;
1957   case MVT::f32: case MVT::f64:
1958   case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
1959   case MVT::v4f32: case MVT::v2f64:
1960   case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32:
1961   case MVT::v4f64:
1962     RRC = &X86::VR128RegClass;
1963     break;
1964   }
1965   return std::make_pair(RRC, Cost);
1966 }
1967
1968 bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
1969                                                unsigned &Offset) const {
1970   if (!Subtarget->isTargetLinux())
1971     return false;
1972
1973   if (Subtarget->is64Bit()) {
1974     // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs:
1975     Offset = 0x28;
1976     if (getTargetMachine().getCodeModel() == CodeModel::Kernel)
1977       AddressSpace = 256;
1978     else
1979       AddressSpace = 257;
1980   } else {
1981     // %gs:0x14 on i386
1982     Offset = 0x14;
1983     AddressSpace = 256;
1984   }
1985   return true;
1986 }
1987
1988 bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
1989                                             unsigned DestAS) const {
1990   assert(SrcAS != DestAS && "Expected different address spaces!");
1991
1992   return SrcAS < 256 && DestAS < 256;
1993 }
1994
1995 //===----------------------------------------------------------------------===//
1996 //               Return Value Calling Convention Implementation
1997 //===----------------------------------------------------------------------===//
1998
1999 #include "X86GenCallingConv.inc"
2000
2001 bool
2002 X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv,
2003                                   MachineFunction &MF, bool isVarArg,
2004                         const SmallVectorImpl<ISD::OutputArg> &Outs,
2005                         LLVMContext &Context) const {
2006   SmallVector<CCValAssign, 16> RVLocs;
2007   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
2008   return CCInfo.CheckReturn(Outs, RetCC_X86);
2009 }
2010
2011 const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
2012   static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
2013   return ScratchRegs;
2014 }
2015
2016 SDValue
2017 X86TargetLowering::LowerReturn(SDValue Chain,
2018                                CallingConv::ID CallConv, bool isVarArg,
2019                                const SmallVectorImpl<ISD::OutputArg> &Outs,
2020                                const SmallVectorImpl<SDValue> &OutVals,
2021                                SDLoc dl, SelectionDAG &DAG) const {
2022   MachineFunction &MF = DAG.getMachineFunction();
2023   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2024
2025   SmallVector<CCValAssign, 16> RVLocs;
2026   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
2027   CCInfo.AnalyzeReturn(Outs, RetCC_X86);
2028
2029   SDValue Flag;
2030   SmallVector<SDValue, 6> RetOps;
2031   RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2032   // Operand #1 = Bytes To Pop
2033   RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(),
2034                    MVT::i16));
2035
2036   // Copy the result values into the output registers.
2037   for (unsigned i = 0; i != RVLocs.size(); ++i) {
2038     CCValAssign &VA = RVLocs[i];
2039     assert(VA.isRegLoc() && "Can only return in registers!");
2040     SDValue ValToCopy = OutVals[i];
2041     EVT ValVT = ValToCopy.getValueType();
2042
2043     // Promote values to the appropriate types.
2044     if (VA.getLocInfo() == CCValAssign::SExt)
2045       ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
2046     else if (VA.getLocInfo() == CCValAssign::ZExt)
2047       ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
2048     else if (VA.getLocInfo() == CCValAssign::AExt)
2049       ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
2050     else if (VA.getLocInfo() == CCValAssign::BCvt)
2051       ValToCopy = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), ValToCopy);
2052
2053     assert(VA.getLocInfo() != CCValAssign::FPExt &&
2054            "Unexpected FP-extend for return value.");
2055
2056     // If this is x86-64, and we disabled SSE, we can't return FP values,
2057     // or SSE or MMX vectors.
2058     if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
2059          VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
2060           (Subtarget->is64Bit() && !Subtarget->hasSSE1())) {
2061       report_fatal_error("SSE register return with SSE disabled");
2062     }
2063     // Likewise we can't return F64 values with SSE1 only.  gcc does so, but
2064     // llvm-gcc has never done it right and no one has noticed, so this
2065     // should be OK for now.
2066     if (ValVT == MVT::f64 &&
2067         (Subtarget->is64Bit() && !Subtarget->hasSSE2()))
2068       report_fatal_error("SSE2 register return with SSE2 disabled");
2069
2070     // Returns in ST0/ST1 are handled specially: these are pushed as operands to
2071     // the RET instruction and handled by the FP Stackifier.
2072     if (VA.getLocReg() == X86::FP0 ||
2073         VA.getLocReg() == X86::FP1) {
2074       // If this is a copy from an xmm register to ST(0), use an FPExtend to
2075       // change the value to the FP stack register class.
2076       if (isScalarFPTypeInSSEReg(VA.getValVT()))
2077         ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
2078       RetOps.push_back(ValToCopy);
2079       // Don't emit a copytoreg.
2080       continue;
2081     }
2082
2083     // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
2084     // which is returned in RAX / RDX.
2085     if (Subtarget->is64Bit()) {
2086       if (ValVT == MVT::x86mmx) {
2087         if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
2088           ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ValToCopy);
2089           ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
2090                                   ValToCopy);
2091           // If we don't have SSE2 available, convert to v4f32 so the generated
2092           // register is legal.
2093           if (!Subtarget->hasSSE2())
2094             ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,ValToCopy);
2095         }
2096       }
2097     }
2098
2099     Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
2100     Flag = Chain.getValue(1);
2101     RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
2102   }
2103
2104   // The x86-64 ABIs require that for returning structs by value we copy
2105   // the sret argument into %rax/%eax (depending on ABI) for the return.
2106   // Win32 requires us to put the sret argument to %eax as well.
2107   // We saved the argument into a virtual register in the entry block,
2108   // so now we copy the value out and into %rax/%eax.
2109   //
2110   // Checking Function.hasStructRetAttr() here is insufficient because the IR
2111   // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
2112   // false, then an sret argument may be implicitly inserted in the SelDAG. In
2113   // either case FuncInfo->setSRetReturnReg() will have been called.
2114   if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
2115     assert((Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC()) &&
2116            "No need for an sret register");
2117     SDValue Val = DAG.getCopyFromReg(Chain, dl, SRetReg, getPointerTy());
2118
2119     unsigned RetValReg
2120         = (Subtarget->is64Bit() && !Subtarget->isTarget64BitILP32()) ?
2121           X86::RAX : X86::EAX;
2122     Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
2123     Flag = Chain.getValue(1);
2124
2125     // RAX/EAX now acts like a return value.
2126     RetOps.push_back(DAG.getRegister(RetValReg, getPointerTy()));
2127   }
2128
2129   RetOps[0] = Chain;  // Update chain.
2130
2131   // Add the flag if we have it.
2132   if (Flag.getNode())
2133     RetOps.push_back(Flag);
2134
2135   return DAG.getNode(X86ISD::RET_FLAG, dl, MVT::Other, RetOps);
2136 }
2137
2138 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
2139   if (N->getNumValues() != 1)
2140     return false;
2141   if (!N->hasNUsesOfValue(1, 0))
2142     return false;
2143
2144   SDValue TCChain = Chain;
2145   SDNode *Copy = *N->use_begin();
2146   if (Copy->getOpcode() == ISD::CopyToReg) {
2147     // If the copy has a glue operand, we conservatively assume it isn't safe to
2148     // perform a tail call.
2149     if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
2150       return false;
2151     TCChain = Copy->getOperand(0);
2152   } else if (Copy->getOpcode() != ISD::FP_EXTEND)
2153     return false;
2154
2155   bool HasRet = false;
2156   for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
2157        UI != UE; ++UI) {
2158     if (UI->getOpcode() != X86ISD::RET_FLAG)
2159       return false;
2160     // If we are returning more than one value, we can definitely
2161     // not make a tail call see PR19530
2162     if (UI->getNumOperands() > 4)
2163       return false;
2164     if (UI->getNumOperands() == 4 &&
2165         UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
2166       return false;
2167     HasRet = true;
2168   }
2169
2170   if (!HasRet)
2171     return false;
2172
2173   Chain = TCChain;
2174   return true;
2175 }
2176
2177 EVT
2178 X86TargetLowering::getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT,
2179                                             ISD::NodeType ExtendKind) const {
2180   MVT ReturnMVT;
2181   // TODO: Is this also valid on 32-bit?
2182   if (Subtarget->is64Bit() && VT == MVT::i1 && ExtendKind == ISD::ZERO_EXTEND)
2183     ReturnMVT = MVT::i8;
2184   else
2185     ReturnMVT = MVT::i32;
2186
2187   EVT MinVT = getRegisterType(Context, ReturnMVT);
2188   return VT.bitsLT(MinVT) ? MinVT : VT;
2189 }
2190
2191 /// Lower the result values of a call into the
2192 /// appropriate copies out of appropriate physical registers.
2193 ///
2194 SDValue
2195 X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
2196                                    CallingConv::ID CallConv, bool isVarArg,
2197                                    const SmallVectorImpl<ISD::InputArg> &Ins,
2198                                    SDLoc dl, SelectionDAG &DAG,
2199                                    SmallVectorImpl<SDValue> &InVals) const {
2200
2201   // Assign locations to each value returned by this call.
2202   SmallVector<CCValAssign, 16> RVLocs;
2203   bool Is64Bit = Subtarget->is64Bit();
2204   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2205                  *DAG.getContext());
2206   CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
2207
2208   // Copy all of the result registers out of their specified physreg.
2209   for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
2210     CCValAssign &VA = RVLocs[i];
2211     EVT CopyVT = VA.getValVT();
2212
2213     // If this is x86-64, and we disabled SSE, we can't return FP values
2214     if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
2215         ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
2216       report_fatal_error("SSE register return with SSE disabled");
2217     }
2218
2219     // If we prefer to use the value in xmm registers, copy it out as f80 and
2220     // use a truncate to move it from fp stack reg to xmm reg.
2221     if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
2222         isScalarFPTypeInSSEReg(VA.getValVT()))
2223       CopyVT = MVT::f80;
2224
2225     Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
2226                                CopyVT, InFlag).getValue(1);
2227     SDValue Val = Chain.getValue(0);
2228
2229     if (CopyVT != VA.getValVT())
2230       Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
2231                         // This truncation won't change the value.
2232                         DAG.getIntPtrConstant(1));
2233
2234     InFlag = Chain.getValue(2);
2235     InVals.push_back(Val);
2236   }
2237
2238   return Chain;
2239 }
2240
2241 //===----------------------------------------------------------------------===//
2242 //                C & StdCall & Fast Calling Convention implementation
2243 //===----------------------------------------------------------------------===//
//  The StdCall calling convention is the standard for many Windows API
//  routines. It differs from the C calling convention only slightly: the
//  callee, not the caller, cleans up the stack, and symbols are decorated
//  differently. It does not support vector arguments.
//  For information on the fast calling convention, see the Fast Calling
//  Convention (tail call) implementation, LowerX86_32FastCCCallTo.
2250
2251 /// CallIsStructReturn - Determines whether a call uses struct return
2252 /// semantics.
2253 enum StructReturnType {
2254   NotStructReturn,
2255   RegStructReturn,
2256   StackStructReturn
2257 };
2258 static StructReturnType
2259 callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
2260   if (Outs.empty())
2261     return NotStructReturn;
2262
2263   const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
2264   if (!Flags.isSRet())
2265     return NotStructReturn;
2266   if (Flags.isInReg())
2267     return RegStructReturn;
2268   return StackStructReturn;
2269 }
2270
2271 /// Determines whether a function uses struct return semantics.
2272 static StructReturnType
2273 argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
2274   if (Ins.empty())
2275     return NotStructReturn;
2276
2277   const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
2278   if (!Flags.isSRet())
2279     return NotStructReturn;
2280   if (Flags.isInReg())
2281     return RegStructReturn;
2282   return StackStructReturn;
2283 }
2284
2285 /// Make a copy of an aggregate at address specified by "Src" to address
2286 /// "Dst" with size and alignment information specified by the specific
2287 /// parameter attribute. The copy will be passed as a byval function parameter.
2288 static SDValue
2289 CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
2290                           ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
2291                           SDLoc dl) {
2292   SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);
2293
2294   return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
2295                        /*isVolatile*/false, /*AlwaysInline=*/true,
2296                        MachinePointerInfo(), MachinePointerInfo());
2297 }
2298
2299 /// Return true if the calling convention is one that
2300 /// supports tail call optimization.
2301 static bool IsTailCallConvention(CallingConv::ID CC) {
2302   return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
2303           CC == CallingConv::HiPE);
2304 }
2305
2306 /// \brief Return true if the calling convention is a C calling convention.
2307 static bool IsCCallConvention(CallingConv::ID CC) {
2308   return (CC == CallingConv::C || CC == CallingConv::X86_64_Win64 ||
2309           CC == CallingConv::X86_64_SysV);
2310 }
2311
2312 bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
2313   if (!CI->isTailCall() || getTargetMachine().Options.DisableTailCalls)
2314     return false;
2315
2316   CallSite CS(CI);
2317   CallingConv::ID CalleeCC = CS.getCallingConv();
2318   if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
2319     return false;
2320
2321   return true;
2322 }
2323
2324 /// Return true if the function is being made into
2325 /// a tailcall target by changing its ABI.
2326 static bool FuncIsMadeTailCallSafe(CallingConv::ID CC,
2327                                    bool GuaranteedTailCallOpt) {
2328   return GuaranteedTailCallOpt && IsTailCallConvention(CC);
2329 }
2330
2331 SDValue
2332 X86TargetLowering::LowerMemArgument(SDValue Chain,
2333                                     CallingConv::ID CallConv,
2334                                     const SmallVectorImpl<ISD::InputArg> &Ins,
2335                                     SDLoc dl, SelectionDAG &DAG,
2336                                     const CCValAssign &VA,
2337                                     MachineFrameInfo *MFI,
2338                                     unsigned i) const {
2339   // Create the nodes corresponding to a load from this parameter slot.
2340   ISD::ArgFlagsTy Flags = Ins[i].Flags;
2341   bool AlwaysUseMutable = FuncIsMadeTailCallSafe(
2342       CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
2343   bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
2344   EVT ValVT;
2345
2346   // If value is passed by pointer we have address passed instead of the value
2347   // itself.
2348   if (VA.getLocInfo() == CCValAssign::Indirect)
2349     ValVT = VA.getLocVT();
2350   else
2351     ValVT = VA.getValVT();
2352
2353   // FIXME: For now, all byval parameter objects are marked mutable. This can be
2354   // changed with more analysis.
2355   // In case of tail call optimization mark all arguments mutable. Since they
2356   // could be overwritten by lowering of arguments in case of a tail call.
2357   if (Flags.isByVal()) {
2358     unsigned Bytes = Flags.getByValSize();
2359     if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
2360     int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
2361     return DAG.getFrameIndex(FI, getPointerTy());
2362   } else {
2363     int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
2364                                     VA.getLocMemOffset(), isImmutable);
2365     SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
2366     return DAG.getLoad(ValVT, dl, Chain, FIN,
2367                        MachinePointerInfo::getFixedStack(FI),
2368                        false, false, false, 0);
2369   }
2370 }
2371
2372 // FIXME: Get this from tablegen.
2373 static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
2374                                                 const X86Subtarget *Subtarget) {
2375   assert(Subtarget->is64Bit());
2376
2377   if (Subtarget->isCallingConvWin64(CallConv)) {
2378     static const MCPhysReg GPR64ArgRegsWin64[] = {
2379       X86::RCX, X86::RDX, X86::R8,  X86::R9
2380     };
2381     return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
2382   }
2383
2384   static const MCPhysReg GPR64ArgRegs64Bit[] = {
2385     X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
2386   };
2387   return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
2388 }
2389
2390 // FIXME: Get this from tablegen.
2391 static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
2392                                                 CallingConv::ID CallConv,
2393                                                 const X86Subtarget *Subtarget) {
2394   assert(Subtarget->is64Bit());
2395   if (Subtarget->isCallingConvWin64(CallConv)) {
2396     // The XMM registers which might contain var arg parameters are shadowed
2397     // in their paired GPR.  So we only need to save the GPR to their home
2398     // slots.
2399     // TODO: __vectorcall will change this.
2400     return None;
2401   }
2402
2403   const Function *Fn = MF.getFunction();
2404   bool NoImplicitFloatOps = Fn->hasFnAttribute(Attribute::NoImplicitFloat);
2405   assert(!(MF.getTarget().Options.UseSoftFloat && NoImplicitFloatOps) &&
2406          "SSE register cannot be used when SSE is disabled!");
2407   if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps ||
2408       !Subtarget->hasSSE1())
2409     // Kernel mode asks for SSE to be disabled, so there are no XMM argument
2410     // registers.
2411     return None;
2412
2413   static const MCPhysReg XMMArgRegs64Bit[] = {
2414     X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2415     X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2416   };
2417   return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
2418 }
2419
2420 SDValue
2421 X86TargetLowering::LowerFormalArguments(SDValue Chain,
2422                                         CallingConv::ID CallConv,
2423                                         bool isVarArg,
2424                                       const SmallVectorImpl<ISD::InputArg> &Ins,
2425                                         SDLoc dl,
2426                                         SelectionDAG &DAG,
2427                                         SmallVectorImpl<SDValue> &InVals)
2428                                           const {
2429   MachineFunction &MF = DAG.getMachineFunction();
2430   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2431
2432   const Function* Fn = MF.getFunction();
2433   if (Fn->hasExternalLinkage() &&
2434       Subtarget->isTargetCygMing() &&
2435       Fn->getName() == "main")
2436     FuncInfo->setForceFramePointer(true);
2437
2438   MachineFrameInfo *MFI = MF.getFrameInfo();
2439   bool Is64Bit = Subtarget->is64Bit();
2440   bool IsWin64 = Subtarget->isCallingConvWin64(CallConv);
2441
2442   assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
2443          "Var args not supported with calling convention fastcc, ghc or hipe");
2444
2445   // Assign locations to all of the incoming arguments.
2446   SmallVector<CCValAssign, 16> ArgLocs;
2447   CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
2448
2449   // Allocate shadow area for Win64
2450   if (IsWin64)
2451     CCInfo.AllocateStack(32, 8);
2452
2453   CCInfo.AnalyzeFormalArguments(Ins, CC_X86);
2454
2455   unsigned LastVal = ~0U;
2456   SDValue ArgValue;
2457   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2458     CCValAssign &VA = ArgLocs[i];
2459     // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
2460     // places.
2461     assert(VA.getValNo() != LastVal &&
2462            "Don't support value assigned to multiple locs yet");
2463     (void)LastVal;
2464     LastVal = VA.getValNo();
2465
2466     if (VA.isRegLoc()) {
2467       EVT RegVT = VA.getLocVT();
2468       const TargetRegisterClass *RC;
2469       if (RegVT == MVT::i32)
2470         RC = &X86::GR32RegClass;
2471       else if (Is64Bit && RegVT == MVT::i64)
2472         RC = &X86::GR64RegClass;
2473       else if (RegVT == MVT::f32)
2474         RC = &X86::FR32RegClass;
2475       else if (RegVT == MVT::f64)
2476         RC = &X86::FR64RegClass;
2477       else if (RegVT.is512BitVector())
2478         RC = &X86::VR512RegClass;
2479       else if (RegVT.is256BitVector())
2480         RC = &X86::VR256RegClass;
2481       else if (RegVT.is128BitVector())
2482         RC = &X86::VR128RegClass;
2483       else if (RegVT == MVT::x86mmx)
2484         RC = &X86::VR64RegClass;
2485       else if (RegVT == MVT::i1)
2486         RC = &X86::VK1RegClass;
2487       else if (RegVT == MVT::v8i1)
2488         RC = &X86::VK8RegClass;
2489       else if (RegVT == MVT::v16i1)
2490         RC = &X86::VK16RegClass;
2491       else if (RegVT == MVT::v32i1)
2492         RC = &X86::VK32RegClass;
2493       else if (RegVT == MVT::v64i1)
2494         RC = &X86::VK64RegClass;
2495       else
2496         llvm_unreachable("Unknown argument type!");
2497
2498       unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
2499       ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
2500
2501       // If this is an 8 or 16-bit value, it is really passed promoted to 32
2502       // bits.  Insert an assert[sz]ext to capture this, then truncate to the
2503       // right size.
2504       if (VA.getLocInfo() == CCValAssign::SExt)
2505         ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
2506                                DAG.getValueType(VA.getValVT()));
2507       else if (VA.getLocInfo() == CCValAssign::ZExt)
2508         ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
2509                                DAG.getValueType(VA.getValVT()));
2510       else if (VA.getLocInfo() == CCValAssign::BCvt)
2511         ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
2512
2513       if (VA.isExtInLoc()) {
2514         // Handle MMX values passed in XMM regs.
2515         if (RegVT.isVector())
2516           ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
2517         else
2518           ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
2519       }
2520     } else {
2521       assert(VA.isMemLoc());
2522       ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i);
2523     }
2524
2525     // If value is passed via pointer - do a load.
2526     if (VA.getLocInfo() == CCValAssign::Indirect)
2527       ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue,
2528                              MachinePointerInfo(), false, false, false, 0);
2529
2530     InVals.push_back(ArgValue);
2531   }
2532
2533   if (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC()) {
2534     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2535       // The x86-64 ABIs require that for returning structs by value we copy
2536       // the sret argument into %rax/%eax (depending on ABI) for the return.
2537       // Win32 requires us to put the sret argument to %eax as well.
2538       // Save the argument into a virtual register so that we can access it
2539       // from the return points.
2540       if (Ins[i].Flags.isSRet()) {
2541         unsigned Reg = FuncInfo->getSRetReturnReg();
2542         if (!Reg) {
2543           MVT PtrTy = getPointerTy();
2544           Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
2545           FuncInfo->setSRetReturnReg(Reg);
2546         }
2547         SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[i]);
2548         Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
2549         break;
2550       }
2551     }
2552   }
2553
2554   unsigned StackSize = CCInfo.getNextStackOffset();
2555   // Align stack specially for tail calls.
2556   if (FuncIsMadeTailCallSafe(CallConv,
2557                              MF.getTarget().Options.GuaranteedTailCallOpt))
2558     StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
2559
2560   // If the function takes variable number of arguments, make a frame index for
2561   // the start of the first vararg value... for expansion of llvm.va_start. We
2562   // can skip this if there are no va_start calls.
2563   if (MFI->hasVAStart() &&
2564       (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
2565                    CallConv != CallingConv::X86_ThisCall))) {
2566     FuncInfo->setVarArgsFrameIndex(
2567         MFI->CreateFixedObject(1, StackSize, true));
2568   }
2569
2570   // Figure out if XMM registers are in use.
2571   assert(!(MF.getTarget().Options.UseSoftFloat &&
2572            Fn->hasFnAttribute(Attribute::NoImplicitFloat)) &&
2573          "SSE register cannot be used when SSE is disabled!");
2574
2575   // 64-bit calling conventions support varargs and register parameters, so we
2576   // have to do extra work to spill them in the prologue.
2577   if (Is64Bit && isVarArg && MFI->hasVAStart()) {
2578     // Find the first unallocated argument registers.
2579     ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
2580     ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
2581     unsigned NumIntRegs =
2582         CCInfo.getFirstUnallocated(ArgGPRs.data(), ArgGPRs.size());
2583     unsigned NumXMMRegs =
2584         CCInfo.getFirstUnallocated(ArgXMMs.data(), ArgXMMs.size());
2585     assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
2586            "SSE register cannot be used when SSE is disabled!");
2587
2588     // Gather all the live in physical registers.
2589     SmallVector<SDValue, 6> LiveGPRs;
2590     SmallVector<SDValue, 8> LiveXMMRegs;
2591     SDValue ALVal;
2592     for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
2593       unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
2594       LiveGPRs.push_back(
2595           DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
2596     }
2597     if (!ArgXMMs.empty()) {
2598       unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
2599       ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
2600       for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
2601         unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
2602         LiveXMMRegs.push_back(
2603             DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
2604       }
2605     }
2606
2607     if (IsWin64) {
2608       const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
2609       // Get to the caller-allocated home save location.  Add 8 to account
2610       // for the return address.
2611       int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
2612       FuncInfo->setRegSaveFrameIndex(
2613           MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
2614       // Fixup to set vararg frame on shadow area (4 x i64).
2615       if (NumIntRegs < 4)
2616         FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
2617     } else {
2618       // For X86-64, if there are vararg parameters that are passed via
2619       // registers, then we must store them to their spots on the stack so
2620       // they may be loaded by deferencing the result of va_next.
2621       FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
2622       FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
2623       FuncInfo->setRegSaveFrameIndex(MFI->CreateStackObject(
2624           ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
2625     }
2626
2627     // Store the integer parameter registers.
2628     SmallVector<SDValue, 8> MemOps;
2629     SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
2630                                       getPointerTy());
2631     unsigned Offset = FuncInfo->getVarArgsGPOffset();
2632     for (SDValue Val : LiveGPRs) {
2633       SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
2634                                 DAG.getIntPtrConstant(Offset));
2635       SDValue Store =
2636         DAG.getStore(Val.getValue(1), dl, Val, FIN,
2637                      MachinePointerInfo::getFixedStack(
2638                        FuncInfo->getRegSaveFrameIndex(), Offset),
2639                      false, false, 0);
2640       MemOps.push_back(Store);
2641       Offset += 8;
2642     }
2643
2644     if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
2645       // Now store the XMM (fp + vector) parameter registers.
2646       SmallVector<SDValue, 12> SaveXMMOps;
2647       SaveXMMOps.push_back(Chain);
2648       SaveXMMOps.push_back(ALVal);
2649       SaveXMMOps.push_back(DAG.getIntPtrConstant(
2650                              FuncInfo->getRegSaveFrameIndex()));
2651       SaveXMMOps.push_back(DAG.getIntPtrConstant(
2652                              FuncInfo->getVarArgsFPOffset()));
2653       SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
2654                         LiveXMMRegs.end());
2655       MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
2656                                    MVT::Other, SaveXMMOps));
2657     }
2658
2659     if (!MemOps.empty())
2660       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
2661   }
2662
2663   if (isVarArg && MFI->hasMustTailInVarArgFunc()) {
2664     // Find the largest legal vector type.
2665     MVT VecVT = MVT::Other;
2666     // FIXME: Only some x86_32 calling conventions support AVX512.
2667     if (Subtarget->hasAVX512() &&
2668         (Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
2669                      CallConv == CallingConv::Intel_OCL_BI)))
2670       VecVT = MVT::v16f32;
2671     else if (Subtarget->hasAVX())
2672       VecVT = MVT::v8f32;
2673     else if (Subtarget->hasSSE2())
2674       VecVT = MVT::v4f32;
2675
2676     // We forward some GPRs and some vector types.
2677     SmallVector<MVT, 2> RegParmTypes;
2678     MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
2679     RegParmTypes.push_back(IntVT);
2680     if (VecVT != MVT::Other)
2681       RegParmTypes.push_back(VecVT);
2682
2683     // Compute the set of forwarded registers. The rest are scratch.
2684     SmallVectorImpl<ForwardedRegister> &Forwards =
2685         FuncInfo->getForwardedMustTailRegParms();
2686     CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
2687
2688     // Conservatively forward AL on x86_64, since it might be used for varargs.
2689     if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
2690       unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
2691       Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
2692     }
2693
2694     // Copy all forwards from physical to virtual registers.
2695     for (ForwardedRegister &F : Forwards) {
2696       // FIXME: Can we use a less constrained schedule?
2697       SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
2698       F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT));
2699       Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal);
2700     }
2701   }
2702
2703   // Some CCs need callee pop.
2704   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
2705                        MF.getTarget().Options.GuaranteedTailCallOpt)) {
2706     FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
2707   } else {
2708     FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
2709     // If this is an sret function, the return should pop the hidden pointer.
2710     if (!Is64Bit && !IsTailCallConvention(CallConv) &&
2711         !Subtarget->getTargetTriple().isOSMSVCRT() &&
2712         argsAreStructReturn(Ins) == StackStructReturn)
2713       FuncInfo->setBytesToPopOnReturn(4);
2714   }
2715
2716   if (!Is64Bit) {
2717     // RegSaveFrameIndex is X86-64 only.
2718     FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
2719     if (CallConv == CallingConv::X86_FastCall ||
2720         CallConv == CallingConv::X86_ThisCall)
2721       // fastcc functions can't have varargs.
2722       FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
2723   }
2724
2725   FuncInfo->setArgumentStackSize(StackSize);
2726
2727   return Chain;
2728 }
2729
2730 SDValue
2731 X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
2732                                     SDValue StackPtr, SDValue Arg,
2733                                     SDLoc dl, SelectionDAG &DAG,
2734                                     const CCValAssign &VA,
2735                                     ISD::ArgFlagsTy Flags) const {
2736   unsigned LocMemOffset = VA.getLocMemOffset();
2737   SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
2738   PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
2739   if (Flags.isByVal())
2740     return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
2741
2742   return DAG.getStore(Chain, dl, Arg, PtrOff,
2743                       MachinePointerInfo::getStack(LocMemOffset),
2744                       false, false, 0);
2745 }
2746
2747 /// Emit a load of return address if tail call
2748 /// optimization is performed and it is required.
2749 SDValue
2750 X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
2751                                            SDValue &OutRetAddr, SDValue Chain,
2752                                            bool IsTailCall, bool Is64Bit,
2753                                            int FPDiff, SDLoc dl) const {
2754   // Adjust the Return address stack slot.
2755   EVT VT = getPointerTy();
2756   OutRetAddr = getReturnAddressFrameIndex(DAG);
2757
2758   // Load the "old" Return address.
2759   OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(),
2760                            false, false, false, 0);
2761   return SDValue(OutRetAddr.getNode(), 1);
2762 }
2763
2764 /// Emit a store of the return address if tail call
2765 /// optimization is performed and it is required (FPDiff!=0).
2766 static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
2767                                         SDValue Chain, SDValue RetAddrFrIdx,
2768                                         EVT PtrVT, unsigned SlotSize,
2769                                         int FPDiff, SDLoc dl) {
2770   // Store the return address to the appropriate stack slot.
2771   if (!FPDiff) return Chain;
2772   // Calculate the new stack slot for the return address.
2773   int NewReturnAddrFI =
2774     MF.getFrameInfo()->CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
2775                                          false);
2776   SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
2777   Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
2778                        MachinePointerInfo::getFixedStack(NewReturnAddrFI),
2779                        false, false, 0);
2780   return Chain;
2781 }
2782
2783 SDValue
2784 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
2785                              SmallVectorImpl<SDValue> &InVals) const {
2786   SelectionDAG &DAG                     = CLI.DAG;
2787   SDLoc &dl                             = CLI.DL;
2788   SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
2789   SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
2790   SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
2791   SDValue Chain                         = CLI.Chain;
2792   SDValue Callee                        = CLI.Callee;
2793   CallingConv::ID CallConv              = CLI.CallConv;
2794   bool &isTailCall                      = CLI.IsTailCall;
2795   bool isVarArg                         = CLI.IsVarArg;
2796
2797   MachineFunction &MF = DAG.getMachineFunction();
2798   bool Is64Bit        = Subtarget->is64Bit();
2799   bool IsWin64        = Subtarget->isCallingConvWin64(CallConv);
2800   StructReturnType SR = callIsStructReturn(Outs);
2801   bool IsSibcall      = false;
2802   X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
2803
2804   if (MF.getTarget().Options.DisableTailCalls)
2805     isTailCall = false;
2806
2807   bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall();
2808   if (IsMustTail) {
2809     // Force this to be a tail call.  The verifier rules are enough to ensure
2810     // that we can lower this successfully without moving the return address
2811     // around.
2812     isTailCall = true;
2813   } else if (isTailCall) {
2814     // Check if it's really possible to do a tail call.
2815     isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
2816                     isVarArg, SR != NotStructReturn,
2817                     MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
2818                     Outs, OutVals, Ins, DAG);
2819
2820     // Sibcalls are automatically detected tailcalls which do not require
2821     // ABI changes.
2822     if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
2823       IsSibcall = true;
2824
2825     if (isTailCall)
2826       ++NumTailCalls;
2827   }
2828
2829   assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
2830          "Var args not supported with calling convention fastcc, ghc or hipe");
2831
2832   // Analyze operands of the call, assigning locations to each operand.
2833   SmallVector<CCValAssign, 16> ArgLocs;
2834   CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
2835
2836   // Allocate shadow area for Win64
2837   if (IsWin64)
2838     CCInfo.AllocateStack(32, 8);
2839
2840   CCInfo.AnalyzeCallOperands(Outs, CC_X86);
2841
2842   // Get a count of how many bytes are to be pushed on the stack.
2843   unsigned NumBytes = CCInfo.getNextStackOffset();
2844   if (IsSibcall)
2845     // This is a sibcall. The memory operands are available in caller's
2846     // own caller's stack.
2847     NumBytes = 0;
2848   else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
2849            IsTailCallConvention(CallConv))
2850     NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
2851
2852   int FPDiff = 0;
2853   if (isTailCall && !IsSibcall && !IsMustTail) {
2854     // Lower arguments at fp - stackoffset + fpdiff.
2855     unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
2856
2857     FPDiff = NumBytesCallerPushed - NumBytes;
2858
2859     // Set the delta of movement of the returnaddr stackslot.
2860     // But only set if delta is greater than previous delta.
2861     if (FPDiff < X86Info->getTCReturnAddrDelta())
2862       X86Info->setTCReturnAddrDelta(FPDiff);
2863   }
2864
2865   unsigned NumBytesToPush = NumBytes;
2866   unsigned NumBytesToPop = NumBytes;
2867
2868   // If we have an inalloca argument, all stack space has already been allocated
2869   // for us and be right at the top of the stack.  We don't support multiple
2870   // arguments passed in memory when using inalloca.
2871   if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
2872     NumBytesToPush = 0;
2873     if (!ArgLocs.back().isMemLoc())
2874       report_fatal_error("cannot use inalloca attribute on a register "
2875                          "parameter");
2876     if (ArgLocs.back().getLocMemOffset() != 0)
2877       report_fatal_error("any parameter with the inalloca attribute must be "
2878                          "the only memory argument");
2879   }
2880
2881   if (!IsSibcall)
2882     Chain = DAG.getCALLSEQ_START(
2883         Chain, DAG.getIntPtrConstant(NumBytesToPush, true), dl);
2884
2885   SDValue RetAddrFrIdx;
2886   // Load return address for tail calls.
2887   if (isTailCall && FPDiff)
2888     Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
2889                                     Is64Bit, FPDiff, dl);
2890
2891   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
2892   SmallVector<SDValue, 8> MemOpChains;
2893   SDValue StackPtr;
2894
2895   // Walk the register/memloc assignments, inserting copies/loads.  In the case
2896   // of tail call optimization arguments are handle later.
2897   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
2898   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2899     // Skip inalloca arguments, they have already been written.
2900     ISD::ArgFlagsTy Flags = Outs[i].Flags;
2901     if (Flags.isInAlloca())
2902       continue;
2903
2904     CCValAssign &VA = ArgLocs[i];
2905     EVT RegVT = VA.getLocVT();
2906     SDValue Arg = OutVals[i];
2907     bool isByVal = Flags.isByVal();
2908
2909     // Promote the value if needed.
2910     switch (VA.getLocInfo()) {
2911     default: llvm_unreachable("Unknown loc info!");
2912     case CCValAssign::Full: break;
2913     case CCValAssign::SExt:
2914       Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
2915       break;
2916     case CCValAssign::ZExt:
2917       Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
2918       break;
2919     case CCValAssign::AExt:
2920       if (RegVT.is128BitVector()) {
2921         // Special case: passing MMX values in XMM registers.
2922         Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
2923         Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
2924         Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
2925       } else
2926         Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
2927       break;
2928     case CCValAssign::BCvt:
2929       Arg = DAG.getNode(ISD::BITCAST, dl, RegVT, Arg);
2930       break;
2931     case CCValAssign::Indirect: {
2932       // Store the argument.
2933       SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
2934       int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
2935       Chain = DAG.getStore(Chain, dl, Arg, SpillSlot,
2936                            MachinePointerInfo::getFixedStack(FI),
2937                            false, false, 0);
2938       Arg = SpillSlot;
2939       break;
2940     }
2941     }
2942
2943     if (VA.isRegLoc()) {
2944       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
2945       if (isVarArg && IsWin64) {
2946         // Win64 ABI requires argument XMM reg to be copied to the corresponding
2947         // shadow reg if callee is a varargs function.
2948         unsigned ShadowReg = 0;
2949         switch (VA.getLocReg()) {
2950         case X86::XMM0: ShadowReg = X86::RCX; break;
2951         case X86::XMM1: ShadowReg = X86::RDX; break;
2952         case X86::XMM2: ShadowReg = X86::R8; break;
2953         case X86::XMM3: ShadowReg = X86::R9; break;
2954         }
2955         if (ShadowReg)
2956           RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
2957       }
2958     } else if (!IsSibcall && (!isTailCall || isByVal)) {
2959       assert(VA.isMemLoc());
2960       if (!StackPtr.getNode())
2961         StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
2962                                       getPointerTy());
2963       MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
2964                                              dl, DAG, VA, Flags));
2965     }
2966   }
2967
2968   if (!MemOpChains.empty())
2969     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
2970
2971   if (Subtarget->isPICStyleGOT()) {
2972     // ELF / PIC requires GOT in the EBX register before function calls via PLT
2973     // GOT pointer.
2974     if (!isTailCall) {
2975       RegsToPass.push_back(std::make_pair(unsigned(X86::EBX),
2976                DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy())));
2977     } else {
2978       // If we are tail calling and generating PIC/GOT style code load the
2979       // address of the callee into ECX. The value in ecx is used as target of
2980       // the tail jump. This is done to circumvent the ebx/callee-saved problem
2981       // for tail calls on PIC/GOT architectures. Normally we would just put the
2982       // address of GOT into ebx and then call target@PLT. But for tail calls
2983       // ebx would be restored (since ebx is callee saved) before jumping to the
2984       // target@PLT.
2985
2986       // Note: The actual moving to ECX is done further down.
2987       GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
2988       if (G && !G->getGlobal()->hasHiddenVisibility() &&
2989           !G->getGlobal()->hasProtectedVisibility())
2990         Callee = LowerGlobalAddress(Callee, DAG);
2991       else if (isa<ExternalSymbolSDNode>(Callee))
2992         Callee = LowerExternalSymbol(Callee, DAG);
2993     }
2994   }
2995
2996   if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
2997     // From AMD64 ABI document:
2998     // For calls that may call functions that use varargs or stdargs
2999     // (prototype-less calls or calls to functions containing ellipsis (...) in
3000     // the declaration) %al is used as hidden argument to specify the number
3001     // of SSE registers used. The contents of %al do not need to match exactly
3002     // the number of registers, but must be an ubound on the number of SSE
3003     // registers used and is in the range 0 - 8 inclusive.
3004
3005     // Count the number of XMM registers allocated.
3006     static const MCPhysReg XMMArgRegs[] = {
3007       X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3008       X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3009     };
3010     unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
3011     assert((Subtarget->hasSSE1() || !NumXMMRegs)
3012            && "SSE registers cannot be used when SSE is disabled");
3013
3014     RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
3015                                         DAG.getConstant(NumXMMRegs, MVT::i8)));
3016   }
3017
3018   if (isVarArg && IsMustTail) {
3019     const auto &Forwards = X86Info->getForwardedMustTailRegParms();
3020     for (const auto &F : Forwards) {
3021       SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
3022       RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
3023     }
3024   }
3025
3026   // For tail calls lower the arguments to the 'real' stack slots.  Sibcalls
3027   // don't need this because the eligibility check rejects calls that require
3028   // shuffling arguments passed in memory.
3029   if (!IsSibcall && isTailCall) {
3030     // Force all the incoming stack arguments to be loaded from the stack
3031     // before any new outgoing arguments are stored to the stack, because the
3032     // outgoing stack slots may alias the incoming argument stack slots, and
3033     // the alias isn't otherwise explicit. This is slightly more conservative
3034     // than necessary, because it means that each store effectively depends
3035     // on every argument instead of just those arguments it would clobber.
3036     SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
3037
3038     SmallVector<SDValue, 8> MemOpChains2;
3039     SDValue FIN;
3040     int FI = 0;
3041     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3042       CCValAssign &VA = ArgLocs[i];
3043       if (VA.isRegLoc())
3044         continue;
3045       assert(VA.isMemLoc());
3046       SDValue Arg = OutVals[i];
3047       ISD::ArgFlagsTy Flags = Outs[i].Flags;
3048       // Skip inalloca arguments.  They don't require any work.
3049       if (Flags.isInAlloca())
3050         continue;
3051       // Create frame index.
3052       int32_t Offset = VA.getLocMemOffset()+FPDiff;
3053       uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
3054       FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
3055       FIN = DAG.getFrameIndex(FI, getPointerTy());
3056
3057       if (Flags.isByVal()) {
3058         // Copy relative to framepointer.
3059         SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
3060         if (!StackPtr.getNode())
3061           StackPtr = DAG.getCopyFromReg(Chain, dl,
3062                                         RegInfo->getStackRegister(),
3063                                         getPointerTy());
3064         Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
3065
3066         MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
3067                                                          ArgChain,
3068                                                          Flags, DAG, dl));
3069       } else {
3070         // Store relative to framepointer.
3071         MemOpChains2.push_back(
3072           DAG.getStore(ArgChain, dl, Arg, FIN,
3073                        MachinePointerInfo::getFixedStack(FI),
3074                        false, false, 0));
3075       }
3076     }
3077
3078     if (!MemOpChains2.empty())
3079       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
3080
3081     // Store the return address to the appropriate stack slot.
3082     Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
3083                                      getPointerTy(), RegInfo->getSlotSize(),
3084                                      FPDiff, dl);
3085   }
3086
3087   // Build a sequence of copy-to-reg nodes chained together with token chain
3088   // and flag operands which copy the outgoing args into registers.
3089   SDValue InFlag;
3090   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
3091     Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
3092                              RegsToPass[i].second, InFlag);
3093     InFlag = Chain.getValue(1);
3094   }
3095
3096   if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
3097     assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
3098     // In the 64-bit large code model, we have to make all calls
3099     // through a register, since the call instruction's 32-bit
3100     // pc-relative offset may not be large enough to hold the whole
3101     // address.
3102   } else if (Callee->getOpcode() == ISD::GlobalAddress) {
3103     // If the callee is a GlobalAddress node (quite common, every direct call
3104     // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
3105     // it.
3106     GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee);
3107
3108     // We should use extra load for direct calls to dllimported functions in
3109     // non-JIT mode.
3110     const GlobalValue *GV = G->getGlobal();
3111     if (!GV->hasDLLImportStorageClass()) {
3112       unsigned char OpFlags = 0;
3113       bool ExtraLoad = false;
3114       unsigned WrapperKind = ISD::DELETED_NODE;
3115
3116       // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
3117       // external symbols most go through the PLT in PIC mode.  If the symbol
3118       // has hidden or protected visibility, or if it is static or local, then
3119       // we don't need to use the PLT - we can directly call it.
3120       if (Subtarget->isTargetELF() &&
3121           DAG.getTarget().getRelocationModel() == Reloc::PIC_ &&
3122           GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
3123         OpFlags = X86II::MO_PLT;
3124       } else if (Subtarget->isPICStyleStubAny() &&
3125                  (GV->isDeclaration() || GV->isWeakForLinker()) &&
3126                  (!Subtarget->getTargetTriple().isMacOSX() ||
3127                   Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
3128         // PC-relative references to external symbols should go through $stub,
3129         // unless we're building with the leopard linker or later, which
3130         // automatically synthesizes these stubs.
3131         OpFlags = X86II::MO_DARWIN_STUB;
3132       } else if (Subtarget->isPICStyleRIPRel() && isa<Function>(GV) &&
3133                  cast<Function>(GV)->hasFnAttribute(Attribute::NonLazyBind)) {
3134         // If the function is marked as non-lazy, generate an indirect call
3135         // which loads from the GOT directly. This avoids runtime overhead
3136         // at the cost of eager binding (and one extra byte of encoding).
3137         OpFlags = X86II::MO_GOTPCREL;
3138         WrapperKind = X86ISD::WrapperRIP;
3139         ExtraLoad = true;
3140       }
3141
3142       Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(),
3143                                           G->getOffset(), OpFlags);
3144
3145       // Add a wrapper if needed.
3146       if (WrapperKind != ISD::DELETED_NODE)
3147         Callee = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Callee);
3148       // Add extra indirection if needed.
3149       if (ExtraLoad)
3150         Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee,
3151                              MachinePointerInfo::getGOT(),
3152                              false, false, false, 0);
3153     }
3154   } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
3155     unsigned char OpFlags = 0;
3156
3157     // On ELF targets, in either X86-64 or X86-32 mode, direct calls to
3158     // external symbols should go through the PLT.
3159     if (Subtarget->isTargetELF() &&
3160         DAG.getTarget().getRelocationModel() == Reloc::PIC_) {
3161       OpFlags = X86II::MO_PLT;
3162     } else if (Subtarget->isPICStyleStubAny() &&
3163                (!Subtarget->getTargetTriple().isMacOSX() ||
3164                 Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
3165       // PC-relative references to external symbols should go through $stub,
3166       // unless we're building with the leopard linker or later, which
3167       // automatically synthesizes these stubs.
3168       OpFlags = X86II::MO_DARWIN_STUB;
3169     }
3170
3171     Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
3172                                          OpFlags);
3173   } else if (Subtarget->isTarget64BitILP32() &&
3174              Callee->getValueType(0) == MVT::i32) {
3175     // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI
3176     Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
3177   }
3178
3179   // Returns a chain & a flag for retval copy to use.
3180   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3181   SmallVector<SDValue, 8> Ops;
3182
3183   if (!IsSibcall && isTailCall) {
3184     Chain = DAG.getCALLSEQ_END(Chain,
3185                                DAG.getIntPtrConstant(NumBytesToPop, true),
3186                                DAG.getIntPtrConstant(0, true), InFlag, dl);
3187     InFlag = Chain.getValue(1);
3188   }
3189
3190   Ops.push_back(Chain);
3191   Ops.push_back(Callee);
3192
3193   if (isTailCall)
3194     Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));
3195
3196   // Add argument registers to the end of the list so that they are known live
3197   // into the call.
3198   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
3199     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
3200                                   RegsToPass[i].second.getValueType()));
3201
3202   // Add a register mask operand representing the call-preserved registers.
3203   const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
3204   const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
3205   assert(Mask && "Missing call preserved mask for calling convention");
3206   Ops.push_back(DAG.getRegisterMask(Mask));
3207
3208   if (InFlag.getNode())
3209     Ops.push_back(InFlag);
3210
3211   if (isTailCall) {
3212     // We used to do:
3213     //// If this is the first return lowered for this function, add the regs
3214     //// to the liveout set for the function.
3215     // This isn't right, although it's probably harmless on x86; liveouts
3216     // should be computed from returns not tail calls.  Consider a void
3217     // function making a tail call to a function returning int.
3218     return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
3219   }
3220
3221   Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
3222   InFlag = Chain.getValue(1);
3223
3224   // Create the CALLSEQ_END node.
3225   unsigned NumBytesForCalleeToPop;
3226   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
3227                        DAG.getTarget().Options.GuaranteedTailCallOpt))
3228     NumBytesForCalleeToPop = NumBytes;    // Callee pops everything
3229   else if (!Is64Bit && !IsTailCallConvention(CallConv) &&
3230            !Subtarget->getTargetTriple().isOSMSVCRT() &&
3231            SR == StackStructReturn)
3232     // If this is a call to a struct-return function, the callee
3233     // pops the hidden struct pointer, so we have to push it back.
3234     // This is common for Darwin/X86, Linux & Mingw32 targets.
3235     // For MSVC Win32 targets, the caller pops the hidden struct pointer.
3236     NumBytesForCalleeToPop = 4;
3237   else
3238     NumBytesForCalleeToPop = 0;  // Callee pops nothing.
3239
3240   // Returns a flag for retval copy to use.
3241   if (!IsSibcall) {
3242     Chain = DAG.getCALLSEQ_END(Chain,
3243                                DAG.getIntPtrConstant(NumBytesToPop, true),
3244                                DAG.getIntPtrConstant(NumBytesForCalleeToPop,
3245                                                      true),
3246                                InFlag, dl);
3247     InFlag = Chain.getValue(1);
3248   }
3249
3250   // Handle result values, copying them out of physregs into vregs that we
3251   // return.
3252   return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
3253                          Ins, dl, DAG, InVals);
3254 }
3255
3256 //===----------------------------------------------------------------------===//
3257 //                Fast Calling Convention (tail call) implementation
3258 //===----------------------------------------------------------------------===//
3259
3260 //  Like the stdcall convention, the callee cleans up the arguments, except
3261 //  that ECX is reserved for storing the address of the tail-called function.
3262 //  Only 2 registers are free for argument passing (inreg). Tail call
3263 //  optimization is performed provided:
3264 //                * tailcallopt is enabled
3265 //                * caller/callee are fastcc
3266 //  On X86_64 architecture with GOT-style position independent code only local
3267 //  (within module) calls are supported at the moment.
3268 //  To keep the stack aligned according to platform abi the function
3269 //  GetAlignedArgumentStackSize ensures that argument delta is always multiples
3270 //  of stack alignment. (Dynamic linkers need this - darwin's dyld for example)
3271 //  If a tail called function callee has more arguments than the caller the
3272 //  caller needs to make sure that there is room to move the RETADDR to. This is
3273 //  achieved by reserving an area the size of the argument delta right after the
3274 //  original RETADDR, but before the saved framepointer or the spilled registers
3275 //  e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
3276 //  stack layout:
3277 //    arg1
3278 //    arg2
3279 //    RETADDR
3280 //    [ new RETADDR
3281 //      move area ]
3282 //    (possible EBP)
3283 //    ESI
3284 //    EDI
3285 //    local1 ..
3286
3287 /// GetAlignedArgumentStackSize - Make the stack size align e.g 16n + 12 aligned
3288 /// for a 16 byte align requirement.
3289 unsigned
3290 X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
3291                                                SelectionDAG& DAG) const {
3292   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
3293   const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
3294   unsigned StackAlignment = TFI.getStackAlignment();
3295   uint64_t AlignMask = StackAlignment - 1;
3296   int64_t Offset = StackSize;
3297   unsigned SlotSize = RegInfo->getSlotSize();
3298   if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
3299     // Number smaller than 12 so just add the difference.
3300     Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
3301   } else {
3302     // Mask out lower bits, add stackalignment once plus the 12 bytes.
3303     Offset = ((~AlignMask) & Offset) + StackAlignment +
3304       (StackAlignment-SlotSize);
3305   }
3306   return Offset;
3307 }
3308
3309 /// MatchingStackOffset - Return true if the given stack call argument is
3310 /// already available in the same position (relatively) of the caller's
3311 /// incoming argument stack.
3312 static
3313 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
3314                          MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
3315                          const X86InstrInfo *TII) {
3316   unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
3317   int FI = INT_MAX;
3318   if (Arg.getOpcode() == ISD::CopyFromReg) {
3319     unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
3320     if (!TargetRegisterInfo::isVirtualRegister(VR))
3321       return false;
3322     MachineInstr *Def = MRI->getVRegDef(VR);
3323     if (!Def)
3324       return false;
3325     if (!Flags.isByVal()) {
3326       if (!TII->isLoadFromStackSlot(Def, FI))
3327         return false;
3328     } else {
3329       unsigned Opcode = Def->getOpcode();
3330       if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
3331            Opcode == X86::LEA64_32r) &&
3332           Def->getOperand(1).isFI()) {
3333         FI = Def->getOperand(1).getIndex();
3334         Bytes = Flags.getByValSize();
3335       } else
3336         return false;
3337     }
3338   } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
3339     if (Flags.isByVal())
3340       // ByVal argument is passed in as a pointer but it's now being
3341       // dereferenced. e.g.
3342       // define @foo(%struct.X* %A) {
3343       //   tail call @bar(%struct.X* byval %A)
3344       // }
3345       return false;
3346     SDValue Ptr = Ld->getBasePtr();
3347     FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
3348     if (!FINode)
3349       return false;
3350     FI = FINode->getIndex();
3351   } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
3352     FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
3353     FI = FINode->getIndex();
3354     Bytes = Flags.getByValSize();
3355   } else
3356     return false;
3357
3358   assert(FI != INT_MAX);
3359   if (!MFI->isFixedObjectIndex(FI))
3360     return false;
3361   return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI);
3362 }
3363
/// IsEligibleForTailCallOptimization - Check whether the call is eligible
/// for tail call optimization. Targets which want to do tail call
/// optimization should implement this function.
///
/// When GuaranteedTailCallOpt is set the answer depends only on the calling
/// conventions. Otherwise the call has to pass a series of conservative
/// "sibcall" checks: no dynamic stack realignment, no struct return on either
/// side, compatible result locations, stack arguments that are already in
/// place in the caller's incoming argument area, and (on 32-bit targets)
/// enough free registers left to hold the call address.
///
/// \param Callee             the node describing the call target.
/// \param CalleeCC           calling convention of the callee.
/// \param isVarArg           true if the call is variadic.
/// \param isCalleeStructRet  true if the callee uses struct-return semantics.
/// \param isCallerStructRet  true if the caller uses struct-return semantics.
/// \param RetTy              return type of the callee.
/// \param Outs               outgoing argument descriptions.
/// \param OutVals            outgoing argument values, parallel to Outs.
/// \param Ins                descriptions of the values returned by the call.
/// \param DAG                the selection DAG being built.
bool
X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
                                                     CallingConv::ID CalleeCC,
                                                     bool isVarArg,
                                                     bool isCalleeStructRet,
                                                     bool isCallerStructRet,
                                                     Type *RetTy,
                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
                                    const SmallVectorImpl<SDValue> &OutVals,
                                    const SmallVectorImpl<ISD::InputArg> &Ins,
                                                     SelectionDAG &DAG) const {
  // Only tail-call conventions and C-like conventions are ever considered.
  if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
    return false;

  // Caller-side information used by the checks below.
  const MachineFunction &MF = DAG.getMachineFunction();
  const Function *CallerF = MF.getFunction();

  // If the function return type is x86_fp80 and the callee return type is not,
  // then the FP_EXTEND of the call result is not a nop. It's not safe to
  // perform a tailcall optimization here.
  if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
    return false;

  CallingConv::ID CallerCC = CallerF->getCallingConv();
  bool CCMatch = CallerCC == CalleeCC;
  bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC);
  bool IsCallerWin64 = Subtarget->isCallingConvWin64(CallerCC);

  // If guaranteed tail call optimization (-tailcallopt) is requested, only
  // calls that use a tail-call convention matching the caller's convention
  // qualify; none of the sibcall checks below are consulted.
  if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
    if (IsTailCallConvention(CalleeCC) && CCMatch)
      return true;
    return false;
  }

  // Look for obvious safe cases to perform tail call optimization that do not
  // require ABI changes. This is what gcc calls sibcall.

  // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
  // emit a special epilogue.
  const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
  if (RegInfo->needsStackRealignment(MF))
    return false;

  // Also avoid sibcall optimization if either caller or callee uses struct
  // return semantics.
  if (isCalleeStructRet || isCallerStructRet)
    return false;

  // An stdcall/thiscall caller is expected to clean up its arguments; the
  // callee isn't going to do that.
  // FIXME: this is more restrictive than needed. We could produce a tailcall
  // when the stack adjustment matches. For example, with a thiscall that takes
  // only one argument.
  if (!CCMatch && (CallerCC == CallingConv::X86_StdCall ||
                   CallerCC == CallingConv::X86_ThisCall))
    return false;

  // Do not sibcall optimize vararg calls unless all arguments are passed via
  // registers.
  if (isVarArg && !Outs.empty()) {

    // Optimizing for varargs on Win64 is unlikely to be safe without
    // additional testing.
    if (IsCalleeWin64 || IsCallerWin64)
      return false;

    SmallVector<CCValAssign, 16> ArgLocs;
    CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
                   *DAG.getContext());

    // Reject the call as soon as one argument is assigned to memory.
    CCInfo.AnalyzeCallOperands(Outs, CC_X86);
    for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
      if (!ArgLocs[i].isRegLoc())
        return false;
  }

  // If the call result is in ST0 / ST1, it needs to be popped off the x87
  // stack.  Therefore, if it's not used by the caller it is not safe to
  // optimize this into a sibcall.
  bool Unused = false;
  for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
    if (!Ins[i].Used) {
      Unused = true;
      break;
    }
  }
  if (Unused) {
    // At least one result is unused: make sure none of the results is
    // returned in FP0 / FP1.
    SmallVector<CCValAssign, 16> RVLocs;
    CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(), RVLocs,
                   *DAG.getContext());
    CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
    for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
      CCValAssign &VA = RVLocs[i];
      if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
        return false;
    }
  }

  // If the calling conventions do not match, then we'd better make sure the
  // results are returned in the same way as what the caller expects.
  if (!CCMatch) {
    // Result locations under the callee's convention ...
    SmallVector<CCValAssign, 16> RVLocs1;
    CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), RVLocs1,
                    *DAG.getContext());
    CCInfo1.AnalyzeCallResult(Ins, RetCC_X86);

    // ... and under the caller's convention.
    SmallVector<CCValAssign, 16> RVLocs2;
    CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), RVLocs2,
                    *DAG.getContext());
    CCInfo2.AnalyzeCallResult(Ins, RetCC_X86);

    // Both assignments must agree element by element: same kind of location,
    // same LocInfo, and same register or same memory offset.
    if (RVLocs1.size() != RVLocs2.size())
      return false;
    for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
      if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
        return false;
      if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
        return false;
      if (RVLocs1[i].isRegLoc()) {
        if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
          return false;
      } else {
        if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
          return false;
      }
    }
  }

  // If the callee takes no arguments then go on to check the results of the
  // call.
  if (!Outs.empty()) {
    // Check if stack adjustment is needed. For now, do not do this if any
    // argument is passed on the stack.
    SmallVector<CCValAssign, 16> ArgLocs;
    CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
                   *DAG.getContext());

    // Allocate shadow area for Win64
    if (IsCalleeWin64)
      CCInfo.AllocateStack(32, 8);

    CCInfo.AnalyzeCallOperands(Outs, CC_X86);
    if (CCInfo.getNextStackOffset()) {
      // Some stack space is needed for the arguments. Note that this MF
      // intentionally shadows the const reference declared above.
      MachineFunction &MF = DAG.getMachineFunction();
      if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn())
        return false;

      // Check if the arguments are already laid out in the right way as
      // the caller's fixed stack objects.
      MachineFrameInfo *MFI = MF.getFrameInfo();
      const MachineRegisterInfo *MRI = &MF.getRegInfo();
      const X86InstrInfo *TII = Subtarget->getInstrInfo();
      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
        CCValAssign &VA = ArgLocs[i];
        SDValue Arg = OutVals[i];
        ISD::ArgFlagsTy Flags = Outs[i].Flags;
        // Indirectly passed arguments are never accepted here.
        if (VA.getLocInfo() == CCValAssign::Indirect)
          return false;
        if (!VA.isRegLoc()) {
          if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
                                   MFI, MRI, TII))
            return false;
        }
      }
    }

    // If the tailcall address may be in a register, then make sure it's
    // possible to register allocate for it. In 32-bit, the call address can
    // only target EAX, EDX, or ECX since the tail call must be scheduled after
    // callee-saved registers are restored. These happen to be the same
    // registers used to pass 'inreg' arguments so watch out for those.
    if (!Subtarget->is64Bit() &&
        ((!isa<GlobalAddressSDNode>(Callee) &&
          !isa<ExternalSymbolSDNode>(Callee)) ||
         DAG.getTarget().getRelocationModel() == Reloc::PIC_)) {
      unsigned NumInRegs = 0;
      // In PIC we need an extra register to formulate the address computation
      // for the callee.
      unsigned MaxInRegs =
        (DAG.getTarget().getRelocationModel() == Reloc::PIC_) ? 2 : 3;

      // Count the arguments passed in EAX/EDX/ECX and give up once they
      // reach MaxInRegs.
      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
        CCValAssign &VA = ArgLocs[i];
        if (!VA.isRegLoc())
          continue;
        unsigned Reg = VA.getLocReg();
        switch (Reg) {
        default: break;
        case X86::EAX: case X86::EDX: case X86::ECX:
          if (++NumInRegs == MaxInRegs)
            return false;
          break;
        }
      }
    }
  }

  return true;
}
3567
3568 FastISel *
3569 X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
3570                                   const TargetLibraryInfo *libInfo) const {
3571   return X86::createFastISel(funcInfo, libInfo);
3572 }
3573
3574 //===----------------------------------------------------------------------===//
3575 //                           Other Lowering Hooks
3576 //===----------------------------------------------------------------------===//
3577
3578 static bool MayFoldLoad(SDValue Op) {
3579   return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
3580 }
3581
3582 static bool MayFoldIntoStore(SDValue Op) {
3583   return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
3584 }
3585
3586 static bool isTargetShuffle(unsigned Opcode) {
3587   switch(Opcode) {
3588   default: return false;
3589   case X86ISD::BLENDI:
3590   case X86ISD::PSHUFB:
3591   case X86ISD::PSHUFD:
3592   case X86ISD::PSHUFHW:
3593   case X86ISD::PSHUFLW:
3594   case X86ISD::SHUFP:
3595   case X86ISD::PALIGNR:
3596   case X86ISD::MOVLHPS:
3597   case X86ISD::MOVLHPD:
3598   case X86ISD::MOVHLPS:
3599   case X86ISD::MOVLPS:
3600   case X86ISD::MOVLPD:
3601   case X86ISD::MOVSHDUP:
3602   case X86ISD::MOVSLDUP:
3603   case X86ISD::MOVDDUP:
3604   case X86ISD::MOVSS:
3605   case X86ISD::MOVSD:
3606   case X86ISD::UNPCKL:
3607   case X86ISD::UNPCKH:
3608   case X86ISD::VPERMILPI:
3609   case X86ISD::VPERM2X128:
3610   case X86ISD::VPERMI:
3611     return true;
3612   }
3613 }
3614
3615 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
3616                                     SDValue V1, SelectionDAG &DAG) {
3617   switch(Opc) {
3618   default: llvm_unreachable("Unknown x86 shuffle node");
3619   case X86ISD::MOVSHDUP:
3620   case X86ISD::MOVSLDUP:
3621   case X86ISD::MOVDDUP:
3622     return DAG.getNode(Opc, dl, VT, V1);
3623   }
3624 }
3625
3626 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
3627                                     SDValue V1, unsigned TargetMask,
3628                                     SelectionDAG &DAG) {
3629   switch(Opc) {
3630   default: llvm_unreachable("Unknown x86 shuffle node");
3631   case X86ISD::PSHUFD:
3632   case X86ISD::PSHUFHW:
3633   case X86ISD::PSHUFLW:
3634   case X86ISD::VPERMILPI:
3635   case X86ISD::VPERMI:
3636     return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8));
3637   }
3638 }
3639
3640 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
3641                                     SDValue V1, SDValue V2, unsigned TargetMask,
3642                                     SelectionDAG &DAG) {
3643   switch(Opc) {
3644   default: llvm_unreachable("Unknown x86 shuffle node");
3645   case X86ISD::PALIGNR:
3646   case X86ISD::VALIGN:
3647   case X86ISD::SHUFP:
3648   case X86ISD::VPERM2X128:
3649     return DAG.getNode(Opc, dl, VT, V1, V2,
3650                        DAG.getConstant(TargetMask, MVT::i8));
3651   }
3652 }
3653
3654 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
3655                                     SDValue V1, SDValue V2, SelectionDAG &DAG) {
3656   switch(Opc) {
3657   default: llvm_unreachable("Unknown x86 shuffle node");
3658   case X86ISD::MOVLHPS:
3659   case X86ISD::MOVLHPD:
3660   case X86ISD::MOVHLPS:
3661   case X86ISD::MOVLPS:
3662   case X86ISD::MOVLPD:
3663   case X86ISD::MOVSS:
3664   case X86ISD::MOVSD:
3665   case X86ISD::UNPCKL:
3666   case X86ISD::UNPCKH:
3667     return DAG.getNode(Opc, dl, VT, V1, V2);
3668   }
3669 }
3670
3671 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
3672   MachineFunction &MF = DAG.getMachineFunction();
3673   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
3674   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
3675   int ReturnAddrIndex = FuncInfo->getRAIndex();
3676
3677   if (ReturnAddrIndex == 0) {
3678     // Set up a frame object for the return address.
3679     unsigned SlotSize = RegInfo->getSlotSize();
3680     ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize,
3681                                                            -(int64_t)SlotSize,
3682                                                            false);
3683     FuncInfo->setRAIndex(ReturnAddrIndex);
3684   }
3685
3686   return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
3687 }
3688
3689 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
3690                                        bool hasSymbolicDisplacement) {
3691   // Offset should fit into 32 bit immediate field.
3692   if (!isInt<32>(Offset))
3693     return false;
3694
3695   // If we don't have a symbolic displacement - we don't have any extra
3696   // restrictions.
3697   if (!hasSymbolicDisplacement)
3698     return true;
3699
3700   // FIXME: Some tweaks might be needed for medium code model.
3701   if (M != CodeModel::Small && M != CodeModel::Kernel)
3702     return false;
3703
3704   // For small code model we assume that latest object is 16MB before end of 31
3705   // bits boundary. We may also accept pretty large negative constants knowing
3706   // that all objects are in the positive half of address space.
3707   if (M == CodeModel::Small && Offset < 16*1024*1024)
3708     return true;
3709
3710   // For kernel code model we know that all object resist in the negative half
3711   // of 32bits address space. We may not accept negative offsets, since they may
3712   // be just off and we may accept pretty large positive ones.
3713   if (M == CodeModel::Kernel && Offset >= 0)
3714     return true;
3715
3716   return false;
3717 }
3718
3719 /// isCalleePop - Determines whether the callee is required to pop its
3720 /// own arguments. Callee pop is necessary to support tail calls.
3721 bool X86::isCalleePop(CallingConv::ID CallingConv,
3722                       bool is64Bit, bool IsVarArg, bool TailCallOpt) {
3723   switch (CallingConv) {
3724   default:
3725     return false;
3726   case CallingConv::X86_StdCall:
3727   case CallingConv::X86_FastCall:
3728   case CallingConv::X86_ThisCall:
3729     return !is64Bit;
3730   case CallingConv::Fast:
3731   case CallingConv::GHC:
3732   case CallingConv::HiPE:
3733     if (IsVarArg)
3734       return false;
3735     return TailCallOpt;
3736   }
3737 }
3738
3739 /// \brief Return true if the condition is an unsigned comparison operation.
3740 static bool isX86CCUnsigned(unsigned X86CC) {
3741   switch (X86CC) {
3742   default: llvm_unreachable("Invalid integer condition!");
3743   case X86::COND_E:     return true;
3744   case X86::COND_G:     return false;
3745   case X86::COND_GE:    return false;
3746   case X86::COND_L:     return false;
3747   case X86::COND_LE:    return false;
3748   case X86::COND_NE:    return true;
3749   case X86::COND_B:     return true;
3750   case X86::COND_A:     return true;
3751   case X86::COND_BE:    return true;
3752   case X86::COND_AE:    return true;
3753   }
3754   llvm_unreachable("covered switch fell through?!");
3755 }
3756
3757 /// TranslateX86CC - do a one to one translation of a ISD::CondCode to the X86
3758 /// specific condition code, returning the condition code and the LHS/RHS of the
3759 /// comparison to make.
3760 static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
3761                                SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
3762   if (!isFP) {
3763     if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
3764       if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
3765         // X > -1   -> X == 0, jump !sign.
3766         RHS = DAG.getConstant(0, RHS.getValueType());
3767         return X86::COND_NS;
3768       }
3769       if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
3770         // X < 0   -> X == 0, jump on sign.
3771         return X86::COND_S;
3772       }
3773       if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
3774         // X < 1   -> X <= 0
3775         RHS = DAG.getConstant(0, RHS.getValueType());
3776         return X86::COND_LE;
3777       }
3778     }
3779
3780     switch (SetCCOpcode) {
3781     default: llvm_unreachable("Invalid integer condition!");
3782     case ISD::SETEQ:  return X86::COND_E;
3783     case ISD::SETGT:  return X86::COND_G;
3784     case ISD::SETGE:  return X86::COND_GE;
3785     case ISD::SETLT:  return X86::COND_L;
3786     case ISD::SETLE:  return X86::COND_LE;
3787     case ISD::SETNE:  return X86::COND_NE;
3788     case ISD::SETULT: return X86::COND_B;
3789     case ISD::SETUGT: return X86::COND_A;
3790     case ISD::SETULE: return X86::COND_BE;
3791     case ISD::SETUGE: return X86::COND_AE;
3792     }
3793   }
3794
3795   // First determine if it is required or is profitable to flip the operands.
3796
3797   // If LHS is a foldable load, but RHS is not, flip the condition.
3798   if (ISD::isNON_EXTLoad(LHS.getNode()) &&
3799       !ISD::isNON_EXTLoad(RHS.getNode())) {
3800     SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
3801     std::swap(LHS, RHS);
3802   }
3803
3804   switch (SetCCOpcode) {
3805   default: break;
3806   case ISD::SETOLT:
3807   case ISD::SETOLE:
3808   case ISD::SETUGT:
3809   case ISD::SETUGE:
3810     std::swap(LHS, RHS);
3811     break;
3812   }
3813
3814   // On a floating point condition, the flags are set as follows:
3815   // ZF  PF  CF   op
3816   //  0 | 0 | 0 | X > Y
3817   //  0 | 0 | 1 | X < Y
3818   //  1 | 0 | 0 | X == Y
3819   //  1 | 1 | 1 | unordered
3820   switch (SetCCOpcode) {
3821   default: llvm_unreachable("Condcode should be pre-legalized away");
3822   case ISD::SETUEQ:
3823   case ISD::SETEQ:   return X86::COND_E;
3824   case ISD::SETOLT:              // flipped
3825   case ISD::SETOGT:
3826   case ISD::SETGT:   return X86::COND_A;
3827   case ISD::SETOLE:              // flipped
3828   case ISD::SETOGE:
3829   case ISD::SETGE:   return X86::COND_AE;
3830   case ISD::SETUGT:              // flipped
3831   case ISD::SETULT:
3832   case ISD::SETLT:   return X86::COND_B;
3833   case ISD::SETUGE:              // flipped
3834   case ISD::SETULE:
3835   case ISD::SETLE:   return X86::COND_BE;
3836   case ISD::SETONE:
3837   case ISD::SETNE:   return X86::COND_NE;
3838   case ISD::SETUO:   return X86::COND_P;
3839   case ISD::SETO:    return X86::COND_NP;
3840   case ISD::SETOEQ:
3841   case ISD::SETUNE:  return X86::COND_INVALID;
3842   }
3843 }
3844
3845 /// hasFPCMov - is there a floating point cmov for the specific X86 condition
3846 /// code. Current x86 isa includes the following FP cmov instructions:
3847 /// fcmovb, fcomvbe, fcomve, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
3848 static bool hasFPCMov(unsigned X86CC) {
3849   switch (X86CC) {
3850   default:
3851     return false;
3852   case X86::COND_B:
3853   case X86::COND_BE:
3854   case X86::COND_E:
3855   case X86::COND_P:
3856   case X86::COND_A:
3857   case X86::COND_AE:
3858   case X86::COND_NE:
3859   case X86::COND_NP:
3860     return true;
3861   }
3862 }
3863
3864 /// isFPImmLegal - Returns true if the target can instruction select the
3865 /// specified FP immediate natively. If false, the legalizer will
3866 /// materialize the FP immediate as a load from a constant pool.
3867 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
3868   for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
3869     if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
3870       return true;
3871   }
3872   return false;
3873 }
3874
3875 bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
3876                                               ISD::LoadExtType ExtTy,
3877                                               EVT NewVT) const {
3878   // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
3879   // relocation target a movq or addq instruction: don't let the load shrink.
3880   SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
3881   if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
3882     if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
3883       return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
3884   return true;
3885 }
3886
3887 /// \brief Returns true if it is beneficial to convert a load of a constant
3888 /// to just the constant itself.
3889 bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
3890                                                           Type *Ty) const {
3891   assert(Ty->isIntegerTy());
3892
3893   unsigned BitSize = Ty->getPrimitiveSizeInBits();
3894   if (BitSize == 0 || BitSize > 64)
3895     return false;
3896   return true;
3897 }
3898
3899 bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT,
3900                                                 unsigned Index) const {
3901   if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
3902     return false;
3903
3904   return (Index == 0 || Index == ResVT.getVectorNumElements());
3905 }
3906
3907 bool X86TargetLowering::isCheapToSpeculateCttz() const {
3908   // Speculate cttz only if we can directly use TZCNT.
3909   return Subtarget->hasBMI();
3910 }
3911
3912 bool X86TargetLowering::isCheapToSpeculateCtlz() const {
3913   // Speculate ctlz only if we can directly use LZCNT.
3914   return Subtarget->hasLZCNT();
3915 }
3916
/// isUndefOrInRange - Return true if Val is undef (any negative value) or if
/// its value falls within the half-open range [Low, Hi): Low is included, Hi
/// is excluded.
static bool isUndefOrInRange(int Val, int Low, int Hi) {
  // Negative mask entries denote undef and always match.
  if (Val < 0)
    return true;
  return Low <= Val && Val < Hi;
}
3922
/// isUndefOrEqual - Return true if Val is undef (any negative value) or is
/// exactly CmpVal.
static bool isUndefOrEqual(int Val, int CmpVal) {
  if (Val < 0)
    return true;
  return Val == CmpVal;
}
3928
3929 /// isSequentialOrUndefInRange - Return true if every element in Mask, beginning
3930 /// from position Pos and ending in Pos+Size, falls within the specified
3931 /// sequential range (Low, Low+Size]. or is undef.
3932 static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
3933                                        unsigned Pos, unsigned Size, int Low) {
3934   for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
3935     if (!isUndefOrEqual(Mask[i], Low))
3936       return false;
3937   return true;
3938 }
3939
3940 /// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
3941 /// is suitable for input to PSHUFD. That is, it doesn't reference the other
3942 /// operand - by default will match for first operand.
3943 static bool isPSHUFDMask(ArrayRef<int> Mask, MVT VT,
3944                          bool TestSecondOperand = false) {
3945   if (VT != MVT::v4f32 && VT != MVT::v4i32 &&
3946       VT != MVT::v2f64 && VT != MVT::v2i64)
3947     return false;
3948
3949   unsigned NumElems = VT.getVectorNumElements();
3950   unsigned Lo = TestSecondOperand ? NumElems : 0;
3951   unsigned Hi = Lo + NumElems;
3952
3953   for (unsigned i = 0; i < NumElems; ++i)
3954     if (!isUndefOrInRange(Mask[i], (int)Lo, (int)Hi))
3955       return false;
3956
3957   return true;
3958 }
3959
3960 /// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that
3961 /// is suitable for input to PSHUFHW.
3962 static bool isPSHUFHWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
3963   if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
3964     return false;
3965
3966   // Lower quadword copied in order or undef.
3967   if (!isSequentialOrUndefInRange(Mask, 0, 4, 0))
3968     return false;
3969
3970   // Upper quadword shuffled.
3971   for (unsigned i = 4; i != 8; ++i)
3972     if (!isUndefOrInRange(Mask[i], 4, 8))
3973       return false;
3974
3975   if (VT == MVT::v16i16) {
3976     // Lower quadword copied in order or undef.
3977     if (!isSequentialOrUndefInRange(Mask, 8, 4, 8))
3978       return false;
3979
3980     // Upper quadword shuffled.
3981     for (unsigned i = 12; i != 16; ++i)
3982       if (!isUndefOrInRange(Mask[i], 12, 16))
3983         return false;
3984   }
3985
3986   return true;
3987 }
3988
3989 /// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that
3990 /// is suitable for input to PSHUFLW.
3991 static bool isPSHUFLWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
3992   if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
3993     return false;
3994
3995   // Upper quadword copied in order.
3996   if (!isSequentialOrUndefInRange(Mask, 4, 4, 4))
3997     return false;
3998
3999   // Lower quadword shuffled.
4000   for (unsigned i = 0; i != 4; ++i)
4001     if (!isUndefOrInRange(Mask[i], 0, 4))
4002       return false;
4003
4004   if (VT == MVT::v16i16) {
4005     // Upper quadword copied in order.
4006     if (!isSequentialOrUndefInRange(Mask, 12, 4, 12))
4007       return false;
4008
4009     // Lower quadword shuffled.
4010     for (unsigned i = 8; i != 12; ++i)
4011       if (!isUndefOrInRange(Mask[i], 8, 12))
4012         return false;
4013   }
4014
4015   return true;
4016 }
4017
/// \brief Return true if the mask specifies a shuffle of elements that is
/// suitable for input to intralane (palignr) or interlane (valign) vector
/// right-shift.
///
/// \param Mask       shuffle mask; negative entries are undef, entries below
///                   the element count index the first source and the rest
///                   index the second source.
/// \param VT         vector type being shuffled.
/// \param InterLane  when true the whole vector is treated as a single lane;
///                   otherwise the mask is checked per 128-bit lane.
static bool isAlignrMask(ArrayRef<int> Mask, MVT VT, bool InterLane) {
  unsigned NumElts = VT.getVectorNumElements();
  unsigned NumLanes = InterLane ? 1: VT.getSizeInBits()/128;
  unsigned NumLaneElts = NumElts/NumLanes;

  // Do not handle 64-bit element shuffles with palignr.
  if (NumLaneElts == 2)
    return false;

  // l is the index of the first element of the lane being checked.
  for (unsigned l = 0; l != NumElts; l+=NumLaneElts) {
    // Find the first defined (non-negative) element of this lane.
    unsigned i;
    for (i = 0; i != NumLaneElts; ++i) {
      if (Mask[i+l] >= 0)
        break;
    }

    // Lane is all undef, go to next lane
    if (i == NumLaneElts)
      continue;

    int Start = Mask[i+l];

    // Make sure its in this lane in one of the sources
    if (!isUndefOrInRange(Start, l, l+NumLaneElts) &&
        !isUndefOrInRange(Start, l+NumElts, l+NumElts+NumLaneElts))
      return false;

    // If not lane 0, then we must match lane 0
    if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Start, Mask[i]+l))
      return false;

    // Correct second source to be contiguous with first source
    if (Start >= (int)NumElts)
      Start -= NumElts - NumLaneElts;

    // Make sure we're shifting in the right direction.
    if (Start <= (int)(i+l))
      return false;

    // Rebase Start to the value expected at position 0 of the lane, so that
    // position i is expected to hold Start+i.
    Start -= i;

    // Check the rest of the elements to see if they are consecutive.
    for (++i; i != NumLaneElts; ++i) {
      int Idx = Mask[i+l];

      // Make sure its in this lane
      if (!isUndefOrInRange(Idx, l, l+NumLaneElts) &&
          !isUndefOrInRange(Idx, l+NumElts, l+NumElts+NumLaneElts))
        return false;

      // If not lane 0, then we must match lane 0
      if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Idx, Mask[i]+l))
        return false;

      // Same second-source correction as applied to Start above.
      if (Idx >= (int)NumElts)
        Idx -= NumElts - NumLaneElts;

      if (!isUndefOrEqual(Idx, Start+i))
        return false;

    }
  }

  return true;
}
4086
4087 /// \brief Return true if the node specifies a shuffle of elements that is
4088 /// suitable for input to PALIGNR.
4089 static bool isPALIGNRMask(ArrayRef<int> Mask, MVT VT,
4090                           const X86Subtarget *Subtarget) {
4091   if ((VT.is128BitVector() && !Subtarget->hasSSSE3()) ||
4092       (VT.is256BitVector() && !Subtarget->hasInt256()) ||
4093       VT.is512BitVector())
4094     // FIXME: Add AVX512BW.
4095     return false;
4096
4097   return isAlignrMask(Mask, VT, false);
4098 }
4099
4100 /// \brief Return true if the node specifies a shuffle of elements that is
4101 /// suitable for input to VALIGN.
4102 static bool isVALIGNMask(ArrayRef<int> Mask, MVT VT,
4103                           const X86Subtarget *Subtarget) {
4104   // FIXME: Add AVX512VL.
4105   if (!VT.is512BitVector() || !Subtarget->hasAVX512())
4106     return false;
4107   return isAlignrMask(Mask, VT, true);
4108 }
4109
4110 /// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
4111 /// the two vector operands have swapped position.
4112 static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask,
4113                                      unsigned NumElems) {
4114   for (unsigned i = 0; i != NumElems; ++i) {
4115     int idx = Mask[i];
4116     if (idx < 0)
4117       continue;
4118     else if (idx < (int)NumElems)
4119       Mask[i] = idx + NumElems;
4120     else
4121       Mask[i] = idx - NumElems;
4122   }
4123 }
4124
4125 /// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
4126 /// specifies a shuffle of elements that is suitable for input to 128/256-bit
4127 /// SHUFPS and SHUFPD. If Commuted is true, then it checks for sources to be
4128 /// reverse of what x86 shuffles want.
4129 static bool isSHUFPMask(ArrayRef<int> Mask, MVT VT, bool Commuted = false) {
4130
4131   unsigned NumElems = VT.getVectorNumElements();
4132   unsigned NumLanes = VT.getSizeInBits()/128;
4133   unsigned NumLaneElems = NumElems/NumLanes;
4134
4135   if (NumLaneElems != 2 && NumLaneElems != 4)
4136     return false;
4137
4138   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
4139   bool symmetricMaskRequired =
4140     (VT.getSizeInBits() >= 256) && (EltSize == 32);
4141
4142   // VSHUFPSY divides the resulting vector into 4 chunks.
4143   // The sources are also splitted into 4 chunks, and each destination
4144   // chunk must come from a different source chunk.
4145   //
4146   //  SRC1 =>   X7    X6    X5    X4    X3    X2    X1    X0
4147   //  SRC2 =>   Y7    Y6    Y5    Y4    Y3    Y2    Y1    Y9
4148   //
4149   //  DST  =>  Y7..Y4,   Y7..Y4,   X7..X4,   X7..X4,
4150   //           Y3..Y0,   Y3..Y0,   X3..X0,   X3..X0
4151   //
4152   // VSHUFPDY divides the resulting vector into 4 chunks.
4153   // The sources are also splitted into 4 chunks, and each destination
4154   // chunk must come from a different source chunk.
4155   //
4156   //  SRC1 =>      X3       X2       X1       X0
4157   //  SRC2 =>      Y3       Y2       Y1       Y0
4158   //
4159   //  DST  =>  Y3..Y2,  X3..X2,  Y1..Y0,  X1..X0
4160   //
4161   SmallVector<int, 4> MaskVal(NumLaneElems, -1);
4162   unsigned HalfLaneElems = NumLaneElems/2;
4163   for (unsigned l = 0; l != NumElems; l += NumLaneElems) {
4164     for (unsigned i = 0; i != NumLaneElems; ++i) {
4165       int Idx = Mask[i+l];
4166       unsigned RngStart = l + ((Commuted == (i<HalfLaneElems)) ? NumElems : 0);
4167       if (!isUndefOrInRange(Idx, RngStart, RngStart+NumLaneElems))
4168         return false;
4169       // For VSHUFPSY, the mask of the second half must be the same as the
4170       // first but with the appropriate offsets. This works in the same way as
4171       // VPERMILPS works with masks.
4172       if (!symmetricMaskRequired || Idx < 0)
4173         continue;
4174       if (MaskVal[i] < 0) {
4175         MaskVal[i] = Idx - l;
4176         continue;
4177       }
4178       if ((signed)(Idx - l) != MaskVal[i])
4179         return false;
4180     }
4181   }
4182
4183   return true;
4184 }
4185
4186 /// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
4187 /// specifies a shuffle of elements that is suitable for input to MOVHLPS.
4188 static bool isMOVHLPSMask(ArrayRef<int> Mask, MVT VT) {
4189   if (!VT.is128BitVector())
4190     return false;
4191
4192   unsigned NumElems = VT.getVectorNumElements();
4193
4194   if (NumElems != 4)
4195     return false;
4196
4197   // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3
4198   return isUndefOrEqual(Mask[0], 6) &&
4199          isUndefOrEqual(Mask[1], 7) &&
4200          isUndefOrEqual(Mask[2], 2) &&
4201          isUndefOrEqual(Mask[3], 3);
4202 }
4203
4204 /// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
4205 /// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
4206 /// <2, 3, 2, 3>
4207 static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, MVT VT) {
4208   if (!VT.is128BitVector())
4209     return false;
4210
4211   unsigned NumElems = VT.getVectorNumElements();
4212
4213   if (NumElems != 4)
4214     return false;
4215
4216   return isUndefOrEqual(Mask[0], 2) &&
4217          isUndefOrEqual(Mask[1], 3) &&
4218          isUndefOrEqual(Mask[2], 2) &&
4219          isUndefOrEqual(Mask[3], 3);
4220 }
4221
4222 /// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
4223 /// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
4224 static bool isMOVLPMask(ArrayRef<int> Mask, MVT VT) {
4225   if (!VT.is128BitVector())
4226     return false;
4227
4228   unsigned NumElems = VT.getVectorNumElements();
4229
4230   if (NumElems != 2 && NumElems != 4)
4231     return false;
4232
4233   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
4234     if (!isUndefOrEqual(Mask[i], i + NumElems))
4235       return false;
4236
4237   for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
4238     if (!isUndefOrEqual(Mask[i], i))
4239       return false;
4240
4241   return true;
4242 }
4243
4244 /// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand
4245 /// specifies a shuffle of elements that is suitable for input to MOVLHPS.
4246 static bool isMOVLHPSMask(ArrayRef<int> Mask, MVT VT) {
4247   if (!VT.is128BitVector())
4248     return false;
4249
4250   unsigned NumElems = VT.getVectorNumElements();
4251
4252   if (NumElems != 2 && NumElems != 4)
4253     return false;
4254
4255   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
4256     if (!isUndefOrEqual(Mask[i], i))
4257       return false;
4258
4259   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
4260     if (!isUndefOrEqual(Mask[i + e], i + NumElems))
4261       return false;
4262
4263   return true;
4264 }
4265
4266 /// isINSERTPSMask - Return true if the specified VECTOR_SHUFFLE operand
4267 /// specifies a shuffle of elements that is suitable for input to INSERTPS.
4268 /// i. e: If all but one element come from the same vector.
4269 static bool isINSERTPSMask(ArrayRef<int> Mask, MVT VT) {
4270   // TODO: Deal with AVX's VINSERTPS
4271   if (!VT.is128BitVector() || (VT != MVT::v4f32 && VT != MVT::v4i32))
4272     return false;
4273
4274   unsigned CorrectPosV1 = 0;
4275   unsigned CorrectPosV2 = 0;
4276   for (int i = 0, e = (int)VT.getVectorNumElements(); i != e; ++i) {
4277     if (Mask[i] == -1) {
4278       ++CorrectPosV1;
4279       ++CorrectPosV2;
4280       continue;
4281     }
4282
4283     if (Mask[i] == i)
4284       ++CorrectPosV1;
4285     else if (Mask[i] == i + 4)
4286       ++CorrectPosV2;
4287   }
4288
4289   if (CorrectPosV1 == 3 || CorrectPosV2 == 3)
4290     // We have 3 elements (undefs count as elements from any vector) from one
4291     // vector, and one from another.
4292     return true;
4293
4294   return false;
4295 }
4296
4297 //
4298 // Some special combinations that can be optimized.
4299 //
4300 static
4301 SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp,
4302                                SelectionDAG &DAG) {
4303   MVT VT = SVOp->getSimpleValueType(0);
4304   SDLoc dl(SVOp);
4305
4306   if (VT != MVT::v8i32 && VT != MVT::v8f32)
4307     return SDValue();
4308
4309   ArrayRef<int> Mask = SVOp->getMask();
4310
4311   // These are the special masks that may be optimized.
4312   static const int MaskToOptimizeEven[] = {0, 8, 2, 10, 4, 12, 6, 14};
4313   static const int MaskToOptimizeOdd[]  = {1, 9, 3, 11, 5, 13, 7, 15};
4314   bool MatchEvenMask = true;
4315   bool MatchOddMask  = true;
4316   for (int i=0; i<8; ++i) {
4317     if (!isUndefOrEqual(Mask[i], MaskToOptimizeEven[i]))
4318       MatchEvenMask = false;
4319     if (!isUndefOrEqual(Mask[i], MaskToOptimizeOdd[i]))
4320       MatchOddMask = false;
4321   }
4322
4323   if (!MatchEvenMask && !MatchOddMask)
4324     return SDValue();
4325
4326   SDValue UndefNode = DAG.getNode(ISD::UNDEF, dl, VT);
4327
4328   SDValue Op0 = SVOp->getOperand(0);
4329   SDValue Op1 = SVOp->getOperand(1);
4330
4331   if (MatchEvenMask) {
4332     // Shift the second operand right to 32 bits.
4333     static const int ShiftRightMask[] = {-1, 0, -1, 2, -1, 4, -1, 6 };
4334     Op1 = DAG.getVectorShuffle(VT, dl, Op1, UndefNode, ShiftRightMask);
4335   } else {
4336     // Shift the first operand left to 32 bits.
4337     static const int ShiftLeftMask[] = {1, -1, 3, -1, 5, -1, 7, -1 };
4338     Op0 = DAG.getVectorShuffle(VT, dl, Op0, UndefNode, ShiftLeftMask);
4339   }
4340   static const int BlendMask[] = {0, 9, 2, 11, 4, 13, 6, 15};
4341   return DAG.getVectorShuffle(VT, dl, Op0, Op1, BlendMask);
4342 }
4343
4344 /// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
4345 /// specifies a shuffle of elements that is suitable for input to UNPCKL.
4346 static bool isUNPCKLMask(ArrayRef<int> Mask, MVT VT,
4347                          bool HasInt256, bool V2IsSplat = false) {
4348
4349   assert(VT.getSizeInBits() >= 128 &&
4350          "Unsupported vector type for unpckl");
4351
4352   unsigned NumElts = VT.getVectorNumElements();
4353   if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
4354       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
4355     return false;
4356
4357   assert((!VT.is512BitVector() || VT.getScalarType().getSizeInBits() >= 32) &&
4358          "Unsupported vector type for unpckh");
4359
4360   // AVX defines UNPCK* to operate independently on 128-bit lanes.
4361   unsigned NumLanes = VT.getSizeInBits()/128;
4362   unsigned NumLaneElts = NumElts/NumLanes;
4363
4364   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
4365     for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
4366       int BitI  = Mask[l+i];
4367       int BitI1 = Mask[l+i+1];
4368       if (!isUndefOrEqual(BitI, j))
4369         return false;
4370       if (V2IsSplat) {
4371         if (!isUndefOrEqual(BitI1, NumElts))
4372           return false;
4373       } else {
4374         if (!isUndefOrEqual(BitI1, j + NumElts))
4375           return false;
4376       }
4377     }
4378   }
4379
4380   return true;
4381 }
4382
4383 /// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
4384 /// specifies a shuffle of elements that is suitable for input to UNPCKH.
4385 static bool isUNPCKHMask(ArrayRef<int> Mask, MVT VT,
4386                          bool HasInt256, bool V2IsSplat = false) {
4387   assert(VT.getSizeInBits() >= 128 &&
4388          "Unsupported vector type for unpckh");
4389
4390   unsigned NumElts = VT.getVectorNumElements();
4391   if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
4392       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
4393     return false;
4394
4395   assert((!VT.is512BitVector() || VT.getScalarType().getSizeInBits() >= 32) &&
4396          "Unsupported vector type for unpckh");
4397
4398   // AVX defines UNPCK* to operate independently on 128-bit lanes.
4399   unsigned NumLanes = VT.getSizeInBits()/128;
4400   unsigned NumLaneElts = NumElts/NumLanes;
4401
4402   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
4403     for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
4404       int BitI  = Mask[l+i];
4405       int BitI1 = Mask[l+i+1];
4406       if (!isUndefOrEqual(BitI, j))
4407         return false;
4408       if (V2IsSplat) {
4409         if (isUndefOrEqual(BitI1, NumElts))
4410           return false;
4411       } else {
4412         if (!isUndefOrEqual(BitI1, j+NumElts))
4413           return false;
4414       }
4415     }
4416   }
4417   return true;
4418 }
4419
4420 /// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
4421 /// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
4422 /// <0, 0, 1, 1>
4423 static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
4424   unsigned NumElts = VT.getVectorNumElements();
4425   bool Is256BitVec = VT.is256BitVector();
4426
4427   if (VT.is512BitVector())
4428     return false;
4429   assert((VT.is128BitVector() || VT.is256BitVector()) &&
4430          "Unsupported vector type for unpckh");
4431
4432   if (Is256BitVec && NumElts != 4 && NumElts != 8 &&
4433       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
4434     return false;
4435
4436   // For 256-bit i64/f64, use MOVDDUPY instead, so reject the matching pattern
4437   // FIXME: Need a better way to get rid of this, there's no latency difference
4438   // between UNPCKLPD and MOVDDUP, the later should always be checked first and
4439   // the former later. We should also remove the "_undef" special mask.
4440   if (NumElts == 4 && Is256BitVec)
4441     return false;
4442
4443   // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
4444   // independently on 128-bit lanes.
4445   unsigned NumLanes = VT.getSizeInBits()/128;
4446   unsigned NumLaneElts = NumElts/NumLanes;
4447
4448   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
4449     for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
4450       int BitI  = Mask[l+i];
4451       int BitI1 = Mask[l+i+1];
4452
4453       if (!isUndefOrEqual(BitI, j))
4454         return false;
4455       if (!isUndefOrEqual(BitI1, j))
4456         return false;
4457     }
4458   }
4459
4460   return true;
4461 }
4462
4463 /// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
4464 /// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
4465 /// <2, 2, 3, 3>
4466 static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
4467   unsigned NumElts = VT.getVectorNumElements();
4468
4469   if (VT.is512BitVector())
4470     return false;
4471
4472   assert((VT.is128BitVector() || VT.is256BitVector()) &&
4473          "Unsupported vector type for unpckh");
4474
4475   if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
4476       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
4477     return false;
4478
4479   // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
4480   // independently on 128-bit lanes.
4481   unsigned NumLanes = VT.getSizeInBits()/128;
4482   unsigned NumLaneElts = NumElts/NumLanes;
4483
4484   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
4485     for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
4486       int BitI  = Mask[l+i];
4487       int BitI1 = Mask[l+i+1];
4488       if (!isUndefOrEqual(BitI, j))
4489         return false;
4490       if (!isUndefOrEqual(BitI1, j))
4491         return false;
4492     }
4493   }
4494   return true;
4495 }
4496
4497 // Match for INSERTI64x4 INSERTF64x4 instructions (src0[0], src1[0]) or
4498 // (src1[0], src0[1]), manipulation with 256-bit sub-vectors
4499 static bool isINSERT64x4Mask(ArrayRef<int> Mask, MVT VT, unsigned int *Imm) {
4500   if (!VT.is512BitVector())
4501     return false;
4502
4503   unsigned NumElts = VT.getVectorNumElements();
4504   unsigned HalfSize = NumElts/2;
4505   if (isSequentialOrUndefInRange(Mask, 0, HalfSize, 0)) {
4506     if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, NumElts)) {
4507       *Imm = 1;
4508       return true;
4509     }
4510   }
4511   if (isSequentialOrUndefInRange(Mask, 0, HalfSize, NumElts)) {
4512     if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, HalfSize)) {
4513       *Imm = 0;
4514       return true;
4515     }
4516   }
4517   return false;
4518 }
4519
4520 /// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
4521 /// specifies a shuffle of elements that is suitable for input to MOVSS,
4522 /// MOVSD, and MOVD, i.e. setting the lowest element.
4523 static bool isMOVLMask(ArrayRef<int> Mask, EVT VT) {
4524   if (VT.getVectorElementType().getSizeInBits() < 32)
4525     return false;
4526   if (!VT.is128BitVector())
4527     return false;
4528
4529   unsigned NumElts = VT.getVectorNumElements();
4530
4531   if (!isUndefOrEqual(Mask[0], NumElts))
4532     return false;
4533
4534   for (unsigned i = 1; i != NumElts; ++i)
4535     if (!isUndefOrEqual(Mask[i], i))
4536       return false;
4537
4538   return true;
4539 }
4540
4541 /// isVPERM2X128Mask - Match 256-bit shuffles where the elements are considered
4542 /// as permutations between 128-bit chunks or halves. As an example: this
4543 /// shuffle bellow:
4544 ///   vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15>
4545 /// The first half comes from the second half of V1 and the second half from the
4546 /// the second half of V2.
4547 static bool isVPERM2X128Mask(ArrayRef<int> Mask, MVT VT, bool HasFp256) {
4548   if (!HasFp256 || !VT.is256BitVector())
4549     return false;
4550
4551   // The shuffle result is divided into half A and half B. In total the two
4552   // sources have 4 halves, namely: C, D, E, F. The final values of A and
4553   // B must come from C, D, E or F.
4554   unsigned HalfSize = VT.getVectorNumElements()/2;
4555   bool MatchA = false, MatchB = false;
4556
4557   // Check if A comes from one of C, D, E, F.
4558   for (unsigned Half = 0; Half != 4; ++Half) {
4559     if (isSequentialOrUndefInRange(Mask, 0, HalfSize, Half*HalfSize)) {
4560       MatchA = true;
4561       break;
4562     }
4563   }
4564
4565   // Check if B comes from one of C, D, E, F.
4566   for (unsigned Half = 0; Half != 4; ++Half) {
4567     if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, Half*HalfSize)) {
4568       MatchB = true;
4569       break;
4570     }
4571   }
4572
4573   return MatchA && MatchB;
4574 }
4575
4576 /// getShuffleVPERM2X128Immediate - Return the appropriate immediate to shuffle
4577 /// the specified VECTOR_MASK mask with VPERM2F128/VPERM2I128 instructions.
4578 static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) {
4579   MVT VT = SVOp->getSimpleValueType(0);
4580
4581   unsigned HalfSize = VT.getVectorNumElements()/2;
4582
4583   unsigned FstHalf = 0, SndHalf = 0;
4584   for (unsigned i = 0; i < HalfSize; ++i) {
4585     if (SVOp->getMaskElt(i) > 0) {
4586       FstHalf = SVOp->getMaskElt(i)/HalfSize;
4587       break;
4588     }
4589   }
4590   for (unsigned i = HalfSize; i < HalfSize*2; ++i) {
4591     if (SVOp->getMaskElt(i) > 0) {
4592       SndHalf = SVOp->getMaskElt(i)/HalfSize;
4593       break;
4594     }
4595   }
4596
4597   return (FstHalf | (SndHalf << 4));
4598 }
4599
4600 // Symmetric in-lane mask. Each lane has 4 elements (for imm8)
4601 static bool isPermImmMask(ArrayRef<int> Mask, MVT VT, unsigned& Imm8) {
4602   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
4603   if (EltSize < 32)
4604     return false;
4605
4606   unsigned NumElts = VT.getVectorNumElements();
4607   Imm8 = 0;
4608   if (VT.is128BitVector() || (VT.is256BitVector() && EltSize == 64)) {
4609     for (unsigned i = 0; i != NumElts; ++i) {
4610       if (Mask[i] < 0)
4611         continue;
4612       Imm8 |= Mask[i] << (i*2);
4613     }
4614     return true;
4615   }
4616
4617   unsigned LaneSize = 4;
4618   SmallVector<int, 4> MaskVal(LaneSize, -1);
4619
4620   for (unsigned l = 0; l != NumElts; l += LaneSize) {
4621     for (unsigned i = 0; i != LaneSize; ++i) {
4622       if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
4623         return false;
4624       if (Mask[i+l] < 0)
4625         continue;
4626       if (MaskVal[i] < 0) {
4627         MaskVal[i] = Mask[i+l] - l;
4628         Imm8 |= MaskVal[i] << (i*2);
4629         continue;
4630       }
4631       if (Mask[i+l] != (signed)(MaskVal[i]+l))
4632         return false;
4633     }
4634   }
4635   return true;
4636 }
4637
4638 /// isVPERMILPMask - Return true if the specified VECTOR_SHUFFLE operand
4639 /// specifies a shuffle of elements that is suitable for input to VPERMILPD*.
4640 /// Note that VPERMIL mask matching is different depending whether theunderlying
4641 /// type is 32 or 64. In the VPERMILPS the high half of the mask should point
4642 /// to the same elements of the low, but to the higher half of the source.
4643 /// In VPERMILPD the two lanes could be shuffled independently of each other
4644 /// with the same restriction that lanes can't be crossed. Also handles PSHUFDY.
4645 static bool isVPERMILPMask(ArrayRef<int> Mask, MVT VT) {
4646   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
4647   if (VT.getSizeInBits() < 256 || EltSize < 32)
4648     return false;
4649   bool symmetricMaskRequired = (EltSize == 32);
4650   unsigned NumElts = VT.getVectorNumElements();
4651
4652   unsigned NumLanes = VT.getSizeInBits()/128;
4653   unsigned LaneSize = NumElts/NumLanes;
4654   // 2 or 4 elements in one lane
4655
4656   SmallVector<int, 4> ExpectedMaskVal(LaneSize, -1);
4657   for (unsigned l = 0; l != NumElts; l += LaneSize) {
4658     for (unsigned i = 0; i != LaneSize; ++i) {
4659       if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
4660         return false;
4661       if (symmetricMaskRequired) {
4662         if (ExpectedMaskVal[i] < 0 && Mask[i+l] >= 0) {
4663           ExpectedMaskVal[i] = Mask[i+l] - l;
4664           continue;
4665         }
4666         if (!isUndefOrEqual(Mask[i+l], ExpectedMaskVal[i]+l))
4667           return false;
4668       }
4669     }
4670   }
4671   return true;
4672 }
4673
4674 /// isCommutedMOVLMask - Returns true if the shuffle mask is except the reverse
4675 /// of what x86 movss want. X86 movs requires the lowest  element to be lowest
4676 /// element of vector 2 and the other elements to come from vector 1 in order.
4677 static bool isCommutedMOVLMask(ArrayRef<int> Mask, MVT VT,
4678                                bool V2IsSplat = false, bool V2IsUndef = false) {
4679   if (!VT.is128BitVector())
4680     return false;
4681
4682   unsigned NumOps = VT.getVectorNumElements();
4683   if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
4684     return false;
4685
4686   if (!isUndefOrEqual(Mask[0], 0))
4687     return false;
4688
4689   for (unsigned i = 1; i != NumOps; ++i)
4690     if (!(isUndefOrEqual(Mask[i], i+NumOps) ||
4691           (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) ||
4692           (V2IsSplat && isUndefOrEqual(Mask[i], NumOps))))
4693       return false;
4694
4695   return true;
4696 }
4697
4698 /// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
4699 /// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
4700 /// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7>
4701 static bool isMOVSHDUPMask(ArrayRef<int> Mask, MVT VT,
4702                            const X86Subtarget *Subtarget) {
4703   if (!Subtarget->hasSSE3())
4704     return false;
4705
4706   unsigned NumElems = VT.getVectorNumElements();
4707
4708   if ((VT.is128BitVector() && NumElems != 4) ||
4709       (VT.is256BitVector() && NumElems != 8) ||
4710       (VT.is512BitVector() && NumElems != 16))
4711     return false;
4712
4713   // "i+1" is the value the indexed mask element must have
4714   for (unsigned i = 0; i != NumElems; i += 2)
4715     if (!isUndefOrEqual(Mask[i], i+1) ||
4716         !isUndefOrEqual(Mask[i+1], i+1))
4717       return false;
4718
4719   return true;
4720 }
4721
4722 /// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
4723 /// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
4724 /// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6>
4725 static bool isMOVSLDUPMask(ArrayRef<int> Mask, MVT VT,
4726                            const X86Subtarget *Subtarget) {
4727   if (!Subtarget->hasSSE3())
4728     return false;
4729
4730   unsigned NumElems = VT.getVectorNumElements();
4731
4732   if ((VT.is128BitVector() && NumElems != 4) ||
4733       (VT.is256BitVector() && NumElems != 8) ||
4734       (VT.is512BitVector() && NumElems != 16))
4735     return false;
4736
4737   // "i" is the value the indexed mask element must have
4738   for (unsigned i = 0; i != NumElems; i += 2)
4739     if (!isUndefOrEqual(Mask[i], i) ||
4740         !isUndefOrEqual(Mask[i+1], i))
4741       return false;
4742
4743   return true;
4744 }
4745
4746 /// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand
4747 /// specifies a shuffle of elements that is suitable for input to 256-bit
4748 /// version of MOVDDUP.
4749 static bool isMOVDDUPYMask(ArrayRef<int> Mask, MVT VT, bool HasFp256) {
4750   if (!HasFp256 || !VT.is256BitVector())
4751     return false;
4752
4753   unsigned NumElts = VT.getVectorNumElements();
4754   if (NumElts != 4)
4755     return false;
4756
4757   for (unsigned i = 0; i != NumElts/2; ++i)
4758     if (!isUndefOrEqual(Mask[i], 0))
4759       return false;
4760   for (unsigned i = NumElts/2; i != NumElts; ++i)
4761     if (!isUndefOrEqual(Mask[i], NumElts/2))
4762       return false;
4763   return true;
4764 }
4765
4766 /// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand
4767 /// specifies a shuffle of elements that is suitable for input to 128-bit
4768 /// version of MOVDDUP.
4769 static bool isMOVDDUPMask(ArrayRef<int> Mask, MVT VT) {
4770   if (!VT.is128BitVector())
4771     return false;
4772
4773   unsigned e = VT.getVectorNumElements() / 2;
4774   for (unsigned i = 0; i != e; ++i)
4775     if (!isUndefOrEqual(Mask[i], i))
4776       return false;
4777   for (unsigned i = 0; i != e; ++i)
4778     if (!isUndefOrEqual(Mask[e+i], i))
4779       return false;
4780   return true;
4781 }
4782
4783 /// isVEXTRACTIndex - Return true if the specified
4784 /// EXTRACT_SUBVECTOR operand specifies a vector extract that is
4785 /// suitable for instruction that extract 128 or 256 bit vectors
4786 static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
4787   assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
4788   if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
4789     return false;
4790
4791   // The index should be aligned on a vecWidth-bit boundary.
4792   uint64_t Index =
4793     cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
4794
4795   MVT VT = N->getSimpleValueType(0);
4796   unsigned ElSize = VT.getVectorElementType().getSizeInBits();
4797   bool Result = (Index * ElSize) % vecWidth == 0;
4798
4799   return Result;
4800 }
4801
4802 /// isVINSERTIndex - Return true if the specified INSERT_SUBVECTOR
4803 /// operand specifies a subvector insert that is suitable for input to
4804 /// insertion of 128 or 256-bit subvectors
4805 static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) {
4806   assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
4807   if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
4808     return false;
4809   // The index should be aligned on a vecWidth-bit boundary.
4810   uint64_t Index =
4811     cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
4812
4813   MVT VT = N->getSimpleValueType(0);
4814   unsigned ElSize = VT.getVectorElementType().getSizeInBits();
4815   bool Result = (Index * ElSize) % vecWidth == 0;
4816
4817   return Result;
4818 }
4819
4820 bool X86::isVINSERT128Index(SDNode *N) {
4821   return isVINSERTIndex(N, 128);
4822 }
4823
4824 bool X86::isVINSERT256Index(SDNode *N) {
4825   return isVINSERTIndex(N, 256);
4826 }
4827
4828 bool X86::isVEXTRACT128Index(SDNode *N) {
4829   return isVEXTRACTIndex(N, 128);
4830 }
4831
4832 bool X86::isVEXTRACT256Index(SDNode *N) {
4833   return isVEXTRACTIndex(N, 256);
4834 }
4835
4836 /// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
4837 /// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions.
4838 /// Handles 128-bit and 256-bit.
4839 static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) {
4840   MVT VT = N->getSimpleValueType(0);
4841
4842   assert((VT.getSizeInBits() >= 128) &&
4843          "Unsupported vector type for PSHUF/SHUFP");
4844
4845   // Handle 128 and 256-bit vector lengths. AVX defines PSHUF/SHUFP to operate
4846   // independently on 128-bit lanes.
4847   unsigned NumElts = VT.getVectorNumElements();
4848   unsigned NumLanes = VT.getSizeInBits()/128;
4849   unsigned NumLaneElts = NumElts/NumLanes;
4850
4851   assert((NumLaneElts == 2 || NumLaneElts == 4 || NumLaneElts == 8) &&
4852          "Only supports 2, 4 or 8 elements per lane");
4853
4854   unsigned Shift = (NumLaneElts >= 4) ? 1 : 0;
4855   unsigned Mask = 0;
4856   for (unsigned i = 0; i != NumElts; ++i) {
4857     int Elt = N->getMaskElt(i);
4858     if (Elt < 0) continue;
4859     Elt &= NumLaneElts - 1;
4860     unsigned ShAmt = (i << Shift) % 8;
4861     Mask |= Elt << ShAmt;
4862   }
4863
4864   return Mask;
4865 }
4866
4867 /// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
4868 /// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction.
4869 static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) {
4870   MVT VT = N->getSimpleValueType(0);
4871
4872   assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
4873          "Unsupported vector type for PSHUFHW");
4874
4875   unsigned NumElts = VT.getVectorNumElements();
4876
4877   unsigned Mask = 0;
4878   for (unsigned l = 0; l != NumElts; l += 8) {
4879     // 8 nodes per lane, but we only care about the last 4.
4880     for (unsigned i = 0; i < 4; ++i) {
4881       int Elt = N->getMaskElt(l+i+4);
4882       if (Elt < 0) continue;
4883       Elt &= 0x3; // only 2-bits.
4884       Mask |= Elt << (i * 2);
4885     }
4886   }
4887
4888   return Mask;
4889 }
4890
4891 /// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
4892 /// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction.
4893 static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) {
4894   MVT VT = N->getSimpleValueType(0);
4895
4896   assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
4897          "Unsupported vector type for PSHUFHW");
4898
4899   unsigned NumElts = VT.getVectorNumElements();
4900
4901   unsigned Mask = 0;
4902   for (unsigned l = 0; l != NumElts; l += 8) {
4903     // 8 nodes per lane, but we only care about the first 4.
4904     for (unsigned i = 0; i < 4; ++i) {
4905       int Elt = N->getMaskElt(l+i);
4906       if (Elt < 0) continue;
4907       Elt &= 0x3; // only 2-bits
4908       Mask |= Elt << (i * 2);
4909     }
4910   }
4911
4912   return Mask;
4913 }
4914
4915 /// \brief Return the appropriate immediate to shuffle the specified
4916 /// VECTOR_SHUFFLE mask with the PALIGNR (if InterLane is false) or with
4917 /// VALIGN (if Interlane is true) instructions.
4918 static unsigned getShuffleAlignrImmediate(ShuffleVectorSDNode *SVOp,
4919                                            bool InterLane) {
4920   MVT VT = SVOp->getSimpleValueType(0);
4921   unsigned EltSize = InterLane ? 1 :
4922     VT.getVectorElementType().getSizeInBits() >> 3;
4923
4924   unsigned NumElts = VT.getVectorNumElements();
4925   unsigned NumLanes = VT.is512BitVector() ? 1 : VT.getSizeInBits()/128;
4926   unsigned NumLaneElts = NumElts/NumLanes;
4927
4928   int Val = 0;
4929   unsigned i;
4930   for (i = 0; i != NumElts; ++i) {
4931     Val = SVOp->getMaskElt(i);
4932     if (Val >= 0)
4933       break;
4934   }
4935   if (Val >= (int)NumElts)
4936     Val -= NumElts - NumLaneElts;
4937
4938   assert(Val - i > 0 && "PALIGNR imm should be positive");
4939   return (Val - i) * EltSize;
4940 }
4941
4942 /// \brief Return the appropriate immediate to shuffle the specified
4943 /// VECTOR_SHUFFLE mask with the PALIGNR instruction.
4944 static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) {
4945   return getShuffleAlignrImmediate(SVOp, false);
4946 }
4947
4948 /// \brief Return the appropriate immediate to shuffle the specified
4949 /// VECTOR_SHUFFLE mask with the VALIGN instruction.
4950 static unsigned getShuffleVALIGNImmediate(ShuffleVectorSDNode *SVOp) {
4951   return getShuffleAlignrImmediate(SVOp, true);
4952 }
4953
4954
4955 static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
4956   assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
4957   if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
4958     llvm_unreachable("Illegal extract subvector for VEXTRACT");
4959
4960   uint64_t Index =
4961     cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
4962
4963   MVT VecVT = N->getOperand(0).getSimpleValueType();
4964   MVT ElVT = VecVT.getVectorElementType();
4965
4966   unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
4967   return Index / NumElemsPerChunk;
4968 }
4969
4970 static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) {
4971   assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
4972   if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
4973     llvm_unreachable("Illegal insert subvector for VINSERT");
4974
4975   uint64_t Index =
4976     cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
4977
4978   MVT VecVT = N->getSimpleValueType(0);
4979   MVT ElVT = VecVT.getVectorElementType();
4980
4981   unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
4982   return Index / NumElemsPerChunk;
4983 }
4984
4985 /// getExtractVEXTRACT128Immediate - Return the appropriate immediate
4986 /// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128
4987 /// and VINSERTI128 instructions.
4988 unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) {
4989   return getExtractVEXTRACTImmediate(N, 128);
4990 }
4991
4992 /// getExtractVEXTRACT256Immediate - Return the appropriate immediate
4993 /// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF64x4
4994 /// and VINSERTI64x4 instructions.
4995 unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) {
4996   return getExtractVEXTRACTImmediate(N, 256);
4997 }
4998
4999 /// getInsertVINSERT128Immediate - Return the appropriate immediate
5000 /// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128
5001 /// and VINSERTI128 instructions.
5002 unsigned X86::getInsertVINSERT128Immediate(SDNode *N) {
5003   return getInsertVINSERTImmediate(N, 128);
5004 }
5005
5006 /// getInsertVINSERT256Immediate - Return the appropriate immediate
5007 /// to insert at the specified INSERT_SUBVECTOR index with VINSERTF46x4
5008 /// and VINSERTI64x4 instructions.
5009 unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
5010   return getInsertVINSERTImmediate(N, 256);
5011 }
5012
5013 /// isZero - Returns true if Elt is a constant integer zero
5014 static bool isZero(SDValue V) {
5015   ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
5016   return C && C->isNullValue();
5017 }
5018
5019 /// isZeroNode - Returns true if Elt is a constant zero or a floating point
5020 /// constant +0.0.
5021 bool X86::isZeroNode(SDValue Elt) {
5022   if (isZero(Elt))
5023     return true;
5024   if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Elt))
5025     return CFP->getValueAPF().isPosZero();
5026   return false;
5027 }
5028
5029 /// ShouldXformToMOVHLPS - Return true if the node should be transformed to
5030 /// match movhlps. The lower half elements should come from upper half of
5031 /// V1 (and in order), and the upper half elements should come from the upper
5032 /// half of V2 (and in order).
5033 static bool ShouldXformToMOVHLPS(ArrayRef<int> Mask, MVT VT) {
5034   if (!VT.is128BitVector())
5035     return false;
5036   if (VT.getVectorNumElements() != 4)
5037     return false;
5038   for (unsigned i = 0, e = 2; i != e; ++i)
5039     if (!isUndefOrEqual(Mask[i], i+2))
5040       return false;
5041   for (unsigned i = 2; i != 4; ++i)
5042     if (!isUndefOrEqual(Mask[i], i+4))
5043       return false;
5044   return true;
5045 }
5046
5047 /// isScalarLoadToVector - Returns true if the node is a scalar load that
5048 /// is promoted to a vector. It also returns the LoadSDNode by reference if
5049 /// required.
5050 static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = nullptr) {
5051   if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
5052     return false;
5053   N = N->getOperand(0).getNode();
5054   if (!ISD::isNON_EXTLoad(N))
5055     return false;
5056   if (LD)
5057     *LD = cast<LoadSDNode>(N);
5058   return true;
5059 }
5060
5061 // Test whether the given value is a vector value which will be legalized
5062 // into a load.
5063 static bool WillBeConstantPoolLoad(SDNode *N) {
5064   if (N->getOpcode() != ISD::BUILD_VECTOR)
5065     return false;
5066
5067   // Check for any non-constant elements.
5068   for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
5069     switch (N->getOperand(i).getNode()->getOpcode()) {
5070     case ISD::UNDEF:
5071     case ISD::ConstantFP:
5072     case ISD::Constant:
5073       break;
5074     default:
5075       return false;
5076     }
5077
5078   // Vectors of all-zeros and all-ones are materialized with special
5079   // instructions rather than being loaded.
5080   return !ISD::isBuildVectorAllZeros(N) &&
5081          !ISD::isBuildVectorAllOnes(N);
5082 }
5083
5084 /// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
5085 /// match movlp{s|d}. The lower half elements should come from lower half of
5086 /// V1 (and in order), and the upper half elements should come from the upper
5087 /// half of V2 (and in order). And since V1 will become the source of the
5088 /// MOVLP, it must be either a vector load or a scalar load to vector.
5089 static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
5090                                ArrayRef<int> Mask, MVT VT) {
5091   if (!VT.is128BitVector())
5092     return false;
5093
5094   if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
5095     return false;
5096   // Is V2 is a vector load, don't do this transformation. We will try to use
5097   // load folding shufps op.
5098   if (ISD::isNON_EXTLoad(V2) || WillBeConstantPoolLoad(V2))
5099     return false;
5100
5101   unsigned NumElems = VT.getVectorNumElements();
5102
5103   if (NumElems != 2 && NumElems != 4)
5104     return false;
5105   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
5106     if (!isUndefOrEqual(Mask[i], i))
5107       return false;
5108   for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
5109     if (!isUndefOrEqual(Mask[i], i+NumElems))
5110       return false;
5111   return true;
5112 }
5113
5114 /// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
5115 /// to an zero vector.
5116 /// FIXME: move to dag combiner / method on ShuffleVectorSDNode
5117 static bool isZeroShuffle(ShuffleVectorSDNode *N) {
5118   SDValue V1 = N->getOperand(0);
5119   SDValue V2 = N->getOperand(1);
5120   unsigned NumElems = N->getValueType(0).getVectorNumElements();
5121   for (unsigned i = 0; i != NumElems; ++i) {
5122     int Idx = N->getMaskElt(i);
5123     if (Idx >= (int)NumElems) {
5124       unsigned Opc = V2.getOpcode();
5125       if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode()))
5126         continue;
5127       if (Opc != ISD::BUILD_VECTOR ||
5128           !X86::isZeroNode(V2.getOperand(Idx-NumElems)))
5129         return false;
5130     } else if (Idx >= 0) {
5131       unsigned Opc = V1.getOpcode();
5132       if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode()))
5133         continue;
5134       if (Opc != ISD::BUILD_VECTOR ||
5135           !X86::isZeroNode(V1.getOperand(Idx)))
5136         return false;
5137     }
5138   }
5139   return true;
5140 }
5141
5142 /// getZeroVector - Returns a vector of specified type with all zero elements.
5143 ///
5144 static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
5145                              SelectionDAG &DAG, SDLoc dl) {
5146   assert(VT.isVector() && "Expected a vector type");
5147
5148   // Always build SSE zero vectors as <4 x i32> bitcasted
5149   // to their dest type. This ensures they get CSE'd.
5150   SDValue Vec;
5151   if (VT.is128BitVector()) {  // SSE
5152     if (Subtarget->hasSSE2()) {  // SSE2
5153       SDValue Cst = DAG.getConstant(0, MVT::i32);
5154       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
5155     } else { // SSE1
5156       SDValue Cst = DAG.getConstantFP(+0.0, MVT::f32);
5157       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
5158     }
5159   } else if (VT.is256BitVector()) { // AVX
5160     if (Subtarget->hasInt256()) { // AVX2
5161       SDValue Cst = DAG.getConstant(0, MVT::i32);
5162       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
5163       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
5164     } else {
5165       // 256-bit logic and arithmetic instructions in AVX are all
5166       // floating-point, no support for integer ops. Emit fp zeroed vectors.
5167       SDValue Cst = DAG.getConstantFP(+0.0, MVT::f32);
5168       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
5169       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops);
5170     }
5171   } else if (VT.is512BitVector()) { // AVX-512
5172       SDValue Cst = DAG.getConstant(0, MVT::i32);
5173       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
5174                         Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
5175       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops);
5176   } else if (VT.getScalarType() == MVT::i1) {
5177     assert(VT.getVectorNumElements() <= 16 && "Unexpected vector type");
5178     SDValue Cst = DAG.getConstant(0, MVT::i1);
5179     SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
5180     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
5181   } else
5182     llvm_unreachable("Unexpected vector type");
5183
5184   return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
5185 }
5186
5187 /// getOnesVector - Returns a vector of specified type with all bits set.
5188 /// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
5189 /// no AVX2 supprt, use two <4 x i32> inserted in a <8 x i32> appropriately.
5190 /// Then bitcast to their original type, ensuring they get CSE'd.
5191 static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG,
5192                              SDLoc dl) {
5193   assert(VT.isVector() && "Expected a vector type");
5194
5195   SDValue Cst = DAG.getConstant(~0U, MVT::i32);
5196   SDValue Vec;
5197   if (VT.is256BitVector()) {
5198     if (HasInt256) { // AVX2
5199       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
5200       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
5201     } else { // AVX
5202       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
5203       Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
5204     }
5205   } else if (VT.is128BitVector()) {
5206     Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
5207   } else
5208     llvm_unreachable("Unexpected vector type");
5209
5210   return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
5211 }
5212
5213 /// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements
5214 /// that point to V2 points to its first element.
5215 static void NormalizeMask(SmallVectorImpl<int> &Mask, unsigned NumElems) {
5216   for (unsigned i = 0; i != NumElems; ++i) {
5217     if (Mask[i] > (int)NumElems) {
5218       Mask[i] = NumElems;
5219     }
5220   }
5221 }
5222
5223 /// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd
5224 /// operation of specified width.
5225 static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
5226                        SDValue V2) {
5227   unsigned NumElems = VT.getVectorNumElements();
5228   SmallVector<int, 8> Mask;
5229   Mask.push_back(NumElems);
5230   for (unsigned i = 1; i != NumElems; ++i)
5231     Mask.push_back(i);
5232   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
5233 }
5234
5235 /// getUnpackl - Returns a vector_shuffle node for an unpackl operation.
5236 static SDValue getUnpackl(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
5237                           SDValue V2) {
5238   unsigned NumElems = VT.getVectorNumElements();
5239   SmallVector<int, 8> Mask;
5240   for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
5241     Mask.push_back(i);
5242     Mask.push_back(i + NumElems);
5243   }
5244   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
5245 }
5246
5247 /// getUnpackh - Returns a vector_shuffle node for an unpackh operation.
5248 static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
5249                           SDValue V2) {
5250   unsigned NumElems = VT.getVectorNumElements();
5251   SmallVector<int, 8> Mask;
5252   for (unsigned i = 0, Half = NumElems/2; i != Half; ++i) {
5253     Mask.push_back(i + Half);
5254     Mask.push_back(i + NumElems + Half);
5255   }
5256   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
5257 }
5258
5259 // PromoteSplati8i16 - All i16 and i8 vector types can't be used directly by
5260 // a generic shuffle instruction because the target has no such instructions.
5261 // Generate shuffles which repeat i16 and i8 several times until they can be
5262 // represented by v4f32 and then be manipulated by target suported shuffles.
5263 static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) {
5264   MVT VT = V.getSimpleValueType();
5265   int NumElems = VT.getVectorNumElements();
5266   SDLoc dl(V);
5267
5268   while (NumElems > 4) {
5269     if (EltNo < NumElems/2) {
5270       V = getUnpackl(DAG, dl, VT, V, V);
5271     } else {
5272       V = getUnpackh(DAG, dl, VT, V, V);
5273       EltNo -= NumElems/2;
5274     }
5275     NumElems >>= 1;
5276   }
5277   return V;
5278 }
5279
5280 /// getLegalSplat - Generate a legal splat with supported x86 shuffles
5281 static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
5282   MVT VT = V.getSimpleValueType();
5283   SDLoc dl(V);
5284
5285   if (VT.is128BitVector()) {
5286     V = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V);
5287     int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
5288     V = DAG.getVectorShuffle(MVT::v4f32, dl, V, DAG.getUNDEF(MVT::v4f32),
5289                              &SplatMask[0]);
5290   } else if (VT.is256BitVector()) {
5291     // To use VPERMILPS to splat scalars, the second half of indicies must
5292     // refer to the higher part, which is a duplication of the lower one,
5293     // because VPERMILPS can only handle in-lane permutations.
5294     int SplatMask[8] = { EltNo, EltNo, EltNo, EltNo,
5295                          EltNo+4, EltNo+4, EltNo+4, EltNo+4 };
5296
5297     V = DAG.getNode(ISD::BITCAST, dl, MVT::v8f32, V);
5298     V = DAG.getVectorShuffle(MVT::v8f32, dl, V, DAG.getUNDEF(MVT::v8f32),
5299                              &SplatMask[0]);
5300   } else
5301     llvm_unreachable("Vector size not supported");
5302
5303   return DAG.getNode(ISD::BITCAST, dl, VT, V);
5304 }
5305
5306 /// PromoteSplat - Splat is promoted to target supported vector shuffles.
5307 static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
5308   MVT SrcVT = SV->getSimpleValueType(0);
5309   SDValue V1 = SV->getOperand(0);
5310   SDLoc dl(SV);
5311
5312   int EltNo = SV->getSplatIndex();
5313   int NumElems = SrcVT.getVectorNumElements();
5314   bool Is256BitVec = SrcVT.is256BitVector();
5315
5316   assert(((SrcVT.is128BitVector() && NumElems > 4) || Is256BitVec) &&
5317          "Unknown how to promote splat for type");
5318
5319   // Extract the 128-bit part containing the splat element and update
5320   // the splat element index when it refers to the higher register.
5321   if (Is256BitVec) {
5322     V1 = Extract128BitVector(V1, EltNo, DAG, dl);
5323     if (EltNo >= NumElems/2)
5324       EltNo -= NumElems/2;
5325   }
5326
5327   // All i16 and i8 vector types can't be used directly by a generic shuffle
5328   // instruction because the target has no such instruction. Generate shuffles
5329   // which repeat i16 and i8 several times until they fit in i32, and then can
5330   // be manipulated by target suported shuffles.
5331   MVT EltVT = SrcVT.getVectorElementType();
5332   if (EltVT == MVT::i8 || EltVT == MVT::i16)
5333     V1 = PromoteSplati8i16(V1, DAG, EltNo);
5334
5335   // Recreate the 256-bit vector and place the same 128-bit vector
5336   // into the low and high part. This is necessary because we want
5337   // to use VPERM* to shuffle the vectors
5338   if (Is256BitVec) {
5339     V1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT, V1, V1);
5340   }
5341
5342   return getLegalSplat(DAG, V1, EltNo);
5343 }
5344
5345 /// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
5346 /// vector of zero or undef vector.  This produces a shuffle where the low
5347 /// element of V2 is swizzled into the zero/undef vector, landing at element
5348 /// Idx.  This produces a shuffle mask like 4,1,2,3 (idx=0) or  0,1,2,4 (idx=3).
5349 static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
5350                                            bool IsZero,
5351                                            const X86Subtarget *Subtarget,
5352                                            SelectionDAG &DAG) {
5353   MVT VT = V2.getSimpleValueType();
5354   SDValue V1 = IsZero
5355     ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
5356   unsigned NumElems = VT.getVectorNumElements();
5357   SmallVector<int, 16> MaskVec;
5358   for (unsigned i = 0; i != NumElems; ++i)
5359     // If this is the insertion idx, put the low elt of V2 here.
5360     MaskVec.push_back(i == Idx ? NumElems : i);
5361   return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, &MaskVec[0]);
5362 }
5363
5364 /// getTargetShuffleMask - Calculates the shuffle mask corresponding to the
5365 /// target specific opcode. Returns true if the Mask could be calculated. Sets
5366 /// IsUnary to true if only uses one source. Note that this will set IsUnary for
5367 /// shuffles which use a single input multiple times, and in those cases it will
5368 /// adjust the mask to only have indices within that single input.
5369 static bool getTargetShuffleMask(SDNode *N, MVT VT,
5370                                  SmallVectorImpl<int> &Mask, bool &IsUnary) {
5371   unsigned NumElems = VT.getVectorNumElements();
5372   SDValue ImmN;
5373
5374   IsUnary = false;
5375   bool IsFakeUnary = false;
5376   switch(N->getOpcode()) {
5377   case X86ISD::BLENDI:
5378     ImmN = N->getOperand(N->getNumOperands()-1);
5379     DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5380     break;
5381   case X86ISD::SHUFP:
5382     ImmN = N->getOperand(N->getNumOperands()-1);
5383     DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5384     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5385     break;
5386   case X86ISD::UNPCKH:
5387     DecodeUNPCKHMask(VT, Mask);
5388     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5389     break;
5390   case X86ISD::UNPCKL:
5391     DecodeUNPCKLMask(VT, Mask);
5392     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5393     break;
5394   case X86ISD::MOVHLPS:
5395     DecodeMOVHLPSMask(NumElems, Mask);
5396     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5397     break;
5398   case X86ISD::MOVLHPS:
5399     DecodeMOVLHPSMask(NumElems, Mask);
5400     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5401     break;
5402   case X86ISD::PALIGNR:
5403     ImmN = N->getOperand(N->getNumOperands()-1);
5404     DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5405     break;
5406   case X86ISD::PSHUFD:
5407   case X86ISD::VPERMILPI:
5408     ImmN = N->getOperand(N->getNumOperands()-1);
5409     DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5410     IsUnary = true;
5411     break;
5412   case X86ISD::PSHUFHW:
5413     ImmN = N->getOperand(N->getNumOperands()-1);
5414     DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5415     IsUnary = true;
5416     break;
5417   case X86ISD::PSHUFLW:
5418     ImmN = N->getOperand(N->getNumOperands()-1);
5419     DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5420     IsUnary = true;
5421     break;
5422   case X86ISD::PSHUFB: {
5423     IsUnary = true;
5424     SDValue MaskNode = N->getOperand(1);
5425     while (MaskNode->getOpcode() == ISD::BITCAST)
5426       MaskNode = MaskNode->getOperand(0);
5427
5428     if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) {
5429       // If we have a build-vector, then things are easy.
5430       EVT VT = MaskNode.getValueType();
5431       assert(VT.isVector() &&
5432              "Can't produce a non-vector with a build_vector!");
5433       if (!VT.isInteger())
5434         return false;
5435
5436       int NumBytesPerElement = VT.getVectorElementType().getSizeInBits() / 8;
5437
5438       SmallVector<uint64_t, 32> RawMask;
5439       for (int i = 0, e = MaskNode->getNumOperands(); i < e; ++i) {
5440         SDValue Op = MaskNode->getOperand(i);
5441         if (Op->getOpcode() == ISD::UNDEF) {
5442           RawMask.push_back((uint64_t)SM_SentinelUndef);
5443           continue;
5444         }
5445         auto *CN = dyn_cast<ConstantSDNode>(Op.getNode());
5446         if (!CN)
5447           return false;
5448         APInt MaskElement = CN->getAPIntValue();
5449
5450         // We now have to decode the element which could be any integer size and
5451         // extract each byte of it.
5452         for (int j = 0; j < NumBytesPerElement; ++j) {
5453           // Note that this is x86 and so always little endian: the low byte is
5454           // the first byte of the mask.
5455           RawMask.push_back(MaskElement.getLoBits(8).getZExtValue());
5456           MaskElement = MaskElement.lshr(8);
5457         }
5458       }
5459       DecodePSHUFBMask(RawMask, Mask);
5460       break;
5461     }
5462
5463     auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode);
5464     if (!MaskLoad)
5465       return false;
5466
5467     SDValue Ptr = MaskLoad->getBasePtr();
5468     if (Ptr->getOpcode() == X86ISD::Wrapper)
5469       Ptr = Ptr->getOperand(0);
5470
5471     auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr);
5472     if (!MaskCP || MaskCP->isMachineConstantPoolEntry())
5473       return false;
5474
5475     if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) {
5476       DecodePSHUFBMask(C, Mask);
5477       if (Mask.empty())
5478         return false;
5479       break;
5480     }
5481
5482     return false;
5483   }
5484   case X86ISD::VPERMI:
5485     ImmN = N->getOperand(N->getNumOperands()-1);
5486     DecodeVPERMMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5487     IsUnary = true;
5488     break;
5489   case X86ISD::MOVSS:
5490   case X86ISD::MOVSD:
5491     DecodeScalarMoveMask(VT, /* IsLoad */ false, Mask);
5492     break;
5493   case X86ISD::VPERM2X128:
5494     ImmN = N->getOperand(N->getNumOperands()-1);
5495     DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5496     if (Mask.empty()) return false;
5497     break;
5498   case X86ISD::MOVSLDUP:
5499     DecodeMOVSLDUPMask(VT, Mask);
5500     IsUnary = true;
5501     break;
5502   case X86ISD::MOVSHDUP:
5503     DecodeMOVSHDUPMask(VT, Mask);
5504     IsUnary = true;
5505     break;
5506   case X86ISD::MOVDDUP:
5507     DecodeMOVDDUPMask(VT, Mask);
5508     IsUnary = true;
5509     break;
5510   case X86ISD::MOVLHPD:
5511   case X86ISD::MOVLPD:
5512   case X86ISD::MOVLPS:
5513     // Not yet implemented
5514     return false;
5515   default: llvm_unreachable("unknown target shuffle node");
5516   }
5517
5518   // If we have a fake unary shuffle, the shuffle mask is spread across two
5519   // inputs that are actually the same node. Re-map the mask to always point
5520   // into the first input.
5521   if (IsFakeUnary)
5522     for (int &M : Mask)
5523       if (M >= (int)Mask.size())
5524         M -= Mask.size();
5525
5526   return true;
5527 }
5528
5529 /// getShuffleScalarElt - Returns the scalar element that will make up the ith
5530 /// element of the result of the vector shuffle.
5531 static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
5532                                    unsigned Depth) {
5533   if (Depth == 6)
5534     return SDValue();  // Limit search depth.
5535
5536   SDValue V = SDValue(N, 0);
5537   EVT VT = V.getValueType();
5538   unsigned Opcode = V.getOpcode();
5539
5540   // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
5541   if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
5542     int Elt = SV->getMaskElt(Index);
5543
5544     if (Elt < 0)
5545       return DAG.getUNDEF(VT.getVectorElementType());
5546
5547     unsigned NumElems = VT.getVectorNumElements();
5548     SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
5549                                          : SV->getOperand(1);
5550     return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
5551   }
5552
5553   // Recurse into target specific vector shuffles to find scalars.
5554   if (isTargetShuffle(Opcode)) {
5555     MVT ShufVT = V.getSimpleValueType();
5556     unsigned NumElems = ShufVT.getVectorNumElements();
5557     SmallVector<int, 16> ShuffleMask;
5558     bool IsUnary;
5559
5560     if (!getTargetShuffleMask(N, ShufVT, ShuffleMask, IsUnary))
5561       return SDValue();
5562
5563     int Elt = ShuffleMask[Index];
5564     if (Elt < 0)
5565       return DAG.getUNDEF(ShufVT.getVectorElementType());
5566
5567     SDValue NewV = (Elt < (int)NumElems) ? N->getOperand(0)
5568                                          : N->getOperand(1);
5569     return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
5570                                Depth+1);
5571   }
5572
5573   // Actual nodes that may contain scalar elements
5574   if (Opcode == ISD::BITCAST) {
5575     V = V.getOperand(0);
5576     EVT SrcVT = V.getValueType();
5577     unsigned NumElems = VT.getVectorNumElements();
5578
5579     if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
5580       return SDValue();
5581   }
5582
5583   if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
5584     return (Index == 0) ? V.getOperand(0)
5585                         : DAG.getUNDEF(VT.getVectorElementType());
5586
5587   if (V.getOpcode() == ISD::BUILD_VECTOR)
5588     return V.getOperand(Index);
5589
5590   return SDValue();
5591 }
5592
5593 /// getNumOfConsecutiveZeros - Return the number of elements of a vector
5594 /// shuffle operation which come from a consecutively from a zero. The
5595 /// search can start in two different directions, from left or right.
5596 /// We count undefs as zeros until PreferredNum is reached.
5597 static unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp,
5598                                          unsigned NumElems, bool ZerosFromLeft,
5599                                          SelectionDAG &DAG,
5600                                          unsigned PreferredNum = -1U) {
5601   unsigned NumZeros = 0;
5602   for (unsigned i = 0; i != NumElems; ++i) {
5603     unsigned Index = ZerosFromLeft ? i : NumElems - i - 1;
5604     SDValue Elt = getShuffleScalarElt(SVOp, Index, DAG, 0);
5605     if (!Elt.getNode())
5606       break;
5607
5608     if (X86::isZeroNode(Elt))
5609       ++NumZeros;
5610     else if (Elt.getOpcode() == ISD::UNDEF) // Undef as zero up to PreferredNum.
5611       NumZeros = std::min(NumZeros + 1, PreferredNum);
5612     else
5613       break;
5614   }
5615
5616   return NumZeros;
5617 }
5618
5619 /// isShuffleMaskConsecutive - Check if the shuffle mask indicies [MaskI, MaskE)
5620 /// correspond consecutively to elements from one of the vector operands,
5621 /// starting from its index OpIdx. Also tell OpNum which source vector operand.
5622 static
5623 bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp,
5624                               unsigned MaskI, unsigned MaskE, unsigned OpIdx,
5625                               unsigned NumElems, unsigned &OpNum) {
5626   bool SeenV1 = false;
5627   bool SeenV2 = false;
5628
5629   for (unsigned i = MaskI; i != MaskE; ++i, ++OpIdx) {
5630     int Idx = SVOp->getMaskElt(i);
5631     // Ignore undef indicies
5632     if (Idx < 0)
5633       continue;
5634
5635     if (Idx < (int)NumElems)
5636       SeenV1 = true;
5637     else
5638       SeenV2 = true;
5639
5640     // Only accept consecutive elements from the same vector
5641     if ((Idx % NumElems != OpIdx) || (SeenV1 && SeenV2))
5642       return false;
5643   }
5644
5645   OpNum = SeenV1 ? 0 : 1;
5646   return true;
5647 }
5648
5649 /// isVectorShiftRight - Returns true if the shuffle can be implemented as a
5650 /// logical left shift of a vector.
5651 static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
5652                                bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
5653   unsigned NumElems =
5654     SVOp->getSimpleValueType(0).getVectorNumElements();
5655   unsigned NumZeros = getNumOfConsecutiveZeros(
5656       SVOp, NumElems, false /* check zeros from right */, DAG,
5657       SVOp->getMaskElt(0));
5658   unsigned OpSrc;
5659
5660   if (!NumZeros)
5661     return false;
5662
5663   // Considering the elements in the mask that are not consecutive zeros,
5664   // check if they consecutively come from only one of the source vectors.
5665   //
5666   //               V1 = {X, A, B, C}     0
5667   //                         \  \  \    /
5668   //   vector_shuffle V1, V2 <1, 2, 3, X>
5669   //
5670   if (!isShuffleMaskConsecutive(SVOp,
5671             0,                   // Mask Start Index
5672             NumElems-NumZeros,   // Mask End Index(exclusive)
5673             NumZeros,            // Where to start looking in the src vector
5674             NumElems,            // Number of elements in vector
5675             OpSrc))              // Which source operand ?
5676     return false;
5677
5678   isLeft = false;
5679   ShAmt = NumZeros;
5680   ShVal = SVOp->getOperand(OpSrc);
5681   return true;
5682 }
5683
5684 /// isVectorShiftLeft - Returns true if the shuffle can be implemented as a
5685 /// logical left shift of a vector.
5686 static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
5687                               bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
5688   unsigned NumElems =
5689     SVOp->getSimpleValueType(0).getVectorNumElements();
5690   unsigned NumZeros = getNumOfConsecutiveZeros(
5691       SVOp, NumElems, true /* check zeros from left */, DAG,
5692       NumElems - SVOp->getMaskElt(NumElems - 1) - 1);
5693   unsigned OpSrc;
5694
5695   if (!NumZeros)
5696     return false;
5697
5698   // Considering the elements in the mask that are not consecutive zeros,
5699   // check if they consecutively come from only one of the source vectors.
5700   //
5701   //                           0    { A, B, X, X } = V2
5702   //                          / \    /  /
5703   //   vector_shuffle V1, V2 <X, X, 4, 5>
5704   //
5705   if (!isShuffleMaskConsecutive(SVOp,
5706             NumZeros,     // Mask Start Index
5707             NumElems,     // Mask End Index(exclusive)
5708             0,            // Where to start looking in the src vector
5709             NumElems,     // Number of elements in vector
5710             OpSrc))       // Which source operand ?
5711     return false;
5712
5713   isLeft = true;
5714   ShAmt = NumZeros;
5715   ShVal = SVOp->getOperand(OpSrc);
5716   return true;
5717 }
5718
5719 /// isVectorShift - Returns true if the shuffle can be implemented as a
5720 /// logical left or right shift of a vector.
5721 static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
5722                           bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
5723   // Although the logic below support any bitwidth size, there are no
5724   // shift instructions which handle more than 128-bit vectors.
5725   if (!SVOp->getSimpleValueType(0).is128BitVector())
5726     return false;
5727
5728   if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) ||
5729       isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt))
5730     return true;
5731
5732   return false;
5733 }
5734
5735 /// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
5736 ///
5737 static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
5738                                        unsigned NumNonZero, unsigned NumZero,
5739                                        SelectionDAG &DAG,
5740                                        const X86Subtarget* Subtarget,
5741                                        const TargetLowering &TLI) {
5742   if (NumNonZero > 8)
5743     return SDValue();
5744
5745   SDLoc dl(Op);
5746   SDValue V;
5747   bool First = true;
5748   for (unsigned i = 0; i < 16; ++i) {
5749     bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
5750     if (ThisIsNonZero && First) {
5751       if (NumZero)
5752         V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
5753       else
5754         V = DAG.getUNDEF(MVT::v8i16);
5755       First = false;
5756     }
5757
5758     if ((i & 1) != 0) {
5759       SDValue ThisElt, LastElt;
5760       bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
5761       if (LastIsNonZero) {
5762         LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
5763                               MVT::i16, Op.getOperand(i-1));
5764       }
5765       if (ThisIsNonZero) {
5766         ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
5767         ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
5768                               ThisElt, DAG.getConstant(8, MVT::i8));
5769         if (LastIsNonZero)
5770           ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
5771       } else
5772         ThisElt = LastElt;
5773
5774       if (ThisElt.getNode())
5775         V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
5776                         DAG.getIntPtrConstant(i/2));
5777     }
5778   }
5779
5780   return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V);
5781 }
5782
5783 /// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16.
5784 ///
5785 static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
5786                                      unsigned NumNonZero, unsigned NumZero,
5787                                      SelectionDAG &DAG,
5788                                      const X86Subtarget* Subtarget,
5789                                      const TargetLowering &TLI) {
5790   if (NumNonZero > 4)
5791     return SDValue();
5792
5793   SDLoc dl(Op);
5794   SDValue V;
5795   bool First = true;
5796   for (unsigned i = 0; i < 8; ++i) {
5797     bool isNonZero = (NonZeros & (1 << i)) != 0;
5798     if (isNonZero) {
5799       if (First) {
5800         if (NumZero)
5801           V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
5802         else
5803           V = DAG.getUNDEF(MVT::v8i16);
5804         First = false;
5805       }
5806       V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
5807                       MVT::v8i16, V, Op.getOperand(i),
5808                       DAG.getIntPtrConstant(i));
5809     }
5810   }
5811
5812   return V;
5813 }
5814
5815 /// LowerBuildVectorv4x32 - Custom lower build_vector of v4i32 or v4f32.
5816 static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
5817                                      const X86Subtarget *Subtarget,
5818                                      const TargetLowering &TLI) {
5819   // Find all zeroable elements.
5820   bool Zeroable[4];
5821   for (int i=0; i < 4; ++i) {
5822     SDValue Elt = Op->getOperand(i);
5823     Zeroable[i] = (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt));
5824   }
5825   assert(std::count_if(&Zeroable[0], &Zeroable[4],
5826                        [](bool M) { return !M; }) > 1 &&
5827          "We expect at least two non-zero elements!");
5828
5829   // We only know how to deal with build_vector nodes where elements are either
5830   // zeroable or extract_vector_elt with constant index.
5831   SDValue FirstNonZero;
5832   unsigned FirstNonZeroIdx;
5833   for (unsigned i=0; i < 4; ++i) {
5834     if (Zeroable[i])
5835       continue;
5836     SDValue Elt = Op->getOperand(i);
5837     if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
5838         !isa<ConstantSDNode>(Elt.getOperand(1)))
5839       return SDValue();
5840     // Make sure that this node is extracting from a 128-bit vector.
5841     MVT VT = Elt.getOperand(0).getSimpleValueType();
5842     if (!VT.is128BitVector())
5843       return SDValue();
5844     if (!FirstNonZero.getNode()) {
5845       FirstNonZero = Elt;
5846       FirstNonZeroIdx = i;
5847     }
5848   }
5849
5850   assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
5851   SDValue V1 = FirstNonZero.getOperand(0);
5852   MVT VT = V1.getSimpleValueType();
5853
5854   // See if this build_vector can be lowered as a blend with zero.
5855   SDValue Elt;
5856   unsigned EltMaskIdx, EltIdx;
5857   int Mask[4];
5858   for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
5859     if (Zeroable[EltIdx]) {
5860       // The zero vector will be on the right hand side.
5861       Mask[EltIdx] = EltIdx+4;
5862       continue;
5863     }
5864
5865     Elt = Op->getOperand(EltIdx);
5866     // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
5867     EltMaskIdx = cast<ConstantSDNode>(Elt.getOperand(1))->getZExtValue();
5868     if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
5869       break;
5870     Mask[EltIdx] = EltIdx;
5871   }
5872
5873   if (EltIdx == 4) {
5874     // Let the shuffle legalizer deal with blend operations.
5875     SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
5876     if (V1.getSimpleValueType() != VT)
5877       V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), VT, V1);
5878     return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, &Mask[0]);
5879   }
5880
5881   // See if we can lower this build_vector to a INSERTPS.
5882   if (!Subtarget->hasSSE41())
5883     return SDValue();
5884
5885   SDValue V2 = Elt.getOperand(0);
5886   if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
5887     V1 = SDValue();
5888
5889   bool CanFold = true;
5890   for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
5891     if (Zeroable[i])
5892       continue;
5893
5894     SDValue Current = Op->getOperand(i);
5895     SDValue SrcVector = Current->getOperand(0);
5896     if (!V1.getNode())
5897       V1 = SrcVector;
5898     CanFold = SrcVector == V1 &&
5899       cast<ConstantSDNode>(Current.getOperand(1))->getZExtValue() == i;
5900   }
5901
5902   if (!CanFold)
5903     return SDValue();
5904
5905   assert(V1.getNode() && "Expected at least two non-zero elements!");
5906   if (V1.getSimpleValueType() != MVT::v4f32)
5907     V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), MVT::v4f32, V1);
5908   if (V2.getSimpleValueType() != MVT::v4f32)
5909     V2 = DAG.getNode(ISD::BITCAST, SDLoc(V2), MVT::v4f32, V2);
5910
5911   // Ok, we can emit an INSERTPS instruction.
5912   unsigned ZMask = 0;
5913   for (int i = 0; i < 4; ++i)
5914     if (Zeroable[i])
5915       ZMask |= 1 << i;
5916
5917   unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
5918   assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
5919   SDValue Result = DAG.getNode(X86ISD::INSERTPS, SDLoc(Op), MVT::v4f32, V1, V2,
5920                                DAG.getIntPtrConstant(InsertPSMask));
5921   return DAG.getNode(ISD::BITCAST, SDLoc(Op), VT, Result);
5922 }
5923
5924 /// Return a vector logical shift node.
5925 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
5926                          unsigned NumBits, SelectionDAG &DAG,
5927                          const TargetLowering &TLI, SDLoc dl) {
5928   assert(VT.is128BitVector() && "Unknown type for VShift");
5929   MVT ShVT = MVT::v2i64;
5930   unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
5931   SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp);
5932   MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(SrcOp.getValueType());
5933   SDValue ShiftVal = DAG.getConstant(NumBits, ScalarShiftTy);
5934   return DAG.getNode(ISD::BITCAST, dl, VT,
5935                      DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
5936 }
5937
5938 static SDValue
5939 LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) {
5940
5941   // Check if the scalar load can be widened into a vector load. And if
5942   // the address is "base + cst" see if the cst can be "absorbed" into
5943   // the shuffle mask.
5944   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
5945     SDValue Ptr = LD->getBasePtr();
5946     if (!ISD::isNormalLoad(LD) || LD->isVolatile())
5947       return SDValue();
5948     EVT PVT = LD->getValueType(0);
5949     if (PVT != MVT::i32 && PVT != MVT::f32)
5950       return SDValue();
5951
5952     int FI = -1;
5953     int64_t Offset = 0;
5954     if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
5955       FI = FINode->getIndex();
5956       Offset = 0;
5957     } else if (DAG.isBaseWithConstantOffset(Ptr) &&
5958                isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
5959       FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
5960       Offset = Ptr.getConstantOperandVal(1);
5961       Ptr = Ptr.getOperand(0);
5962     } else {
5963       return SDValue();
5964     }
5965
5966     // FIXME: 256-bit vector instructions don't require a strict alignment,
5967     // improve this code to support it better.
5968     unsigned RequiredAlign = VT.getSizeInBits()/8;
5969     SDValue Chain = LD->getChain();
5970     // Make sure the stack object alignment is at least 16 or 32.
5971     MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
5972     if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
5973       if (MFI->isFixedObjectIndex(FI)) {
5974         // Can't change the alignment. FIXME: It's possible to compute
5975         // the exact stack offset and reference FI + adjust offset instead.
5976         // If someone *really* cares about this. That's the way to implement it.
5977         return SDValue();
5978       } else {
5979         MFI->setObjectAlignment(FI, RequiredAlign);
5980       }
5981     }
5982
5983     // (Offset % 16 or 32) must be multiple of 4. Then address is then
5984     // Ptr + (Offset & ~15).
5985     if (Offset < 0)
5986       return SDValue();
5987     if ((Offset % RequiredAlign) & 3)
5988       return SDValue();
5989     int64_t StartOffset = Offset & ~(RequiredAlign-1);
5990     if (StartOffset)
5991       Ptr = DAG.getNode(ISD::ADD, SDLoc(Ptr), Ptr.getValueType(),
5992                         Ptr,DAG.getConstant(StartOffset, Ptr.getValueType()));
5993
5994     int EltNo = (Offset - StartOffset) >> 2;
5995     unsigned NumElems = VT.getVectorNumElements();
5996
5997     EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
5998     SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
5999                              LD->getPointerInfo().getWithOffset(StartOffset),
6000                              false, false, false, 0);
6001
6002     SmallVector<int, 8> Mask;
6003     for (unsigned i = 0; i != NumElems; ++i)
6004       Mask.push_back(EltNo);
6005
6006     return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]);
6007   }
6008
6009   return SDValue();
6010 }
6011
6012 /// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
6013 /// elements can be replaced by a single large load which has the same value as
6014 /// a build_vector or insert_subvector whose loaded operands are 'Elts'.
6015 ///
6016 /// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a
6017 ///
6018 /// FIXME: we'd also like to handle the case where the last elements are zero
6019 /// rather than undef via VZEXT_LOAD, but we do not detect that case today.
6020 /// There's even a handy isZeroNode for that purpose.
6021 static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
6022                                         SDLoc &DL, SelectionDAG &DAG,
6023                                         bool isAfterLegalize) {
6024   unsigned NumElems = Elts.size();
6025
6026   LoadSDNode *LDBase = nullptr;
6027   unsigned LastLoadedElt = -1U;
6028
6029   // For each element in the initializer, see if we've found a load or an undef.
6030   // If we don't find an initial load element, or later load elements are
6031   // non-consecutive, bail out.
6032   for (unsigned i = 0; i < NumElems; ++i) {
6033     SDValue Elt = Elts[i];
6034     // Look through a bitcast.
6035     if (Elt.getNode() && Elt.getOpcode() == ISD::BITCAST)
6036       Elt = Elt.getOperand(0);
6037     if (!Elt.getNode() ||
6038         (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
6039       return SDValue();
6040     if (!LDBase) {
6041       if (Elt.getNode()->getOpcode() == ISD::UNDEF)
6042         return SDValue();
6043       LDBase = cast<LoadSDNode>(Elt.getNode());
6044       LastLoadedElt = i;
6045       continue;
6046     }
6047     if (Elt.getOpcode() == ISD::UNDEF)
6048       continue;
6049
6050     LoadSDNode *LD = cast<LoadSDNode>(Elt);
6051     EVT LdVT = Elt.getValueType();
6052     // Each loaded element must be the correct fractional portion of the
6053     // requested vector load.
6054     if (LdVT.getSizeInBits() != VT.getSizeInBits() / NumElems)
6055       return SDValue();
6056     if (!DAG.isConsecutiveLoad(LD, LDBase, LdVT.getSizeInBits() / 8, i))
6057       return SDValue();
6058     LastLoadedElt = i;
6059   }
6060
6061   // If we have found an entire vector of loads and undefs, then return a large
6062   // load of the entire vector width starting at the base pointer.  If we found
6063   // consecutive loads for the low half, generate a vzext_load node.
6064   if (LastLoadedElt == NumElems - 1) {
6065     assert(LDBase && "Did not find base load for merging consecutive loads");
6066     EVT EltVT = LDBase->getValueType(0);
6067     // Ensure that the input vector size for the merged loads matches the
6068     // cumulative size of the input elements.
6069     if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems)
6070       return SDValue();
6071
6072     if (isAfterLegalize &&
6073         !DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, VT))
6074       return SDValue();
6075
6076     SDValue NewLd = SDValue();
6077
6078     NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
6079                         LDBase->getPointerInfo(), LDBase->isVolatile(),
6080                         LDBase->isNonTemporal(), LDBase->isInvariant(),
6081                         LDBase->getAlignment());
6082
6083     if (LDBase->hasAnyUseOfValue(1)) {
6084       SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
6085                                      SDValue(LDBase, 1),
6086                                      SDValue(NewLd.getNode(), 1));
6087       DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
6088       DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
6089                              SDValue(NewLd.getNode(), 1));
6090     }
6091
6092     return NewLd;
6093   }
6094
6095   //TODO: The code below fires only for for loading the low v2i32 / v2f32
6096   //of a v4i32 / v4f32. It's probably worth generalizing.
6097   EVT EltVT = VT.getVectorElementType();
6098   if (NumElems == 4 && LastLoadedElt == 1 && (EltVT.getSizeInBits() == 32) &&
6099       DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
6100     SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
6101     SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
6102     SDValue ResNode =
6103         DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, MVT::i64,
6104                                 LDBase->getPointerInfo(),
6105                                 LDBase->getAlignment(),
6106                                 false/*isVolatile*/, true/*ReadMem*/,
6107                                 false/*WriteMem*/);
6108
6109     // Make sure the newly-created LOAD is in the same position as LDBase in
6110     // terms of dependency. We create a TokenFactor for LDBase and ResNode, and
6111     // update uses of LDBase's output chain to use the TokenFactor.
6112     if (LDBase->hasAnyUseOfValue(1)) {
6113       SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
6114                              SDValue(LDBase, 1), SDValue(ResNode.getNode(), 1));
6115       DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
6116       DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
6117                              SDValue(ResNode.getNode(), 1));
6118     }
6119
6120     return DAG.getNode(ISD::BITCAST, DL, VT, ResNode);
6121   }
6122   return SDValue();
6123 }
6124
6125 /// LowerVectorBroadcast - Attempt to use the vbroadcast instruction
6126 /// to generate a splat value for the following cases:
6127 /// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant.
6128 /// 2. A splat shuffle which uses a scalar_to_vector node which comes from
6129 /// a scalar load, or a constant.
6130 /// The VBROADCAST node is returned when a pattern is found,
6131 /// or SDValue() otherwise.
6132 static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
6133                                     SelectionDAG &DAG) {
6134   // VBROADCAST requires AVX.
6135   // TODO: Splats could be generated for non-AVX CPUs using SSE
6136   // instructions, but there's less potential gain for only 128-bit vectors.
6137   if (!Subtarget->hasAVX())
6138     return SDValue();
6139
6140   MVT VT = Op.getSimpleValueType();
6141   SDLoc dl(Op);
6142
6143   assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
6144          "Unsupported vector type for broadcast.");
6145
6146   SDValue Ld;
6147   bool ConstSplatVal;
6148
6149   switch (Op.getOpcode()) {
6150     default:
6151       // Unknown pattern found.
6152       return SDValue();
6153
6154     case ISD::BUILD_VECTOR: {
6155       auto *BVOp = cast<BuildVectorSDNode>(Op.getNode());
6156       BitVector UndefElements;
6157       SDValue Splat = BVOp->getSplatValue(&UndefElements);
6158
6159       // We need a splat of a single value to use broadcast, and it doesn't
6160       // make any sense if the value is only in one element of the vector.
6161       if (!Splat || (VT.getVectorNumElements() - UndefElements.count()) <= 1)
6162         return SDValue();
6163
6164       Ld = Splat;
6165       ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
6166                        Ld.getOpcode() == ISD::ConstantFP);
6167
6168       // Make sure that all of the users of a non-constant load are from the
6169       // BUILD_VECTOR node.
6170       if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
6171         return SDValue();
6172       break;
6173     }
6174
6175     case ISD::VECTOR_SHUFFLE: {
6176       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
6177
6178       // Shuffles must have a splat mask where the first element is
6179       // broadcasted.
6180       if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0)
6181         return SDValue();
6182
6183       SDValue Sc = Op.getOperand(0);
6184       if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR &&
6185           Sc.getOpcode() != ISD::BUILD_VECTOR) {
6186
6187         if (!Subtarget->hasInt256())
6188           return SDValue();
6189
6190         // Use the register form of the broadcast instruction available on AVX2.
6191         if (VT.getSizeInBits() >= 256)
6192           Sc = Extract128BitVector(Sc, 0, DAG, dl);
6193         return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc);
6194       }
6195
6196       Ld = Sc.getOperand(0);
6197       ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
6198                        Ld.getOpcode() == ISD::ConstantFP);
6199
6200       // The scalar_to_vector node and the suspected
6201       // load node must have exactly one user.
6202       // Constants may have multiple users.
6203
6204       // AVX-512 has register version of the broadcast
6205       bool hasRegVer = Subtarget->hasAVX512() && VT.is512BitVector() &&
6206         Ld.getValueType().getSizeInBits() >= 32;
6207       if (!ConstSplatVal && ((!Sc.hasOneUse() || !Ld.hasOneUse()) &&
6208           !hasRegVer))
6209         return SDValue();
6210       break;
6211     }
6212   }
6213
6214   unsigned ScalarSize = Ld.getValueType().getSizeInBits();
6215   bool IsGE256 = (VT.getSizeInBits() >= 256);
6216
6217   // When optimizing for size, generate up to 5 extra bytes for a broadcast
6218   // instruction to save 8 or more bytes of constant pool data.
6219   // TODO: If multiple splats are generated to load the same constant,
6220   // it may be detrimental to overall size. There needs to be a way to detect
6221   // that condition to know if this is truly a size win.
6222   const Function *F = DAG.getMachineFunction().getFunction();
6223   bool OptForSize = F->hasFnAttribute(Attribute::OptimizeForSize);
6224
6225   // Handle broadcasting a single constant scalar from the constant pool
6226   // into a vector.
6227   // On Sandybridge (no AVX2), it is still better to load a constant vector
6228   // from the constant pool and not to broadcast it from a scalar.
6229   // But override that restriction when optimizing for size.
6230   // TODO: Check if splatting is recommended for other AVX-capable CPUs.
6231   if (ConstSplatVal && (Subtarget->hasAVX2() || OptForSize)) {
6232     EVT CVT = Ld.getValueType();
6233     assert(!CVT.isVector() && "Must not broadcast a vector type");
6234
6235     // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
6236     // For size optimization, also splat v2f64 and v2i64, and for size opt
6237     // with AVX2, also splat i8 and i16.
6238     // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
6239     if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
6240         (OptForSize && (ScalarSize == 64 || Subtarget->hasAVX2()))) {
6241       const Constant *C = nullptr;
6242       if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
6243         C = CI->getConstantIntValue();
6244       else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
6245         C = CF->getConstantFPValue();
6246
6247       assert(C && "Invalid constant type");
6248
6249       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6250       SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
6251       unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
6252       Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP,
6253                        MachinePointerInfo::getConstantPool(),
6254                        false, false, false, Alignment);
6255
6256       return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6257     }
6258   }
6259
6260   bool IsLoad = ISD::isNormalLoad(Ld.getNode());
6261
6262   // Handle AVX2 in-register broadcasts.
6263   if (!IsLoad && Subtarget->hasInt256() &&
6264       (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
6265     return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6266
6267   // The scalar source must be a normal load.
6268   if (!IsLoad)
6269     return SDValue();
6270
6271   if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
6272       (Subtarget->hasVLX() && ScalarSize == 64))
6273     return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6274
6275   // The integer check is needed for the 64-bit into 128-bit so it doesn't match
6276   // double since there is no vbroadcastsd xmm
6277   if (Subtarget->hasInt256() && Ld.getValueType().isInteger()) {
6278     if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
6279       return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6280   }
6281
6282   // Unsupported broadcast.
6283   return SDValue();
6284 }
6285
6286 /// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real
6287 /// underlying vector and index.
6288 ///
6289 /// Modifies \p ExtractedFromVec to the real vector and returns the real
6290 /// index.
6291 static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
6292                                          SDValue ExtIdx) {
6293   int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
6294   if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
6295     return Idx;
6296
6297   // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
6298   // lowered this:
6299   //   (extract_vector_elt (v8f32 %vreg1), Constant<6>)
6300   // to:
6301   //   (extract_vector_elt (vector_shuffle<2,u,u,u>
6302   //                           (extract_subvector (v8f32 %vreg0), Constant<4>),
6303   //                           undef)
6304   //                       Constant<0>)
6305   // In this case the vector is the extract_subvector expression and the index
6306   // is 2, as specified by the shuffle.
6307   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
6308   SDValue ShuffleVec = SVOp->getOperand(0);
6309   MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
6310   assert(ShuffleVecVT.getVectorElementType() ==
6311          ExtractedFromVec.getSimpleValueType().getVectorElementType());
6312
6313   int ShuffleIdx = SVOp->getMaskElt(Idx);
6314   if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
6315     ExtractedFromVec = ShuffleVec;
6316     return ShuffleIdx;
6317   }
6318   return Idx;
6319 }
6320
6321 static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
6322   MVT VT = Op.getSimpleValueType();
6323
6324   // Skip if insert_vec_elt is not supported.
6325   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6326   if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
6327     return SDValue();
6328
6329   SDLoc DL(Op);
6330   unsigned NumElems = Op.getNumOperands();
6331
6332   SDValue VecIn1;
6333   SDValue VecIn2;
6334   SmallVector<unsigned, 4> InsertIndices;
6335   SmallVector<int, 8> Mask(NumElems, -1);
6336
6337   for (unsigned i = 0; i != NumElems; ++i) {
6338     unsigned Opc = Op.getOperand(i).getOpcode();
6339
6340     if (Opc == ISD::UNDEF)
6341       continue;
6342
6343     if (Opc != ISD::EXTRACT_VECTOR_ELT) {
6344       // Quit if more than 1 elements need inserting.
6345       if (InsertIndices.size() > 1)
6346         return SDValue();
6347
6348       InsertIndices.push_back(i);
6349       continue;
6350     }
6351
6352     SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
6353     SDValue ExtIdx = Op.getOperand(i).getOperand(1);
6354     // Quit if non-constant index.
6355     if (!isa<ConstantSDNode>(ExtIdx))
6356       return SDValue();
6357     int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
6358
6359     // Quit if extracted from vector of different type.
6360     if (ExtractedFromVec.getValueType() != VT)
6361       return SDValue();
6362
6363     if (!VecIn1.getNode())
6364       VecIn1 = ExtractedFromVec;
6365     else if (VecIn1 != ExtractedFromVec) {
6366       if (!VecIn2.getNode())
6367         VecIn2 = ExtractedFromVec;
6368       else if (VecIn2 != ExtractedFromVec)
6369         // Quit if more than 2 vectors to shuffle
6370         return SDValue();
6371     }
6372
6373     if (ExtractedFromVec == VecIn1)
6374       Mask[i] = Idx;
6375     else if (ExtractedFromVec == VecIn2)
6376       Mask[i] = Idx + NumElems;
6377   }
6378
6379   if (!VecIn1.getNode())
6380     return SDValue();
6381
6382   VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
6383   SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, &Mask[0]);
6384   for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) {
6385     unsigned Idx = InsertIndices[i];
6386     NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
6387                      DAG.getIntPtrConstant(Idx));
6388   }
6389
6390   return NV;
6391 }
6392
6393 // Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
6394 SDValue
6395 X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
6396
6397   MVT VT = Op.getSimpleValueType();
6398   assert((VT.getVectorElementType() == MVT::i1) && (VT.getSizeInBits() <= 16) &&
6399          "Unexpected type in LowerBUILD_VECTORvXi1!");
6400
6401   SDLoc dl(Op);
6402   if (ISD::isBuildVectorAllZeros(Op.getNode())) {
6403     SDValue Cst = DAG.getTargetConstant(0, MVT::i1);
6404     SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
6405     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
6406   }
6407
6408   if (ISD::isBuildVectorAllOnes(Op.getNode())) {
6409     SDValue Cst = DAG.getTargetConstant(1, MVT::i1);
6410     SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
6411     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
6412   }
6413
6414   bool AllContants = true;
6415   uint64_t Immediate = 0;
6416   int NonConstIdx = -1;
6417   bool IsSplat = true;
6418   unsigned NumNonConsts = 0;
6419   unsigned NumConsts = 0;
6420   for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
6421     SDValue In = Op.getOperand(idx);
6422     if (In.getOpcode() == ISD::UNDEF)
6423       continue;
6424     if (!isa<ConstantSDNode>(In)) {
6425       AllContants = false;
6426       NonConstIdx = idx;
6427       NumNonConsts++;
6428     } else {
6429       NumConsts++;
6430       if (cast<ConstantSDNode>(In)->getZExtValue())
6431       Immediate |= (1ULL << idx);
6432     }
6433     if (In != Op.getOperand(0))
6434       IsSplat = false;
6435   }
6436
6437   if (AllContants) {
6438     SDValue FullMask = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1,
6439       DAG.getConstant(Immediate, MVT::i16));
6440     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, FullMask,
6441                        DAG.getIntPtrConstant(0));
6442   }
6443
6444   if (NumNonConsts == 1 && NonConstIdx != 0) {
6445     SDValue DstVec;
6446     if (NumConsts) {
6447       SDValue VecAsImm = DAG.getConstant(Immediate,
6448                                          MVT::getIntegerVT(VT.getSizeInBits()));
6449       DstVec = DAG.getNode(ISD::BITCAST, dl, VT, VecAsImm);
6450     }
6451     else
6452       DstVec = DAG.getUNDEF(VT);
6453     return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
6454                        Op.getOperand(NonConstIdx),
6455                        DAG.getIntPtrConstant(NonConstIdx));
6456   }
6457   if (!IsSplat && (NonConstIdx != 0))
6458     llvm_unreachable("Unsupported BUILD_VECTOR operation");
6459   MVT SelectVT = (VT == MVT::v16i1)? MVT::i16 : MVT::i8;
6460   SDValue Select;
6461   if (IsSplat)
6462     Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0),
6463                           DAG.getConstant(-1, SelectVT),
6464                           DAG.getConstant(0, SelectVT));
6465   else
6466     Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0),
6467                          DAG.getConstant((Immediate | 1), SelectVT),
6468                          DAG.getConstant(Immediate, SelectVT));
6469   return DAG.getNode(ISD::BITCAST, dl, VT, Select);
6470 }
6471
6472 /// \brief Return true if \p N implements a horizontal binop and return the
6473 /// operands for the horizontal binop into V0 and V1.
6474 ///
6475 /// This is a helper function of PerformBUILD_VECTORCombine.
6476 /// This function checks that the build_vector \p N in input implements a
6477 /// horizontal operation. Parameter \p Opcode defines the kind of horizontal
6478 /// operation to match.
6479 /// For example, if \p Opcode is equal to ISD::ADD, then this function
6480 /// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
6481 /// is equal to ISD::SUB, then this function checks if this is a horizontal
6482 /// arithmetic sub.
6483 ///
6484 /// This function only analyzes elements of \p N whose indices are
6485 /// in range [BaseIdx, LastIdx).
6486 static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
6487                               SelectionDAG &DAG,
6488                               unsigned BaseIdx, unsigned LastIdx,
6489                               SDValue &V0, SDValue &V1) {
6490   EVT VT = N->getValueType(0);
6491
6492   assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
6493   assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
6494          "Invalid Vector in input!");
6495
6496   bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
6497   bool CanFold = true;
6498   unsigned ExpectedVExtractIdx = BaseIdx;
6499   unsigned NumElts = LastIdx - BaseIdx;
6500   V0 = DAG.getUNDEF(VT);
6501   V1 = DAG.getUNDEF(VT);
6502
6503   // Check if N implements a horizontal binop.
6504   for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
6505     SDValue Op = N->getOperand(i + BaseIdx);
6506
6507     // Skip UNDEFs.
6508     if (Op->getOpcode() == ISD::UNDEF) {
6509       // Update the expected vector extract index.
6510       if (i * 2 == NumElts)
6511         ExpectedVExtractIdx = BaseIdx;
6512       ExpectedVExtractIdx += 2;
6513       continue;
6514     }
6515
6516     CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
6517
6518     if (!CanFold)
6519       break;
6520
6521     SDValue Op0 = Op.getOperand(0);
6522     SDValue Op1 = Op.getOperand(1);
6523
6524     // Try to match the following pattern:
6525     // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
6526     CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
6527         Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
6528         Op0.getOperand(0) == Op1.getOperand(0) &&
6529         isa<ConstantSDNode>(Op0.getOperand(1)) &&
6530         isa<ConstantSDNode>(Op1.getOperand(1)));
6531     if (!CanFold)
6532       break;
6533
6534     unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
6535     unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
6536
6537     if (i * 2 < NumElts) {
6538       if (V0.getOpcode() == ISD::UNDEF)
6539         V0 = Op0.getOperand(0);
6540     } else {
6541       if (V1.getOpcode() == ISD::UNDEF)
6542         V1 = Op0.getOperand(0);
6543       if (i * 2 == NumElts)
6544         ExpectedVExtractIdx = BaseIdx;
6545     }
6546
6547     SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
6548     if (I0 == ExpectedVExtractIdx)
6549       CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
6550     else if (IsCommutable && I1 == ExpectedVExtractIdx) {
6551       // Try to match the following dag sequence:
6552       // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
6553       CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
6554     } else
6555       CanFold = false;
6556
6557     ExpectedVExtractIdx += 2;
6558   }
6559
6560   return CanFold;
6561 }
6562
6563 /// \brief Emit a sequence of two 128-bit horizontal add/sub followed by
6564 /// a concat_vector.
6565 ///
6566 /// This is a helper function of PerformBUILD_VECTORCombine.
6567 /// This function expects two 256-bit vectors called V0 and V1.
6568 /// At first, each vector is split into two separate 128-bit vectors.
6569 /// Then, the resulting 128-bit vectors are used to implement two
6570 /// horizontal binary operations.
6571 ///
6572 /// The kind of horizontal binary operation is defined by \p X86Opcode.
6573 ///
6574 /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
6575 /// the two new horizontal binop.
6576 /// When Mode is set, the first horizontal binop dag node would take as input
6577 /// the lower 128-bit of V0 and the upper 128-bit of V0. The second
6578 /// horizontal binop dag node would take as input the lower 128-bit of V1
6579 /// and the upper 128-bit of V1.
6580 ///   Example:
6581 ///     HADD V0_LO, V0_HI
6582 ///     HADD V1_LO, V1_HI
6583 ///
6584 /// Otherwise, the first horizontal binop dag node takes as input the lower
6585 /// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
6586 /// dag node takes the the upper 128-bit of V0 and the upper 128-bit of V1.
6587 ///   Example:
6588 ///     HADD V0_LO, V1_LO
6589 ///     HADD V0_HI, V1_HI
6590 ///
6591 /// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
6592 /// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
6593 /// the upper 128-bits of the result.
6594 static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
6595                                      SDLoc DL, SelectionDAG &DAG,
6596                                      unsigned X86Opcode, bool Mode,
6597                                      bool isUndefLO, bool isUndefHI) {
6598   EVT VT = V0.getValueType();
6599   assert(VT.is256BitVector() && VT == V1.getValueType() &&
6600          "Invalid nodes in input!");
6601
6602   unsigned NumElts = VT.getVectorNumElements();
6603   SDValue V0_LO = Extract128BitVector(V0, 0, DAG, DL);
6604   SDValue V0_HI = Extract128BitVector(V0, NumElts/2, DAG, DL);
6605   SDValue V1_LO = Extract128BitVector(V1, 0, DAG, DL);
6606   SDValue V1_HI = Extract128BitVector(V1, NumElts/2, DAG, DL);
6607   EVT NewVT = V0_LO.getValueType();
6608
6609   SDValue LO = DAG.getUNDEF(NewVT);
6610   SDValue HI = DAG.getUNDEF(NewVT);
6611
6612   if (Mode) {
6613     // Don't emit a horizontal binop if the result is expected to be UNDEF.
6614     if (!isUndefLO && V0->getOpcode() != ISD::UNDEF)
6615       LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
6616     if (!isUndefHI && V1->getOpcode() != ISD::UNDEF)
6617       HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
6618   } else {
6619     // Don't emit a horizontal binop if the result is expected to be UNDEF.
6620     if (!isUndefLO && (V0_LO->getOpcode() != ISD::UNDEF ||
6621                        V1_LO->getOpcode() != ISD::UNDEF))
6622       LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
6623
6624     if (!isUndefHI && (V0_HI->getOpcode() != ISD::UNDEF ||
6625                        V1_HI->getOpcode() != ISD::UNDEF))
6626       HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
6627   }
6628
6629   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
6630 }
6631
6632 /// \brief Try to fold a build_vector that performs an 'addsub' into the
6633 /// sequence of 'vadd + vsub + blendi'.
6634 static SDValue matchAddSub(const BuildVectorSDNode *BV, SelectionDAG &DAG,
6635                            const X86Subtarget *Subtarget) {
6636   SDLoc DL(BV);
6637   EVT VT = BV->getValueType(0);
6638   unsigned NumElts = VT.getVectorNumElements();
6639   SDValue InVec0 = DAG.getUNDEF(VT);
6640   SDValue InVec1 = DAG.getUNDEF(VT);
6641
6642   assert((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
6643           VT == MVT::v2f64) && "build_vector with an invalid type found!");
6644
6645   // Odd-numbered elements in the input build vector are obtained from
6646   // adding two integer/float elements.
6647   // Even-numbered elements in the input build vector are obtained from
6648   // subtracting two integer/float elements.
6649   unsigned ExpectedOpcode = ISD::FSUB;
6650   unsigned NextExpectedOpcode = ISD::FADD;
6651   bool AddFound = false;
6652   bool SubFound = false;
6653
6654   for (unsigned i = 0, e = NumElts; i != e; ++i) {
6655     SDValue Op = BV->getOperand(i);
6656
6657     // Skip 'undef' values.
6658     unsigned Opcode = Op.getOpcode();
6659     if (Opcode == ISD::UNDEF) {
6660       std::swap(ExpectedOpcode, NextExpectedOpcode);
6661       continue;
6662     }
6663
6664     // Early exit if we found an unexpected opcode.
6665     if (Opcode != ExpectedOpcode)
6666       return SDValue();
6667
6668     SDValue Op0 = Op.getOperand(0);
6669     SDValue Op1 = Op.getOperand(1);
6670
6671     // Try to match the following pattern:
6672     // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
6673     // Early exit if we cannot match that sequence.
6674     if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6675         Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6676         !isa<ConstantSDNode>(Op0.getOperand(1)) ||
6677         !isa<ConstantSDNode>(Op1.getOperand(1)) ||
6678         Op0.getOperand(1) != Op1.getOperand(1))
6679       return SDValue();
6680
6681     unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
6682     if (I0 != i)
6683       return SDValue();
6684
6685     // We found a valid add/sub node. Update the information accordingly.
6686     if (i & 1)
6687       AddFound = true;
6688     else
6689       SubFound = true;
6690
6691     // Update InVec0 and InVec1.
6692     if (InVec0.getOpcode() == ISD::UNDEF)
6693       InVec0 = Op0.getOperand(0);
6694     if (InVec1.getOpcode() == ISD::UNDEF)
6695       InVec1 = Op1.getOperand(0);
6696
6697     // Make sure that operands in input to each add/sub node always
6698     // come from a same pair of vectors.
6699     if (InVec0 != Op0.getOperand(0)) {
6700       if (ExpectedOpcode == ISD::FSUB)
6701         return SDValue();
6702
6703       // FADD is commutable. Try to commute the operands
6704       // and then test again.
6705       std::swap(Op0, Op1);
6706       if (InVec0 != Op0.getOperand(0))
6707         return SDValue();
6708     }
6709
6710     if (InVec1 != Op1.getOperand(0))
6711       return SDValue();
6712
6713     // Update the pair of expected opcodes.
6714     std::swap(ExpectedOpcode, NextExpectedOpcode);
6715   }
6716
6717   // Don't try to fold this build_vector into an ADDSUB if the inputs are undef.
6718   if (AddFound && SubFound && InVec0.getOpcode() != ISD::UNDEF &&
6719       InVec1.getOpcode() != ISD::UNDEF)
6720     return DAG.getNode(X86ISD::ADDSUB, DL, VT, InVec0, InVec1);
6721
6722   return SDValue();
6723 }
6724
6725 static SDValue PerformBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG,
6726                                           const X86Subtarget *Subtarget) {
6727   SDLoc DL(N);
6728   EVT VT = N->getValueType(0);
6729   unsigned NumElts = VT.getVectorNumElements();
6730   BuildVectorSDNode *BV = cast<BuildVectorSDNode>(N);
6731   SDValue InVec0, InVec1;
6732
6733   // Try to match an ADDSUB.
6734   if ((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
6735       (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
6736     SDValue Value = matchAddSub(BV, DAG, Subtarget);
6737     if (Value.getNode())
6738       return Value;
6739   }
6740
6741   // Try to match horizontal ADD/SUB.
6742   unsigned NumUndefsLO = 0;
6743   unsigned NumUndefsHI = 0;
6744   unsigned Half = NumElts/2;
6745
6746   // Count the number of UNDEF operands in the build_vector in input.
6747   for (unsigned i = 0, e = Half; i != e; ++i)
6748     if (BV->getOperand(i)->getOpcode() == ISD::UNDEF)
6749       NumUndefsLO++;
6750
6751   for (unsigned i = Half, e = NumElts; i != e; ++i)
6752     if (BV->getOperand(i)->getOpcode() == ISD::UNDEF)
6753       NumUndefsHI++;
6754
6755   // Early exit if this is either a build_vector of all UNDEFs or all the
6756   // operands but one are UNDEF.
6757   if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
6758     return SDValue();
6759
6760   if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget->hasSSE3()) {
6761     // Try to match an SSE3 float HADD/HSUB.
6762     if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
6763       return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
6764
6765     if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
6766       return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
6767   } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget->hasSSSE3()) {
6768     // Try to match an SSSE3 integer HADD/HSUB.
6769     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
6770       return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);
6771
6772     if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
6773       return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
6774   }
6775
6776   if (!Subtarget->hasAVX())
6777     return SDValue();
6778
6779   if ((VT == MVT::v8f32 || VT == MVT::v4f64)) {
6780     // Try to match an AVX horizontal add/sub of packed single/double
6781     // precision floating point values from 256-bit vectors.
6782     SDValue InVec2, InVec3;
6783     if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
6784         isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
6785         ((InVec0.getOpcode() == ISD::UNDEF ||
6786           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
6787         ((InVec1.getOpcode() == ISD::UNDEF ||
6788           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
6789       return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
6790
6791     if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
6792         isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
6793         ((InVec0.getOpcode() == ISD::UNDEF ||
6794           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
6795         ((InVec1.getOpcode() == ISD::UNDEF ||
6796           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
6797       return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
6798   } else if (VT == MVT::v8i32 || VT == MVT::v16i16) {
6799     // Try to match an AVX2 horizontal add/sub of signed integers.
6800     SDValue InVec2, InVec3;
6801     unsigned X86Opcode;
6802     bool CanFold = true;
6803
6804     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
6805         isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
6806         ((InVec0.getOpcode() == ISD::UNDEF ||
6807           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
6808         ((InVec1.getOpcode() == ISD::UNDEF ||
6809           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
6810       X86Opcode = X86ISD::HADD;
6811     else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
6812         isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) &&
6813         ((InVec0.getOpcode() == ISD::UNDEF ||
6814           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
6815         ((InVec1.getOpcode() == ISD::UNDEF ||
6816           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
6817       X86Opcode = X86ISD::HSUB;
6818     else
6819       CanFold = false;
6820
6821     if (CanFold) {
6822       // Fold this build_vector into a single horizontal add/sub.
6823       // Do this only if the target has AVX2.
6824       if (Subtarget->hasAVX2())
6825         return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);
6826
6827       // Do not try to expand this build_vector into a pair of horizontal
6828       // add/sub if we can emit a pair of scalar add/sub.
6829       if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
6830         return SDValue();
6831
6832       // Convert this build_vector into a pair of horizontal binop followed by
6833       // a concat vector.
6834       bool isUndefLO = NumUndefsLO == Half;
6835       bool isUndefHI = NumUndefsHI == Half;
6836       return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
6837                                    isUndefLO, isUndefHI);
6838     }
6839   }
6840
6841   if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
6842        VT == MVT::v16i16) && Subtarget->hasAVX()) {
6843     unsigned X86Opcode;
6844     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
6845       X86Opcode = X86ISD::HADD;
6846     else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
6847       X86Opcode = X86ISD::HSUB;
6848     else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
6849       X86Opcode = X86ISD::FHADD;
6850     else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
6851       X86Opcode = X86ISD::FHSUB;
6852     else
6853       return SDValue();
6854
6855     // Don't try to expand this build_vector into a pair of horizontal add/sub
6856     // if we can simply emit a pair of scalar add/sub.
6857     if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
6858       return SDValue();
6859
6860     // Convert this build_vector into two horizontal add/sub followed by
6861     // a concat vector.
6862     bool isUndefLO = NumUndefsLO == Half;
6863     bool isUndefHI = NumUndefsHI == Half;
6864     return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
6865                                  isUndefLO, isUndefHI);
6866   }
6867
6868   return SDValue();
6869 }
6870
6871 SDValue
6872 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
6873   SDLoc dl(Op);
6874
6875   MVT VT = Op.getSimpleValueType();
6876   MVT ExtVT = VT.getVectorElementType();
6877   unsigned NumElems = Op.getNumOperands();
6878
6879   // Generate vectors for predicate vectors.
6880   if (VT.getScalarType() == MVT::i1 && Subtarget->hasAVX512())
6881     return LowerBUILD_VECTORvXi1(Op, DAG);
6882
6883   // Vectors containing all zeros can be matched by pxor and xorps later
6884   if (ISD::isBuildVectorAllZeros(Op.getNode())) {
6885     // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
6886     // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
6887     if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
6888       return Op;
6889
6890     return getZeroVector(VT, Subtarget, DAG, dl);
6891   }
6892
6893   // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
6894   // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
6895   // vpcmpeqd on 256-bit vectors.
6896   if (Subtarget->hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
6897     if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasInt256()))
6898       return Op;
6899
6900     if (!VT.is512BitVector())
6901       return getOnesVector(VT, Subtarget->hasInt256(), DAG, dl);
6902   }
6903
6904   SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG);
6905   if (Broadcast.getNode())
6906     return Broadcast;
6907
6908   unsigned EVTBits = ExtVT.getSizeInBits();
6909
6910   unsigned NumZero  = 0;
6911   unsigned NumNonZero = 0;
6912   unsigned NonZeros = 0;
6913   bool IsAllConstants = true;
6914   SmallSet<SDValue, 8> Values;
6915   for (unsigned i = 0; i < NumElems; ++i) {
6916     SDValue Elt = Op.getOperand(i);
6917     if (Elt.getOpcode() == ISD::UNDEF)
6918       continue;
6919     Values.insert(Elt);
6920     if (Elt.getOpcode() != ISD::Constant &&
6921         Elt.getOpcode() != ISD::ConstantFP)
6922       IsAllConstants = false;
6923     if (X86::isZeroNode(Elt))
6924       NumZero++;
6925     else {
6926       NonZeros |= (1 << i);
6927       NumNonZero++;
6928     }
6929   }
6930
6931   // All undef vector. Return an UNDEF.  All zero vectors were handled above.
6932   if (NumNonZero == 0)
6933     return DAG.getUNDEF(VT);
6934
6935   // Special case for single non-zero, non-undef, element.
6936   if (NumNonZero == 1) {
6937     unsigned Idx = countTrailingZeros(NonZeros);
6938     SDValue Item = Op.getOperand(Idx);
6939
6940     // If this is an insertion of an i64 value on x86-32, and if the top bits of
6941     // the value are obviously zero, truncate the value to i32 and do the
6942     // insertion that way.  Only do this if the value is non-constant or if the
6943     // value is a constant being inserted into element 0.  It is cheaper to do
6944     // a constant pool load than it is to do a movd + shuffle.
6945     if (ExtVT == MVT::i64 && !Subtarget->is64Bit() &&
6946         (!IsAllConstants || Idx == 0)) {
6947       if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
6948         // Handle SSE only.
6949         assert(VT == MVT::v2i64 && "Expected an SSE value type!");
6950         EVT VecVT = MVT::v4i32;
6951         unsigned VecElts = 4;
6952
6953         // Truncate the value (which may itself be a constant) to i32, and
6954         // convert it to a vector with movd (S2V+shuffle to zero extend).
6955         Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
6956         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
6957
6958         // If using the new shuffle lowering, just directly insert this.
6959         if (ExperimentalVectorShuffleLowering)
6960           return DAG.getNode(
6961               ISD::BITCAST, dl, VT,
6962               getShuffleVectorZeroOrUndef(Item, Idx * 2, true, Subtarget, DAG));
6963
6964         Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
6965
6966         // Now we have our 32-bit value zero extended in the low element of
6967         // a vector.  If Idx != 0, swizzle it into place.
6968         if (Idx != 0) {
6969           SmallVector<int, 4> Mask;
6970           Mask.push_back(Idx);
6971           for (unsigned i = 1; i != VecElts; ++i)
6972             Mask.push_back(i);
6973           Item = DAG.getVectorShuffle(VecVT, dl, Item, DAG.getUNDEF(VecVT),
6974                                       &Mask[0]);
6975         }
6976         return DAG.getNode(ISD::BITCAST, dl, VT, Item);
6977       }
6978     }
6979
6980     // If we have a constant or non-constant insertion into the low element of
6981     // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
6982     // the rest of the elements.  This will be matched as movd/movq/movss/movsd
6983     // depending on what the source datatype is.
6984     if (Idx == 0) {
6985       if (NumZero == 0)
6986         return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
6987
6988       if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
6989           (ExtVT == MVT::i64 && Subtarget->is64Bit())) {
6990         if (VT.is256BitVector() || VT.is512BitVector()) {
6991           SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl);
6992           return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec,
6993                              Item, DAG.getIntPtrConstant(0));
6994         }
6995         assert(VT.is128BitVector() && "Expected an SSE value type!");
6996         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
6997         // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
6998         return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
6999       }
7000
7001       if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
7002         Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
7003         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
7004         if (VT.is256BitVector()) {
7005           SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl);
7006           Item = Insert128BitVector(ZeroVec, Item, 0, DAG, dl);
7007         } else {
7008           assert(VT.is128BitVector() && "Expected an SSE value type!");
7009           Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
7010         }
7011         return DAG.getNode(ISD::BITCAST, dl, VT, Item);
7012       }
7013     }
7014
7015     // Is it a vector logical left shift?
7016     if (NumElems == 2 && Idx == 1 &&
7017         X86::isZeroNode(Op.getOperand(0)) &&
7018         !X86::isZeroNode(Op.getOperand(1))) {
7019       unsigned NumBits = VT.getSizeInBits();
7020       return getVShift(true, VT,
7021                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
7022                                    VT, Op.getOperand(1)),
7023                        NumBits/2, DAG, *this, dl);
7024     }
7025
7026     if (IsAllConstants) // Otherwise, it's better to do a constpool load.
7027       return SDValue();
7028
7029     // Otherwise, if this is a vector with i32 or f32 elements, and the element
7030     // is a non-constant being inserted into an element other than the low one,
7031     // we can't use a constant pool load.  Instead, use SCALAR_TO_VECTOR (aka
7032     // movd/movss) to move this into the low element, then shuffle it into
7033     // place.
7034     if (EVTBits == 32) {
7035       Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
7036
7037       // If using the new shuffle lowering, just directly insert this.
7038       if (ExperimentalVectorShuffleLowering)
7039         return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
7040
7041       // Turn it into a shuffle of zero and zero-extended scalar to vector.
7042       Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, Subtarget, DAG);
7043       SmallVector<int, 8> MaskVec;
7044       for (unsigned i = 0; i != NumElems; ++i)
7045         MaskVec.push_back(i == Idx ? 0 : 1);
7046       return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]);
7047     }
7048   }
7049
7050   // Splat is obviously ok. Let legalizer expand it to a shuffle.
7051   if (Values.size() == 1) {
7052     if (EVTBits == 32) {
7053       // Instead of a shuffle like this:
7054       // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
7055       // Check if it's possible to issue this instead.
7056       // shuffle (vload ptr)), undef, <1, 1, 1, 1>
7057       unsigned Idx = countTrailingZeros(NonZeros);
7058       SDValue Item = Op.getOperand(Idx);
7059       if (Op.getNode()->isOnlyUserOf(Item.getNode()))
7060         return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
7061     }
7062     return SDValue();
7063   }
7064
7065   // A vector full of immediates; various special cases are already
7066   // handled, so this is best done with a single constant-pool load.
7067   if (IsAllConstants)
7068     return SDValue();
7069
7070   // For AVX-length vectors, see if we can use a vector load to get all of the
7071   // elements, otherwise build the individual 128-bit pieces and use
7072   // shuffles to put them in place.
7073   if (VT.is256BitVector() || VT.is512BitVector()) {
7074     SmallVector<SDValue, 64> V;
7075     for (unsigned i = 0; i != NumElems; ++i)
7076       V.push_back(Op.getOperand(i));
7077
7078     // Check for a build vector of consecutive loads.
7079     if (SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false))
7080       return LD;
7081
7082     EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
7083
7084     // Build both the lower and upper subvector.
7085     SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
7086                                 makeArrayRef(&V[0], NumElems/2));
7087     SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
7088                                 makeArrayRef(&V[NumElems / 2], NumElems/2));
7089
7090     // Recreate the wider vector with the lower and upper part.
7091     if (VT.is256BitVector())
7092       return Concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
7093     return Concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
7094   }
7095
7096   // Let legalizer expand 2-wide build_vectors.
7097   if (EVTBits == 64) {
7098     if (NumNonZero == 1) {
7099       // One half is zero or undef.
7100       unsigned Idx = countTrailingZeros(NonZeros);
7101       SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
7102                                  Op.getOperand(Idx));
7103       return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
7104     }
7105     return SDValue();
7106   }
7107
7108   // If element VT is < 32 bits, convert it to inserts into a zero vector.
7109   if (EVTBits == 8 && NumElems == 16) {
7110     SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG,
7111                                         Subtarget, *this);
7112     if (V.getNode()) return V;
7113   }
7114
7115   if (EVTBits == 16 && NumElems == 8) {
7116     SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG,
7117                                       Subtarget, *this);
7118     if (V.getNode()) return V;
7119   }
7120
7121   // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
7122   if (EVTBits == 32 && NumElems == 4) {
7123     SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget, *this);
7124     if (V.getNode())
7125       return V;
7126   }
7127
7128   // If element VT is == 32 bits, turn it into a number of shuffles.
7129   SmallVector<SDValue, 8> V(NumElems);
7130   if (NumElems == 4 && NumZero > 0) {
7131     for (unsigned i = 0; i < 4; ++i) {
7132       bool isZero = !(NonZeros & (1 << i));
7133       if (isZero)
7134         V[i] = getZeroVector(VT, Subtarget, DAG, dl);
7135       else
7136         V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
7137     }
7138
7139     for (unsigned i = 0; i < 2; ++i) {
7140       switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
7141         default: break;
7142         case 0:
7143           V[i] = V[i*2];  // Must be a zero vector.
7144           break;
7145         case 1:
7146           V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]);
7147           break;
7148         case 2:
7149           V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]);
7150           break;
7151         case 3:
7152           V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]);
7153           break;
7154       }
7155     }
7156
7157     bool Reverse1 = (NonZeros & 0x3) == 2;
7158     bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
7159     int MaskVec[] = {
7160       Reverse1 ? 1 : 0,
7161       Reverse1 ? 0 : 1,
7162       static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
7163       static_cast<int>(Reverse2 ? NumElems   : NumElems+1)
7164     };
7165     return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]);
7166   }
7167
7168   if (Values.size() > 1 && VT.is128BitVector()) {
7169     // Check for a build vector of consecutive loads.
7170     for (unsigned i = 0; i < NumElems; ++i)
7171       V[i] = Op.getOperand(i);
7172
7173     // Check for elements which are consecutive loads.
7174     SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false);
7175     if (LD.getNode())
7176       return LD;
7177
7178     // Check for a build vector from mostly shuffle plus few inserting.
7179     SDValue Sh = buildFromShuffleMostly(Op, DAG);
7180     if (Sh.getNode())
7181       return Sh;
7182
7183     // For SSE 4.1, use insertps to put the high elements into the low element.
7184     if (Subtarget->hasSSE41()) {
7185       SDValue Result;
7186       if (Op.getOperand(0).getOpcode() != ISD::UNDEF)
7187         Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
7188       else
7189         Result = DAG.getUNDEF(VT);
7190
7191       for (unsigned i = 1; i < NumElems; ++i) {
7192         if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue;
7193         Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
7194                              Op.getOperand(i), DAG.getIntPtrConstant(i));
7195       }
7196       return Result;
7197     }
7198
7199     // Otherwise, expand into a number of unpckl*, start by extending each of
7200     // our (non-undef) elements to the full vector width with the element in the
7201     // bottom slot of the vector (which generates no code for SSE).
7202     for (unsigned i = 0; i < NumElems; ++i) {
7203       if (Op.getOperand(i).getOpcode() != ISD::UNDEF)
7204         V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
7205       else
7206         V[i] = DAG.getUNDEF(VT);
7207     }
7208
7209     // Next, we iteratively mix elements, e.g. for v4f32:
7210     //   Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
7211     //         : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
7212     //   Step 2: unpcklps X, Y ==>    <3, 2, 1, 0>
7213     unsigned EltStride = NumElems >> 1;
7214     while (EltStride != 0) {
7215       for (unsigned i = 0; i < EltStride; ++i) {
7216         // If V[i+EltStride] is undef and this is the first round of mixing,
7217         // then it is safe to just drop this shuffle: V[i] is already in the
7218         // right place, the one element (since it's the first round) being
7219         // inserted as undef can be dropped.  This isn't safe for successive
7220         // rounds because they will permute elements within both vectors.
7221         if (V[i+EltStride].getOpcode() == ISD::UNDEF &&
7222             EltStride == NumElems/2)
7223           continue;
7224
7225         V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]);
7226       }
7227       EltStride >>= 1;
7228     }
7229     return V[0];
7230   }
7231   return SDValue();
7232 }
7233
7234 // LowerAVXCONCAT_VECTORS - 256-bit AVX can use the vinsertf128 instruction
7235 // to create 256-bit vectors from two other 128-bit ones.
7236 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
7237   SDLoc dl(Op);
7238   MVT ResVT = Op.getSimpleValueType();
7239
7240   assert((ResVT.is256BitVector() ||
7241           ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
7242
7243   SDValue V1 = Op.getOperand(0);
7244   SDValue V2 = Op.getOperand(1);
7245   unsigned NumElems = ResVT.getVectorNumElements();
7246   if(ResVT.is256BitVector())
7247     return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
7248
7249   if (Op.getNumOperands() == 4) {
7250     MVT HalfVT = MVT::getVectorVT(ResVT.getScalarType(),
7251                                 ResVT.getVectorNumElements()/2);
7252     SDValue V3 = Op.getOperand(2);
7253     SDValue V4 = Op.getOperand(3);
7254     return Concat256BitVectors(Concat128BitVectors(V1, V2, HalfVT, NumElems/2, DAG, dl),
7255       Concat128BitVectors(V3, V4, HalfVT, NumElems/2, DAG, dl), ResVT, NumElems, DAG, dl);
7256   }
7257   return Concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
7258 }
7259
7260 static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
7261   MVT LLVM_ATTRIBUTE_UNUSED VT = Op.getSimpleValueType();
7262   assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
7263          (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
7264           Op.getNumOperands() == 4)));
7265
7266   // AVX can use the vinsertf128 instruction to create 256-bit vectors
7267   // from two other 128-bit ones.
7268
7269   // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
7270   return LowerAVXCONCAT_VECTORS(Op, DAG);
7271 }
7272
7273
7274 //===----------------------------------------------------------------------===//
7275 // Vector shuffle lowering
7276 //
// This is an experimental code path for lowering vector shuffles on x86. It is
// designed to handle arbitrary vector shuffles and blends, gracefully
// degrading performance as necessary. It works hard to recognize idiomatic
// shuffles and lower them to optimal instruction patterns while staying
// within a framework that allows reasonably efficient handling of all vector
// shuffle patterns.
7283 //===----------------------------------------------------------------------===//
7284
7285 /// \brief Tiny helper function to identify a no-op mask.
7286 ///
7287 /// This is a somewhat boring predicate function. It checks whether the mask
7288 /// array input, which is assumed to be a single-input shuffle mask of the kind
7289 /// used by the X86 shuffle instructions (not a fully general
7290 /// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
7291 /// in-place shuffle are 'no-op's.
7292 static bool isNoopShuffleMask(ArrayRef<int> Mask) {
7293   for (int i = 0, Size = Mask.size(); i < Size; ++i)
7294     if (Mask[i] != -1 && Mask[i] != i)
7295       return false;
7296   return true;
7297 }
7298
7299 /// \brief Helper function to classify a mask as a single-input mask.
7300 ///
7301 /// This isn't a generic single-input test because in the vector shuffle
7302 /// lowering we canonicalize single inputs to be the first input operand. This
7303 /// means we can more quickly test for a single input by only checking whether
7304 /// an input from the second operand exists. We also assume that the size of
7305 /// mask corresponds to the size of the input vectors which isn't true in the
7306 /// fully general case.
7307 static bool isSingleInputShuffleMask(ArrayRef<int> Mask) {
7308   for (int M : Mask)
7309     if (M >= (int)Mask.size())
7310       return false;
7311   return true;
7312 }
7313
7314 /// \brief Test whether there are elements crossing 128-bit lanes in this
7315 /// shuffle mask.
7316 ///
7317 /// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
7318 /// and we routinely test for these.
7319 static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
7320   int LaneSize = 128 / VT.getScalarSizeInBits();
7321   int Size = Mask.size();
7322   for (int i = 0; i < Size; ++i)
7323     if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
7324       return true;
7325   return false;
7326 }
7327
7328 /// \brief Test whether a shuffle mask is equivalent within each 128-bit lane.
7329 ///
7330 /// This checks a shuffle mask to see if it is performing the same
7331 /// 128-bit lane-relative shuffle in each 128-bit lane. This trivially implies
7332 /// that it is also not lane-crossing. It may however involve a blend from the
7333 /// same lane of a second vector.
7334 ///
7335 /// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
7336 /// non-trivial to compute in the face of undef lanes. The representation is
7337 /// *not* suitable for use with existing 128-bit shuffles as it will contain
7338 /// entries from both V1 and V2 inputs to the wider mask.
7339 static bool
7340 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
7341                                 SmallVectorImpl<int> &RepeatedMask) {
7342   int LaneSize = 128 / VT.getScalarSizeInBits();
7343   RepeatedMask.resize(LaneSize, -1);
7344   int Size = Mask.size();
7345   for (int i = 0; i < Size; ++i) {
7346     if (Mask[i] < 0)
7347       continue;
7348     if ((Mask[i] % Size) / LaneSize != i / LaneSize)
7349       // This entry crosses lanes, so there is no way to model this shuffle.
7350       return false;
7351
7352     // Ok, handle the in-lane shuffles by detecting if and when they repeat.
7353     if (RepeatedMask[i % LaneSize] == -1)
7354       // This is the first non-undef entry in this slot of a 128-bit lane.
7355       RepeatedMask[i % LaneSize] =
7356           Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + Size;
7357     else if (RepeatedMask[i % LaneSize] + (i / LaneSize) * LaneSize != Mask[i])
7358       // Found a mismatch with the repeated mask.
7359       return false;
7360   }
7361   return true;
7362 }
7363
7364 // Hide this symbol with an anonymous namespace instead of 'static' so that MSVC
7365 // 2013 will allow us to use it as a non-type template parameter.
7366 namespace {
7367
7368 /// \brief Implementation of the \c isShuffleEquivalent variadic functor.
7369 ///
7370 /// See its documentation for details.
7371 bool isShuffleEquivalentImpl(SDValue V1, SDValue V2, ArrayRef<int> Mask,
7372                              ArrayRef<const int *> Args) {
7373   if (Mask.size() != Args.size())
7374     return false;
7375
7376   // If the values are build vectors, we can look through them to find
7377   // equivalent inputs that make the shuffles equivalent.
7378   auto *BV1 = dyn_cast<BuildVectorSDNode>(V1);
7379   auto *BV2 = dyn_cast<BuildVectorSDNode>(V2);
7380
7381   for (int i = 0, e = Mask.size(); i < e; ++i) {
7382     assert(*Args[i] >= 0 && "Arguments must be positive integers!");
7383     if (Mask[i] != -1 && Mask[i] != *Args[i]) {
7384       auto *MaskBV = Mask[i] < e ? BV1 : BV2;
7385       auto *ArgsBV = *Args[i] < e ? BV1 : BV2;
7386       if (!MaskBV || !ArgsBV ||
7387           MaskBV->getOperand(Mask[i] % e) != ArgsBV->getOperand(*Args[i] % e))
7388         return false;
7389     }
7390   }
7391   return true;
7392 }
7393
7394 } // namespace
7395
/// \brief Checks whether a shuffle mask is equivalent to an explicit list of
/// arguments.
///
/// This is a fast way to test a shuffle mask against a fixed pattern. The
/// functor takes the two shuffle inputs and the mask, followed by the expected
/// mask elements as a variadic list of ints:
///
///   if (isShuffleEquivalent(V1, V2, Mask, 3, 2, 1, 0)) { ... }
///
/// It returns true if the mask is exactly as wide as the argument list, and
/// each element of the mask is either -1 (signifying undef), the value given
/// in the argument, or -- when the inputs referenced are build vectors -- an
/// index that selects the same build-vector operand as the value given in the
/// argument.
static const VariadicFunction3<bool, SDValue, SDValue, ArrayRef<int>, int,
                               isShuffleEquivalentImpl> isShuffleEquivalent =
    {};
7409
7410 /// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
7411 ///
7412 /// This helper function produces an 8-bit shuffle immediate corresponding to
7413 /// the ubiquitous shuffle encoding scheme used in x86 instructions for
7414 /// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
7415 /// example.
7416 ///
7417 /// NB: We rely heavily on "undef" masks preserving the input lane.
7418 static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask,
7419                                           SelectionDAG &DAG) {
7420   assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
7421   assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
7422   assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
7423   assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
7424   assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
7425
7426   unsigned Imm = 0;
7427   Imm |= (Mask[0] == -1 ? 0 : Mask[0]) << 0;
7428   Imm |= (Mask[1] == -1 ? 1 : Mask[1]) << 2;
7429   Imm |= (Mask[2] == -1 ? 2 : Mask[2]) << 4;
7430   Imm |= (Mask[3] == -1 ? 3 : Mask[3]) << 6;
7431   return DAG.getConstant(Imm, MVT::i8);
7432 }
7433
7434 /// \brief Try to emit a blend instruction for a shuffle.
7435 ///
7436 /// This doesn't do any checks for the availability of instructions for blending
7437 /// these values. It relies on the availability of the X86ISD::BLENDI pattern to
7438 /// be matched in the backend with the type given. What it does check for is
7439 /// that the shuffle mask is in fact a blend.
7440 static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
7441                                          SDValue V2, ArrayRef<int> Mask,
7442                                          const X86Subtarget *Subtarget,
7443                                          SelectionDAG &DAG) {
7444
7445   unsigned BlendMask = 0;
7446   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7447     if (Mask[i] >= Size) {
7448       if (Mask[i] != i + Size)
7449         return SDValue(); // Shuffled V2 input!
7450       BlendMask |= 1u << i;
7451       continue;
7452     }
7453     if (Mask[i] >= 0 && Mask[i] != i)
7454       return SDValue(); // Shuffled V1 input!
7455   }
7456   switch (VT.SimpleTy) {
7457   case MVT::v2f64:
7458   case MVT::v4f32:
7459   case MVT::v4f64:
7460   case MVT::v8f32:
7461     return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
7462                        DAG.getConstant(BlendMask, MVT::i8));
7463
7464   case MVT::v4i64:
7465   case MVT::v8i32:
7466     assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
7467     // FALLTHROUGH
7468   case MVT::v2i64:
7469   case MVT::v4i32:
7470     // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
7471     // that instruction.
7472     if (Subtarget->hasAVX2()) {
7473       // Scale the blend by the number of 32-bit dwords per element.
7474       int Scale =  VT.getScalarSizeInBits() / 32;
7475       BlendMask = 0;
7476       for (int i = 0, Size = Mask.size(); i < Size; ++i)
7477         if (Mask[i] >= Size)
7478           for (int j = 0; j < Scale; ++j)
7479             BlendMask |= 1u << (i * Scale + j);
7480
7481       MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
7482       V1 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V1);
7483       V2 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V2);
7484       return DAG.getNode(ISD::BITCAST, DL, VT,
7485                          DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
7486                                      DAG.getConstant(BlendMask, MVT::i8)));
7487     }
7488     // FALLTHROUGH
7489   case MVT::v8i16: {
7490     // For integer shuffles we need to expand the mask and cast the inputs to
7491     // v8i16s prior to blending.
7492     int Scale = 8 / VT.getVectorNumElements();
7493     BlendMask = 0;
7494     for (int i = 0, Size = Mask.size(); i < Size; ++i)
7495       if (Mask[i] >= Size)
7496         for (int j = 0; j < Scale; ++j)
7497           BlendMask |= 1u << (i * Scale + j);
7498
7499     V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1);
7500     V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V2);
7501     return DAG.getNode(ISD::BITCAST, DL, VT,
7502                        DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
7503                                    DAG.getConstant(BlendMask, MVT::i8)));
7504   }
7505
7506   case MVT::v16i16: {
7507     assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
7508     SmallVector<int, 8> RepeatedMask;
7509     if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
7510       // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
7511       assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
7512       BlendMask = 0;
7513       for (int i = 0; i < 8; ++i)
7514         if (RepeatedMask[i] >= 16)
7515           BlendMask |= 1u << i;
7516       return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
7517                          DAG.getConstant(BlendMask, MVT::i8));
7518     }
7519   }
7520     // FALLTHROUGH
7521   case MVT::v32i8: {
7522     assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
7523     // Scale the blend by the number of bytes per element.
7524     int Scale =  VT.getScalarSizeInBits() / 8;
7525     assert(Mask.size() * Scale == 32 && "Not a 256-bit vector!");
7526
7527     // Compute the VSELECT mask. Note that VSELECT is really confusing in the
7528     // mix of LLVM's code generator and the x86 backend. We tell the code
7529     // generator that boolean values in the elements of an x86 vector register
7530     // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
7531     // mapping a select to operand #1, and 'false' mapping to operand #2. The
7532     // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
7533     // of the element (the remaining are ignored) and 0 in that high bit would
7534     // mean operand #1 while 1 in the high bit would mean operand #2. So while
7535     // the LLVM model for boolean values in vector elements gets the relevant
7536     // bit set, it is set backwards and over constrained relative to x86's
7537     // actual model.
7538     SDValue VSELECTMask[32];
7539     for (int i = 0, Size = Mask.size(); i < Size; ++i)
7540       for (int j = 0; j < Scale; ++j)
7541         VSELECTMask[Scale * i + j] =
7542             Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
7543                         : DAG.getConstant(Mask[i] < Size ? -1 : 0, MVT::i8);
7544
7545     V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V1);
7546     V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V2);
7547     return DAG.getNode(
7548         ISD::BITCAST, DL, VT,
7549         DAG.getNode(ISD::VSELECT, DL, MVT::v32i8,
7550                     DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, VSELECTMask),
7551                     V1, V2));
7552   }
7553
7554   default:
7555     llvm_unreachable("Not a supported integer vector type!");
7556   }
7557 }
7558
7559 /// \brief Try to lower as a blend of elements from two inputs followed by
7560 /// a single-input permutation.
7561 ///
7562 /// This matches the pattern where we can blend elements from two inputs and
7563 /// then reduce the shuffle to a single-input permutation.
7564 static SDValue lowerVectorShuffleAsBlendAndPermute(SDLoc DL, MVT VT, SDValue V1,
7565                                                    SDValue V2,
7566                                                    ArrayRef<int> Mask,
7567                                                    SelectionDAG &DAG) {
7568   // We build up the blend mask while checking whether a blend is a viable way
7569   // to reduce the shuffle.
7570   SmallVector<int, 32> BlendMask(Mask.size(), -1);
7571   SmallVector<int, 32> PermuteMask(Mask.size(), -1);
7572
7573   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7574     if (Mask[i] < 0)
7575       continue;
7576
7577     assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
7578
7579     if (BlendMask[Mask[i] % Size] == -1)
7580       BlendMask[Mask[i] % Size] = Mask[i];
7581     else if (BlendMask[Mask[i] % Size] != Mask[i])
7582       return SDValue(); // Can't blend in the needed input!
7583
7584     PermuteMask[i] = Mask[i] % Size;
7585   }
7586
7587   SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
7588   return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
7589 }
7590
7591 /// \brief Generic routine to decompose a shuffle and blend into indepndent
7592 /// blends and permutes.
7593 ///
7594 /// This matches the extremely common pattern for handling combined
7595 /// shuffle+blend operations on newer X86 ISAs where we have very fast blend
7596 /// operations. It will try to pick the best arrangement of shuffles and
7597 /// blends.
7598 static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(SDLoc DL, MVT VT,
7599                                                           SDValue V1,
7600                                                           SDValue V2,
7601                                                           ArrayRef<int> Mask,
7602                                                           SelectionDAG &DAG) {
7603   // Shuffle the input elements into the desired positions in V1 and V2 and
7604   // blend them together.
7605   SmallVector<int, 32> V1Mask(Mask.size(), -1);
7606   SmallVector<int, 32> V2Mask(Mask.size(), -1);
7607   SmallVector<int, 32> BlendMask(Mask.size(), -1);
7608   for (int i = 0, Size = Mask.size(); i < Size; ++i)
7609     if (Mask[i] >= 0 && Mask[i] < Size) {
7610       V1Mask[i] = Mask[i];
7611       BlendMask[i] = i;
7612     } else if (Mask[i] >= Size) {
7613       V2Mask[i] = Mask[i] - Size;
7614       BlendMask[i] = i + Size;
7615     }
7616
7617   // Try to lower with the simpler initial blend strategy unless one of the
7618   // input shuffles would be a no-op. We prefer to shuffle inputs as the
7619   // shuffle may be able to fold with a load or other benefit. However, when
7620   // we'll have to do 2x as many shuffles in order to achieve this, blending
7621   // first is a better strategy.
7622   if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask))
7623     if (SDValue BlendPerm =
7624             lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG))
7625       return BlendPerm;
7626
7627   V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
7628   V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
7629   return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
7630 }
7631
7632 /// \brief Try to lower a vector shuffle as a byte rotation.
7633 ///
7634 /// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
7635 /// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
7636 /// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
7637 /// try to generically lower a vector shuffle through such an pattern. It
7638 /// does not check for the profitability of lowering either as PALIGNR or
7639 /// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
7640 /// This matches shuffle vectors that look like:
7641 ///
7642 ///   v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
7643 ///
7644 /// Essentially it concatenates V1 and V2, shifts right by some number of
7645 /// elements, and takes the low elements as the result. Note that while this is
7646 /// specified as a *right shift* because x86 is little-endian, it is a *left
7647 /// rotate* of the vector lanes.
7648 ///
7649 /// Note that this only handles 128-bit vector widths currently.
7650 static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1,
7651                                               SDValue V2,
7652                                               ArrayRef<int> Mask,
7653                                               const X86Subtarget *Subtarget,
7654                                               SelectionDAG &DAG) {
7655   assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
7656
7657   // We need to detect various ways of spelling a rotation:
7658   //   [11, 12, 13, 14, 15,  0,  1,  2]
7659   //   [-1, 12, 13, 14, -1, -1,  1, -1]
7660   //   [-1, -1, -1, -1, -1, -1,  1,  2]
7661   //   [ 3,  4,  5,  6,  7,  8,  9, 10]
7662   //   [-1,  4,  5,  6, -1, -1,  9, -1]
7663   //   [-1,  4,  5,  6, -1, -1, -1, -1]
7664   int Rotation = 0;
7665   SDValue Lo, Hi;
7666   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7667     if (Mask[i] == -1)
7668       continue;
7669     assert(Mask[i] >= 0 && "Only -1 is a valid negative mask element!");
7670
7671     // Based on the mod-Size value of this mask element determine where
7672     // a rotated vector would have started.
7673     int StartIdx = i - (Mask[i] % Size);
7674     if (StartIdx == 0)
7675       // The identity rotation isn't interesting, stop.
7676       return SDValue();
7677
7678     // If we found the tail of a vector the rotation must be the missing
7679     // front. If we found the head of a vector, it must be how much of the head.
7680     int CandidateRotation = StartIdx < 0 ? -StartIdx : Size - StartIdx;
7681
7682     if (Rotation == 0)
7683       Rotation = CandidateRotation;
7684     else if (Rotation != CandidateRotation)
7685       // The rotations don't match, so we can't match this mask.
7686       return SDValue();
7687
7688     // Compute which value this mask is pointing at.
7689     SDValue MaskV = Mask[i] < Size ? V1 : V2;
7690
7691     // Compute which of the two target values this index should be assigned to.
7692     // This reflects whether the high elements are remaining or the low elements
7693     // are remaining.
7694     SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
7695
7696     // Either set up this value if we've not encountered it before, or check
7697     // that it remains consistent.
7698     if (!TargetV)
7699       TargetV = MaskV;
7700     else if (TargetV != MaskV)
7701       // This may be a rotation, but it pulls from the inputs in some
7702       // unsupported interleaving.
7703       return SDValue();
7704   }
7705
7706   // Check that we successfully analyzed the mask, and normalize the results.
7707   assert(Rotation != 0 && "Failed to locate a viable rotation!");
7708   assert((Lo || Hi) && "Failed to find a rotated input vector!");
7709   if (!Lo)
7710     Lo = Hi;
7711   else if (!Hi)
7712     Hi = Lo;
7713
7714   assert(VT.getSizeInBits() == 128 &&
7715          "Rotate-based lowering only supports 128-bit lowering!");
7716   assert(Mask.size() <= 16 &&
7717          "Can shuffle at most 16 bytes in a 128-bit vector!");
7718
7719   // The actual rotate instruction rotates bytes, so we need to scale the
7720   // rotation based on how many bytes are in the vector.
7721   int Scale = 16 / Mask.size();
7722
7723   // SSSE3 targets can use the palignr instruction
7724   if (Subtarget->hasSSSE3()) {
7725     // Cast the inputs to v16i8 to match PALIGNR.
7726     Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Lo);
7727     Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Hi);
7728
7729     return DAG.getNode(ISD::BITCAST, DL, VT,
7730                        DAG.getNode(X86ISD::PALIGNR, DL, MVT::v16i8, Hi, Lo,
7731                                    DAG.getConstant(Rotation * Scale, MVT::i8)));
7732   }
7733
7734   // Default SSE2 implementation
7735   int LoByteShift = 16 - Rotation * Scale;
7736   int HiByteShift = Rotation * Scale;
7737
7738   // Cast the inputs to v2i64 to match PSLLDQ/PSRLDQ.
7739   Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Lo);
7740   Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Hi);
7741
7742   SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, Lo,
7743                                 DAG.getConstant(8 * LoByteShift, MVT::i8));
7744   SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, Hi,
7745                                 DAG.getConstant(8 * HiByteShift, MVT::i8));
7746   return DAG.getNode(ISD::BITCAST, DL, VT,
7747                      DAG.getNode(ISD::OR, DL, MVT::v2i64, LoShift, HiShift));
7748 }
7749
7750 /// \brief Compute whether each element of a shuffle is zeroable.
7751 ///
7752 /// A "zeroable" vector shuffle element is one which can be lowered to zero.
7753 /// Either it is an undef element in the shuffle mask, the element of the input
7754 /// referenced is undef, or the element of the input referenced is known to be
7755 /// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
7756 /// as many lanes with this technique as possible to simplify the remaining
7757 /// shuffle.
7758 static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
7759                                                      SDValue V1, SDValue V2) {
7760   SmallBitVector Zeroable(Mask.size(), false);
7761
7762   while (V1.getOpcode() == ISD::BITCAST)
7763     V1 = V1->getOperand(0);
7764   while (V2.getOpcode() == ISD::BITCAST)
7765     V2 = V2->getOperand(0);
7766
7767   bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
7768   bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
7769
7770   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7771     int M = Mask[i];
7772     // Handle the easy cases.
7773     if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
7774       Zeroable[i] = true;
7775       continue;
7776     }
7777
7778     // If this is an index into a build_vector node (which has the same number
7779     // of elements), dig out the input value and use it.
7780     SDValue V = M < Size ? V1 : V2;
7781     if (V.getOpcode() != ISD::BUILD_VECTOR || Size != (int)V.getNumOperands())
7782       continue;
7783
7784     SDValue Input = V.getOperand(M % Size);
7785     // The UNDEF opcode check really should be dead code here, but not quite
7786     // worth asserting on (it isn't invalid, just unexpected).
7787     if (Input.getOpcode() == ISD::UNDEF || X86::isZeroNode(Input))
7788       Zeroable[i] = true;
7789   }
7790
7791   return Zeroable;
7792 }
7793
7794 /// \brief Try to emit a bitmask instruction for a shuffle.
7795 ///
7796 /// This handles cases where we can model a blend exactly as a bitmask due to
7797 /// one of the inputs being zeroable.
7798 static SDValue lowerVectorShuffleAsBitMask(SDLoc DL, MVT VT, SDValue V1,
7799                                            SDValue V2, ArrayRef<int> Mask,
7800                                            SelectionDAG &DAG) {
7801   MVT EltVT = VT.getScalarType();
7802   int NumEltBits = EltVT.getSizeInBits();
7803   MVT IntEltVT = MVT::getIntegerVT(NumEltBits);
7804   SDValue Zero = DAG.getConstant(0, IntEltVT);
7805   SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), IntEltVT);
7806   if (EltVT.isFloatingPoint()) {
7807     Zero = DAG.getNode(ISD::BITCAST, DL, EltVT, Zero);
7808     AllOnes = DAG.getNode(ISD::BITCAST, DL, EltVT, AllOnes);
7809   }
7810   SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
7811   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
7812   SDValue V;
7813   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7814     if (Zeroable[i])
7815       continue;
7816     if (Mask[i] % Size != i)
7817       return SDValue(); // Not a blend.
7818     if (!V)
7819       V = Mask[i] < Size ? V1 : V2;
7820     else if (V != (Mask[i] < Size ? V1 : V2))
7821       return SDValue(); // Can only let one input through the mask.
7822
7823     VMaskOps[i] = AllOnes;
7824   }
7825   if (!V)
7826     return SDValue(); // No non-zeroable elements!
7827
7828   SDValue VMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, VMaskOps);
7829   V = DAG.getNode(VT.isFloatingPoint()
7830                   ? (unsigned) X86ISD::FAND : (unsigned) ISD::AND,
7831                   DL, VT, V, VMask);
7832   return V;
7833 }
7834
7835 /// \brief Try to lower a vector shuffle as a byte shift (shifts in zeros).
7836 ///
7837 /// Attempts to match a shuffle mask against the PSRLDQ and PSLLDQ
7838 /// byte-shift instructions. The mask must consist of a shifted sequential
7839 /// shuffle from one of the input vectors and zeroable elements for the
7840 /// remaining 'shifted in' elements.
7841 static SDValue lowerVectorShuffleAsByteShift(SDLoc DL, MVT VT, SDValue V1,
7842                                              SDValue V2, ArrayRef<int> Mask,
7843                                              SelectionDAG &DAG) {
7844   assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
7845
7846   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
7847
7848   int NumElts = VT.getVectorNumElements();
7849   int NumLanes = VT.getSizeInBits() / 128;
7850   int NumLaneElts = NumElts / NumLanes;
7851   int Scale = 16 / NumLaneElts;
7852   MVT ShiftVT = MVT::getVectorVT(MVT::i64, 2 * NumLanes);
7853
7854   // PSLLDQ : (little-endian) left byte shift
7855   // [ zz,  0,  1,  2,  3,  4,  5,  6]
7856   // [ zz, zz, -1, -1,  2,  3,  4, -1]
7857   // [ zz, zz, zz, zz, zz, zz, -1,  1]
7858   // PSRLDQ : (little-endian) right byte shift
7859   // [  5, 6,  7, zz, zz, zz, zz, zz]
7860   // [ -1, 5,  6,  7, zz, zz, zz, zz]
7861   // [  1, 2, -1, -1, -1, -1, zz, zz]
7862   auto MatchByteShift = [&](int Shift) -> SDValue {
7863     bool MatchLeft = true, MatchRight = true;
7864     for (int l = 0; l < NumElts; l += NumLaneElts) {
7865       for (int i = 0; i < Shift; ++i)
7866         MatchLeft &= Zeroable[l + i];
7867       for (int i = NumLaneElts - Shift; i < NumLaneElts; ++i)
7868         MatchRight &= Zeroable[l + i];
7869     }
7870     if (!(MatchLeft || MatchRight))
7871       return SDValue();
7872
7873     bool MatchV1 = true, MatchV2 = true;
7874     for (int l = 0; l < NumElts; l += NumLaneElts) {
7875       unsigned Pos = MatchLeft ? Shift + l : l;
7876       unsigned Low = MatchLeft ? l : Shift + l;
7877       unsigned Len = NumLaneElts - Shift;
7878       MatchV1 &= isSequentialOrUndefInRange(Mask, Pos, Len, Low);
7879       MatchV2 &= isSequentialOrUndefInRange(Mask, Pos, Len, Low + NumElts);
7880     }
7881     if (!(MatchV1 || MatchV2))
7882       return SDValue();
7883
7884     int ByteShift = Shift * Scale;
7885     unsigned Op = MatchRight ? X86ISD::VSRLDQ : X86ISD::VSHLDQ;
7886     SDValue V = MatchV1 ? V1 : V2;
7887     V = DAG.getNode(ISD::BITCAST, DL, ShiftVT, V);
7888     V = DAG.getNode(Op, DL, ShiftVT, V,
7889                     DAG.getConstant(ByteShift * 8, MVT::i8));
7890     return DAG.getNode(ISD::BITCAST, DL, VT, V);
7891   };
7892
7893   for (int Shift = 1; Shift < NumLaneElts; ++Shift)
7894     if (SDValue S = MatchByteShift(Shift))
7895       return S;
7896
7897   // no match
7898   return SDValue();
7899 }
7900
7901 /// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros).
7902 ///
7903 /// Attempts to match a shuffle mask against the PSRL(W/D/Q) and PSLL(W/D/Q)
7904 /// SSE2 and AVX2 logical bit-shift instructions. The function matches
7905 /// elements from one of the input vectors shuffled to the left or right
7906 /// with zeroable elements 'shifted in'.
7907 static SDValue lowerVectorShuffleAsBitShift(SDLoc DL, MVT VT, SDValue V1,
7908                                             SDValue V2, ArrayRef<int> Mask,
7909                                             SelectionDAG &DAG) {
7910   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
7911
7912   int Size = Mask.size();
7913   assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
7914
7915   // PSRL : (little-endian) right bit shift.
7916   // [  1, zz,  3, zz]
7917   // [ -1, -1,  7, zz]
7918   // PSHL : (little-endian) left bit shift.
7919   // [ zz, 0, zz,  2 ]
7920   // [ -1, 4, zz, -1 ]
7921   auto MatchBitShift = [&](int Shift, int Scale) -> SDValue {
7922     MVT ShiftSVT = MVT::getIntegerVT(VT.getScalarSizeInBits() * Scale);
7923     MVT ShiftVT = MVT::getVectorVT(ShiftSVT, Size / Scale);
7924     assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
7925            "Illegal integer vector type");
7926
7927     bool MatchLeft = true, MatchRight = true;
7928     for (int i = 0; i != Size; i += Scale) {
7929       for (int j = 0; j != Shift; ++j) {
7930         MatchLeft &= Zeroable[i + j];
7931       }
7932       for (int j = Scale - Shift; j != Scale; ++j) {
7933         MatchRight &= Zeroable[i + j];
7934       }
7935     }
7936     if (!(MatchLeft || MatchRight))
7937       return SDValue();
7938
7939     bool MatchV1 = true, MatchV2 = true;
7940     for (int i = 0; i != Size; i += Scale) {
7941       unsigned Pos = MatchLeft ? i + Shift : i;
7942       unsigned Low = MatchLeft ? i : i + Shift;
7943       unsigned Len = Scale - Shift;
7944       MatchV1 &= isSequentialOrUndefInRange(Mask, Pos, Len, Low);
7945       MatchV2 &= isSequentialOrUndefInRange(Mask, Pos, Len, Low + Size);
7946     }
7947     if (!(MatchV1 || MatchV2))
7948       return SDValue();
7949
7950     // Cast the inputs to ShiftVT to match VSRLI/VSHLI and back again.
7951     unsigned OpCode = MatchLeft ? X86ISD::VSHLI : X86ISD::VSRLI;
7952     int ShiftAmt = Shift * VT.getScalarSizeInBits();
7953     SDValue V = MatchV1 ? V1 : V2;
7954     V = DAG.getNode(ISD::BITCAST, DL, ShiftVT, V);
7955     V = DAG.getNode(OpCode, DL, ShiftVT, V, DAG.getConstant(ShiftAmt, MVT::i8));
7956     return DAG.getNode(ISD::BITCAST, DL, VT, V);
7957   };
7958
7959   // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
7960   // keep doubling the size of the integer elements up to that. We can
7961   // then shift the elements of the integer vector by whole multiples of
7962   // their width within the elements of the larger integer vector. Test each
7963   // multiple to see if we can find a match with the moved element indices
7964   // and that the shifted in elements are all zeroable.
7965   for (int Scale = 2; Scale * VT.getScalarSizeInBits() <= 64; Scale *= 2)
7966     for (int Shift = 1; Shift != Scale; ++Shift)
7967       if (SDValue BitShift = MatchBitShift(Shift, Scale))
7968         return BitShift;
7969
7970   // no match
7971   return SDValue();
7972 }
7973
7974 /// \brief Lower a vector shuffle as a zero or any extension.
7975 ///
7976 /// Given a specific number of elements, element bit width, and extension
7977 /// stride, produce either a zero or any extension based on the available
7978 /// features of the subtarget.
7979 static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
7980     SDLoc DL, MVT VT, int Scale, bool AnyExt, SDValue InputV,
7981     const X86Subtarget *Subtarget, SelectionDAG &DAG) {
7982   assert(Scale > 1 && "Need a scale to extend.");
7983   int NumElements = VT.getVectorNumElements();
7984   int EltBits = VT.getScalarSizeInBits();
7985   assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
7986          "Only 8, 16, and 32 bit elements can be extended.");
7987   assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
7988
7989   // Found a valid zext mask! Try various lowering strategies based on the
7990   // input type and available ISA extensions.
7991   if (Subtarget->hasSSE41()) {
7992     MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
7993                                  NumElements / Scale);
7994     return DAG.getNode(ISD::BITCAST, DL, VT,
7995                        DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV));
7996   }
7997
7998   // For any extends we can cheat for larger element sizes and use shuffle
7999   // instructions that can fold with a load and/or copy.
8000   if (AnyExt && EltBits == 32) {
8001     int PSHUFDMask[4] = {0, -1, 1, -1};
8002     return DAG.getNode(
8003         ISD::BITCAST, DL, VT,
8004         DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
8005                     DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, InputV),
8006                     getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
8007   }
8008   if (AnyExt && EltBits == 16 && Scale > 2) {
8009     int PSHUFDMask[4] = {0, -1, 0, -1};
8010     InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
8011                          DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, InputV),
8012                          getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG));
8013     int PSHUFHWMask[4] = {1, -1, -1, -1};
8014     return DAG.getNode(
8015         ISD::BITCAST, DL, VT,
8016         DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16,
8017                     DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, InputV),
8018                     getV4X86ShuffleImm8ForMask(PSHUFHWMask, DAG)));
8019   }
8020
8021   // If this would require more than 2 unpack instructions to expand, use
8022   // pshufb when available. We can only use more than 2 unpack instructions
8023   // when zero extending i8 elements which also makes it easier to use pshufb.
8024   if (Scale > 4 && EltBits == 8 && Subtarget->hasSSSE3()) {
8025     assert(NumElements == 16 && "Unexpected byte vector width!");
8026     SDValue PSHUFBMask[16];
8027     for (int i = 0; i < 16; ++i)
8028       PSHUFBMask[i] =
8029           DAG.getConstant((i % Scale == 0) ? i / Scale : 0x80, MVT::i8);
8030     InputV = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, InputV);
8031     return DAG.getNode(ISD::BITCAST, DL, VT,
8032                        DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
8033                                    DAG.getNode(ISD::BUILD_VECTOR, DL,
8034                                                MVT::v16i8, PSHUFBMask)));
8035   }
8036
8037   // Otherwise emit a sequence of unpacks.
8038   do {
8039     MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
8040     SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
8041                          : getZeroVector(InputVT, Subtarget, DAG, DL);
8042     InputV = DAG.getNode(ISD::BITCAST, DL, InputVT, InputV);
8043     InputV = DAG.getNode(X86ISD::UNPCKL, DL, InputVT, InputV, Ext);
8044     Scale /= 2;
8045     EltBits *= 2;
8046     NumElements /= 2;
8047   } while (Scale > 1);
8048   return DAG.getNode(ISD::BITCAST, DL, VT, InputV);
8049 }
8050
8051 /// \brief Try to lower a vector shuffle as a zero extension on any microarch.
8052 ///
8053 /// This routine will try to do everything in its power to cleverly lower
8054 /// a shuffle which happens to match the pattern of a zero extend. It doesn't
8055 /// check for the profitability of this lowering,  it tries to aggressively
8056 /// match this pattern. It will use all of the micro-architectural details it
8057 /// can to emit an efficient lowering. It handles both blends with all-zero
8058 /// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
8059 /// masking out later).
8060 ///
8061 /// The reason we have dedicated lowering for zext-style shuffles is that they
8062 /// are both incredibly common and often quite performance sensitive.
8063 static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
8064     SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
8065     const X86Subtarget *Subtarget, SelectionDAG &DAG) {
8066   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
8067
8068   int Bits = VT.getSizeInBits();
8069   int NumElements = VT.getVectorNumElements();
8070   assert(VT.getScalarSizeInBits() <= 32 &&
8071          "Exceeds 32-bit integer zero extension limit");
8072   assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
8073
8074   // Define a helper function to check a particular ext-scale and lower to it if
8075   // valid.
8076   auto Lower = [&](int Scale) -> SDValue {
8077     SDValue InputV;
8078     bool AnyExt = true;
8079     for (int i = 0; i < NumElements; ++i) {
8080       if (Mask[i] == -1)
8081         continue; // Valid anywhere but doesn't tell us anything.
8082       if (i % Scale != 0) {
8083         // Each of the extended elements need to be zeroable.
8084         if (!Zeroable[i])
8085           return SDValue();
8086
8087         // We no longer are in the anyext case.
8088         AnyExt = false;
8089         continue;
8090       }
8091
8092       // Each of the base elements needs to be consecutive indices into the
8093       // same input vector.
8094       SDValue V = Mask[i] < NumElements ? V1 : V2;
8095       if (!InputV)
8096         InputV = V;
8097       else if (InputV != V)
8098         return SDValue(); // Flip-flopping inputs.
8099
8100       if (Mask[i] % NumElements != i / Scale)
8101         return SDValue(); // Non-consecutive strided elements.
8102     }
8103
8104     // If we fail to find an input, we have a zero-shuffle which should always
8105     // have already been handled.
8106     // FIXME: Maybe handle this here in case during blending we end up with one?
8107     if (!InputV)
8108       return SDValue();
8109
8110     return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
8111         DL, VT, Scale, AnyExt, InputV, Subtarget, DAG);
8112   };
8113
8114   // The widest scale possible for extending is to a 64-bit integer.
8115   assert(Bits % 64 == 0 &&
8116          "The number of bits in a vector must be divisible by 64 on x86!");
8117   int NumExtElements = Bits / 64;
8118
8119   // Each iteration, try extending the elements half as much, but into twice as
8120   // many elements.
8121   for (; NumExtElements < NumElements; NumExtElements *= 2) {
8122     assert(NumElements % NumExtElements == 0 &&
8123            "The input vector size must be divisible by the extended size.");
8124     if (SDValue V = Lower(NumElements / NumExtElements))
8125       return V;
8126   }
8127
8128   // General extends failed, but 128-bit vectors may be able to use MOVQ.
8129   if (Bits != 128)
8130     return SDValue();
8131
8132   // Returns one of the source operands if the shuffle can be reduced to a
8133   // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
8134   auto CanZExtLowHalf = [&]() {
8135     for (int i = NumElements / 2; i != NumElements; ++i)
8136       if (!Zeroable[i])
8137         return SDValue();
8138     if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
8139       return V1;
8140     if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
8141       return V2;
8142     return SDValue();
8143   };
8144
8145   if (SDValue V = CanZExtLowHalf()) {
8146     V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, V);
8147     V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
8148     return DAG.getNode(ISD::BITCAST, DL, VT, V);
8149   }
8150
8151   // No viable ext lowering found.
8152   return SDValue();
8153 }
8154
8155 /// \brief Try to get a scalar value for a specific element of a vector.
8156 ///
8157 /// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
8158 static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
8159                                               SelectionDAG &DAG) {
8160   MVT VT = V.getSimpleValueType();
8161   MVT EltVT = VT.getVectorElementType();
8162   while (V.getOpcode() == ISD::BITCAST)
8163     V = V.getOperand(0);
8164   // If the bitcasts shift the element size, we can't extract an equivalent
8165   // element from it.
8166   MVT NewVT = V.getSimpleValueType();
8167   if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
8168     return SDValue();
8169
8170   if (V.getOpcode() == ISD::BUILD_VECTOR ||
8171       (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR))
8172     return DAG.getNode(ISD::BITCAST, SDLoc(V), EltVT, V.getOperand(Idx));
8173
8174   return SDValue();
8175 }
8176
8177 /// \brief Helper to test for a load that can be folded with x86 shuffles.
8178 ///
8179 /// This is particularly important because the set of instructions varies
8180 /// significantly based on whether the operand is a load or not.
8181 static bool isShuffleFoldableLoad(SDValue V) {
8182   while (V.getOpcode() == ISD::BITCAST)
8183     V = V.getOperand(0);
8184
8185   return ISD::isNON_EXTLoad(V.getNode());
8186 }
8187
8188 /// \brief Try to lower insertion of a single element into a zero vector.
8189 ///
8190 /// This is a common pattern that we have especially efficient patterns to lower
8191 /// across all subtarget feature sets.
8192 static SDValue lowerVectorShuffleAsElementInsertion(
8193     MVT VT, SDLoc DL, SDValue V1, SDValue V2, ArrayRef<int> Mask,
8194     const X86Subtarget *Subtarget, SelectionDAG &DAG) {
8195   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
8196   MVT ExtVT = VT;
8197   MVT EltVT = VT.getVectorElementType();
8198
8199   int V2Index = std::find_if(Mask.begin(), Mask.end(),
8200                              [&Mask](int M) { return M >= (int)Mask.size(); }) -
8201                 Mask.begin();
8202   bool IsV1Zeroable = true;
8203   for (int i = 0, Size = Mask.size(); i < Size; ++i)
8204     if (i != V2Index && !Zeroable[i]) {
8205       IsV1Zeroable = false;
8206       break;
8207     }
8208
8209   // Check for a single input from a SCALAR_TO_VECTOR node.
8210   // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
8211   // all the smarts here sunk into that routine. However, the current
8212   // lowering of BUILD_VECTOR makes that nearly impossible until the old
8213   // vector shuffle lowering is dead.
8214   if (SDValue V2S = getScalarValueForVectorElement(
8215           V2, Mask[V2Index] - Mask.size(), DAG)) {
8216     // We need to zext the scalar if it is smaller than an i32.
8217     V2S = DAG.getNode(ISD::BITCAST, DL, EltVT, V2S);
8218     if (EltVT == MVT::i8 || EltVT == MVT::i16) {
8219       // Using zext to expand a narrow element won't work for non-zero
8220       // insertions.
8221       if (!IsV1Zeroable)
8222         return SDValue();
8223
8224       // Zero-extend directly to i32.
8225       ExtVT = MVT::v4i32;
8226       V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
8227     }
8228     V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
8229   } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
8230              EltVT == MVT::i16) {
8231     // Either not inserting from the low element of the input or the input
8232     // element size is too small to use VZEXT_MOVL to clear the high bits.
8233     return SDValue();
8234   }
8235
8236   if (!IsV1Zeroable) {
8237     // If V1 can't be treated as a zero vector we have fewer options to lower
8238     // this. We can't support integer vectors or non-zero targets cheaply, and
8239     // the V1 elements can't be permuted in any way.
8240     assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
8241     if (!VT.isFloatingPoint() || V2Index != 0)
8242       return SDValue();
8243     SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
8244     V1Mask[V2Index] = -1;
8245     if (!isNoopShuffleMask(V1Mask))
8246       return SDValue();
8247     // This is essentially a special case blend operation, but if we have
8248     // general purpose blend operations, they are always faster. Bail and let
8249     // the rest of the lowering handle these as blends.
8250     if (Subtarget->hasSSE41())
8251       return SDValue();
8252
8253     // Otherwise, use MOVSD or MOVSS.
8254     assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
8255            "Only two types of floating point element types to handle!");
8256     return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
8257                        ExtVT, V1, V2);
8258   }
8259
8260   // This lowering only works for the low element with floating point vectors.
8261   if (VT.isFloatingPoint() && V2Index != 0)
8262     return SDValue();
8263
8264   V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
8265   if (ExtVT != VT)
8266     V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2);
8267
8268   if (V2Index != 0) {
8269     // If we have 4 or fewer lanes we can cheaply shuffle the element into
8270     // the desired position. Otherwise it is more efficient to do a vector
8271     // shift left. We know that we can do a vector shift left because all
8272     // the inputs are zero.
8273     if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
8274       SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
8275       V2Shuffle[V2Index] = 0;
8276       V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
8277     } else {
8278       V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, V2);
8279       V2 = DAG.getNode(
8280           X86ISD::VSHLDQ, DL, MVT::v2i64, V2,
8281           DAG.getConstant(
8282               V2Index * EltVT.getSizeInBits(),
8283               DAG.getTargetLoweringInfo().getScalarShiftAmountTy(MVT::v2i64)));
8284       V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2);
8285     }
8286   }
8287   return V2;
8288 }
8289
8290 /// \brief Try to lower broadcast of a single element.
8291 ///
8292 /// For convenience, this code also bundles all of the subtarget feature set
8293 /// filtering. While a little annoying to re-dispatch on type here, there isn't
8294 /// a convenient way to factor it out.
8295 static SDValue lowerVectorShuffleAsBroadcast(MVT VT, SDLoc DL, SDValue V,
8296                                              ArrayRef<int> Mask,
8297                                              const X86Subtarget *Subtarget,
8298                                              SelectionDAG &DAG) {
8299   if (!Subtarget->hasAVX())
8300     return SDValue();
8301   if (VT.isInteger() && !Subtarget->hasAVX2())
8302     return SDValue();
8303
8304   // Check that the mask is a broadcast.
8305   int BroadcastIdx = -1;
8306   for (int M : Mask)
8307     if (M >= 0 && BroadcastIdx == -1)
8308       BroadcastIdx = M;
8309     else if (M >= 0 && M != BroadcastIdx)
8310       return SDValue();
8311
8312   assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
8313                                             "a sorted mask where the broadcast "
8314                                             "comes from V1.");
8315
8316   // Go up the chain of (vector) values to try and find a scalar load that
8317   // we can combine with the broadcast.
8318   for (;;) {
8319     switch (V.getOpcode()) {
8320     case ISD::CONCAT_VECTORS: {
8321       int OperandSize = Mask.size() / V.getNumOperands();
8322       V = V.getOperand(BroadcastIdx / OperandSize);
8323       BroadcastIdx %= OperandSize;
8324       continue;
8325     }
8326
8327     case ISD::INSERT_SUBVECTOR: {
8328       SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
8329       auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
8330       if (!ConstantIdx)
8331         break;
8332
8333       int BeginIdx = (int)ConstantIdx->getZExtValue();
8334       int EndIdx =
8335           BeginIdx + (int)VInner.getValueType().getVectorNumElements();
8336       if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) {
8337         BroadcastIdx -= BeginIdx;
8338         V = VInner;
8339       } else {
8340         V = VOuter;
8341       }
8342       continue;
8343     }
8344     }
8345     break;
8346   }
8347
8348   // Check if this is a broadcast of a scalar. We special case lowering
8349   // for scalars so that we can more effectively fold with loads.
8350   if (V.getOpcode() == ISD::BUILD_VECTOR ||
8351       (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
8352     V = V.getOperand(BroadcastIdx);
8353
8354     // If the scalar isn't a load we can't broadcast from it in AVX1, only with
8355     // AVX2.
8356     if (!Subtarget->hasAVX2() && !isShuffleFoldableLoad(V))
8357       return SDValue();
8358   } else if (BroadcastIdx != 0 || !Subtarget->hasAVX2()) {
8359     // We can't broadcast from a vector register w/o AVX2, and we can only
8360     // broadcast from the zero-element of a vector register.
8361     return SDValue();
8362   }
8363
8364   return DAG.getNode(X86ISD::VBROADCAST, DL, VT, V);
8365 }
8366
8367 // Check for whether we can use INSERTPS to perform the shuffle. We only use
8368 // INSERTPS when the V1 elements are already in the correct locations
8369 // because otherwise we can just always use two SHUFPS instructions which
8370 // are much smaller to encode than a SHUFPS and an INSERTPS. We can also
8371 // perform INSERTPS if a single V1 element is out of place and all V2
8372 // elements are zeroable.
8373 static SDValue lowerVectorShuffleAsInsertPS(SDValue Op, SDValue V1, SDValue V2,
8374                                             ArrayRef<int> Mask,
8375                                             SelectionDAG &DAG) {
8376   assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!");
8377   assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
8378   assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
8379   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
8380
8381   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
8382
8383   unsigned ZMask = 0;
8384   int V1DstIndex = -1;
8385   int V2DstIndex = -1;
8386   bool V1UsedInPlace = false;
8387
8388   for (int i = 0; i < 4; ++i) {
8389     // Synthesize a zero mask from the zeroable elements (includes undefs).
8390     if (Zeroable[i]) {
8391       ZMask |= 1 << i;
8392       continue;
8393     }
8394
8395     // Flag if we use any V1 inputs in place.
8396     if (i == Mask[i]) {
8397       V1UsedInPlace = true;
8398       continue;
8399     }
8400
8401     // We can only insert a single non-zeroable element.
8402     if (V1DstIndex != -1 || V2DstIndex != -1)
8403       return SDValue();
8404
8405     if (Mask[i] < 4) {
8406       // V1 input out of place for insertion.
8407       V1DstIndex = i;
8408     } else {
8409       // V2 input for insertion.
8410       V2DstIndex = i;
8411     }
8412   }
8413
8414   // Don't bother if we have no (non-zeroable) element for insertion.
8415   if (V1DstIndex == -1 && V2DstIndex == -1)
8416     return SDValue();
8417
8418   // Determine element insertion src/dst indices. The src index is from the
8419   // start of the inserted vector, not the start of the concatenated vector.
8420   unsigned V2SrcIndex = 0;
8421   if (V1DstIndex != -1) {
8422     // If we have a V1 input out of place, we use V1 as the V2 element insertion
8423     // and don't use the original V2 at all.
8424     V2SrcIndex = Mask[V1DstIndex];
8425     V2DstIndex = V1DstIndex;
8426     V2 = V1;
8427   } else {
8428     V2SrcIndex = Mask[V2DstIndex] - 4;
8429   }
8430
8431   // If no V1 inputs are used in place, then the result is created only from
8432   // the zero mask and the V2 insertion - so remove V1 dependency.
8433   if (!V1UsedInPlace)
8434     V1 = DAG.getUNDEF(MVT::v4f32);
8435
8436   unsigned InsertPSMask = V2SrcIndex << 6 | V2DstIndex << 4 | ZMask;
8437   assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
8438
8439   // Insert the V2 element into the desired position.
8440   SDLoc DL(Op);
8441   return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
8442                      DAG.getConstant(InsertPSMask, MVT::i8));
8443 }
8444
8445 /// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
8446 ///
8447 /// This is the basis function for the 2-lane 64-bit shuffles as we have full
8448 /// support for floating point shuffles but not integer shuffles. These
8449 /// instructions will incur a domain crossing penalty on some chips though so
8450 /// it is better to avoid lowering through this for integer vectors where
8451 /// possible.
8452 static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
8453                                        const X86Subtarget *Subtarget,
8454                                        SelectionDAG &DAG) {
8455   SDLoc DL(Op);
8456   assert(Op.getSimpleValueType() == MVT::v2f64 && "Bad shuffle type!");
8457   assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
8458   assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
8459   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
8460   ArrayRef<int> Mask = SVOp->getMask();
8461   assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
8462
8463   if (isSingleInputShuffleMask(Mask)) {
8464     // Use low duplicate instructions for masks that match their pattern.
8465     if (Subtarget->hasSSE3())
8466       if (isShuffleEquivalent(V1, V2, Mask, 0, 0))
8467         return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, V1);
8468
8469     // Straight shuffle of a single input vector. Simulate this by using the
8470     // single input as both of the "inputs" to this instruction..
8471     unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
8472
8473     if (Subtarget->hasAVX()) {
8474       // If we have AVX, we can use VPERMILPS which will allow folding a load
8475       // into the shuffle.
8476       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
8477                          DAG.getConstant(SHUFPDMask, MVT::i8));
8478     }
8479
8480     return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V1,
8481                        DAG.getConstant(SHUFPDMask, MVT::i8));
8482   }
8483   assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!");
8484   assert(Mask[1] >= 2 && "Non-canonicalized blend!");
8485
8486   // If we have a single input, insert that into V1 if we can do so cheaply.
8487   if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
8488     if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
8489             MVT::v2f64, DL, V1, V2, Mask, Subtarget, DAG))
8490       return Insertion;
8491     // Try inverting the insertion since for v2 masks it is easy to do and we
8492     // can't reliably sort the mask one way or the other.
8493     int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
8494                           Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
8495     if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
8496             MVT::v2f64, DL, V2, V1, InverseMask, Subtarget, DAG))
8497       return Insertion;
8498   }
8499
8500   // Try to use one of the special instruction patterns to handle two common
8501   // blend patterns if a zero-blend above didn't work.
8502   if (isShuffleEquivalent(V1, V2, Mask, 0, 3) || isShuffleEquivalent(V1, V2, Mask, 1, 3))
8503     if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
8504       // We can either use a special instruction to load over the low double or
8505       // to move just the low double.
8506       return DAG.getNode(
8507           isShuffleFoldableLoad(V1S) ? X86ISD::MOVLPD : X86ISD::MOVSD,
8508           DL, MVT::v2f64, V2,
8509           DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
8510
8511   if (Subtarget->hasSSE41())
8512     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
8513                                                   Subtarget, DAG))
8514       return Blend;
8515
8516   // Use dedicated unpack instructions for masks that match their pattern.
8517   if (isShuffleEquivalent(V1, V2, Mask, 0, 2))
8518     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2f64, V1, V2);
8519   if (isShuffleEquivalent(V1, V2, Mask, 1, 3))
8520     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2f64, V1, V2);
8521
8522   unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
8523   return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V2,
8524                      DAG.getConstant(SHUFPDMask, MVT::i8));
8525 }
8526
8527 /// \brief Handle lowering of 2-lane 64-bit integer shuffles.
8528 ///
8529 /// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
8530 /// the integer unit to minimize domain crossing penalties. However, for blends
8531 /// it falls back to the floating point shuffle operation with appropriate bit
8532 /// casting.
8533 static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
8534                                        const X86Subtarget *Subtarget,
8535                                        SelectionDAG &DAG) {
8536   SDLoc DL(Op);
8537   assert(Op.getSimpleValueType() == MVT::v2i64 && "Bad shuffle type!");
8538   assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
8539   assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
8540   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
8541   ArrayRef<int> Mask = SVOp->getMask();
8542   assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
8543
8544   if (isSingleInputShuffleMask(Mask)) {
8545     // Check for being able to broadcast a single element.
8546     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v2i64, DL, V1,
8547                                                           Mask, Subtarget, DAG))
8548       return Broadcast;
8549
8550     // Straight shuffle of a single input vector. For everything from SSE2
8551     // onward this has a single fast instruction with no scary immediates.
8552     // We have to map the mask as it is actually a v4i32 shuffle instruction.
8553     V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V1);
8554     int WidenedMask[4] = {
8555         std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
8556         std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
8557     return DAG.getNode(
8558         ISD::BITCAST, DL, MVT::v2i64,
8559         DAG.getNode(X86ISD::PSHUFD, SDLoc(Op), MVT::v4i32, V1,
8560                     getV4X86ShuffleImm8ForMask(WidenedMask, DAG)));
8561   }
8562
8563   // Try to use byte shift instructions.
8564   if (SDValue Shift = lowerVectorShuffleAsByteShift(
8565           DL, MVT::v2i64, V1, V2, Mask, DAG))
8566     return Shift;
8567
8568   // If we have a single input from V2 insert that into V1 if we can do so
8569   // cheaply.
8570   if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
8571     if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
8572             MVT::v2i64, DL, V1, V2, Mask, Subtarget, DAG))
8573       return Insertion;
8574     // Try inverting the insertion since for v2 masks it is easy to do and we
8575     // can't reliably sort the mask one way or the other.
8576     int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
8577                           Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
8578     if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
8579             MVT::v2i64, DL, V2, V1, InverseMask, Subtarget, DAG))
8580       return Insertion;
8581   }
8582
8583   // We have different paths for blend lowering, but they all must use the
8584   // *exact* same predicate.
8585   bool IsBlendSupported = Subtarget->hasSSE41();
8586   if (IsBlendSupported)
8587     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
8588                                                   Subtarget, DAG))
8589       return Blend;
8590
8591   // Use dedicated unpack instructions for masks that match their pattern.
8592   if (isShuffleEquivalent(V1, V2, Mask, 0, 2))
8593     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, V1, V2);
8594   if (isShuffleEquivalent(V1, V2, Mask, 1, 3))
8595     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2i64, V1, V2);
8596
8597   // Try to use byte rotation instructions.
8598   // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
8599   if (Subtarget->hasSSSE3())
8600     if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
8601             DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
8602       return Rotate;
8603
8604   // If we have direct support for blends, we should lower by decomposing into
8605   // a permute. That will be faster than the domain cross.
8606   if (IsBlendSupported)
8607     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2,
8608                                                       Mask, DAG);
8609
8610   // We implement this with SHUFPD which is pretty lame because it will likely
8611   // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
8612   // However, all the alternatives are still more cycles and newer chips don't
8613   // have this problem. It would be really nice if x86 had better shuffles here.
8614   V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, V1);
8615   V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, V2);
8616   return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64,
8617                      DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
8618 }
8619
8620 /// \brief Test whether this can be lowered with a single SHUFPS instruction.
8621 ///
8622 /// This is used to disable more specialized lowerings when the shufps lowering
8623 /// will happen to be efficient.
     ///
     /// \param Mask  Shuffle mask of exactly four entries (asserted). An entry of
     ///              -1 is skipped by the checks below; any other entry is
     ///              classified only by whether it is below 4 or not, i.e. by
     ///              which side of the 4-element boundary it refers to.
     /// \returns false if the two entries of the low pair (Mask[0], Mask[1]) or of
     ///          the high pair (Mask[2], Mask[3]) are both not -1 and fall on
     ///          opposite sides of that boundary; true otherwise.
8624 static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
8625   // This routine only handles 128-bit shufps.
8626   assert(Mask.size() == 4 && "Unsupported mask size!");
8627
8628   // To lower with a single SHUFPS we need to have the low half and high half
8629   // each requiring a single input.
       // A pair containing a -1 entry can never fail the test: both entries must
       // be set before their sides are compared.
8630   if (Mask[0] != -1 && Mask[1] != -1 && (Mask[0] < 4) != (Mask[1] < 4))
8631     return false;
8632   if (Mask[2] != -1 && Mask[3] != -1 && (Mask[2] < 4) != (Mask[3] < 4))
8633     return false;
8634
8635   return true;
8636 }
8637
8638 /// \brief Lower a vector shuffle using the SHUFPS instruction.
8639 ///
8640 /// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
8641 /// It makes no assumptions about whether this is the *best* lowering, it simply
8642 /// uses it.
     ///
     /// \param DL    Debug location attached to every node created here.
     /// \param VT    Value type given to the SHUFP nodes that are built.
     /// \param Mask  Shuffle mask; its first four entries are read. Entries >= 4
     ///              are counted as V2 elements and -1 is treated as undef.
     /// \param V1    First shuffle input.
     /// \param V2    Second shuffle input.
     /// \param DAG   Selection DAG used to create the nodes.
     /// \returns An X86ISD::SHUFP node of type VT. Depending on the mask, an
     ///          additional SHUFP node is built first and used as one or both of
     ///          its operands.
8643 static SDValue lowerVectorShuffleWithSHUFPS(SDLoc DL, MVT VT,
8644                                             ArrayRef<int> Mask, SDValue V1,
8645                                             SDValue V2, SelectionDAG &DAG) {
       // LowV and HighV become the first and second operand of the final SHUFP
       // and NewMask its immediate mask; all three start out as the unmodified
       // inputs and are adjusted by the cases below.
8646   SDValue LowV = V1, HighV = V2;
8647   int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};
8648
       // Number of mask entries that refer to V2 (index >= 4).
8649   int NumV2Elements =
8650       std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
8651
8652   if (NumV2Elements == 1) {
         // Position in the mask of the single V2 element.
8653     int V2Index =
8654         std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) -
8655         Mask.begin();
8656
8657     // Compute the index adjacent to V2Index and in the same half by toggling
8658     // the low bit.
8659     int V2AdjIndex = V2Index ^ 1;
8660
8661     if (Mask[V2AdjIndex] == -1) {
8662       // Handles all the cases where we have a single V2 element and an undef.
8663       // This will only ever happen in the high lanes because we commute the
8664       // vector otherwise.
8665       if (V2Index < 2)
8666         std::swap(LowV, HighV);
           // Rebase the V2 index so that it is relative to V2 itself.
8667       NewMask[V2Index] -= 4;
8668     } else {
8669       // Handle the case where the V2 element ends up adjacent to a V1 element.
8670       // To make this work, blend them together as the first step.
8671       int V1Index = V2AdjIndex;
           // The operands are passed as (V2, V1): lane 0 of the result receives
           // the wanted V2 element and lane 2 the adjacent V1 element. Lanes 1
           // and 3 use index 0 as a placeholder; NewMask only ever refers to
           // lanes 0 and 2 of this intermediate value.
8672       int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
8673       V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
8674                        getV4X86ShuffleImm8ForMask(BlendMask, DAG));
8675
8676       // Now proceed to reconstruct the final blend as we have the necessary
8677       // high or low half formed.
8678       if (V2Index < 2) {
8679         LowV = V2;
8680         HighV = V1;
8681       } else {
8682         HighV = V2;
8683       }
8684       NewMask[V1Index] = 2; // We put the V1 element in V2[2].
8685       NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
8686     }
8687   } else if (NumV2Elements == 2) {
8688     if (Mask[0] < 4 && Mask[1] < 4) {
8689       // Handle the easy case where we have V1 in the low lanes and V2 in the
8690       // high lanes.
8691       NewMask[2] -= 4;
8692       NewMask[3] -= 4;
8693     } else if (Mask[2] < 4 && Mask[3] < 4) {
8694       // We also handle the reversed case because this utility may get called
8695       // when we detect a SHUFPS pattern but can't easily commute the shuffle to
8696       // arrange things in the right direction.
8697       NewMask[0] -= 4;
8698       NewMask[1] -= 4;
8699       HighV = V1;
8700       LowV = V2;
8701     } else {
8702       // We have a mixture of V1 and V2 in both low and high lanes. Rather than
8703       // trying to place elements directly, just blend them and set up the final
8704       // shuffle to place them.
8705
8706       // The first two blend mask elements are for V1, the second two are for
8707       // V2.
           // Within each pair, the first entry is the element wanted by the low
           // half of the result and the second the one wanted by the high half,
           // so the blended vector is laid out as
           //   [low-half V1 elt, high-half V1 elt, low-half V2 elt, high-half V2 elt].
8708       int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
8709                           Mask[2] < 4 ? Mask[2] : Mask[3],
8710                           (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
8711                           (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
8712       V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
8713                        getV4X86ShuffleImm8ForMask(BlendMask, DAG));
8714
8715       // Now we do a normal shuffle of V1 by giving V1 as both operands to
8716       // a blend.
           // The indices below address the blended layout described above: 0/2
           // are the low-half V1/V2 elements, 1/3 the high-half V1/V2 elements.
8717       LowV = HighV = V1;
8718       NewMask[0] = Mask[0] < 4 ? 0 : 2;
8719       NewMask[1] = Mask[0] < 4 ? 2 : 0;
8720       NewMask[2] = Mask[2] < 4 ? 1 : 3;
8721       NewMask[3] = Mask[2] < 4 ? 3 : 1;
8722     }
8723   }
       // For any other V2 element count (0, 3 or 4) nothing above runs: the mask
       // copy and the V1/V2 operand order are used unchanged. The visible caller
       // lowerV4F32VectorShuffle returns early when there are no V2 elements.
       // NOTE(review): counts of 3 or 4 are presumably canonicalized away before
       // this point -- confirm against the call sites.
8724   return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
8725                      getV4X86ShuffleImm8ForMask(NewMask, DAG));
8726 }
8727
8728 /// \brief Lower 4-lane 32-bit floating point shuffles.
8729 ///
8730 /// Uses instructions exclusively from the floating point unit to minimize
8731 /// domain crossing penalties, as these are sufficient to implement all v4f32
8732 /// shuffles.
     ///
     /// \param Op         The shuffle node being lowered. It is cast to
     ///                   ShuffleVectorSDNode to obtain the mask and must have
     ///                   type v4f32 (asserted).
     /// \param V1         First shuffle input, type v4f32 (asserted).
     /// \param V2         Second shuffle input, type v4f32 (asserted).
     /// \param Subtarget  Queried for SSE3, SSE4.1 and AVX availability and
     ///                   forwarded to the helper lowerings.
     /// \param DAG        Selection DAG used to create the replacement nodes.
     /// \returns The result of the first strategy below that applies. The order
     ///          of the attempts is the priority order; the SHUFPS fallback at
     ///          the end is returned unconditionally.
8733 static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
8734                                        const X86Subtarget *Subtarget,
8735                                        SelectionDAG &DAG) {
8736   SDLoc DL(Op);
8737   assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!");
8738   assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
8739   assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
8740   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
8741   ArrayRef<int> Mask = SVOp->getMask();
8742   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
8743
       // Mask entries >= 4 refer to V2, so this is the number of result lanes
       // taken from V2.
8744   int NumV2Elements =
8745       std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
8746
       // Single-input case: every path inside this block returns.
8747   if (NumV2Elements == 0) {
8748     // Check for being able to broadcast a single element.
8749     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4f32, DL, V1,
8750                                                           Mask, Subtarget, DAG))
8751       return Broadcast;
8752
8753     // Use even/odd duplicate instructions for masks that match their pattern.
8754     if (Subtarget->hasSSE3()) {
8755       if (isShuffleEquivalent(V1, V2, Mask, 0, 0, 2, 2))
8756         return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
8757       if (isShuffleEquivalent(V1, V2, Mask, 1, 1, 3, 3))
8758         return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
8759     }
8760
8761     if (Subtarget->hasAVX()) {
8762       // If we have AVX, we can use VPERMILPS which will allow folding a load
8763       // into the shuffle.
8764       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
8765                          getV4X86ShuffleImm8ForMask(Mask, DAG));
8766     }
8767
8768     // Otherwise, use a straight shuffle of a single input vector. We pass the
8769     // input vector to both operands to simulate this with a SHUFPS.
8770     return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
8771                        getV4X86ShuffleImm8ForMask(Mask, DAG));
8772   }
8773
8774   // There are special ways we can lower some single-element blends. However, we
8775   // have custom ways we can lower more complex single-element blends below that
8776   // we defer to if both this and BLENDPS fail to match, so restrict this to
8777   // when the V2 input is targeting element 0 of the mask -- that is the fast
8778   // case here.
8779   if (NumV2Elements == 1 && Mask[0] >= 4)
8780     if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v4f32, DL, V1, V2,
8781                                                          Mask, Subtarget, DAG))
8782       return V;
8783
       // The next three strategies are only attempted when SSE4.1 is available.
8784   if (Subtarget->hasSSE41()) {
8785     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
8786                                                   Subtarget, DAG))
8787       return Blend;
8788
8789     // Use INSERTPS if we can complete the shuffle efficiently.
8790     if (SDValue V = lowerVectorShuffleAsInsertPS(Op, V1, V2, Mask, DAG))
8791       return V;
8792
         // Only try blend-and-permute when the mask is not already a single
         // SHUFPS; in that case the fallback at the end handles it.
8793     if (!isSingleSHUFPSMask(Mask))
8794       if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute(
8795               DL, MVT::v4f32, V1, V2, Mask, DAG))
8796         return BlendPerm;
8797   }
8798
8799   // Use dedicated unpack instructions for masks that match their pattern.
8800   if (isShuffleEquivalent(V1, V2, Mask, 0, 4, 1, 5))
8801     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f32, V1, V2);
8802   if (isShuffleEquivalent(V1, V2, Mask, 2, 6, 3, 7))
8803     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V1, V2);
8804
8805   // Otherwise fall back to a SHUFPS lowering strategy.
8806   return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
8807 }
8808
8809 /// \brief Lower 4-lane i32 vector shuffles.
8810 ///
8811 /// We try to handle these with integer-domain shuffles where we can, but for
8812 /// blends we use the floating point domain blend instructions.
     ///
     /// \param Op         The shuffle node being lowered. It is cast to
     ///                   ShuffleVectorSDNode to obtain the mask and must have
     ///                   type v4i32 (asserted).
     /// \param V1         First shuffle input, type v4i32 (asserted).
     /// \param V2         Second shuffle input, type v4i32 (asserted).
     /// \param Subtarget  Queried for SSE4.1 and SSSE3 availability and forwarded
     ///                   to the helper lowerings.
     /// \param DAG        Selection DAG used to create the replacement nodes.
     /// \returns The result of the first strategy below that applies. The order
     ///          of the attempts is the priority order; the last resort is a
     ///          v4f32 shuffle of the bitcast inputs, bitcast back to v4i32.
8813 static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
8814                                        const X86Subtarget *Subtarget,
8815                                        SelectionDAG &DAG) {
8816   SDLoc DL(Op);
8817   assert(Op.getSimpleValueType() == MVT::v4i32 && "Bad shuffle type!");
8818   assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
8819   assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
8820   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
8821   ArrayRef<int> Mask = SVOp->getMask();
8822   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
8823
8824   // Whenever we can lower this as a zext, that instruction is strictly faster
8825   // than any alternative. It also allows us to fold memory operands into the
8826   // shuffle in many cases.
8827   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2,
8828                                                          Mask, Subtarget, DAG))
8829     return ZExt;
8830
       // Mask entries >= 4 refer to V2, so this is the number of result lanes
       // taken from V2.
8831   int NumV2Elements =
8832       std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
8833
       // Single-input case: every path inside this block returns.
8834   if (NumV2Elements == 0) {
8835     // Check for being able to broadcast a single element.
8836     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4i32, DL, V1,
8837                                                           Mask, Subtarget, DAG))
8838       return Broadcast;
8839
8840     // Straight shuffle of a single input vector. For everything from SSE2
8841     // onward this has a single fast instruction with no scary immediates.
8842     // We coerce the shuffle pattern to be compatible with UNPCK instructions
8843     // but we aren't actually going to use the UNPCK instruction because doing
8844     // so prevents folding a load into this instruction or making a copy.
         // Assigning one of these arrays to Mask only re-points the ArrayRef at
         // it. Both arrays are declared in this scope, so they are still alive
         // when Mask is read by the return statement below.
8845     const int UnpackLoMask[] = {0, 0, 1, 1};
8846     const int UnpackHiMask[] = {2, 2, 3, 3};
8847     if (isShuffleEquivalent(V1, V2, Mask, 0, 0, 1, 1))
8848       Mask = UnpackLoMask;
8849     else if (isShuffleEquivalent(V1, V2, Mask, 2, 2, 3, 3))
8850       Mask = UnpackHiMask;
8851
8852     return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
8853                        getV4X86ShuffleImm8ForMask(Mask, DAG));
8854   }
8855
8856   // Try to use bit shift instructions.
8857   if (SDValue Shift = lowerVectorShuffleAsBitShift(
8858           DL, MVT::v4i32, V1, V2, Mask, DAG))
8859     return Shift;
8860
8861   // Try to use byte shift instructions.
8862   if (SDValue Shift = lowerVectorShuffleAsByteShift(
8863           DL, MVT::v4i32, V1, V2, Mask, DAG))
8864     return Shift;
8865
8866   // There are special ways we can lower some single-element blends.
8867   if (NumV2Elements == 1)
8868     if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v4i32, DL, V1, V2,
8869                                                          Mask, Subtarget, DAG))
8870       return V;
8871
8872   // We have different paths for blend lowering, but they all must use the
8873   // *exact* same predicate.
       // The predicate is computed once here and reused by the decomposed
       // shuffle-blend path further down.
8874   bool IsBlendSupported = Subtarget->hasSSE41();
8875   if (IsBlendSupported)
8876     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
8877                                                   Subtarget, DAG))
8878       return Blend;
8879
       // Try to lower the shuffle as a bit mask.
8880   if (SDValue Masked =
8881           lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask, DAG))
8882     return Masked;
8883
8884   // Use dedicated unpack instructions for masks that match their pattern.
8885   if (isShuffleEquivalent(V1, V2, Mask, 0, 4, 1, 5))
8886     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i32, V1, V2);
8887   if (isShuffleEquivalent(V1, V2, Mask, 2, 6, 3, 7))
8888     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V1, V2);
8889
8890   // Try to use byte rotation instructions.
8891   // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
8892   if (Subtarget->hasSSSE3())
8893     if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
8894             DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
8895       return Rotate;
8896
8897   // If we have direct support for blends, we should lower by decomposing into
8898   // a permute. That will be faster than the domain cross.
8899   if (IsBlendSupported)
8900     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2,
8901                                                       Mask, DAG);
8902
8903   // We implement this with SHUFPS because it can blend from two vectors.
8904   // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
8905   // up the inputs, bypassing domain shift penalties that we would incur if we
8906   // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
8907   // relevant.
       // NOTE(review): this only builds a generic v4f32 shuffle of the bitcast
       // inputs; presumably that node is subsequently lowered by the v4f32
       // path -- confirm.
8908   return DAG.getNode(ISD::BITCAST, DL, MVT::v4i32,
8909                      DAG.getVectorShuffle(
8910                          MVT::v4f32, DL,
8911                          DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, V1),
8912                          DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, V2), Mask));
8913 }
8914
8915 /// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
8916 /// shuffle lowering, and the most complex part.
8917 ///
8918 /// The lowering strategy is to try to form pairs of input lanes which are
8919 /// targeted at the same half of the final vector, and then use a dword shuffle
8920 /// to place them onto the right half, and finally unpack the paired lanes into
8921 /// their final position.
8922 ///
8923 /// The exact breakdown of how to form these dword pairs and align them on the
8924 /// correct sides is really tricky. See the comments within the function for
8925 /// more of the details.
8926 static SDValue lowerV8I16SingleInputVectorShuffle(
8927     SDLoc DL, SDValue V, MutableArrayRef<int> Mask,
8928     const X86Subtarget *Subtarget, SelectionDAG &DAG) {
8929   assert(V.getSimpleValueType() == MVT::v8i16 && "Bad input type!");
8930   MutableArrayRef<int> LoMask = Mask.slice(0, 4);
8931   MutableArrayRef<int> HiMask = Mask.slice(4, 4);
8932
8933   SmallVector<int, 4> LoInputs;
8934   std::copy_if(LoMask.begin(), LoMask.end(), std::back_inserter(LoInputs),
8935                [](int M) { return M >= 0; });
8936   std::sort(LoInputs.begin(), LoInputs.end());
8937   LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
8938   SmallVector<int, 4> HiInputs;
8939   std::copy_if(HiMask.begin(), HiMask.end(), std::back_inserter(HiInputs),
8940                [](int M) { return M >= 0; });
8941   std::sort(HiInputs.begin(), HiInputs.end());
8942   HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
8943   int NumLToL =
8944       std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin();
8945   int NumHToL = LoInputs.size() - NumLToL;
8946   int NumLToH =
8947       std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin();
8948   int NumHToH = HiInputs.size() - NumLToH;
8949   MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
8950   MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
8951   MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
8952   MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
8953
8954   // Check for being able to broadcast a single element.
8955   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8i16, DL, V,
8956                                                         Mask, Subtarget, DAG))
8957     return Broadcast;
8958
8959   // Try to use bit shift instructions.
8960   if (SDValue Shift = lowerVectorShuffleAsBitShift(
8961           DL, MVT::v8i16, V, V, Mask, DAG))
8962     return Shift;
8963
8964   // Try to use byte shift instructions.
8965   if (SDValue Shift = lowerVectorShuffleAsByteShift(
8966           DL, MVT::v8i16, V, V, Mask, DAG))
8967     return Shift;
8968
8969   // Use dedicated unpack instructions for masks that match their pattern.
8970   if (isShuffleEquivalent(V, V, Mask, 0, 0, 1, 1, 2, 2, 3, 3))
8971     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V, V);
8972   if (isShuffleEquivalent(V, V, Mask, 4, 4, 5, 5, 6, 6, 7, 7))
8973     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V, V);
8974
8975   // Try to use byte rotation instructions.
8976   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
8977           DL, MVT::v8i16, V, V, Mask, Subtarget, DAG))
8978     return Rotate;
8979
8980   // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
8981   // such inputs we can swap two of the dwords across the half mark and end up
8982   // with <=2 inputs to each half in each half. Once there, we can fall through
8983   // to the generic code below. For example:
8984   //
8985   // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
8986   // Mask:  [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
8987   //
8988   // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
8989   // and an existing 2-into-2 on the other half. In this case we may have to
8990   // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
8991   // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
8992   // Fortunately, we don't have to handle anything but a 2-into-2 pattern
8993   // because any other situation (including a 3-into-1 or 1-into-3 in the other
8994   // half than the one we target for fixing) will be fixed when we re-enter this
8995   // path. We will also combine away any sequence of PSHUFD instructions that
8996   // result into a single instruction. Here is an example of the tricky case:
8997   //
8998   // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
8999   // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
9000   //
9001   // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
9002   //
9003   // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
9004   // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
9005   //
9006   // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
9007   // Mask:  [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
9008   //
9009   // The result is fine to be handled by the generic logic.
9010   auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
9011                           ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
9012                           int AOffset, int BOffset) {
9013     assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
9014            "Must call this with A having 3 or 1 inputs from the A half.");
9015     assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
9016            "Must call this with B having 1 or 3 inputs from the B half.");
9017     assert(AToAInputs.size() + BToAInputs.size() == 4 &&
9018            "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
9019
9020     // Compute the index of dword with only one word among the three inputs in
9021     // a half by taking the sum of the half with three inputs and subtracting
9022     // the sum of the actual three inputs. The difference is the remaining
9023     // slot.
9024     int ADWord, BDWord;
9025     int &TripleDWord = AToAInputs.size() == 3 ? ADWord : BDWord;
9026     int &OneInputDWord = AToAInputs.size() == 3 ? BDWord : ADWord;
9027     int TripleInputOffset = AToAInputs.size() == 3 ? AOffset : BOffset;
9028     ArrayRef<int> TripleInputs = AToAInputs.size() == 3 ? AToAInputs : BToAInputs;
9029     int OneInput = AToAInputs.size() == 3 ? BToAInputs[0] : AToAInputs[0];
9030     int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
9031     int TripleNonInputIdx =
9032         TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
9033     TripleDWord = TripleNonInputIdx / 2;
9034
9035     // We use xor with one to compute the adjacent DWord to whichever one the
9036     // OneInput is in.
9037     OneInputDWord = (OneInput / 2) ^ 1;
9038
9039     // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
9040     // and BToA inputs. If there is also such a problem with the BToB and AToB
9041     // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
9042     // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
9043     // is essential that we don't *create* a 3<-1 as then we might oscillate.
9044     if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
9045       // Compute how many inputs will be flipped by swapping these DWords. We
9046       // need
9047       // to balance this to ensure we don't form a 3-1 shuffle in the other
9048       // half.
9049       int NumFlippedAToBInputs =
9050           std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
9051           std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
9052       int NumFlippedBToBInputs =
9053           std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
9054           std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
9055       if ((NumFlippedAToBInputs == 1 &&
9056            (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
9057           (NumFlippedBToBInputs == 1 &&
9058            (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
9059         // We choose whether to fix the A half or B half based on whether that
9060         // half has zero flipped inputs. At zero, we may not be able to fix it
9061         // with that half. We also bias towards fixing the B half because that
9062         // will more commonly be the high half, and we have to bias one way.
9063         auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
9064                                                        ArrayRef<int> Inputs) {
9065           int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
9066           bool IsFixIdxInput = std::find(Inputs.begin(), Inputs.end(),
9067                                          PinnedIdx ^ 1) != Inputs.end();
9068           // Determine whether the free index is in the flipped dword or the
9069           // unflipped dword based on where the pinned index is. We use this bit
9070           // in an xor to conditionally select the adjacent dword.
9071           int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
9072           bool IsFixFreeIdxInput = std::find(Inputs.begin(), Inputs.end(),
9073                                              FixFreeIdx) != Inputs.end();
9074           if (IsFixIdxInput == IsFixFreeIdxInput)
9075             FixFreeIdx += 1;
9076           IsFixFreeIdxInput = std::find(Inputs.begin(), Inputs.end(),
9077                                         FixFreeIdx) != Inputs.end();
9078           assert(IsFixIdxInput != IsFixFreeIdxInput &&
9079                  "We need to be changing the number of flipped inputs!");
9080           int PSHUFHalfMask[] = {0, 1, 2, 3};
9081           std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
9082           V = DAG.getNode(FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
9083                           MVT::v8i16, V,
9084                           getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DAG));
9085
9086           for (int &M : Mask)
9087             if (M != -1 && M == FixIdx)
9088               M = FixFreeIdx;
9089             else if (M != -1 && M == FixFreeIdx)
9090               M = FixIdx;
9091         };
9092         if (NumFlippedBToBInputs != 0) {
9093           int BPinnedIdx =
9094               BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
9095           FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
9096         } else {
9097           assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
9098           int APinnedIdx =
9099               AToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
9100           FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
9101         }
9102       }
9103     }
9104
9105     int PSHUFDMask[] = {0, 1, 2, 3};
9106     PSHUFDMask[ADWord] = BDWord;
9107     PSHUFDMask[BDWord] = ADWord;
9108     V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
9109                     DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
9110                                 DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V),
9111                                 getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
9112
9113     // Adjust the mask to match the new locations of A and B.
9114     for (int &M : Mask)
9115       if (M != -1 && M/2 == ADWord)
9116         M = 2 * BDWord + M % 2;
9117       else if (M != -1 && M/2 == BDWord)
9118         M = 2 * ADWord + M % 2;
9119
9120     // Recurse back into this routine to re-compute state now that this isn't
9121     // a 3 and 1 problem.
9122     return DAG.getVectorShuffle(MVT::v8i16, DL, V, DAG.getUNDEF(MVT::v8i16),
9123                                 Mask);
9124   };
9125   if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
9126     return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
9127   else if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
9128     return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
9129
9130   // At this point there are at most two inputs to the low and high halves from
9131   // each half. That means the inputs can always be grouped into dwords and
9132   // those dwords can then be moved to the correct half with a dword shuffle.
9133   // We use at most one low and one high word shuffle to collect these paired
9134   // inputs into dwords, and finally a dword shuffle to place them.
9135   int PSHUFLMask[4] = {-1, -1, -1, -1};
9136   int PSHUFHMask[4] = {-1, -1, -1, -1};
9137   int PSHUFDMask[4] = {-1, -1, -1, -1};
9138
9139   // First fix the masks for all the inputs that are staying in their
9140   // original halves. This will then dictate the targets of the cross-half
9141   // shuffles.
9142   auto fixInPlaceInputs =
9143       [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
9144                     MutableArrayRef<int> SourceHalfMask,
9145                     MutableArrayRef<int> HalfMask, int HalfOffset) {
9146     if (InPlaceInputs.empty())
9147       return;
9148     if (InPlaceInputs.size() == 1) {
9149       SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
9150           InPlaceInputs[0] - HalfOffset;
9151       PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
9152       return;
9153     }
9154     if (IncomingInputs.empty()) {
9155       // Just fix all of the in place inputs.
9156       for (int Input : InPlaceInputs) {
9157         SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
9158         PSHUFDMask[Input / 2] = Input / 2;
9159       }
9160       return;
9161     }
9162
9163     assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
9164     SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
9165         InPlaceInputs[0] - HalfOffset;
9166     // Put the second input next to the first so that they are packed into
9167     // a dword. We find the adjacent index by toggling the low bit.
9168     int AdjIndex = InPlaceInputs[0] ^ 1;
9169     SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
9170     std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
9171     PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
9172   };
9173   fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
9174   fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
9175
9176   // Now gather the cross-half inputs and place them into a free dword of
9177   // their target half.
9178   // FIXME: This operation could almost certainly be simplified dramatically to
9179   // look more like the 3-1 fixing operation.
9180   auto moveInputsToRightHalf = [&PSHUFDMask](
9181       MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
9182       MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
9183       MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
9184       int DestOffset) {
9185     auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
9186       return SourceHalfMask[Word] != -1 && SourceHalfMask[Word] != Word;
9187     };
9188     auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
9189                                                int Word) {
9190       int LowWord = Word & ~1;
9191       int HighWord = Word | 1;
9192       return isWordClobbered(SourceHalfMask, LowWord) ||
9193              isWordClobbered(SourceHalfMask, HighWord);
9194     };
9195
9196     if (IncomingInputs.empty())
9197       return;
9198
9199     if (ExistingInputs.empty()) {
9200       // Map any dwords with inputs from them into the right half.
9201       for (int Input : IncomingInputs) {
9202         // If the source half mask maps over the inputs, turn those into
9203         // swaps and use the swapped lane.
9204         if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
9205           if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] == -1) {
9206             SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
9207                 Input - SourceOffset;
9208             // We have to swap the uses in our half mask in one sweep.
9209             for (int &M : HalfMask)
9210               if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
9211                 M = Input;
9212               else if (M == Input)
9213                 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
9214           } else {
9215             assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
9216                        Input - SourceOffset &&
9217                    "Previous placement doesn't match!");
9218           }
9219           // Note that this correctly re-maps both when we do a swap and when
9220           // we observe the other side of the swap above. We rely on that to
9221           // avoid swapping the members of the input list directly.
9222           Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
9223         }
9224
9225         // Map the input's dword into the correct half.
9226         if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] == -1)
9227           PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
9228         else
9229           assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
9230                      Input / 2 &&
9231                  "Previous placement doesn't match!");
9232       }
9233
9234       // And just directly shift any other-half mask elements to be same-half
9235       // as we will have mirrored the dword containing the element into the
9236       // same position within that half.
9237       for (int &M : HalfMask)
9238         if (M >= SourceOffset && M < SourceOffset + 4) {
9239           M = M - SourceOffset + DestOffset;
9240           assert(M >= 0 && "This should never wrap below zero!");
9241         }
9242       return;
9243     }
9244
9245     // Ensure we have the input in a viable dword of its current half. This
9246     // is particularly tricky because the original position may be clobbered
9247     // by inputs being moved and *staying* in that half.
9248     if (IncomingInputs.size() == 1) {
9249       if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
9250         int InputFixed = std::find(std::begin(SourceHalfMask),
9251                                    std::end(SourceHalfMask), -1) -
9252                          std::begin(SourceHalfMask) + SourceOffset;
9253         SourceHalfMask[InputFixed - SourceOffset] =
9254             IncomingInputs[0] - SourceOffset;
9255         std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
9256                      InputFixed);
9257         IncomingInputs[0] = InputFixed;
9258       }
9259     } else if (IncomingInputs.size() == 2) {
9260       if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
9261           isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
9262         // We have two non-adjacent or clobbered inputs we need to extract from
9263         // the source half. To do this, we need to map them into some adjacent
9264         // dword slot in the source mask.
9265         int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
9266                               IncomingInputs[1] - SourceOffset};
9267
9268         // If there is a free slot in the source half mask adjacent to one of
9269         // the inputs, place the other input in it. We use (Index XOR 1) to
9270         // compute an adjacent index.
9271         if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
9272             SourceHalfMask[InputsFixed[0] ^ 1] == -1) {
9273           SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
9274           SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
9275           InputsFixed[1] = InputsFixed[0] ^ 1;
9276         } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
9277                    SourceHalfMask[InputsFixed[1] ^ 1] == -1) {
9278           SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
9279           SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
9280           InputsFixed[0] = InputsFixed[1] ^ 1;
9281         } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] == -1 &&
9282                    SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] == -1) {
9283           // The two inputs are in the same DWord but it is clobbered and the
9284           // adjacent DWord isn't used at all. Move both inputs to the free
9285           // slot.
9286           SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
9287           SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
9288           InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
9289           InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
9290         } else {
9291           // The only way we hit this point is if there is no clobbering
9292           // (because there are no off-half inputs to this half) and there is no
9293           // free slot adjacent to one of the inputs. In this case, we have to
9294           // swap an input with a non-input.
9295           for (int i = 0; i < 4; ++i)
9296             assert((SourceHalfMask[i] == -1 || SourceHalfMask[i] == i) &&
9297                    "We can't handle any clobbers here!");
9298           assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
9299                  "Cannot have adjacent inputs here!");
9300
9301           SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
9302           SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
9303
9304           // We also have to update the final source mask in this case because
9305           // it may need to undo the above swap.
9306           for (int &M : FinalSourceHalfMask)
9307             if (M == (InputsFixed[0] ^ 1) + SourceOffset)
9308               M = InputsFixed[1] + SourceOffset;
9309             else if (M == InputsFixed[1] + SourceOffset)
9310               M = (InputsFixed[0] ^ 1) + SourceOffset;
9311
9312           InputsFixed[1] = InputsFixed[0] ^ 1;
9313         }
9314
9315         // Point everything at the fixed inputs.
9316         for (int &M : HalfMask)
9317           if (M == IncomingInputs[0])
9318             M = InputsFixed[0] + SourceOffset;
9319           else if (M == IncomingInputs[1])
9320             M = InputsFixed[1] + SourceOffset;
9321
9322         IncomingInputs[0] = InputsFixed[0] + SourceOffset;
9323         IncomingInputs[1] = InputsFixed[1] + SourceOffset;
9324       }
9325     } else {
9326       llvm_unreachable("Unhandled input size!");
9327     }
9328
9329     // Now hoist the DWord down to the right half.
9330     int FreeDWord = (PSHUFDMask[DestOffset / 2] == -1 ? 0 : 1) + DestOffset / 2;
9331     assert(PSHUFDMask[FreeDWord] == -1 && "DWord not free");
9332     PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
9333     for (int &M : HalfMask)
9334       for (int Input : IncomingInputs)
9335         if (M == Input)
9336           M = FreeDWord * 2 + Input % 2;
9337   };
9338   moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
9339                         /*SourceOffset*/ 4, /*DestOffset*/ 0);
9340   moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
9341                         /*SourceOffset*/ 0, /*DestOffset*/ 4);
9342
9343   // Now enact all the shuffles we've computed to move the inputs into their
9344   // target half.
9345   if (!isNoopShuffleMask(PSHUFLMask))
9346     V = DAG.getNode(X86ISD::PSHUFLW, DL, MVT::v8i16, V,
9347                     getV4X86ShuffleImm8ForMask(PSHUFLMask, DAG));
9348   if (!isNoopShuffleMask(PSHUFHMask))
9349     V = DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16, V,
9350                     getV4X86ShuffleImm8ForMask(PSHUFHMask, DAG));
9351   if (!isNoopShuffleMask(PSHUFDMask))
9352     V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
9353                     DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
9354                                 DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V),
9355                                 getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
9356
9357   // At this point, each half should contain all its inputs, and we can then
9358   // just shuffle them into their final position.
9359   assert(std::count_if(LoMask.begin(), LoMask.end(),
9360                        [](int M) { return M >= 4; }) == 0 &&
9361          "Failed to lift all the high half inputs to the low mask!");
9362   assert(std::count_if(HiMask.begin(), HiMask.end(),
9363                        [](int M) { return M >= 0 && M < 4; }) == 0 &&
9364          "Failed to lift all the low half inputs to the high mask!");
9365
9366   // Do a half shuffle for the low mask.
9367   if (!isNoopShuffleMask(LoMask))
9368     V = DAG.getNode(X86ISD::PSHUFLW, DL, MVT::v8i16, V,
9369                     getV4X86ShuffleImm8ForMask(LoMask, DAG));
9370
9371   // Do a half shuffle with the high mask after shifting its values down.
9372   for (int &M : HiMask)
9373     if (M >= 0)
9374       M -= 4;
9375   if (!isNoopShuffleMask(HiMask))
9376     V = DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16, V,
9377                     getV4X86ShuffleImm8ForMask(HiMask, DAG));
9378
9379   return V;
9380 }
9381
9382 /// \brief Detect whether the mask pattern should be lowered through
9383 /// interleaving.
9384 ///
9385 /// This essentially tests whether viewing the mask as an interleaving of two
9386 /// sub-sequences reduces the cross-input traffic of a blend operation. If so,
9387 /// lowering it through interleaving is a significantly better strategy.
9388 static bool shouldLowerAsInterleaving(ArrayRef<int> Mask) {
9389   int NumEvenInputs[2] = {0, 0};
9390   int NumOddInputs[2] = {0, 0};
9391   int NumLoInputs[2] = {0, 0};
9392   int NumHiInputs[2] = {0, 0};
9393   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
9394     if (Mask[i] < 0)
9395       continue;
9396
9397     int InputIdx = Mask[i] >= Size;
9398
9399     if (i < Size / 2)
9400       ++NumLoInputs[InputIdx];
9401     else
9402       ++NumHiInputs[InputIdx];
9403
9404     if ((i % 2) == 0)
9405       ++NumEvenInputs[InputIdx];
9406     else
9407       ++NumOddInputs[InputIdx];
9408   }
9409
9410   // The minimum number of cross-input results for both the interleaved and
9411   // split cases. If interleaving results in fewer cross-input results, return
9412   // true.
9413   int InterleavedCrosses = std::min(NumEvenInputs[1] + NumOddInputs[0],
9414                                     NumEvenInputs[0] + NumOddInputs[1]);
9415   int SplitCrosses = std::min(NumLoInputs[1] + NumHiInputs[0],
9416                               NumLoInputs[0] + NumHiInputs[1]);
9417   return InterleavedCrosses < SplitCrosses;
9418 }
9419
9420 /// \brief Blend two v8i16 vectors using a naive unpack strategy.
9421 ///
9422 /// This strategy only works when the inputs from each vector fit into a single
9423 /// half of that vector, and generally there are not so many inputs as to leave
9424 /// the in-place shuffles required highly constrained (and thus expensive). It
9425 /// shifts all the inputs into a single side of both input vectors and then
9426 /// uses an unpack to interleave these inputs in a single vector. At that
9427 /// point, we will fall back on the generic single input shuffle lowering.
9428 static SDValue lowerV8I16BasicBlendVectorShuffle(SDLoc DL, SDValue V1,
9429                                                  SDValue V2,
9430                                                  MutableArrayRef<int> Mask,
9431                                                  const X86Subtarget *Subtarget,
9432                                                  SelectionDAG &DAG) {
9433   assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad input type!");
9434   assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad input type!");
9435   SmallVector<int, 3> LoV1Inputs, HiV1Inputs, LoV2Inputs, HiV2Inputs;
9436   for (int i = 0; i < 8; ++i)
9437     if (Mask[i] >= 0 && Mask[i] < 4)
9438       LoV1Inputs.push_back(i);
9439     else if (Mask[i] >= 4 && Mask[i] < 8)
9440       HiV1Inputs.push_back(i);
9441     else if (Mask[i] >= 8 && Mask[i] < 12)
9442       LoV2Inputs.push_back(i);
9443     else if (Mask[i] >= 12)
9444       HiV2Inputs.push_back(i);
9445
9446   int NumV1Inputs = LoV1Inputs.size() + HiV1Inputs.size();
9447   int NumV2Inputs = LoV2Inputs.size() + HiV2Inputs.size();
9448   (void)NumV1Inputs;
9449   (void)NumV2Inputs;
9450   assert(NumV1Inputs > 0 && NumV1Inputs <= 3 && "At most 3 inputs supported");
9451   assert(NumV2Inputs > 0 && NumV2Inputs <= 3 && "At most 3 inputs supported");
9452   assert(NumV1Inputs + NumV2Inputs <= 4 && "At most 4 combined inputs");
9453
9454   bool MergeFromLo = LoV1Inputs.size() + LoV2Inputs.size() >=
9455                      HiV1Inputs.size() + HiV2Inputs.size();
9456
9457   auto moveInputsToHalf = [&](SDValue V, ArrayRef<int> LoInputs,
9458                               ArrayRef<int> HiInputs, bool MoveToLo,
9459                               int MaskOffset) {
9460     ArrayRef<int> GoodInputs = MoveToLo ? LoInputs : HiInputs;
9461     ArrayRef<int> BadInputs = MoveToLo ? HiInputs : LoInputs;
9462     if (BadInputs.empty())
9463       return V;
9464
9465     int MoveMask[] = {-1, -1, -1, -1, -1, -1, -1, -1};
9466     int MoveOffset = MoveToLo ? 0 : 4;
9467
9468     if (GoodInputs.empty()) {
9469       for (int BadInput : BadInputs) {
9470         MoveMask[Mask[BadInput] % 4 + MoveOffset] = Mask[BadInput] - MaskOffset;
9471         Mask[BadInput] = Mask[BadInput] % 4 + MoveOffset + MaskOffset;
9472       }
9473     } else {
9474       if (GoodInputs.size() == 2) {
9475         // If the low inputs are spread across two dwords, pack them into
9476         // a single dword.
9477         MoveMask[MoveOffset] = Mask[GoodInputs[0]] - MaskOffset;
9478         MoveMask[MoveOffset + 1] = Mask[GoodInputs[1]] - MaskOffset;
9479         Mask[GoodInputs[0]] = MoveOffset + MaskOffset;
9480         Mask[GoodInputs[1]] = MoveOffset + 1 + MaskOffset;
9481       } else {
9482         // Otherwise pin the good inputs.
9483         for (int GoodInput : GoodInputs)
9484           MoveMask[Mask[GoodInput] - MaskOffset] = Mask[GoodInput] - MaskOffset;
9485       }
9486
9487       if (BadInputs.size() == 2) {
9488         // If we have two bad inputs then there may be either one or two good
9489         // inputs fixed in place. Find a fixed input, and then find the *other*
9490         // two adjacent indices by using modular arithmetic.
9491         int GoodMaskIdx =
9492             std::find_if(std::begin(MoveMask) + MoveOffset, std::end(MoveMask),
9493                          [](int M) { return M >= 0; }) -
9494             std::begin(MoveMask);
9495         int MoveMaskIdx =
9496             ((((GoodMaskIdx - MoveOffset) & ~1) + 2) % 4) + MoveOffset;
9497         assert(MoveMask[MoveMaskIdx] == -1 && "Expected empty slot");
9498         assert(MoveMask[MoveMaskIdx + 1] == -1 && "Expected empty slot");
9499         MoveMask[MoveMaskIdx] = Mask[BadInputs[0]] - MaskOffset;
9500         MoveMask[MoveMaskIdx + 1] = Mask[BadInputs[1]] - MaskOffset;
9501         Mask[BadInputs[0]] = MoveMaskIdx + MaskOffset;
9502         Mask[BadInputs[1]] = MoveMaskIdx + 1 + MaskOffset;
9503       } else {
9504         assert(BadInputs.size() == 1 && "All sizes handled");
9505         int MoveMaskIdx = std::find(std::begin(MoveMask) + MoveOffset,
9506                                     std::end(MoveMask), -1) -
9507                           std::begin(MoveMask);
9508         MoveMask[MoveMaskIdx] = Mask[BadInputs[0]] - MaskOffset;
9509         Mask[BadInputs[0]] = MoveMaskIdx + MaskOffset;
9510       }
9511     }
9512
9513     return DAG.getVectorShuffle(MVT::v8i16, DL, V, DAG.getUNDEF(MVT::v8i16),
9514                                 MoveMask);
9515   };
9516   V1 = moveInputsToHalf(V1, LoV1Inputs, HiV1Inputs, MergeFromLo,
9517                         /*MaskOffset*/ 0);
9518   V2 = moveInputsToHalf(V2, LoV2Inputs, HiV2Inputs, MergeFromLo,
9519                         /*MaskOffset*/ 8);
9520
9521   // FIXME: Select an interleaving of the merge of V1 and V2 that minimizes
9522   // cross-half traffic in the final shuffle.
9523
9524   // Munge the mask to be a single-input mask after the unpack merges the
9525   // results.
9526   for (int &M : Mask)
9527     if (M != -1)
9528       M = 2 * (M % 4) + (M / 8);
9529
9530   return DAG.getVectorShuffle(
9531       MVT::v8i16, DL, DAG.getNode(MergeFromLo ? X86ISD::UNPCKL : X86ISD::UNPCKH,
9532                                   DL, MVT::v8i16, V1, V2),
9533       DAG.getUNDEF(MVT::v8i16), Mask);
9534 }
9535
9536 /// \brief Generic lowering of 8-lane i16 shuffles.
9537 ///
9538 /// This handles both single-input shuffles and combined shuffle/blends with
9539 /// two inputs. The single input shuffles are immediately delegated to
9540 /// a dedicated lowering routine.
9541 ///
9542 /// The blends are lowered in one of three fundamental ways. If there are few
9543 /// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
9544 /// of the input is significantly cheaper when lowered as an interleaving of
9545 /// the two inputs, try to interleave them. Otherwise, blend the low and high
9546 /// halves of the inputs separately (making them have relatively few inputs)
9547 /// and then concatenate them.
9548 static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
9549                                        const X86Subtarget *Subtarget,
9550                                        SelectionDAG &DAG) {
9551   SDLoc DL(Op);
9552   assert(Op.getSimpleValueType() == MVT::v8i16 && "Bad shuffle type!");
9553   assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
9554   assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
9555   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
9556   ArrayRef<int> OrigMask = SVOp->getMask();
9557   int MaskStorage[8] = {OrigMask[0], OrigMask[1], OrigMask[2], OrigMask[3],
9558                         OrigMask[4], OrigMask[5], OrigMask[6], OrigMask[7]};
9559   MutableArrayRef<int> Mask(MaskStorage);
9560
9561   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
9562
9563   // Whenever we can lower this as a zext, that instruction is strictly faster
9564   // than any alternative.
9565   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
9566           DL, MVT::v8i16, V1, V2, OrigMask, Subtarget, DAG))
9567     return ZExt;
9568
9569   auto isV1 = [](int M) { return M >= 0 && M < 8; };
9570   auto isV2 = [](int M) { return M >= 8; };
9571
9572   int NumV1Inputs = std::count_if(Mask.begin(), Mask.end(), isV1);
9573   int NumV2Inputs = std::count_if(Mask.begin(), Mask.end(), isV2);
9574
9575   if (NumV2Inputs == 0)
9576     return lowerV8I16SingleInputVectorShuffle(DL, V1, Mask, Subtarget, DAG);
9577
9578   assert(NumV1Inputs > 0 && "All single-input shuffles should be canonicalized "
9579                             "to be V1-input shuffles.");
9580
9581   // Try to use bit shift instructions.
9582   if (SDValue Shift = lowerVectorShuffleAsBitShift(
9583           DL, MVT::v8i16, V1, V2, Mask, DAG))
9584     return Shift;
9585
9586   // Try to use byte shift instructions.
9587   if (SDValue Shift = lowerVectorShuffleAsByteShift(
9588           DL, MVT::v8i16, V1, V2, Mask, DAG))
9589     return Shift;
9590
9591   // There are special ways we can lower some single-element blends.
9592   if (NumV2Inputs == 1)
9593     if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v8i16, DL, V1, V2,
9594                                                          Mask, Subtarget, DAG))
9595       return V;
9596
9597   // We have different paths for blend lowering, but they all must use the
9598   // *exact* same predicate.
9599   bool IsBlendSupported = Subtarget->hasSSE41();
9600   if (IsBlendSupported)
9601     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
9602                                                   Subtarget, DAG))
9603       return Blend;
9604
9605   if (SDValue Masked =
9606           lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask, DAG))
9607     return Masked;
9608
9609   // Use dedicated unpack instructions for masks that match their pattern.
9610   if (isShuffleEquivalent(V1, V2, Mask, 0, 8, 1, 9, 2, 10, 3, 11))
9611     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V1, V2);
9612   if (isShuffleEquivalent(V1, V2, Mask, 4, 12, 5, 13, 6, 14, 7, 15))
9613     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V1, V2);
9614
9615   // Try to use byte rotation instructions.
9616   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
9617           DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
9618     return Rotate;
9619
9620   if (NumV1Inputs + NumV2Inputs <= 4)
9621     return lowerV8I16BasicBlendVectorShuffle(DL, V1, V2, Mask, Subtarget, DAG);
9622
9623   // Check whether an interleaving lowering is likely to be more efficient.
9624   // This isn't perfect but it is a strong heuristic that tends to work well on
9625   // the kinds of shuffles that show up in practice.
9626   //
9627   // FIXME: Handle 1x, 2x, and 4x interleaving.
9628   if (shouldLowerAsInterleaving(Mask)) {
9629     // FIXME: Figure out whether we should pack these into the low or high
9630     // halves.
9631
9632     int EMask[8], OMask[8];
9633     for (int i = 0; i < 4; ++i) {
9634       EMask[i] = Mask[2*i];
9635       OMask[i] = Mask[2*i + 1];
9636       EMask[i + 4] = -1;
9637       OMask[i + 4] = -1;
9638     }
9639
9640     SDValue Evens = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, EMask);
9641     SDValue Odds = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, OMask);
9642
9643     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, Evens, Odds);
9644   }
9645
9646   // If we have direct support for blends, we should lower by decomposing into
9647   // a permute.
9648   if (IsBlendSupported)
9649     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2,
9650                                                       Mask, DAG);
9651
9652   int LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9653   int HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9654
9655   for (int i = 0; i < 4; ++i) {
9656     LoBlendMask[i] = Mask[i];
9657     HiBlendMask[i] = Mask[i + 4];
9658   }
9659
9660   SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, LoBlendMask);
9661   SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, HiBlendMask);
9662   LoV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, LoV);
9663   HiV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, HiV);
9664
9665   return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
9666                      DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, LoV, HiV));
9667 }
9668
9669 /// \brief Check whether a compaction lowering can be done by dropping even
9670 /// elements and compute how many times even elements must be dropped.
9671 ///
9672 /// This handles shuffles which take every Nth element where N is a power of
9673 /// two. Example shuffle masks:
9674 ///
9675 ///  N = 1:  0,  2,  4,  6,  8, 10, 12, 14,  0,  2,  4,  6,  8, 10, 12, 14
9676 ///  N = 1:  0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
9677 ///  N = 2:  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12
9678 ///  N = 2:  0,  4,  8, 12, 16, 20, 24, 28,  0,  4,  8, 12, 16, 20, 24, 28
9679 ///  N = 3:  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8
9680 ///  N = 3:  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24
9681 ///
9682 /// Any of these lanes can of course be undef.
9683 ///
9684 /// This routine only supports N <= 3.
9685 /// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
9686 /// for larger N.
9687 ///
9688 /// \returns N above, or the number of times even elements must be dropped if
9689 /// there is such a number. Otherwise returns zero.
9690 static int canLowerByDroppingEvenElements(ArrayRef<int> Mask) {
9691   // Figure out whether we're looping over two inputs or just one.
9692   bool IsSingleInput = isSingleInputShuffleMask(Mask);
9693
9694   // The modulus for the shuffle vector entries is based on whether this is
9695   // a single input or not.
9696   int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
9697   assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
9698          "We should only be called with masks with a power-of-2 size!");
9699
9700   uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
9701
9702   // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
9703   // and 2^3 simultaneously. This is because we may have ambiguity with
9704   // partially undef inputs.
9705   bool ViableForN[3] = {true, true, true};
9706
9707   for (int i = 0, e = Mask.size(); i < e; ++i) {
9708     // Ignore undef lanes, we'll optimistically collapse them to the pattern we
9709     // want.
9710     if (Mask[i] == -1)
9711       continue;
9712
9713     bool IsAnyViable = false;
9714     for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
9715       if (ViableForN[j]) {
9716         uint64_t N = j + 1;
9717
9718         // The shuffle mask must be equal to (i * 2^N) % M.
9719         if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
9720           IsAnyViable = true;
9721         else
9722           ViableForN[j] = false;
9723       }
9724     // Early exit if we exhaust the possible powers of two.
9725     if (!IsAnyViable)
9726       break;
9727   }
9728
9729   for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
9730     if (ViableForN[j])
9731       return j + 1;
9732
9733   // Return 0 as there is no viable power of two.
9734   return 0;
9735 }
9736
9737 /// \brief Generic lowering of v16i8 shuffles.
9738 ///
9739 /// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
9740 /// detect any complexity reducing interleaving. If that doesn't help, it uses
9741 /// UNPCK to spread the i8 elements across two i16-element vectors, and uses
9742 /// the existing lowering for v8i16 blends on each half, finally PACK-ing them
9743 /// back together.
9744 static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
9745                                        const X86Subtarget *Subtarget,
9746                                        SelectionDAG &DAG) {
9747   SDLoc DL(Op);
9748   assert(Op.getSimpleValueType() == MVT::v16i8 && "Bad shuffle type!");
9749   assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
9750   assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
9751   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
9752   ArrayRef<int> OrigMask = SVOp->getMask();
9753   assert(OrigMask.size() == 16 && "Unexpected mask size for v16 shuffle!");
9754
9755   // Try to use bit shift instructions.
9756   if (SDValue Shift = lowerVectorShuffleAsBitShift(
9757           DL, MVT::v16i8, V1, V2, OrigMask, DAG))
9758     return Shift;
9759
9760   // Try to use byte shift instructions.
9761   if (SDValue Shift = lowerVectorShuffleAsByteShift(
9762           DL, MVT::v16i8, V1, V2, OrigMask, DAG))
9763     return Shift;
9764
9765   // Try to use byte rotation instructions.
9766   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
9767           DL, MVT::v16i8, V1, V2, OrigMask, Subtarget, DAG))
9768     return Rotate;
9769
9770   // Try to use a zext lowering.
9771   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
9772           DL, MVT::v16i8, V1, V2, OrigMask, Subtarget, DAG))
9773     return ZExt;
9774
9775   int MaskStorage[16] = {
9776       OrigMask[0],  OrigMask[1],  OrigMask[2],  OrigMask[3],
9777       OrigMask[4],  OrigMask[5],  OrigMask[6],  OrigMask[7],
9778       OrigMask[8],  OrigMask[9],  OrigMask[10], OrigMask[11],
9779       OrigMask[12], OrigMask[13], OrigMask[14], OrigMask[15]};
9780   MutableArrayRef<int> Mask(MaskStorage);
9781   MutableArrayRef<int> LoMask = Mask.slice(0, 8);
9782   MutableArrayRef<int> HiMask = Mask.slice(8, 8);
9783
9784   int NumV2Elements =
9785       std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 16; });
9786
9787   // For single-input shuffles, there are some nicer lowering tricks we can use.
9788   if (NumV2Elements == 0) {
9789     // Check for being able to broadcast a single element.
9790     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v16i8, DL, V1,
9791                                                           Mask, Subtarget, DAG))
9792       return Broadcast;
9793
9794     // Check whether we can widen this to an i16 shuffle by duplicating bytes.
9795     // Notably, this handles splat and partial-splat shuffles more efficiently.
9796     // However, it only makes sense if the pre-duplication shuffle simplifies
9797     // things significantly. Currently, this means we need to be able to
9798     // express the pre-duplication shuffle as an i16 shuffle.
9799     //
9800     // FIXME: We should check for other patterns which can be widened into an
9801     // i16 shuffle as well.
9802     auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
9803       for (int i = 0; i < 16; i += 2)
9804         if (Mask[i] != -1 && Mask[i + 1] != -1 && Mask[i] != Mask[i + 1])
9805           return false;
9806
9807       return true;
9808     };
9809     auto tryToWidenViaDuplication = [&]() -> SDValue {
9810       if (!canWidenViaDuplication(Mask))
9811         return SDValue();
9812       SmallVector<int, 4> LoInputs;
9813       std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(LoInputs),
9814                    [](int M) { return M >= 0 && M < 8; });
9815       std::sort(LoInputs.begin(), LoInputs.end());
9816       LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
9817                      LoInputs.end());
9818       SmallVector<int, 4> HiInputs;
9819       std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(HiInputs),
9820                    [](int M) { return M >= 8; });
9821       std::sort(HiInputs.begin(), HiInputs.end());
9822       HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
9823                      HiInputs.end());
9824
9825       bool TargetLo = LoInputs.size() >= HiInputs.size();
9826       ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
9827       ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
9828
9829       int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
9830       SmallDenseMap<int, int, 8> LaneMap;
9831       for (int I : InPlaceInputs) {
9832         PreDupI16Shuffle[I/2] = I/2;
9833         LaneMap[I] = I;
9834       }
9835       int j = TargetLo ? 0 : 4, je = j + 4;
9836       for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
9837         // Check if j is already a shuffle of this input. This happens when
9838         // there are two adjacent bytes after we move the low one.
9839         if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
9840           // If we haven't yet mapped the input, search for a slot into which
9841           // we can map it.
9842           while (j < je && PreDupI16Shuffle[j] != -1)
9843             ++j;
9844
9845           if (j == je)
9846             // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
9847             return SDValue();
9848
9849           // Map this input with the i16 shuffle.
9850           PreDupI16Shuffle[j] = MovingInputs[i] / 2;
9851         }
9852
9853         // Update the lane map based on the mapping we ended up with.
9854         LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
9855       }
9856       V1 = DAG.getNode(
9857           ISD::BITCAST, DL, MVT::v16i8,
9858           DAG.getVectorShuffle(MVT::v8i16, DL,
9859                                DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1),
9860                                DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
9861
9862       // Unpack the bytes to form the i16s that will be shuffled into place.
9863       V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
9864                        MVT::v16i8, V1, V1);
9865
9866       int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9867       for (int i = 0; i < 16; ++i)
9868         if (Mask[i] != -1) {
9869           int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
9870           assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
9871           if (PostDupI16Shuffle[i / 2] == -1)
9872             PostDupI16Shuffle[i / 2] = MappedMask;
9873           else
9874             assert(PostDupI16Shuffle[i / 2] == MappedMask &&
9875                    "Conflicting entrties in the original shuffle!");
9876         }
9877       return DAG.getNode(
9878           ISD::BITCAST, DL, MVT::v16i8,
9879           DAG.getVectorShuffle(MVT::v8i16, DL,
9880                                DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1),
9881                                DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
9882     };
9883     if (SDValue V = tryToWidenViaDuplication())
9884       return V;
9885   }
9886
9887   // Check whether an interleaving lowering is likely to be more efficient.
9888   // This isn't perfect but it is a strong heuristic that tends to work well on
9889   // the kinds of shuffles that show up in practice.
9890   //
9891   // FIXME: We need to handle other interleaving widths (i16, i32, ...).
9892   if (shouldLowerAsInterleaving(Mask)) {
9893     int NumLoHalf = std::count_if(Mask.begin(), Mask.end(), [](int M) {
9894       return (M >= 0 && M < 8) || (M >= 16 && M < 24);
9895     });
9896     int NumHiHalf = std::count_if(Mask.begin(), Mask.end(), [](int M) {
9897       return (M >= 8 && M < 16) || M >= 24;
9898     });
9899     int EMask[16] = {-1, -1, -1, -1, -1, -1, -1, -1,
9900                      -1, -1, -1, -1, -1, -1, -1, -1};
9901     int OMask[16] = {-1, -1, -1, -1, -1, -1, -1, -1,
9902                      -1, -1, -1, -1, -1, -1, -1, -1};
9903     bool UnpackLo = NumLoHalf >= NumHiHalf;
9904     MutableArrayRef<int> TargetEMask(UnpackLo ? EMask : EMask + 8, 8);
9905     MutableArrayRef<int> TargetOMask(UnpackLo ? OMask : OMask + 8, 8);
9906     for (int i = 0; i < 8; ++i) {
9907       TargetEMask[i] = Mask[2 * i];
9908       TargetOMask[i] = Mask[2 * i + 1];
9909     }
9910
9911     SDValue Evens = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, EMask);
9912     SDValue Odds = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, OMask);
9913
9914     return DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
9915                        MVT::v16i8, Evens, Odds);
9916   }
9917
9918   // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
9919   // with PSHUFB. It is important to do this before we attempt to generate any
9920   // blends but after all of the single-input lowerings. If the single input
9921   // lowerings can find an instruction sequence that is faster than a PSHUFB, we
9922   // want to preserve that and we can DAG combine any longer sequences into
9923   // a PSHUFB in the end. But once we start blending from multiple inputs,
9924   // the complexity of DAG combining bad patterns back into PSHUFB is too high,
9925   // and there are *very* few patterns that would actually be faster than the
9926   // PSHUFB approach because of its ability to zero lanes.
9927   //
9928   // FIXME: The only exceptions to the above are blends which are exact
9929   // interleavings with direct instructions supporting them. We currently don't
9930   // handle those well here.
9931   if (Subtarget->hasSSSE3()) {
9932     SDValue V1Mask[16];
9933     SDValue V2Mask[16];
9934     bool V1InUse = false;
9935     bool V2InUse = false;
9936     SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
9937
9938     for (int i = 0; i < 16; ++i) {
9939       if (Mask[i] == -1) {
9940         V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8);
9941       } else {
9942         const int ZeroMask = 0x80;
9943         int V1Idx = (Mask[i] < 16 ? Mask[i] : ZeroMask);
9944         int V2Idx = (Mask[i] < 16 ? ZeroMask : Mask[i] - 16);
9945         if (Zeroable[i])
9946           V1Idx = V2Idx = ZeroMask;
9947         V1Mask[i] = DAG.getConstant(V1Idx, MVT::i8);
9948         V2Mask[i] = DAG.getConstant(V2Idx, MVT::i8);
9949         V1InUse |= (ZeroMask != V1Idx);
9950         V2InUse |= (ZeroMask != V2Idx);
9951       }
9952     }
9953
9954     if (V1InUse)
9955       V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, V1,
9956                        DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V1Mask));
9957     if (V2InUse)
9958       V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, V2,
9959                        DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V2Mask));
9960
9961     // If we need shuffled inputs from both, blend the two.
9962     if (V1InUse && V2InUse)
9963       return DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
9964     if (V1InUse)
9965       return V1; // Single inputs are easy.
9966     if (V2InUse)
9967       return V2; // Single inputs are easy.
9968     // Shuffling to a zeroable vector.
9969     return getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
9970   }
9971
9972   // There are special ways we can lower some single-element blends.
9973   if (NumV2Elements == 1)
9974     if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v16i8, DL, V1, V2,
9975                                                          Mask, Subtarget, DAG))
9976       return V;
9977
9978   // Check whether a compaction lowering can be done. This handles shuffles
9979   // which take every Nth element for some even N. See the helper function for
9980   // details.
9981   //
9982   // We special case these as they can be particularly efficiently handled with
9983   // the PACKUSB instruction on x86 and they show up in common patterns of
9984   // rearranging bytes to truncate wide elements.
9985   if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask)) {
9986     // NumEvenDrops is the power of two stride of the elements. Another way of
9987     // thinking about it is that we need to drop the even elements this many
9988     // times to get the original input.
9989     bool IsSingleInput = isSingleInputShuffleMask(Mask);
9990
9991     // First we need to zero all the dropped bytes.
9992     assert(NumEvenDrops <= 3 &&
9993            "No support for dropping even elements more than 3 times.");
9994     // We use the mask type to pick which bytes are preserved based on how many
9995     // elements are dropped.
9996     MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 };
9997     SDValue ByteClearMask =
9998         DAG.getNode(ISD::BITCAST, DL, MVT::v16i8,
9999                     DAG.getConstant(0xFF, MaskVTs[NumEvenDrops - 1]));
10000     V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);
10001     if (!IsSingleInput)
10002       V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);
10003
10004     // Now pack things back together.
10005     V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1);
10006     V2 = IsSingleInput ? V1 : DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V2);
10007     SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2);
10008     for (int i = 1; i < NumEvenDrops; ++i) {
10009       Result = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, Result);
10010       Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
10011     }
10012
10013     return Result;
10014   }
10015
10016   int V1LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
10017   int V1HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
10018   int V2LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
10019   int V2HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
10020
10021   auto buildBlendMasks = [](MutableArrayRef<int> HalfMask,
10022                             MutableArrayRef<int> V1HalfBlendMask,
10023                             MutableArrayRef<int> V2HalfBlendMask) {
10024     for (int i = 0; i < 8; ++i)
10025       if (HalfMask[i] >= 0 && HalfMask[i] < 16) {
10026         V1HalfBlendMask[i] = HalfMask[i];
10027         HalfMask[i] = i;
10028       } else if (HalfMask[i] >= 16) {
10029         V2HalfBlendMask[i] = HalfMask[i] - 16;
10030         HalfMask[i] = i + 8;
10031       }
10032   };
10033   buildBlendMasks(LoMask, V1LoBlendMask, V2LoBlendMask);
10034   buildBlendMasks(HiMask, V1HiBlendMask, V2HiBlendMask);
10035
10036   SDValue Zero = getZeroVector(MVT::v8i16, Subtarget, DAG, DL);
10037
10038   auto buildLoAndHiV8s = [&](SDValue V, MutableArrayRef<int> LoBlendMask,
10039                              MutableArrayRef<int> HiBlendMask) {
10040     SDValue V1, V2;
10041     // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
10042     // them out and avoid using UNPCK{L,H} to extract the elements of V as
10043     // i16s.
10044     if (std::none_of(LoBlendMask.begin(), LoBlendMask.end(),
10045                      [](int M) { return M >= 0 && M % 2 == 1; }) &&
10046         std::none_of(HiBlendMask.begin(), HiBlendMask.end(),
10047                      [](int M) { return M >= 0 && M % 2 == 1; })) {
10048       // Use a mask to drop the high bytes.
10049       V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V);
10050       V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, V1,
10051                        DAG.getConstant(0x00FF, MVT::v8i16));
10052
10053       // This will be a single vector shuffle instead of a blend so nuke V2.
10054       V2 = DAG.getUNDEF(MVT::v8i16);
10055
10056       // Squash the masks to point directly into V1.
10057       for (int &M : LoBlendMask)
10058         if (M >= 0)
10059           M /= 2;
10060       for (int &M : HiBlendMask)
10061         if (M >= 0)
10062           M /= 2;
10063     } else {
10064       // Otherwise just unpack the low half of V into V1 and the high half into
10065       // V2 so that we can blend them as i16s.
10066       V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
10067                        DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
10068       V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
10069                        DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
10070     }
10071
10072     SDValue BlendedLo = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, LoBlendMask);
10073     SDValue BlendedHi = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, HiBlendMask);
10074     return std::make_pair(BlendedLo, BlendedHi);
10075   };
10076   SDValue V1Lo, V1Hi, V2Lo, V2Hi;
10077   std::tie(V1Lo, V1Hi) = buildLoAndHiV8s(V1, V1LoBlendMask, V1HiBlendMask);
10078   std::tie(V2Lo, V2Hi) = buildLoAndHiV8s(V2, V2LoBlendMask, V2HiBlendMask);
10079
10080   SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, V1Lo, V2Lo, LoMask);
10081   SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, V1Hi, V2Hi, HiMask);
10082
10083   return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
10084 }
10085
10086 /// \brief Dispatching routine to lower various 128-bit x86 vector shuffles.
10087 ///
10088 /// This routine breaks down the specific type of 128-bit shuffle and
10089 /// dispatches to the lowering routines accordingly.
10090 static SDValue lower128BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10091                                         MVT VT, const X86Subtarget *Subtarget,
10092                                         SelectionDAG &DAG) {
10093   switch (VT.SimpleTy) {
10094   case MVT::v2i64:
10095     return lowerV2I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
10096   case MVT::v2f64:
10097     return lowerV2F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
10098   case MVT::v4i32:
10099     return lowerV4I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
10100   case MVT::v4f32:
10101     return lowerV4F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
10102   case MVT::v8i16:
10103     return lowerV8I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
10104   case MVT::v16i8:
10105     return lowerV16I8VectorShuffle(Op, V1, V2, Subtarget, DAG);
10106
10107   default:
10108     llvm_unreachable("Unimplemented!");
10109   }
10110 }
10111
10112 /// \brief Helper function to test whether a shuffle mask could be
10113 /// simplified by widening the elements being shuffled.
10114 ///
10115 /// Appends the mask for wider elements in WidenedMask if valid. Otherwise
10116 /// leaves it in an unspecified state.
10117 ///
10118 /// NOTE: This must handle normal vector shuffle masks and *target* vector
10119 /// shuffle masks. The latter have the special property of a '-2' representing
10120 /// a zero-ed lane of a vector.
10121 static bool canWidenShuffleElements(ArrayRef<int> Mask,
10122                                     SmallVectorImpl<int> &WidenedMask) {
10123   for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
10124     // If both elements are undef, its trivial.
10125     if (Mask[i] == SM_SentinelUndef && Mask[i + 1] == SM_SentinelUndef) {
10126       WidenedMask.push_back(SM_SentinelUndef);
10127       continue;
10128     }
10129
10130     // Check for an undef mask and a mask value properly aligned to fit with
10131     // a pair of values. If we find such a case, use the non-undef mask's value.
10132     if (Mask[i] == SM_SentinelUndef && Mask[i + 1] >= 0 && Mask[i + 1] % 2 == 1) {
10133       WidenedMask.push_back(Mask[i + 1] / 2);
10134       continue;
10135     }
10136     if (Mask[i + 1] == SM_SentinelUndef && Mask[i] >= 0 && Mask[i] % 2 == 0) {
10137       WidenedMask.push_back(Mask[i] / 2);
10138       continue;
10139     }
10140
10141     // When zeroing, we need to spread the zeroing across both lanes to widen.
10142     if (Mask[i] == SM_SentinelZero || Mask[i + 1] == SM_SentinelZero) {
10143       if ((Mask[i] == SM_SentinelZero || Mask[i] == SM_SentinelUndef) &&
10144           (Mask[i + 1] == SM_SentinelZero || Mask[i + 1] == SM_SentinelUndef)) {
10145         WidenedMask.push_back(SM_SentinelZero);
10146         continue;
10147       }
10148       return false;
10149     }
10150
10151     // Finally check if the two mask values are adjacent and aligned with
10152     // a pair.
10153     if (Mask[i] != SM_SentinelUndef && Mask[i] % 2 == 0 && Mask[i] + 1 == Mask[i + 1]) {
10154       WidenedMask.push_back(Mask[i] / 2);
10155       continue;
10156     }
10157
10158     // Otherwise we can't safely widen the elements used in this shuffle.
10159     return false;
10160   }
10161   assert(WidenedMask.size() == Mask.size() / 2 &&
10162          "Incorrect size of mask after widening the elements!");
10163
10164   return true;
10165 }
10166
10167 /// \brief Generic routine to split vector shuffle into half-sized shuffles.
10168 ///
10169 /// This routine just extracts two subvectors, shuffles them independently, and
10170 /// then concatenates them back together. This should work effectively with all
10171 /// AVX vector shuffle types.
10172 static SDValue splitAndLowerVectorShuffle(SDLoc DL, MVT VT, SDValue V1,
10173                                           SDValue V2, ArrayRef<int> Mask,
10174                                           SelectionDAG &DAG) {
10175   assert(VT.getSizeInBits() >= 256 &&
10176          "Only for 256-bit or wider vector shuffles!");
10177   assert(V1.getSimpleValueType() == VT && "Bad operand type!");
10178   assert(V2.getSimpleValueType() == VT && "Bad operand type!");
10179
10180   ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
10181   ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
10182
10183   int NumElements = VT.getVectorNumElements();
10184   int SplitNumElements = NumElements / 2;
10185   MVT ScalarVT = VT.getScalarType();
10186   MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2);
10187
10188   // Rather than splitting build-vectors, just build two narrower build
10189   // vectors. This helps shuffling with splats and zeros.
10190   auto SplitVector = [&](SDValue V) {
10191     while (V.getOpcode() == ISD::BITCAST)
10192       V = V->getOperand(0);
10193
10194     MVT OrigVT = V.getSimpleValueType();
10195     int OrigNumElements = OrigVT.getVectorNumElements();
10196     int OrigSplitNumElements = OrigNumElements / 2;
10197     MVT OrigScalarVT = OrigVT.getScalarType();
10198     MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2);
10199
10200     SDValue LoV, HiV;
10201
10202     auto *BV = dyn_cast<BuildVectorSDNode>(V);
10203     if (!BV) {
10204       LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
10205                         DAG.getIntPtrConstant(0));
10206       HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
10207                         DAG.getIntPtrConstant(OrigSplitNumElements));
10208     } else {
10209
10210       SmallVector<SDValue, 16> LoOps, HiOps;
10211       for (int i = 0; i < OrigSplitNumElements; ++i) {
10212         LoOps.push_back(BV->getOperand(i));
10213         HiOps.push_back(BV->getOperand(i + OrigSplitNumElements));
10214       }
10215       LoV = DAG.getNode(ISD::BUILD_VECTOR, DL, OrigSplitVT, LoOps);
10216       HiV = DAG.getNode(ISD::BUILD_VECTOR, DL, OrigSplitVT, HiOps);
10217     }
10218     return std::make_pair(DAG.getNode(ISD::BITCAST, DL, SplitVT, LoV),
10219                           DAG.getNode(ISD::BITCAST, DL, SplitVT, HiV));
10220   };
10221
10222   SDValue LoV1, HiV1, LoV2, HiV2;
10223   std::tie(LoV1, HiV1) = SplitVector(V1);
10224   std::tie(LoV2, HiV2) = SplitVector(V2);
10225
10226   // Now create two 4-way blends of these half-width vectors.
10227   auto HalfBlend = [&](ArrayRef<int> HalfMask) {
10228     bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
10229     SmallVector<int, 32> V1BlendMask, V2BlendMask, BlendMask;
10230     for (int i = 0; i < SplitNumElements; ++i) {
10231       int M = HalfMask[i];
10232       if (M >= NumElements) {
10233         if (M >= NumElements + SplitNumElements)
10234           UseHiV2 = true;
10235         else
10236           UseLoV2 = true;
10237         V2BlendMask.push_back(M - NumElements);
10238         V1BlendMask.push_back(-1);
10239         BlendMask.push_back(SplitNumElements + i);
10240       } else if (M >= 0) {
10241         if (M >= SplitNumElements)
10242           UseHiV1 = true;
10243         else
10244           UseLoV1 = true;
10245         V2BlendMask.push_back(-1);
10246         V1BlendMask.push_back(M);
10247         BlendMask.push_back(i);
10248       } else {
10249         V2BlendMask.push_back(-1);
10250         V1BlendMask.push_back(-1);
10251         BlendMask.push_back(-1);
10252       }
10253     }
10254
10255     // Because the lowering happens after all combining takes place, we need to
10256     // manually combine these blend masks as much as possible so that we create
10257     // a minimal number of high-level vector shuffle nodes.
10258
10259     // First try just blending the halves of V1 or V2.
10260     if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
10261       return DAG.getUNDEF(SplitVT);
10262     if (!UseLoV2 && !UseHiV2)
10263       return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
10264     if (!UseLoV1 && !UseHiV1)
10265       return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
10266
10267     SDValue V1Blend, V2Blend;
10268     if (UseLoV1 && UseHiV1) {
10269       V1Blend =
10270         DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
10271     } else {
10272       // We only use half of V1 so map the usage down into the final blend mask.
10273       V1Blend = UseLoV1 ? LoV1 : HiV1;
10274       for (int i = 0; i < SplitNumElements; ++i)
10275         if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
10276           BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
10277     }
10278     if (UseLoV2 && UseHiV2) {
10279       V2Blend =
10280         DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
10281     } else {
10282       // We only use half of V2 so map the usage down into the final blend mask.
10283       V2Blend = UseLoV2 ? LoV2 : HiV2;
10284       for (int i = 0; i < SplitNumElements; ++i)
10285         if (BlendMask[i] >= SplitNumElements)
10286           BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
10287     }
10288     return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
10289   };
10290   SDValue Lo = HalfBlend(LoMask);
10291   SDValue Hi = HalfBlend(HiMask);
10292   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
10293 }
10294
10295 /// \brief Either split a vector in halves or decompose the shuffles and the
10296 /// blend.
10297 ///
10298 /// This is provided as a good fallback for many lowerings of non-single-input
10299 /// shuffles with more than one 128-bit lane. In those cases, we want to select
10300 /// between splitting the shuffle into 128-bit components and stitching those
10301 /// back together vs. extracting the single-input shuffles and blending those
10302 /// results.
10303 static SDValue lowerVectorShuffleAsSplitOrBlend(SDLoc DL, MVT VT, SDValue V1,
10304                                                 SDValue V2, ArrayRef<int> Mask,
10305                                                 SelectionDAG &DAG) {
10306   assert(!isSingleInputShuffleMask(Mask) && "This routine must not be used to "
10307                                             "lower single-input shuffles as it "
10308                                             "could then recurse on itself.");
10309   int Size = Mask.size();
10310
10311   // If this can be modeled as a broadcast of two elements followed by a blend,
10312   // prefer that lowering. This is especially important because broadcasts can
10313   // often fold with memory operands.
10314   auto DoBothBroadcast = [&] {
10315     int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
10316     for (int M : Mask)
10317       if (M >= Size) {
10318         if (V2BroadcastIdx == -1)
10319           V2BroadcastIdx = M - Size;
10320         else if (M - Size != V2BroadcastIdx)
10321           return false;
10322       } else if (M >= 0) {
10323         if (V1BroadcastIdx == -1)
10324           V1BroadcastIdx = M;
10325         else if (M != V1BroadcastIdx)
10326           return false;
10327       }
10328     return true;
10329   };
10330   if (DoBothBroadcast())
10331     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
10332                                                       DAG);
10333
10334   // If the inputs all stem from a single 128-bit lane of each input, then we
10335   // split them rather than blending because the split will decompose to
10336   // unusually few instructions.
10337   int LaneCount = VT.getSizeInBits() / 128;
10338   int LaneSize = Size / LaneCount;
10339   SmallBitVector LaneInputs[2];
10340   LaneInputs[0].resize(LaneCount, false);
10341   LaneInputs[1].resize(LaneCount, false);
10342   for (int i = 0; i < Size; ++i)
10343     if (Mask[i] >= 0)
10344       LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
10345   if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
10346     return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
10347
10348   // Otherwise, just fall back to decomposed shuffles and a blend. This requires
10349   // that the decomposed single-input shuffles don't end up here.
10350   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
10351 }
10352
10353 /// \brief Lower a vector shuffle crossing multiple 128-bit lanes as
10354 /// a permutation and blend of those lanes.
10355 ///
10356 /// This essentially blends the out-of-lane inputs to each lane into the lane
10357 /// from a permuted copy of the vector. This lowering strategy results in four
10358 /// instructions in the worst case for a single-input cross lane shuffle which
10359 /// is lower than any other fully general cross-lane shuffle strategy I'm aware
10360 /// of. Special cases for each particular shuffle pattern should be handled
10361 /// prior to trying this lowering.
10362 static SDValue lowerVectorShuffleAsLanePermuteAndBlend(SDLoc DL, MVT VT,
10363                                                        SDValue V1, SDValue V2,
10364                                                        ArrayRef<int> Mask,
10365                                                        SelectionDAG &DAG) {
10366   // FIXME: This should probably be generalized for 512-bit vectors as well.
10367   assert(VT.getSizeInBits() == 256 && "Only for 256-bit vector shuffles!");
10368   int LaneSize = Mask.size() / 2;
10369
10370   // If there are only inputs from one 128-bit lane, splitting will in fact be
10371   // less expensive. The flags track wether the given lane contains an element
10372   // that crosses to another lane.
10373   bool LaneCrossing[2] = {false, false};
10374   for (int i = 0, Size = Mask.size(); i < Size; ++i)
10375     if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
10376       LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
10377   if (!LaneCrossing[0] || !LaneCrossing[1])
10378     return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
10379
10380   if (isSingleInputShuffleMask(Mask)) {
10381     SmallVector<int, 32> FlippedBlendMask;
10382     for (int i = 0, Size = Mask.size(); i < Size; ++i)
10383       FlippedBlendMask.push_back(
10384           Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize)
10385                                   ? Mask[i]
10386                                   : Mask[i] % LaneSize +
10387                                         (i / LaneSize) * LaneSize + Size));
10388
10389     // Flip the vector, and blend the results which should now be in-lane. The
10390     // VPERM2X128 mask uses the low 2 bits for the low source and bits 4 and
10391     // 5 for the high source. The value 3 selects the high half of source 2 and
10392     // the value 2 selects the low half of source 2. We only use source 2 to
10393     // allow folding it into a memory operand.
10394     unsigned PERMMask = 3 | 2 << 4;
10395     SDValue Flipped = DAG.getNode(X86ISD::VPERM2X128, DL, VT, DAG.getUNDEF(VT),
10396                                   V1, DAG.getConstant(PERMMask, MVT::i8));
10397     return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask);
10398   }
10399
10400   // This now reduces to two single-input shuffles of V1 and V2 which at worst
10401   // will be handled by the above logic and a blend of the results, much like
10402   // other patterns in AVX.
10403   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
10404 }
10405
10406 /// \brief Handle lowering 2-lane 128-bit shuffles.
10407 static SDValue lowerV2X128VectorShuffle(SDLoc DL, MVT VT, SDValue V1,
10408                                         SDValue V2, ArrayRef<int> Mask,
10409                                         const X86Subtarget *Subtarget,
10410                                         SelectionDAG &DAG) {
10411   // Blends are faster and handle all the non-lane-crossing cases.
10412   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
10413                                                 Subtarget, DAG))
10414     return Blend;
10415
10416   MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
10417                                VT.getVectorNumElements() / 2);
10418   // Check for patterns which can be matched with a single insert of a 128-bit
10419   // subvector.
10420   if (isShuffleEquivalent(V1, V2, Mask, 0, 1, 0, 1) ||
10421       isShuffleEquivalent(V1, V2, Mask, 0, 1, 4, 5)) {
10422     SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
10423                               DAG.getIntPtrConstant(0));
10424     SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
10425                               Mask[2] < 4 ? V1 : V2, DAG.getIntPtrConstant(0));
10426     return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
10427   }
10428   if (isShuffleEquivalent(V1, V2, Mask, 0, 1, 6, 7)) {
10429     SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
10430                               DAG.getIntPtrConstant(0));
10431     SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
10432                               DAG.getIntPtrConstant(2));
10433     return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
10434   }
10435
10436   // Otherwise form a 128-bit permutation.
10437   // FIXME: Detect zero-vector inputs and use the VPERM2X128 to zero that half.
10438   unsigned PermMask = Mask[0] / 2 | (Mask[2] / 2) << 4;
10439   return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
10440                      DAG.getConstant(PermMask, MVT::i8));
10441 }
10442
10443 /// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then
10444 /// shuffling each lane.
10445 ///
10446 /// This will only succeed when the result of fixing the 128-bit lanes results
10447 /// in a single-input non-lane-crossing shuffle with a repeating shuffle mask in
10448 /// each 128-bit lanes. This handles many cases where we can quickly blend away
10449 /// the lane crosses early and then use simpler shuffles within each lane.
10450 ///
10451 /// FIXME: It might be worthwhile at some point to support this without
10452 /// requiring the 128-bit lane-relative shuffles to be repeating, but currently
10453 /// in x86 only floating point has interesting non-repeating shuffles, and even
10454 /// those are still *marginally* more expensive.
10455 static SDValue lowerVectorShuffleByMerging128BitLanes(
10456     SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
10457     const X86Subtarget *Subtarget, SelectionDAG &DAG) {
10458   assert(!isSingleInputShuffleMask(Mask) &&
10459          "This is only useful with multiple inputs.");
10460
10461   int Size = Mask.size();
10462   int LaneSize = 128 / VT.getScalarSizeInBits();
10463   int NumLanes = Size / LaneSize;
10464   assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles.");
10465
10466   // See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also
10467   // check whether the in-128-bit lane shuffles share a repeating pattern.
10468   SmallVector<int, 4> Lanes;
10469   Lanes.resize(NumLanes, -1);
10470   SmallVector<int, 4> InLaneMask;
10471   InLaneMask.resize(LaneSize, -1);
10472   for (int i = 0; i < Size; ++i) {
10473     if (Mask[i] < 0)
10474       continue;
10475
10476     int j = i / LaneSize;
10477
10478     if (Lanes[j] < 0) {
10479       // First entry we've seen for this lane.
10480       Lanes[j] = Mask[i] / LaneSize;
10481     } else if (Lanes[j] != Mask[i] / LaneSize) {
10482       // This doesn't match the lane selected previously!
10483       return SDValue();
10484     }
10485
10486     // Check that within each lane we have a consistent shuffle mask.
10487     int k = i % LaneSize;
10488     if (InLaneMask[k] < 0) {
10489       InLaneMask[k] = Mask[i] % LaneSize;
10490     } else if (InLaneMask[k] != Mask[i] % LaneSize) {
10491       // This doesn't fit a repeating in-lane mask.
10492       return SDValue();
10493     }
10494   }
10495
10496   // First shuffle the lanes into place.
10497   MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64,
10498                                 VT.getSizeInBits() / 64);
10499   SmallVector<int, 8> LaneMask;
10500   LaneMask.resize(NumLanes * 2, -1);
10501   for (int i = 0; i < NumLanes; ++i)
10502     if (Lanes[i] >= 0) {
10503       LaneMask[2 * i + 0] = 2*Lanes[i] + 0;
10504       LaneMask[2 * i + 1] = 2*Lanes[i] + 1;
10505     }
10506
10507   V1 = DAG.getNode(ISD::BITCAST, DL, LaneVT, V1);
10508   V2 = DAG.getNode(ISD::BITCAST, DL, LaneVT, V2);
10509   SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask);
10510
10511   // Cast it back to the type we actually want.
10512   LaneShuffle = DAG.getNode(ISD::BITCAST, DL, VT, LaneShuffle);
10513
10514   // Now do a simple shuffle that isn't lane crossing.
10515   SmallVector<int, 8> NewMask;
10516   NewMask.resize(Size, -1);
10517   for (int i = 0; i < Size; ++i)
10518     if (Mask[i] >= 0)
10519       NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize;
10520   assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) &&
10521          "Must not introduce lane crosses at this point!");
10522
10523   return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask);
10524 }
10525
10526 /// \brief Test whether the specified input (0 or 1) is in-place blended by the
10527 /// given mask.
10528 ///
10529 /// This returns true if the elements from a particular input are already in the
10530 /// slot required by the given mask and require no permutation.
10531 static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
10532   assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
10533   int Size = Mask.size();
10534   for (int i = 0; i < Size; ++i)
10535     if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
10536       return false;
10537
10538   return true;
10539 }
10540
10541 /// \brief Handle lowering of 4-lane 64-bit floating point shuffles.
10542 ///
10543 /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
10544 /// isn't available.
10545 static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10546                                        const X86Subtarget *Subtarget,
10547                                        SelectionDAG &DAG) {
10548   SDLoc DL(Op);
10549   assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
10550   assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
10551   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10552   ArrayRef<int> Mask = SVOp->getMask();
10553   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
10554
10555   SmallVector<int, 4> WidenedMask;
10556   if (canWidenShuffleElements(Mask, WidenedMask))
10557     return lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask, Subtarget,
10558                                     DAG);
10559
10560   if (isSingleInputShuffleMask(Mask)) {
10561     // Check for being able to broadcast a single element.
10562     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4f64, DL, V1,
10563                                                           Mask, Subtarget, DAG))
10564       return Broadcast;
10565
10566     // Use low duplicate instructions for masks that match their pattern.
10567     if (isShuffleEquivalent(V1, V2, Mask, 0, 0, 2, 2))
10568       return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
10569
10570     if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
10571       // Non-half-crossing single input shuffles can be lowerid with an
10572       // interleaved permutation.
10573       unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
10574                               ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
10575       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
10576                          DAG.getConstant(VPERMILPMask, MVT::i8));
10577     }
10578
10579     // With AVX2 we have direct support for this permutation.
10580     if (Subtarget->hasAVX2())
10581       return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
10582                          getV4X86ShuffleImm8ForMask(Mask, DAG));
10583
10584     // Otherwise, fall back.
10585     return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask,
10586                                                    DAG);
10587   }
10588
10589   // X86 has dedicated unpack instructions that can handle specific blend
10590   // operations: UNPCKH and UNPCKL.
10591   if (isShuffleEquivalent(V1, V2, Mask, 0, 4, 2, 6))
10592     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f64, V1, V2);
10593   if (isShuffleEquivalent(V1, V2, Mask, 1, 5, 3, 7))
10594     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f64, V1, V2);
10595
10596   // If we have a single input to the zero element, insert that into V1 if we
10597   // can do so cheaply.
10598   int NumV2Elements =
10599       std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
10600   if (NumV2Elements == 1 && Mask[0] >= 4)
10601     if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
10602             MVT::v4f64, DL, V1, V2, Mask, Subtarget, DAG))
10603       return Insertion;
10604
10605   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
10606                                                 Subtarget, DAG))
10607     return Blend;
10608
10609   // Check if the blend happens to exactly fit that of SHUFPD.
10610   if ((Mask[0] == -1 || Mask[0] < 2) &&
10611       (Mask[1] == -1 || (Mask[1] >= 4 && Mask[1] < 6)) &&
10612       (Mask[2] == -1 || (Mask[2] >= 2 && Mask[2] < 4)) &&
10613       (Mask[3] == -1 || Mask[3] >= 6)) {
10614     unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 5) << 1) |
10615                           ((Mask[2] == 3) << 2) | ((Mask[3] == 7) << 3);
10616     return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f64, V1, V2,
10617                        DAG.getConstant(SHUFPDMask, MVT::i8));
10618   }
10619   if ((Mask[0] == -1 || (Mask[0] >= 4 && Mask[0] < 6)) &&
10620       (Mask[1] == -1 || Mask[1] < 2) &&
10621       (Mask[2] == -1 || Mask[2] >= 6) &&
10622       (Mask[3] == -1 || (Mask[3] >= 2 && Mask[3] < 4))) {
10623     unsigned SHUFPDMask = (Mask[0] == 5) | ((Mask[1] == 1) << 1) |
10624                           ((Mask[2] == 7) << 2) | ((Mask[3] == 3) << 3);
10625     return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f64, V2, V1,
10626                        DAG.getConstant(SHUFPDMask, MVT::i8));
10627   }
10628
10629   // Try to simplify this by merging 128-bit lanes to enable a lane-based
10630   // shuffle. However, if we have AVX2 and either inputs are already in place,
10631   // we will be able to shuffle even across lanes the other input in a single
10632   // instruction so skip this pattern.
10633   if (!(Subtarget->hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
10634                                  isShuffleMaskInputInPlace(1, Mask))))
10635     if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
10636             DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
10637       return Result;
10638
10639   // If we have AVX2 then we always want to lower with a blend because an v4 we
10640   // can fully permute the elements.
10641   if (Subtarget->hasAVX2())
10642     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,
10643                                                       Mask, DAG);
10644
10645   // Otherwise fall back on generic lowering.
10646   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG);
10647 }
10648
10649 /// \brief Handle lowering of 4-lane 64-bit integer shuffles.
10650 ///
10651 /// This routine is only called when we have AVX2 and thus a reasonable
10652 /// instruction set for v4i64 shuffling..
10653 static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10654                                        const X86Subtarget *Subtarget,
10655                                        SelectionDAG &DAG) {
10656   SDLoc DL(Op);
10657   assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
10658   assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
10659   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10660   ArrayRef<int> Mask = SVOp->getMask();
10661   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
10662   assert(Subtarget->hasAVX2() && "We can only lower v4i64 with AVX2!");
10663
10664   SmallVector<int, 4> WidenedMask;
10665   if (canWidenShuffleElements(Mask, WidenedMask))
10666     return lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask, Subtarget,
10667                                     DAG);
10668
10669   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
10670                                                 Subtarget, DAG))
10671     return Blend;
10672
10673   // Check for being able to broadcast a single element.
10674   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4i64, DL, V1,
10675                                                         Mask, Subtarget, DAG))
10676     return Broadcast;
10677
10678   // When the shuffle is mirrored between the 128-bit lanes of the unit, we can
10679   // use lower latency instructions that will operate on both 128-bit lanes.
10680   SmallVector<int, 2> RepeatedMask;
10681   if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
10682     if (isSingleInputShuffleMask(Mask)) {
10683       int PSHUFDMask[] = {-1, -1, -1, -1};
10684       for (int i = 0; i < 2; ++i)
10685         if (RepeatedMask[i] >= 0) {
10686           PSHUFDMask[2 * i] = 2 * RepeatedMask[i];
10687           PSHUFDMask[2 * i + 1] = 2 * RepeatedMask[i] + 1;
10688         }
10689       return DAG.getNode(
10690           ISD::BITCAST, DL, MVT::v4i64,
10691           DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
10692                       DAG.getNode(ISD::BITCAST, DL, MVT::v8i32, V1),
10693                       getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
10694     }
10695   }
10696
10697   // AVX2 provides a direct instruction for permuting a single input across
10698   // lanes.
10699   if (isSingleInputShuffleMask(Mask))
10700     return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
10701                        getV4X86ShuffleImm8ForMask(Mask, DAG));
10702
10703   // Try to use byte shift instructions.
10704   if (SDValue Shift = lowerVectorShuffleAsByteShift(
10705           DL, MVT::v4i64, V1, V2, Mask, DAG))
10706     return Shift;
10707
10708   // Use dedicated unpack instructions for masks that match their pattern.
10709   if (isShuffleEquivalent(V1, V2, Mask, 0, 4, 2, 6))
10710     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V1, V2);
10711   if (isShuffleEquivalent(V1, V2, Mask, 1, 5, 3, 7))
10712     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V1, V2);
10713
10714   // Try to simplify this by merging 128-bit lanes to enable a lane-based
10715   // shuffle. However, if we have AVX2 and either inputs are already in place,
10716   // we will be able to shuffle even across lanes the other input in a single
10717   // instruction so skip this pattern.
10718   if (!(Subtarget->hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
10719                                  isShuffleMaskInputInPlace(1, Mask))))
10720     if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
10721             DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
10722       return Result;
10723
10724   // Otherwise fall back on generic blend lowering.
10725   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2,
10726                                                     Mask, DAG);
10727 }
10728
10729 /// \brief Handle lowering of 8-lane 32-bit floating point shuffles.
10730 ///
10731 /// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
10732 /// isn't available.
10733 static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10734                                        const X86Subtarget *Subtarget,
10735                                        SelectionDAG &DAG) {
10736   SDLoc DL(Op);
10737   assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
10738   assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
10739   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10740   ArrayRef<int> Mask = SVOp->getMask();
10741   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
10742
10743   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
10744                                                 Subtarget, DAG))
10745     return Blend;
10746
10747   // Check for being able to broadcast a single element.
10748   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8f32, DL, V1,
10749                                                         Mask, Subtarget, DAG))
10750     return Broadcast;
10751
10752   // If the shuffle mask is repeated in each 128-bit lane, we have many more
10753   // options to efficiently lower the shuffle.
10754   SmallVector<int, 4> RepeatedMask;
10755   if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
10756     assert(RepeatedMask.size() == 4 &&
10757            "Repeated masks must be half the mask width!");
10758
10759     // Use even/odd duplicate instructions for masks that match their pattern.
10760     if (isShuffleEquivalent(V1, V2, Mask, 0, 0, 2, 2, 4, 4, 6, 6))
10761       return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
10762     if (isShuffleEquivalent(V1, V2, Mask, 1, 1, 3, 3, 5, 5, 7, 7))
10763       return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
10764
10765     if (isSingleInputShuffleMask(Mask))
10766       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
10767                          getV4X86ShuffleImm8ForMask(RepeatedMask, DAG));
10768
10769     // Use dedicated unpack instructions for masks that match their pattern.
10770     if (isShuffleEquivalent(V1, V2, Mask, 0, 8, 1, 9, 4, 12, 5, 13))
10771       return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f32, V1, V2);
10772     if (isShuffleEquivalent(V1, V2, Mask, 2, 10, 3, 11, 6, 14, 7, 15))
10773       return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f32, V1, V2);
10774
10775     // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
10776     // have already handled any direct blends. We also need to squash the
10777     // repeated mask into a simulated v4f32 mask.
10778     for (int i = 0; i < 4; ++i)
10779       if (RepeatedMask[i] >= 8)
10780         RepeatedMask[i] -= 4;
10781     return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
10782   }
10783
10784   // If we have a single input shuffle with different shuffle patterns in the
10785   // two 128-bit lanes use the variable mask to VPERMILPS.
10786   if (isSingleInputShuffleMask(Mask)) {
10787     SDValue VPermMask[8];
10788     for (int i = 0; i < 8; ++i)
10789       VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32)
10790                                  : DAG.getConstant(Mask[i], MVT::i32);
10791     if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))
10792       return DAG.getNode(
10793           X86ISD::VPERMILPV, DL, MVT::v8f32, V1,
10794           DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask));
10795
10796     if (Subtarget->hasAVX2())
10797       return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32,
10798                          DAG.getNode(ISD::BITCAST, DL, MVT::v8f32,
10799                                      DAG.getNode(ISD::BUILD_VECTOR, DL,
10800                                                  MVT::v8i32, VPermMask)),
10801                          V1);
10802
10803     // Otherwise, fall back.
10804     return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
10805                                                    DAG);
10806   }
10807
10808   // Try to simplify this by merging 128-bit lanes to enable a lane-based
10809   // shuffle.
10810   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
10811           DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
10812     return Result;
10813
10814   // If we have AVX2 then we always want to lower with a blend because at v8 we
10815   // can fully permute the elements.
10816   if (Subtarget->hasAVX2())
10817     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
10818                                                       Mask, DAG);
10819
10820   // Otherwise fall back on generic lowering.
10821   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG);
10822 }
10823
10824 /// \brief Handle lowering of 8-lane 32-bit integer shuffles.
10825 ///
10826 /// This routine is only called when we have AVX2 and thus a reasonable
10827 /// instruction set for v8i32 shuffling..
10828 static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10829                                        const X86Subtarget *Subtarget,
10830                                        SelectionDAG &DAG) {
10831   SDLoc DL(Op);
10832   assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
10833   assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
10834   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10835   ArrayRef<int> Mask = SVOp->getMask();
10836   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
10837   assert(Subtarget->hasAVX2() && "We can only lower v8i32 with AVX2!");
10838
10839   // Whenever we can lower this as a zext, that instruction is strictly faster
10840   // than any alternative. It also allows us to fold memory operands into the
10841   // shuffle in many cases.
10842   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2,
10843                                                          Mask, Subtarget, DAG))
10844     return ZExt;
10845
10846   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
10847                                                 Subtarget, DAG))
10848     return Blend;
10849
10850   // Check for being able to broadcast a single element.
10851   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8i32, DL, V1,
10852                                                         Mask, Subtarget, DAG))
10853     return Broadcast;
10854
10855   // If the shuffle mask is repeated in each 128-bit lane we can use more
10856   // efficient instructions that mirror the shuffles across the two 128-bit
10857   // lanes.
10858   SmallVector<int, 4> RepeatedMask;
10859   if (is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask)) {
10860     assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
10861     if (isSingleInputShuffleMask(Mask))
10862       return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
10863                          getV4X86ShuffleImm8ForMask(RepeatedMask, DAG));
10864
10865     // Use dedicated unpack instructions for masks that match their pattern.
10866     if (isShuffleEquivalent(V1, V2, Mask, 0, 8, 1, 9, 4, 12, 5, 13))
10867       return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i32, V1, V2);
10868     if (isShuffleEquivalent(V1, V2, Mask, 2, 10, 3, 11, 6, 14, 7, 15))
10869       return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i32, V1, V2);
10870   }
10871
10872   // If the shuffle patterns aren't repeated but it is a single input, directly
10873   // generate a cross-lane VPERMD instruction.
10874   if (isSingleInputShuffleMask(Mask)) {
10875     SDValue VPermMask[8];
10876     for (int i = 0; i < 8; ++i)
10877       VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32)
10878                                  : DAG.getConstant(Mask[i], MVT::i32);
10879     return DAG.getNode(
10880         X86ISD::VPERMV, DL, MVT::v8i32,
10881         DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask), V1);
10882   }
10883
10884   // Try to use bit shift instructions.
10885   if (SDValue Shift = lowerVectorShuffleAsBitShift(
10886           DL, MVT::v8i32, V1, V2, Mask, DAG))
10887     return Shift;
10888
10889   // Try to use byte shift instructions.
10890   if (SDValue Shift = lowerVectorShuffleAsByteShift(
10891           DL, MVT::v8i32, V1, V2, Mask, DAG))
10892     return Shift;
10893
10894   // Try to simplify this by merging 128-bit lanes to enable a lane-based
10895   // shuffle.
10896   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
10897           DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
10898     return Result;
10899
10900   // Otherwise fall back on generic blend lowering.
10901   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2,
10902                                                     Mask, DAG);
10903 }
10904
10905 /// \brief Handle lowering of 16-lane 16-bit integer shuffles.
10906 ///
10907 /// This routine is only called when we have AVX2 and thus a reasonable
10908 /// instruction set for v16i16 shuffling..
10909 static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10910                                         const X86Subtarget *Subtarget,
10911                                         SelectionDAG &DAG) {
10912   SDLoc DL(Op);
10913   assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
10914   assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
10915   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10916   ArrayRef<int> Mask = SVOp->getMask();
10917   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
10918   assert(Subtarget->hasAVX2() && "We can only lower v16i16 with AVX2!");
10919
10920   // Whenever we can lower this as a zext, that instruction is strictly faster
10921   // than any alternative. It also allows us to fold memory operands into the
10922   // shuffle in many cases.
10923   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v16i16, V1, V2,
10924                                                          Mask, Subtarget, DAG))
10925     return ZExt;
10926
10927   // Check for being able to broadcast a single element.
10928   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v16i16, DL, V1,
10929                                                         Mask, Subtarget, DAG))
10930     return Broadcast;
10931
10932   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
10933                                                 Subtarget, DAG))
10934     return Blend;
10935
10936   // Use dedicated unpack instructions for masks that match their pattern.
10937   if (isShuffleEquivalent(V1, V2, Mask,
10938                           // First 128-bit lane:
10939                           0, 16, 1, 17, 2, 18, 3, 19,
10940                           // Second 128-bit lane:
10941                           8, 24, 9, 25, 10, 26, 11, 27))
10942     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i16, V1, V2);
10943   if (isShuffleEquivalent(V1, V2, Mask,
10944                           // First 128-bit lane:
10945                           4, 20, 5, 21, 6, 22, 7, 23,
10946                           // Second 128-bit lane:
10947                           12, 28, 13, 29, 14, 30, 15, 31))
10948     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i16, V1, V2);
10949
10950   if (isSingleInputShuffleMask(Mask)) {
10951     // There are no generalized cross-lane shuffle operations available on i16
10952     // element types.
10953     if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
10954       return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
10955                                                      Mask, DAG);
10956
10957     SDValue PSHUFBMask[32];
10958     for (int i = 0; i < 16; ++i) {
10959       if (Mask[i] == -1) {
10960         PSHUFBMask[2 * i] = PSHUFBMask[2 * i + 1] = DAG.getUNDEF(MVT::i8);
10961         continue;
10962       }
10963
10964       int M = i < 8 ? Mask[i] : Mask[i] - 8;
10965       assert(M >= 0 && M < 8 && "Invalid single-input mask!");
10966       PSHUFBMask[2 * i] = DAG.getConstant(2 * M, MVT::i8);
10967       PSHUFBMask[2 * i + 1] = DAG.getConstant(2 * M + 1, MVT::i8);
10968     }
10969     return DAG.getNode(
10970         ISD::BITCAST, DL, MVT::v16i16,
10971         DAG.getNode(
10972             X86ISD::PSHUFB, DL, MVT::v32i8,
10973             DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V1),
10974             DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask)));
10975   }
10976
10977   // Try to use bit shift instructions.
10978   if (SDValue Shift = lowerVectorShuffleAsBitShift(
10979           DL, MVT::v16i16, V1, V2, Mask, DAG))
10980     return Shift;
10981
10982   // Try to use byte shift instructions.
10983   if (SDValue Shift = lowerVectorShuffleAsByteShift(
10984           DL, MVT::v16i16, V1, V2, Mask, DAG))
10985     return Shift;
10986
10987   // Try to simplify this by merging 128-bit lanes to enable a lane-based
10988   // shuffle.
10989   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
10990           DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
10991     return Result;
10992
10993   // Otherwise fall back on generic lowering.
10994   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG);
10995 }
10996
10997 /// \brief Handle lowering of 32-lane 8-bit integer shuffles.
10998 ///
10999 /// This routine is only called when we have AVX2 and thus a reasonable
11000 /// instruction set for v32i8 shuffling..
11001 static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
11002                                        const X86Subtarget *Subtarget,
11003                                        SelectionDAG &DAG) {
11004   SDLoc DL(Op);
11005   assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
11006   assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
11007   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
11008   ArrayRef<int> Mask = SVOp->getMask();
11009   assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
11010   assert(Subtarget->hasAVX2() && "We can only lower v32i8 with AVX2!");
11011
11012   // Whenever we can lower this as a zext, that instruction is strictly faster
11013   // than any alternative. It also allows us to fold memory operands into the
11014   // shuffle in many cases.
11015   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2,
11016                                                          Mask, Subtarget, DAG))
11017     return ZExt;
11018
11019   // Check for being able to broadcast a single element.
11020   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v32i8, DL, V1,
11021                                                         Mask, Subtarget, DAG))
11022     return Broadcast;
11023
11024   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
11025                                                 Subtarget, DAG))
11026     return Blend;
11027
11028   // Use dedicated unpack instructions for masks that match their pattern.
11029   // Note that these are repeated 128-bit lane unpacks, not unpacks across all
11030   // 256-bit lanes.
11031   if (isShuffleEquivalent(
11032           V1, V2, Mask,
11033           // First 128-bit lane:
11034           0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39,
11035           // Second 128-bit lane:
11036           16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55))
11037     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v32i8, V1, V2);
11038   if (isShuffleEquivalent(
11039           V1, V2, Mask,
11040           // First 128-bit lane:
11041           8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47,
11042           // Second 128-bit lane:
11043           24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63))
11044     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v32i8, V1, V2);
11045
11046   if (isSingleInputShuffleMask(Mask)) {
11047     // There are no generalized cross-lane shuffle operations available on i8
11048     // element types.
11049     if (is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
11050       return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2,
11051                                                      Mask, DAG);
11052
11053     SDValue PSHUFBMask[32];
11054     for (int i = 0; i < 32; ++i)
11055       PSHUFBMask[i] =
11056           Mask[i] < 0
11057               ? DAG.getUNDEF(MVT::i8)
11058               : DAG.getConstant(Mask[i] < 16 ? Mask[i] : Mask[i] - 16, MVT::i8);
11059
11060     return DAG.getNode(
11061         X86ISD::PSHUFB, DL, MVT::v32i8, V1,
11062         DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask));
11063   }
11064
11065   // Try to use bit shift instructions.
11066   if (SDValue Shift = lowerVectorShuffleAsBitShift(
11067           DL, MVT::v32i8, V1, V2, Mask, DAG))
11068     return Shift;
11069
11070   // Try to use byte shift instructions.
11071   if (SDValue Shift = lowerVectorShuffleAsByteShift(
11072           DL, MVT::v32i8, V1, V2, Mask, DAG))
11073     return Shift;
11074
11075   // Try to simplify this by merging 128-bit lanes to enable a lane-based
11076   // shuffle.
11077   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
11078           DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
11079     return Result;
11080
11081   // Otherwise fall back on generic lowering.
11082   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG);
11083 }
11084
11085 /// \brief High-level routine to lower various 256-bit x86 vector shuffles.
11086 ///
11087 /// This routine either breaks down the specific type of a 256-bit x86 vector
11088 /// shuffle or splits it into two 128-bit shuffles and fuses the results back
11089 /// together based on the available instructions.
11090 static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
11091                                         MVT VT, const X86Subtarget *Subtarget,
11092                                         SelectionDAG &DAG) {
11093   SDLoc DL(Op);
11094   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
11095   ArrayRef<int> Mask = SVOp->getMask();
11096
11097   // There is a really nice hard cut-over between AVX1 and AVX2 that means we can
11098   // check for those subtargets here and avoid much of the subtarget querying in
11099   // the per-vector-type lowering routines. With AVX1 we have essentially *zero*
11100   // ability to manipulate a 256-bit vector with integer types. Since we'll use
11101   // floating point types there eventually, just immediately cast everything to
11102   // a float and operate entirely in that domain.
11103   if (VT.isInteger() && !Subtarget->hasAVX2()) {
11104     int ElementBits = VT.getScalarSizeInBits();
11105     if (ElementBits < 32)
11106       // No floating point type available, decompose into 128-bit vectors.
11107       return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
11108
11109     MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
11110                                 VT.getVectorNumElements());
11111     V1 = DAG.getNode(ISD::BITCAST, DL, FpVT, V1);
11112     V2 = DAG.getNode(ISD::BITCAST, DL, FpVT, V2);
11113     return DAG.getNode(ISD::BITCAST, DL, VT,
11114                        DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
11115   }
11116
11117   switch (VT.SimpleTy) {
11118   case MVT::v4f64:
11119     return lowerV4F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
11120   case MVT::v4i64:
11121     return lowerV4I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
11122   case MVT::v8f32:
11123     return lowerV8F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
11124   case MVT::v8i32:
11125     return lowerV8I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
11126   case MVT::v16i16:
11127     return lowerV16I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
11128   case MVT::v32i8:
11129     return lowerV32I8VectorShuffle(Op, V1, V2, Subtarget, DAG);
11130
11131   default:
11132     llvm_unreachable("Not a valid 256-bit x86 vector type!");
11133   }
11134 }
11135
11136 /// \brief Handle lowering of 8-lane 64-bit floating point shuffles.
11137 static SDValue lowerV8F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
11138                                        const X86Subtarget *Subtarget,
11139                                        SelectionDAG &DAG) {
11140   SDLoc DL(Op);
11141   assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
11142   assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
11143   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
11144   ArrayRef<int> Mask = SVOp->getMask();
11145   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
11146
11147   // X86 has dedicated unpack instructions that can handle specific blend
11148   // operations: UNPCKH and UNPCKL.
11149   if (isShuffleEquivalent(V1, V2, Mask, 0, 8, 2, 10, 4, 12, 6, 14))
11150     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f64, V1, V2);
11151   if (isShuffleEquivalent(V1, V2, Mask, 1, 9, 3, 11, 5, 13, 7, 15))
11152     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f64, V1, V2);
11153
11154   // FIXME: Implement direct support for this type!
11155   return splitAndLowerVectorShuffle(DL, MVT::v8f64, V1, V2, Mask, DAG);
11156 }
11157
11158 /// \brief Handle lowering of 16-lane 32-bit floating point shuffles.
11159 static SDValue lowerV16F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
11160                                        const X86Subtarget *Subtarget,
11161                                        SelectionDAG &DAG) {
11162   SDLoc DL(Op);
11163   assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
11164   assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
11165   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
11166   ArrayRef<int> Mask = SVOp->getMask();
11167   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
11168
11169   // Use dedicated unpack instructions for masks that match their pattern.
11170   if (isShuffleEquivalent(V1, V2, Mask,
11171                           0, 16, 1, 17, 4, 20, 5, 21,
11172                           8, 24, 9, 25, 12, 28, 13, 29))
11173     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16f32, V1, V2);
11174   if (isShuffleEquivalent(V1, V2, Mask,
11175                           2, 18, 3, 19, 6, 22, 7, 23,
11176                           10, 26, 11, 27, 14, 30, 15, 31))
11177     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16f32, V1, V2);
11178
11179   // FIXME: Implement direct support for this type!
11180   return splitAndLowerVectorShuffle(DL, MVT::v16f32, V1, V2, Mask, DAG);
11181 }
11182
11183 /// \brief Handle lowering of 8-lane 64-bit integer shuffles.
11184 static SDValue lowerV8I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
11185                                        const X86Subtarget *Subtarget,
11186                                        SelectionDAG &DAG) {
11187   SDLoc DL(Op);
11188   assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
11189   assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
11190   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
11191   ArrayRef<int> Mask = SVOp->getMask();
11192   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
11193
11194   // X86 has dedicated unpack instructions that can handle specific blend
11195   // operations: UNPCKH and UNPCKL.
11196   if (isShuffleEquivalent(V1, V2, Mask, 0, 8, 2, 10, 4, 12, 6, 14))
11197     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i64, V1, V2);
11198   if (isShuffleEquivalent(V1, V2, Mask, 1, 9, 3, 11, 5, 13, 7, 15))
11199     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i64, V1, V2);
11200
11201   // FIXME: Implement direct support for this type!
11202   return splitAndLowerVectorShuffle(DL, MVT::v8i64, V1, V2, Mask, DAG);
11203 }
11204
11205 /// \brief Handle lowering of 16-lane 32-bit integer shuffles.
11206 static SDValue lowerV16I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
11207                                        const X86Subtarget *Subtarget,
11208                                        SelectionDAG &DAG) {
11209   SDLoc DL(Op);
11210   assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
11211   assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
11212   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
11213   ArrayRef<int> Mask = SVOp->getMask();
11214   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
11215
11216   // Use dedicated unpack instructions for masks that match their pattern.
11217   if (isShuffleEquivalent(V1, V2, Mask,
11218                           0, 16, 1, 17, 4, 20, 5, 21,
11219                           8, 24, 9, 25, 12, 28, 13, 29))
11220     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i32, V1, V2);
11221   if (isShuffleEquivalent(V1, V2, Mask,
11222                           2, 18, 3, 19, 6, 22, 7, 23,
11223                           10, 26, 11, 27, 14, 30, 15, 31))
11224     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i32, V1, V2);
11225
11226   // FIXME: Implement direct support for this type!
11227   return splitAndLowerVectorShuffle(DL, MVT::v16i32, V1, V2, Mask, DAG);
11228 }
11229
11230 /// \brief Handle lowering of 32-lane 16-bit integer shuffles.
11231 static SDValue lowerV32I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
11232                                         const X86Subtarget *Subtarget,
11233                                         SelectionDAG &DAG) {
11234   SDLoc DL(Op);
11235   assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
11236   assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
11237   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
11238   ArrayRef<int> Mask = SVOp->getMask();
11239   assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
11240   assert(Subtarget->hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
11241
11242   // FIXME: Implement direct support for this type!
11243   return splitAndLowerVectorShuffle(DL, MVT::v32i16, V1, V2, Mask, DAG);
11244 }
11245
11246 /// \brief Handle lowering of 64-lane 8-bit integer shuffles.
11247 static SDValue lowerV64I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
11248                                        const X86Subtarget *Subtarget,
11249                                        SelectionDAG &DAG) {
11250   SDLoc DL(Op);
11251   assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
11252   assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
11253   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
11254   ArrayRef<int> Mask = SVOp->getMask();
11255   assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
11256   assert(Subtarget->hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
11257
11258   // FIXME: Implement direct support for this type!
11259   return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
11260 }
11261
11262 /// \brief High-level routine to lower various 512-bit x86 vector shuffles.
11263 ///
11264 /// This routine either breaks down the specific type of a 512-bit x86 vector
11265 /// shuffle or splits it into two 256-bit shuffles and fuses the results back
11266 /// together based on the available instructions.
11267 static SDValue lower512BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
11268                                         MVT VT, const X86Subtarget *Subtarget,
11269                                         SelectionDAG &DAG) {
11270   SDLoc DL(Op);
11271   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
11272   ArrayRef<int> Mask = SVOp->getMask();
11273   assert(Subtarget->hasAVX512() &&
11274          "Cannot lower 512-bit vectors w/ basic ISA!");
11275
11276   // Check for being able to broadcast a single element.
11277   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(VT.SimpleTy, DL, V1,
11278                                                         Mask, Subtarget, DAG))
11279     return Broadcast;
11280
11281   // Dispatch to each element type for lowering. If we don't have supprot for
11282   // specific element type shuffles at 512 bits, immediately split them and
11283   // lower them. Each lowering routine of a given type is allowed to assume that
11284   // the requisite ISA extensions for that element type are available.
11285   switch (VT.SimpleTy) {
11286   case MVT::v8f64:
11287     return lowerV8F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
11288   case MVT::v16f32:
11289     return lowerV16F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
11290   case MVT::v8i64:
11291     return lowerV8I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
11292   case MVT::v16i32:
11293     return lowerV16I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
11294   case MVT::v32i16:
11295     if (Subtarget->hasBWI())
11296       return lowerV32I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
11297     break;
11298   case MVT::v64i8:
11299     if (Subtarget->hasBWI())
11300       return lowerV64I8VectorShuffle(Op, V1, V2, Subtarget, DAG);
11301     break;
11302
11303   default:
11304     llvm_unreachable("Not a valid 512-bit x86 vector type!");
11305   }
11306
11307   // Otherwise fall back on splitting.
11308   return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
11309 }
11310
11311 /// \brief Top-level lowering for x86 vector shuffles.
11312 ///
11313 /// This handles decomposition, canonicalization, and lowering of all x86
11314 /// vector shuffles. Most of the specific lowering strategies are encapsulated
11315 /// above in helper routines. The canonicalization attempts to widen shuffles
11316 /// to involve fewer lanes of wider elements, consolidate symmetric patterns
11317 /// s.t. only one of the two inputs needs to be tested, etc.
11318 static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
11319                                   SelectionDAG &DAG) {
11320   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
11321   ArrayRef<int> Mask = SVOp->getMask();
11322   SDValue V1 = Op.getOperand(0);
11323   SDValue V2 = Op.getOperand(1);
11324   MVT VT = Op.getSimpleValueType();
11325   int NumElements = VT.getVectorNumElements();
11326   SDLoc dl(Op);
11327
11328   assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles");
11329
11330   bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
11331   bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
11332   if (V1IsUndef && V2IsUndef)
11333     return DAG.getUNDEF(VT);
11334
11335   // When we create a shuffle node we put the UNDEF node to second operand,
11336   // but in some cases the first operand may be transformed to UNDEF.
11337   // In this case we should just commute the node.
11338   if (V1IsUndef)
11339     return DAG.getCommutedVectorShuffle(*SVOp);
11340
11341   // Check for non-undef masks pointing at an undef vector and make the masks
11342   // undef as well. This makes it easier to match the shuffle based solely on
11343   // the mask.
11344   if (V2IsUndef)
11345     for (int M : Mask)
11346       if (M >= NumElements) {
11347         SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
11348         for (int &M : NewMask)
11349           if (M >= NumElements)
11350             M = -1;
11351         return DAG.getVectorShuffle(VT, dl, V1, V2, NewMask);
11352       }
11353
11354   // We actually see shuffles that are entirely re-arrangements of a set of
11355   // zero inputs. This mostly happens while decomposing complex shuffles into
11356   // simple ones. Directly lower these as a buildvector of zeros.
11357   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
11358   if (Zeroable.all())
11359     return getZeroVector(VT, Subtarget, DAG, dl);
11360
11361   // Try to collapse shuffles into using a vector type with fewer elements but
11362   // wider element types. We cap this to not form integers or floating point
11363   // elements wider than 64 bits, but it might be interesting to form i128
11364   // integers to handle flipping the low and high halves of AVX 256-bit vectors.
11365   SmallVector<int, 16> WidenedMask;
11366   if (VT.getScalarSizeInBits() < 64 &&
11367       canWidenShuffleElements(Mask, WidenedMask)) {
11368     MVT NewEltVT = VT.isFloatingPoint()
11369                        ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
11370                        : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
11371     MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
11372     // Make sure that the new vector type is legal. For example, v2f64 isn't
11373     // legal on SSE1.
11374     if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
11375       V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, V1);
11376       V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, V2);
11377       return DAG.getNode(ISD::BITCAST, dl, VT,
11378                          DAG.getVectorShuffle(NewVT, dl, V1, V2, WidenedMask));
11379     }
11380   }
11381
11382   int NumV1Elements = 0, NumUndefElements = 0, NumV2Elements = 0;
11383   for (int M : SVOp->getMask())
11384     if (M < 0)
11385       ++NumUndefElements;
11386     else if (M < NumElements)
11387       ++NumV1Elements;
11388     else
11389       ++NumV2Elements;
11390
11391   // Commute the shuffle as needed such that more elements come from V1 than
11392   // V2. This allows us to match the shuffle pattern strictly on how many
11393   // elements come from V1 without handling the symmetric cases.
11394   if (NumV2Elements > NumV1Elements)
11395     return DAG.getCommutedVectorShuffle(*SVOp);
11396
11397   // When the number of V1 and V2 elements are the same, try to minimize the
11398   // number of uses of V2 in the low half of the vector. When that is tied,
11399   // ensure that the sum of indices for V1 is equal to or lower than the sum
11400   // indices for V2. When those are equal, try to ensure that the number of odd
11401   // indices for V1 is lower than the number of odd indices for V2.
11402   if (NumV1Elements == NumV2Elements) {
11403     int LowV1Elements = 0, LowV2Elements = 0;
11404     for (int M : SVOp->getMask().slice(0, NumElements / 2))
11405       if (M >= NumElements)
11406         ++LowV2Elements;
11407       else if (M >= 0)
11408         ++LowV1Elements;
11409     if (LowV2Elements > LowV1Elements) {
11410       return DAG.getCommutedVectorShuffle(*SVOp);
11411     } else if (LowV2Elements == LowV1Elements) {
11412       int SumV1Indices = 0, SumV2Indices = 0;
11413       for (int i = 0, Size = SVOp->getMask().size(); i < Size; ++i)
11414         if (SVOp->getMask()[i] >= NumElements)
11415           SumV2Indices += i;
11416         else if (SVOp->getMask()[i] >= 0)
11417           SumV1Indices += i;
11418       if (SumV2Indices < SumV1Indices) {
11419         return DAG.getCommutedVectorShuffle(*SVOp);
11420       } else if (SumV2Indices == SumV1Indices) {
11421         int NumV1OddIndices = 0, NumV2OddIndices = 0;
11422         for (int i = 0, Size = SVOp->getMask().size(); i < Size; ++i)
11423           if (SVOp->getMask()[i] >= NumElements)
11424             NumV2OddIndices += i % 2;
11425           else if (SVOp->getMask()[i] >= 0)
11426             NumV1OddIndices += i % 2;
11427         if (NumV2OddIndices < NumV1OddIndices)
11428           return DAG.getCommutedVectorShuffle(*SVOp);
11429       }
11430     }
11431   }
11432
11433   // For each vector width, delegate to a specialized lowering routine.
11434   if (VT.getSizeInBits() == 128)
11435     return lower128BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
11436
11437   if (VT.getSizeInBits() == 256)
11438     return lower256BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
11439
11440   // Force AVX-512 vectors to be scalarized for now.
11441   // FIXME: Implement AVX-512 support!
11442   if (VT.getSizeInBits() == 512)
11443     return lower512BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
11444
11445   llvm_unreachable("Unimplemented!");
11446 }
11447
11448
11449 //===----------------------------------------------------------------------===//
11450 // Legacy vector shuffle lowering
11451 //
11452 // This code is the legacy code handling vector shuffles until the above
11453 // replaces its functionality and performance.
11454 //===----------------------------------------------------------------------===//
11455
11456 static bool isBlendMask(ArrayRef<int> MaskVals, MVT VT, bool hasSSE41,
11457                         bool hasInt256, unsigned *MaskOut = nullptr) {
11458   MVT EltVT = VT.getVectorElementType();
11459
11460   // There is no blend with immediate in AVX-512.
11461   if (VT.is512BitVector())
11462     return false;
11463
11464   if (!hasSSE41 || EltVT == MVT::i8)
11465     return false;
11466   if (!hasInt256 && VT == MVT::v16i16)
11467     return false;
11468
11469   unsigned MaskValue = 0;
11470   unsigned NumElems = VT.getVectorNumElements();
11471   // There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
11472   unsigned NumLanes = (NumElems - 1) / 8 + 1;
11473   unsigned NumElemsInLane = NumElems / NumLanes;
11474
11475   // Blend for v16i16 should be symmetric for both lanes.
11476   for (unsigned i = 0; i < NumElemsInLane; ++i) {
11477
11478     int SndLaneEltIdx = (NumLanes == 2) ? MaskVals[i + NumElemsInLane] : -1;
11479     int EltIdx = MaskVals[i];
11480
11481     if ((EltIdx < 0 || EltIdx == (int)i) &&
11482         (SndLaneEltIdx < 0 || SndLaneEltIdx == (int)(i + NumElemsInLane)))
11483       continue;
11484
11485     if (((unsigned)EltIdx == (i + NumElems)) &&
11486         (SndLaneEltIdx < 0 ||
11487          (unsigned)SndLaneEltIdx == i + NumElems + NumElemsInLane))
11488       MaskValue |= (1 << i);
11489     else
11490       return false;
11491   }
11492
11493   if (MaskOut)
11494     *MaskOut = MaskValue;
11495   return true;
11496 }
11497
11498 // Try to lower a shuffle node into a simple blend instruction.
11499 // This function assumes isBlendMask returns true for this
11500 // SuffleVectorSDNode
11501 static SDValue LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp,
11502                                           unsigned MaskValue,
11503                                           const X86Subtarget *Subtarget,
11504                                           SelectionDAG &DAG) {
11505   MVT VT = SVOp->getSimpleValueType(0);
11506   MVT EltVT = VT.getVectorElementType();
11507   assert(isBlendMask(SVOp->getMask(), VT, Subtarget->hasSSE41(),
11508                      Subtarget->hasInt256() && "Trying to lower a "
11509                                                "VECTOR_SHUFFLE to a Blend but "
11510                                                "with the wrong mask"));
11511   SDValue V1 = SVOp->getOperand(0);
11512   SDValue V2 = SVOp->getOperand(1);
11513   SDLoc dl(SVOp);
11514   unsigned NumElems = VT.getVectorNumElements();
11515
11516   // Convert i32 vectors to floating point if it is not AVX2.
11517   // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors.
11518   MVT BlendVT = VT;
11519   if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) {
11520     BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()),
11521                                NumElems);
11522     V1 = DAG.getNode(ISD::BITCAST, dl, VT, V1);
11523     V2 = DAG.getNode(ISD::BITCAST, dl, VT, V2);
11524   }
11525
11526   SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, V1, V2,
11527                             DAG.getConstant(MaskValue, MVT::i32));
11528   return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
11529 }
11530
11531 /// In vector type \p VT, return true if the element at index \p InputIdx
11532 /// falls on a different 128-bit lane than \p OutputIdx.
11533 static bool ShuffleCrosses128bitLane(MVT VT, unsigned InputIdx,
11534                                      unsigned OutputIdx) {
11535   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
11536   return InputIdx * EltSize / 128 != OutputIdx * EltSize / 128;
11537 }
11538
11539 /// Generate a PSHUFB if possible.  Selects elements from \p V1 according to
11540 /// \p MaskVals.  MaskVals[OutputIdx] = InputIdx specifies that we want to
11541 /// shuffle the element at InputIdx in V1 to OutputIdx in the result.  If \p
11542 /// MaskVals refers to elements outside of \p V1 or is undef (-1), insert a
11543 /// zero.
11544 static SDValue getPSHUFB(ArrayRef<int> MaskVals, SDValue V1, SDLoc &dl,
11545                          SelectionDAG &DAG) {
11546   MVT VT = V1.getSimpleValueType();
11547   assert(VT.is128BitVector() || VT.is256BitVector());
11548
11549   MVT EltVT = VT.getVectorElementType();
11550   unsigned EltSizeInBytes = EltVT.getSizeInBits() / 8;
11551   unsigned NumElts = VT.getVectorNumElements();
11552
11553   SmallVector<SDValue, 32> PshufbMask;
11554   for (unsigned OutputIdx = 0; OutputIdx < NumElts; ++OutputIdx) {
11555     int InputIdx = MaskVals[OutputIdx];
11556     unsigned InputByteIdx;
11557
11558     if (InputIdx < 0 || NumElts <= (unsigned)InputIdx)
11559       InputByteIdx = 0x80;
11560     else {
11561       // Cross lane is not allowed.
11562       if (ShuffleCrosses128bitLane(VT, InputIdx, OutputIdx))
11563         return SDValue();
11564       InputByteIdx = InputIdx * EltSizeInBytes;
11565       // Index is an byte offset within the 128-bit lane.
11566       InputByteIdx &= 0xf;
11567     }
11568
11569     for (unsigned j = 0; j < EltSizeInBytes; ++j) {
11570       PshufbMask.push_back(DAG.getConstant(InputByteIdx, MVT::i8));
11571       if (InputByteIdx != 0x80)
11572         ++InputByteIdx;
11573     }
11574   }
11575
11576   MVT ShufVT = MVT::getVectorVT(MVT::i8, PshufbMask.size());
11577   if (ShufVT != VT)
11578     V1 = DAG.getNode(ISD::BITCAST, dl, ShufVT, V1);
11579   return DAG.getNode(X86ISD::PSHUFB, dl, ShufVT, V1,
11580                      DAG.getNode(ISD::BUILD_VECTOR, dl, ShufVT, PshufbMask));
11581 }
11582
11583 // v8i16 shuffles - Prefer shuffles in the following order:
11584 // 1. [all]   pshuflw, pshufhw, optional move
11585 // 2. [ssse3] 1 x pshufb
11586 // 3. [ssse3] 2 x pshufb + 1 x por
11587 // 4. [all]   mov + pshuflw + pshufhw + N x (pextrw + pinsrw)
11588 static SDValue
11589 LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget,
11590                          SelectionDAG &DAG) {
11591   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
11592   SDValue V1 = SVOp->getOperand(0);
11593   SDValue V2 = SVOp->getOperand(1);
11594   SDLoc dl(SVOp);
11595   SmallVector<int, 8> MaskVals;
11596
11597   // Determine if more than 1 of the words in each of the low and high quadwords
11598   // of the result come from the same quadword of one of the two inputs.  Undef
11599   // mask values count as coming from any quadword, for better codegen.
11600   //
11601   // Lo/HiQuad[i] = j indicates how many words from the ith quad of the input
11602   // feeds this quad.  For i, 0 and 1 refer to V1, 2 and 3 refer to V2.
11603   unsigned LoQuad[] = { 0, 0, 0, 0 };
11604   unsigned HiQuad[] = { 0, 0, 0, 0 };
11605   // Indices of quads used.
11606   std::bitset<4> InputQuads;
11607   for (unsigned i = 0; i < 8; ++i) {
11608     unsigned *Quad = i < 4 ? LoQuad : HiQuad;
11609     int EltIdx = SVOp->getMaskElt(i);
11610     MaskVals.push_back(EltIdx);
11611     if (EltIdx < 0) {
11612       ++Quad[0];
11613       ++Quad[1];
11614       ++Quad[2];
11615       ++Quad[3];
11616       continue;
11617     }
11618     ++Quad[EltIdx / 4];
11619     InputQuads.set(EltIdx / 4);
11620   }
11621
11622   int BestLoQuad = -1;
11623   unsigned MaxQuad = 1;
11624   for (unsigned i = 0; i < 4; ++i) {
11625     if (LoQuad[i] > MaxQuad) {
11626       BestLoQuad = i;
11627       MaxQuad = LoQuad[i];
11628     }
11629   }
11630
11631   int BestHiQuad = -1;
11632   MaxQuad = 1;
11633   for (unsigned i = 0; i < 4; ++i) {
11634     if (HiQuad[i] > MaxQuad) {
11635       BestHiQuad = i;
11636       MaxQuad = HiQuad[i];
11637     }
11638   }
11639
11640   // For SSSE3, If all 8 words of the result come from only 1 quadword of each
11641   // of the two input vectors, shuffle them into one input vector so only a
11642   // single pshufb instruction is necessary. If there are more than 2 input
11643   // quads, disable the next transformation since it does not help SSSE3.
11644   bool V1Used = InputQuads[0] || InputQuads[1];
11645   bool V2Used = InputQuads[2] || InputQuads[3];
11646   if (Subtarget->hasSSSE3()) {
11647     if (InputQuads.count() == 2 && V1Used && V2Used) {
11648       BestLoQuad = InputQuads[0] ? 0 : 1;
11649       BestHiQuad = InputQuads[2] ? 2 : 3;
11650     }
11651     if (InputQuads.count() > 2) {
11652       BestLoQuad = -1;
11653       BestHiQuad = -1;
11654     }
11655   }
11656
11657   // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update
11658   // the shuffle mask.  If a quad is scored as -1, that means that it contains
11659   // words from all 4 input quadwords.
11660   SDValue NewV;
11661   if (BestLoQuad >= 0 || BestHiQuad >= 0) {
11662     int MaskV[] = {
11663       BestLoQuad < 0 ? 0 : BestLoQuad,
11664       BestHiQuad < 0 ? 1 : BestHiQuad
11665     };
11666     NewV = DAG.getVectorShuffle(MVT::v2i64, dl,
11667                   DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1),
11668                   DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2), &MaskV[0]);
11669     NewV = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, NewV);
11670
11671     // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the
11672     // source words for the shuffle, to aid later transformations.
11673     bool AllWordsInNewV = true;
11674     bool InOrder[2] = { true, true };
11675     for (unsigned i = 0; i != 8; ++i) {
11676       int idx = MaskVals[i];
11677       if (idx != (int)i)
11678         InOrder[i/4] = false;
11679       if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad)
11680         continue;
11681       AllWordsInNewV = false;
11682       break;
11683     }
11684
11685     bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV;
11686     if (AllWordsInNewV) {
11687       for (int i = 0; i != 8; ++i) {
11688         int idx = MaskVals[i];
11689         if (idx < 0)
11690           continue;
11691         idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4;
11692         if ((idx != i) && idx < 4)
11693           pshufhw = false;
11694         if ((idx != i) && idx > 3)
11695           pshuflw = false;
11696       }
11697       V1 = NewV;
11698       V2Used = false;
11699       BestLoQuad = 0;
11700       BestHiQuad = 1;
11701     }
11702
11703     // If we've eliminated the use of V2, and the new mask is a pshuflw or
11704     // pshufhw, that's as cheap as it gets.  Return the new shuffle.
11705     if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) {
11706       unsigned Opc = pshufhw ? X86ISD::PSHUFHW : X86ISD::PSHUFLW;
11707       unsigned TargetMask = 0;
11708       NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV,
11709                                   DAG.getUNDEF(MVT::v8i16), &MaskVals[0]);
11710       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
11711       TargetMask = pshufhw ? getShufflePSHUFHWImmediate(SVOp):
11712                              getShufflePSHUFLWImmediate(SVOp);
11713       V1 = NewV.getOperand(0);
11714       return getTargetShuffleNode(Opc, dl, MVT::v8i16, V1, TargetMask, DAG);
11715     }
11716   }
11717
11718   // Promote splats to a larger type which usually leads to more efficient code.
11719   // FIXME: Is this true if pshufb is available?
11720   if (SVOp->isSplat())
11721     return PromoteSplat(SVOp, DAG);
11722
11723   // If we have SSSE3, and all words of the result are from 1 input vector,
11724   // case 2 is generated, otherwise case 3 is generated.  If no SSSE3
11725   // is present, fall back to case 4.
11726   if (Subtarget->hasSSSE3()) {
11727     SmallVector<SDValue,16> pshufbMask;
11728
11729     // If we have elements from both input vectors, set the high bit of the
11730     // shuffle mask element to zero out elements that come from V2 in the V1
11731     // mask, and elements that come from V1 in the V2 mask, so that the two
11732     // results can be OR'd together.
11733     bool TwoInputs = V1Used && V2Used;
11734     V1 = getPSHUFB(MaskVals, V1, dl, DAG);
11735     if (!TwoInputs)
11736       return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
11737
11738     // Calculate the shuffle mask for the second input, shuffle it, and
11739     // OR it with the first shuffled input.
11740     CommuteVectorShuffleMask(MaskVals, 8);
11741     V2 = getPSHUFB(MaskVals, V2, dl, DAG);
11742     V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
11743     return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
11744   }
11745
11746   // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order,
11747   // and update MaskVals with new element order.
11748   std::bitset<8> InOrder;
11749   if (BestLoQuad >= 0) {
11750     int MaskV[] = { -1, -1, -1, -1, 4, 5, 6, 7 };
11751     for (int i = 0; i != 4; ++i) {
11752       int idx = MaskVals[i];
11753       if (idx < 0) {
11754         InOrder.set(i);
11755       } else if ((idx / 4) == BestLoQuad) {
11756         MaskV[i] = idx & 3;
11757         InOrder.set(i);
11758       }
11759     }
11760     NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
11761                                 &MaskV[0]);
11762
11763     if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSE2()) {
11764       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
11765       NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16,
11766                                   NewV.getOperand(0),
11767                                   getShufflePSHUFLWImmediate(SVOp), DAG);
11768     }
11769   }
11770
11771   // If BestHi >= 0, generate a pshufhw to put the high elements in order,
11772   // and update MaskVals with the new element order.
11773   if (BestHiQuad >= 0) {
11774     int MaskV[] = { 0, 1, 2, 3, -1, -1, -1, -1 };
11775     for (unsigned i = 4; i != 8; ++i) {
11776       int idx = MaskVals[i];
11777       if (idx < 0) {
11778         InOrder.set(i);
11779       } else if ((idx / 4) == BestHiQuad) {
11780         MaskV[i] = (idx & 3) + 4;
11781         InOrder.set(i);
11782       }
11783     }
11784     NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
11785                                 &MaskV[0]);
11786
11787     if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSE2()) {
11788       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
11789       NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16,
11790                                   NewV.getOperand(0),
11791                                   getShufflePSHUFHWImmediate(SVOp), DAG);
11792     }
11793   }
11794
11795   // In case BestHi & BestLo were both -1, which means each quadword has a word
11796   // from each of the four input quadwords, calculate the InOrder bitvector now
11797   // before falling through to the insert/extract cleanup.
11798   if (BestLoQuad == -1 && BestHiQuad == -1) {
11799     NewV = V1;
11800     for (int i = 0; i != 8; ++i)
11801       if (MaskVals[i] < 0 || MaskVals[i] == i)
11802         InOrder.set(i);
11803   }
11804
11805   // The other elements are put in the right place using pextrw and pinsrw.
11806   for (unsigned i = 0; i != 8; ++i) {
11807     if (InOrder[i])
11808       continue;
11809     int EltIdx = MaskVals[i];
11810     if (EltIdx < 0)
11811       continue;
11812     SDValue ExtOp = (EltIdx < 8) ?
11813       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1,
11814                   DAG.getIntPtrConstant(EltIdx)) :
11815       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2,
11816                   DAG.getIntPtrConstant(EltIdx - 8));
11817     NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp,
11818                        DAG.getIntPtrConstant(i));
11819   }
11820   return NewV;
11821 }
11822
11823 /// \brief v16i16 shuffles
11824 ///
11825 /// FIXME: We only support generation of a single pshufb currently.  We can
11826 /// generalize the other applicable cases from LowerVECTOR_SHUFFLEv8i16 as
11827 /// well (e.g 2 x pshufb + 1 x por).
11828 static SDValue
11829 LowerVECTOR_SHUFFLEv16i16(SDValue Op, SelectionDAG &DAG) {
11830   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
11831   SDValue V1 = SVOp->getOperand(0);
11832   SDValue V2 = SVOp->getOperand(1);
11833   SDLoc dl(SVOp);
11834
11835   if (V2.getOpcode() != ISD::UNDEF)
11836     return SDValue();
11837
11838   SmallVector<int, 16> MaskVals(SVOp->getMask().begin(), SVOp->getMask().end());
11839   return getPSHUFB(MaskVals, V1, dl, DAG);
11840 }
11841
11842 // v16i8 shuffles - Prefer shuffles in the following order:
11843 // 1. [ssse3] 1 x pshufb
11844 // 2. [ssse3] 2 x pshufb + 1 x por
11845 // 3. [all]   v8i16 shuffle + N x pextrw + rotate + pinsrw
11846 static SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
11847                                         const X86Subtarget* Subtarget,
11848                                         SelectionDAG &DAG) {
11849   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
11850   SDValue V1 = SVOp->getOperand(0);
11851   SDValue V2 = SVOp->getOperand(1);
11852   SDLoc dl(SVOp);
11853   ArrayRef<int> MaskVals = SVOp->getMask();
11854
11855   // Promote splats to a larger type which usually leads to more efficient code.
11856   // FIXME: Is this true if pshufb is available?
11857   if (SVOp->isSplat())
11858     return PromoteSplat(SVOp, DAG);
11859
11860   // If we have SSSE3, case 1 is generated when all result bytes come from
11861   // one of  the inputs.  Otherwise, case 2 is generated.  If no SSSE3 is
11862   // present, fall back to case 3.
11863
11864   // If SSSE3, use 1 pshufb instruction per vector with elements in the result.
11865   if (Subtarget->hasSSSE3()) {
11866     SmallVector<SDValue,16> pshufbMask;
11867
11868     // If all result elements are from one input vector, then only translate
11869     // undef mask values to 0x80 (zero out result) in the pshufb mask.
11870     //
11871     // Otherwise, we have elements from both input vectors, and must zero out
11872     // elements that come from V2 in the first mask, and V1 in the second mask
11873     // so that we can OR them together.
11874     for (unsigned i = 0; i != 16; ++i) {
11875       int EltIdx = MaskVals[i];
11876       if (EltIdx < 0 || EltIdx >= 16)
11877         EltIdx = 0x80;
11878       pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
11879     }
11880     V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
11881                      DAG.getNode(ISD::BUILD_VECTOR, dl,
11882                                  MVT::v16i8, pshufbMask));
11883
11884     // As PSHUFB will zero elements with negative indices, it's safe to ignore
11885     // the 2nd operand if it's undefined or zero.
11886     if (V2.getOpcode() == ISD::UNDEF ||
11887         ISD::isBuildVectorAllZeros(V2.getNode()))
11888       return V1;
11889
11890     // Calculate the shuffle mask for the second input, shuffle it, and
11891     // OR it with the first shuffled input.
11892     pshufbMask.clear();
11893     for (unsigned i = 0; i != 16; ++i) {
11894       int EltIdx = MaskVals[i];
11895       EltIdx = (EltIdx < 16) ? 0x80 : EltIdx - 16;
11896       pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
11897     }
11898     V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
11899                      DAG.getNode(ISD::BUILD_VECTOR, dl,
11900                                  MVT::v16i8, pshufbMask));
11901     return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
11902   }
11903
11904   // No SSSE3 - Calculate in place words and then fix all out of place words
11905   // With 0-16 extracts & inserts.  Worst case is 16 bytes out of order from
11906   // the 16 different words that comprise the two doublequadword input vectors.
11907   V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
11908   V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2);
11909   SDValue NewV = V1;
11910   for (int i = 0; i != 8; ++i) {
11911     int Elt0 = MaskVals[i*2];
11912     int Elt1 = MaskVals[i*2+1];
11913
11914     // This word of the result is all undef, skip it.
11915     if (Elt0 < 0 && Elt1 < 0)
11916       continue;
11917
11918     // This word of the result is already in the correct place, skip it.
11919     if ((Elt0 == i*2) && (Elt1 == i*2+1))
11920       continue;
11921
11922     SDValue Elt0Src = Elt0 < 16 ? V1 : V2;
11923     SDValue Elt1Src = Elt1 < 16 ? V1 : V2;
11924     SDValue InsElt;
11925
11926     // If Elt0 and Elt1 are defined, are consecutive, and can be load
11927     // using a single extract together, load it and store it.
11928     if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) {
11929       InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
11930                            DAG.getIntPtrConstant(Elt1 / 2));
11931       NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
11932                         DAG.getIntPtrConstant(i));
11933       continue;
11934     }
11935
11936     // If Elt1 is defined, extract it from the appropriate source.  If the
11937     // source byte is not also odd, shift the extracted word left 8 bits
11938     // otherwise clear the bottom 8 bits if we need to do an or.
11939     if (Elt1 >= 0) {
11940       InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
11941                            DAG.getIntPtrConstant(Elt1 / 2));
11942       if ((Elt1 & 1) == 0)
11943         InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt,
11944                              DAG.getConstant(8,
11945                                   TLI.getShiftAmountTy(InsElt.getValueType())));
11946       else if (Elt0 >= 0)
11947         InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt,
11948                              DAG.getConstant(0xFF00, MVT::i16));
11949     }
11950     // If Elt0 is defined, extract it from the appropriate source.  If the
11951     // source byte is not also even, shift the extracted word right 8 bits. If
11952     // Elt1 was also defined, OR the extracted values together before
11953     // inserting them in the result.
11954     if (Elt0 >= 0) {
11955       SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
11956                                     Elt0Src, DAG.getIntPtrConstant(Elt0 / 2));
11957       if ((Elt0 & 1) != 0)
11958         InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0,
11959                               DAG.getConstant(8,
11960                                  TLI.getShiftAmountTy(InsElt0.getValueType())));
11961       else if (Elt1 >= 0)
11962         InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0,
11963                              DAG.getConstant(0x00FF, MVT::i16));
11964       InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0)
11965                          : InsElt0;
11966     }
11967     NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
11968                        DAG.getIntPtrConstant(i));
11969   }
11970   return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, NewV);
11971 }
11972
11973 // v32i8 shuffles - Translate to VPSHUFB if possible.
11974 static
11975 SDValue LowerVECTOR_SHUFFLEv32i8(ShuffleVectorSDNode *SVOp,
11976                                  const X86Subtarget *Subtarget,
11977                                  SelectionDAG &DAG) {
11978   MVT VT = SVOp->getSimpleValueType(0);
11979   SDValue V1 = SVOp->getOperand(0);
11980   SDValue V2 = SVOp->getOperand(1);
11981   SDLoc dl(SVOp);
11982   SmallVector<int, 32> MaskVals(SVOp->getMask().begin(), SVOp->getMask().end());
11983
11984   bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
11985   bool V1IsAllZero = ISD::isBuildVectorAllZeros(V1.getNode());
11986   bool V2IsAllZero = ISD::isBuildVectorAllZeros(V2.getNode());
11987
11988   // VPSHUFB may be generated if
11989   // (1) one of input vector is undefined or zeroinitializer.
11990   // The mask value 0x80 puts 0 in the corresponding slot of the vector.
11991   // And (2) the mask indexes don't cross the 128-bit lane.
11992   if (VT != MVT::v32i8 || !Subtarget->hasInt256() ||
11993       (!V2IsUndef && !V2IsAllZero && !V1IsAllZero))
11994     return SDValue();
11995
11996   if (V1IsAllZero && !V2IsAllZero) {
11997     CommuteVectorShuffleMask(MaskVals, 32);
11998     V1 = V2;
11999   }
12000   return getPSHUFB(MaskVals, V1, dl, DAG);
12001 }
12002
12003 /// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide
12004 /// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be
12005 /// done when every pair / quad of shuffle mask elements point to elements in
12006 /// the right sequence. e.g.
12007 /// vector_shuffle X, Y, <2, 3, | 10, 11, | 0, 1, | 14, 15>
12008 static
12009 SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
12010                                  SelectionDAG &DAG) {
12011   MVT VT = SVOp->getSimpleValueType(0);
12012   SDLoc dl(SVOp);
12013   unsigned NumElems = VT.getVectorNumElements();
12014   MVT NewVT;
12015   unsigned Scale;
12016   switch (VT.SimpleTy) {
12017   default: llvm_unreachable("Unexpected!");
12018   case MVT::v2i64:
12019   case MVT::v2f64:
12020            return SDValue(SVOp, 0);
12021   case MVT::v4f32:  NewVT = MVT::v2f64; Scale = 2; break;
12022   case MVT::v4i32:  NewVT = MVT::v2i64; Scale = 2; break;
12023   case MVT::v8i16:  NewVT = MVT::v4i32; Scale = 2; break;
12024   case MVT::v16i8:  NewVT = MVT::v4i32; Scale = 4; break;
12025   case MVT::v16i16: NewVT = MVT::v8i32; Scale = 2; break;
12026   case MVT::v32i8:  NewVT = MVT::v8i32; Scale = 4; break;
12027   }
12028
12029   SmallVector<int, 8> MaskVec;
12030   for (unsigned i = 0; i != NumElems; i += Scale) {
12031     int StartIdx = -1;
12032     for (unsigned j = 0; j != Scale; ++j) {
12033       int EltIdx = SVOp->getMaskElt(i+j);
12034       if (EltIdx < 0)
12035         continue;
12036       if (StartIdx < 0)
12037         StartIdx = (EltIdx / Scale);
12038       if (EltIdx != (int)(StartIdx*Scale + j))
12039         return SDValue();
12040     }
12041     MaskVec.push_back(StartIdx);
12042   }
12043
12044   SDValue V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(0));
12045   SDValue V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(1));
12046   return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]);
12047 }
12048
12049 /// getVZextMovL - Return a zero-extending vector move low node.
12050 ///
12051 static SDValue getVZextMovL(MVT VT, MVT OpVT,
12052                             SDValue SrcOp, SelectionDAG &DAG,
12053                             const X86Subtarget *Subtarget, SDLoc dl) {
12054   if (VT == MVT::v2f64 || VT == MVT::v4f32) {
12055     LoadSDNode *LD = nullptr;
12056     if (!isScalarLoadToVector(SrcOp.getNode(), &LD))
12057       LD = dyn_cast<LoadSDNode>(SrcOp);
12058     if (!LD) {
12059       // movssrr and movsdrr do not clear top bits. Try to use movd, movq
12060       // instead.
12061       MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32;
12062       if ((ExtVT != MVT::i64 || Subtarget->is64Bit()) &&
12063           SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR &&
12064           SrcOp.getOperand(0).getOpcode() == ISD::BITCAST &&
12065           SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) {
12066         // PR2108
12067         OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32;
12068         return DAG.getNode(ISD::BITCAST, dl, VT,
12069                            DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
12070                                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
12071                                                    OpVT,
12072                                                    SrcOp.getOperand(0)
12073                                                           .getOperand(0))));
12074       }
12075     }
12076   }
12077
12078   return DAG.getNode(ISD::BITCAST, dl, VT,
12079                      DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
12080                                  DAG.getNode(ISD::BITCAST, dl,
12081                                              OpVT, SrcOp)));
12082 }
12083
12084 /// LowerVECTOR_SHUFFLE_256 - Handle all 256-bit wide vectors shuffles
12085 /// which could not be matched by any known target speficic shuffle
12086 static SDValue
12087 LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
12088
12089   SDValue NewOp = Compact8x32ShuffleNode(SVOp, DAG);
12090   if (NewOp.getNode())
12091     return NewOp;
12092
12093   MVT VT = SVOp->getSimpleValueType(0);
12094
12095   unsigned NumElems = VT.getVectorNumElements();
12096   unsigned NumLaneElems = NumElems / 2;
12097
12098   SDLoc dl(SVOp);
12099   MVT EltVT = VT.getVectorElementType();
12100   MVT NVT = MVT::getVectorVT(EltVT, NumLaneElems);
12101   SDValue Output[2];
12102
12103   SmallVector<int, 16> Mask;
12104   for (unsigned l = 0; l < 2; ++l) {
12105     // Build a shuffle mask for the output, discovering on the fly which
12106     // input vectors to use as shuffle operands (recorded in InputUsed).
12107     // If building a suitable shuffle vector proves too hard, then bail
12108     // out with UseBuildVector set.
12109     bool UseBuildVector = false;
12110     int InputUsed[2] = { -1, -1 }; // Not yet discovered.
12111     unsigned LaneStart = l * NumLaneElems;
12112     for (unsigned i = 0; i != NumLaneElems; ++i) {
12113       // The mask element.  This indexes into the input.
12114       int Idx = SVOp->getMaskElt(i+LaneStart);
12115       if (Idx < 0) {
12116         // the mask element does not index into any input vector.
12117         Mask.push_back(-1);
12118         continue;
12119       }
12120
12121       // The input vector this mask element indexes into.
12122       int Input = Idx / NumLaneElems;
12123
12124       // Turn the index into an offset from the start of the input vector.
12125       Idx -= Input * NumLaneElems;
12126
12127       // Find or create a shuffle vector operand to hold this input.
12128       unsigned OpNo;
12129       for (OpNo = 0; OpNo < array_lengthof(InputUsed); ++OpNo) {
12130         if (InputUsed[OpNo] == Input)
12131           // This input vector is already an operand.
12132           break;
12133         if (InputUsed[OpNo] < 0) {
12134           // Create a new operand for this input vector.
12135           InputUsed[OpNo] = Input;
12136           break;
12137         }
12138       }
12139
12140       if (OpNo >= array_lengthof(InputUsed)) {
12141         // More than two input vectors used!  Give up on trying to create a
12142         // shuffle vector.  Insert all elements into a BUILD_VECTOR instead.
12143         UseBuildVector = true;
12144         break;
12145       }
12146
12147       // Add the mask index for the new shuffle vector.
12148       Mask.push_back(Idx + OpNo * NumLaneElems);
12149     }
12150
12151     if (UseBuildVector) {
12152       SmallVector<SDValue, 16> SVOps;
12153       for (unsigned i = 0; i != NumLaneElems; ++i) {
12154         // The mask element.  This indexes into the input.
12155         int Idx = SVOp->getMaskElt(i+LaneStart);
12156         if (Idx < 0) {
12157           SVOps.push_back(DAG.getUNDEF(EltVT));
12158           continue;
12159         }
12160
12161         // The input vector this mask element indexes into.
12162         int Input = Idx / NumElems;
12163
12164         // Turn the index into an offset from the start of the input vector.
12165         Idx -= Input * NumElems;
12166
12167         // Extract the vector element by hand.
12168         SVOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
12169                                     SVOp->getOperand(Input),
12170                                     DAG.getIntPtrConstant(Idx)));
12171       }
12172
12173       // Construct the output using a BUILD_VECTOR.
12174       Output[l] = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, SVOps);
12175     } else if (InputUsed[0] < 0) {
12176       // No input vectors were used! The result is undefined.
12177       Output[l] = DAG.getUNDEF(NVT);
12178     } else {
12179       SDValue Op0 = Extract128BitVector(SVOp->getOperand(InputUsed[0] / 2),
12180                                         (InputUsed[0] % 2) * NumLaneElems,
12181                                         DAG, dl);
12182       // If only one input was used, use an undefined vector for the other.
12183       SDValue Op1 = (InputUsed[1] < 0) ? DAG.getUNDEF(NVT) :
12184         Extract128BitVector(SVOp->getOperand(InputUsed[1] / 2),
12185                             (InputUsed[1] % 2) * NumLaneElems, DAG, dl);
12186       // At least one input vector was used. Create a new shuffle vector.
12187       Output[l] = DAG.getVectorShuffle(NVT, dl, Op0, Op1, &Mask[0]);
12188     }
12189
12190     Mask.clear();
12191   }
12192
12193   // Concatenate the result back
12194   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Output[0], Output[1]);
12195 }
12196
12197 /// LowerVECTOR_SHUFFLE_128v4 - Handle all 128-bit wide vectors with
12198 /// 4 elements, and match them with several different shuffle types.
12199 static SDValue
12200 LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
12201   SDValue V1 = SVOp->getOperand(0);
12202   SDValue V2 = SVOp->getOperand(1);
12203   SDLoc dl(SVOp);
12204   MVT VT = SVOp->getSimpleValueType(0);
12205
12206   assert(VT.is128BitVector() && "Unsupported vector size");
12207
12208   std::pair<int, int> Locs[4];
12209   int Mask1[] = { -1, -1, -1, -1 };
12210   SmallVector<int, 8> PermMask(SVOp->getMask().begin(), SVOp->getMask().end());
12211
12212   unsigned NumHi = 0;
12213   unsigned NumLo = 0;
12214   for (unsigned i = 0; i != 4; ++i) {
12215     int Idx = PermMask[i];
12216     if (Idx < 0) {
12217       Locs[i] = std::make_pair(-1, -1);
12218     } else {
12219       assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!");
12220       if (Idx < 4) {
12221         Locs[i] = std::make_pair(0, NumLo);
12222         Mask1[NumLo] = Idx;
12223         NumLo++;
12224       } else {
12225         Locs[i] = std::make_pair(1, NumHi);
12226         if (2+NumHi < 4)
12227           Mask1[2+NumHi] = Idx;
12228         NumHi++;
12229       }
12230     }
12231   }
12232
12233   if (NumLo <= 2 && NumHi <= 2) {
12234     // If no more than two elements come from either vector. This can be
12235     // implemented with two shuffles. First shuffle gather the elements.
12236     // The second shuffle, which takes the first shuffle as both of its
12237     // vector operands, put the elements into the right order.
12238     V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
12239
12240     int Mask2[] = { -1, -1, -1, -1 };
12241
12242     for (unsigned i = 0; i != 4; ++i)
12243       if (Locs[i].first != -1) {
12244         unsigned Idx = (i < 2) ? 0 : 4;
12245         Idx += Locs[i].first * 2 + Locs[i].second;
12246         Mask2[i] = Idx;
12247       }
12248
12249     return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]);
12250   }
12251
12252   if (NumLo == 3 || NumHi == 3) {
12253     // Otherwise, we must have three elements from one vector, call it X, and
12254     // one element from the other, call it Y.  First, use a shufps to build an
12255     // intermediate vector with the one element from Y and the element from X
12256     // that will be in the same half in the final destination (the indexes don't
12257     // matter). Then, use a shufps to build the final vector, taking the half
12258     // containing the element from Y from the intermediate, and the other half
12259     // from X.
12260     if (NumHi == 3) {
12261       // Normalize it so the 3 elements come from V1.
12262       CommuteVectorShuffleMask(PermMask, 4);
12263       std::swap(V1, V2);
12264     }
12265
12266     // Find the element from V2.
12267     unsigned HiIndex;
12268     for (HiIndex = 0; HiIndex < 3; ++HiIndex) {
12269       int Val = PermMask[HiIndex];
12270       if (Val < 0)
12271         continue;
12272       if (Val >= 4)
12273         break;
12274     }
12275
12276     Mask1[0] = PermMask[HiIndex];
12277     Mask1[1] = -1;
12278     Mask1[2] = PermMask[HiIndex^1];
12279     Mask1[3] = -1;
12280     V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
12281
12282     if (HiIndex >= 2) {
12283       Mask1[0] = PermMask[0];
12284       Mask1[1] = PermMask[1];
12285       Mask1[2] = HiIndex & 1 ? 6 : 4;
12286       Mask1[3] = HiIndex & 1 ? 4 : 6;
12287       return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
12288     }
12289
12290     Mask1[0] = HiIndex & 1 ? 2 : 0;
12291     Mask1[1] = HiIndex & 1 ? 0 : 2;
12292     Mask1[2] = PermMask[2];
12293     Mask1[3] = PermMask[3];
12294     if (Mask1[2] >= 0)
12295       Mask1[2] += 4;
12296     if (Mask1[3] >= 0)
12297       Mask1[3] += 4;
12298     return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]);
12299   }
12300
12301   // Break it into (shuffle shuffle_hi, shuffle_lo).
12302   int LoMask[] = { -1, -1, -1, -1 };
12303   int HiMask[] = { -1, -1, -1, -1 };
12304
12305   int *MaskPtr = LoMask;
12306   unsigned MaskIdx = 0;
12307   unsigned LoIdx = 0;
12308   unsigned HiIdx = 2;
12309   for (unsigned i = 0; i != 4; ++i) {
12310     if (i == 2) {
12311       MaskPtr = HiMask;
12312       MaskIdx = 1;
12313       LoIdx = 0;
12314       HiIdx = 2;
12315     }
12316     int Idx = PermMask[i];
12317     if (Idx < 0) {
12318       Locs[i] = std::make_pair(-1, -1);
12319     } else if (Idx < 4) {
12320       Locs[i] = std::make_pair(MaskIdx, LoIdx);
12321       MaskPtr[LoIdx] = Idx;
12322       LoIdx++;
12323     } else {
12324       Locs[i] = std::make_pair(MaskIdx, HiIdx);
12325       MaskPtr[HiIdx] = Idx;
12326       HiIdx++;
12327     }
12328   }
12329
12330   SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]);
12331   SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]);
12332   int MaskOps[] = { -1, -1, -1, -1 };
12333   for (unsigned i = 0; i != 4; ++i)
12334     if (Locs[i].first != -1)
12335       MaskOps[i] = Locs[i].first * 4 + Locs[i].second;
12336   return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]);
12337 }
12338
12339 static bool MayFoldVectorLoad(SDValue V) {
12340   while (V.hasOneUse() && V.getOpcode() == ISD::BITCAST)
12341     V = V.getOperand(0);
12342
12343   if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR)
12344     V = V.getOperand(0);
12345   if (V.hasOneUse() && V.getOpcode() == ISD::BUILD_VECTOR &&
12346       V.getNumOperands() == 2 && V.getOperand(1).getOpcode() == ISD::UNDEF)
12347     // BUILD_VECTOR (load), undef
12348     V = V.getOperand(0);
12349
12350   return MayFoldLoad(V);
12351 }
12352
12353 static
12354 SDValue getMOVDDup(SDValue &Op, SDLoc &dl, SDValue V1, SelectionDAG &DAG) {
12355   MVT VT = Op.getSimpleValueType();
12356
12357   // Canonicalize to v2f64.
12358   V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1);
12359   return DAG.getNode(ISD::BITCAST, dl, VT,
12360                      getTargetShuffleNode(X86ISD::MOVDDUP, dl, MVT::v2f64,
12361                                           V1, DAG));
12362 }
12363
12364 static
12365 SDValue getMOVLowToHigh(SDValue &Op, SDLoc &dl, SelectionDAG &DAG,
12366                         bool HasSSE2) {
12367   SDValue V1 = Op.getOperand(0);
12368   SDValue V2 = Op.getOperand(1);
12369   MVT VT = Op.getSimpleValueType();
12370
12371   assert(VT != MVT::v2i64 && "unsupported shuffle type");
12372
12373   if (HasSSE2 && VT == MVT::v2f64)
12374     return getTargetShuffleNode(X86ISD::MOVLHPD, dl, VT, V1, V2, DAG);
12375
12376   // v4f32 or v4i32: canonicalize to v4f32 (which is legal for SSE1)
12377   return DAG.getNode(ISD::BITCAST, dl, VT,
12378                      getTargetShuffleNode(X86ISD::MOVLHPS, dl, MVT::v4f32,
12379                            DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V1),
12380                            DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V2), DAG));
12381 }
12382
12383 static
12384 SDValue getMOVHighToLow(SDValue &Op, SDLoc &dl, SelectionDAG &DAG) {
12385   SDValue V1 = Op.getOperand(0);
12386   SDValue V2 = Op.getOperand(1);
12387   MVT VT = Op.getSimpleValueType();
12388
12389   assert((VT == MVT::v4i32 || VT == MVT::v4f32) &&
12390          "unsupported shuffle type");
12391
12392   if (V2.getOpcode() == ISD::UNDEF)
12393     V2 = V1;
12394
12395   // v4i32 or v4f32
12396   return getTargetShuffleNode(X86ISD::MOVHLPS, dl, VT, V1, V2, DAG);
12397 }
12398
12399 static
12400 SDValue getMOVLP(SDValue &Op, SDLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
12401   SDValue V1 = Op.getOperand(0);
12402   SDValue V2 = Op.getOperand(1);
12403   MVT VT = Op.getSimpleValueType();
12404   unsigned NumElems = VT.getVectorNumElements();
12405
12406   // Use MOVLPS and MOVLPD in case V1 or V2 are loads. During isel, the second
12407   // operand of these instructions is only memory, so check if there's a
12408   // potencial load folding here, otherwise use SHUFPS or MOVSD to match the
12409   // same masks.
12410   bool CanFoldLoad = false;
12411
12412   // Trivial case, when V2 comes from a load.
12413   if (MayFoldVectorLoad(V2))
12414     CanFoldLoad = true;
12415
12416   // When V1 is a load, it can be folded later into a store in isel, example:
12417   //  (store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), addr:$src1)
12418   //    turns into:
12419   //  (MOVLPSmr addr:$src1, VR128:$src2)
12420   // So, recognize this potential and also use MOVLPS or MOVLPD
12421   else if (MayFoldVectorLoad(V1) && MayFoldIntoStore(Op))
12422     CanFoldLoad = true;
12423
12424   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
12425   if (CanFoldLoad) {
12426     if (HasSSE2 && NumElems == 2)
12427       return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG);
12428
12429     if (NumElems == 4)
12430       // If we don't care about the second element, proceed to use movss.
12431       if (SVOp->getMaskElt(1) != -1)
12432         return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG);
12433   }
12434
12435   // movl and movlp will both match v2i64, but v2i64 is never matched by
12436   // movl earlier because we make it strict to avoid messing with the movlp load
12437   // folding logic (see the code above getMOVLP call). Match it here then,
12438   // this is horrible, but will stay like this until we move all shuffle
12439   // matching to x86 specific nodes. Note that for the 1st condition all
12440   // types are matched with movsd.
12441   if (HasSSE2) {
12442     // FIXME: isMOVLMask should be checked and matched before getMOVLP,
12443     // as to remove this logic from here, as much as possible
12444     if (NumElems == 2 || !isMOVLMask(SVOp->getMask(), VT))
12445       return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
12446     return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
12447   }
12448
12449   assert(VT != MVT::v4i32 && "unsupported shuffle type");
12450
12451   // Invert the operand order and use SHUFPS to match it.
12452   return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V2, V1,
12453                               getShuffleSHUFImmediate(SVOp), DAG);
12454 }
12455
12456 static SDValue NarrowVectorLoadToElement(LoadSDNode *Load, unsigned Index,
12457                                          SelectionDAG &DAG) {
12458   SDLoc dl(Load);
12459   MVT VT = Load->getSimpleValueType(0);
12460   MVT EVT = VT.getVectorElementType();
12461   SDValue Addr = Load->getOperand(1);
12462   SDValue NewAddr = DAG.getNode(
12463       ISD::ADD, dl, Addr.getSimpleValueType(), Addr,
12464       DAG.getConstant(Index * EVT.getStoreSize(), Addr.getSimpleValueType()));
12465
12466   SDValue NewLoad =
12467       DAG.getLoad(EVT, dl, Load->getChain(), NewAddr,
12468                   DAG.getMachineFunction().getMachineMemOperand(
12469                       Load->getMemOperand(), 0, EVT.getStoreSize()));
12470   return NewLoad;
12471 }
12472
12473 // It is only safe to call this function if isINSERTPSMask is true for
12474 // this shufflevector mask.
12475 static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl,
12476                            SelectionDAG &DAG) {
12477   // Generate an insertps instruction when inserting an f32 from memory onto a
12478   // v4f32 or when copying a member from one v4f32 to another.
12479   // We also use it for transferring i32 from one register to another,
12480   // since it simply copies the same bits.
12481   // If we're transferring an i32 from memory to a specific element in a
12482   // register, we output a generic DAG that will match the PINSRD
12483   // instruction.
12484   MVT VT = SVOp->getSimpleValueType(0);
12485   MVT EVT = VT.getVectorElementType();
12486   SDValue V1 = SVOp->getOperand(0);
12487   SDValue V2 = SVOp->getOperand(1);
12488   auto Mask = SVOp->getMask();
12489   assert((VT == MVT::v4f32 || VT == MVT::v4i32) &&
12490          "unsupported vector type for insertps/pinsrd");
12491
12492   auto FromV1Predicate = [](const int &i) { return i < 4 && i > -1; };
12493   auto FromV2Predicate = [](const int &i) { return i >= 4; };
12494   int FromV1 = std::count_if(Mask.begin(), Mask.end(), FromV1Predicate);
12495
12496   SDValue From;
12497   SDValue To;
12498   unsigned DestIndex;
12499   if (FromV1 == 1) {
12500     From = V1;
12501     To = V2;
12502     DestIndex = std::find_if(Mask.begin(), Mask.end(), FromV1Predicate) -
12503                 Mask.begin();
12504
12505     // If we have 1 element from each vector, we have to check if we're
12506     // changing V1's element's place. If so, we're done. Otherwise, we
12507     // should assume we're changing V2's element's place and behave
12508     // accordingly.
12509     int FromV2 = std::count_if(Mask.begin(), Mask.end(), FromV2Predicate);
12510     assert(DestIndex <= INT32_MAX && "truncated destination index");
12511     if (FromV1 == FromV2 &&
12512         static_cast<int>(DestIndex) == Mask[DestIndex] % 4) {
12513       From = V2;
12514       To = V1;
12515       DestIndex =
12516           std::find_if(Mask.begin(), Mask.end(), FromV2Predicate) - Mask.begin();
12517     }
12518   } else {
12519     assert(std::count_if(Mask.begin(), Mask.end(), FromV2Predicate) == 1 &&
12520            "More than one element from V1 and from V2, or no elements from one "
12521            "of the vectors. This case should not have returned true from "
12522            "isINSERTPSMask");
12523     From = V2;
12524     To = V1;
12525     DestIndex =
12526         std::find_if(Mask.begin(), Mask.end(), FromV2Predicate) - Mask.begin();
12527   }
12528
12529   // Get an index into the source vector in the range [0,4) (the mask is
12530   // in the range [0,8) because it can address V1 and V2)
12531   unsigned SrcIndex = Mask[DestIndex] % 4;
12532   if (MayFoldLoad(From)) {
12533     // Trivial case, when From comes from a load and is only used by the
12534     // shuffle. Make it use insertps from the vector that we need from that
12535     // load.
12536     SDValue NewLoad =
12537         NarrowVectorLoadToElement(cast<LoadSDNode>(From), SrcIndex, DAG);
12538     if (!NewLoad.getNode())
12539       return SDValue();
12540
12541     if (EVT == MVT::f32) {
12542       // Create this as a scalar to vector to match the instruction pattern.
12543       SDValue LoadScalarToVector =
12544           DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, NewLoad);
12545       SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4);
12546       return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, LoadScalarToVector,
12547                          InsertpsMask);
12548     } else { // EVT == MVT::i32
12549       // If we're getting an i32 from memory, use an INSERT_VECTOR_ELT
12550       // instruction, to match the PINSRD instruction, which loads an i32 to a
12551       // certain vector element.
12552       return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, To, NewLoad,
12553                          DAG.getConstant(DestIndex, MVT::i32));
12554     }
12555   }
12556
12557   // Vector-element-to-vector
12558   SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4 | SrcIndex << 6);
12559   return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, From, InsertpsMask);
12560 }
12561
12562 // Reduce a vector shuffle to zext.
12563 static SDValue LowerVectorIntExtend(SDValue Op, const X86Subtarget *Subtarget,
12564                                     SelectionDAG &DAG) {
12565   // PMOVZX is only available from SSE41.
12566   if (!Subtarget->hasSSE41())
12567     return SDValue();
12568
12569   MVT VT = Op.getSimpleValueType();
12570
12571   // Only AVX2 support 256-bit vector integer extending.
12572   if (!Subtarget->hasInt256() && VT.is256BitVector())
12573     return SDValue();
12574
12575   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
12576   SDLoc DL(Op);
12577   SDValue V1 = Op.getOperand(0);
12578   SDValue V2 = Op.getOperand(1);
12579   unsigned NumElems = VT.getVectorNumElements();
12580
12581   // Extending is an unary operation and the element type of the source vector
12582   // won't be equal to or larger than i64.
12583   if (V2.getOpcode() != ISD::UNDEF || !VT.isInteger() ||
12584       VT.getVectorElementType() == MVT::i64)
12585     return SDValue();
12586
12587   // Find the expansion ratio, e.g. expanding from i8 to i32 has a ratio of 4.
12588   unsigned Shift = 1; // Start from 2, i.e. 1 << 1.
12589   while ((1U << Shift) < NumElems) {
12590     if (SVOp->getMaskElt(1U << Shift) == 1)
12591       break;
12592     Shift += 1;
12593     // The maximal ratio is 8, i.e. from i8 to i64.
12594     if (Shift > 3)
12595       return SDValue();
12596   }
12597
12598   // Check the shuffle mask.
12599   unsigned Mask = (1U << Shift) - 1;
12600   for (unsigned i = 0; i != NumElems; ++i) {
12601     int EltIdx = SVOp->getMaskElt(i);
12602     if ((i & Mask) != 0 && EltIdx != -1)
12603       return SDValue();
12604     if ((i & Mask) == 0 && (unsigned)EltIdx != (i >> Shift))
12605       return SDValue();
12606   }
12607
12608   unsigned NBits = VT.getVectorElementType().getSizeInBits() << Shift;
12609   MVT NeVT = MVT::getIntegerVT(NBits);
12610   MVT NVT = MVT::getVectorVT(NeVT, NumElems >> Shift);
12611
12612   if (!DAG.getTargetLoweringInfo().isTypeLegal(NVT))
12613     return SDValue();
12614
12615   return DAG.getNode(ISD::BITCAST, DL, VT,
12616                      DAG.getNode(X86ISD::VZEXT, DL, NVT, V1));
12617 }
12618
12619 static SDValue NormalizeVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
12620                                       SelectionDAG &DAG) {
12621   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
12622   MVT VT = Op.getSimpleValueType();
12623   SDLoc dl(Op);
12624   SDValue V1 = Op.getOperand(0);
12625   SDValue V2 = Op.getOperand(1);
12626
12627   if (isZeroShuffle(SVOp))
12628     return getZeroVector(VT, Subtarget, DAG, dl);
12629
12630   // Handle splat operations
12631   if (SVOp->isSplat()) {
12632     // Use vbroadcast whenever the splat comes from a foldable load
12633     SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG);
12634     if (Broadcast.getNode())
12635       return Broadcast;
12636   }
12637
12638   // Check integer expanding shuffles.
12639   SDValue NewOp = LowerVectorIntExtend(Op, Subtarget, DAG);
12640   if (NewOp.getNode())
12641     return NewOp;
12642
12643   // If the shuffle can be profitably rewritten as a narrower shuffle, then
12644   // do it!
12645   if (VT == MVT::v8i16 || VT == MVT::v16i8 || VT == MVT::v16i16 ||
12646       VT == MVT::v32i8) {
12647     SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
12648     if (NewOp.getNode())
12649       return DAG.getNode(ISD::BITCAST, dl, VT, NewOp);
12650   } else if (VT.is128BitVector() && Subtarget->hasSSE2()) {
12651     // FIXME: Figure out a cleaner way to do this.
12652     if (ISD::isBuildVectorAllZeros(V2.getNode())) {
12653       SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
12654       if (NewOp.getNode()) {
12655         MVT NewVT = NewOp.getSimpleValueType();
12656         if (isCommutedMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(),
12657                                NewVT, true, false))
12658           return getVZextMovL(VT, NewVT, NewOp.getOperand(0), DAG, Subtarget,
12659                               dl);
12660       }
12661     } else if (ISD::isBuildVectorAllZeros(V1.getNode())) {
12662       SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
12663       if (NewOp.getNode()) {
12664         MVT NewVT = NewOp.getSimpleValueType();
12665         if (isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), NewVT))
12666           return getVZextMovL(VT, NewVT, NewOp.getOperand(1), DAG, Subtarget,
12667                               dl);
12668       }
12669     }
12670   }
12671   return SDValue();
12672 }
12673
12674 SDValue
12675 X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
12676   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
12677   SDValue V1 = Op.getOperand(0);
12678   SDValue V2 = Op.getOperand(1);
12679   MVT VT = Op.getSimpleValueType();
12680   SDLoc dl(Op);
12681   unsigned NumElems = VT.getVectorNumElements();
12682   bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
12683   bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
12684   bool V1IsSplat = false;
12685   bool V2IsSplat = false;
12686   bool HasSSE2 = Subtarget->hasSSE2();
12687   bool HasFp256    = Subtarget->hasFp256();
12688   bool HasInt256   = Subtarget->hasInt256();
12689   MachineFunction &MF = DAG.getMachineFunction();
12690   bool OptForSize =
12691       MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize);
12692
12693   // Check if we should use the experimental vector shuffle lowering. If so,
12694   // delegate completely to that code path.
12695   if (ExperimentalVectorShuffleLowering)
12696     return lowerVectorShuffle(Op, Subtarget, DAG);
12697
12698   assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles");
12699
12700   if (V1IsUndef && V2IsUndef)
12701     return DAG.getUNDEF(VT);
12702
12703   // When we create a shuffle node we put the UNDEF node to second operand,
12704   // but in some cases the first operand may be transformed to UNDEF.
12705   // In this case we should just commute the node.
12706   if (V1IsUndef)
12707     return DAG.getCommutedVectorShuffle(*SVOp);
12708
12709   // Vector shuffle lowering takes 3 steps:
12710   //
12711   // 1) Normalize the input vectors. Here splats, zeroed vectors, profitable
12712   //    narrowing and commutation of operands should be handled.
12713   // 2) Matching of shuffles with known shuffle masks to x86 target specific
12714   //    shuffle nodes.
12715   // 3) Rewriting of unmatched masks into new generic shuffle operations,
12716   //    so the shuffle can be broken into other shuffles and the legalizer can
12717   //    try the lowering again.
12718   //
12719   // The general idea is that no vector_shuffle operation should be left to
12720   // be matched during isel, all of them must be converted to a target specific
12721   // node here.
12722
12723   // Normalize the input vectors. Here splats, zeroed vectors, profitable
12724   // narrowing and commutation of operands should be handled. The actual code
12725   // doesn't include all of those, work in progress...
12726   SDValue NewOp = NormalizeVectorShuffle(Op, Subtarget, DAG);
12727   if (NewOp.getNode())
12728     return NewOp;
12729
12730   SmallVector<int, 8> M(SVOp->getMask().begin(), SVOp->getMask().end());
12731
12732   // NOTE: isPSHUFDMask can also match both masks below (unpckl_undef and
12733   // unpckh_undef). Only use pshufd if speed is more important than size.
12734   if (OptForSize && isUNPCKL_v_undef_Mask(M, VT, HasInt256))
12735     return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
12736   if (OptForSize && isUNPCKH_v_undef_Mask(M, VT, HasInt256))
12737     return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
12738
12739   if (isMOVDDUPMask(M, VT) && Subtarget->hasSSE3() &&
12740       V2IsUndef && MayFoldVectorLoad(V1))
12741     return getMOVDDup(Op, dl, V1, DAG);
12742
12743   if (isMOVHLPS_v_undef_Mask(M, VT))
12744     return getMOVHighToLow(Op, dl, DAG);
12745
12746   // Use to match splats
12747   if (HasSSE2 && isUNPCKHMask(M, VT, HasInt256) && V2IsUndef &&
12748       (VT == MVT::v2f64 || VT == MVT::v2i64))
12749     return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
12750
12751   if (isPSHUFDMask(M, VT)) {
12752     // The actual implementation will match the mask in the if above and then
12753     // during isel it can match several different instructions, not only pshufd
12754     // as its name says, sad but true, emulate the behavior for now...
12755     if (isMOVDDUPMask(M, VT) && ((VT == MVT::v4f32 || VT == MVT::v2i64)))
12756       return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V1, DAG);
12757
12758     unsigned TargetMask = getShuffleSHUFImmediate(SVOp);
12759
12760     if (HasSSE2 && (VT == MVT::v4f32 || VT == MVT::v4i32))
12761       return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG);
12762
12763     if (HasFp256 && (VT == MVT::v4f32 || VT == MVT::v2f64))
12764       return getTargetShuffleNode(X86ISD::VPERMILPI, dl, VT, V1, TargetMask,
12765                                   DAG);
12766
12767     return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V1,
12768                                 TargetMask, DAG);
12769   }
12770
12771   if (isPALIGNRMask(M, VT, Subtarget))
12772     return getTargetShuffleNode(X86ISD::PALIGNR, dl, VT, V1, V2,
12773                                 getShufflePALIGNRImmediate(SVOp),
12774                                 DAG);
12775
12776   if (isVALIGNMask(M, VT, Subtarget))
12777     return getTargetShuffleNode(X86ISD::VALIGN, dl, VT, V1, V2,
12778                                 getShuffleVALIGNImmediate(SVOp),
12779                                 DAG);
12780
12781   // Check if this can be converted into a logical shift.
12782   bool isLeft = false;
12783   unsigned ShAmt = 0;
12784   SDValue ShVal;
12785   bool isShift = HasSSE2 && isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt);
12786   if (isShift && ShVal.hasOneUse()) {
12787     // If the shifted value has multiple uses, it may be cheaper to use
12788     // v_set0 + movlhps or movhlps, etc.
12789     MVT EltVT = VT.getVectorElementType();
12790     ShAmt *= EltVT.getSizeInBits();
12791     return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
12792   }
12793
12794   if (isMOVLMask(M, VT)) {
12795     if (ISD::isBuildVectorAllZeros(V1.getNode()))
12796       return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl);
12797     if (!isMOVLPMask(M, VT)) {
12798       if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64))
12799         return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
12800
12801       if (VT == MVT::v4i32 || VT == MVT::v4f32)
12802         return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
12803     }
12804   }
12805
12806   // FIXME: fold these into legal mask.
12807   if (isMOVLHPSMask(M, VT) && !isUNPCKLMask(M, VT, HasInt256))
12808     return getMOVLowToHigh(Op, dl, DAG, HasSSE2);
12809
12810   if (isMOVHLPSMask(M, VT))
12811     return getMOVHighToLow(Op, dl, DAG);
12812
12813   if (V2IsUndef && isMOVSHDUPMask(M, VT, Subtarget))
12814     return getTargetShuffleNode(X86ISD::MOVSHDUP, dl, VT, V1, DAG);
12815
12816   if (V2IsUndef && isMOVSLDUPMask(M, VT, Subtarget))
12817     return getTargetShuffleNode(X86ISD::MOVSLDUP, dl, VT, V1, DAG);
12818
12819   if (isMOVLPMask(M, VT))
12820     return getMOVLP(Op, dl, DAG, HasSSE2);
12821
12822   if (ShouldXformToMOVHLPS(M, VT) ||
12823       ShouldXformToMOVLP(V1.getNode(), V2.getNode(), M, VT))
12824     return DAG.getCommutedVectorShuffle(*SVOp);
12825
12826   if (isShift) {
12827     // No better options. Use a vshldq / vsrldq.
12828     MVT EltVT = VT.getVectorElementType();
12829     ShAmt *= EltVT.getSizeInBits();
12830     return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
12831   }
12832
12833   bool Commuted = false;
12834   // FIXME: This should also accept a bitcast of a splat?  Be careful, not
12835   // 1,1,1,1 -> v8i16 though.
12836   BitVector UndefElements;
12837   if (auto *BVOp = dyn_cast<BuildVectorSDNode>(V1.getNode()))
12838     if (BVOp->getConstantSplatNode(&UndefElements) && UndefElements.none())
12839       V1IsSplat = true;
12840   if (auto *BVOp = dyn_cast<BuildVectorSDNode>(V2.getNode()))
12841     if (BVOp->getConstantSplatNode(&UndefElements) && UndefElements.none())
12842       V2IsSplat = true;
12843
12844   // Canonicalize the splat or undef, if present, to be on the RHS.
12845   if (!V2IsUndef && V1IsSplat && !V2IsSplat) {
12846     CommuteVectorShuffleMask(M, NumElems);
12847     std::swap(V1, V2);
12848     std::swap(V1IsSplat, V2IsSplat);
12849     Commuted = true;
12850   }
12851
12852   if (isCommutedMOVLMask(M, VT, V2IsSplat, V2IsUndef)) {
12853     // Shuffling low element of v1 into undef, just return v1.
12854     if (V2IsUndef)
12855       return V1;
12856     // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which
12857     // the instruction selector will not match, so get a canonical MOVL with
12858     // swapped operands to undo the commute.
12859     return getMOVL(DAG, dl, VT, V2, V1);
12860   }
12861
12862   if (isUNPCKLMask(M, VT, HasInt256))
12863     return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
12864
12865   if (isUNPCKHMask(M, VT, HasInt256))
12866     return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
12867
12868   if (V2IsSplat) {
12869     // Normalize mask so all entries that point to V2 points to its first
12870     // element then try to match unpck{h|l} again. If match, return a
12871     // new vector_shuffle with the corrected mask.p
12872     SmallVector<int, 8> NewMask(M.begin(), M.end());
12873     NormalizeMask(NewMask, NumElems);
12874     if (isUNPCKLMask(NewMask, VT, HasInt256, true))
12875       return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
12876     if (isUNPCKHMask(NewMask, VT, HasInt256, true))
12877       return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
12878   }
12879
12880   if (Commuted) {
12881     // Commute is back and try unpck* again.
12882     // FIXME: this seems wrong.
12883     CommuteVectorShuffleMask(M, NumElems);
12884     std::swap(V1, V2);
12885     std::swap(V1IsSplat, V2IsSplat);
12886
12887     if (isUNPCKLMask(M, VT, HasInt256))
12888       return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
12889
12890     if (isUNPCKHMask(M, VT, HasInt256))
12891       return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
12892   }
12893
12894   // Normalize the node to match x86 shuffle ops if needed
12895   if (!V2IsUndef && (isSHUFPMask(M, VT, /* Commuted */ true)))
12896     return DAG.getCommutedVectorShuffle(*SVOp);
12897
12898   // The checks below are all present in isShuffleMaskLegal, but they are
12899   // inlined here right now to enable us to directly emit target specific
12900   // nodes, and remove one by one until they don't return Op anymore.
12901
12902   if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) &&
12903       SVOp->getSplatIndex() == 0 && V2IsUndef) {
12904     if (VT == MVT::v2f64 || VT == MVT::v2i64)
12905       return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
12906   }
12907
12908   if (isPSHUFHWMask(M, VT, HasInt256))
12909     return getTargetShuffleNode(X86ISD::PSHUFHW, dl, VT, V1,
12910                                 getShufflePSHUFHWImmediate(SVOp),
12911                                 DAG);
12912
12913   if (isPSHUFLWMask(M, VT, HasInt256))
12914     return getTargetShuffleNode(X86ISD::PSHUFLW, dl, VT, V1,
12915                                 getShufflePSHUFLWImmediate(SVOp),
12916                                 DAG);
12917
12918   unsigned MaskValue;
12919   if (isBlendMask(M, VT, Subtarget->hasSSE41(), HasInt256, &MaskValue))
12920     return LowerVECTOR_SHUFFLEtoBlend(SVOp, MaskValue, Subtarget, DAG);
12921
12922   if (isSHUFPMask(M, VT))
12923     return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V2,
12924                                 getShuffleSHUFImmediate(SVOp), DAG);
12925
12926   if (isUNPCKL_v_undef_Mask(M, VT, HasInt256))
12927     return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
12928   if (isUNPCKH_v_undef_Mask(M, VT, HasInt256))
12929     return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
12930
12931   //===--------------------------------------------------------------------===//
12932   // Generate target specific nodes for 128 or 256-bit shuffles only
12933   // supported in the AVX instruction set.
12934   //
12935
12936   // Handle VMOVDDUPY permutations
12937   if (V2IsUndef && isMOVDDUPYMask(M, VT, HasFp256))
12938     return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG);
12939
12940   // Handle VPERMILPS/D* permutations
12941   if (isVPERMILPMask(M, VT)) {
12942     if ((HasInt256 && VT == MVT::v8i32) || VT == MVT::v16i32)
12943       return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1,
12944                                   getShuffleSHUFImmediate(SVOp), DAG);
12945     return getTargetShuffleNode(X86ISD::VPERMILPI, dl, VT, V1,
12946                                 getShuffleSHUFImmediate(SVOp), DAG);
12947   }
12948
12949   unsigned Idx;
12950   if (VT.is512BitVector() && isINSERT64x4Mask(M, VT, &Idx))
12951     return Insert256BitVector(V1, Extract256BitVector(V2, 0, DAG, dl),
12952                               Idx*(NumElems/2), DAG, dl);
12953
12954   // Handle VPERM2F128/VPERM2I128 permutations
12955   if (isVPERM2X128Mask(M, VT, HasFp256))
12956     return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1,
12957                                 V2, getShuffleVPERM2X128Immediate(SVOp), DAG);
12958
12959   if (Subtarget->hasSSE41() && isINSERTPSMask(M, VT))
12960     return getINSERTPS(SVOp, dl, DAG);
12961
12962   unsigned Imm8;
12963   if (V2IsUndef && HasInt256 && isPermImmMask(M, VT, Imm8))
12964     return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1, Imm8, DAG);
12965
12966   if ((V2IsUndef && HasInt256 && VT.is256BitVector() && NumElems == 8) ||
12967       VT.is512BitVector()) {
12968     MVT MaskEltVT = MVT::getIntegerVT(VT.getVectorElementType().getSizeInBits());
12969     MVT MaskVectorVT = MVT::getVectorVT(MaskEltVT, NumElems);
12970     SmallVector<SDValue, 16> permclMask;
12971     for (unsigned i = 0; i != NumElems; ++i) {
12972       permclMask.push_back(DAG.getConstant((M[i]>=0) ? M[i] : 0, MaskEltVT));
12973     }
12974
12975     SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVectorVT, permclMask);
12976     if (V2IsUndef)
12977       // Bitcast is for VPERMPS since mask is v8i32 but node takes v8f32
12978       return DAG.getNode(X86ISD::VPERMV, dl, VT,
12979                           DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1);
12980     return DAG.getNode(X86ISD::VPERMV3, dl, VT, V1,
12981                        DAG.getNode(ISD::BITCAST, dl, VT, Mask), V2);
12982   }
12983
12984   //===--------------------------------------------------------------------===//
12985   // Since no target specific shuffle was selected for this generic one,
12986   // lower it into other known shuffles. FIXME: this isn't true yet, but
12987   // this is the plan.
12988   //
12989
12990   // Handle v8i16 specifically since SSE can do byte extraction and insertion.
12991   if (VT == MVT::v8i16) {
12992     SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(Op, Subtarget, DAG);
12993     if (NewOp.getNode())
12994       return NewOp;
12995   }
12996
12997   if (VT == MVT::v16i16 && HasInt256) {
12998     SDValue NewOp = LowerVECTOR_SHUFFLEv16i16(Op, DAG);
12999     if (NewOp.getNode())
13000       return NewOp;
13001   }
13002
13003   if (VT == MVT::v16i8) {
13004     SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, Subtarget, DAG);
13005     if (NewOp.getNode())
13006       return NewOp;
13007   }
13008
13009   if (VT == MVT::v32i8) {
13010     SDValue NewOp = LowerVECTOR_SHUFFLEv32i8(SVOp, Subtarget, DAG);
13011     if (NewOp.getNode())
13012       return NewOp;
13013   }
13014
13015   // Handle all 128-bit wide vectors with 4 elements, and match them with
13016   // several different shuffle types.
13017   if (NumElems == 4 && VT.is128BitVector())
13018     return LowerVECTOR_SHUFFLE_128v4(SVOp, DAG);
13019
13020   // Handle general 256-bit shuffles
13021   if (VT.is256BitVector())
13022     return LowerVECTOR_SHUFFLE_256(SVOp, DAG);
13023
13024   return SDValue();
13025 }
13026
13027 // This function assumes its argument is a BUILD_VECTOR of constants or
13028 // undef SDNodes. i.e: ISD::isBuildVectorOfConstantSDNodes(BuildVector) is
13029 // true.
13030 static bool BUILD_VECTORtoBlendMask(BuildVectorSDNode *BuildVector,
13031                                     unsigned &MaskValue) {
13032   MaskValue = 0;
13033   unsigned NumElems = BuildVector->getNumOperands();
13034   // There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
13035   unsigned NumLanes = (NumElems - 1) / 8 + 1;
13036   unsigned NumElemsInLane = NumElems / NumLanes;
13037
13038   // Blend for v16i16 should be symetric for the both lanes.
13039   for (unsigned i = 0; i < NumElemsInLane; ++i) {
13040     SDValue EltCond = BuildVector->getOperand(i);
13041     SDValue SndLaneEltCond =
13042         (NumLanes == 2) ? BuildVector->getOperand(i + NumElemsInLane) : EltCond;
13043
13044     int Lane1Cond = -1, Lane2Cond = -1;
13045     if (isa<ConstantSDNode>(EltCond))
13046       Lane1Cond = !isZero(EltCond);
13047     if (isa<ConstantSDNode>(SndLaneEltCond))
13048       Lane2Cond = !isZero(SndLaneEltCond);
13049
13050     if (Lane1Cond == Lane2Cond || Lane2Cond < 0)
13051       // Lane1Cond != 0, means we want the first argument.
13052       // Lane1Cond == 0, means we want the second argument.
13053       // The encoding of this argument is 0 for the first argument, 1
13054       // for the second. Therefore, invert the condition.
13055       MaskValue |= !Lane1Cond << i;
13056     else if (Lane1Cond < 0)
13057       MaskValue |= !Lane2Cond << i;
13058     else
13059       return false;
13060   }
13061   return true;
13062 }
13063
13064 /// \brief Try to lower a VSELECT instruction to an immediate-controlled blend
13065 /// instruction.
13066 static SDValue lowerVSELECTtoBLENDI(SDValue Op, const X86Subtarget *Subtarget,
13067                                     SelectionDAG &DAG) {
13068   SDValue Cond = Op.getOperand(0);
13069   SDValue LHS = Op.getOperand(1);
13070   SDValue RHS = Op.getOperand(2);
13071   SDLoc dl(Op);
13072   MVT VT = Op.getSimpleValueType();
13073   MVT EltVT = VT.getVectorElementType();
13074   unsigned NumElems = VT.getVectorNumElements();
13075
13076   // There is no blend with immediate in AVX-512.
13077   if (VT.is512BitVector())
13078     return SDValue();
13079
13080   if (!Subtarget->hasSSE41() || EltVT == MVT::i8)
13081     return SDValue();
13082   if (!Subtarget->hasInt256() && VT == MVT::v16i16)
13083     return SDValue();
13084
13085   if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
13086     return SDValue();
13087
13088   // Check the mask for BLEND and build the value.
13089   unsigned MaskValue = 0;
13090   if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue))
13091     return SDValue();
13092
13093   // Convert i32 vectors to floating point if it is not AVX2.
13094   // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors.
13095   MVT BlendVT = VT;
13096   if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) {
13097     BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()),
13098                                NumElems);
13099     LHS = DAG.getNode(ISD::BITCAST, dl, VT, LHS);
13100     RHS = DAG.getNode(ISD::BITCAST, dl, VT, RHS);
13101   }
13102
13103   SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, LHS, RHS,
13104                             DAG.getConstant(MaskValue, MVT::i32));
13105   return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
13106 }
13107
13108 SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
13109   // A vselect where all conditions and data are constants can be optimized into
13110   // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
13111   if (ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(0).getNode()) &&
13112       ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(1).getNode()) &&
13113       ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode()))
13114     return SDValue();
13115
13116   SDValue BlendOp = lowerVSELECTtoBLENDI(Op, Subtarget, DAG);
13117   if (BlendOp.getNode())
13118     return BlendOp;
13119
13120   // Some types for vselect were previously set to Expand, not Legal or
13121   // Custom. Return an empty SDValue so we fall-through to Expand, after
13122   // the Custom lowering phase.
13123   MVT VT = Op.getSimpleValueType();
13124   switch (VT.SimpleTy) {
13125   default:
13126     break;
13127   case MVT::v8i16:
13128   case MVT::v16i16:
13129     if (Subtarget->hasBWI() && Subtarget->hasVLX())
13130       break;
13131     return SDValue();
13132   }
13133
13134   // We couldn't create a "Blend with immediate" node.
13135   // This node should still be legal, but we'll have to emit a blendv*
13136   // instruction.
13137   return Op;
13138 }
13139
13140 static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
13141   MVT VT = Op.getSimpleValueType();
13142   SDLoc dl(Op);
13143
13144   if (!Op.getOperand(0).getSimpleValueType().is128BitVector())
13145     return SDValue();
13146
13147   if (VT.getSizeInBits() == 8) {
13148     SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
13149                                   Op.getOperand(0), Op.getOperand(1));
13150     SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
13151                                   DAG.getValueType(VT));
13152     return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
13153   }
13154
13155   if (VT.getSizeInBits() == 16) {
13156     unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
13157     // If Idx is 0, it's cheaper to do a move instead of a pextrw.
13158     if (Idx == 0)
13159       return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
13160                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
13161                                      DAG.getNode(ISD::BITCAST, dl,
13162                                                  MVT::v4i32,
13163                                                  Op.getOperand(0)),
13164                                      Op.getOperand(1)));
13165     SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
13166                                   Op.getOperand(0), Op.getOperand(1));
13167     SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
13168                                   DAG.getValueType(VT));
13169     return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
13170   }
13171
13172   if (VT == MVT::f32) {
13173     // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
13174     // the result back to FR32 register. It's only worth matching if the
13175     // result has a single use which is a store or a bitcast to i32.  And in
13176     // the case of a store, it's not worth it if the index is a constant 0,
13177     // because a MOVSSmr can be used instead, which is smaller and faster.
13178     if (!Op.hasOneUse())
13179       return SDValue();
13180     SDNode *User = *Op.getNode()->use_begin();
13181     if ((User->getOpcode() != ISD::STORE ||
13182          (isa<ConstantSDNode>(Op.getOperand(1)) &&
13183           cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) &&
13184         (User->getOpcode() != ISD::BITCAST ||
13185          User->getValueType(0) != MVT::i32))
13186       return SDValue();
13187     SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
13188                                   DAG.getNode(ISD::BITCAST, dl, MVT::v4i32,
13189                                               Op.getOperand(0)),
13190                                               Op.getOperand(1));
13191     return DAG.getNode(ISD::BITCAST, dl, MVT::f32, Extract);
13192   }
13193
13194   if (VT == MVT::i32 || VT == MVT::i64) {
13195     // ExtractPS/pextrq works with constant index.
13196     if (isa<ConstantSDNode>(Op.getOperand(1)))
13197       return Op;
13198   }
13199   return SDValue();
13200 }
13201
13202 /// Extract one bit from mask vector, like v16i1 or v8i1.
13203 /// AVX-512 feature.
13204 SDValue
13205 X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const {
13206   SDValue Vec = Op.getOperand(0);
13207   SDLoc dl(Vec);
13208   MVT VecVT = Vec.getSimpleValueType();
13209   SDValue Idx = Op.getOperand(1);
13210   MVT EltVT = Op.getSimpleValueType();
13211
13212   assert((EltVT == MVT::i1) && "Unexpected operands in ExtractBitFromMaskVector");
13213   assert((VecVT.getVectorNumElements() <= 16 || Subtarget->hasBWI()) &&
13214          "Unexpected vector type in ExtractBitFromMaskVector");
13215
13216   // variable index can't be handled in mask registers,
13217   // extend vector to VR512
13218   if (!isa<ConstantSDNode>(Idx)) {
13219     MVT ExtVT = (VecVT == MVT::v8i1 ?  MVT::v8i64 : MVT::v16i32);
13220     SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Vec);
13221     SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
13222                               ExtVT.getVectorElementType(), Ext, Idx);
13223     return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
13224   }
13225
13226   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
13227   const TargetRegisterClass* rc = getRegClassFor(VecVT);
13228   if (!Subtarget->hasDQI() && (VecVT.getVectorNumElements() <= 8))
13229     rc = getRegClassFor(MVT::v16i1);
13230   unsigned MaxSift = rc->getSize()*8 - 1;
13231   Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
13232                     DAG.getConstant(MaxSift - IdxVal, MVT::i8));
13233   Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec,
13234                     DAG.getConstant(MaxSift, MVT::i8));
13235   return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i1, Vec,
13236                        DAG.getIntPtrConstant(0));
13237 }
13238
13239 SDValue
13240 X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
13241                                            SelectionDAG &DAG) const {
13242   SDLoc dl(Op);
13243   SDValue Vec = Op.getOperand(0);
13244   MVT VecVT = Vec.getSimpleValueType();
13245   SDValue Idx = Op.getOperand(1);
13246
13247   if (Op.getSimpleValueType() == MVT::i1)
13248     return ExtractBitFromMaskVector(Op, DAG);
13249
13250   if (!isa<ConstantSDNode>(Idx)) {
13251     if (VecVT.is512BitVector() ||
13252         (VecVT.is256BitVector() && Subtarget->hasInt256() &&
13253          VecVT.getVectorElementType().getSizeInBits() == 32)) {
13254
13255       MVT MaskEltVT =
13256         MVT::getIntegerVT(VecVT.getVectorElementType().getSizeInBits());
13257       MVT MaskVT = MVT::getVectorVT(MaskEltVT, VecVT.getSizeInBits() /
13258                                     MaskEltVT.getSizeInBits());
13259
13260       Idx = DAG.getZExtOrTrunc(Idx, dl, MaskEltVT);
13261       SDValue Mask = DAG.getNode(X86ISD::VINSERT, dl, MaskVT,
13262                                 getZeroVector(MaskVT, Subtarget, DAG, dl),
13263                                 Idx, DAG.getConstant(0, getPointerTy()));
13264       SDValue Perm = DAG.getNode(X86ISD::VPERMV, dl, VecVT, Mask, Vec);
13265       return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(),
13266                         Perm, DAG.getConstant(0, getPointerTy()));
13267     }
13268     return SDValue();
13269   }
13270
13271   // If this is a 256-bit vector result, first extract the 128-bit vector and
13272   // then extract the element from the 128-bit vector.
13273   if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
13274
13275     unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
13276     // Get the 128-bit vector.
13277     Vec = Extract128BitVector(Vec, IdxVal, DAG, dl);
13278     MVT EltVT = VecVT.getVectorElementType();
13279
13280     unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
13281
13282     //if (IdxVal >= NumElems/2)
13283     //  IdxVal -= NumElems/2;
13284     IdxVal -= (IdxVal/ElemsPerChunk)*ElemsPerChunk;
13285     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
13286                        DAG.getConstant(IdxVal, MVT::i32));
13287   }
13288
13289   assert(VecVT.is128BitVector() && "Unexpected vector length");
13290
13291   if (Subtarget->hasSSE41()) {
13292     SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG);
13293     if (Res.getNode())
13294       return Res;
13295   }
13296
13297   MVT VT = Op.getSimpleValueType();
13298   // TODO: handle v16i8.
13299   if (VT.getSizeInBits() == 16) {
13300     SDValue Vec = Op.getOperand(0);
13301     unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
13302     if (Idx == 0)
13303       return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
13304                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
13305                                      DAG.getNode(ISD::BITCAST, dl,
13306                                                  MVT::v4i32, Vec),
13307                                      Op.getOperand(1)));
13308     // Transform it so it match pextrw which produces a 32-bit result.
13309     MVT EltVT = MVT::i32;
13310     SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT,
13311                                   Op.getOperand(0), Op.getOperand(1));
13312     SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract,
13313                                   DAG.getValueType(VT));
13314     return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
13315   }
13316
13317   if (VT.getSizeInBits() == 32) {
13318     unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
13319     if (Idx == 0)
13320       return Op;
13321
13322     // SHUFPS the element to the lowest double word, then movss.
13323     int Mask[4] = { static_cast<int>(Idx), -1, -1, -1 };
13324     MVT VVT = Op.getOperand(0).getSimpleValueType();
13325     SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
13326                                        DAG.getUNDEF(VVT), Mask);
13327     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
13328                        DAG.getIntPtrConstant(0));
13329   }
13330
13331   if (VT.getSizeInBits() == 64) {
13332     // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
13333     // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
13334     //        to match extract_elt for f64.
13335     unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
13336     if (Idx == 0)
13337       return Op;
13338
13339     // UNPCKHPD the element to the lowest double word, then movsd.
13340     // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
13341     // to a f64mem, the whole operation is folded into a single MOVHPDmr.
13342     int Mask[2] = { 1, -1 };
13343     MVT VVT = Op.getOperand(0).getSimpleValueType();
13344     SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
13345                                        DAG.getUNDEF(VVT), Mask);
13346     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
13347                        DAG.getIntPtrConstant(0));
13348   }
13349
13350   return SDValue();
13351 }
13352
13353 /// Insert one bit to mask vector, like v16i1 or v8i1.
13354 /// AVX-512 feature.
13355 SDValue
13356 X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
13357   SDLoc dl(Op);
13358   SDValue Vec = Op.getOperand(0);
13359   SDValue Elt = Op.getOperand(1);
13360   SDValue Idx = Op.getOperand(2);
13361   MVT VecVT = Vec.getSimpleValueType();
13362
13363   if (!isa<ConstantSDNode>(Idx)) {
13364     // Non constant index. Extend source and destination,
13365     // insert element and then truncate the result.
13366     MVT ExtVecVT = (VecVT == MVT::v8i1 ?  MVT::v8i64 : MVT::v16i32);
13367     MVT ExtEltVT = (VecVT == MVT::v8i1 ?  MVT::i64 : MVT::i32);
13368     SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
13369       DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec),
13370       DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx);
13371     return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
13372   }
13373
13374   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
13375   SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt);
13376   if (Vec.getOpcode() == ISD::UNDEF)
13377     return DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
13378                        DAG.getConstant(IdxVal, MVT::i8));
13379   const TargetRegisterClass* rc = getRegClassFor(VecVT);
13380   unsigned MaxSift = rc->getSize()*8 - 1;
13381   EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
13382                     DAG.getConstant(MaxSift, MVT::i8));
13383   EltInVec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, EltInVec,
13384                     DAG.getConstant(MaxSift - IdxVal, MVT::i8));
13385   return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
13386 }
13387
13388 SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
13389                                                   SelectionDAG &DAG) const {
13390   MVT VT = Op.getSimpleValueType();
13391   MVT EltVT = VT.getVectorElementType();
13392
13393   if (EltVT == MVT::i1)
13394     return InsertBitToMaskVector(Op, DAG);
13395
13396   SDLoc dl(Op);
13397   SDValue N0 = Op.getOperand(0);
13398   SDValue N1 = Op.getOperand(1);
13399   SDValue N2 = Op.getOperand(2);
13400   if (!isa<ConstantSDNode>(N2))
13401     return SDValue();
13402   auto *N2C = cast<ConstantSDNode>(N2);
13403   unsigned IdxVal = N2C->getZExtValue();
13404
13405   // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
13406   // into that, and then insert the subvector back into the result.
13407   if (VT.is256BitVector() || VT.is512BitVector()) {
13408     // Get the desired 128-bit vector half.
13409     SDValue V = Extract128BitVector(N0, IdxVal, DAG, dl);
13410
13411     // Insert the element into the desired half.
13412     unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
13413     unsigned IdxIn128 = IdxVal - (IdxVal / NumEltsIn128) * NumEltsIn128;
13414
13415     V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
13416                     DAG.getConstant(IdxIn128, MVT::i32));
13417
13418     // Insert the changed part back to the 256-bit vector
13419     return Insert128BitVector(N0, V, IdxVal, DAG, dl);
13420   }
13421   assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
13422
13423   if (Subtarget->hasSSE41()) {
13424     if (EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) {
13425       unsigned Opc;
13426       if (VT == MVT::v8i16) {
13427         Opc = X86ISD::PINSRW;
13428       } else {
13429         assert(VT == MVT::v16i8);
13430         Opc = X86ISD::PINSRB;
13431       }
13432
13433       // Transform it so it match pinsr{b,w} which expects a GR32 as its second
13434       // argument.
13435       if (N1.getValueType() != MVT::i32)
13436         N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
13437       if (N2.getValueType() != MVT::i32)
13438         N2 = DAG.getIntPtrConstant(IdxVal);
13439       return DAG.getNode(Opc, dl, VT, N0, N1, N2);
13440     }
13441
13442     if (EltVT == MVT::f32) {
13443       // Bits [7:6] of the constant are the source select.  This will always be
13444       //  zero here.  The DAG Combiner may combine an extract_elt index into
13445       //  these
13446       //  bits.  For example (insert (extract, 3), 2) could be matched by
13447       //  putting
13448       //  the '3' into bits [7:6] of X86ISD::INSERTPS.
13449       // Bits [5:4] of the constant are the destination select.  This is the
13450       //  value of the incoming immediate.
13451       // Bits [3:0] of the constant are the zero mask.  The DAG Combiner may
13452       //   combine either bitwise AND or insert of float 0.0 to set these bits.
13453       N2 = DAG.getIntPtrConstant(IdxVal << 4);
13454       // Create this as a scalar to vector..
13455       N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
13456       return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
13457     }
13458
13459     if (EltVT == MVT::i32 || EltVT == MVT::i64) {
13460       // PINSR* works with constant index.
13461       return Op;
13462     }
13463   }
13464
13465   if (EltVT == MVT::i8)
13466     return SDValue();
13467
13468   if (EltVT.getSizeInBits() == 16) {
13469     // Transform it so it match pinsrw which expects a 16-bit value in a GR32
13470     // as its second argument.
13471     if (N1.getValueType() != MVT::i32)
13472       N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
13473     if (N2.getValueType() != MVT::i32)
13474       N2 = DAG.getIntPtrConstant(IdxVal);
13475     return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2);
13476   }
13477   return SDValue();
13478 }
13479
13480 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
13481   SDLoc dl(Op);
13482   MVT OpVT = Op.getSimpleValueType();
13483
13484   // If this is a 256-bit vector result, first insert into a 128-bit
13485   // vector and then insert into the 256-bit vector.
13486   if (!OpVT.is128BitVector()) {
13487     // Insert into a 128-bit vector.
13488     unsigned SizeFactor = OpVT.getSizeInBits()/128;
13489     MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
13490                                  OpVT.getVectorNumElements() / SizeFactor);
13491
13492     Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
13493
13494     // Insert the 128-bit vector.
13495     return Insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
13496   }
13497
13498   if (OpVT == MVT::v1i64 &&
13499       Op.getOperand(0).getValueType() == MVT::i64)
13500     return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0));
13501
13502   SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
13503   assert(OpVT.is128BitVector() && "Expected an SSE type!");
13504   return DAG.getNode(ISD::BITCAST, dl, OpVT,
13505                      DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,AnyExt));
13506 }
13507
13508 // Lower a node with an EXTRACT_SUBVECTOR opcode.  This may result in
13509 // a simple subregister reference or explicit instructions to grab
13510 // upper bits of a vector.
13511 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
13512                                       SelectionDAG &DAG) {
13513   SDLoc dl(Op);
13514   SDValue In =  Op.getOperand(0);
13515   SDValue Idx = Op.getOperand(1);
13516   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
13517   MVT ResVT   = Op.getSimpleValueType();
13518   MVT InVT    = In.getSimpleValueType();
13519
13520   if (Subtarget->hasFp256()) {
13521     if (ResVT.is128BitVector() &&
13522         (InVT.is256BitVector() || InVT.is512BitVector()) &&
13523         isa<ConstantSDNode>(Idx)) {
13524       return Extract128BitVector(In, IdxVal, DAG, dl);
13525     }
13526     if (ResVT.is256BitVector() && InVT.is512BitVector() &&
13527         isa<ConstantSDNode>(Idx)) {
13528       return Extract256BitVector(In, IdxVal, DAG, dl);
13529     }
13530   }
13531   return SDValue();
13532 }
13533
13534 // Lower a node with an INSERT_SUBVECTOR opcode.  This may result in a
13535 // simple superregister reference or explicit instructions to insert
13536 // the upper bits of a vector.
13537 static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
13538                                      SelectionDAG &DAG) {
13539   if (!Subtarget->hasAVX())
13540     return SDValue();
13541
13542   SDLoc dl(Op);
13543   SDValue Vec = Op.getOperand(0);
13544   SDValue SubVec = Op.getOperand(1);
13545   SDValue Idx = Op.getOperand(2);
13546
13547   if (!isa<ConstantSDNode>(Idx))
13548     return SDValue();
13549
13550   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
13551   MVT OpVT = Op.getSimpleValueType();
13552   MVT SubVecVT = SubVec.getSimpleValueType();
13553
13554   // Fold two 16-byte subvector loads into one 32-byte load:
13555   // (insert_subvector (insert_subvector undef, (load addr), 0),
13556   //                   (load addr + 16), Elts/2)
13557   // --> load32 addr
13558   if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
13559       Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
13560       OpVT.is256BitVector() && SubVecVT.is128BitVector() &&
13561       !Subtarget->isUnalignedMem32Slow()) {
13562     SDValue SubVec2 = Vec.getOperand(1);
13563     if (auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2))) {
13564       if (Idx2->getZExtValue() == 0) {
13565         SDValue Ops[] = { SubVec2, SubVec };
13566         SDValue LD = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false);
13567         if (LD.getNode())
13568           return LD;
13569       }
13570     }
13571   }
13572
13573   if ((OpVT.is256BitVector() || OpVT.is512BitVector()) &&
13574       SubVecVT.is128BitVector())
13575     return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
13576
13577   if (OpVT.is512BitVector() && SubVecVT.is256BitVector())
13578     return Insert256BitVector(Vec, SubVec, IdxVal, DAG, dl);
13579
13580   return SDValue();
13581 }
13582
13583 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
13584 // their target countpart wrapped in the X86ISD::Wrapper node. Suppose N is
13585 // one of the above mentioned nodes. It has to be wrapped because otherwise
13586 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
13587 // be used to form addressing mode. These wrapped nodes will be selected
13588 // into MOV32ri.
13589 SDValue
13590 X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
13591   ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
13592
13593   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
13594   // global base reg.
13595   unsigned char OpFlag = 0;
13596   unsigned WrapperKind = X86ISD::Wrapper;
13597   CodeModel::Model M = DAG.getTarget().getCodeModel();
13598
13599   if (Subtarget->isPICStyleRIPRel() &&
13600       (M == CodeModel::Small || M == CodeModel::Kernel))
13601     WrapperKind = X86ISD::WrapperRIP;
13602   else if (Subtarget->isPICStyleGOT())
13603     OpFlag = X86II::MO_GOTOFF;
13604   else if (Subtarget->isPICStyleStubPIC())
13605     OpFlag = X86II::MO_PIC_BASE_OFFSET;
13606
13607   SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(),
13608                                              CP->getAlignment(),
13609                                              CP->getOffset(), OpFlag);
13610   SDLoc DL(CP);
13611   Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
13612   // With PIC, the address is actually $g + Offset.
13613   if (OpFlag) {
13614     Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
13615                          DAG.getNode(X86ISD::GlobalBaseReg,
13616                                      SDLoc(), getPointerTy()),
13617                          Result);
13618   }
13619
13620   return Result;
13621 }
13622
13623 SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
13624   JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
13625
13626   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
13627   // global base reg.
13628   unsigned char OpFlag = 0;
13629   unsigned WrapperKind = X86ISD::Wrapper;
13630   CodeModel::Model M = DAG.getTarget().getCodeModel();
13631
13632   if (Subtarget->isPICStyleRIPRel() &&
13633       (M == CodeModel::Small || M == CodeModel::Kernel))
13634     WrapperKind = X86ISD::WrapperRIP;
13635   else if (Subtarget->isPICStyleGOT())
13636     OpFlag = X86II::MO_GOTOFF;
13637   else if (Subtarget->isPICStyleStubPIC())
13638     OpFlag = X86II::MO_PIC_BASE_OFFSET;
13639
13640   SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(),
13641                                           OpFlag);
13642   SDLoc DL(JT);
13643   Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
13644
13645   // With PIC, the address is actually $g + Offset.
13646   if (OpFlag)
13647     Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
13648                          DAG.getNode(X86ISD::GlobalBaseReg,
13649                                      SDLoc(), getPointerTy()),
13650                          Result);
13651
13652   return Result;
13653 }
13654
13655 SDValue
13656 X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
13657   const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
13658
13659   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
13660   // global base reg.
13661   unsigned char OpFlag = 0;
13662   unsigned WrapperKind = X86ISD::Wrapper;
13663   CodeModel::Model M = DAG.getTarget().getCodeModel();
13664
13665   if (Subtarget->isPICStyleRIPRel() &&
13666       (M == CodeModel::Small || M == CodeModel::Kernel)) {
13667     if (Subtarget->isTargetDarwin() || Subtarget->isTargetELF())
13668       OpFlag = X86II::MO_GOTPCREL;
13669     WrapperKind = X86ISD::WrapperRIP;
13670   } else if (Subtarget->isPICStyleGOT()) {
13671     OpFlag = X86II::MO_GOT;
13672   } else if (Subtarget->isPICStyleStubPIC()) {
13673     OpFlag = X86II::MO_DARWIN_NONLAZY_PIC_BASE;
13674   } else if (Subtarget->isPICStyleStubNoDynamic()) {
13675     OpFlag = X86II::MO_DARWIN_NONLAZY;
13676   }
13677
13678   SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag);
13679
13680   SDLoc DL(Op);
13681   Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
13682
13683   // With PIC, the address is actually $g + Offset.
13684   if (DAG.getTarget().getRelocationModel() == Reloc::PIC_ &&
13685       !Subtarget->is64Bit()) {
13686     Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
13687                          DAG.getNode(X86ISD::GlobalBaseReg,
13688                                      SDLoc(), getPointerTy()),
13689                          Result);
13690   }
13691
13692   // For symbols that require a load from a stub to get the address, emit the
13693   // load.
13694   if (isGlobalStubReference(OpFlag))
13695     Result = DAG.getLoad(getPointerTy(), DL, DAG.getEntryNode(), Result,
13696                          MachinePointerInfo::getGOT(), false, false, false, 0);
13697
13698   return Result;
13699 }
13700
13701 SDValue
13702 X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
13703   // Create the TargetBlockAddressAddress node.
13704   unsigned char OpFlags =
13705     Subtarget->ClassifyBlockAddressReference();
13706   CodeModel::Model M = DAG.getTarget().getCodeModel();
13707   const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
13708   int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
13709   SDLoc dl(Op);
13710   SDValue Result = DAG.getTargetBlockAddress(BA, getPointerTy(), Offset,
13711                                              OpFlags);
13712
13713   if (Subtarget->isPICStyleRIPRel() &&
13714       (M == CodeModel::Small || M == CodeModel::Kernel))
13715     Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
13716   else
13717     Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
13718
13719   // With PIC, the address is actually $g + Offset.
13720   if (isGlobalRelativeToPICBase(OpFlags)) {
13721     Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
13722                          DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
13723                          Result);
13724   }
13725
13726   return Result;
13727 }
13728
13729 SDValue
13730 X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, SDLoc dl,
13731                                       int64_t Offset, SelectionDAG &DAG) const {
13732   // Create the TargetGlobalAddress node, folding in the constant
13733   // offset if it is legal.
13734   unsigned char OpFlags =
13735       Subtarget->ClassifyGlobalReference(GV, DAG.getTarget());
13736   CodeModel::Model M = DAG.getTarget().getCodeModel();
13737   SDValue Result;
13738   if (OpFlags == X86II::MO_NO_FLAG &&
13739       X86::isOffsetSuitableForCodeModel(Offset, M)) {
13740     // A direct static reference to a global.
13741     Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset);
13742     Offset = 0;
13743   } else {
13744     Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags);
13745   }
13746
13747   if (Subtarget->isPICStyleRIPRel() &&
13748       (M == CodeModel::Small || M == CodeModel::Kernel))
13749     Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
13750   else
13751     Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
13752
13753   // With PIC, the address is actually $g + Offset.
13754   if (isGlobalRelativeToPICBase(OpFlags)) {
13755     Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
13756                          DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
13757                          Result);
13758   }
13759
13760   // For globals that require a load from a stub to get the address, emit the
13761   // load.
13762   if (isGlobalStubReference(OpFlags))
13763     Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result,
13764                          MachinePointerInfo::getGOT(), false, false, false, 0);
13765
13766   // If there was a non-zero offset that we didn't fold, create an explicit
13767   // addition for it.
13768   if (Offset != 0)
13769     Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result,
13770                          DAG.getConstant(Offset, getPointerTy()));
13771
13772   return Result;
13773 }
13774
13775 SDValue
13776 X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
13777   const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
13778   int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
13779   return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG);
13780 }
13781
13782 static SDValue
13783 GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
13784            SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
13785            unsigned char OperandFlags, bool LocalDynamic = false) {
13786   MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
13787   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
13788   SDLoc dl(GA);
13789   SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
13790                                            GA->getValueType(0),
13791                                            GA->getOffset(),
13792                                            OperandFlags);
13793
13794   X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
13795                                            : X86ISD::TLSADDR;
13796
13797   if (InFlag) {
13798     SDValue Ops[] = { Chain,  TGA, *InFlag };
13799     Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
13800   } else {
13801     SDValue Ops[]  = { Chain, TGA };
13802     Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
13803   }
13804
13805   // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
13806   MFI->setAdjustsStack(true);
13807   MFI->setHasCalls(true);
13808
13809   SDValue Flag = Chain.getValue(1);
13810   return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
13811 }
13812
13813 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
13814 static SDValue
13815 LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
13816                                 const EVT PtrVT) {
13817   SDValue InFlag;
13818   SDLoc dl(GA);  // ? function entry point might be better
13819   SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
13820                                    DAG.getNode(X86ISD::GlobalBaseReg,
13821                                                SDLoc(), PtrVT), InFlag);
13822   InFlag = Chain.getValue(1);
13823
13824   return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
13825 }
13826
13827 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
13828 static SDValue
13829 LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
13830                                 const EVT PtrVT) {
13831   return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
13832                     X86::RAX, X86II::MO_TLSGD);
13833 }
13834
13835 static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
13836                                            SelectionDAG &DAG,
13837                                            const EVT PtrVT,
13838                                            bool is64Bit) {
13839   SDLoc dl(GA);
13840
13841   // Get the start address of the TLS block for this module.
13842   X86MachineFunctionInfo* MFI = DAG.getMachineFunction()
13843       .getInfo<X86MachineFunctionInfo>();
13844   MFI->incNumLocalDynamicTLSAccesses();
13845
13846   SDValue Base;
13847   if (is64Bit) {
13848     Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
13849                       X86II::MO_TLSLD, /*LocalDynamic=*/true);
13850   } else {
13851     SDValue InFlag;
13852     SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
13853         DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
13854     InFlag = Chain.getValue(1);
13855     Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
13856                       X86II::MO_TLSLDM, /*LocalDynamic=*/true);
13857   }
13858
13859   // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
13860   // of Base.
13861
13862   // Build x@dtpoff.
13863   unsigned char OperandFlags = X86II::MO_DTPOFF;
13864   unsigned WrapperKind = X86ISD::Wrapper;
13865   SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
13866                                            GA->getValueType(0),
13867                                            GA->getOffset(), OperandFlags);
13868   SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
13869
13870   // Add x@dtpoff with the base.
13871   return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
13872 }
13873
13874 // Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
13875 static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
13876                                    const EVT PtrVT, TLSModel::Model model,
13877                                    bool is64Bit, bool isPIC) {
13878   SDLoc dl(GA);
13879
13880   // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
13881   Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
13882                                                          is64Bit ? 257 : 256));
13883
13884   SDValue ThreadPointer =
13885       DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0),
13886                   MachinePointerInfo(Ptr), false, false, false, 0);
13887
13888   unsigned char OperandFlags = 0;
13889   // Most TLS accesses are not RIP relative, even on x86-64.  One exception is
13890   // initialexec.
13891   unsigned WrapperKind = X86ISD::Wrapper;
13892   if (model == TLSModel::LocalExec) {
13893     OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
13894   } else if (model == TLSModel::InitialExec) {
13895     if (is64Bit) {
13896       OperandFlags = X86II::MO_GOTTPOFF;
13897       WrapperKind = X86ISD::WrapperRIP;
13898     } else {
13899       OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
13900     }
13901   } else {
13902     llvm_unreachable("Unexpected model");
13903   }
13904
13905   // emit "addl x@ntpoff,%eax" (local exec)
13906   // or "addl x@indntpoff,%eax" (initial exec)
13907   // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
13908   SDValue TGA =
13909       DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
13910                                  GA->getOffset(), OperandFlags);
13911   SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
13912
13913   if (model == TLSModel::InitialExec) {
13914     if (isPIC && !is64Bit) {
13915       Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
13916                            DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
13917                            Offset);
13918     }
13919
13920     Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
13921                          MachinePointerInfo::getGOT(), false, false, false, 0);
13922   }
13923
13924   // The address of the thread local variable is the add of the thread
13925   // pointer with the offset of the variable.
13926   return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
13927 }
13928
13929 SDValue
13930 X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
13931
13932   GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
13933   const GlobalValue *GV = GA->getGlobal();
13934
13935   if (Subtarget->isTargetELF()) {
13936     TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
13937
13938     switch (model) {
13939       case TLSModel::GeneralDynamic:
13940         if (Subtarget->is64Bit())
13941           return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy());
13942         return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy());
13943       case TLSModel::LocalDynamic:
13944         return LowerToTLSLocalDynamicModel(GA, DAG, getPointerTy(),
13945                                            Subtarget->is64Bit());
13946       case TLSModel::InitialExec:
13947       case TLSModel::LocalExec:
13948         return LowerToTLSExecModel(
13949             GA, DAG, getPointerTy(), model, Subtarget->is64Bit(),
13950             DAG.getTarget().getRelocationModel() == Reloc::PIC_);
13951     }
13952     llvm_unreachable("Unknown TLS model.");
13953   }
13954
13955   if (Subtarget->isTargetDarwin()) {
13956     // Darwin only has one model of TLS.  Lower to that.
13957     unsigned char OpFlag = 0;
13958     unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ?
13959                            X86ISD::WrapperRIP : X86ISD::Wrapper;
13960
13961     // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
13962     // global base reg.
13963     bool PIC32 = (DAG.getTarget().getRelocationModel() == Reloc::PIC_) &&
13964                  !Subtarget->is64Bit();
13965     if (PIC32)
13966       OpFlag = X86II::MO_TLVP_PIC_BASE;
13967     else
13968       OpFlag = X86II::MO_TLVP;
13969     SDLoc DL(Op);
13970     SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
13971                                                 GA->getValueType(0),
13972                                                 GA->getOffset(), OpFlag);
13973     SDValue Offset = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
13974
13975     // With PIC32, the address is actually $g + Offset.
13976     if (PIC32)
13977       Offset = DAG.getNode(ISD::ADD, DL, getPointerTy(),
13978                            DAG.getNode(X86ISD::GlobalBaseReg,
13979                                        SDLoc(), getPointerTy()),
13980                            Offset);
13981
13982     // Lowering the machine isd will make sure everything is in the right
13983     // location.
13984     SDValue Chain = DAG.getEntryNode();
13985     SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
13986     SDValue Args[] = { Chain, Offset };
13987     Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
13988
13989     // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
13990     MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
13991     MFI->setAdjustsStack(true);
13992
13993     // And our return value (tls address) is in the standard call return value
13994     // location.
13995     unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
13996     return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy(),
13997                               Chain.getValue(1));
13998   }
13999
14000   if (Subtarget->isTargetKnownWindowsMSVC() ||
14001       Subtarget->isTargetWindowsGNU()) {
14002     // Just use the implicit TLS architecture
14003     // Need to generate someting similar to:
14004     //   mov     rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
14005     //                                  ; from TEB
14006     //   mov     ecx, dword [rel _tls_index]: Load index (from C runtime)
14007     //   mov     rcx, qword [rdx+rcx*8]
14008     //   mov     eax, .tls$:tlsvar
14009     //   [rax+rcx] contains the address
14010     // Windows 64bit: gs:0x58
14011     // Windows 32bit: fs:__tls_array
14012
14013     SDLoc dl(GA);
14014     SDValue Chain = DAG.getEntryNode();
14015
14016     // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
14017     // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
14018     // use its literal value of 0x2C.
14019     Value *Ptr = Constant::getNullValue(Subtarget->is64Bit()
14020                                         ? Type::getInt8PtrTy(*DAG.getContext(),
14021                                                              256)
14022                                         : Type::getInt32PtrTy(*DAG.getContext(),
14023                                                               257));
14024
14025     SDValue TlsArray =
14026         Subtarget->is64Bit()
14027             ? DAG.getIntPtrConstant(0x58)
14028             : (Subtarget->isTargetWindowsGNU()
14029                    ? DAG.getIntPtrConstant(0x2C)
14030                    : DAG.getExternalSymbol("_tls_array", getPointerTy()));
14031
14032     SDValue ThreadPointer =
14033         DAG.getLoad(getPointerTy(), dl, Chain, TlsArray,
14034                     MachinePointerInfo(Ptr), false, false, false, 0);
14035
14036     // Load the _tls_index variable
14037     SDValue IDX = DAG.getExternalSymbol("_tls_index", getPointerTy());
14038     if (Subtarget->is64Bit())
14039       IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, getPointerTy(), Chain,
14040                            IDX, MachinePointerInfo(), MVT::i32,
14041                            false, false, false, 0);
14042     else
14043       IDX = DAG.getLoad(getPointerTy(), dl, Chain, IDX, MachinePointerInfo(),
14044                         false, false, false, 0);
14045
14046     SDValue Scale = DAG.getConstant(Log2_64_Ceil(TD->getPointerSize()),
14047                                     getPointerTy());
14048     IDX = DAG.getNode(ISD::SHL, dl, getPointerTy(), IDX, Scale);
14049
14050     SDValue res = DAG.getNode(ISD::ADD, dl, getPointerTy(), ThreadPointer, IDX);
14051     res = DAG.getLoad(getPointerTy(), dl, Chain, res, MachinePointerInfo(),
14052                       false, false, false, 0);
14053
14054     // Get the offset of start of .tls section
14055     SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
14056                                              GA->getValueType(0),
14057                                              GA->getOffset(), X86II::MO_SECREL);
14058     SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), TGA);
14059
14060     // The address of the thread local variable is the add of the thread
14061     // pointer with the offset of the variable.
14062     return DAG.getNode(ISD::ADD, dl, getPointerTy(), res, Offset);
14063   }
14064
14065   llvm_unreachable("TLS not implemented for this target.");
14066 }
14067
14068 /// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values
14069 /// and take a 2 x i32 value to shift plus a shift amount.
14070 static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
14071   assert(Op.getNumOperands() == 3 && "Not a double-shift!");
14072   MVT VT = Op.getSimpleValueType();
14073   unsigned VTBits = VT.getSizeInBits();
14074   SDLoc dl(Op);
14075   bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
14076   SDValue ShOpLo = Op.getOperand(0);
14077   SDValue ShOpHi = Op.getOperand(1);
14078   SDValue ShAmt  = Op.getOperand(2);
14079   // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the
14080   // generic ISD nodes haven't. Insert an AND to be safe, it's optimized away
14081   // during isel.
14082   SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
14083                                   DAG.getConstant(VTBits - 1, MVT::i8));
14084   SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
14085                                      DAG.getConstant(VTBits - 1, MVT::i8))
14086                        : DAG.getConstant(0, VT);
14087
14088   SDValue Tmp2, Tmp3;
14089   if (Op.getOpcode() == ISD::SHL_PARTS) {
14090     Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
14091     Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);
14092   } else {
14093     Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
14094     Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
14095   }
14096
14097   // If the shift amount is larger or equal than the width of a part we can't
14098   // rely on the results of shld/shrd. Insert a test and select the appropriate
14099   // values for large shift amounts.
14100   SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
14101                                 DAG.getConstant(VTBits, MVT::i8));
14102   SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
14103                              AndNode, DAG.getConstant(0, MVT::i8));
14104
14105   SDValue Hi, Lo;
14106   SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8);
14107   SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
14108   SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };
14109
14110   if (Op.getOpcode() == ISD::SHL_PARTS) {
14111     Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
14112     Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
14113   } else {
14114     Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
14115     Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
14116   }
14117
14118   SDValue Ops[2] = { Lo, Hi };
14119   return DAG.getMergeValues(Ops, dl);
14120 }
14121
14122 SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
14123                                            SelectionDAG &DAG) const {
14124   MVT SrcVT = Op.getOperand(0).getSimpleValueType();
14125   SDLoc dl(Op);
14126
14127   if (SrcVT.isVector()) {
14128     if (SrcVT.getVectorElementType() == MVT::i1) {
14129       MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
14130       return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
14131                          DAG.getNode(ISD::SIGN_EXTEND, dl, IntegerVT,
14132                                      Op.getOperand(0)));
14133     }
14134     return SDValue();
14135   }
14136
14137   assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
14138          "Unknown SINT_TO_FP to lower!");
14139
14140   // These are really Legal; return the operand so the caller accepts it as
14141   // Legal.
14142   if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
14143     return Op;
14144   if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
14145       Subtarget->is64Bit()) {
14146     return Op;
14147   }
14148
14149   unsigned Size = SrcVT.getSizeInBits()/8;
14150   MachineFunction &MF = DAG.getMachineFunction();
14151   int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false);
14152   SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
14153   SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
14154                                StackSlot,
14155                                MachinePointerInfo::getFixedStack(SSFI),
14156                                false, false, 0);
14157   return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
14158 }
14159
14160 SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
14161                                      SDValue StackSlot,
14162                                      SelectionDAG &DAG) const {
14163   // Build the FILD
14164   SDLoc DL(Op);
14165   SDVTList Tys;
14166   bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
14167   if (useSSE)
14168     Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
14169   else
14170     Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
14171
14172   unsigned ByteSize = SrcVT.getSizeInBits()/8;
14173
14174   FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
14175   MachineMemOperand *MMO;
14176   if (FI) {
14177     int SSFI = FI->getIndex();
14178     MMO =
14179       DAG.getMachineFunction()
14180       .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
14181                             MachineMemOperand::MOLoad, ByteSize, ByteSize);
14182   } else {
14183     MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
14184     StackSlot = StackSlot.getOperand(1);
14185   }
14186   SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
14187   SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
14188                                            X86ISD::FILD, DL,
14189                                            Tys, Ops, SrcVT, MMO);
14190
14191   if (useSSE) {
14192     Chain = Result.getValue(1);
14193     SDValue InFlag = Result.getValue(2);
14194
14195     // FIXME: Currently the FST is flagged to the FILD_FLAG. This
14196     // shouldn't be necessary except that RFP cannot be live across
14197     // multiple blocks. When stackifier is fixed, they can be uncoupled.
14198     MachineFunction &MF = DAG.getMachineFunction();
14199     unsigned SSFISize = Op.getValueType().getSizeInBits()/8;
14200     int SSFI = MF.getFrameInfo()->CreateStackObject(SSFISize, SSFISize, false);
14201     SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
14202     Tys = DAG.getVTList(MVT::Other);
14203     SDValue Ops[] = {
14204       Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
14205     };
14206     MachineMemOperand *MMO =
14207       DAG.getMachineFunction()
14208       .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
14209                             MachineMemOperand::MOStore, SSFISize, SSFISize);
14210
14211     Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
14212                                     Ops, Op.getValueType(), MMO);
14213     Result = DAG.getLoad(Op.getValueType(), DL, Chain, StackSlot,
14214                          MachinePointerInfo::getFixedStack(SSFI),
14215                          false, false, false, 0);
14216   }
14217
14218   return Result;
14219 }
14220
14221 // LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion.
14222 SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
14223                                                SelectionDAG &DAG) const {
14224   // This algorithm is not obvious. Here it is what we're trying to output:
14225   /*
14226      movq       %rax,  %xmm0
14227      punpckldq  (c0),  %xmm0  // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
14228      subpd      (c1),  %xmm0  // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
14229      #ifdef __SSE3__
14230        haddpd   %xmm0, %xmm0
14231      #else
14232        pshufd   $0x4e, %xmm0, %xmm1
14233        addpd    %xmm1, %xmm0
14234      #endif
14235   */
14236
14237   SDLoc dl(Op);
14238   LLVMContext *Context = DAG.getContext();
14239
14240   // Build some magic constants.
14241   static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
14242   Constant *C0 = ConstantDataVector::get(*Context, CV0);
14243   SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16);
14244
14245   SmallVector<Constant*,2> CV1;
14246   CV1.push_back(
14247     ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
14248                                       APInt(64, 0x4330000000000000ULL))));
14249   CV1.push_back(
14250     ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
14251                                       APInt(64, 0x4530000000000000ULL))));
14252   Constant *C1 = ConstantVector::get(CV1);
14253   SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16);
14254
14255   // Load the 64-bit value into an XMM register.
14256   SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
14257                             Op.getOperand(0));
14258   SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
14259                               MachinePointerInfo::getConstantPool(),
14260                               false, false, false, 16);
14261   SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32,
14262                               DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, XR1),
14263                               CLod0);
14264
14265   SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
14266                               MachinePointerInfo::getConstantPool(),
14267                               false, false, false, 16);
14268   SDValue XR2F = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Unpck1);
14269   SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
14270   SDValue Result;
14271
14272   if (Subtarget->hasSSE3()) {
14273     // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
14274     Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
14275   } else {
14276     SDValue S2F = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Sub);
14277     SDValue Shuffle = getTargetShuffleNode(X86ISD::PSHUFD, dl, MVT::v4i32,
14278                                            S2F, 0x4E, DAG);
14279     Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
14280                          DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Shuffle),
14281                          Sub);
14282   }
14283
14284   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
14285                      DAG.getIntPtrConstant(0));
14286 }
14287
14288 // LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion.
14289 SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
14290                                                SelectionDAG &DAG) const {
14291   SDLoc dl(Op);
14292   // FP constant to bias correct the final result.
14293   SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
14294                                    MVT::f64);
14295
14296   // Load the 32-bit value into an XMM register.
14297   SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
14298                              Op.getOperand(0));
14299
14300   // Zero out the upper parts of the register.
14301   Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
14302
14303   Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
14304                      DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Load),
14305                      DAG.getIntPtrConstant(0));
14306
14307   // Or the load with the bias.
14308   SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64,
14309                            DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
14310                                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
14311                                                    MVT::v2f64, Load)),
14312                            DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
14313                                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
14314                                                    MVT::v2f64, Bias)));
14315   Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
14316                    DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or),
14317                    DAG.getIntPtrConstant(0));
14318
14319   // Subtract the bias.
14320   SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
14321
14322   // Handle final rounding.
14323   EVT DestVT = Op.getValueType();
14324
14325   if (DestVT.bitsLT(MVT::f64))
14326     return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
14327                        DAG.getIntPtrConstant(0));
14328   if (DestVT.bitsGT(MVT::f64))
14329     return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
14330
14331   // Handle final rounding.
14332   return Sub;
14333 }
14334
14335 static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
14336                                      const X86Subtarget &Subtarget) {
14337   // The algorithm is the following:
14338   // #ifdef __SSE4_1__
14339   //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
14340   //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
14341   //                                 (uint4) 0x53000000, 0xaa);
14342   // #else
14343   //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
14344   //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
14345   // #endif
14346   //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
14347   //     return (float4) lo + fhi;
14348
14349   SDLoc DL(Op);
14350   SDValue V = Op->getOperand(0);
14351   EVT VecIntVT = V.getValueType();
14352   bool Is128 = VecIntVT == MVT::v4i32;
14353   EVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
14354   // If we convert to something else than the supported type, e.g., to v4f64,
14355   // abort early.
14356   if (VecFloatVT != Op->getValueType(0))
14357     return SDValue();
14358
14359   unsigned NumElts = VecIntVT.getVectorNumElements();
14360   assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
14361          "Unsupported custom type");
14362   assert(NumElts <= 8 && "The size of the constant array must be fixed");
14363
14364   // In the #idef/#else code, we have in common:
14365   // - The vector of constants:
14366   // -- 0x4b000000
14367   // -- 0x53000000
14368   // - A shift:
14369   // -- v >> 16
14370
14371   // Create the splat vector for 0x4b000000.
14372   SDValue CstLow = DAG.getConstant(0x4b000000, MVT::i32);
14373   SDValue CstLowArray[] = {CstLow, CstLow, CstLow, CstLow,
14374                            CstLow, CstLow, CstLow, CstLow};
14375   SDValue VecCstLow = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT,
14376                                   makeArrayRef(&CstLowArray[0], NumElts));
14377   // Create the splat vector for 0x53000000.
14378   SDValue CstHigh = DAG.getConstant(0x53000000, MVT::i32);
14379   SDValue CstHighArray[] = {CstHigh, CstHigh, CstHigh, CstHigh,
14380                             CstHigh, CstHigh, CstHigh, CstHigh};
14381   SDValue VecCstHigh = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT,
14382                                    makeArrayRef(&CstHighArray[0], NumElts));
14383
14384   // Create the right shift.
14385   SDValue CstShift = DAG.getConstant(16, MVT::i32);
14386   SDValue CstShiftArray[] = {CstShift, CstShift, CstShift, CstShift,
14387                              CstShift, CstShift, CstShift, CstShift};
14388   SDValue VecCstShift = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT,
14389                                     makeArrayRef(&CstShiftArray[0], NumElts));
14390   SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
14391
14392   SDValue Low, High;
14393   if (Subtarget.hasSSE41()) {
14394     EVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
14395     //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
14396     SDValue VecCstLowBitcast =
14397         DAG.getNode(ISD::BITCAST, DL, VecI16VT, VecCstLow);
14398     SDValue VecBitcast = DAG.getNode(ISD::BITCAST, DL, VecI16VT, V);
14399     // Low will be bitcasted right away, so do not bother bitcasting back to its
14400     // original type.
14401     Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
14402                       VecCstLowBitcast, DAG.getConstant(0xaa, MVT::i32));
14403     //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
14404     //                                 (uint4) 0x53000000, 0xaa);
14405     SDValue VecCstHighBitcast =
14406         DAG.getNode(ISD::BITCAST, DL, VecI16VT, VecCstHigh);
14407     SDValue VecShiftBitcast =
14408         DAG.getNode(ISD::BITCAST, DL, VecI16VT, HighShift);
14409     // High will be bitcasted right away, so do not bother bitcasting back to
14410     // its original type.
14411     High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
14412                        VecCstHighBitcast, DAG.getConstant(0xaa, MVT::i32));
14413   } else {
14414     SDValue CstMask = DAG.getConstant(0xffff, MVT::i32);
14415     SDValue VecCstMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT, CstMask,
14416                                      CstMask, CstMask, CstMask);
14417     //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
14418     SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
14419     Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
14420
14421     //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
14422     High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
14423   }
14424
14425   // Create the vector constant for -(0x1.0p39f + 0x1.0p23f).
14426   SDValue CstFAdd = DAG.getConstantFP(
14427       APFloat(APFloat::IEEEsingle, APInt(32, 0xD3000080)), MVT::f32);
14428   SDValue CstFAddArray[] = {CstFAdd, CstFAdd, CstFAdd, CstFAdd,
14429                             CstFAdd, CstFAdd, CstFAdd, CstFAdd};
14430   SDValue VecCstFAdd = DAG.getNode(ISD::BUILD_VECTOR, DL, VecFloatVT,
14431                                    makeArrayRef(&CstFAddArray[0], NumElts));
14432
14433   //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
14434   SDValue HighBitcast = DAG.getNode(ISD::BITCAST, DL, VecFloatVT, High);
14435   SDValue FHigh =
14436       DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd);
14437   //     return (float4) lo + fhi;
14438   SDValue LowBitcast = DAG.getNode(ISD::BITCAST, DL, VecFloatVT, Low);
14439   return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
14440 }
14441
14442 SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
14443                                                SelectionDAG &DAG) const {
14444   SDValue N0 = Op.getOperand(0);
14445   MVT SVT = N0.getSimpleValueType();
14446   SDLoc dl(Op);
14447
14448   switch (SVT.SimpleTy) {
14449   default:
14450     llvm_unreachable("Custom UINT_TO_FP is not supported!");
14451   case MVT::v4i8:
14452   case MVT::v4i16:
14453   case MVT::v8i8:
14454   case MVT::v8i16: {
14455     MVT NVT = MVT::getVectorVT(MVT::i32, SVT.getVectorNumElements());
14456     return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
14457                        DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0));
14458   }
14459   case MVT::v4i32:
14460   case MVT::v8i32:
14461     return lowerUINT_TO_FP_vXi32(Op, DAG, *Subtarget);
14462   }
14463   llvm_unreachable(nullptr);
14464 }
14465
14466 SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
14467                                            SelectionDAG &DAG) const {
14468   SDValue N0 = Op.getOperand(0);
14469   SDLoc dl(Op);
14470
14471   if (Op.getValueType().isVector())
14472     return lowerUINT_TO_FP_vec(Op, DAG);
14473
14474   // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
14475   // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
14476   // the optimization here.
14477   if (DAG.SignBitIsZero(N0))
14478     return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);
14479
14480   MVT SrcVT = N0.getSimpleValueType();
14481   MVT DstVT = Op.getSimpleValueType();
14482   if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
14483     return LowerUINT_TO_FP_i64(Op, DAG);
14484   if (SrcVT == MVT::i32 && X86ScalarSSEf64)
14485     return LowerUINT_TO_FP_i32(Op, DAG);
14486   if (Subtarget->is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
14487     return SDValue();
14488
14489   // Make a 64-bit buffer, and use it to build an FILD.
14490   SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
14491   if (SrcVT == MVT::i32) {
14492     SDValue WordOff = DAG.getConstant(4, getPointerTy());
14493     SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl,
14494                                      getPointerTy(), StackSlot, WordOff);
14495     SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
14496                                   StackSlot, MachinePointerInfo(),
14497                                   false, false, 0);
14498     SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32),
14499                                   OffsetSlot, MachinePointerInfo(),
14500                                   false, false, 0);
14501     SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
14502     return Fild;
14503   }
14504
14505   assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
14506   SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
14507                                StackSlot, MachinePointerInfo(),
14508                                false, false, 0);
14509   // For i64 source, we need to add the appropriate power of 2 if the input
14510   // was negative.  This is the same as the optimization in
14511   // DAGTypeLegalizer::ExpandIntOp_UNIT_TO_FP, and for it to be safe here,
14512   // we must be careful to do the computation in x87 extended precision, not
14513   // in SSE. (The generic code can't know it's OK to do this, or how to.)
14514   int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
14515   MachineMemOperand *MMO =
14516     DAG.getMachineFunction()
14517     .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
14518                           MachineMemOperand::MOLoad, 8, 8);
14519
14520   SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
14521   SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
14522   SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
14523                                          MVT::i64, MMO);
14524
14525   APInt FF(32, 0x5F800000ULL);
14526
14527   // Check whether the sign bit is set.
14528   SDValue SignSet = DAG.getSetCC(dl,
14529                                  getSetCCResultType(*DAG.getContext(), MVT::i64),
14530                                  Op.getOperand(0), DAG.getConstant(0, MVT::i64),
14531                                  ISD::SETLT);
14532
14533   // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
14534   SDValue FudgePtr = DAG.getConstantPool(
14535                              ConstantInt::get(*DAG.getContext(), FF.zext(64)),
14536                                          getPointerTy());
14537
14538   // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
14539   SDValue Zero = DAG.getIntPtrConstant(0);
14540   SDValue Four = DAG.getIntPtrConstant(4);
14541   SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet,
14542                                Zero, Four);
14543   FudgePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(), FudgePtr, Offset);
14544
14545   // Load the value out, extending it from f32 to f80.
14546   // FIXME: Avoid the extend by constructing the right constant pool?
14547   SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(),
14548                                  FudgePtr, MachinePointerInfo::getConstantPool(),
14549                                  MVT::f32, false, false, false, 4);
14550   // Extend everything to 80 bits to force it to be done on x87.
14551   SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
14552   return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0));
14553 }
14554
14555 std::pair<SDValue,SDValue>
14556 X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
14557                                     bool IsSigned, bool IsReplace) const {
14558   SDLoc DL(Op);
14559
14560   EVT DstTy = Op.getValueType();
14561
14562   if (!IsSigned && !isIntegerTypeFTOL(DstTy)) {
14563     assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
14564     DstTy = MVT::i64;
14565   }
14566
14567   assert(DstTy.getSimpleVT() <= MVT::i64 &&
14568          DstTy.getSimpleVT() >= MVT::i16 &&
14569          "Unknown FP_TO_INT to lower!");
14570
14571   // These are really Legal.
14572   if (DstTy == MVT::i32 &&
14573       isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
14574     return std::make_pair(SDValue(), SDValue());
14575   if (Subtarget->is64Bit() &&
14576       DstTy == MVT::i64 &&
14577       isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
14578     return std::make_pair(SDValue(), SDValue());
14579
14580   // We lower FP->int64 either into FISTP64 followed by a load from a temporary
14581   // stack slot, or into the FTOL runtime function.
14582   MachineFunction &MF = DAG.getMachineFunction();
14583   unsigned MemSize = DstTy.getSizeInBits()/8;
14584   int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
14585   SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
14586
14587   unsigned Opc;
14588   if (!IsSigned && isIntegerTypeFTOL(DstTy))
14589     Opc = X86ISD::WIN_FTOL;
14590   else
14591     switch (DstTy.getSimpleVT().SimpleTy) {
14592     default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
14593     case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
14594     case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
14595     case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
14596     }
14597
14598   SDValue Chain = DAG.getEntryNode();
14599   SDValue Value = Op.getOperand(0);
14600   EVT TheVT = Op.getOperand(0).getValueType();
14601   // FIXME This causes a redundant load/store if the SSE-class value is already
14602   // in memory, such as if it is on the callstack.
14603   if (isScalarFPTypeInSSEReg(TheVT)) {
14604     assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
14605     Chain = DAG.getStore(Chain, DL, Value, StackSlot,
14606                          MachinePointerInfo::getFixedStack(SSFI),
14607                          false, false, 0);
14608     SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
14609     SDValue Ops[] = {
14610       Chain, StackSlot, DAG.getValueType(TheVT)
14611     };
14612
14613     MachineMemOperand *MMO =
14614       MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
14615                               MachineMemOperand::MOLoad, MemSize, MemSize);
14616     Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO);
14617     Chain = Value.getValue(1);
14618     SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
14619     StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
14620   }
14621
14622   MachineMemOperand *MMO =
14623     MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
14624                             MachineMemOperand::MOStore, MemSize, MemSize);
14625
14626   if (Opc != X86ISD::WIN_FTOL) {
14627     // Build the FP_TO_INT*_IN_MEM
14628     SDValue Ops[] = { Chain, Value, StackSlot };
14629     SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
14630                                            Ops, DstTy, MMO);
14631     return std::make_pair(FIST, StackSlot);
14632   } else {
14633     SDValue ftol = DAG.getNode(X86ISD::WIN_FTOL, DL,
14634       DAG.getVTList(MVT::Other, MVT::Glue),
14635       Chain, Value);
14636     SDValue eax = DAG.getCopyFromReg(ftol, DL, X86::EAX,
14637       MVT::i32, ftol.getValue(1));
14638     SDValue edx = DAG.getCopyFromReg(eax.getValue(1), DL, X86::EDX,
14639       MVT::i32, eax.getValue(2));
14640     SDValue Ops[] = { eax, edx };
14641     SDValue pair = IsReplace
14642       ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops)
14643       : DAG.getMergeValues(Ops, DL);
14644     return std::make_pair(pair, SDValue());
14645   }
14646 }
14647
14648 static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
14649                               const X86Subtarget *Subtarget) {
14650   MVT VT = Op->getSimpleValueType(0);
14651   SDValue In = Op->getOperand(0);
14652   MVT InVT = In.getSimpleValueType();
14653   SDLoc dl(Op);
14654
14655   // Optimize vectors in AVX mode:
14656   //
14657   //   v8i16 -> v8i32
14658   //   Use vpunpcklwd for 4 lower elements  v8i16 -> v4i32.
14659   //   Use vpunpckhwd for 4 upper elements  v8i16 -> v4i32.
14660   //   Concat upper and lower parts.
14661   //
14662   //   v4i32 -> v4i64
14663   //   Use vpunpckldq for 4 lower elements  v4i32 -> v2i64.
14664   //   Use vpunpckhdq for 4 upper elements  v4i32 -> v2i64.
14665   //   Concat upper and lower parts.
14666   //
14667
14668   if (((VT != MVT::v16i16) || (InVT != MVT::v16i8)) &&
14669       ((VT != MVT::v8i32) || (InVT != MVT::v8i16)) &&
14670       ((VT != MVT::v4i64) || (InVT != MVT::v4i32)))
14671     return SDValue();
14672
14673   if (Subtarget->hasInt256())
14674     return DAG.getNode(X86ISD::VZEXT, dl, VT, In);
14675
14676   SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl);
14677   SDValue Undef = DAG.getUNDEF(InVT);
14678   bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND;
14679   SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
14680   SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
14681
14682   MVT HVT = MVT::getVectorVT(VT.getVectorElementType(),
14683                              VT.getVectorNumElements()/2);
14684
14685   OpLo = DAG.getNode(ISD::BITCAST, dl, HVT, OpLo);
14686   OpHi = DAG.getNode(ISD::BITCAST, dl, HVT, OpHi);
14687
14688   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
14689 }
14690
14691 static  SDValue LowerZERO_EXTEND_AVX512(SDValue Op,
14692                                         SelectionDAG &DAG) {
14693   MVT VT = Op->getSimpleValueType(0);
14694   SDValue In = Op->getOperand(0);
14695   MVT InVT = In.getSimpleValueType();
14696   SDLoc DL(Op);
14697   unsigned int NumElts = VT.getVectorNumElements();
14698   if (NumElts != 8 && NumElts != 16)
14699     return SDValue();
14700
14701   if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1)
14702     return DAG.getNode(X86ISD::VZEXT, DL, VT, In);
14703
14704   EVT ExtVT = (NumElts == 8)? MVT::v8i64 : MVT::v16i32;
14705   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14706   // Now we have only mask extension
14707   assert(InVT.getVectorElementType() == MVT::i1);
14708   SDValue Cst = DAG.getTargetConstant(1, ExtVT.getScalarType());
14709   const Constant *C = (dyn_cast<ConstantSDNode>(Cst))->getConstantIntValue();
14710   SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
14711   unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
14712   SDValue Ld = DAG.getLoad(Cst.getValueType(), DL, DAG.getEntryNode(), CP,
14713                            MachinePointerInfo::getConstantPool(),
14714                            false, false, false, Alignment);
14715
14716   SDValue Brcst = DAG.getNode(X86ISD::VBROADCASTM, DL, ExtVT, In, Ld);
14717   if (VT.is512BitVector())
14718     return Brcst;
14719   return DAG.getNode(X86ISD::VTRUNC, DL, VT, Brcst);
14720 }
14721
14722 static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
14723                                SelectionDAG &DAG) {
14724   if (Subtarget->hasFp256()) {
14725     SDValue Res = LowerAVXExtend(Op, DAG, Subtarget);
14726     if (Res.getNode())
14727       return Res;
14728   }
14729
14730   return SDValue();
14731 }
14732
14733 static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
14734                                 SelectionDAG &DAG) {
14735   SDLoc DL(Op);
14736   MVT VT = Op.getSimpleValueType();
14737   SDValue In = Op.getOperand(0);
14738   MVT SVT = In.getSimpleValueType();
14739
14740   if (VT.is512BitVector() || SVT.getVectorElementType() == MVT::i1)
14741     return LowerZERO_EXTEND_AVX512(Op, DAG);
14742
14743   if (Subtarget->hasFp256()) {
14744     SDValue Res = LowerAVXExtend(Op, DAG, Subtarget);
14745     if (Res.getNode())
14746       return Res;
14747   }
14748
14749   assert(!VT.is256BitVector() || !SVT.is128BitVector() ||
14750          VT.getVectorNumElements() != SVT.getVectorNumElements());
14751   return SDValue();
14752 }
14753
14754 SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
14755   SDLoc DL(Op);
14756   MVT VT = Op.getSimpleValueType();
14757   SDValue In = Op.getOperand(0);
14758   MVT InVT = In.getSimpleValueType();
14759
14760   if (VT == MVT::i1) {
14761     assert((InVT.isInteger() && (InVT.getSizeInBits() <= 64)) &&
14762            "Invalid scalar TRUNCATE operation");
14763     if (InVT.getSizeInBits() >= 32)
14764       return SDValue();
14765     In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In);
14766     return DAG.getNode(ISD::TRUNCATE, DL, VT, In);
14767   }
14768   assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
14769          "Invalid TRUNCATE operation");
14770
14771   if (InVT.is512BitVector() || VT.getVectorElementType() == MVT::i1) {
14772     if (VT.getVectorElementType().getSizeInBits() >=8)
14773       return DAG.getNode(X86ISD::VTRUNC, DL, VT, In);
14774
14775     assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type");
14776     unsigned NumElts = InVT.getVectorNumElements();
14777     assert ((NumElts == 8 || NumElts == 16) && "Unexpected vector type");
14778     if (InVT.getSizeInBits() < 512) {
14779       MVT ExtVT = (NumElts == 16)? MVT::v16i32 : MVT::v8i64;
14780       In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
14781       InVT = ExtVT;
14782     }
14783
14784     SDValue Cst = DAG.getTargetConstant(1, InVT.getVectorElementType());
14785     const Constant *C = (dyn_cast<ConstantSDNode>(Cst))->getConstantIntValue();
14786     SDValue CP = DAG.getConstantPool(C, getPointerTy());
14787     unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
14788     SDValue Ld = DAG.getLoad(Cst.getValueType(), DL, DAG.getEntryNode(), CP,
14789                            MachinePointerInfo::getConstantPool(),
14790                            false, false, false, Alignment);
14791     SDValue OneV = DAG.getNode(X86ISD::VBROADCAST, DL, InVT, Ld);
14792     SDValue And = DAG.getNode(ISD::AND, DL, InVT, OneV, In);
14793     return DAG.getNode(X86ISD::TESTM, DL, VT, And, And);
14794   }
14795
14796   if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
14797     // On AVX2, v4i64 -> v4i32 becomes VPERMD.
14798     if (Subtarget->hasInt256()) {
14799       static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
14800       In = DAG.getNode(ISD::BITCAST, DL, MVT::v8i32, In);
14801       In = DAG.getVectorShuffle(MVT::v8i32, DL, In, DAG.getUNDEF(MVT::v8i32),
14802                                 ShufMask);
14803       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
14804                          DAG.getIntPtrConstant(0));
14805     }
14806
14807     SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
14808                                DAG.getIntPtrConstant(0));
14809     SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
14810                                DAG.getIntPtrConstant(2));
14811     OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpLo);
14812     OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpHi);
14813     static const int ShufMask[] = {0, 2, 4, 6};
14814     return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
14815   }
14816
14817   if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
14818     // On AVX2, v8i32 -> v8i16 becomed PSHUFB.
14819     if (Subtarget->hasInt256()) {
14820       In = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, In);
14821
14822       SmallVector<SDValue,32> pshufbMask;
14823       for (unsigned i = 0; i < 2; ++i) {
14824         pshufbMask.push_back(DAG.getConstant(0x0, MVT::i8));
14825         pshufbMask.push_back(DAG.getConstant(0x1, MVT::i8));
14826         pshufbMask.push_back(DAG.getConstant(0x4, MVT::i8));
14827         pshufbMask.push_back(DAG.getConstant(0x5, MVT::i8));
14828         pshufbMask.push_back(DAG.getConstant(0x8, MVT::i8));
14829         pshufbMask.push_back(DAG.getConstant(0x9, MVT::i8));
14830         pshufbMask.push_back(DAG.getConstant(0xc, MVT::i8));
14831         pshufbMask.push_back(DAG.getConstant(0xd, MVT::i8));
14832         for (unsigned j = 0; j < 8; ++j)
14833           pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
14834       }
14835       SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, pshufbMask);
14836       In = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, In, BV);
14837       In = DAG.getNode(ISD::BITCAST, DL, MVT::v4i64, In);
14838
14839       static const int ShufMask[] = {0,  2,  -1,  -1};
14840       In = DAG.getVectorShuffle(MVT::v4i64, DL,  In, DAG.getUNDEF(MVT::v4i64),
14841                                 &ShufMask[0]);
14842       In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
14843                        DAG.getIntPtrConstant(0));
14844       return DAG.getNode(ISD::BITCAST, DL, VT, In);
14845     }
14846
14847     SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
14848                                DAG.getIntPtrConstant(0));
14849
14850     SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
14851                                DAG.getIntPtrConstant(4));
14852
14853     OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, OpLo);
14854     OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, OpHi);
14855
14856     // The PSHUFB mask:
14857     static const int ShufMask1[] = {0,  1,  4,  5,  8,  9, 12, 13,
14858                                    -1, -1, -1, -1, -1, -1, -1, -1};
14859
14860     SDValue Undef = DAG.getUNDEF(MVT::v16i8);
14861     OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, Undef, ShufMask1);
14862     OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, Undef, ShufMask1);
14863
14864     OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpLo);
14865     OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpHi);
14866
14867     // The MOVLHPS Mask:
14868     static const int ShufMask2[] = {0, 1, 4, 5};
14869     SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
14870     return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, res);
14871   }
14872
14873   // Handle truncation of V256 to V128 using shuffles.
14874   if (!VT.is128BitVector() || !InVT.is256BitVector())
14875     return SDValue();
14876
14877   assert(Subtarget->hasFp256() && "256-bit vector without AVX!");
14878
14879   unsigned NumElems = VT.getVectorNumElements();
14880   MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2);
14881
14882   SmallVector<int, 16> MaskVec(NumElems * 2, -1);
14883   // Prepare truncation shuffle mask
14884   for (unsigned i = 0; i != NumElems; ++i)
14885     MaskVec[i] = i * 2;
14886   SDValue V = DAG.getVectorShuffle(NVT, DL,
14887                                    DAG.getNode(ISD::BITCAST, DL, NVT, In),
14888                                    DAG.getUNDEF(NVT), &MaskVec[0]);
14889   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
14890                      DAG.getIntPtrConstant(0));
14891 }
14892
14893 SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op,
14894                                            SelectionDAG &DAG) const {
14895   assert(!Op.getSimpleValueType().isVector());
14896
14897   std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
14898     /*IsSigned=*/ true, /*IsReplace=*/ false);
14899   SDValue FIST = Vals.first, StackSlot = Vals.second;
14900   // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
14901   if (!FIST.getNode()) return Op;
14902
14903   if (StackSlot.getNode())
14904     // Load the result.
14905     return DAG.getLoad(Op.getValueType(), SDLoc(Op),
14906                        FIST, StackSlot, MachinePointerInfo(),
14907                        false, false, false, 0);
14908
14909   // The node is the result.
14910   return FIST;
14911 }
14912
14913 SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op,
14914                                            SelectionDAG &DAG) const {
14915   std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
14916     /*IsSigned=*/ false, /*IsReplace=*/ false);
14917   SDValue FIST = Vals.first, StackSlot = Vals.second;
14918   assert(FIST.getNode() && "Unexpected failure");
14919
14920   if (StackSlot.getNode())
14921     // Load the result.
14922     return DAG.getLoad(Op.getValueType(), SDLoc(Op),
14923                        FIST, StackSlot, MachinePointerInfo(),
14924                        false, false, false, 0);
14925
14926   // The node is the result.
14927   return FIST;
14928 }
14929
14930 static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
14931   SDLoc DL(Op);
14932   MVT VT = Op.getSimpleValueType();
14933   SDValue In = Op.getOperand(0);
14934   MVT SVT = In.getSimpleValueType();
14935
14936   assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
14937
14938   return DAG.getNode(X86ISD::VFPEXT, DL, VT,
14939                      DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
14940                                  In, DAG.getUNDEF(SVT)));
14941 }
14942
14943 /// The only differences between FABS and FNEG are the mask and the logic op.
14944 /// FNEG also has a folding opportunity for FNEG(FABS(x)).
14945 static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
14946   assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
14947          "Wrong opcode for lowering FABS or FNEG.");
14948
14949   bool IsFABS = (Op.getOpcode() == ISD::FABS);
14950
14951   // If this is a FABS and it has an FNEG user, bail out to fold the combination
14952   // into an FNABS. We'll lower the FABS after that if it is still in use.
14953   if (IsFABS)
14954     for (SDNode *User : Op->uses())
14955       if (User->getOpcode() == ISD::FNEG)
14956         return Op;
14957
14958   SDValue Op0 = Op.getOperand(0);
14959   bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
14960
14961   SDLoc dl(Op);
14962   MVT VT = Op.getSimpleValueType();
14963   // Assume scalar op for initialization; update for vector if needed.
14964   // Note that there are no scalar bitwise logical SSE/AVX instructions, so we
14965   // generate a 16-byte vector constant and logic op even for the scalar case.
14966   // Using a 16-byte mask allows folding the load of the mask with
14967   // the logic op, so it can save (~4 bytes) on code size.
14968   MVT EltVT = VT;
14969   unsigned NumElts = VT == MVT::f64 ? 2 : 4;
14970   // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
14971   // decide if we should generate a 16-byte constant mask when we only need 4 or
14972   // 8 bytes for the scalar case.
14973   if (VT.isVector()) {
14974     EltVT = VT.getVectorElementType();
14975     NumElts = VT.getVectorNumElements();
14976   }
14977
14978   unsigned EltBits = EltVT.getSizeInBits();
14979   LLVMContext *Context = DAG.getContext();
14980   // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
14981   APInt MaskElt =
14982     IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignBit(EltBits);
14983   Constant *C = ConstantInt::get(*Context, MaskElt);
14984   C = ConstantVector::getSplat(NumElts, C);
14985   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14986   SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy());
14987   unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
14988   SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
14989                              MachinePointerInfo::getConstantPool(),
14990                              false, false, false, Alignment);
14991
14992   if (VT.isVector()) {
14993     // For a vector, cast operands to a vector type, perform the logic op,
14994     // and cast the result back to the original value type.
14995     MVT VecVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
14996     SDValue MaskCasted = DAG.getNode(ISD::BITCAST, dl, VecVT, Mask);
14997     SDValue Operand = IsFNABS ?
14998       DAG.getNode(ISD::BITCAST, dl, VecVT, Op0.getOperand(0)) :
14999       DAG.getNode(ISD::BITCAST, dl, VecVT, Op0);
15000     unsigned BitOp = IsFABS ? ISD::AND : IsFNABS ? ISD::OR : ISD::XOR;
15001     return DAG.getNode(ISD::BITCAST, dl, VT,
15002                        DAG.getNode(BitOp, dl, VecVT, Operand, MaskCasted));
15003   }
15004
15005   // If not vector, then scalar.
15006   unsigned BitOp = IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
15007   SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
15008   return DAG.getNode(BitOp, dl, VT, Operand, Mask);
15009 }
15010
15011 static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
15012   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15013   LLVMContext *Context = DAG.getContext();
15014   SDValue Op0 = Op.getOperand(0);
15015   SDValue Op1 = Op.getOperand(1);
15016   SDLoc dl(Op);
15017   MVT VT = Op.getSimpleValueType();
15018   MVT SrcVT = Op1.getSimpleValueType();
15019
15020   // If second operand is smaller, extend it first.
15021   if (SrcVT.bitsLT(VT)) {
15022     Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1);
15023     SrcVT = VT;
15024   }
15025   // And if it is bigger, shrink it first.
15026   if (SrcVT.bitsGT(VT)) {
15027     Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1));
15028     SrcVT = VT;
15029   }
15030
15031   // At this point the operands and the result should have the same
15032   // type, and that won't be f80 since that is not custom lowered.
15033
15034   const fltSemantics &Sem =
15035       VT == MVT::f64 ? APFloat::IEEEdouble : APFloat::IEEEsingle;
15036   const unsigned SizeInBits = VT.getSizeInBits();
15037
15038   SmallVector<Constant *, 4> CV(
15039       VT == MVT::f64 ? 2 : 4,
15040       ConstantFP::get(*Context, APFloat(Sem, APInt(SizeInBits, 0))));
15041
15042   // First, clear all bits but the sign bit from the second operand (sign).
15043   CV[0] = ConstantFP::get(*Context,
15044                           APFloat(Sem, APInt::getHighBitsSet(SizeInBits, 1)));
15045   Constant *C = ConstantVector::get(CV);
15046   SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(), 16);
15047   SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx,
15048                               MachinePointerInfo::getConstantPool(),
15049                               false, false, false, 16);
15050   SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1);
15051
15052   // Next, clear the sign bit from the first operand (magnitude).
15053   // If it's a constant, we can clear it here.
15054   if (ConstantFPSDNode *Op0CN = dyn_cast<ConstantFPSDNode>(Op0)) {
15055     APFloat APF = Op0CN->getValueAPF();
15056     // If the magnitude is a positive zero, the sign bit alone is enough.
15057     if (APF.isPosZero())
15058       return SignBit;
15059     APF.clearSign();
15060     CV[0] = ConstantFP::get(*Context, APF);
15061   } else {
15062     CV[0] = ConstantFP::get(
15063         *Context,
15064         APFloat(Sem, APInt::getLowBitsSet(SizeInBits, SizeInBits - 1)));
15065   }
15066   C = ConstantVector::get(CV);
15067   CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(), 16);
15068   SDValue Val = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
15069                             MachinePointerInfo::getConstantPool(),
15070                             false, false, false, 16);
15071   // If the magnitude operand wasn't a constant, we need to AND out the sign.
15072   if (!isa<ConstantFPSDNode>(Op0))
15073     Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Val);
15074
15075   // OR the magnitude value with the sign bit.
15076   return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit);
15077 }
15078
15079 static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
15080   SDValue N0 = Op.getOperand(0);
15081   SDLoc dl(Op);
15082   MVT VT = Op.getSimpleValueType();
15083
15084   // Lower ISD::FGETSIGN to (AND (X86ISD::FGETSIGNx86 ...) 1).
15085   SDValue xFGETSIGN = DAG.getNode(X86ISD::FGETSIGNx86, dl, VT, N0,
15086                                   DAG.getConstant(1, VT));
15087   return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, VT));
15088 }
15089
15090 // Check whether an OR'd tree is PTEST-able.
15091 static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget *Subtarget,
15092                                       SelectionDAG &DAG) {
15093   assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
15094
15095   if (!Subtarget->hasSSE41())
15096     return SDValue();
15097
15098   if (!Op->hasOneUse())
15099     return SDValue();
15100
15101   SDNode *N = Op.getNode();
15102   SDLoc DL(N);
15103
15104   SmallVector<SDValue, 8> Opnds;
15105   DenseMap<SDValue, unsigned> VecInMap;
15106   SmallVector<SDValue, 8> VecIns;
15107   EVT VT = MVT::Other;
15108
15109   // Recognize a special case where a vector is casted into wide integer to
15110   // test all 0s.
15111   Opnds.push_back(N->getOperand(0));
15112   Opnds.push_back(N->getOperand(1));
15113
15114   for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
15115     SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
15116     // BFS traverse all OR'd operands.
15117     if (I->getOpcode() == ISD::OR) {
15118       Opnds.push_back(I->getOperand(0));
15119       Opnds.push_back(I->getOperand(1));
15120       // Re-evaluate the number of nodes to be traversed.
15121       e += 2; // 2 more nodes (LHS and RHS) are pushed.
15122       continue;
15123     }
15124
15125     // Quit if a non-EXTRACT_VECTOR_ELT
15126     if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
15127       return SDValue();
15128
15129     // Quit if without a constant index.
15130     SDValue Idx = I->getOperand(1);
15131     if (!isa<ConstantSDNode>(Idx))
15132       return SDValue();
15133
15134     SDValue ExtractedFromVec = I->getOperand(0);
15135     DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);
15136     if (M == VecInMap.end()) {
15137       VT = ExtractedFromVec.getValueType();
15138       // Quit if not 128/256-bit vector.
15139       if (!VT.is128BitVector() && !VT.is256BitVector())
15140         return SDValue();
15141       // Quit if not the same type.
15142       if (VecInMap.begin() != VecInMap.end() &&
15143           VT != VecInMap.begin()->first.getValueType())
15144         return SDValue();
15145       M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
15146       VecIns.push_back(ExtractedFromVec);
15147     }
15148     M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
15149   }
15150
15151   assert((VT.is128BitVector() || VT.is256BitVector()) &&
15152          "Not extracted from 128-/256-bit vector.");
15153
15154   unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;
15155
15156   for (DenseMap<SDValue, unsigned>::const_iterator
15157         I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
15158     // Quit if not all elements are used.
15159     if (I->second != FullMask)
15160       return SDValue();
15161   }
15162
15163   EVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
15164
15165   // Cast all vectors into TestVT for PTEST.
15166   for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
15167     VecIns[i] = DAG.getNode(ISD::BITCAST, DL, TestVT, VecIns[i]);
15168
15169   // If more than one full vectors are evaluated, OR them first before PTEST.
15170   for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
15171     // Each iteration will OR 2 nodes and append the result until there is only
15172     // 1 node left, i.e. the final OR'd value of all vectors.
15173     SDValue LHS = VecIns[Slot];
15174     SDValue RHS = VecIns[Slot + 1];
15175     VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
15176   }
15177
15178   return DAG.getNode(X86ISD::PTEST, DL, MVT::i32,
15179                      VecIns.back(), VecIns.back());
15180 }
15181
15182 /// \brief return true if \c Op has a use that doesn't just read flags.
15183 static bool hasNonFlagsUse(SDValue Op) {
15184   for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
15185        ++UI) {
15186     SDNode *User = *UI;
15187     unsigned UOpNo = UI.getOperandNo();
15188     if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
15189       // Look pass truncate.
15190       UOpNo = User->use_begin().getOperandNo();
15191       User = *User->use_begin();
15192     }
15193
15194     if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
15195         !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
15196       return true;
15197   }
15198   return false;
15199 }
15200
15201 /// Emit nodes that will be selected as "test Op0,Op0", or something
15202 /// equivalent.
15203 SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl,
15204                                     SelectionDAG &DAG) const {
15205   if (Op.getValueType() == MVT::i1) {
15206     SDValue ExtOp = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, Op);
15207     return DAG.getNode(X86ISD::CMP, dl, MVT::i32, ExtOp,
15208                        DAG.getConstant(0, MVT::i8));
15209   }
15210   // CF and OF aren't always set the way we want. Determine which
15211   // of these we need.
15212   bool NeedCF = false;
15213   bool NeedOF = false;
15214   switch (X86CC) {
15215   default: break;
15216   case X86::COND_A: case X86::COND_AE:
15217   case X86::COND_B: case X86::COND_BE:
15218     NeedCF = true;
15219     break;
15220   case X86::COND_G: case X86::COND_GE:
15221   case X86::COND_L: case X86::COND_LE:
15222   case X86::COND_O: case X86::COND_NO: {
15223     // Check if we really need to set the
15224     // Overflow flag. If NoSignedWrap is present
15225     // that is not actually needed.
15226     switch (Op->getOpcode()) {
15227     case ISD::ADD:
15228     case ISD::SUB:
15229     case ISD::MUL:
15230     case ISD::SHL: {
15231       const BinaryWithFlagsSDNode *BinNode =
15232           cast<BinaryWithFlagsSDNode>(Op.getNode());
15233       if (BinNode->hasNoSignedWrap())
15234         break;
15235     }
15236     default:
15237       NeedOF = true;
15238       break;
15239     }
15240     break;
15241   }
15242   }
15243   // See if we can use the EFLAGS value from the operand instead of
15244   // doing a separate TEST. TEST always sets OF and CF to 0, so unless
15245   // we prove that the arithmetic won't overflow, we can't use OF or CF.
15246   if (Op.getResNo() != 0 || NeedOF || NeedCF) {
15247     // Emit a CMP with 0, which is the TEST pattern.
15248     //if (Op.getValueType() == MVT::i1)
15249     //  return DAG.getNode(X86ISD::CMP, dl, MVT::i1, Op,
15250     //                     DAG.getConstant(0, MVT::i1));
15251     return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
15252                        DAG.getConstant(0, Op.getValueType()));
15253   }
15254   unsigned Opcode = 0;
15255   unsigned NumOperands = 0;
15256
15257   // Truncate operations may prevent the merge of the SETCC instruction
15258   // and the arithmetic instruction before it. Attempt to truncate the operands
15259   // of the arithmetic instruction and use a reduced bit-width instruction.
15260   bool NeedTruncation = false;
15261   SDValue ArithOp = Op;
15262   if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
15263     SDValue Arith = Op->getOperand(0);
15264     // Both the trunc and the arithmetic op need to have one user each.
15265     if (Arith->hasOneUse())
15266       switch (Arith.getOpcode()) {
15267         default: break;
15268         case ISD::ADD:
15269         case ISD::SUB:
15270         case ISD::AND:
15271         case ISD::OR:
15272         case ISD::XOR: {
15273           NeedTruncation = true;
15274           ArithOp = Arith;
15275         }
15276       }
15277   }
15278
15279   // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
15280   // which may be the result of a CAST.  We use the variable 'Op', which is the
15281   // non-casted variable when we check for possible users.
15282   switch (ArithOp.getOpcode()) {
15283   case ISD::ADD:
15284     // Due to an isel shortcoming, be conservative if this add is likely to be
15285     // selected as part of a load-modify-store instruction. When the root node
15286     // in a match is a store, isel doesn't know how to remap non-chain non-flag
15287     // uses of other nodes in the match, such as the ADD in this case. This
15288     // leads to the ADD being left around and reselected, with the result being
15289     // two adds in the output.  Alas, even if none our users are stores, that
15290     // doesn't prove we're O.K.  Ergo, if we have any parents that aren't
15291     // CopyToReg or SETCC, eschew INC/DEC.  A better fix seems to require
15292     // climbing the DAG back to the root, and it doesn't seem to be worth the
15293     // effort.
15294     for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
15295          UE = Op.getNode()->use_end(); UI != UE; ++UI)
15296       if (UI->getOpcode() != ISD::CopyToReg &&
15297           UI->getOpcode() != ISD::SETCC &&
15298           UI->getOpcode() != ISD::STORE)
15299         goto default_case;
15300
15301     if (ConstantSDNode *C =
15302         dyn_cast<ConstantSDNode>(ArithOp.getNode()->getOperand(1))) {
15303       // An add of one will be selected as an INC.
15304       if (C->getAPIntValue() == 1 && !Subtarget->slowIncDec()) {
15305         Opcode = X86ISD::INC;
15306         NumOperands = 1;
15307         break;
15308       }
15309
15310       // An add of negative one (subtract of one) will be selected as a DEC.
15311       if (C->getAPIntValue().isAllOnesValue() && !Subtarget->slowIncDec()) {
15312         Opcode = X86ISD::DEC;
15313         NumOperands = 1;
15314         break;
15315       }
15316     }
15317
15318     // Otherwise use a regular EFLAGS-setting add.
15319     Opcode = X86ISD::ADD;
15320     NumOperands = 2;
15321     break;
15322   case ISD::SHL:
15323   case ISD::SRL:
15324     // If we have a constant logical shift that's only used in a comparison
15325     // against zero turn it into an equivalent AND. This allows turning it into
15326     // a TEST instruction later.
15327     if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) && Op->hasOneUse() &&
15328         isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) {
15329       EVT VT = Op.getValueType();
15330       unsigned BitWidth = VT.getSizeInBits();
15331       unsigned ShAmt = Op->getConstantOperandVal(1);
15332       if (ShAmt >= BitWidth) // Avoid undefined shifts.
15333         break;
15334       APInt Mask = ArithOp.getOpcode() == ISD::SRL
15335                        ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt)
15336                        : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);
15337       if (!Mask.isSignedIntN(32)) // Avoid large immediates.
15338         break;
15339       SDValue New = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0),
15340                                 DAG.getConstant(Mask, VT));
15341       DAG.ReplaceAllUsesWith(Op, New);
15342       Op = New;
15343     }
15344     break;
15345
15346   case ISD::AND:
15347     // If the primary and result isn't used, don't bother using X86ISD::AND,
15348     // because a TEST instruction will be better.
15349     if (!hasNonFlagsUse(Op))
15350       break;
15351     // FALL THROUGH
15352   case ISD::SUB:
15353   case ISD::OR:
15354   case ISD::XOR:
15355     // Due to the ISEL shortcoming noted above, be conservative if this op is
15356     // likely to be selected as part of a load-modify-store instruction.
15357     for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
15358            UE = Op.getNode()->use_end(); UI != UE; ++UI)
15359       if (UI->getOpcode() == ISD::STORE)
15360         goto default_case;
15361
15362     // Otherwise use a regular EFLAGS-setting instruction.
15363     switch (ArithOp.getOpcode()) {
15364     default: llvm_unreachable("unexpected operator!");
15365     case ISD::SUB: Opcode = X86ISD::SUB; break;
15366     case ISD::XOR: Opcode = X86ISD::XOR; break;
15367     case ISD::AND: Opcode = X86ISD::AND; break;
15368     case ISD::OR: {
15369       if (!NeedTruncation && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
15370         SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG);
15371         if (EFLAGS.getNode())
15372           return EFLAGS;
15373       }
15374       Opcode = X86ISD::OR;
15375       break;
15376     }
15377     }
15378
15379     NumOperands = 2;
15380     break;
15381   case X86ISD::ADD:
15382   case X86ISD::SUB:
15383   case X86ISD::INC:
15384   case X86ISD::DEC:
15385   case X86ISD::OR:
15386   case X86ISD::XOR:
15387   case X86ISD::AND:
15388     return SDValue(Op.getNode(), 1);
15389   default:
15390   default_case:
15391     break;
15392   }
15393
15394   // If we found that truncation is beneficial, perform the truncation and
15395   // update 'Op'.
15396   if (NeedTruncation) {
15397     EVT VT = Op.getValueType();
15398     SDValue WideVal = Op->getOperand(0);
15399     EVT WideVT = WideVal.getValueType();
15400     unsigned ConvertedOp = 0;
15401     // Use a target machine opcode to prevent further DAGCombine
15402     // optimizations that may separate the arithmetic operations
15403     // from the setcc node.
15404     switch (WideVal.getOpcode()) {
15405       default: break;
15406       case ISD::ADD: ConvertedOp = X86ISD::ADD; break;
15407       case ISD::SUB: ConvertedOp = X86ISD::SUB; break;
15408       case ISD::AND: ConvertedOp = X86ISD::AND; break;
15409       case ISD::OR:  ConvertedOp = X86ISD::OR;  break;
15410       case ISD::XOR: ConvertedOp = X86ISD::XOR; break;
15411     }
15412
15413     if (ConvertedOp) {
15414       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15415       if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
15416         SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
15417         SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
15418         Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1);
15419       }
15420     }
15421   }
15422
15423   if (Opcode == 0)
15424     // Emit a CMP with 0, which is the TEST pattern.
15425     return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
15426                        DAG.getConstant(0, Op.getValueType()));
15427
15428   SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
15429   SmallVector<SDValue, 4> Ops;
15430   for (unsigned i = 0; i != NumOperands; ++i)
15431     Ops.push_back(Op.getOperand(i));
15432
15433   SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
15434   DAG.ReplaceAllUsesWith(Op, New);
15435   return SDValue(New.getNode(), 1);
15436 }
15437
15438 /// Emit nodes that will be selected as "cmp Op0,Op1", or something
15439 /// equivalent.
15440 SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
15441                                    SDLoc dl, SelectionDAG &DAG) const {
15442   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1)) {
15443     if (C->getAPIntValue() == 0)
15444       return EmitTest(Op0, X86CC, dl, DAG);
15445
15446      if (Op0.getValueType() == MVT::i1)
15447        llvm_unreachable("Unexpected comparison operation for MVT::i1 operands");
15448   }
15449
15450   if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
15451        Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
15452     // Do the comparison at i32 if it's smaller, besides the Atom case.
15453     // This avoids subregister aliasing issues. Keep the smaller reference
15454     // if we're optimizing for size, however, as that'll allow better folding
15455     // of memory operations.
15456     if (Op0.getValueType() != MVT::i32 && Op0.getValueType() != MVT::i64 &&
15457         !DAG.getMachineFunction().getFunction()->hasFnAttribute(
15458             Attribute::MinSize) &&
15459         !Subtarget->isAtom()) {
15460       unsigned ExtendOp =
15461           isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
15462       Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0);
15463       Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1);
15464     }
15465     // Use SUB instead of CMP to enable CSE between SUB and CMP.
15466     SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
15467     SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs,
15468                               Op0, Op1);
15469     return SDValue(Sub.getNode(), 1);
15470   }
15471   return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
15472 }
15473
15474 /// Convert a comparison if required by the subtarget.
15475 SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
15476                                                  SelectionDAG &DAG) const {
15477   // If the subtarget does not support the FUCOMI instruction, floating-point
15478   // comparisons have to be converted.
15479   if (Subtarget->hasCMov() ||
15480       Cmp.getOpcode() != X86ISD::CMP ||
15481       !Cmp.getOperand(0).getValueType().isFloatingPoint() ||
15482       !Cmp.getOperand(1).getValueType().isFloatingPoint())
15483     return Cmp;
15484
15485   // The instruction selector will select an FUCOM instruction instead of
15486   // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
15487   // build an SDNode sequence that transfers the result from FPSW into EFLAGS:
15488   // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
15489   SDLoc dl(Cmp);
15490   SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
15491   SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
15492   SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
15493                             DAG.getConstant(8, MVT::i8));
15494   SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
15495   return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
15496 }
15497
15498 /// The minimum architected relative accuracy is 2^-12. We need one
15499 /// Newton-Raphson step to have a good float result (24 bits of precision).
15500 SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op,
15501                                             DAGCombinerInfo &DCI,
15502                                             unsigned &RefinementSteps,
15503                                             bool &UseOneConstNR) const {
15504   // FIXME: We should use instruction latency models to calculate the cost of
15505   // each potential sequence, but this is very hard to do reliably because
15506   // at least Intel's Core* chips have variable timing based on the number of
15507   // significant digits in the divisor and/or sqrt operand.
15508   if (!Subtarget->useSqrtEst())
15509     return SDValue();
15510
15511   EVT VT = Op.getValueType();
15512
15513   // SSE1 has rsqrtss and rsqrtps.
15514   // TODO: Add support for AVX512 (v16f32).
15515   // It is likely not profitable to do this for f64 because a double-precision
15516   // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
15517   // instructions: convert to single, rsqrtss, convert back to double, refine
15518   // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
15519   // along with FMA, this could be a throughput win.
15520   if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
15521       (Subtarget->hasAVX() && VT == MVT::v8f32)) {
15522     RefinementSteps = 1;
15523     UseOneConstNR = false;
15524     return DCI.DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
15525   }
15526   return SDValue();
15527 }
15528
15529 /// The minimum architected relative accuracy is 2^-12. We need one
15530 /// Newton-Raphson step to have a good float result (24 bits of precision).
15531 SDValue X86TargetLowering::getRecipEstimate(SDValue Op,
15532                                             DAGCombinerInfo &DCI,
15533                                             unsigned &RefinementSteps) const {
15534   // FIXME: We should use instruction latency models to calculate the cost of
15535   // each potential sequence, but this is very hard to do reliably because
15536   // at least Intel's Core* chips have variable timing based on the number of
15537   // significant digits in the divisor.
15538   if (!Subtarget->useReciprocalEst())
15539     return SDValue();
15540
15541   EVT VT = Op.getValueType();
15542
15543   // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
15544   // TODO: Add support for AVX512 (v16f32).
15545   // It is likely not profitable to do this for f64 because a double-precision
15546   // reciprocal estimate with refinement on x86 prior to FMA requires
15547   // 15 instructions: convert to single, rcpss, convert back to double, refine
15548   // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
15549   // along with FMA, this could be a throughput win.
15550   if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
15551       (Subtarget->hasAVX() && VT == MVT::v8f32)) {
15552     RefinementSteps = ReciprocalEstimateRefinementSteps;
15553     return DCI.DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
15554   }
15555   return SDValue();
15556 }
15557
15558 static bool isAllOnes(SDValue V) {
15559   ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
15560   return C && C->isAllOnesValue();
15561 }
15562
15563 /// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node
15564 /// if it's possible.
15565 SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
15566                                      SDLoc dl, SelectionDAG &DAG) const {
15567   SDValue Op0 = And.getOperand(0);
15568   SDValue Op1 = And.getOperand(1);
15569   if (Op0.getOpcode() == ISD::TRUNCATE)
15570     Op0 = Op0.getOperand(0);
15571   if (Op1.getOpcode() == ISD::TRUNCATE)
15572     Op1 = Op1.getOperand(0);
15573
15574   SDValue LHS, RHS;
15575   if (Op1.getOpcode() == ISD::SHL)
15576     std::swap(Op0, Op1);
15577   if (Op0.getOpcode() == ISD::SHL) {
15578     if (ConstantSDNode *And00C = dyn_cast<ConstantSDNode>(Op0.getOperand(0)))
15579       if (And00C->getZExtValue() == 1) {
15580         // If we looked past a truncate, check that it's only truncating away
15581         // known zeros.
15582         unsigned BitWidth = Op0.getValueSizeInBits();
15583         unsigned AndBitWidth = And.getValueSizeInBits();
15584         if (BitWidth > AndBitWidth) {
15585           APInt Zeros, Ones;
15586           DAG.computeKnownBits(Op0, Zeros, Ones);
15587           if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth)
15588             return SDValue();
15589         }
15590         LHS = Op1;
15591         RHS = Op0.getOperand(1);
15592       }
15593   } else if (Op1.getOpcode() == ISD::Constant) {
15594     ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
15595     uint64_t AndRHSVal = AndRHS->getZExtValue();
15596     SDValue AndLHS = Op0;
15597
15598     if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
15599       LHS = AndLHS.getOperand(0);
15600       RHS = AndLHS.getOperand(1);
15601     }
15602
15603     // Use BT if the immediate can't be encoded in a TEST instruction.
15604     if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) {
15605       LHS = AndLHS;
15606       RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), LHS.getValueType());
15607     }
15608   }
15609
15610   if (LHS.getNode()) {
15611     // If LHS is i8, promote it to i32 with any_extend.  There is no i8 BT
15612     // instruction.  Since the shift amount is in-range-or-undefined, we know
15613     // that doing a bittest on the i32 value is ok.  We extend to i32 because
15614     // the encoding for the i16 version is larger than the i32 version.
15615     // Also promote i16 to i32 for performance / code size reason.
15616     if (LHS.getValueType() == MVT::i8 ||
15617         LHS.getValueType() == MVT::i16)
15618       LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);
15619
15620     // If the operand types disagree, extend the shift amount to match.  Since
15621     // BT ignores high bits (like shifts) we can use anyextend.
15622     if (LHS.getValueType() != RHS.getValueType())
15623       RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS);
15624
15625     SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS);
15626     X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
15627     return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
15628                        DAG.getConstant(Cond, MVT::i8), BT);
15629   }
15630
15631   return SDValue();
15632 }
15633
15634 /// \brief - Turns an ISD::CondCode into a value suitable for SSE floating point
15635 /// mask CMPs.
15636 static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
15637                               SDValue &Op1) {
15638   unsigned SSECC;
15639   bool Swap = false;
15640
15641   // SSE Condition code mapping:
15642   //  0 - EQ
15643   //  1 - LT
15644   //  2 - LE
15645   //  3 - UNORD
15646   //  4 - NEQ
15647   //  5 - NLT
15648   //  6 - NLE
15649   //  7 - ORD
15650   switch (SetCCOpcode) {
15651   default: llvm_unreachable("Unexpected SETCC condition");
15652   case ISD::SETOEQ:
15653   case ISD::SETEQ:  SSECC = 0; break;
15654   case ISD::SETOGT:
15655   case ISD::SETGT:  Swap = true; // Fallthrough
15656   case ISD::SETLT:
15657   case ISD::SETOLT: SSECC = 1; break;
15658   case ISD::SETOGE:
15659   case ISD::SETGE:  Swap = true; // Fallthrough
15660   case ISD::SETLE:
15661   case ISD::SETOLE: SSECC = 2; break;
15662   case ISD::SETUO:  SSECC = 3; break;
15663   case ISD::SETUNE:
15664   case ISD::SETNE:  SSECC = 4; break;
15665   case ISD::SETULE: Swap = true; // Fallthrough
15666   case ISD::SETUGE: SSECC = 5; break;
15667   case ISD::SETULT: Swap = true; // Fallthrough
15668   case ISD::SETUGT: SSECC = 6; break;
15669   case ISD::SETO:   SSECC = 7; break;
15670   case ISD::SETUEQ:
15671   case ISD::SETONE: SSECC = 8; break;
15672   }
15673   if (Swap)
15674     std::swap(Op0, Op1);
15675
15676   return SSECC;
15677 }
15678
15679 // Lower256IntVSETCC - Break a VSETCC 256-bit integer VSETCC into two new 128
15680 // ones, and then concatenate the result back.
15681 static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
15682   MVT VT = Op.getSimpleValueType();
15683
15684   assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
15685          "Unsupported value type for operation");
15686
15687   unsigned NumElems = VT.getVectorNumElements();
15688   SDLoc dl(Op);
15689   SDValue CC = Op.getOperand(2);
15690
15691   // Extract the LHS vectors
15692   SDValue LHS = Op.getOperand(0);
15693   SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
15694   SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
15695
15696   // Extract the RHS vectors
15697   SDValue RHS = Op.getOperand(1);
15698   SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl);
15699   SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl);
15700
15701   // Issue the operation on the smaller types and concatenate the result back
15702   MVT EltVT = VT.getVectorElementType();
15703   MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
15704   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
15705                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
15706                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
15707 }
15708
15709 static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG,
15710                                      const X86Subtarget *Subtarget) {
15711   SDValue Op0 = Op.getOperand(0);
15712   SDValue Op1 = Op.getOperand(1);
15713   SDValue CC = Op.getOperand(2);
15714   MVT VT = Op.getSimpleValueType();
15715   SDLoc dl(Op);
15716
15717   assert(Op0.getValueType().getVectorElementType().getSizeInBits() >= 8 &&
15718          Op.getValueType().getScalarType() == MVT::i1 &&
15719          "Cannot set masked compare for this operation");
15720
15721   ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
15722   unsigned  Opc = 0;
15723   bool Unsigned = false;
15724   bool Swap = false;
15725   unsigned SSECC;
15726   switch (SetCCOpcode) {
15727   default: llvm_unreachable("Unexpected SETCC condition");
15728   case ISD::SETNE:  SSECC = 4; break;
15729   case ISD::SETEQ:  Opc = X86ISD::PCMPEQM; break;
15730   case ISD::SETUGT: SSECC = 6; Unsigned = true; break;
15731   case ISD::SETLT:  Swap = true; //fall-through
15732   case ISD::SETGT:  Opc = X86ISD::PCMPGTM; break;
15733   case ISD::SETULT: SSECC = 1; Unsigned = true; break;
15734   case ISD::SETUGE: SSECC = 5; Unsigned = true; break; //NLT
15735   case ISD::SETGE:  Swap = true; SSECC = 2; break; // LE + swap
15736   case ISD::SETULE: Unsigned = true; //fall-through
15737   case ISD::SETLE:  SSECC = 2; break;
15738   }
15739
15740   if (Swap)
15741     std::swap(Op0, Op1);
15742   if (Opc)
15743     return DAG.getNode(Opc, dl, VT, Op0, Op1);
15744   Opc = Unsigned ? X86ISD::CMPMU: X86ISD::CMPM;
15745   return DAG.getNode(Opc, dl, VT, Op0, Op1,
15746                      DAG.getConstant(SSECC, MVT::i8));
15747 }
15748
15749 /// \brief Try to turn a VSETULT into a VSETULE by modifying its second
15750 /// operand \p Op1.  If non-trivial (for example because it's not constant)
15751 /// return an empty value.
15752 static SDValue ChangeVSETULTtoVSETULE(SDLoc dl, SDValue Op1, SelectionDAG &DAG)
15753 {
15754   BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1.getNode());
15755   if (!BV)
15756     return SDValue();
15757
15758   MVT VT = Op1.getSimpleValueType();
15759   MVT EVT = VT.getVectorElementType();
15760   unsigned n = VT.getVectorNumElements();
15761   SmallVector<SDValue, 8> ULTOp1;
15762
15763   for (unsigned i = 0; i < n; ++i) {
15764     ConstantSDNode *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
15765     if (!Elt || Elt->isOpaque() || Elt->getValueType(0) != EVT)
15766       return SDValue();
15767
15768     // Avoid underflow.
15769     APInt Val = Elt->getAPIntValue();
15770     if (Val == 0)
15771       return SDValue();
15772
15773     ULTOp1.push_back(DAG.getConstant(Val - 1, EVT));
15774   }
15775
15776   return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, ULTOp1);
15777 }
15778
15779 static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
15780                            SelectionDAG &DAG) {
15781   SDValue Op0 = Op.getOperand(0);
15782   SDValue Op1 = Op.getOperand(1);
15783   SDValue CC = Op.getOperand(2);
15784   MVT VT = Op.getSimpleValueType();
15785   ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
15786   bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
15787   SDLoc dl(Op);
15788
15789   if (isFP) {
15790 #ifndef NDEBUG
15791     MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
15792     assert(EltVT == MVT::f32 || EltVT == MVT::f64);
15793 #endif
15794
15795     unsigned SSECC = translateX86FSETCC(SetCCOpcode, Op0, Op1);
15796     unsigned Opc = X86ISD::CMPP;
15797     if (Subtarget->hasAVX512() && VT.getVectorElementType() == MVT::i1) {
15798       assert(VT.getVectorNumElements() <= 16);
15799       Opc = X86ISD::CMPM;
15800     }
15801     // In the two special cases we can't handle, emit two comparisons.
15802     if (SSECC == 8) {
15803       unsigned CC0, CC1;
15804       unsigned CombineOpc;
15805       if (SetCCOpcode == ISD::SETUEQ) {
15806         CC0 = 3; CC1 = 0; CombineOpc = ISD::OR;
15807       } else {
15808         assert(SetCCOpcode == ISD::SETONE);
15809         CC0 = 7; CC1 = 4; CombineOpc = ISD::AND;
15810       }
15811
15812       SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
15813                                  DAG.getConstant(CC0, MVT::i8));
15814       SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
15815                                  DAG.getConstant(CC1, MVT::i8));
15816       return DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
15817     }
15818     // Handle all other FP comparisons here.
15819     return DAG.getNode(Opc, dl, VT, Op0, Op1,
15820                        DAG.getConstant(SSECC, MVT::i8));
15821   }
15822
15823   // Break 256-bit integer vector compare into smaller ones.
15824   if (VT.is256BitVector() && !Subtarget->hasInt256())
15825     return Lower256IntVSETCC(Op, DAG);
15826
15827   bool MaskResult = (VT.getVectorElementType() == MVT::i1);
15828   EVT OpVT = Op1.getValueType();
15829   if (Subtarget->hasAVX512()) {
15830     if (Op1.getValueType().is512BitVector() ||
15831         (Subtarget->hasBWI() && Subtarget->hasVLX()) ||
15832         (MaskResult && OpVT.getVectorElementType().getSizeInBits() >= 32))
15833       return LowerIntVSETCC_AVX512(Op, DAG, Subtarget);
15834
15835     // In AVX-512 architecture setcc returns mask with i1 elements,
15836     // But there is no compare instruction for i8 and i16 elements in KNL.
15837     // We are not talking about 512-bit operands in this case, these
15838     // types are illegal.
15839     if (MaskResult &&
15840         (OpVT.getVectorElementType().getSizeInBits() < 32 &&
15841          OpVT.getVectorElementType().getSizeInBits() >= 8))
15842       return DAG.getNode(ISD::TRUNCATE, dl, VT,
15843                          DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC));
15844   }
15845
15846   // We are handling one of the integer comparisons here.  Since SSE only has
15847   // GT and EQ comparisons for integer, swapping operands and multiple
15848   // operations may be required for some comparisons.
15849   unsigned Opc;
15850   bool Swap = false, Invert = false, FlipSigns = false, MinMax = false;
15851   bool Subus = false;
15852
15853   switch (SetCCOpcode) {
15854   default: llvm_unreachable("Unexpected SETCC condition");
15855   case ISD::SETNE:  Invert = true;
15856   case ISD::SETEQ:  Opc = X86ISD::PCMPEQ; break;
15857   case ISD::SETLT:  Swap = true;
15858   case ISD::SETGT:  Opc = X86ISD::PCMPGT; break;
15859   case ISD::SETGE:  Swap = true;
15860   case ISD::SETLE:  Opc = X86ISD::PCMPGT;
15861                     Invert = true; break;
15862   case ISD::SETULT: Swap = true;
15863   case ISD::SETUGT: Opc = X86ISD::PCMPGT;
15864                     FlipSigns = true; break;
15865   case ISD::SETUGE: Swap = true;
15866   case ISD::SETULE: Opc = X86ISD::PCMPGT;
15867                     FlipSigns = true; Invert = true; break;
15868   }
15869
15870   // Special case: Use min/max operations for SETULE/SETUGE
15871   MVT VET = VT.getVectorElementType();
15872   bool hasMinMax =
15873        (Subtarget->hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32))
15874     || (Subtarget->hasSSE2()  && (VET == MVT::i8));
15875
15876   if (hasMinMax) {
15877     switch (SetCCOpcode) {
15878     default: break;
15879     case ISD::SETULE: Opc = X86ISD::UMIN; MinMax = true; break;
15880     case ISD::SETUGE: Opc = X86ISD::UMAX; MinMax = true; break;
15881     }
15882
15883     if (MinMax) { Swap = false; Invert = false; FlipSigns = false; }
15884   }
15885
15886   bool hasSubus = Subtarget->hasSSE2() && (VET == MVT::i8 || VET == MVT::i16);
15887   if (!MinMax && hasSubus) {
15888     // As another special case, use PSUBUS[BW] when it's profitable. E.g. for
15889     // Op0 u<= Op1:
15890     //   t = psubus Op0, Op1
15891     //   pcmpeq t, <0..0>
15892     switch (SetCCOpcode) {
15893     default: break;
15894     case ISD::SETULT: {
15895       // If the comparison is against a constant we can turn this into a
15896       // setule.  With psubus, setule does not require a swap.  This is
15897       // beneficial because the constant in the register is no longer
15898       // destructed as the destination so it can be hoisted out of a loop.
15899       // Only do this pre-AVX since vpcmp* is no longer destructive.
15900       if (Subtarget->hasAVX())
15901         break;
15902       SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG);
15903       if (ULEOp1.getNode()) {
15904         Op1 = ULEOp1;
15905         Subus = true; Invert = false; Swap = false;
15906       }
15907       break;
15908     }
15909     // Psubus is better than flip-sign because it requires no inversion.
15910     case ISD::SETUGE: Subus = true; Invert = false; Swap = true;  break;
15911     case ISD::SETULE: Subus = true; Invert = false; Swap = false; break;
15912     }
15913
15914     if (Subus) {
15915       Opc = X86ISD::SUBUS;
15916       FlipSigns = false;
15917     }
15918   }
15919
15920   if (Swap)
15921     std::swap(Op0, Op1);
15922
15923   // Check that the operation in question is available (most are plain SSE2,
15924   // but PCMPGTQ and PCMPEQQ have different requirements).
15925   if (VT == MVT::v2i64) {
15926     if (Opc == X86ISD::PCMPGT && !Subtarget->hasSSE42()) {
15927       assert(Subtarget->hasSSE2() && "Don't know how to lower!");
15928
15929       // First cast everything to the right type.
15930       Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0);
15931       Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1);
15932
15933       // Since SSE has no unsigned integer comparisons, we need to flip the sign
15934       // bits of the inputs before performing those operations. The lower
15935       // compare is always unsigned.
15936       SDValue SB;
15937       if (FlipSigns) {
15938         SB = DAG.getConstant(0x80000000U, MVT::v4i32);
15939       } else {
15940         SDValue Sign = DAG.getConstant(0x80000000U, MVT::i32);
15941         SDValue Zero = DAG.getConstant(0x00000000U, MVT::i32);
15942         SB = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
15943                          Sign, Zero, Sign, Zero);
15944       }
15945       Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB);
15946       Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB);
15947
15948       // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
15949       SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
15950       SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
15951
15952       // Create masks for only the low parts/high parts of the 64 bit integers.
15953       static const int MaskHi[] = { 1, 1, 3, 3 };
15954       static const int MaskLo[] = { 0, 0, 2, 2 };
15955       SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
15956       SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
15957       SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
15958
15959       SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
15960       Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
15961
15962       if (Invert)
15963         Result = DAG.getNOT(dl, Result, MVT::v4i32);
15964
15965       return DAG.getNode(ISD::BITCAST, dl, VT, Result);
15966     }
15967
15968     if (Opc == X86ISD::PCMPEQ && !Subtarget->hasSSE41()) {
15969       // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
15970       // pcmpeqd + pshufd + pand.
15971       assert(Subtarget->hasSSE2() && !FlipSigns && "Don't know how to lower!");
15972
15973       // First cast everything to the right type.
15974       Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0);
15975       Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1);
15976
15977       // Do the compare.
15978       SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
15979
15980       // Make sure the lower and upper halves are both all-ones.
15981       static const int Mask[] = { 1, 0, 3, 2 };
15982       SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
15983       Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
15984
15985       if (Invert)
15986         Result = DAG.getNOT(dl, Result, MVT::v4i32);
15987
15988       return DAG.getNode(ISD::BITCAST, dl, VT, Result);
15989     }
15990   }
15991
15992   // Since SSE has no unsigned integer comparisons, we need to flip the sign
15993   // bits of the inputs before performing those operations.
15994   if (FlipSigns) {
15995     EVT EltVT = VT.getVectorElementType();
15996     SDValue SB = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), VT);
15997     Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SB);
15998     Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SB);
15999   }
16000
16001   SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
16002
16003   // If the logical-not of the result is required, perform that now.
16004   if (Invert)
16005     Result = DAG.getNOT(dl, Result, VT);
16006
16007   if (MinMax)
16008     Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
16009
16010   if (Subus)
16011     Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
16012                          getZeroVector(VT, Subtarget, DAG, dl));
16013
16014   return Result;
16015 }
16016
16017 SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
16018
16019   MVT VT = Op.getSimpleValueType();
16020
16021   if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
16022
16023   assert(((!Subtarget->hasAVX512() && VT == MVT::i8) || (VT == MVT::i1))
16024          && "SetCC type must be 8-bit or 1-bit integer");
16025   SDValue Op0 = Op.getOperand(0);
16026   SDValue Op1 = Op.getOperand(1);
16027   SDLoc dl(Op);
16028   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
16029
16030   // Optimize to BT if possible.
16031   // Lower (X & (1 << N)) == 0 to BT(X, N).
16032   // Lower ((X >>u N) & 1) != 0 to BT(X, N).
16033   // Lower ((X >>s N) & 1) != 0 to BT(X, N).
16034   if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() &&
16035       Op1.getOpcode() == ISD::Constant &&
16036       cast<ConstantSDNode>(Op1)->isNullValue() &&
16037       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
16038     SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG);
16039     if (NewSetCC.getNode()) {
16040       if (VT == MVT::i1)
16041         return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewSetCC);
16042       return NewSetCC;
16043     }
16044   }
16045
16046   // Look for X == 0, X == 1, X != 0, or X != 1.  We can simplify some forms of
16047   // these.
16048   if (Op1.getOpcode() == ISD::Constant &&
16049       (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 ||
16050        cast<ConstantSDNode>(Op1)->isNullValue()) &&
16051       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
16052
16053     // If the input is a setcc, then reuse the input setcc or use a new one with
16054     // the inverted condition.
16055     if (Op0.getOpcode() == X86ISD::SETCC) {
16056       X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
16057       bool Invert = (CC == ISD::SETNE) ^
16058         cast<ConstantSDNode>(Op1)->isNullValue();
16059       if (!Invert)
16060         return Op0;
16061
16062       CCode = X86::GetOppositeBranchCondition(CCode);
16063       SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
16064                                   DAG.getConstant(CCode, MVT::i8),
16065                                   Op0.getOperand(1));
16066       if (VT == MVT::i1)
16067         return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
16068       return SetCC;
16069     }
16070   }
16071   if ((Op0.getValueType() == MVT::i1) && (Op1.getOpcode() == ISD::Constant) &&
16072       (cast<ConstantSDNode>(Op1)->getZExtValue() == 1) &&
16073       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
16074
16075     ISD::CondCode NewCC = ISD::getSetCCInverse(CC, true);
16076     return DAG.getSetCC(dl, VT, Op0, DAG.getConstant(0, MVT::i1), NewCC);
16077   }
16078
16079   bool isFP = Op1.getSimpleValueType().isFloatingPoint();
16080   unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG);
16081   if (X86CC == X86::COND_INVALID)
16082     return SDValue();
16083
16084   SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG);
16085   EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
16086   SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
16087                               DAG.getConstant(X86CC, MVT::i8), EFLAGS);
16088   if (VT == MVT::i1)
16089     return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
16090   return SetCC;
16091 }
16092
16093 // isX86LogicalCmp - Return true if opcode is a X86 logical comparison.
16094 static bool isX86LogicalCmp(SDValue Op) {
16095   unsigned Opc = Op.getNode()->getOpcode();
16096   if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
16097       Opc == X86ISD::SAHF)
16098     return true;
16099   if (Op.getResNo() == 1 &&
16100       (Opc == X86ISD::ADD ||
16101        Opc == X86ISD::SUB ||
16102        Opc == X86ISD::ADC ||
16103        Opc == X86ISD::SBB ||
16104        Opc == X86ISD::SMUL ||
16105        Opc == X86ISD::UMUL ||
16106        Opc == X86ISD::INC ||
16107        Opc == X86ISD::DEC ||
16108        Opc == X86ISD::OR ||
16109        Opc == X86ISD::XOR ||
16110        Opc == X86ISD::AND))
16111     return true;
16112
16113   if (Op.getResNo() == 2 && Opc == X86ISD::UMUL)
16114     return true;
16115
16116   return false;
16117 }
16118
16119 static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
16120   if (V.getOpcode() != ISD::TRUNCATE)
16121     return false;
16122
16123   SDValue VOp0 = V.getOperand(0);
16124   unsigned InBits = VOp0.getValueSizeInBits();
16125   unsigned Bits = V.getValueSizeInBits();
16126   return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
16127 }
16128
16129 SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
        // Custom lowering for a select node: operand 0 is the condition and
        // operands 1 / 2 are the two candidate values. Several special forms are
        // tried first (SSE mask blend for scalar FP, SETCC_CARRY tricks for 0 / -1
        // selects); the fallback is an X86ISD::CMOV fed by a flag-producing node.
        //
        // addTest stays true until an existing flag-producing node has been found
        // to drive the CMOV; if it is still true near the end, EmitTest is used to
        // produce flags from Cond with COND_NE.
16130   bool addTest = true;
16131   SDValue Cond  = Op.getOperand(0);
16132   SDValue Op1 = Op.getOperand(1);
16133   SDValue Op2 = Op.getOperand(2);
16134   SDLoc DL(Op);
16135   EVT VT = Op1.getValueType();
        // CC holds the X86 condition-code operand for the final CMOV; it is filled
        // in by whichever path below settles on a condition.
16136   SDValue CC;
16137
16138   // Lower fp selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
16139   // are available. Otherwise fp cmovs get lowered into a less efficient branch
16140   // sequence later on.
16141   if (Cond.getOpcode() == ISD::SETCC &&
16142       ((Subtarget->hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) ||
16143        (Subtarget->hasSSE1() && VT == MVT::f32)) &&
16144       VT == Cond.getOperand(0).getValueType() && Cond->hasOneUse()) {
16145     SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
16146     int SSECC = translateX86FSETCC(
16147         cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);
16148
          // NOTE(review): 8 looks like translateX86FSETCC's "no usable SSE
          // predicate" value -- confirm against that helper.
16149     if (SSECC != 8) {
16150       if (Subtarget->hasAVX512()) {
              // With AVX-512 the compare is built with an i1 result and fed
              // straight into X86ISD::SELECT.
16151         SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CondOp0, CondOp1,
16152                                   DAG.getConstant(SSECC, MVT::i8));
16153         return DAG.getNode(X86ISD::SELECT, DL, VT, Cmp, Op1, Op2);
16154       }
            // Otherwise the compare is built in the FP value type and combined
            // with the two select operands through FANDN / FAND / FOR.
16155       SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
16156                                 DAG.getConstant(SSECC, MVT::i8));
16157       SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
16158       SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
16159       return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
16160     }
16161   }
16162
        // Lower a generic setcc condition first so the code below can look at the
        // node it produced. An empty result leaves Cond unchanged.
16163   if (Cond.getOpcode() == ISD::SETCC) {
16164     SDValue NewCond = LowerSETCC(Cond, DAG);
16165     if (NewCond.getNode())
16166       Cond = NewCond;
16167   }
16168
16169   // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
16170   // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
16171   // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
16172   // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
16173   if (Cond.getOpcode() == X86ISD::SETCC &&
16174       Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
16175       isZero(Cond.getOperand(1).getOperand(1))) {
16176     SDValue Cmp = Cond.getOperand(1);
16177
16178     unsigned CondCode =cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
16179
16180     if ((isAllOnes(Op1) || isAllOnes(Op2)) &&
16181         (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
            // Y is the select operand that is not the all-ones one (Op1 when
            // Op2 is all-ones, otherwise Op2).
16182       SDValue Y = isAllOnes(Op2) ? Op1 : Op2;
16183
16184       SDValue CmpOp0 = Cmp.getOperand(0);
16185       // Apply further optimizations for special cases
16186       // (select (x != 0), -1, 0) -> neg & sbb
16187       // (select (x == 0), 0, -1) -> neg & sbb
16188       if (ConstantSDNode *YC = dyn_cast<ConstantSDNode>(Y))
16189         if (YC->isNullValue() &&
16190             (isAllOnes(Op1) == (CondCode == X86::COND_NE))) {
16191           SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
16192           SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs,
16193                                     DAG.getConstant(0, CmpOp0.getValueType()),
16194                                     CmpOp0);
                // Result 1 of the X86ISD::SUB (the i32 value in VTs) is what
                // SETCC_CARRY consumes.
16195           SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
16196                                     DAG.getConstant(X86::COND_B, MVT::i8),
16197                                     SDValue(Neg.getNode(), 1));
16198           return Res;
16199         }
16200
            // General form: compare x against 1 and turn the COND_B outcome into
            // a 0 / -1 value.
16201       Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
16202                         CmpOp0, DAG.getConstant(1, CmpOp0.getValueType()));
16203       Cmp = ConvertCmpIfNecessary(Cmp, DAG);
16204
16205       SDValue Res =   // Res = 0 or -1.
16206         DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
16207                     DAG.getConstant(X86::COND_B, MVT::i8), Cmp);
16208
16209       if (isAllOnes(Op1) != (CondCode == X86::COND_E))
16210         Res = DAG.getNOT(DL, Res, Res.getValueType());
16211
            // The OR with Y is skipped only when Op2 is a constant zero.
16212       ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2);
16213       if (!N2C || !N2C->isNullValue())
16214         Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
16215       return Res;
16216     }
16217   }
16218
16219   // Look past (and (setcc_carry (cmp ...)), 1).
16220   if (Cond.getOpcode() == ISD::AND &&
16221       Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
16222     ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
16223     if (C && C->getAPIntValue() == 1)
16224       Cond = Cond.getOperand(0);
16225   }
16226
16227   // If condition flag is set by a X86ISD::CMP, then use it as the condition
16228   // setting operand in place of the X86ISD::SETCC.
16229   unsigned CondOpcode = Cond.getOpcode();
16230   if (CondOpcode == X86ISD::SETCC ||
16231       CondOpcode == X86ISD::SETCC_CARRY) {
16232     CC = Cond.getOperand(0);
16233
16234     SDValue Cmp = Cond.getOperand(1);
16235     unsigned Opc = Cmp.getOpcode();
          // NOTE: this VT (simple type of the select result) shadows the outer
          // EVT VT declared at the top of the function.
16236     MVT VT = Op.getSimpleValueType();
16237
16238     bool IllegalFPCMov = false;
16239     if (VT.isFloatingPoint() && !VT.isVector() &&
16240         !isScalarFPTypeInSSEReg(VT))  // FPStack?
16241       IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
16242
16243     if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
16244         Opc == X86ISD::BT) { // FIXME
16245       Cond = Cmp;
16246       addTest = false;
16247     }
16248   } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
16249              CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
16250              ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
16251               Cond.getOperand(0).getValueType() != MVT::i8)) {
          // The condition is the overflow result of an arithmetic-with-overflow
          // node: rebuild it as the matching X86ISD node and use that node's i32
          // result together with the condition code chosen below.
16252     SDValue LHS = Cond.getOperand(0);
16253     SDValue RHS = Cond.getOperand(1);
16254     unsigned X86Opcode;
16255     unsigned X86Cond;
16256     SDVTList VTs;
16257     switch (CondOpcode) {
16258     case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
16259     case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
16260     case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
16261     case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
16262     case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
16263     case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
16264     default: llvm_unreachable("unexpected overflowing operator");
16265     }
          // The UMUL node is built with three results (two of the operand type
          // plus an i32), so its i32 result is number 2 instead of 1.
16266     if (CondOpcode == ISD::UMULO)
16267       VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
16268                           MVT::i32);
16269     else
16270       VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
16271
16272     SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS);
16273
16274     if (CondOpcode == ISD::UMULO)
16275       Cond = X86Op.getValue(2);
16276     else
16277       Cond = X86Op.getValue(1);
16278
16279     CC = DAG.getConstant(X86Cond, MVT::i8);
16280     addTest = false;
16281   }
16282
16283   if (addTest) {
16284     // Look past the truncate if the high bits are known zero.
16285     if (isTruncWithZeroHighBitsInput(Cond, DAG))
16286         Cond = Cond.getOperand(0);
16287
16288     // We know the result of AND is compared against zero. Try to match
16289     // it to BT.
16290     if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
16291       SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG);
16292       if (NewSetCC.getNode()) {
16293         CC = NewSetCC.getOperand(0);
16294         Cond = NewSetCC.getOperand(1);
16295         addTest = false;
16296       }
16297     }
16298   }
16299
        // Nothing usable was found: test Cond itself for "not equal".
16300   if (addTest) {
16301     CC = DAG.getConstant(X86::COND_NE, MVT::i8);
16302     Cond = EmitTest(Cond, X86::COND_NE, DL, DAG);
16303   }
16304
16305   // a <  b ? -1 :  0 -> RES = ~setcc_carry
16306   // a <  b ?  0 : -1 -> RES = setcc_carry
16307   // a >= b ? -1 :  0 -> RES = setcc_carry
16308   // a >= b ?  0 : -1 -> RES = ~setcc_carry
16309   if (Cond.getOpcode() == X86ISD::SUB) {
16310     Cond = ConvertCmpIfNecessary(Cond, DAG);
16311     unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
16312
16313     if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
16314         (isAllOnes(Op1) || isAllOnes(Op2)) && (isZero(Op1) || isZero(Op2))) {
16315       SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
16316                                 DAG.getConstant(X86::COND_B, MVT::i8), Cond);
16317       if (isAllOnes(Op1) != (CondCode == X86::COND_B))
16318         return DAG.getNOT(DL, Res, Res.getValueType());
16319       return Res;
16320     }
16321   }
16322
16323   // X86 doesn't have an i8 cmov. If both operands are the result of a truncate
16324   // widen the cmov and push the truncate through. This avoids introducing a new
16325   // branch during isel and doesn't add any extensions.
16326   if (Op.getValueType() == MVT::i8 &&
16327       Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
16328     SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
16329     if (T1.getValueType() == T2.getValueType() &&
16330         // Blacklist CopyFromReg to avoid partial register stalls.
16331         T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
16332       SDVTList VTs = DAG.getVTList(T1.getValueType(), MVT::Glue);
16333       SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VTs, T2, T1, CC, Cond);
16334       return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
16335     }
16336   }
16337
16338   // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
16339   // condition is true.
16340   SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
16341   SDValue Ops[] = { Op2, Op1, CC, Cond };
16342   return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops);
16343 }
16344
16345 static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, const X86Subtarget *Subtarget,
16346                                        SelectionDAG &DAG) {
16347   MVT VT = Op->getSimpleValueType(0);
16348   SDValue In = Op->getOperand(0);
16349   MVT InVT = In.getSimpleValueType();
16350   MVT VTElt = VT.getVectorElementType();
16351   MVT InVTElt = InVT.getVectorElementType();
16352   SDLoc dl(Op);
16353
16354   // SKX processor
16355   if ((InVTElt == MVT::i1) &&
16356       (((Subtarget->hasBWI() && Subtarget->hasVLX() &&
16357         VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() <= 16)) ||
16358
16359        ((Subtarget->hasBWI() && VT.is512BitVector() &&
16360         VTElt.getSizeInBits() <= 16)) ||
16361
16362        ((Subtarget->hasDQI() && Subtarget->hasVLX() &&
16363         VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() >= 32)) ||
16364
16365        ((Subtarget->hasDQI() && VT.is512BitVector() &&
16366         VTElt.getSizeInBits() >= 32))))
16367     return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
16368
16369   unsigned int NumElts = VT.getVectorNumElements();
16370
16371   if (NumElts != 8 && NumElts != 16)
16372     return SDValue();
16373
16374   if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1) {
16375     if (In.getOpcode() == X86ISD::VSEXT || In.getOpcode() == X86ISD::VZEXT)
16376       return DAG.getNode(In.getOpcode(), dl, VT, In.getOperand(0));
16377     return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
16378   }
16379
16380   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16381   assert (InVT.getVectorElementType() == MVT::i1 && "Unexpected vector type");
16382
16383   MVT ExtVT = (NumElts == 8) ? MVT::v8i64 : MVT::v16i32;
16384   Constant *C = ConstantInt::get(*DAG.getContext(),
16385     APInt::getAllOnesValue(ExtVT.getScalarType().getSizeInBits()));
16386
16387   SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
16388   unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
16389   SDValue Ld = DAG.getLoad(ExtVT.getScalarType(), dl, DAG.getEntryNode(), CP,
16390                           MachinePointerInfo::getConstantPool(),
16391                           false, false, false, Alignment);
16392   SDValue Brcst = DAG.getNode(X86ISD::VBROADCASTM, dl, ExtVT, In, Ld);
16393   if (VT.is512BitVector())
16394     return Brcst;
16395   return DAG.getNode(X86ISD::VTRUNC, dl, VT, Brcst);
16396 }
16397
16398 static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
16399                                 SelectionDAG &DAG) {
16400   MVT VT = Op->getSimpleValueType(0);
16401   SDValue In = Op->getOperand(0);
16402   MVT InVT = In.getSimpleValueType();
16403   SDLoc dl(Op);
16404
16405   if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
16406     return LowerSIGN_EXTEND_AVX512(Op, Subtarget, DAG);
16407
16408   if ((VT != MVT::v4i64 || InVT != MVT::v4i32) &&
16409       (VT != MVT::v8i32 || InVT != MVT::v8i16) &&
16410       (VT != MVT::v16i16 || InVT != MVT::v16i8))
16411     return SDValue();
16412
16413   if (Subtarget->hasInt256())
16414     return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
16415
16416   // Optimize vectors in AVX mode
16417   // Sign extend  v8i16 to v8i32 and
16418   //              v4i32 to v4i64
16419   //
16420   // Divide input vector into two parts
16421   // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1}
16422   // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
16423   // concat the vectors to original VT
16424
16425   unsigned NumElems = InVT.getVectorNumElements();
16426   SDValue Undef = DAG.getUNDEF(InVT);
16427
16428   SmallVector<int,8> ShufMask1(NumElems, -1);
16429   for (unsigned i = 0; i != NumElems/2; ++i)
16430     ShufMask1[i] = i;
16431
16432   SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask1[0]);
16433
16434   SmallVector<int,8> ShufMask2(NumElems, -1);
16435   for (unsigned i = 0; i != NumElems/2; ++i)
16436     ShufMask2[i] = i + NumElems/2;
16437
16438   SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask2[0]);
16439
16440   MVT HalfVT = MVT::getVectorVT(VT.getScalarType(),
16441                                 VT.getVectorNumElements()/2);
16442
16443   OpLo = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpLo);
16444   OpHi = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpHi);
16445
16446   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
16447 }
16448
16449 // Lower vector extended loads using a shuffle. If SSSE3 is not available we
16450 // may emit an illegal shuffle but the expansion is still better than scalar
16451 // code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise
16452 // we'll emit a shuffle and a arithmetic shift.
16453 // FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
16454 // TODO: It is possible to support ZExt by zeroing the undef values during
16455 // the shuffle phase or after the shuffle.
16456 static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget,
16457                                  SelectionDAG &DAG) {
16458   MVT RegVT = Op.getSimpleValueType();
16459   assert(RegVT.isVector() && "We only custom lower vector sext loads.");
16460   assert(RegVT.isInteger() &&
16461          "We only custom lower integer vector sext loads.");
16462
16463   // Nothing useful we can do without SSE2 shuffles.
16464   assert(Subtarget->hasSSE2() && "We only custom lower sext loads with SSE2.");
16465
16466   LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
16467   SDLoc dl(Ld);
16468   EVT MemVT = Ld->getMemoryVT();
16469   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16470   unsigned RegSz = RegVT.getSizeInBits();
16471
16472   ISD::LoadExtType Ext = Ld->getExtensionType();
16473
16474   assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)
16475          && "Only anyext and sext are currently implemented.");
16476   assert(MemVT != RegVT && "Cannot extend to the same type");
16477   assert(MemVT.isVector() && "Must load a vector from memory");
16478
16479   unsigned NumElems = RegVT.getVectorNumElements();
16480   unsigned MemSz = MemVT.getSizeInBits();
16481   assert(RegSz > MemSz && "Register size must be greater than the mem size");
16482
16483   if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget->hasInt256()) {
16484     // The only way in which we have a legal 256-bit vector result but not the
16485     // integer 256-bit operations needed to directly lower a sextload is if we
16486     // have AVX1 but not AVX2. In that case, we can always emit a sextload to
16487     // a 128-bit vector and a normal sign_extend to 256-bits that should get
16488     // correctly legalized. We do this late to allow the canonical form of
16489     // sextload to persist throughout the rest of the DAG combiner -- it wants
16490     // to fold together any extensions it can, and so will fuse a sign_extend
16491     // of an sextload into a sextload targeting a wider value.
16492     SDValue Load;
16493     if (MemSz == 128) {
16494       // Just switch this to a normal load.
16495       assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, "
16496                                        "it must be a legal 128-bit vector "
16497                                        "type!");
16498       Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(),
16499                   Ld->getPointerInfo(), Ld->isVolatile(), Ld->isNonTemporal(),
16500                   Ld->isInvariant(), Ld->getAlignment());
16501     } else {
16502       assert(MemSz < 128 &&
16503              "Can't extend a type wider than 128 bits to a 256 bit vector!");
16504       // Do an sext load to a 128-bit vector type. We want to use the same
16505       // number of elements, but elements half as wide. This will end up being
16506       // recursively lowered by this routine, but will succeed as we definitely
16507       // have all the necessary features if we're using AVX1.
16508       EVT HalfEltVT =
16509           EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2);
16510       EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems);
16511       Load =
16512           DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(),
16513                          Ld->getPointerInfo(), MemVT, Ld->isVolatile(),
16514                          Ld->isNonTemporal(), Ld->isInvariant(),
16515                          Ld->getAlignment());
16516     }
16517
16518     // Replace chain users with the new chain.
16519     assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
16520     DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
16521
16522     // Finally, do a normal sign-extend to the desired register.
16523     return DAG.getSExtOrTrunc(Load, dl, RegVT);
16524   }
16525
16526   // All sizes must be a power of two.
16527   assert(isPowerOf2_32(RegSz * MemSz * NumElems) &&
16528          "Non-power-of-two elements are not custom lowered!");
16529
16530   // Attempt to load the original value using scalar loads.
16531   // Find the largest scalar type that divides the total loaded size.
16532   MVT SclrLoadTy = MVT::i8;
16533   for (MVT Tp : MVT::integer_valuetypes()) {
16534     if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
16535       SclrLoadTy = Tp;
16536     }
16537   }
16538
16539   // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
16540   if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
16541       (64 <= MemSz))
16542     SclrLoadTy = MVT::f64;
16543
16544   // Calculate the number of scalar loads that we need to perform
16545   // in order to load our vector from memory.
16546   unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
16547
16548   assert((Ext != ISD::SEXTLOAD || NumLoads == 1) &&
16549          "Can only lower sext loads with a single scalar load!");
16550
16551   unsigned loadRegZize = RegSz;
16552   if (Ext == ISD::SEXTLOAD && RegSz == 256)
16553     loadRegZize /= 2;
16554
16555   // Represent our vector as a sequence of elements which are the
16556   // largest scalar that we can load.
16557   EVT LoadUnitVecVT = EVT::getVectorVT(
16558       *DAG.getContext(), SclrLoadTy, loadRegZize / SclrLoadTy.getSizeInBits());
16559
16560   // Represent the data using the same element type that is stored in
16561   // memory. In practice, we ''widen'' MemVT.
16562   EVT WideVecVT =
16563       EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
16564                        loadRegZize / MemVT.getScalarType().getSizeInBits());
16565
16566   assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
16567          "Invalid vector type");
16568
16569   // We can't shuffle using an illegal type.
16570   assert(TLI.isTypeLegal(WideVecVT) &&
16571          "We only lower types that form legal widened vector types");
16572
16573   SmallVector<SDValue, 8> Chains;
16574   SDValue Ptr = Ld->getBasePtr();
16575   SDValue Increment =
16576       DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, TLI.getPointerTy());
16577   SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
16578
16579   for (unsigned i = 0; i < NumLoads; ++i) {
16580     // Perform a single load.
16581     SDValue ScalarLoad =
16582         DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
16583                     Ld->isVolatile(), Ld->isNonTemporal(), Ld->isInvariant(),
16584                     Ld->getAlignment());
16585     Chains.push_back(ScalarLoad.getValue(1));
16586     // Create the first element type using SCALAR_TO_VECTOR in order to avoid
16587     // another round of DAGCombining.
16588     if (i == 0)
16589       Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
16590     else
16591       Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
16592                         ScalarLoad, DAG.getIntPtrConstant(i));
16593
16594     Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
16595   }
16596
16597   SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
16598
16599   // Bitcast the loaded value to a vector of the original element type, in
16600   // the size of the target vector type.
16601   SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Res);
16602   unsigned SizeRatio = RegSz / MemSz;
16603
16604   if (Ext == ISD::SEXTLOAD) {
16605     // If we have SSE4.1, we can directly emit a VSEXT node.
16606     if (Subtarget->hasSSE41()) {
16607       SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec);
16608       DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
16609       return Sext;
16610     }
16611
16612     // Otherwise we'll shuffle the small elements in the high bits of the
16613     // larger type and perform an arithmetic shift. If the shift is not legal
16614     // it's better to scalarize.
16615     assert(TLI.isOperationLegalOrCustom(ISD::SRA, RegVT) &&
16616            "We can't implement a sext load without an arithmetic right shift!");
16617
16618     // Redistribute the loaded elements into the different locations.
16619     SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
16620     for (unsigned i = 0; i != NumElems; ++i)
16621       ShuffleVec[i * SizeRatio + SizeRatio - 1] = i;
16622
16623     SDValue Shuff = DAG.getVectorShuffle(
16624         WideVecVT, dl, SlicedVec, DAG.getUNDEF(WideVecVT), &ShuffleVec[0]);
16625
16626     Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
16627
16628     // Build the arithmetic shift.
16629     unsigned Amt = RegVT.getVectorElementType().getSizeInBits() -
16630                    MemVT.getVectorElementType().getSizeInBits();
16631     Shuff =
16632         DAG.getNode(ISD::SRA, dl, RegVT, Shuff, DAG.getConstant(Amt, RegVT));
16633
16634     DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
16635     return Shuff;
16636   }
16637
16638   // Redistribute the loaded elements into the different locations.
16639   SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
16640   for (unsigned i = 0; i != NumElems; ++i)
16641     ShuffleVec[i * SizeRatio] = i;
16642
16643   SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
16644                                        DAG.getUNDEF(WideVecVT), &ShuffleVec[0]);
16645
16646   // Bitcast to the requested type.
16647   Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
16648   DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
16649   return Shuff;
16650 }
16651
16652 // isAndOrOfSingleUseSetCCs - Return true if node is an ISD::AND or
16653 // ISD::OR of two X86ISD::SETCC nodes each of which has no other use apart
16654 // from the AND / OR.
16655 static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
16656   Opc = Op.getOpcode();
16657   if (Opc != ISD::OR && Opc != ISD::AND)
16658     return false;
16659   return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
16660           Op.getOperand(0).hasOneUse() &&
16661           Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
16662           Op.getOperand(1).hasOneUse());
16663 }
16664
16665 // isXor1OfSetCC - Return true if node is an ISD::XOR of a X86ISD::SETCC and
16666 // 1 and that the SETCC node has a single use.
16667 static bool isXor1OfSetCC(SDValue Op) {
16668   if (Op.getOpcode() != ISD::XOR)
16669     return false;
16670   ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
16671   if (N1C && N1C->getAPIntValue() == 1) {
16672     return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
16673       Op.getOperand(0).hasOneUse();
16674   }
16675   return false;
16676 }
16677
16678 SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
16679   bool addTest = true;
16680   SDValue Chain = Op.getOperand(0);
16681   SDValue Cond  = Op.getOperand(1);
16682   SDValue Dest  = Op.getOperand(2);
16683   SDLoc dl(Op);
16684   SDValue CC;
16685   bool Inverted = false;
16686
16687   if (Cond.getOpcode() == ISD::SETCC) {
16688     // Check for setcc([su]{add,sub,mul}o == 0).
16689     if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
16690         isa<ConstantSDNode>(Cond.getOperand(1)) &&
16691         cast<ConstantSDNode>(Cond.getOperand(1))->isNullValue() &&
16692         Cond.getOperand(0).getResNo() == 1 &&
16693         (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
16694          Cond.getOperand(0).getOpcode() == ISD::UADDO ||
16695          Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
16696          Cond.getOperand(0).getOpcode() == ISD::USUBO ||
16697          Cond.getOperand(0).getOpcode() == ISD::SMULO ||
16698          Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
16699       Inverted = true;
16700       Cond = Cond.getOperand(0);
16701     } else {
16702       SDValue NewCond = LowerSETCC(Cond, DAG);
16703       if (NewCond.getNode())
16704         Cond = NewCond;
16705     }
16706   }
16707 #if 0
16708   // FIXME: LowerXALUO doesn't handle these!!
16709   else if (Cond.getOpcode() == X86ISD::ADD  ||
16710            Cond.getOpcode() == X86ISD::SUB  ||
16711            Cond.getOpcode() == X86ISD::SMUL ||
16712            Cond.getOpcode() == X86ISD::UMUL)
16713     Cond = LowerXALUO(Cond, DAG);
16714 #endif
16715
16716   // Look past (and (setcc_carry (cmp ...)), 1).
16717   if (Cond.getOpcode() == ISD::AND &&
16718       Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
16719     ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
16720     if (C && C->getAPIntValue() == 1)
16721       Cond = Cond.getOperand(0);
16722   }
16723
16724   // If condition flag is set by a X86ISD::CMP, then use it as the condition
16725   // setting operand in place of the X86ISD::SETCC.
16726   unsigned CondOpcode = Cond.getOpcode();
16727   if (CondOpcode == X86ISD::SETCC ||
16728       CondOpcode == X86ISD::SETCC_CARRY) {
16729     CC = Cond.getOperand(0);
16730
16731     SDValue Cmp = Cond.getOperand(1);
16732     unsigned Opc = Cmp.getOpcode();
16733     // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
16734     if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
16735       Cond = Cmp;
16736       addTest = false;
16737     } else {
16738       switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
16739       default: break;
16740       case X86::COND_O:
16741       case X86::COND_B:
16742         // These can only come from an arithmetic instruction with overflow,
16743         // e.g. SADDO, UADDO.
16744         Cond = Cond.getNode()->getOperand(1);
16745         addTest = false;
16746         break;
16747       }
16748     }
16749   }
16750   CondOpcode = Cond.getOpcode();
16751   if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
16752       CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
16753       ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
16754        Cond.getOperand(0).getValueType() != MVT::i8)) {
16755     SDValue LHS = Cond.getOperand(0);
16756     SDValue RHS = Cond.getOperand(1);
16757     unsigned X86Opcode;
16758     unsigned X86Cond;
16759     SDVTList VTs;
16760     // Keep this in sync with LowerXALUO, otherwise we might create redundant
16761     // instructions that can't be removed afterwards (i.e. X86ISD::ADD and
16762     // X86ISD::INC).
16763     switch (CondOpcode) {
16764     case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
16765     case ISD::SADDO:
16766       if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
16767         if (C->isOne()) {
16768           X86Opcode = X86ISD::INC; X86Cond = X86::COND_O;
16769           break;
16770         }
16771       X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
16772     case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
16773     case ISD::SSUBO:
16774       if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
16775         if (C->isOne()) {
16776           X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O;
16777           break;
16778         }
16779       X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
16780     case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
16781     case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
16782     default: llvm_unreachable("unexpected overflowing operator");
16783     }
16784     if (Inverted)
16785       X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond);
16786     if (CondOpcode == ISD::UMULO)
16787       VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
16788                           MVT::i32);
16789     else
16790       VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
16791
16792     SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS);
16793
16794     if (CondOpcode == ISD::UMULO)
16795       Cond = X86Op.getValue(2);
16796     else
16797       Cond = X86Op.getValue(1);
16798
16799     CC = DAG.getConstant(X86Cond, MVT::i8);
16800     addTest = false;
16801   } else {
16802     unsigned CondOpc;
16803     if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
16804       SDValue Cmp = Cond.getOperand(0).getOperand(1);
16805       if (CondOpc == ISD::OR) {
16806         // Also, recognize the pattern generated by an FCMP_UNE. We can emit
16807         // two branches instead of an explicit OR instruction with a
16808         // separate test.
16809         if (Cmp == Cond.getOperand(1).getOperand(1) &&
16810             isX86LogicalCmp(Cmp)) {
16811           CC = Cond.getOperand(0).getOperand(0);
16812           Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16813                               Chain, Dest, CC, Cmp);
16814           CC = Cond.getOperand(1).getOperand(0);
16815           Cond = Cmp;
16816           addTest = false;
16817         }
16818       } else { // ISD::AND
16819         // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
16820         // two branches instead of an explicit AND instruction with a
16821         // separate test. However, we only do this if this block doesn't
16822         // have a fall-through edge, because this requires an explicit
16823         // jmp when the condition is false.
16824         if (Cmp == Cond.getOperand(1).getOperand(1) &&
16825             isX86LogicalCmp(Cmp) &&
16826             Op.getNode()->hasOneUse()) {
16827           X86::CondCode CCode =
16828             (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
16829           CCode = X86::GetOppositeBranchCondition(CCode);
16830           CC = DAG.getConstant(CCode, MVT::i8);
16831           SDNode *User = *Op.getNode()->use_begin();
16832           // Look for an unconditional branch following this conditional branch.
16833           // We need this because we need to reverse the successors in order
16834           // to implement FCMP_OEQ.
16835           if (User->getOpcode() == ISD::BR) {
16836             SDValue FalseBB = User->getOperand(1);
16837             SDNode *NewBR =
16838               DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
16839             assert(NewBR == User);
16840             (void)NewBR;
16841             Dest = FalseBB;
16842
16843             Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16844                                 Chain, Dest, CC, Cmp);
16845             X86::CondCode CCode =
16846               (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
16847             CCode = X86::GetOppositeBranchCondition(CCode);
16848             CC = DAG.getConstant(CCode, MVT::i8);
16849             Cond = Cmp;
16850             addTest = false;
16851           }
16852         }
16853       }
16854     } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
16855       // Recognize xorb (setcc), 1 patterns. The xor inverts the condition.
16856       // It should be transformed by the DAG combiner except when the condition
16857       // is set by an arithmetic-with-overflow node.
16858       X86::CondCode CCode =
16859         (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
16860       CCode = X86::GetOppositeBranchCondition(CCode);
16861       CC = DAG.getConstant(CCode, MVT::i8);
16862       Cond = Cond.getOperand(0).getOperand(1);
16863       addTest = false;
16864     } else if (Cond.getOpcode() == ISD::SETCC &&
16865                cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
16866       // For FCMP_OEQ, we can emit
16867       // two branches instead of an explicit AND instruction with a
16868       // separate test. However, we only do this if this block doesn't
16869       // have a fall-through edge, because this requires an explicit
16870       // jmp when the condition is false.
16871       if (Op.getNode()->hasOneUse()) {
16872         SDNode *User = *Op.getNode()->use_begin();
16873         // Look for an unconditional branch following this conditional branch.
16874         // We need this because we need to reverse the successors in order
16875         // to implement FCMP_OEQ.
16876         if (User->getOpcode() == ISD::BR) {
16877           SDValue FalseBB = User->getOperand(1);
16878           SDNode *NewBR =
16879             DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
16880           assert(NewBR == User);
16881           (void)NewBR;
16882           Dest = FalseBB;
16883
16884           SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
16885                                     Cond.getOperand(0), Cond.getOperand(1));
16886           Cmp = ConvertCmpIfNecessary(Cmp, DAG);
16887           CC = DAG.getConstant(X86::COND_NE, MVT::i8);
16888           Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16889                               Chain, Dest, CC, Cmp);
16890           CC = DAG.getConstant(X86::COND_P, MVT::i8);
16891           Cond = Cmp;
16892           addTest = false;
16893         }
16894       }
16895     } else if (Cond.getOpcode() == ISD::SETCC &&
16896                cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
16897       // For FCMP_UNE, we can emit
16898       // two branches instead of an explicit OR instruction with a
16899       // separate test. However, we only do this if this block doesn't
16900       // have a fall-through edge, because this requires an explicit
16901       // jmp when the condition is false.
16902       if (Op.getNode()->hasOneUse()) {
16903         SDNode *User = *Op.getNode()->use_begin();
16904         // Look for an unconditional branch following this conditional branch.
16905         // We need this because we need to reverse the successors in order
16906         // to implement FCMP_UNE.
16907         if (User->getOpcode() == ISD::BR) {
16908           SDValue FalseBB = User->getOperand(1);
16909           SDNode *NewBR =
16910             DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
16911           assert(NewBR == User);
16912           (void)NewBR;
16913
16914           SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
16915                                     Cond.getOperand(0), Cond.getOperand(1));
16916           Cmp = ConvertCmpIfNecessary(Cmp, DAG);
16917           CC = DAG.getConstant(X86::COND_NE, MVT::i8);
16918           Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16919                               Chain, Dest, CC, Cmp);
16920           CC = DAG.getConstant(X86::COND_NP, MVT::i8);
16921           Cond = Cmp;
16922           addTest = false;
16923           Dest = FalseBB;
16924         }
16925       }
16926     }
16927   }
16928
16929   if (addTest) {
16930     // Look past the truncate if the high bits are known zero.
16931     if (isTruncWithZeroHighBitsInput(Cond, DAG))
16932         Cond = Cond.getOperand(0);
16933
16934     // We know the result of AND is compared against zero. Try to match
16935     // it to BT.
16936     if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
16937       SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG);
16938       if (NewSetCC.getNode()) {
16939         CC = NewSetCC.getOperand(0);
16940         Cond = NewSetCC.getOperand(1);
16941         addTest = false;
16942       }
16943     }
16944   }
16945
16946   if (addTest) {
16947     X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
16948     CC = DAG.getConstant(X86Cond, MVT::i8);
16949     Cond = EmitTest(Cond, X86Cond, dl, DAG);
16950   }
16951   Cond = ConvertCmpIfNecessary(Cond, DAG);
16952   return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16953                      Chain, Dest, CC, Cond);
16954 }
16955
16956 // Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
16957 // Calls to _alloca are needed to probe the stack when allocating more than 4k
16958 // bytes in one go. Touching the stack at 4K increments is necessary to ensure
16959 // that the guard pages used by the OS virtual memory manager are allocated in
16960 // correct sequence.
      //
      // Op's operands are read as: 0 = chain, 1 = allocation size, 2 = alignment
      // (a constant). The result is always a merge of two values: the address of
      // the allocated block and the output chain.
      //
      // Three strategies are selected below:
      //   - neither (Windows and non-MachO) nor split-stack: adjust the stack
      //     pointer inline, bracketed by CALLSEQ_START / CALLSEQ_END;
      //   - split-stack: emit an X86ISD::SEG_ALLOCA node;
      //   - otherwise: pass the size in RAX/EAX to an X86ISD::WIN_ALLOCA node and
      //     read the stack pointer back afterwards.
16961 SDValue
16962 X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
16963                                            SelectionDAG &DAG) const {
16964   MachineFunction &MF = DAG.getMachineFunction();
16965   bool SplitStack = MF.shouldSplitStack();
        // "Lower" means a special node (WIN_ALLOCA or SEG_ALLOCA) is required
        // instead of the plain inline stack-pointer adjustment.
16966   bool Lower = (Subtarget->isOSWindows() && !Subtarget->isTargetMachO()) ||
16967                SplitStack;
16968   SDLoc dl(Op);
16969
16970   if (!Lower) {
          // Inline expansion: new SP = SP - Size, re-aligned if the request is
          // stricter than the frame's stack alignment.
16971     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16972     SDNode* Node = Op.getNode();
16973
16974     unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
16975     assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
16976         " not tell us which reg is the stack pointer!");
16977     EVT VT = Node->getValueType(0);
          // Tmp1 / Tmp2 start out as the two results of Node and are later
          // reused for the computed address and the final chain. Tmp3 is the
          // alignment operand.
16978     SDValue Tmp1 = SDValue(Node, 0);
16979     SDValue Tmp2 = SDValue(Node, 1);
16980     SDValue Tmp3 = Node->getOperand(2);
16981     SDValue Chain = Tmp1.getOperand(0);
16982
16983     // Chain the dynamic stack allocation so that it doesn't modify the stack
16984     // pointer when other instructions are using the stack.
16985     Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, true),
16986         SDLoc(Node));
16987
16988     SDValue Size = Tmp2.getOperand(1);
16989     SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
16990     Chain = SP.getValue(1);
16991     unsigned Align = cast<ConstantSDNode>(Tmp3)->getZExtValue();
16992     const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
16993     unsigned StackAlign = TFI.getStackAlignment();
16994     Tmp1 = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
          // Only mask when the requested alignment exceeds the stack alignment.
          // NOTE(review): the -(uint64_t)Align mask presumably relies on Align
          // being a power of two - confirm against callers.
16995     if (Align > StackAlign)
16996       Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1,
16997           DAG.getConstant(-(uint64_t)Align, VT));
16998     Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain
16999
17000     Tmp2 = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, true),
17001         DAG.getIntPtrConstant(0, true), SDValue(),
17002         SDLoc(Node));
17003
          // Result 0 is the new stack pointer value, result 1 the chain leaving
          // CALLSEQ_END.
17004     SDValue Ops[2] = { Tmp1, Tmp2 };
17005     return DAG.getMergeValues(Ops, dl);
17006   }
17007
17008   // Get the inputs.
17009   SDValue Chain = Op.getOperand(0);
17010   SDValue Size  = Op.getOperand(1);
17011   unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
17012   EVT VT = Op.getNode()->getValueType(0);
17013
17014   bool Is64Bit = Subtarget->is64Bit();
17015   EVT SPTy = getPointerTy();
17016
17017   if (SplitStack) {
17018     MachineRegisterInfo &MRI = MF.getRegInfo();
17019
17020     if (Is64Bit) {
17021       // The 64 bit implementation of segmented stacks needs to clobber both r10
17022       // r11. This makes it impossible to use it along with nested parameters.
17023       const Function *F = MF.getFunction();
17024
            // Abort compilation if any argument carries the 'nest' attribute.
17025       for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
17026            I != E; ++I)
17027         if (I->hasNestAttr())
17028           report_fatal_error("Cannot use segmented stacks with functions that "
17029                              "have nested arguments.");
17030     }
17031
          // Hand the size to SEG_ALLOCA through a fresh pointer-class virtual
          // register. The alignment operand is not used on this path.
17032     const TargetRegisterClass *AddrRegClass =
17033       getRegClassFor(getPointerTy());
17034     unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
17035     Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
17036     SDValue Value = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
17037                                 DAG.getRegister(Vreg, SPTy));
17038     SDValue Ops1[2] = { Value, Chain };
17039     return DAG.getMergeValues(Ops1, dl);
17040   } else {
17041     SDValue Flag;
          // The size is passed in RAX for 64-bit LP64 targets, EAX otherwise.
17042     const unsigned Reg = (Subtarget->isTarget64BitLP64() ? X86::RAX : X86::EAX);
17043
          // The register copy is glued to WIN_ALLOCA through Flag.
17044     Chain = DAG.getCopyToReg(Chain, dl, Reg, Size, Flag);
17045     Flag = Chain.getValue(1);
17046     SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
17047
17048     Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag);
17049
          // The allocated address is the stack pointer as read after WIN_ALLOCA.
17050     const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
17051     unsigned SPReg = RegInfo->getStackRegister();
17052     SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
17053     Chain = SP.getValue(1);
17054
          // Any non-zero alignment re-aligns SP and writes it back. Unlike the
          // inline path there is no comparison against the stack alignment.
17055     if (Align) {
17056       SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
17057                        DAG.getConstant(-(uint64_t)Align, VT));
17058       Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
17059     }
17060
17061     SDValue Ops1[2] = { SP, Chain };
17062     return DAG.getMergeValues(Ops1, dl);
17063   }
17064 }
17065
17066 SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
        // Lowers a va_start node. Operand 0 is the chain, operand 1 the address
        // of the va_list object and operand 2 the source value used for the
        // memory operand info of the emitted stores.
        //
        // On 32-bit targets and on Win64 a single pointer is stored. On other
        // 64-bit targets the four fields of __va_list_tag are stored at byte
        // offsets 0, 4, 8 and 16, and the stores are joined with a TokenFactor.
17067   MachineFunction &MF = DAG.getMachineFunction();
17068   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
17069
17070   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
17071   SDLoc DL(Op);
17072
17073   if (!Subtarget->is64Bit() || Subtarget->isTargetWin64()) {
17074     // vastart just stores the address of the VarArgsFrameIndex slot into the
17075     // memory location argument.
17076     SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
17077                                    getPointerTy());
17078     return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
17079                         MachinePointerInfo(SV), false, false, 0);
17080   }
17081
17082   // __va_list_tag:
17083   //   gp_offset         (0 - 6 * 8)
17084   //   fp_offset         (48 - 48 + 8 * 16)
17085   //   overflow_arg_area (point to parameters coming in memory).
17086   //   reg_save_area
        //
        // Every store below takes the incoming chain (operand 0) directly; the
        // stores are not chained to each other. FIN is the running address of
        // the field being written and is advanced cumulatively.
17087   SmallVector<SDValue, 8> MemOps;
17088   SDValue FIN = Op.getOperand(1);
17089   // Store gp_offset
        // i32 at offset 0.
17090   SDValue Store = DAG.getStore(Op.getOperand(0), DL,
17091                                DAG.getConstant(FuncInfo->getVarArgsGPOffset(),
17092                                                MVT::i32),
17093                                FIN, MachinePointerInfo(SV), false, false, 0);
17094   MemOps.push_back(Store);
17095
17096   // Store fp_offset
        // i32 at offset 4.
17097   FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
17098                     FIN, DAG.getIntPtrConstant(4));
17099   Store = DAG.getStore(Op.getOperand(0), DL,
17100                        DAG.getConstant(FuncInfo->getVarArgsFPOffset(),
17101                                        MVT::i32),
17102                        FIN, MachinePointerInfo(SV, 4), false, false, 0);
17103   MemOps.push_back(Store);
17104
17105   // Store ptr to overflow_arg_area
        // Offset 8; the stored value is the address of the VarArgsFrameIndex slot.
17106   FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
17107                     FIN, DAG.getIntPtrConstant(4));
17108   SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
17109                                     getPointerTy());
17110   Store = DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN,
17111                        MachinePointerInfo(SV, 8),
17112                        false, false, 0);
17113   MemOps.push_back(Store);
17114
17115   // Store ptr to reg_save_area.
        // Offset 16; the stored value is the address of the RegSaveFrameIndex slot.
17116   FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
17117                     FIN, DAG.getIntPtrConstant(8));
17118   SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
17119                                     getPointerTy());
17120   Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN,
17121                        MachinePointerInfo(SV, 16), false, false, 0);
17122   MemOps.push_back(Store);
        // The result chain depends on all four stores.
17123   return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
17124 }
17125
17126 SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
        // Lowers a va_arg node for 64-bit Linux/Darwin targets. The node must
        // have exactly four operands: chain, va_list pointer, source value and
        // alignment. An X86ISD::VAARG_64 node computes the address of the next
        // argument, and the function returns a load of the argument type from
        // that address.
17127   assert(Subtarget->is64Bit() &&
17128          "LowerVAARG only handles 64-bit va_arg!");
17129   assert((Subtarget->isTargetLinux() ||
17130           Subtarget->isTargetDarwin()) &&
17131           "Unhandled target in LowerVAARG");
17132   assert(Op.getNode()->getNumOperands() == 4);
17133   SDValue Chain = Op.getOperand(0);
17134   SDValue SrcPtr = Op.getOperand(1);
17135   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
17136   unsigned Align = Op.getConstantOperandVal(3);
17137   SDLoc dl(Op);
17138
17139   EVT ArgVT = Op.getNode()->getValueType(0);
17140   Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
        // ArgSize is the allocation size, in bytes, of the argument's IR type.
17141   uint32_t ArgSize = getDataLayout()->getTypeAllocSize(ArgTy);
        // ArgMode selects the offset field consulted by VAARG_64:
        // 1 = gp_offset, 2 = fp_offset.
17142   uint8_t ArgMode;
17143
17144   // Decide which area this value should be read from.
17145   // TODO: Implement the AMD64 ABI in its entirety. This simple
17146   // selection mechanism works only for the basic types.
17147   if (ArgVT == MVT::f80) {
17148     llvm_unreachable("va_arg for f80 not yet implemented");
17149   } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
17150     ArgMode = 2;  // Argument passed in XMM register. Use fp_offset.
17151   } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
17152     ArgMode = 1;  // Argument passed in GPR64 register(s). Use gp_offset.
17153   } else {
17154     llvm_unreachable("Unhandled argument type in LowerVAARG");
17155   }
17156
17157   if (ArgMode == 2) {
17158     // Sanity Check: Make sure using fp_offset makes sense.
          // That is: no soft-float, no noimplicitfloat attribute, and SSE1
          // available.
17159     assert(!DAG.getTarget().Options.UseSoftFloat &&
17160            !(DAG.getMachineFunction().getFunction()->hasFnAttribute(
17161                Attribute::NoImplicitFloat)) &&
17162            Subtarget->hasSSE1());
17163   }
17164
17165   // Insert VAARG_64 node into the DAG
17166   // VAARG_64 returns two values: Variable Argument Address, Chain
        // Operand order: chain, va_list pointer, ArgSize (i32), ArgMode (i8),
        // Align (i32).
17167   SmallVector<SDValue, 11> InstOps;
17168   InstOps.push_back(Chain);
17169   InstOps.push_back(SrcPtr);
17170   InstOps.push_back(DAG.getConstant(ArgSize, MVT::i32));
17171   InstOps.push_back(DAG.getConstant(ArgMode, MVT::i8));
17172   InstOps.push_back(DAG.getConstant(Align, MVT::i32));
17173   SDVTList VTs = DAG.getVTList(getPointerTy(), MVT::Other);
        // The node is flagged as both reading and writing memory at the va_list
        // location described by SV.
17174   SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl,
17175                                           VTs, InstOps, MVT::i64,
17176                                           MachinePointerInfo(SV),
17177                                           /*Align=*/0,
17178                                           /*Volatile=*/false,
17179                                           /*ReadMem=*/true,
17180                                           /*WriteMem=*/true);
17181   Chain = VAARG.getValue(1);
17182
17183   // Load the next argument and return it
        // The load is ordered after VAARG_64 through Chain and carries an empty
        // MachinePointerInfo.
17184   return DAG.getLoad(ArgVT, dl,
17185                      Chain,
17186                      VAARG,
17187                      MachinePointerInfo(),
17188                      false, false, false, 0);
17189 }
17190
17191 static SDValue LowerVACOPY(SDValue Op, const X86Subtarget *Subtarget,
17192                            SelectionDAG &DAG) {
17193   // X86-64 va_list is a struct { i32, i32, i8*, i8* }.
17194   assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!");
17195   SDValue Chain = Op.getOperand(0);
17196   SDValue DstPtr = Op.getOperand(1);
17197   SDValue SrcPtr = Op.getOperand(2);
17198   const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
17199   const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
17200   SDLoc DL(Op);
17201
17202   return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
17203                        DAG.getIntPtrConstant(24), 8, /*isVolatile*/false,
17204                        false,
17205                        MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
17206 }
17207
17208 // getTargetVShiftByConstNode - Handle vector element shifts where the shift
17209 // amount is a constant. Takes immediate version of shift as input.
17210 static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, MVT VT,
17211                                           SDValue SrcOp, uint64_t ShiftAmt,
17212                                           SelectionDAG &DAG) {
17213   MVT ElementType = VT.getVectorElementType();
17214
17215   // Fold this packed shift into its first operand if ShiftAmt is 0.
17216   if (ShiftAmt == 0)
17217     return SrcOp;
17218
17219   // Check for ShiftAmt >= element width
17220   if (ShiftAmt >= ElementType.getSizeInBits()) {
17221     if (Opc == X86ISD::VSRAI)
17222       ShiftAmt = ElementType.getSizeInBits() - 1;
17223     else
17224       return DAG.getConstant(0, VT);
17225   }
17226
17227   assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
17228          && "Unknown target vector shift-by-constant node");
17229
17230   // Fold this packed vector shift into a build vector if SrcOp is a
17231   // vector of Constants or UNDEFs, and SrcOp valuetype is the same as VT.
17232   if (VT == SrcOp.getSimpleValueType() &&
17233       ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
17234     SmallVector<SDValue, 8> Elts;
17235     unsigned NumElts = SrcOp->getNumOperands();
17236     ConstantSDNode *ND;
17237
17238     switch(Opc) {
17239     default: llvm_unreachable(nullptr);
17240     case X86ISD::VSHLI:
17241       for (unsigned i=0; i!=NumElts; ++i) {
17242         SDValue CurrentOp = SrcOp->getOperand(i);
17243         if (CurrentOp->getOpcode() == ISD::UNDEF) {
17244           Elts.push_back(CurrentOp);
17245           continue;
17246         }
17247         ND = cast<ConstantSDNode>(CurrentOp);
17248         const APInt &C = ND->getAPIntValue();
17249         Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), ElementType));
17250       }
17251       break;
17252     case X86ISD::VSRLI:
17253       for (unsigned i=0; i!=NumElts; ++i) {
17254         SDValue CurrentOp = SrcOp->getOperand(i);
17255         if (CurrentOp->getOpcode() == ISD::UNDEF) {
17256           Elts.push_back(CurrentOp);
17257           continue;
17258         }
17259         ND = cast<ConstantSDNode>(CurrentOp);
17260         const APInt &C = ND->getAPIntValue();
17261         Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), ElementType));
17262       }
17263       break;
17264     case X86ISD::VSRAI:
17265       for (unsigned i=0; i!=NumElts; ++i) {
17266         SDValue CurrentOp = SrcOp->getOperand(i);
17267         if (CurrentOp->getOpcode() == ISD::UNDEF) {
17268           Elts.push_back(CurrentOp);
17269           continue;
17270         }
17271         ND = cast<ConstantSDNode>(CurrentOp);
17272         const APInt &C = ND->getAPIntValue();
17273         Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), ElementType));
17274       }
17275       break;
17276     }
17277
17278     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Elts);
17279   }
17280
17281   return DAG.getNode(Opc, dl, VT, SrcOp, DAG.getConstant(ShiftAmt, MVT::i8));
17282 }
17283
17284 // getTargetVShiftNode - Handle vector element shifts where the shift amount
17285 // may or may not be a constant. Takes immediate version of shift as input.
17286 static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT,
17287                                    SDValue SrcOp, SDValue ShAmt,
17288                                    SelectionDAG &DAG) {
17289   MVT SVT = ShAmt.getSimpleValueType();
17290   assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");
17291
17292   // Catch shift-by-constant.
17293   if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
17294     return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
17295                                       CShAmt->getZExtValue(), DAG);
17296
17297   // Change opcode to non-immediate version
17298   switch (Opc) {
17299     default: llvm_unreachable("Unknown target vector shift node");
17300     case X86ISD::VSHLI: Opc = X86ISD::VSHL; break;
17301     case X86ISD::VSRLI: Opc = X86ISD::VSRL; break;
17302     case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;
17303   }
17304
17305   const X86Subtarget &Subtarget =
17306       static_cast<const X86Subtarget &>(DAG.getSubtarget());
17307   if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
17308       ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) {
17309     // Let the shuffle legalizer expand this shift amount node.
17310     SDValue Op0 = ShAmt.getOperand(0);
17311     Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(Op0), MVT::v8i16, Op0);
17312     ShAmt = getShuffleVectorZeroOrUndef(Op0, 0, true, &Subtarget, DAG);
17313   } else {
17314     // Need to build a vector containing shift amount.
17315     // SSE/AVX packed shifts only use the lower 64-bit of the shift count.
17316     SmallVector<SDValue, 4> ShOps;
17317     ShOps.push_back(ShAmt);
17318     if (SVT == MVT::i32) {
17319       ShOps.push_back(DAG.getConstant(0, SVT));
17320       ShOps.push_back(DAG.getUNDEF(SVT));
17321     }
17322     ShOps.push_back(DAG.getUNDEF(SVT));
17323
17324     MVT BVT = SVT == MVT::i32 ? MVT::v4i32 : MVT::v2i64;
17325     ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, BVT, ShOps);
17326   }
17327
17328   // The return type has to be a 128-bit type with the same element
17329   // type as the input type.
17330   MVT EltVT = VT.getVectorElementType();
17331   EVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());
17332
17333   ShAmt = DAG.getNode(ISD::BITCAST, dl, ShVT, ShAmt);
17334   return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
17335 }
17336
17337 /// \brief Return (and \p Op, \p Mask) for compare instructions or
17338 /// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
17339 /// necessary casting for \p Mask when lowering masking intrinsics.
17340 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
17341                                     SDValue PreservedSrc,
17342                                     const X86Subtarget *Subtarget,
17343                                     SelectionDAG &DAG) {
17344     EVT VT = Op.getValueType();
17345     EVT MaskVT = EVT::getVectorVT(*DAG.getContext(),
17346                                   MVT::i1, VT.getVectorNumElements());
17347     EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17348                                      Mask.getValueType().getSizeInBits());
17349     SDLoc dl(Op);
17350
17351     assert(MaskVT.isSimple() && "invalid mask type");
17352
17353     if (isAllOnes(Mask))
17354       return Op;
17355
17356     // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements
17357     // are extracted by EXTRACT_SUBVECTOR.
17358     SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
17359                               DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
17360                               DAG.getIntPtrConstant(0));
17361
17362     switch (Op.getOpcode()) {
17363       default: break;
17364       case X86ISD::PCMPEQM:
17365       case X86ISD::PCMPGTM:
17366       case X86ISD::CMPM:
17367       case X86ISD::CMPMU:
17368         return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
17369     }
17370     if (PreservedSrc.getOpcode() == ISD::UNDEF)
17371       PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
17372     return DAG.getNode(ISD::VSELECT, dl, VT, VMask, Op, PreservedSrc);
17373 }
17374
17375 /// \brief Creates an SDNode for a predicated scalar operation.
17376 /// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
17377 /// The mask is comming as MVT::i8 and it should be truncated
17378 /// to MVT::i1 while lowering masking intrinsics.
17379 /// The main difference between ScalarMaskingNode and VectorMaskingNode is using
17380 /// "X86select" instead of "vselect". We just can't create the "vselect" node for
17381 /// a scalar instruction.
17382 static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
17383                                     SDValue PreservedSrc,
17384                                     const X86Subtarget *Subtarget,
17385                                     SelectionDAG &DAG) {
17386     if (isAllOnes(Mask))
17387       return Op;
17388
17389     EVT VT = Op.getValueType();
17390     SDLoc dl(Op);
17391     // The mask should be of type MVT::i1
17392     SDValue IMask = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Mask);
17393
17394     if (PreservedSrc.getOpcode() == ISD::UNDEF)
17395       PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
17396     return DAG.getNode(X86ISD::SELECT, dl, VT, IMask, Op, PreservedSrc);
17397 }
17398
17399 static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
17400                                        SelectionDAG &DAG) {
        // Custom lowering for INTRINSIC_WO_CHAIN nodes. When
        // getIntrinsicWithoutChain() returns an IntrinsicData record for the
        // intrinsic, the record's Type selects one of the generic lowerings in the
        // first switch, with Opc0/Opc1 supplying the opcode(s) to emit. Record
        // types not handled there, and intrinsics without a record, go through the
        // per-intrinsic switch on IntNo further down. An empty SDValue is returned
        // for intrinsics that are not custom lowered here.
17401   SDLoc dl(Op);
        // Operand 0 is the intrinsic ID; the intrinsic's own arguments follow it.
17402   unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
17403   EVT VT = Op.getValueType();
17404   const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
17405   if (IntrData) {
17406     switch(IntrData->Type) {
          // Unmasked 1-, 2- and 3-argument intrinsics: forward the arguments to
          // Opc0 unchanged.
17407     case INTR_TYPE_1OP:
17408       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1));
17409     case INTR_TYPE_2OP:
17410       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
17411         Op.getOperand(2));
17412     case INTR_TYPE_3OP:
17413       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
17414         Op.getOperand(2), Op.getOperand(3));
          // Arguments: source (1), Src0 (2), mask (3), rounding mode (4). The
          // operation is built from the source and the rounding mode and then
          // wrapped by getVectorMaskingNode() together with Mask and Src0
          // (presumably the pass-through value -- confirm against the helper).
17415     case INTR_TYPE_1OP_MASK_RM: {
17416       SDValue Src = Op.getOperand(1);
17417       SDValue Src0 = Op.getOperand(2);
17418       SDValue Mask = Op.getOperand(3);
17419       SDValue RoundingMode = Op.getOperand(4);
17420       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
17421                                               RoundingMode),
17422                                   Mask, Src0, Subtarget, DAG);
17423     }
          // Same layout with two sources (1, 2), Src0 (3), mask (4) and rounding
          // mode (5); the result is wrapped by getScalarMaskingNode() instead.
17424     case INTR_TYPE_SCALAR_MASK_RM: {
17425       SDValue Src1 = Op.getOperand(1);
17426       SDValue Src2 = Op.getOperand(2);
17427       SDValue Src0 = Op.getOperand(3);
17428       SDValue Mask = Op.getOperand(4);
17429       SDValue RoundingMode = Op.getOperand(5);
17430       return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
17431                                               RoundingMode),
17432                                   Mask, Src0, Subtarget, DAG);
17433     }
          // Arguments: two sources (1, 2), pass-through (3), mask (4) and, when
          // the record provides a non-zero Opc1, a rounding mode (5).
17434     case INTR_TYPE_2OP_MASK: {
17435       SDValue Mask = Op.getOperand(4);
17436       SDValue PassThru = Op.getOperand(3);
17437       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
            // Opc1 is only used when the rounding-mode argument is something other
            // than CUR_DIRECTION; otherwise Opc0 is emitted without a rounding
            // operand.
17438       if (IntrWithRoundingModeOpcode != 0) {
17439         unsigned Round = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue();
17440         if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) {
                // NOTE(review): the Opc1 node is built with argument 3 (also used
                // as PassThru) as an extra operand, while the Opc0 node below only
                // gets arguments 1 and 2 -- confirm this is intended.
17441           return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
17442                                       dl, Op.getValueType(),
17443                                       Op.getOperand(1), Op.getOperand(2),
17444                                       Op.getOperand(3), Op.getOperand(5)),
17445                                       Mask, PassThru, Subtarget, DAG);
17446         }
17447       }
17448       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
17449                                               Op.getOperand(1),
17450                                               Op.getOperand(2)),
17451                                   Mask, PassThru, Subtarget, DAG);
17452     }
          // Arguments: three sources (1-3), mask (4) and, when Opc1 is non-zero, a
          // rounding mode (5). Src1 is also handed to getVectorMaskingNode() in
          // the position where the other cases pass their pass-through value.
17453     case FMA_OP_MASK: {
17454       SDValue Src1 = Op.getOperand(1);
17455       SDValue Src2 = Op.getOperand(2);
17456       SDValue Src3 = Op.getOperand(3);
17457       SDValue Mask = Op.getOperand(4);
17458       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
17459       if (IntrWithRoundingModeOpcode != 0) {
17460         SDValue Rnd = Op.getOperand(5);
              // Use the rounding-mode opcode only for an explicit rounding mode.
17461         if (cast<ConstantSDNode>(Rnd)->getZExtValue() !=
17462             X86::STATIC_ROUNDING::CUR_DIRECTION)
17463           return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
17464                                                   dl, Op.getValueType(),
17465                                                   Src1, Src2, Src3, Rnd),
17466                                       Mask, Src1, Subtarget, DAG);
17467       }
17468       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
17469                                               dl, Op.getValueType(),
17470                                               Src1, Src2, Src3),
17471                                   Mask, Src1, Subtarget, DAG);
17472     }
17473     case CMP_MASK:
17474     case CMP_MASK_CC: {
17475       // Comparison intrinsics with masks.
17476       // Example of transformation:
17477       // (i8 (int_x86_avx512_mask_pcmpeq_q_128
17478       //             (v2i64 %a), (v2i64 %b), (i8 %mask))) ->
17479       // (i8 (bitcast
17480       //   (v8i1 (insert_subvector undef,
17481       //           (v2i1 (and (PCMPEQM %a, %b),
17482       //                      (extract_subvector
17483       //                         (v8i1 (bitcast %mask)), 0))), 0))))
            // Note: this VT shadows the function-level VT. Here it is the type of
            // the compared vectors, not the result type of the intrinsic.
17484       EVT VT = Op.getOperand(1).getValueType();
            // One i1 per element of the compared vectors.
17485       EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17486                                     VT.getVectorNumElements());
            // CMP_MASK_CC has one more argument (3) ahead of the mask; it is
            // forwarded to the compare node as a third operand.
17487       SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3);
            // vXi1 type with one element per bit of the scalar mask; the final
            // result is bitcast from a value of this type.
17488       EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17489                                        Mask.getValueType().getSizeInBits());
17490       SDValue Cmp;
17491       if (IntrData->Type == CMP_MASK_CC) {
17492         Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
17493                     Op.getOperand(2), Op.getOperand(3));
17494       } else {
17495         assert(IntrData->Type == CMP_MASK && "Unexpected intrinsic type!");
17496         Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
17497                     Op.getOperand(2));
17498       }
            // Mask the comparison, supplying a zero constant where the other cases
            // pass a pass-through value.
17499       SDValue CmpMask = getVectorMaskingNode(Cmp, Mask,
17500                                              DAG.getTargetConstant(0, MaskVT),
17501                                              Subtarget, DAG);
            // Widen to BitcastVT (upper elements undef) and bitcast back to the
            // scalar result type.
17502       SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
17503                                 DAG.getUNDEF(BitcastVT), CmpMask,
17504                                 DAG.getIntPtrConstant(0));
17505       return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
17506     }
          // Opc1 holds the ISD::CondCode to test and Opc0 the comparison opcode.
          // The result is a SETCC on the translated X86 condition, zero-extended
          // to i32.
17507     case COMI: { // Comparison intrinsics
17508       ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
17509       SDValue LHS = Op.getOperand(1);
17510       SDValue RHS = Op.getOperand(2);
17511       unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG);
17512       assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!");
17513       SDValue Cond = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
17514       SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17515                                   DAG.getConstant(X86CC, MVT::i8), Cond);
17516       return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
17517     }
          // Vector shifts: arguments 1 and 2 are forwarded to
          // getTargetVShiftNode() together with Opc0.
17518     case VSHIFT:
17519       return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
17520                                  Op.getOperand(1), Op.getOperand(2), DAG);
          // Masked variant: argument 4 is the mask and argument 3 is passed in
          // the pass-through position of getVectorMaskingNode().
17521     case VSHIFT_MASK:
17522       return getVectorMaskingNode(getTargetVShiftNode(IntrData->Opc0, dl,
17523                                                       Op.getSimpleValueType(),
17524                                                       Op.getOperand(1),
17525                                                       Op.getOperand(2), DAG),
17526                                   Op.getOperand(4), Op.getOperand(3), Subtarget,
17527                                   DAG);
          // Arguments: data (1), pass-through (2), scalar mask (3).
17528     case COMPRESS_EXPAND_IN_REG: {
17529       SDValue Mask = Op.getOperand(3);
17530       SDValue DataToCompress = Op.getOperand(1);
17531       SDValue PassThru = Op.getOperand(2);
17532       if (isAllOnes(Mask)) // return data as is
17533         return Op.getOperand(1);
            // VT and dl below shadow the function-level variables of the same
            // names.
17534       EVT VT = Op.getValueType();
17535       EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17536                                     VT.getVectorNumElements());
17537       EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17538                                        Mask.getValueType().getSizeInBits());
17539       SDLoc dl(Op);
            // Bitcast the scalar mask to a vector of i1 and keep the low
            // elements, one per element of the result.
17540       SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
17541                                   DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
17542                                   DAG.getIntPtrConstant(0));
17543
17544       return DAG.getNode(IntrData->Opc0, dl, VT, VMask, DataToCompress,
17545                          PassThru);
17546     }
          // Arguments: two sources (1, 2) and a scalar mask (3), which is
          // converted to a vector of i1 the same way as in the previous case.
17547     case BLEND: {
17548       SDValue Mask = Op.getOperand(3);
17549       EVT VT = Op.getValueType();
17550       EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17551                                     VT.getVectorNumElements());
17552       EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17553                                        Mask.getValueType().getSizeInBits());
17554       SDLoc dl(Op);
17555       SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
17556                                   DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
17557                                   DAG.getIntPtrConstant(0));
17558       return DAG.getNode(IntrData->Opc0, dl, VT, VMask, Op.getOperand(1),
17559                          Op.getOperand(2));
17560     }
          // Any other record type is handled by the switch on IntNo below.
17561     default:
17562       break;
17563     }
17564   }
17565
        // Intrinsics that are lowered one by one rather than through an
        // IntrinsicData record.
17566   switch (IntNo) {
17567   default: return SDValue();    // Don't custom lower most intrinsics.
17568
17569   case Intrinsic::x86_avx512_mask_valign_q_512:
17570   case Intrinsic::x86_avx512_mask_valign_d_512:
17571     // Vector source operands are swapped.
          // Argument 3 is forwarded to VALIGN as its third operand; argument 5 is
          // the mask and argument 4 goes in the pass-through position.
17572     return getVectorMaskingNode(DAG.getNode(X86ISD::VALIGN, dl,
17573                                             Op.getValueType(), Op.getOperand(2),
17574                                             Op.getOperand(1),
17575                                             Op.getOperand(3)),
17576                                 Op.getOperand(5), Op.getOperand(4),
17577                                 Subtarget, DAG);
17578
17579   // ptest and testp intrinsics. The intrinsic these come from are designed to
17580   // return an integer value, not just an instruction so lower it to the ptest
17581   // or testp pattern and a setcc for the result.
17582   case Intrinsic::x86_sse41_ptestz:
17583   case Intrinsic::x86_sse41_ptestc:
17584   case Intrinsic::x86_sse41_ptestnzc:
17585   case Intrinsic::x86_avx_ptestz_256:
17586   case Intrinsic::x86_avx_ptestc_256:
17587   case Intrinsic::x86_avx_ptestnzc_256:
17588   case Intrinsic::x86_avx_vtestz_ps:
17589   case Intrinsic::x86_avx_vtestc_ps:
17590   case Intrinsic::x86_avx_vtestnzc_ps:
17591   case Intrinsic::x86_avx_vtestz_pd:
17592   case Intrinsic::x86_avx_vtestc_pd:
17593   case Intrinsic::x86_avx_vtestnzc_pd:
17594   case Intrinsic::x86_avx_vtestz_ps_256:
17595   case Intrinsic::x86_avx_vtestc_ps_256:
17596   case Intrinsic::x86_avx_vtestnzc_ps_256:
17597   case Intrinsic::x86_avx_vtestz_pd_256:
17598   case Intrinsic::x86_avx_vtestc_pd_256:
17599   case Intrinsic::x86_avx_vtestnzc_pd_256: {
          // Pick the condition code to test. The vtest* intrinsics additionally
          // set IsTestPacked so that TESTP is emitted instead of PTEST.
17600     bool IsTestPacked = false;
17601     unsigned X86CC;
17602     switch (IntNo) {
17603     default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
17604     case Intrinsic::x86_avx_vtestz_ps:
17605     case Intrinsic::x86_avx_vtestz_pd:
17606     case Intrinsic::x86_avx_vtestz_ps_256:
17607     case Intrinsic::x86_avx_vtestz_pd_256:
17608       IsTestPacked = true; // Fallthrough
17609     case Intrinsic::x86_sse41_ptestz:
17610     case Intrinsic::x86_avx_ptestz_256:
17611       // ZF = 1
17612       X86CC = X86::COND_E;
17613       break;
17614     case Intrinsic::x86_avx_vtestc_ps:
17615     case Intrinsic::x86_avx_vtestc_pd:
17616     case Intrinsic::x86_avx_vtestc_ps_256:
17617     case Intrinsic::x86_avx_vtestc_pd_256:
17618       IsTestPacked = true; // Fallthrough
17619     case Intrinsic::x86_sse41_ptestc:
17620     case Intrinsic::x86_avx_ptestc_256:
17621       // CF = 1
17622       X86CC = X86::COND_B;
17623       break;
17624     case Intrinsic::x86_avx_vtestnzc_ps:
17625     case Intrinsic::x86_avx_vtestnzc_pd:
17626     case Intrinsic::x86_avx_vtestnzc_ps_256:
17627     case Intrinsic::x86_avx_vtestnzc_pd_256:
17628       IsTestPacked = true; // Fallthrough
17629     case Intrinsic::x86_sse41_ptestnzc:
17630     case Intrinsic::x86_avx_ptestnzc_256:
17631       // ZF and CF = 0
17632       X86CC = X86::COND_A;
17633       break;
17634     }
17635
          // Emit the flag-producing test, read the chosen condition with SETCC
          // and zero-extend the i8 result to the i32 the intrinsic returns.
17636     SDValue LHS = Op.getOperand(1);
17637     SDValue RHS = Op.getOperand(2);
17638     unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST;
17639     SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
17640     SDValue CC = DAG.getConstant(X86CC, MVT::i8);
17641     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test);
17642     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
17643   }
        // kortest: both arguments are bitcast to v16i1 and fed to KORTEST;
        // kortestz tests COND_E and kortestc tests COND_B. Note that the SETCC
        // here is created with type i1, unlike the i8 used by the other cases.
17644   case Intrinsic::x86_avx512_kortestz_w:
17645   case Intrinsic::x86_avx512_kortestc_w: {
17646     unsigned X86CC = (IntNo == Intrinsic::x86_avx512_kortestz_w)? X86::COND_E: X86::COND_B;
17647     SDValue LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, Op.getOperand(1));
17648     SDValue RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, Op.getOperand(2));
17649     SDValue CC = DAG.getConstant(X86CC, MVT::i8);
17650     SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
17651     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i1, CC, Test);
17652     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
17653   }
17654
        // String-compare intrinsics that return a single flag: choose the
        // PCMPISTRI/PCMPESTRI opcode and the condition code to read.
17655   case Intrinsic::x86_sse42_pcmpistria128:
17656   case Intrinsic::x86_sse42_pcmpestria128:
17657   case Intrinsic::x86_sse42_pcmpistric128:
17658   case Intrinsic::x86_sse42_pcmpestric128:
17659   case Intrinsic::x86_sse42_pcmpistrio128:
17660   case Intrinsic::x86_sse42_pcmpestrio128:
17661   case Intrinsic::x86_sse42_pcmpistris128:
17662   case Intrinsic::x86_sse42_pcmpestris128:
17663   case Intrinsic::x86_sse42_pcmpistriz128:
17664   case Intrinsic::x86_sse42_pcmpestriz128: {
17665     unsigned Opcode;
17666     unsigned X86CC;
17667     switch (IntNo) {
17668     default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
17669     case Intrinsic::x86_sse42_pcmpistria128:
17670       Opcode = X86ISD::PCMPISTRI;
17671       X86CC = X86::COND_A;
17672       break;
17673     case Intrinsic::x86_sse42_pcmpestria128:
17674       Opcode = X86ISD::PCMPESTRI;
17675       X86CC = X86::COND_A;
17676       break;
17677     case Intrinsic::x86_sse42_pcmpistric128:
17678       Opcode = X86ISD::PCMPISTRI;
17679       X86CC = X86::COND_B;
17680       break;
17681     case Intrinsic::x86_sse42_pcmpestric128:
17682       Opcode = X86ISD::PCMPESTRI;
17683       X86CC = X86::COND_B;
17684       break;
17685     case Intrinsic::x86_sse42_pcmpistrio128:
17686       Opcode = X86ISD::PCMPISTRI;
17687       X86CC = X86::COND_O;
17688       break;
17689     case Intrinsic::x86_sse42_pcmpestrio128:
17690       Opcode = X86ISD::PCMPESTRI;
17691       X86CC = X86::COND_O;
17692       break;
17693     case Intrinsic::x86_sse42_pcmpistris128:
17694       Opcode = X86ISD::PCMPISTRI;
17695       X86CC = X86::COND_S;
17696       break;
17697     case Intrinsic::x86_sse42_pcmpestris128:
17698       Opcode = X86ISD::PCMPESTRI;
17699       X86CC = X86::COND_S;
17700       break;
17701     case Intrinsic::x86_sse42_pcmpistriz128:
17702       Opcode = X86ISD::PCMPISTRI;
17703       X86CC = X86::COND_E;
17704       break;
17705     case Intrinsic::x86_sse42_pcmpestriz128:
17706       Opcode = X86ISD::PCMPESTRI;
17707       X86CC = X86::COND_E;
17708       break;
17709     }
          // Drop operand 0 (the intrinsic ID) and pass the remaining operands
          // through. The node produces the intrinsic's value type plus an i32
          // second result, and that second result is what SETCC reads.
17710     SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
17711     SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
17712     SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps);
17713     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17714                                 DAG.getConstant(X86CC, MVT::i8),
17715                                 SDValue(PCMP.getNode(), 1));
17716     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
17717   }
17718
        // Plain pcmpistri/pcmpestri: emit the node with the same two result
        // types and return it directly, without a SETCC.
17719   case Intrinsic::x86_sse42_pcmpistri128:
17720   case Intrinsic::x86_sse42_pcmpestri128: {
17721     unsigned Opcode;
17722     if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
17723       Opcode = X86ISD::PCMPISTRI;
17724     else
17725       Opcode = X86ISD::PCMPESTRI;
17726
17727     SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
17728     SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
17729     return DAG.getNode(Opcode, dl, VTs, NewOps);
17730   }
17731   }
17732 }
17733
17734 static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
17735                               SDValue Src, SDValue Mask, SDValue Base,
17736                               SDValue Index, SDValue ScaleOp, SDValue Chain,
17737                               const X86Subtarget * Subtarget) {
17738   SDLoc dl(Op);
17739   ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
17740   assert(C && "Invalid scale type");
17741   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
17742   EVT MaskVT = MVT::getVectorVT(MVT::i1,
17743                              Index.getSimpleValueType().getVectorNumElements());
17744   SDValue MaskInReg;
17745   ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
17746   if (MaskC)
17747     MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT);
17748   else
17749     MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
17750   SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
17751   SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
17752   SDValue Segment = DAG.getRegister(0, MVT::i32);
17753   if (Src.getOpcode() == ISD::UNDEF)
17754     Src = getZeroVector(Op.getValueType(), Subtarget, DAG, dl);
17755   SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain};
17756   SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
17757   SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
17758   return DAG.getMergeValues(RetOps, dl);
17759 }
17760
17761 static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
17762                                SDValue Src, SDValue Mask, SDValue Base,
17763                                SDValue Index, SDValue ScaleOp, SDValue Chain) {
17764   SDLoc dl(Op);
17765   ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
17766   assert(C && "Invalid scale type");
17767   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
17768   SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
17769   SDValue Segment = DAG.getRegister(0, MVT::i32);
17770   EVT MaskVT = MVT::getVectorVT(MVT::i1,
17771                              Index.getSimpleValueType().getVectorNumElements());
17772   SDValue MaskInReg;
17773   ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
17774   if (MaskC)
17775     MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT);
17776   else
17777     MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
17778   SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
17779   SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain};
17780   SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
17781   return SDValue(Res, 1);
17782 }
17783
17784 static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
17785                                SDValue Mask, SDValue Base, SDValue Index,
17786                                SDValue ScaleOp, SDValue Chain) {
17787   SDLoc dl(Op);
17788   ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
17789   assert(C && "Invalid scale type");
17790   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
17791   SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
17792   SDValue Segment = DAG.getRegister(0, MVT::i32);
17793   EVT MaskVT =
17794     MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
17795   SDValue MaskInReg;
17796   ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
17797   if (MaskC)
17798     MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT);
17799   else
17800     MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
17801   //SDVTList VTs = DAG.getVTList(MVT::Other);
17802   SDValue Ops[] = {MaskInReg, Base, Scale, Index, Disp, Segment, Chain};
17803   SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
17804   return SDValue(Res, 0);
17805 }
17806
17807 // getReadPerformanceCounter - Handles the lowering of builtin intrinsics that
17808 // read performance monitor counters (x86_rdpmc).
17809 static void getReadPerformanceCounter(SDNode *N, SDLoc DL,
17810                               SelectionDAG &DAG, const X86Subtarget *Subtarget,
17811                               SmallVectorImpl<SDValue> &Results) {
17812   assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
17813   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
17814   SDValue LO, HI;
17815
17816   // The ECX register is used to select the index of the performance counter
17817   // to read.
17818   SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX,
17819                                    N->getOperand(2));
17820   SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain);
17821
17822   // Reads the content of a 64-bit performance counter and returns it in the
17823   // registers EDX:EAX.
17824   if (Subtarget->is64Bit()) {
17825     LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
17826     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
17827                             LO.getValue(2));
17828   } else {
17829     LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
17830     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
17831                             LO.getValue(2));
17832   }
17833   Chain = HI.getValue(1);
17834
17835   if (Subtarget->is64Bit()) {
17836     // The EAX register is loaded with the low-order 32 bits. The EDX register
17837     // is loaded with the supported high-order bits of the counter.
17838     SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
17839                               DAG.getConstant(32, MVT::i8));
17840     Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
17841     Results.push_back(Chain);
17842     return;
17843   }
17844
17845   // Use a buildpair to merge the two 32-bit values into a 64-bit one.
17846   SDValue Ops[] = { LO, HI };
17847   SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
17848   Results.push_back(Pair);
17849   Results.push_back(Chain);
17850 }
17851
17852 // getReadTimeStampCounter - Handles the lowering of builtin intrinsics that
17853 // read the time stamp counter (x86_rdtsc and x86_rdtscp). This function is
17854 // also used to custom lower READCYCLECOUNTER nodes.
17855 static void getReadTimeStampCounter(SDNode *N, SDLoc DL, unsigned Opcode,
17856                               SelectionDAG &DAG, const X86Subtarget *Subtarget,
17857                               SmallVectorImpl<SDValue> &Results) {
17858   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
17859   SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0));
17860   SDValue LO, HI;
17861
17862   // The processor's time-stamp counter (a 64-bit MSR) is stored into the
17863   // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
17864   // and the EAX register is loaded with the low-order 32 bits.
17865   if (Subtarget->is64Bit()) {
17866     LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
17867     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
17868                             LO.getValue(2));
17869   } else {
17870     LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
17871     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
17872                             LO.getValue(2));
17873   }
17874   SDValue Chain = HI.getValue(1);
17875
17876   if (Opcode == X86ISD::RDTSCP_DAG) {
17877     assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
17878
17879     // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
17880     // the ECX register. Add 'ecx' explicitly to the chain.
17881     SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32,
17882                                      HI.getValue(2));
17883     // Explicitly store the content of ECX at the location passed in input
17884     // to the 'rdtscp' intrinsic.
17885     Chain = DAG.getStore(ecx.getValue(1), DL, ecx, N->getOperand(2),
17886                          MachinePointerInfo(), false, false, 0);
17887   }
17888
17889   if (Subtarget->is64Bit()) {
17890     // The EDX register is loaded with the high-order 32 bits of the MSR, and
17891     // the EAX register is loaded with the low-order 32 bits.
17892     SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
17893                               DAG.getConstant(32, MVT::i8));
17894     Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
17895     Results.push_back(Chain);
17896     return;
17897   }
17898
17899   // Use a buildpair to merge the two 32-bit values into a 64-bit one.
17900   SDValue Ops[] = { LO, HI };
17901   SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
17902   Results.push_back(Pair);
17903   Results.push_back(Chain);
17904 }
17905
17906 static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget,
17907                                      SelectionDAG &DAG) {
17908   SmallVector<SDValue, 2> Results;
17909   SDLoc DL(Op);
17910   getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget,
17911                           Results);
17912   return DAG.getMergeValues(Results, DL);
17913 }
17914
17915
17916 static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
17917                                       SelectionDAG &DAG) {
        // Custom lowering for intrinsic nodes that carry a chain. Operand 0 is the
        // incoming chain and operand 1 the intrinsic ID; the intrinsic's own
        // arguments start at operand 2. Only intrinsics for which
        // getIntrinsicWithChain() returns an IntrinsicData record are handled; an
        // empty SDValue is returned for all others. The record's Type selects the
        // lowering and Opc0/Opc1 supply the opcode(s) to emit.
17918   unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
17919
17920   const IntrinsicData* IntrData = getIntrinsicWithChain(IntNo);
17921   if (!IntrData)
17922     return SDValue();
17923
17924   SDLoc dl(Op);
17925   switch(IntrData->Type) {
        // Every record type that can reach this function is expected to have a
        // case below.
17926   default:
17927     llvm_unreachable("Unknown Intrinsic Type");
17928     break;
17929   case RDSEED:
17930   case RDRAND: {
17931     // Emit the node with the right value type.
17932     SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other);
17933     SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
17934
17935     // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
17936     // Otherwise return the value from Rand, which is always 0, cast to the
          // type of the intrinsic's second result. The CMOV reads the flags through
          // result 1 of the node created above.
17937     SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
17938                       DAG.getConstant(1, Op->getValueType(1)),
17939                       DAG.getConstant(X86::COND_B, MVT::i32),
17940                       SDValue(Result.getNode(), 1) };
17941     SDValue isValid = DAG.getNode(X86ISD::CMOV, dl,
17942                                   DAG.getVTList(Op->getValueType(1), MVT::Glue),
17943                                   Ops);
17944
17945     // Return { result, isValid, chain }.
17946     return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
17947                        SDValue(Result.getNode(), 2));
17948   }
17949   case GATHER: {
17950   // Operands: chain, id, src (2), base (3), index (4), mask (5), scale (6).
17951     SDValue Chain = Op.getOperand(0);
17952     SDValue Src   = Op.getOperand(2);
17953     SDValue Base  = Op.getOperand(3);
17954     SDValue Index = Op.getOperand(4);
17955     SDValue Mask  = Op.getOperand(5);
17956     SDValue Scale = Op.getOperand(6);
17957     return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain,
17958                           Subtarget);
17959   }
17960   case SCATTER: {
17961   // Operands: chain, id, base (2), mask (3), index (4), src (5), scale (6).
17962     SDValue Chain = Op.getOperand(0);
17963     SDValue Base  = Op.getOperand(2);
17964     SDValue Mask  = Op.getOperand(3);
17965     SDValue Index = Op.getOperand(4);
17966     SDValue Src   = Op.getOperand(5);
17967     SDValue Scale = Op.getOperand(6);
17968     return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain);
17969   }
17970   case PREFETCH: {
          // Operands: chain, id, mask (2), index (3), base (4), scale (5),
          // hint (6). The hint must be the constant 0 or 1; anything else is
          // treated as unreachable. Hint 0 selects Opc0 and hint 1 selects Opc1.
17971     SDValue Hint = Op.getOperand(6);
17972     unsigned HintVal;
17973     if (dyn_cast<ConstantSDNode> (Hint) == nullptr ||
17974         (HintVal = dyn_cast<ConstantSDNode> (Hint)->getZExtValue()) > 1)
17975       llvm_unreachable("Wrong prefetch hint in intrinsic: should be 0 or 1");
17976     unsigned Opcode = (HintVal ? IntrData->Opc1 : IntrData->Opc0);
17977     SDValue Chain = Op.getOperand(0);
17978     SDValue Mask  = Op.getOperand(2);
17979     SDValue Index = Op.getOperand(3);
17980     SDValue Base  = Op.getOperand(4);
17981     SDValue Scale = Op.getOperand(5);
17982     return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain);
17983   }
17984   // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
17985   case RDTSC: {
17986     SmallVector<SDValue, 2> Results;
17987     getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget, Results);
17988     return DAG.getMergeValues(Results, dl);
17989   }
17990   // Read Performance Monitoring Counters.
17991   case RDPMC: {
17992     SmallVector<SDValue, 2> Results;
17993     getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results);
17994     return DAG.getMergeValues(Results, dl);
17995   }
17996   // XTEST intrinsics.
17997   case XTEST: {
          // Emit Opc0 on the incoming chain, read COND_NE from its first result
          // with SETCC and zero-extend that to the intrinsic's result type. The
          // node's second result is returned as the chain.
17998     SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
17999     SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
18000     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
18001                                 DAG.getConstant(X86::COND_NE, MVT::i8),
18002                                 InTrans);
18003     SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
18004     return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
18005                        Ret, SDValue(InTrans.getNode(), 1));
18006   }
18007   // ADC/ADCX/SBB
18008   case ADX: {
          // Operands: chain, id, carry-in (2), two sources (3, 4), pointer the
          // result is stored through (5).
18009     SmallVector<SDValue, 2> Results;
18010     SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
18011     SDVTList VTs = DAG.getVTList(Op.getOperand(3)->getValueType(0), MVT::Other);
          // Add -1 (i8) to the carry-in; result 1 of this X86ISD::ADD is passed to
          // Opc0 as its third operand -- presumably so that a non-zero carry-in
          // sets the carry flag (confirm).
18012     SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2),
18013                                 DAG.getConstant(-1, MVT::i8));
18014     SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3),
18015                               Op.getOperand(4), GenCF.getValue(1));
          // Store the arithmetic result through the pointer operand, on the
          // incoming chain.
18016     SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0),
18017                                  Op.getOperand(5), MachinePointerInfo(),
18018                                  false, false, 0);
          // The intrinsic's value is COND_B read from result 1 of Opc0.
18019     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
18020                                 DAG.getConstant(X86::COND_B, MVT::i8),
18021                                 Res.getValue(1));
18022     Results.push_back(SetCC);
18023     Results.push_back(Store);
18024     return DAG.getMergeValues(Results, dl);
18025   }
18026   case COMPRESS_TO_MEM: {
          // Operands: chain, id, address (2), data (3), scalar mask (4).
          // This dl shadows the function-level one.
18027     SDLoc dl(Op);
18028     SDValue Mask = Op.getOperand(4);
18029     SDValue DataToCompress = Op.getOperand(3);
18030     SDValue Addr = Op.getOperand(2);
18031     SDValue Chain = Op.getOperand(0);
18032
18033     if (isAllOnes(Mask)) // return just a store
18034       return DAG.getStore(Chain, dl, DataToCompress, Addr,
18035                           MachinePointerInfo(), false, false, 0);
18036
          // Bitcast the scalar mask to a vector of i1 and keep the low elements,
          // one per element of the data vector.
18037     EVT VT = DataToCompress.getValueType();
18038     EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
18039                                   VT.getVectorNumElements());
18040     EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
18041                                      Mask.getValueType().getSizeInBits());
18042     SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
18043                                 DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
18044                                 DAG.getIntPtrConstant(0));
18045
          // Compress in a register with an undef third operand, then store the
          // whole compressed vector.
18046     SDValue Compressed =  DAG.getNode(IntrData->Opc0, dl, VT, VMask,
18047                                       DataToCompress, DAG.getUNDEF(VT));
18048     return DAG.getStore(Chain, dl, Compressed, Addr,
18049                         MachinePointerInfo(), false, false, 0);
18050   }
18051   case EXPAND_FROM_MEM: {
          // Operands: chain, id, address (2), pass-through (3), scalar mask (4).
          // This dl shadows the function-level one.
18052     SDLoc dl(Op);
18053     SDValue Mask = Op.getOperand(4);
18054     SDValue PathThru = Op.getOperand(3);
18055     SDValue Addr = Op.getOperand(2);
18056     SDValue Chain = Op.getOperand(0);
18057     EVT VT = Op.getValueType();
18058
18059     if (isAllOnes(Mask)) // return just a load
18060       return DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(), false, false,
18061                          false, 0);
18062     EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
18063                                   VT.getVectorNumElements());
18064     EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
18065                                      Mask.getValueType().getSizeInBits());
18066     SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
18067                                 DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
18068                                 DAG.getIntPtrConstant(0));
18069
          // Load the whole vector, then expand it in a register.
18070     SDValue DataToExpand = DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(),
18071                                    false, false, false, 0);
18072
          // NOTE(review): the chain returned below is the incoming chain, not the
          // output chain of the load created above -- confirm this is intended.
18073     SmallVector<SDValue, 2> Results;
18074     Results.push_back(DAG.getNode(IntrData->Opc0, dl, VT, VMask, DataToExpand,
18075                                   PathThru));
18076     Results.push_back(Chain);
18077     return DAG.getMergeValues(Results, dl);
18078   }
18079   }
18080 }
18081
18082 SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
18083                                            SelectionDAG &DAG) const {
18084   MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
18085   MFI->setReturnAddressIsTaken(true);
18086
18087   if (verifyReturnAddressArgumentIsConstant(Op, DAG))
18088     return SDValue();
18089
18090   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
18091   SDLoc dl(Op);
18092   EVT PtrVT = getPointerTy();
18093
18094   if (Depth > 0) {
18095     SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
18096     const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
18097     SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), PtrVT);
18098     return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
18099                        DAG.getNode(ISD::ADD, dl, PtrVT,
18100                                    FrameAddr, Offset),
18101                        MachinePointerInfo(), false, false, false, 0);
18102   }
18103
18104   // Just load the return address.
18105   SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
18106   return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
18107                      RetAddrFI, MachinePointerInfo(), false, false, false, 0);
18108 }
18109
18110 SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
18111   MachineFunction &MF = DAG.getMachineFunction();
18112   MachineFrameInfo *MFI = MF.getFrameInfo();
18113   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
18114   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
18115   EVT VT = Op.getValueType();
18116
18117   MFI->setFrameAddressIsTaken(true);
18118
18119   if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
18120     // Depth > 0 makes no sense on targets which use Windows unwind codes.  It
18121     // is not possible to crawl up the stack without looking at the unwind codes
18122     // simultaneously.
18123     int FrameAddrIndex = FuncInfo->getFAIndex();
18124     if (!FrameAddrIndex) {
18125       // Set up a frame object for the return address.
18126       unsigned SlotSize = RegInfo->getSlotSize();
18127       FrameAddrIndex = MF.getFrameInfo()->CreateFixedObject(
18128           SlotSize, /*Offset=*/INT64_MIN, /*IsImmutable=*/false);
18129       FuncInfo->setFAIndex(FrameAddrIndex);
18130     }
18131     return DAG.getFrameIndex(FrameAddrIndex, VT);
18132   }
18133
18134   unsigned FrameReg =
18135       RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
18136   SDLoc dl(Op);  // FIXME probably not meaningful
18137   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
18138   assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
18139           (FrameReg == X86::EBP && VT == MVT::i32)) &&
18140          "Invalid Frame Register!");
18141   SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
18142   while (Depth--)
18143     FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
18144                             MachinePointerInfo(),
18145                             false, false, false, 0);
18146   return FrameAddr;
18147 }
18148
18149 // FIXME? Maybe this could be a TableGen attribute on some registers and
18150 // this table could be generated automatically from RegInfo.
18151 unsigned X86TargetLowering::getRegisterByName(const char* RegName,
18152                                               EVT VT) const {
18153   unsigned Reg = StringSwitch<unsigned>(RegName)
18154                        .Case("esp", X86::ESP)
18155                        .Case("rsp", X86::RSP)
18156                        .Default(0);
18157   if (Reg)
18158     return Reg;
18159   report_fatal_error("Invalid register name global variable");
18160 }
18161
18162 SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
18163                                                      SelectionDAG &DAG) const {
18164   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
18165   return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize());
18166 }
18167
18168 SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
18169   SDValue Chain     = Op.getOperand(0);
18170   SDValue Offset    = Op.getOperand(1);
18171   SDValue Handler   = Op.getOperand(2);
18172   SDLoc dl      (Op);
18173
18174   EVT PtrVT = getPointerTy();
18175   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
18176   unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
18177   assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
18178           (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
18179          "Invalid Frame Register!");
18180   SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
18181   unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
18182
18183   SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
18184                                  DAG.getIntPtrConstant(RegInfo->getSlotSize()));
18185   StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
18186   Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(),
18187                        false, false, 0);
18188   Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
18189
18190   return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
18191                      DAG.getRegister(StoreAddrReg, PtrVT));
18192 }
18193
18194 SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
18195                                                SelectionDAG &DAG) const {
18196   SDLoc DL(Op);
18197   return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
18198                      DAG.getVTList(MVT::i32, MVT::Other),
18199                      Op.getOperand(0), Op.getOperand(1));
18200 }
18201
18202 SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
18203                                                 SelectionDAG &DAG) const {
18204   SDLoc DL(Op);
18205   return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
18206                      Op.getOperand(0), Op.getOperand(1));
18207 }
18208
18209 static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
18210   return Op.getOperand(0);
18211 }
18212
18213 SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
18214                                                 SelectionDAG &DAG) const {
18215   SDValue Root = Op.getOperand(0);
18216   SDValue Trmp = Op.getOperand(1); // trampoline
18217   SDValue FPtr = Op.getOperand(2); // nested function
18218   SDValue Nest = Op.getOperand(3); // 'nest' parameter value
18219   SDLoc dl (Op);
18220
18221   const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
18222   const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
18223
18224   if (Subtarget->is64Bit()) {
18225     SDValue OutChains[6];
18226
18227     // Large code-model.
18228     const unsigned char JMP64r  = 0xFF; // 64-bit jmp through register opcode.
18229     const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
18230
18231     const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
18232     const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
18233
18234     const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
18235
18236     // Load the pointer to the nested function into R11.
18237     unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
18238     SDValue Addr = Trmp;
18239     OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
18240                                 Addr, MachinePointerInfo(TrmpAddr),
18241                                 false, false, 0);
18242
18243     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
18244                        DAG.getConstant(2, MVT::i64));
18245     OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
18246                                 MachinePointerInfo(TrmpAddr, 2),
18247                                 false, false, 2);
18248
18249     // Load the 'nest' parameter value into R10.
18250     // R10 is specified in X86CallingConv.td
18251     OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
18252     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
18253                        DAG.getConstant(10, MVT::i64));
18254     OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
18255                                 Addr, MachinePointerInfo(TrmpAddr, 10),
18256                                 false, false, 0);
18257
18258     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
18259                        DAG.getConstant(12, MVT::i64));
18260     OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
18261                                 MachinePointerInfo(TrmpAddr, 12),
18262                                 false, false, 2);
18263
18264     // Jump to the nested function.
18265     OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
18266     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
18267                        DAG.getConstant(20, MVT::i64));
18268     OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
18269                                 Addr, MachinePointerInfo(TrmpAddr, 20),
18270                                 false, false, 0);
18271
18272     unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
18273     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
18274                        DAG.getConstant(22, MVT::i64));
18275     OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr,
18276                                 MachinePointerInfo(TrmpAddr, 22),
18277                                 false, false, 0);
18278
18279     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
18280   } else {
18281     const Function *Func =
18282       cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
18283     CallingConv::ID CC = Func->getCallingConv();
18284     unsigned NestReg;
18285
18286     switch (CC) {
18287     default:
18288       llvm_unreachable("Unsupported calling convention");
18289     case CallingConv::C:
18290     case CallingConv::X86_StdCall: {
18291       // Pass 'nest' parameter in ECX.
18292       // Must be kept in sync with X86CallingConv.td
18293       NestReg = X86::ECX;
18294
18295       // Check that ECX wasn't needed by an 'inreg' parameter.
18296       FunctionType *FTy = Func->getFunctionType();
18297       const AttributeSet &Attrs = Func->getAttributes();
18298
18299       if (!Attrs.isEmpty() && !Func->isVarArg()) {
18300         unsigned InRegCount = 0;
18301         unsigned Idx = 1;
18302
18303         for (FunctionType::param_iterator I = FTy->param_begin(),
18304              E = FTy->param_end(); I != E; ++I, ++Idx)
18305           if (Attrs.hasAttribute(Idx, Attribute::InReg))
18306             // FIXME: should only count parameters that are lowered to integers.
18307             InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32;
18308
18309         if (InRegCount > 2) {
18310           report_fatal_error("Nest register in use - reduce number of inreg"
18311                              " parameters!");
18312         }
18313       }
18314       break;
18315     }
18316     case CallingConv::X86_FastCall:
18317     case CallingConv::X86_ThisCall:
18318     case CallingConv::Fast:
18319       // Pass 'nest' parameter in EAX.
18320       // Must be kept in sync with X86CallingConv.td
18321       NestReg = X86::EAX;
18322       break;
18323     }
18324
18325     SDValue OutChains[4];
18326     SDValue Addr, Disp;
18327
18328     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
18329                        DAG.getConstant(10, MVT::i32));
18330     Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
18331
18332     // This is storing the opcode for MOV32ri.
18333     const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
18334     const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
18335     OutChains[0] = DAG.getStore(Root, dl,
18336                                 DAG.getConstant(MOV32ri|N86Reg, MVT::i8),
18337                                 Trmp, MachinePointerInfo(TrmpAddr),
18338                                 false, false, 0);
18339
18340     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
18341                        DAG.getConstant(1, MVT::i32));
18342     OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
18343                                 MachinePointerInfo(TrmpAddr, 1),
18344                                 false, false, 1);
18345
18346     const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
18347     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
18348                        DAG.getConstant(5, MVT::i32));
18349     OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr,
18350                                 MachinePointerInfo(TrmpAddr, 5),
18351                                 false, false, 1);
18352
18353     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
18354                        DAG.getConstant(6, MVT::i32));
18355     OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
18356                                 MachinePointerInfo(TrmpAddr, 6),
18357                                 false, false, 1);
18358
18359     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
18360   }
18361 }
18362
18363 SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
18364                                             SelectionDAG &DAG) const {
18365   /*
18366    The rounding mode is in bits 11:10 of FPSR, and has the following
18367    settings:
18368      00 Round to nearest
18369      01 Round to -inf
18370      10 Round to +inf
18371      11 Round to 0
18372
18373   FLT_ROUNDS, on the other hand, expects the following:
18374     -1 Undefined
18375      0 Round to 0
18376      1 Round to nearest
18377      2 Round to +inf
18378      3 Round to -inf
18379
18380   To perform the conversion, we do:
18381     (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3)
18382   */
18383
18384   MachineFunction &MF = DAG.getMachineFunction();
18385   const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
18386   unsigned StackAlignment = TFI.getStackAlignment();
18387   MVT VT = Op.getSimpleValueType();
18388   SDLoc DL(Op);
18389
18390   // Save FP Control Word to stack slot
18391   int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false);
18392   SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
18393
18394   MachineMemOperand *MMO =
18395    MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
18396                            MachineMemOperand::MOStore, 2, 2);
18397
18398   SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
18399   SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
18400                                           DAG.getVTList(MVT::Other),
18401                                           Ops, MVT::i16, MMO);
18402
18403   // Load FP Control Word from stack slot
18404   SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot,
18405                             MachinePointerInfo(), false, false, false, 0);
18406
18407   // Transform as necessary
18408   SDValue CWD1 =
18409     DAG.getNode(ISD::SRL, DL, MVT::i16,
18410                 DAG.getNode(ISD::AND, DL, MVT::i16,
18411                             CWD, DAG.getConstant(0x800, MVT::i16)),
18412                 DAG.getConstant(11, MVT::i8));
18413   SDValue CWD2 =
18414     DAG.getNode(ISD::SRL, DL, MVT::i16,
18415                 DAG.getNode(ISD::AND, DL, MVT::i16,
18416                             CWD, DAG.getConstant(0x400, MVT::i16)),
18417                 DAG.getConstant(9, MVT::i8));
18418
18419   SDValue RetVal =
18420     DAG.getNode(ISD::AND, DL, MVT::i16,
18421                 DAG.getNode(ISD::ADD, DL, MVT::i16,
18422                             DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),
18423                             DAG.getConstant(1, MVT::i16)),
18424                 DAG.getConstant(3, MVT::i16));
18425
18426   return DAG.getNode((VT.getSizeInBits() < 16 ?
18427                       ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
18428 }
18429
18430 static SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) {
18431   MVT VT = Op.getSimpleValueType();
18432   EVT OpVT = VT;
18433   unsigned NumBits = VT.getSizeInBits();
18434   SDLoc dl(Op);
18435
18436   Op = Op.getOperand(0);
18437   if (VT == MVT::i8) {
18438     // Zero extend to i32 since there is not an i8 bsr.
18439     OpVT = MVT::i32;
18440     Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
18441   }
18442
18443   // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
18444   SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
18445   Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
18446
18447   // If src is zero (i.e. bsr sets ZF), returns NumBits.
18448   SDValue Ops[] = {
18449     Op,
18450     DAG.getConstant(NumBits+NumBits-1, OpVT),
18451     DAG.getConstant(X86::COND_E, MVT::i8),
18452     Op.getValue(1)
18453   };
18454   Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
18455
18456   // Finally xor with NumBits-1.
18457   Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));
18458
18459   if (VT == MVT::i8)
18460     Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
18461   return Op;
18462 }
18463
18464 static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) {
18465   MVT VT = Op.getSimpleValueType();
18466   EVT OpVT = VT;
18467   unsigned NumBits = VT.getSizeInBits();
18468   SDLoc dl(Op);
18469
18470   Op = Op.getOperand(0);
18471   if (VT == MVT::i8) {
18472     // Zero extend to i32 since there is not an i8 bsr.
18473     OpVT = MVT::i32;
18474     Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
18475   }
18476
18477   // Issue a bsr (scan bits in reverse).
18478   SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
18479   Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
18480
18481   // And xor with NumBits-1.
18482   Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));
18483
18484   if (VT == MVT::i8)
18485     Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
18486   return Op;
18487 }
18488
18489 static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
18490   MVT VT = Op.getSimpleValueType();
18491   unsigned NumBits = VT.getSizeInBits();
18492   SDLoc dl(Op);
18493   Op = Op.getOperand(0);
18494
18495   // Issue a bsf (scan bits forward) which also sets EFLAGS.
18496   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
18497   Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op);
18498
18499   // If src is zero (i.e. bsf sets ZF), returns NumBits.
18500   SDValue Ops[] = {
18501     Op,
18502     DAG.getConstant(NumBits, VT),
18503     DAG.getConstant(X86::COND_E, MVT::i8),
18504     Op.getValue(1)
18505   };
18506   return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
18507 }
18508
18509 // Lower256IntArith - Break a 256-bit integer operation into two new 128-bit
18510 // ones, and then concatenate the result back.
18511 static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
18512   MVT VT = Op.getSimpleValueType();
18513
18514   assert(VT.is256BitVector() && VT.isInteger() &&
18515          "Unsupported value type for operation");
18516
18517   unsigned NumElems = VT.getVectorNumElements();
18518   SDLoc dl(Op);
18519
18520   // Extract the LHS vectors
18521   SDValue LHS = Op.getOperand(0);
18522   SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
18523   SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
18524
18525   // Extract the RHS vectors
18526   SDValue RHS = Op.getOperand(1);
18527   SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl);
18528   SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl);
18529
18530   MVT EltVT = VT.getVectorElementType();
18531   MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
18532
18533   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
18534                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
18535                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
18536 }
18537
18538 static SDValue LowerADD(SDValue Op, SelectionDAG &DAG) {
18539   assert(Op.getSimpleValueType().is256BitVector() &&
18540          Op.getSimpleValueType().isInteger() &&
18541          "Only handle AVX 256-bit vector integer operation");
18542   return Lower256IntArith(Op, DAG);
18543 }
18544
18545 static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) {
18546   assert(Op.getSimpleValueType().is256BitVector() &&
18547          Op.getSimpleValueType().isInteger() &&
18548          "Only handle AVX 256-bit vector integer operation");
18549   return Lower256IntArith(Op, DAG);
18550 }
18551
18552 static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
18553                         SelectionDAG &DAG) {
18554   SDLoc dl(Op);
18555   MVT VT = Op.getSimpleValueType();
18556
18557   // Decompose 256-bit ops into smaller 128-bit ops.
18558   if (VT.is256BitVector() && !Subtarget->hasInt256())
18559     return Lower256IntArith(Op, DAG);
18560
18561   SDValue A = Op.getOperand(0);
18562   SDValue B = Op.getOperand(1);
18563
18564   // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
18565   if (VT == MVT::v4i32) {
18566     assert(Subtarget->hasSSE2() && !Subtarget->hasSSE41() &&
18567            "Should not custom lower when pmuldq is available!");
18568
18569     // Extract the odd parts.
18570     static const int UnpackMask[] = { 1, -1, 3, -1 };
18571     SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
18572     SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
18573
18574     // Multiply the even parts.
18575     SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B);
18576     // Now multiply odd parts.
18577     SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds);
18578
18579     Evens = DAG.getNode(ISD::BITCAST, dl, VT, Evens);
18580     Odds = DAG.getNode(ISD::BITCAST, dl, VT, Odds);
18581
18582     // Merge the two vectors back together with a shuffle. This expands into 2
18583     // shuffles.
18584     static const int ShufMask[] = { 0, 4, 2, 6 };
18585     return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
18586   }
18587
18588   assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
18589          "Only know how to lower V2I64/V4I64/V8I64 multiply");
18590
18591   //  Ahi = psrlqi(a, 32);
18592   //  Bhi = psrlqi(b, 32);
18593   //
18594   //  AloBlo = pmuludq(a, b);
18595   //  AloBhi = pmuludq(a, Bhi);
18596   //  AhiBlo = pmuludq(Ahi, b);
18597
18598   //  AloBhi = psllqi(AloBhi, 32);
18599   //  AhiBlo = psllqi(AhiBlo, 32);
18600   //  return AloBlo + AloBhi + AhiBlo;
18601
18602   SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
18603   SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
18604
18605   // Bit cast to 32-bit vectors for MULUDQ
18606   EVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 :
18607                                   (VT == MVT::v4i64) ? MVT::v8i32 : MVT::v16i32;
18608   A = DAG.getNode(ISD::BITCAST, dl, MulVT, A);
18609   B = DAG.getNode(ISD::BITCAST, dl, MulVT, B);
18610   Ahi = DAG.getNode(ISD::BITCAST, dl, MulVT, Ahi);
18611   Bhi = DAG.getNode(ISD::BITCAST, dl, MulVT, Bhi);
18612
18613   SDValue AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
18614   SDValue AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
18615   SDValue AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
18616
18617   AloBhi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AloBhi, 32, DAG);
18618   AhiBlo = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AhiBlo, 32, DAG);
18619
18620   SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi);
18621   return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
18622 }
18623
18624 SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
18625   assert(Subtarget->isTargetWin64() && "Unexpected target");
18626   EVT VT = Op.getValueType();
18627   assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
18628          "Unexpected return type for lowering");
18629
18630   RTLIB::Libcall LC;
18631   bool isSigned;
18632   switch (Op->getOpcode()) {
18633   default: llvm_unreachable("Unexpected request for libcall!");
18634   case ISD::SDIV:      isSigned = true;  LC = RTLIB::SDIV_I128;    break;
18635   case ISD::UDIV:      isSigned = false; LC = RTLIB::UDIV_I128;    break;
18636   case ISD::SREM:      isSigned = true;  LC = RTLIB::SREM_I128;    break;
18637   case ISD::UREM:      isSigned = false; LC = RTLIB::UREM_I128;    break;
18638   case ISD::SDIVREM:   isSigned = true;  LC = RTLIB::SDIVREM_I128; break;
18639   case ISD::UDIVREM:   isSigned = false; LC = RTLIB::UDIVREM_I128; break;
18640   }
18641
18642   SDLoc dl(Op);
18643   SDValue InChain = DAG.getEntryNode();
18644
18645   TargetLowering::ArgListTy Args;
18646   TargetLowering::ArgListEntry Entry;
18647   for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
18648     EVT ArgVT = Op->getOperand(i).getValueType();
18649     assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
18650            "Unexpected argument type for lowering");
18651     SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
18652     Entry.Node = StackPtr;
18653     InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MachinePointerInfo(),
18654                            false, false, 16);
18655     Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
18656     Entry.Ty = PointerType::get(ArgTy,0);
18657     Entry.isSExt = false;
18658     Entry.isZExt = false;
18659     Args.push_back(Entry);
18660   }
18661
18662   SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
18663                                          getPointerTy());
18664
18665   TargetLowering::CallLoweringInfo CLI(DAG);
18666   CLI.setDebugLoc(dl).setChain(InChain)
18667     .setCallee(getLibcallCallingConv(LC),
18668                static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()),
18669                Callee, std::move(Args), 0)
18670     .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);
18671
18672   std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
18673   return DAG.getNode(ISD::BITCAST, dl, VT, CallInfo.first);
18674 }
18675
18676 static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget,
18677                              SelectionDAG &DAG) {
18678   SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
18679   EVT VT = Op0.getValueType();
18680   SDLoc dl(Op);
18681
18682   assert((VT == MVT::v4i32 && Subtarget->hasSSE2()) ||
18683          (VT == MVT::v8i32 && Subtarget->hasInt256()));
18684
18685   // PMULxD operations multiply each even value (starting at 0) of LHS with
18686   // the related value of RHS and produce a widen result.
18687   // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
18688   // => <2 x i64> <ae|cg>
18689   //
18690   // In other word, to have all the results, we need to perform two PMULxD:
18691   // 1. one with the even values.
18692   // 2. one with the odd values.
18693   // To achieve #2, with need to place the odd values at an even position.
18694   //
18695   // Place the odd value at an even position (basically, shift all values 1
18696   // step to the left):
18697   const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1};
18698   // <a|b|c|d> => <b|undef|d|undef>
18699   SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0, Mask);
18700   // <e|f|g|h> => <f|undef|h|undef>
18701   SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1, Mask);
18702
18703   // Emit two multiplies, one for the lower 2 ints and one for the higher 2
18704   // ints.
18705   MVT MulVT = VT == MVT::v4i32 ? MVT::v2i64 : MVT::v4i64;
18706   bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
18707   unsigned Opcode =
18708       (!IsSigned || !Subtarget->hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
18709   // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
18710   // => <2 x i64> <ae|cg>
18711   SDValue Mul1 = DAG.getNode(ISD::BITCAST, dl, VT,
18712                              DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
18713   // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
18714   // => <2 x i64> <bf|dh>
18715   SDValue Mul2 = DAG.getNode(ISD::BITCAST, dl, VT,
18716                              DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1));
18717
18718   // Shuffle it back into the right order.
18719   SDValue Highs, Lows;
18720   if (VT == MVT::v8i32) {
18721     const int HighMask[] = {1, 9, 3, 11, 5, 13, 7, 15};
18722     Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
18723     const int LowMask[] = {0, 8, 2, 10, 4, 12, 6, 14};
18724     Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
18725   } else {
18726     const int HighMask[] = {1, 5, 3, 7};
18727     Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
18728     const int LowMask[] = {0, 4, 2, 6};
18729     Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
18730   }
18731
18732   // If we have a signed multiply but no PMULDQ fix up the high parts of a
18733   // unsigned multiply.
18734   if (IsSigned && !Subtarget->hasSSE41()) {
18735     SDValue ShAmt =
18736         DAG.getConstant(31, DAG.getTargetLoweringInfo().getShiftAmountTy(VT));
18737     SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
18738                              DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
18739     SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
18740                              DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0);
18741
18742     SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
18743     Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
18744   }
18745
18746   // The first result of MUL_LOHI is actually the low value, followed by the
18747   // high value.
18748   SDValue Ops[] = {Lows, Highs};
18749   return DAG.getMergeValues(Ops, dl);
18750 }
18751
18752 static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
18753                                          const X86Subtarget *Subtarget) {
18754   MVT VT = Op.getSimpleValueType();
18755   SDLoc dl(Op);
18756   SDValue R = Op.getOperand(0);
18757   SDValue Amt = Op.getOperand(1);
18758
18759   // Optimize shl/srl/sra with constant shift amount.
18760   if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
18761     if (auto *ShiftConst = BVAmt->getConstantSplatNode()) {
18762       uint64_t ShiftAmt = ShiftConst->getZExtValue();
18763
18764       if (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 ||
18765           (Subtarget->hasInt256() &&
18766            (VT == MVT::v4i64 || VT == MVT::v8i32 || VT == MVT::v16i16)) ||
18767           (Subtarget->hasAVX512() &&
18768            (VT == MVT::v8i64 || VT == MVT::v16i32))) {
18769         if (Op.getOpcode() == ISD::SHL)
18770           return getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, R, ShiftAmt,
18771                                             DAG);
18772         if (Op.getOpcode() == ISD::SRL)
18773           return getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt,
18774                                             DAG);
18775         if (Op.getOpcode() == ISD::SRA && VT != MVT::v2i64 && VT != MVT::v4i64)
18776           return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, ShiftAmt,
18777                                             DAG);
18778       }
18779
18780       if (VT == MVT::v16i8) {
18781         if (Op.getOpcode() == ISD::SHL) {
18782           // Make a large shift.
18783           SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl,
18784                                                    MVT::v8i16, R, ShiftAmt,
18785                                                    DAG);
18786           SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL);
18787           // Zero out the rightmost bits.
18788           SmallVector<SDValue, 16> V(16,
18789                                      DAG.getConstant(uint8_t(-1U << ShiftAmt),
18790                                                      MVT::i8));
18791           return DAG.getNode(ISD::AND, dl, VT, SHL,
18792                              DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
18793         }
18794         if (Op.getOpcode() == ISD::SRL) {
18795           // Make a large shift.
18796           SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl,
18797                                                    MVT::v8i16, R, ShiftAmt,
18798                                                    DAG);
18799           SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL);
18800           // Zero out the leftmost bits.
18801           SmallVector<SDValue, 16> V(16,
18802                                      DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
18803                                                      MVT::i8));
18804           return DAG.getNode(ISD::AND, dl, VT, SRL,
18805                              DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
18806         }
18807         if (Op.getOpcode() == ISD::SRA) {
18808           if (ShiftAmt == 7) {
18809             // R s>> 7  ===  R s< 0
18810             SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
18811             return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
18812           }
18813
18814           // R s>> a === ((R u>> a) ^ m) - m
18815           SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
18816           SmallVector<SDValue, 16> V(16, DAG.getConstant(128 >> ShiftAmt,
18817                                                          MVT::i8));
18818           SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V);
18819           Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
18820           Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
18821           return Res;
18822         }
18823         llvm_unreachable("Unknown shift opcode.");
18824       }
18825
18826       if (Subtarget->hasInt256() && VT == MVT::v32i8) {
18827         if (Op.getOpcode() == ISD::SHL) {
18828           // Make a large shift.
18829           SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl,
18830                                                    MVT::v16i16, R, ShiftAmt,
18831                                                    DAG);
18832           SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL);
18833           // Zero out the rightmost bits.
18834           SmallVector<SDValue, 32> V(32,
18835                                      DAG.getConstant(uint8_t(-1U << ShiftAmt),
18836                                                      MVT::i8));
18837           return DAG.getNode(ISD::AND, dl, VT, SHL,
18838                              DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
18839         }
18840         if (Op.getOpcode() == ISD::SRL) {
18841           // Make a large shift.
18842           SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl,
18843                                                    MVT::v16i16, R, ShiftAmt,
18844                                                    DAG);
18845           SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL);
18846           // Zero out the leftmost bits.
18847           SmallVector<SDValue, 32> V(32,
18848                                      DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
18849                                                      MVT::i8));
18850           return DAG.getNode(ISD::AND, dl, VT, SRL,
18851                              DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
18852         }
18853         if (Op.getOpcode() == ISD::SRA) {
18854           if (ShiftAmt == 7) {
18855             // R s>> 7  ===  R s< 0
18856             SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
18857             return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
18858           }
18859
18860           // R s>> a === ((R u>> a) ^ m) - m
18861           SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
18862           SmallVector<SDValue, 32> V(32, DAG.getConstant(128 >> ShiftAmt,
18863                                                          MVT::i8));
18864           SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V);
18865           Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
18866           Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
18867           return Res;
18868         }
18869         llvm_unreachable("Unknown shift opcode.");
18870       }
18871     }
18872   }
18873
18874   // Special case in 32-bit mode, where i64 is expanded into high and low parts.
18875   if (!Subtarget->is64Bit() &&
18876       (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64)) &&
18877       Amt.getOpcode() == ISD::BITCAST &&
18878       Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
18879     Amt = Amt.getOperand(0);
18880     unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
18881                      VT.getVectorNumElements();
18882     unsigned RatioInLog2 = Log2_32_Ceil(Ratio);
18883     uint64_t ShiftAmt = 0;
18884     for (unsigned i = 0; i != Ratio; ++i) {
18885       ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i));
18886       if (!C)
18887         return SDValue();
18888       // 6 == Log2(64)
18889       ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2)));
18890     }
18891     // Check remaining shift amounts.
18892     for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
18893       uint64_t ShAmt = 0;
18894       for (unsigned j = 0; j != Ratio; ++j) {
18895         ConstantSDNode *C =
18896           dyn_cast<ConstantSDNode>(Amt.getOperand(i + j));
18897         if (!C)
18898           return SDValue();
18899         // 6 == Log2(64)
18900         ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2)));
18901       }
18902       if (ShAmt != ShiftAmt)
18903         return SDValue();
18904     }
18905     switch (Op.getOpcode()) {
18906     default:
18907       llvm_unreachable("Unknown shift opcode!");
18908     case ISD::SHL:
18909       return getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, R, ShiftAmt,
18910                                         DAG);
18911     case ISD::SRL:
18912       return getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt,
18913                                         DAG);
18914     case ISD::SRA:
18915       return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, ShiftAmt,
18916                                         DAG);
18917     }
18918   }
18919
18920   return SDValue();
18921 }
18922
18923 static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
18924                                         const X86Subtarget* Subtarget) {
18925   MVT VT = Op.getSimpleValueType();
18926   SDLoc dl(Op);
18927   SDValue R = Op.getOperand(0);
18928   SDValue Amt = Op.getOperand(1);
18929
18930   if ((VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) ||
18931       VT == MVT::v4i32 || VT == MVT::v8i16 ||
18932       (Subtarget->hasInt256() &&
18933        ((VT == MVT::v4i64 && Op.getOpcode() != ISD::SRA) ||
18934         VT == MVT::v8i32 || VT == MVT::v16i16)) ||
18935        (Subtarget->hasAVX512() && (VT == MVT::v8i64 || VT == MVT::v16i32))) {
18936     SDValue BaseShAmt;
18937     EVT EltVT = VT.getVectorElementType();
18938
18939     if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Amt)) {
18940       // Check if this build_vector node is doing a splat.
18941       // If so, then set BaseShAmt equal to the splat value.
18942       BaseShAmt = BV->getSplatValue();
18943       if (BaseShAmt && BaseShAmt.getOpcode() == ISD::UNDEF)
18944         BaseShAmt = SDValue();
18945     } else {
18946       if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR)
18947         Amt = Amt.getOperand(0);
18948
18949       ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt);
18950       if (SVN && SVN->isSplat()) {
18951         unsigned SplatIdx = (unsigned)SVN->getSplatIndex();
18952         SDValue InVec = Amt.getOperand(0);
18953         if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
18954           assert((SplatIdx < InVec.getValueType().getVectorNumElements()) &&
18955                  "Unexpected shuffle index found!");
18956           BaseShAmt = InVec.getOperand(SplatIdx);
18957         } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
18958            if (ConstantSDNode *C =
18959                dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
18960              if (C->getZExtValue() == SplatIdx)
18961                BaseShAmt = InVec.getOperand(1);
18962            }
18963         }
18964
18965         if (!BaseShAmt)
18966           // Avoid introducing an extract element from a shuffle.
18967           BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InVec,
18968                                     DAG.getIntPtrConstant(SplatIdx));
18969       }
18970     }
18971
18972     if (BaseShAmt.getNode()) {
18973       assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
18974       if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
18975         BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
18976       else if (EltVT.bitsLT(MVT::i32))
18977         BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
18978
18979       switch (Op.getOpcode()) {
18980       default:
18981         llvm_unreachable("Unknown shift opcode!");
18982       case ISD::SHL:
18983         switch (VT.SimpleTy) {
18984         default: return SDValue();
18985         case MVT::v2i64:
18986         case MVT::v4i32:
18987         case MVT::v8i16:
18988         case MVT::v4i64:
18989         case MVT::v8i32:
18990         case MVT::v16i16:
18991         case MVT::v16i32:
18992         case MVT::v8i64:
18993           return getTargetVShiftNode(X86ISD::VSHLI, dl, VT, R, BaseShAmt, DAG);
18994         }
18995       case ISD::SRA:
18996         switch (VT.SimpleTy) {
18997         default: return SDValue();
18998         case MVT::v4i32:
18999         case MVT::v8i16:
19000         case MVT::v8i32:
19001         case MVT::v16i16:
19002         case MVT::v16i32:
19003         case MVT::v8i64:
19004           return getTargetVShiftNode(X86ISD::VSRAI, dl, VT, R, BaseShAmt, DAG);
19005         }
19006       case ISD::SRL:
19007         switch (VT.SimpleTy) {
19008         default: return SDValue();
19009         case MVT::v2i64:
19010         case MVT::v4i32:
19011         case MVT::v8i16:
19012         case MVT::v4i64:
19013         case MVT::v8i32:
19014         case MVT::v16i16:
19015         case MVT::v16i32:
19016         case MVT::v8i64:
19017           return getTargetVShiftNode(X86ISD::VSRLI, dl, VT, R, BaseShAmt, DAG);
19018         }
19019       }
19020     }
19021   }
19022
19023   // Special case in 32-bit mode, where i64 is expanded into high and low parts.
19024   if (!Subtarget->is64Bit() &&
19025       (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64) ||
19026       (Subtarget->hasAVX512() && VT == MVT::v8i64)) &&
19027       Amt.getOpcode() == ISD::BITCAST &&
19028       Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
19029     Amt = Amt.getOperand(0);
19030     unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
19031                      VT.getVectorNumElements();
19032     std::vector<SDValue> Vals(Ratio);
19033     for (unsigned i = 0; i != Ratio; ++i)
19034       Vals[i] = Amt.getOperand(i);
19035     for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
19036       for (unsigned j = 0; j != Ratio; ++j)
19037         if (Vals[j] != Amt.getOperand(i + j))
19038           return SDValue();
19039     }
19040     switch (Op.getOpcode()) {
19041     default:
19042       llvm_unreachable("Unknown shift opcode!");
19043     case ISD::SHL:
19044       return DAG.getNode(X86ISD::VSHL, dl, VT, R, Op.getOperand(1));
19045     case ISD::SRL:
19046       return DAG.getNode(X86ISD::VSRL, dl, VT, R, Op.getOperand(1));
19047     case ISD::SRA:
19048       return DAG.getNode(X86ISD::VSRA, dl, VT, R, Op.getOperand(1));
19049     }
19050   }
19051
19052   return SDValue();
19053 }
19054
/// Custom-lower a vector shift node (ISD::SHL, ISD::SRL or ISD::SRA).
///
/// The strategies below are tried in source order and the first one that
/// applies produces the result:
///  1. LowerScalarImmediateShift, then LowerScalarVariableShift;
///  2. return Op itself for the AVX-512 / AVX2 type+opcode pairs listed;
///  3. SHL by a constant build_vector -> MUL by a vector of powers of two;
///  4. v4i32 SHL -> build 2^Amt through the float exponent field, then MUL;
///  5. v8i16 / v4i32 with a two-valued constant amount -> two splat shifts
///     combined with MOVSS / MOVSD;
///  6. v16i8 SHL -> three conditional shift steps (4, 2, 1) chosen by VSELECT;
///  7. v8i16 with hasInt256() -> extend to v8i32, shift, truncate;
///  8. any 256-bit vector -> two 128-bit shifts concatenated.
///
/// \param Op         The shift node; asserted to have a vector type.
/// \param Subtarget  Feature queries; asserted to have SSE2.
/// \param DAG        DAG used to create replacement nodes.
/// \returns The lowered value, Op itself (case 2), or an empty SDValue when no
///          strategy applies.
static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
                          SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);
  SDValue R = Op.getOperand(0);
  SDValue Amt = Op.getOperand(1);
  SDValue V;

  assert(VT.isVector() && "Custom lowering only for vector shifts!");
  assert(Subtarget->hasSSE2() && "Only custom lower when we have SSE2!");

  // First try the two splat-amount lowerings; each returns an empty SDValue
  // when it does not apply.
  V = LowerScalarImmediateShift(Op, DAG, Subtarget);
  if (V.getNode())
    return V;

  V = LowerScalarVariableShift(Op, DAG, Subtarget);
  if (V.getNode())
      return V;

  // For these type/opcode combinations the node is handed back unchanged.
  // NOTE(review): presumably this tells the caller the node is already legal
  // for instruction selection -- confirm against the LowerOperation contract.
  if (Subtarget->hasAVX512() && (VT == MVT::v16i32 || VT == MVT::v8i64))
    return Op;
  // AVX2 has VPSLLV/VPSRAV/VPSRLV.
  if (Subtarget->hasInt256()) {
    if (Op.getOpcode() == ISD::SRL &&
        (VT == MVT::v2i64 || VT == MVT::v4i32 ||
         VT == MVT::v4i64 || VT == MVT::v8i32))
      return Op;
    if (Op.getOpcode() == ISD::SHL &&
        (VT == MVT::v2i64 || VT == MVT::v4i32 ||
         VT == MVT::v4i64 || VT == MVT::v8i32))
      return Op;
    // Arithmetic right shift is only passed through for 32-bit elements.
    if (Op.getOpcode() == ISD::SRA && (VT == MVT::v4i32 || VT == MVT::v8i32))
      return Op;
  }

  // If possible, lower this packed shift into a vector multiply instead of
  // expanding it into a sequence of scalar shifts.
  // Do this only if the vector shift count is a constant build_vector.
  if (Op.getOpcode() == ISD::SHL &&
      (VT == MVT::v8i16 || VT == MVT::v4i32 ||
       (Subtarget->hasInt256() && VT == MVT::v16i16)) &&
      ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
    SmallVector<SDValue, 8> Elts;
    EVT SVT = VT.getScalarType();
    unsigned SVTBits = SVT.getSizeInBits();
    const APInt &One = APInt(SVTBits, 1);
    unsigned NumElems = VT.getVectorNumElements();

    // Build the multiplier vector lane by lane: 1 << amount for each lane.
    for (unsigned i=0; i !=NumElems; ++i) {
      // Note: this local deliberately-or-not shadows the function parameter Op
      // for the duration of the loop body.
      SDValue Op = Amt->getOperand(i);
      // An UNDEF amount stays UNDEF in the multiplier.
      if (Op->getOpcode() == ISD::UNDEF) {
        Elts.push_back(Op);
        continue;
      }

      ConstantSDNode *ND = cast<ConstantSDNode>(Op);
      const APInt &C = APInt(SVTBits, ND->getAPIntValue().getZExtValue());
      uint64_t ShAmt = C.getZExtValue();
      // An amount that is not smaller than the element width gets an UNDEF
      // multiplier lane.
      if (ShAmt >= SVTBits) {
        Elts.push_back(DAG.getUNDEF(SVT));
        continue;
      }
      Elts.push_back(DAG.getConstant(One.shl(ShAmt), SVT));
    }
    SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Elts);
    return DAG.getNode(ISD::MUL, dl, VT, R, BV);
  }

  // Lower SHL with variable shift amount.
  // Amt << 23 places the amount in the exponent field of a single-precision
  // float; adding 0x3f800000 (the bit pattern of 1.0f) yields the float 2^Amt,
  // which is converted back to an integer and used as a multiplier. From here
  // on Op holds the intermediate value, not the original shift node.
  if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
    Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, VT));

    Op = DAG.getNode(ISD::ADD, dl, VT, Op, DAG.getConstant(0x3f800000U, VT));
    Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, Op);
    Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
    return DAG.getNode(ISD::MUL, dl, VT, Op, R);
  }

  // If possible, lower this shift as a sequence of two shifts by
  // constant plus a MOVSS/MOVSD instead of scalarizing it.
  // Example:
  //   (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
  //
  // Could be rewritten as:
  //   (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
  //
  // The advantage is that the two shifts from the example would be
  // lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing
  // the vector shift into four scalar shifts plus four pairs of vector
  // insert/extract.
  if ((VT == MVT::v8i16 || VT == MVT::v4i32) &&
      ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
    unsigned TargetOpcode = X86ISD::MOVSS;
    bool CanBeSimplified;
    // The splat value for the first packed shift (the 'X' from the example).
    SDValue Amt1 = Amt->getOperand(0);
    // The splat value for the second packed shift (the 'Y' from the example).
    SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) :
                                        Amt->getOperand(2);

    // See if it is possible to replace this node with a sequence of
    // two shifts followed by a MOVSS/MOVSD
    if (VT == MVT::v4i32) {
      // Check if it is legal to use a MOVSS.
      // MOVSS shape: lanes 1..3 all share the amount Amt2.
      CanBeSimplified = Amt2 == Amt->getOperand(2) &&
                        Amt2 == Amt->getOperand(3);
      if (!CanBeSimplified) {
        // Otherwise, check if we can still simplify this node using a MOVSD.
        // MOVSD shape: lanes 0,1 share Amt1 and lanes 2,3 share one amount.
        CanBeSimplified = Amt1 == Amt->getOperand(1) &&
                          Amt->getOperand(2) == Amt->getOperand(3);
        TargetOpcode = X86ISD::MOVSD;
        Amt2 = Amt->getOperand(2);
      }
    } else {
      // Do similar checks for the case where the machine value type
      // is MVT::v8i16.
      // MOVSS shape: lanes 0,1 share Amt1 and lanes 2..7 share Amt2 (Amt2 was
      // taken from lane 2, so the loop only needs to start at lane 3).
      CanBeSimplified = Amt1 == Amt->getOperand(1);
      for (unsigned i=3; i != 8 && CanBeSimplified; ++i)
        CanBeSimplified = Amt2 == Amt->getOperand(i);

      if (!CanBeSimplified) {
        // MOVSD shape: lanes 0..3 share Amt1 and lanes 4..7 share Amt2.
        TargetOpcode = X86ISD::MOVSD;
        CanBeSimplified = true;
        Amt2 = Amt->getOperand(4);
        for (unsigned i=0; i != 4 && CanBeSimplified; ++i)
          CanBeSimplified = Amt1 == Amt->getOperand(i);
        for (unsigned j=4; j != 8 && CanBeSimplified; ++j)
          CanBeSimplified = Amt2 == Amt->getOperand(j);
      }
    }

    // Both representative amounts must be real constants (not UNDEF).
    if (CanBeSimplified && isa<ConstantSDNode>(Amt1) &&
        isa<ConstantSDNode>(Amt2)) {
      // Replace this node with two shifts followed by a MOVSS/MOVSD.
      EVT CastVT = MVT::v4i32;
      SDValue Splat1 =
        DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), VT);
      SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
      SDValue Splat2 =
        DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), VT);
      SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
      // MOVSD operates on 64-bit lanes, so view both shifts as v2i64 for it.
      if (TargetOpcode == X86ISD::MOVSD)
        CastVT = MVT::v2i64;
      SDValue BitCast1 = DAG.getNode(ISD::BITCAST, dl, CastVT, Shift1);
      SDValue BitCast2 = DAG.getNode(ISD::BITCAST, dl, CastVT, Shift2);
      SDValue Result = getTargetShuffleNode(TargetOpcode, dl, CastVT, BitCast2,
                                            BitCast1, DAG);
      return DAG.getNode(ISD::BITCAST, dl, VT, Result);
    }
  }

  // v16i8 SHL by a per-lane amount: apply conditional shifts by 4, 2 and 1,
  // each selected by one bit of the amount. The amount is first moved up by 5
  // so that the bit under test sits at 0x80; "a += a" then exposes the next
  // lower bit. From here on Op holds the shifted amount, not the shift node.
  if (VT == MVT::v16i8 && Op->getOpcode() == ISD::SHL) {
    assert(Subtarget->hasSSE2() && "Need SSE2 for pslli/pcmpeq.");

    // a = a << 5;
    Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(5, VT));
    Op = DAG.getNode(ISD::BITCAST, dl, VT, Op);

    // Turn 'a' into a mask suitable for VSELECT
    SDValue VSelM = DAG.getConstant(0x80, VT);
    SDValue OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
    OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);

    // Pre-masks that drop the bits a 16-bit-lane shift would otherwise carry
    // into the neighbouring byte (by 4 -> keep low 4 bits, by 2 -> low 6).
    SDValue CM1 = DAG.getConstant(0x0f, VT);
    SDValue CM2 = DAG.getConstant(0x3f, VT);

    // r = VSELECT(r, psllw(r & (char16)15, 4), a);
    SDValue M = DAG.getNode(ISD::AND, dl, VT, R, CM1);
    M = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, MVT::v8i16, M, 4, DAG);
    M = DAG.getNode(ISD::BITCAST, dl, VT, M);
    R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R);

    // a += a
    Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);
    OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
    OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);

    // r = VSELECT(r, psllw(r & (char16)63, 2), a);
    M = DAG.getNode(ISD::AND, dl, VT, R, CM2);
    M = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, MVT::v8i16, M, 2, DAG);
    M = DAG.getNode(ISD::BITCAST, dl, VT, M);
    R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R);

    // a += a
    Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);
    OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
    OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);

    // return VSELECT(r, r+r, a);
    R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel,
                    DAG.getNode(ISD::ADD, dl, VT, R, R), R);
    return R;
  }

  // It's worth extending once and using the v8i32 shifts for 16-bit types, but
  // the extra overheads to get from v16i8 to v8i32 make the existing SSE
  // solution better.
  if (Subtarget->hasInt256() && VT == MVT::v8i16) {
    // The guard above fixes VT to v8i16, so NewVT is always v8i32 here.
    MVT NewVT = VT == MVT::v8i16 ? MVT::v8i32 : MVT::v16i16;
    // Sign-extend the value for an arithmetic shift, zero-extend otherwise.
    unsigned ExtOpc =
        Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
    R = DAG.getNode(ExtOpc, dl, NewVT, R);
    Amt = DAG.getNode(ISD::ANY_EXTEND, dl, NewVT, Amt);
    return DAG.getNode(ISD::TRUNCATE, dl, VT,
                       DAG.getNode(Op.getOpcode(), dl, NewVT, R, Amt));
    }

  // Decompose 256-bit shifts into smaller 128-bit shifts.
  if (VT.is256BitVector()) {
    unsigned NumElems = VT.getVectorNumElements();
    MVT EltVT = VT.getVectorElementType();
    EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);

    // Extract the two vectors
    SDValue V1 = Extract128BitVector(R, 0, DAG, dl);
    SDValue V2 = Extract128BitVector(R, NumElems/2, DAG, dl);

    // Recreate the shift amount vectors
    SDValue Amt1, Amt2;
    if (Amt.getOpcode() == ISD::BUILD_VECTOR) {
      // Constant shift amount
      // Split the build_vector operands into a low and a high half instead of
      // extracting subvectors from it.
      SmallVector<SDValue, 4> Amt1Csts;
      SmallVector<SDValue, 4> Amt2Csts;
      for (unsigned i = 0; i != NumElems/2; ++i)
        Amt1Csts.push_back(Amt->getOperand(i));
      for (unsigned i = NumElems/2; i != NumElems; ++i)
        Amt2Csts.push_back(Amt->getOperand(i));

      Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt1Csts);
      Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt2Csts);
    } else {
      // Variable shift amount
      Amt1 = Extract128BitVector(Amt, 0, DAG, dl);
      Amt2 = Extract128BitVector(Amt, NumElems/2, DAG, dl);
    }

    // Issue new vector shifts for the smaller types
    V1 = DAG.getNode(Op.getOpcode(), dl, NewVT, V1, Amt1);
    V2 = DAG.getNode(Op.getOpcode(), dl, NewVT, V2, Amt2);

    // Concatenate the result back
    return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, V1, V2);
  }

  // No strategy applied.
  return SDValue();
}
19301
19302 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
19303   // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
19304   // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
19305   // looks for this combo and may remove the "setcc" instruction if the "setcc"
19306   // has only one use.
19307   SDNode *N = Op.getNode();
19308   SDValue LHS = N->getOperand(0);
19309   SDValue RHS = N->getOperand(1);
19310   unsigned BaseOp = 0;
19311   unsigned Cond = 0;
19312   SDLoc DL(Op);
19313   switch (Op.getOpcode()) {
19314   default: llvm_unreachable("Unknown ovf instruction!");
19315   case ISD::SADDO:
19316     // A subtract of one will be selected as a INC. Note that INC doesn't
19317     // set CF, so we can't do this for UADDO.
19318     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
19319       if (C->isOne()) {
19320         BaseOp = X86ISD::INC;
19321         Cond = X86::COND_O;
19322         break;
19323       }
19324     BaseOp = X86ISD::ADD;
19325     Cond = X86::COND_O;
19326     break;
19327   case ISD::UADDO:
19328     BaseOp = X86ISD::ADD;
19329     Cond = X86::COND_B;
19330     break;
19331   case ISD::SSUBO:
19332     // A subtract of one will be selected as a DEC. Note that DEC doesn't
19333     // set CF, so we can't do this for USUBO.
19334     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
19335       if (C->isOne()) {
19336         BaseOp = X86ISD::DEC;
19337         Cond = X86::COND_O;
19338         break;
19339       }
19340     BaseOp = X86ISD::SUB;
19341     Cond = X86::COND_O;
19342     break;
19343   case ISD::USUBO:
19344     BaseOp = X86ISD::SUB;
19345     Cond = X86::COND_B;
19346     break;
19347   case ISD::SMULO:
19348     BaseOp = N->getValueType(0) == MVT::i8 ? X86ISD::SMUL8 : X86ISD::SMUL;
19349     Cond = X86::COND_O;
19350     break;
19351   case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
19352     if (N->getValueType(0) == MVT::i8) {
19353       BaseOp = X86ISD::UMUL8;
19354       Cond = X86::COND_O;
19355       break;
19356     }
19357     SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0),
19358                                  MVT::i32);
19359     SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS);
19360
19361     SDValue SetCC =
19362       DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
19363                   DAG.getConstant(X86::COND_O, MVT::i32),
19364                   SDValue(Sum.getNode(), 2));
19365
19366     return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
19367   }
19368   }
19369
19370   // Also sets EFLAGS.
19371   SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
19372   SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
19373
19374   SDValue SetCC =
19375     DAG.getNode(X86ISD::SETCC, DL, N->getValueType(1),
19376                 DAG.getConstant(Cond, MVT::i32),
19377                 SDValue(Sum.getNode(), 1));
19378
19379   return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
19380 }
19381
19382 // Sign extension of the low part of vector elements. This may be used either
19383 // when sign extend instructions are not available or if the vector element
19384 // sizes already match the sign-extended size. If the vector elements are in
19385 // their pre-extended size and sign extend instructions are available, that will
19386 // be handled by LowerSIGN_EXTEND.
19387 SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
19388                                                   SelectionDAG &DAG) const {
19389   SDLoc dl(Op);
19390   EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
19391   MVT VT = Op.getSimpleValueType();
19392
19393   if (!Subtarget->hasSSE2() || !VT.isVector())
19394     return SDValue();
19395
19396   unsigned BitsDiff = VT.getScalarType().getSizeInBits() -
19397                       ExtraVT.getScalarType().getSizeInBits();
19398
19399   switch (VT.SimpleTy) {
19400     default: return SDValue();
19401     case MVT::v8i32:
19402     case MVT::v16i16:
19403       if (!Subtarget->hasFp256())
19404         return SDValue();
19405       if (!Subtarget->hasInt256()) {
19406         // needs to be split
19407         unsigned NumElems = VT.getVectorNumElements();
19408
19409         // Extract the LHS vectors
19410         SDValue LHS = Op.getOperand(0);
19411         SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
19412         SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
19413
19414         MVT EltVT = VT.getVectorElementType();
19415         EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
19416
19417         EVT ExtraEltVT = ExtraVT.getVectorElementType();
19418         unsigned ExtraNumElems = ExtraVT.getVectorNumElements();
19419         ExtraVT = EVT::getVectorVT(*DAG.getContext(), ExtraEltVT,
19420                                    ExtraNumElems/2);
19421         SDValue Extra = DAG.getValueType(ExtraVT);
19422
19423         LHS1 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, Extra);
19424         LHS2 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, Extra);
19425
19426         return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LHS1, LHS2);
19427       }
19428       // fall through
19429     case MVT::v4i32:
19430     case MVT::v8i16: {
19431       SDValue Op0 = Op.getOperand(0);
19432
19433       // This is a sign extension of some low part of vector elements without
19434       // changing the size of the vector elements themselves:
19435       // Shift-Left + Shift-Right-Algebraic.
19436       SDValue Shl = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Op0,
19437                                                BitsDiff, DAG);
19438       return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Shl, BitsDiff,
19439                                         DAG);
19440     }
19441   }
19442 }
19443
19444 /// Returns true if the operand type is exactly twice the native width, and
19445 /// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
19446 /// Used to know whether to use cmpxchg8/16b when expanding atomic operations
19447 /// (otherwise we leave them alone to become __sync_fetch_and_... calls).
19448 bool X86TargetLowering::needsCmpXchgNb(const Type *MemType) const {
19449   unsigned OpWidth = MemType->getPrimitiveSizeInBits();
19450
19451   if (OpWidth == 64)
19452     return !Subtarget->is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b
19453   else if (OpWidth == 128)
19454     return Subtarget->hasCmpxchg16b();
19455   else
19456     return false;
19457 }
19458
19459 bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
19460   return needsCmpXchgNb(SI->getValueOperand()->getType());
19461 }
19462
19463 // Note: this turns large loads into lock cmpxchg8b/16b.
19464 // FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b.
19465 bool X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
19466   auto PTy = cast<PointerType>(LI->getPointerOperand()->getType());
19467   return needsCmpXchgNb(PTy->getElementType());
19468 }
19469
19470 bool X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
19471   unsigned NativeWidth = Subtarget->is64Bit() ? 64 : 32;
19472   const Type *MemType = AI->getType();
19473
19474   // If the operand is too big, we must see if cmpxchg8/16b is available
19475   // and default to library calls otherwise.
19476   if (MemType->getPrimitiveSizeInBits() > NativeWidth)
19477     return needsCmpXchgNb(MemType);
19478
19479   AtomicRMWInst::BinOp Op = AI->getOperation();
19480   switch (Op) {
19481   default:
19482     llvm_unreachable("Unknown atomic operation");
19483   case AtomicRMWInst::Xchg:
19484   case AtomicRMWInst::Add:
19485   case AtomicRMWInst::Sub:
19486     // It's better to use xadd, xsub or xchg for these in all cases.
19487     return false;
19488   case AtomicRMWInst::Or:
19489   case AtomicRMWInst::And:
19490   case AtomicRMWInst::Xor:
19491     // If the atomicrmw's result isn't actually used, we can just add a "lock"
19492     // prefix to a normal instruction for these operations.
19493     return !AI->use_empty();
19494   case AtomicRMWInst::Nand:
19495   case AtomicRMWInst::Max:
19496   case AtomicRMWInst::Min:
19497   case AtomicRMWInst::UMax:
19498   case AtomicRMWInst::UMin:
19499     // These always require a non-trivial set of data operations on x86. We must
19500     // use a cmpxchg loop.
19501     return true;
19502   }
19503 }
19504
19505 static bool hasMFENCE(const X86Subtarget& Subtarget) {
19506   // Use mfence if we have SSE2 or we're on x86-64 (even if we asked for
19507   // no-sse2). There isn't any reason to disable it if the target processor
19508   // supports it.
19509   return Subtarget.hasSSE2() || Subtarget.is64Bit();
19510 }
19511
19512 LoadInst *
19513 X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
19514   unsigned NativeWidth = Subtarget->is64Bit() ? 64 : 32;
19515   const Type *MemType = AI->getType();
19516   // Accesses larger than the native width are turned into cmpxchg/libcalls, so
19517   // there is no benefit in turning such RMWs into loads, and it is actually
19518   // harmful as it introduces a mfence.
19519   if (MemType->getPrimitiveSizeInBits() > NativeWidth)
19520     return nullptr;
19521
19522   auto Builder = IRBuilder<>(AI);
19523   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
19524   auto SynchScope = AI->getSynchScope();
19525   // We must restrict the ordering to avoid generating loads with Release or
19526   // ReleaseAcquire orderings.
19527   auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
19528   auto Ptr = AI->getPointerOperand();
19529
19530   // Before the load we need a fence. Here is an example lifted from
19531   // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
19532   // is required:
19533   // Thread 0:
19534   //   x.store(1, relaxed);
19535   //   r1 = y.fetch_add(0, release);
19536   // Thread 1:
19537   //   y.fetch_add(42, acquire);
19538   //   r2 = x.load(relaxed);
19539   // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
19540   // lowered to just a load without a fence. A mfence flushes the store buffer,
19541   // making the optimization clearly correct.
19542   // FIXME: it is required if isAtLeastRelease(Order) but it is not clear
19543   // otherwise, we might be able to be more agressive on relaxed idempotent
19544   // rmw. In practice, they do not look useful, so we don't try to be
19545   // especially clever.
19546   if (SynchScope == SingleThread) {
19547     // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
19548     // the IR level, so we must wrap it in an intrinsic.
19549     return nullptr;
19550   } else if (hasMFENCE(*Subtarget)) {
19551     Function *MFence = llvm::Intrinsic::getDeclaration(M,
19552             Intrinsic::x86_sse2_mfence);
19553     Builder.CreateCall(MFence);
19554   } else {
19555     // FIXME: it might make sense to use a locked operation here but on a
19556     // different cache-line to prevent cache-line bouncing. In practice it
19557     // is probably a small win, and x86 processors without mfence are rare
19558     // enough that we do not bother.
19559     return nullptr;
19560   }
19561
19562   // Finally we can emit the atomic load.
19563   LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr,
19564           AI->getType()->getPrimitiveSizeInBits());
19565   Loaded->setAtomic(Order, SynchScope);
19566   AI->replaceAllUsesWith(Loaded);
19567   AI->eraseFromParent();
19568   return Loaded;
19569 }
19570
19571 static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget,
19572                                  SelectionDAG &DAG) {
19573   SDLoc dl(Op);
19574   AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
19575     cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
19576   SynchronizationScope FenceScope = static_cast<SynchronizationScope>(
19577     cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
19578
19579   // The only fence that needs an instruction is a sequentially-consistent
19580   // cross-thread fence.
19581   if (FenceOrdering == SequentiallyConsistent && FenceScope == CrossThread) {
19582     if (hasMFENCE(*Subtarget))
19583       return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
19584
19585     SDValue Chain = Op.getOperand(0);
19586     SDValue Zero = DAG.getConstant(0, MVT::i32);
19587     SDValue Ops[] = {
19588       DAG.getRegister(X86::ESP, MVT::i32), // Base
19589       DAG.getTargetConstant(1, MVT::i8),   // Scale
19590       DAG.getRegister(0, MVT::i32),        // Index
19591       DAG.getTargetConstant(0, MVT::i32),  // Disp
19592       DAG.getRegister(0, MVT::i32),        // Segment.
19593       Zero,
19594       Chain
19595     };
19596     SDNode *Res = DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops);
19597     return SDValue(Res, 0);
19598   }
19599
19600   // MEMBARRIER is a compiler barrier; it codegens to a no-op.
19601   return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
19602 }
19603
19604 static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget,
19605                              SelectionDAG &DAG) {
19606   MVT T = Op.getSimpleValueType();
19607   SDLoc DL(Op);
19608   unsigned Reg = 0;
19609   unsigned size = 0;
19610   switch(T.SimpleTy) {
19611   default: llvm_unreachable("Invalid value type!");
19612   case MVT::i8:  Reg = X86::AL;  size = 1; break;
19613   case MVT::i16: Reg = X86::AX;  size = 2; break;
19614   case MVT::i32: Reg = X86::EAX; size = 4; break;
19615   case MVT::i64:
19616     assert(Subtarget->is64Bit() && "Node not type legal!");
19617     Reg = X86::RAX; size = 8;
19618     break;
19619   }
19620   SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
19621                                   Op.getOperand(2), SDValue());
19622   SDValue Ops[] = { cpIn.getValue(0),
19623                     Op.getOperand(1),
19624                     Op.getOperand(3),
19625                     DAG.getTargetConstant(size, MVT::i8),
19626                     cpIn.getValue(1) };
19627   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
19628   MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
19629   SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
19630                                            Ops, T, MMO);
19631
19632   SDValue cpOut =
19633     DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
19634   SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
19635                                       MVT::i32, cpOut.getValue(2));
19636   SDValue Success = DAG.getNode(X86ISD::SETCC, DL, Op->getValueType(1),
19637                                 DAG.getConstant(X86::COND_E, MVT::i8), EFLAGS);
19638
19639   DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut);
19640   DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
19641   DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1));
19642   return SDValue();
19643 }
19644
19645 static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget,
19646                             SelectionDAG &DAG) {
19647   MVT SrcVT = Op.getOperand(0).getSimpleValueType();
19648   MVT DstVT = Op.getSimpleValueType();
19649
19650   if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) {
19651     assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
19652     if (DstVT != MVT::f64)
19653       // This conversion needs to be expanded.
19654       return SDValue();
19655
19656     SDValue InVec = Op->getOperand(0);
19657     SDLoc dl(Op);
19658     unsigned NumElts = SrcVT.getVectorNumElements();
19659     EVT SVT = SrcVT.getVectorElementType();
19660
19661     // Widen the vector in input in the case of MVT::v2i32.
19662     // Example: from MVT::v2i32 to MVT::v4i32.
19663     SmallVector<SDValue, 16> Elts;
19664     for (unsigned i = 0, e = NumElts; i != e; ++i)
19665       Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, InVec,
19666                                  DAG.getIntPtrConstant(i)));
19667
19668     // Explicitly mark the extra elements as Undef.
19669     SDValue Undef = DAG.getUNDEF(SVT);
19670     for (unsigned i = NumElts, e = NumElts * 2; i != e; ++i)
19671       Elts.push_back(Undef);
19672
19673     EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
19674     SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Elts);
19675     SDValue ToV2F64 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, BV);
19676     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64,
19677                        DAG.getIntPtrConstant(0));
19678   }
19679
19680   assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() &&
19681          Subtarget->hasMMX() && "Unexpected custom BITCAST");
19682   assert((DstVT == MVT::i64 ||
19683           (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
19684          "Unexpected custom BITCAST");
19685   // i64 <=> MMX conversions are Legal.
19686   if (SrcVT==MVT::i64 && DstVT.isVector())
19687     return Op;
19688   if (DstVT==MVT::i64 && SrcVT.isVector())
19689     return Op;
19690   // MMX <=> MMX conversions are Legal.
19691   if (SrcVT.isVector() && DstVT.isVector())
19692     return Op;
19693   // All other conversions need to be expanded.
19694   return SDValue();
19695 }
19696
19697 static SDValue LowerCTPOP(SDValue Op, const X86Subtarget *Subtarget,
19698                           SelectionDAG &DAG) {
19699   SDNode *Node = Op.getNode();
19700   SDLoc dl(Node);
19701
19702   Op = Op.getOperand(0);
19703   EVT VT = Op.getValueType();
19704   assert((VT.is128BitVector() || VT.is256BitVector()) &&
19705          "CTPOP lowering only implemented for 128/256-bit wide vector types");
19706
19707   unsigned NumElts = VT.getVectorNumElements();
19708   EVT EltVT = VT.getVectorElementType();
19709   unsigned Len = EltVT.getSizeInBits();
19710
19711   // This is the vectorized version of the "best" algorithm from
19712   // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
19713   // with a minor tweak to use a series of adds + shifts instead of vector
19714   // multiplications. Implemented for the v2i64, v4i64, v4i32, v8i32 types:
19715   //
19716   //  v2i64, v4i64, v4i32 => Only profitable w/ popcnt disabled
19717   //  v8i32 => Always profitable
19718   //
19719   // FIXME: There a couple of possible improvements:
19720   //
19721   // 1) Support for i8 and i16 vectors (needs measurements if popcnt enabled).
19722   // 2) Use strategies from http://wm.ite.pl/articles/sse-popcount.html
19723   //
19724   assert(EltVT.isInteger() && (Len == 32 || Len == 64) && Len % 8 == 0 &&
19725          "CTPOP not implemented for this vector element type.");
19726
19727   // X86 canonicalize ANDs to vXi64, generate the appropriate bitcasts to avoid
19728   // extra legalization.
19729   bool NeedsBitcast = EltVT == MVT::i32;
19730   MVT BitcastVT = VT.is256BitVector() ? MVT::v4i64 : MVT::v2i64;
19731
19732   SDValue Cst55 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x55)), EltVT);
19733   SDValue Cst33 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x33)), EltVT);
19734   SDValue Cst0F = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x0F)), EltVT);
19735
19736   // v = v - ((v >> 1) & 0x55555555...)
19737   SmallVector<SDValue, 8> Ones(NumElts, DAG.getConstant(1, EltVT));
19738   SDValue OnesV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ones);
19739   SDValue Srl = DAG.getNode(ISD::SRL, dl, VT, Op, OnesV);
19740   if (NeedsBitcast)
19741     Srl = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Srl);
19742
19743   SmallVector<SDValue, 8> Mask55(NumElts, Cst55);
19744   SDValue M55 = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask55);
19745   if (NeedsBitcast)
19746     M55 = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M55);
19747
19748   SDValue And = DAG.getNode(ISD::AND, dl, Srl.getValueType(), Srl, M55);
19749   if (VT != And.getValueType())
19750     And = DAG.getNode(ISD::BITCAST, dl, VT, And);
19751   SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, Op, And);
19752
19753   // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
19754   SmallVector<SDValue, 8> Mask33(NumElts, Cst33);
19755   SDValue M33 = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask33);
19756   SmallVector<SDValue, 8> Twos(NumElts, DAG.getConstant(2, EltVT));
19757   SDValue TwosV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Twos);
19758
19759   Srl = DAG.getNode(ISD::SRL, dl, VT, Sub, TwosV);
19760   if (NeedsBitcast) {
19761     Srl = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Srl);
19762     M33 = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M33);
19763     Sub = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Sub);
19764   }
19765
19766   SDValue AndRHS = DAG.getNode(ISD::AND, dl, M33.getValueType(), Srl, M33);
19767   SDValue AndLHS = DAG.getNode(ISD::AND, dl, M33.getValueType(), Sub, M33);
19768   if (VT != AndRHS.getValueType()) {
19769     AndRHS = DAG.getNode(ISD::BITCAST, dl, VT, AndRHS);
19770     AndLHS = DAG.getNode(ISD::BITCAST, dl, VT, AndLHS);
19771   }
19772   SDValue Add = DAG.getNode(ISD::ADD, dl, VT, AndLHS, AndRHS);
19773
19774   // v = (v + (v >> 4)) & 0x0F0F0F0F...
19775   SmallVector<SDValue, 8> Fours(NumElts, DAG.getConstant(4, EltVT));
19776   SDValue FoursV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Fours);
19777   Srl = DAG.getNode(ISD::SRL, dl, VT, Add, FoursV);
19778   Add = DAG.getNode(ISD::ADD, dl, VT, Add, Srl);
19779
19780   SmallVector<SDValue, 8> Mask0F(NumElts, Cst0F);
19781   SDValue M0F = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask0F);
19782   if (NeedsBitcast) {
19783     Add = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Add);
19784     M0F = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M0F);
19785   }
19786   And = DAG.getNode(ISD::AND, dl, M0F.getValueType(), Add, M0F);
19787   if (VT != And.getValueType())
19788     And = DAG.getNode(ISD::BITCAST, dl, VT, And);
19789
19790   // The algorithm mentioned above uses:
19791   //    v = (v * 0x01010101...) >> (Len - 8)
19792   //
19793   // Change it to use vector adds + vector shifts which yield faster results on
19794   // Haswell than using vector integer multiplication.
19795   //
19796   // For i32 elements:
19797   //    v = v + (v >> 8)
19798   //    v = v + (v >> 16)
19799   //
19800   // For i64 elements:
19801   //    v = v + (v >> 8)
19802   //    v = v + (v >> 16)
19803   //    v = v + (v >> 32)
19804   //
19805   Add = And;
19806   SmallVector<SDValue, 8> Csts;
19807   for (unsigned i = 8; i <= Len/2; i *= 2) {
19808     Csts.assign(NumElts, DAG.getConstant(i, EltVT));
19809     SDValue CstsV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Csts);
19810     Srl = DAG.getNode(ISD::SRL, dl, VT, Add, CstsV);
19811     Add = DAG.getNode(ISD::ADD, dl, VT, Add, Srl);
19812     Csts.clear();
19813   }
19814
19815   // The result is on the least significant 6-bits on i32 and 7-bits on i64.
19816   SDValue Cst3F = DAG.getConstant(APInt(Len, Len == 32 ? 0x3F : 0x7F), EltVT);
19817   SmallVector<SDValue, 8> Cst3FV(NumElts, Cst3F);
19818   SDValue M3F = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Cst3FV);
19819   if (NeedsBitcast) {
19820     Add = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Add);
19821     M3F = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M3F);
19822   }
19823   And = DAG.getNode(ISD::AND, dl, M3F.getValueType(), Add, M3F);
19824   if (VT != And.getValueType())
19825     And = DAG.getNode(ISD::BITCAST, dl, VT, And);
19826
19827   return And;
19828 }
19829
19830 static SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) {
19831   SDNode *Node = Op.getNode();
19832   SDLoc dl(Node);
19833   EVT T = Node->getValueType(0);
19834   SDValue negOp = DAG.getNode(ISD::SUB, dl, T,
19835                               DAG.getConstant(0, T), Node->getOperand(2));
19836   return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl,
19837                        cast<AtomicSDNode>(Node)->getMemoryVT(),
19838                        Node->getOperand(0),
19839                        Node->getOperand(1), negOp,
19840                        cast<AtomicSDNode>(Node)->getMemOperand(),
19841                        cast<AtomicSDNode>(Node)->getOrdering(),
19842                        cast<AtomicSDNode>(Node)->getSynchScope());
19843 }
19844
19845 static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
19846   SDNode *Node = Op.getNode();
19847   SDLoc dl(Node);
19848   EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
19849
19850   // Convert seq_cst store -> xchg
19851   // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
19852   // FIXME: On 32-bit, store -> fist or movq would be more efficient
19853   //        (The only way to get a 16-byte store is cmpxchg16b)
19854   // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
19855   if (cast<AtomicSDNode>(Node)->getOrdering() == SequentiallyConsistent ||
19856       !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
19857     SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
19858                                  cast<AtomicSDNode>(Node)->getMemoryVT(),
19859                                  Node->getOperand(0),
19860                                  Node->getOperand(1), Node->getOperand(2),
19861                                  cast<AtomicSDNode>(Node)->getMemOperand(),
19862                                  cast<AtomicSDNode>(Node)->getOrdering(),
19863                                  cast<AtomicSDNode>(Node)->getSynchScope());
19864     return Swap.getValue(1);
19865   }
19866   // Other atomic stores have a simple pattern.
19867   return Op;
19868 }
19869
19870 static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
19871   EVT VT = Op.getNode()->getSimpleValueType(0);
19872
19873   // Let legalize expand this if it isn't a legal type yet.
19874   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
19875     return SDValue();
19876
19877   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
19878
19879   unsigned Opc;
19880   bool ExtraOp = false;
19881   switch (Op.getOpcode()) {
19882   default: llvm_unreachable("Invalid code");
19883   case ISD::ADDC: Opc = X86ISD::ADD; break;
19884   case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break;
19885   case ISD::SUBC: Opc = X86ISD::SUB; break;
19886   case ISD::SUBE: Opc = X86ISD::SBB; ExtraOp = true; break;
19887   }
19888
19889   if (!ExtraOp)
19890     return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
19891                        Op.getOperand(1));
19892   return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
19893                      Op.getOperand(1), Op.getOperand(2));
19894 }
19895
19896 static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget *Subtarget,
19897                             SelectionDAG &DAG) {
19898   assert(Subtarget->isTargetDarwin() && Subtarget->is64Bit());
19899
19900   // For MacOSX, we want to call an alternative entry point: __sincos_stret,
19901   // which returns the values as { float, float } (in XMM0) or
19902   // { double, double } (which is returned in XMM0, XMM1).
19903   SDLoc dl(Op);
19904   SDValue Arg = Op.getOperand(0);
19905   EVT ArgVT = Arg.getValueType();
19906   Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
19907
19908   TargetLowering::ArgListTy Args;
19909   TargetLowering::ArgListEntry Entry;
19910
19911   Entry.Node = Arg;
19912   Entry.Ty = ArgTy;
19913   Entry.isSExt = false;
19914   Entry.isZExt = false;
19915   Args.push_back(Entry);
19916
19917   bool isF64 = ArgVT == MVT::f64;
19918   // Only optimize x86_64 for now. i386 is a bit messy. For f32,
19919   // the small struct {f32, f32} is returned in (eax, edx). For f64,
19920   // the results are returned via SRet in memory.
19921   const char *LibcallName =  isF64 ? "__sincos_stret" : "__sincosf_stret";
19922   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19923   SDValue Callee = DAG.getExternalSymbol(LibcallName, TLI.getPointerTy());
19924
19925   Type *RetTy = isF64
19926     ? (Type*)StructType::get(ArgTy, ArgTy, nullptr)
19927     : (Type*)VectorType::get(ArgTy, 4);
19928
19929   TargetLowering::CallLoweringInfo CLI(DAG);
19930   CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
19931     .setCallee(CallingConv::C, RetTy, Callee, std::move(Args), 0);
19932
19933   std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
19934
19935   if (isF64)
19936     // Returned in xmm0 and xmm1.
19937     return CallResult.first;
19938
19939   // Returned in bits 0:31 and 32:64 xmm0.
19940   SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
19941                                CallResult.first, DAG.getIntPtrConstant(0));
19942   SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
19943                                CallResult.first, DAG.getIntPtrConstant(1));
19944   SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
19945   return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
19946 }
19947
19948 /// LowerOperation - Provide custom lowering hooks for some operations.
19949 ///
19950 SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
19951   switch (Op.getOpcode()) {
19952   default: llvm_unreachable("Should not custom lower this!");
19953   case ISD::SIGN_EXTEND_INREG:  return LowerSIGN_EXTEND_INREG(Op,DAG);
19954   case ISD::ATOMIC_FENCE:       return LowerATOMIC_FENCE(Op, Subtarget, DAG);
19955   case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
19956     return LowerCMP_SWAP(Op, Subtarget, DAG);
19957   case ISD::CTPOP:              return LowerCTPOP(Op, Subtarget, DAG);
19958   case ISD::ATOMIC_LOAD_SUB:    return LowerLOAD_SUB(Op,DAG);
19959   case ISD::ATOMIC_STORE:       return LowerATOMIC_STORE(Op,DAG);
19960   case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
19961   case ISD::CONCAT_VECTORS:     return LowerCONCAT_VECTORS(Op, DAG);
19962   case ISD::VECTOR_SHUFFLE:     return LowerVECTOR_SHUFFLE(Op, DAG);
19963   case ISD::VSELECT:            return LowerVSELECT(Op, DAG);
19964   case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
19965   case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
19966   case ISD::EXTRACT_SUBVECTOR:  return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
19967   case ISD::INSERT_SUBVECTOR:   return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
19968   case ISD::SCALAR_TO_VECTOR:   return LowerSCALAR_TO_VECTOR(Op, DAG);
19969   case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
19970   case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
19971   case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
19972   case ISD::ExternalSymbol:     return LowerExternalSymbol(Op, DAG);
19973   case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
19974   case ISD::SHL_PARTS:
19975   case ISD::SRA_PARTS:
19976   case ISD::SRL_PARTS:          return LowerShiftParts(Op, DAG);
19977   case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG);
19978   case ISD::UINT_TO_FP:         return LowerUINT_TO_FP(Op, DAG);
19979   case ISD::TRUNCATE:           return LowerTRUNCATE(Op, DAG);
19980   case ISD::ZERO_EXTEND:        return LowerZERO_EXTEND(Op, Subtarget, DAG);
19981   case ISD::SIGN_EXTEND:        return LowerSIGN_EXTEND(Op, Subtarget, DAG);
19982   case ISD::ANY_EXTEND:         return LowerANY_EXTEND(Op, Subtarget, DAG);
19983   case ISD::FP_TO_SINT:         return LowerFP_TO_SINT(Op, DAG);
19984   case ISD::FP_TO_UINT:         return LowerFP_TO_UINT(Op, DAG);
19985   case ISD::FP_EXTEND:          return LowerFP_EXTEND(Op, DAG);
19986   case ISD::LOAD:               return LowerExtendedLoad(Op, Subtarget, DAG);
19987   case ISD::FABS:
19988   case ISD::FNEG:               return LowerFABSorFNEG(Op, DAG);
19989   case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
19990   case ISD::FGETSIGN:           return LowerFGETSIGN(Op, DAG);
19991   case ISD::SETCC:              return LowerSETCC(Op, DAG);
19992   case ISD::SELECT:             return LowerSELECT(Op, DAG);
19993   case ISD::BRCOND:             return LowerBRCOND(Op, DAG);
19994   case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
19995   case ISD::VASTART:            return LowerVASTART(Op, DAG);
19996   case ISD::VAARG:              return LowerVAARG(Op, DAG);
19997   case ISD::VACOPY:             return LowerVACOPY(Op, Subtarget, DAG);
19998   case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, Subtarget, DAG);
19999   case ISD::INTRINSIC_VOID:
20000   case ISD::INTRINSIC_W_CHAIN:  return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
20001   case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
20002   case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
20003   case ISD::FRAME_TO_ARGS_OFFSET:
20004                                 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
20005   case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
20006   case ISD::EH_RETURN:          return LowerEH_RETURN(Op, DAG);
20007   case ISD::EH_SJLJ_SETJMP:     return lowerEH_SJLJ_SETJMP(Op, DAG);
20008   case ISD::EH_SJLJ_LONGJMP:    return lowerEH_SJLJ_LONGJMP(Op, DAG);
20009   case ISD::INIT_TRAMPOLINE:    return LowerINIT_TRAMPOLINE(Op, DAG);
20010   case ISD::ADJUST_TRAMPOLINE:  return LowerADJUST_TRAMPOLINE(Op, DAG);
20011   case ISD::FLT_ROUNDS_:        return LowerFLT_ROUNDS_(Op, DAG);
20012   case ISD::CTLZ:               return LowerCTLZ(Op, DAG);
20013   case ISD::CTLZ_ZERO_UNDEF:    return LowerCTLZ_ZERO_UNDEF(Op, DAG);
20014   case ISD::CTTZ:               return LowerCTTZ(Op, DAG);
20015   case ISD::MUL:                return LowerMUL(Op, Subtarget, DAG);
20016   case ISD::UMUL_LOHI:
20017   case ISD::SMUL_LOHI:          return LowerMUL_LOHI(Op, Subtarget, DAG);
20018   case ISD::SRA:
20019   case ISD::SRL:
20020   case ISD::SHL:                return LowerShift(Op, Subtarget, DAG);
20021   case ISD::SADDO:
20022   case ISD::UADDO:
20023   case ISD::SSUBO:
20024   case ISD::USUBO:
20025   case ISD::SMULO:
20026   case ISD::UMULO:              return LowerXALUO(Op, DAG);
20027   case ISD::READCYCLECOUNTER:   return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
20028   case ISD::BITCAST:            return LowerBITCAST(Op, Subtarget, DAG);
20029   case ISD::ADDC:
20030   case ISD::ADDE:
20031   case ISD::SUBC:
20032   case ISD::SUBE:               return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
20033   case ISD::ADD:                return LowerADD(Op, DAG);
20034   case ISD::SUB:                return LowerSUB(Op, DAG);
20035   case ISD::FSINCOS:            return LowerFSINCOS(Op, Subtarget, DAG);
20036   }
20037 }
20038
20039 /// ReplaceNodeResults - Replace a node with an illegal result type
20040 /// with a new node built out of custom code.
20041 void X86TargetLowering::ReplaceNodeResults(SDNode *N,
20042                                            SmallVectorImpl<SDValue>&Results,
20043                                            SelectionDAG &DAG) const {
20044   SDLoc dl(N);
20045   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
20046   switch (N->getOpcode()) {
20047   default:
20048     llvm_unreachable("Do not know how to custom type legalize this operation!");
20049   // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
20050   case X86ISD::FMINC:
20051   case X86ISD::FMIN:
20052   case X86ISD::FMAXC:
20053   case X86ISD::FMAX: {
20054     EVT VT = N->getValueType(0);
20055     if (VT != MVT::v2f32)
20056       llvm_unreachable("Unexpected type (!= v2f32) on FMIN/FMAX.");
20057     SDValue UNDEF = DAG.getUNDEF(VT);
20058     SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
20059                               N->getOperand(0), UNDEF);
20060     SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
20061                               N->getOperand(1), UNDEF);
20062     Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
20063     return;
20064   }
20065   case ISD::SIGN_EXTEND_INREG:
20066   case ISD::ADDC:
20067   case ISD::ADDE:
20068   case ISD::SUBC:
20069   case ISD::SUBE:
20070     // We don't want to expand or promote these.
20071     return;
20072   case ISD::SDIV:
20073   case ISD::UDIV:
20074   case ISD::SREM:
20075   case ISD::UREM:
20076   case ISD::SDIVREM:
20077   case ISD::UDIVREM: {
20078     SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
20079     Results.push_back(V);
20080     return;
20081   }
20082   case ISD::FP_TO_SINT:
20083   case ISD::FP_TO_UINT: {
20084     bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
20085
20086     if (!IsSigned && !isIntegerTypeFTOL(SDValue(N, 0).getValueType()))
20087       return;
20088
20089     std::pair<SDValue,SDValue> Vals =
20090         FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
20091     SDValue FIST = Vals.first, StackSlot = Vals.second;
20092     if (FIST.getNode()) {
20093       EVT VT = N->getValueType(0);
20094       // Return a load from the stack slot.
20095       if (StackSlot.getNode())
20096         Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot,
20097                                       MachinePointerInfo(),
20098                                       false, false, false, 0));
20099       else
20100         Results.push_back(FIST);
20101     }
20102     return;
20103   }
20104   case ISD::UINT_TO_FP: {
20105     assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
20106     if (N->getOperand(0).getValueType() != MVT::v2i32 ||
20107         N->getValueType(0) != MVT::v2f32)
20108       return;
20109     SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64,
20110                                  N->getOperand(0));
20111     SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
20112                                      MVT::f64);
20113     SDValue VBias = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2f64, Bias, Bias);
20114     SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
20115                              DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, VBias));
20116     Or = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or);
20117     SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
20118     Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
20119     return;
20120   }
20121   case ISD::FP_ROUND: {
20122     if (!TLI.isTypeLegal(N->getOperand(0).getValueType()))
20123         return;
20124     SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
20125     Results.push_back(V);
20126     return;
20127   }
20128   case ISD::INTRINSIC_W_CHAIN: {
20129     unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
20130     switch (IntNo) {
20131     default : llvm_unreachable("Do not know how to custom type "
20132                                "legalize this intrinsic operation!");
20133     case Intrinsic::x86_rdtsc:
20134       return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
20135                                      Results);
20136     case Intrinsic::x86_rdtscp:
20137       return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget,
20138                                      Results);
20139     case Intrinsic::x86_rdpmc:
20140       return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results);
20141     }
20142   }
20143   case ISD::READCYCLECOUNTER: {
20144     return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
20145                                    Results);
20146   }
20147   case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
20148     EVT T = N->getValueType(0);
20149     assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
20150     bool Regs64bit = T == MVT::i128;
20151     EVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
20152     SDValue cpInL, cpInH;
20153     cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
20154                         DAG.getConstant(0, HalfT));
20155     cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
20156                         DAG.getConstant(1, HalfT));
20157     cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
20158                              Regs64bit ? X86::RAX : X86::EAX,
20159                              cpInL, SDValue());
20160     cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
20161                              Regs64bit ? X86::RDX : X86::EDX,
20162                              cpInH, cpInL.getValue(1));
20163     SDValue swapInL, swapInH;
20164     swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
20165                           DAG.getConstant(0, HalfT));
20166     swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
20167                           DAG.getConstant(1, HalfT));
20168     swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl,
20169                                Regs64bit ? X86::RBX : X86::EBX,
20170                                swapInL, cpInH.getValue(1));
20171     swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl,
20172                                Regs64bit ? X86::RCX : X86::ECX,
20173                                swapInH, swapInL.getValue(1));
20174     SDValue Ops[] = { swapInH.getValue(0),
20175                       N->getOperand(1),
20176                       swapInH.getValue(1) };
20177     SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
20178     MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
20179     unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_DAG :
20180                                   X86ISD::LCMPXCHG8_DAG;
20181     SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
20182     SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
20183                                         Regs64bit ? X86::RAX : X86::EAX,
20184                                         HalfT, Result.getValue(1));
20185     SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
20186                                         Regs64bit ? X86::RDX : X86::EDX,
20187                                         HalfT, cpOutL.getValue(2));
20188     SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
20189
20190     SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
20191                                         MVT::i32, cpOutH.getValue(2));
20192     SDValue Success =
20193         DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
20194                     DAG.getConstant(X86::COND_E, MVT::i8), EFLAGS);
20195     Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
20196
20197     Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
20198     Results.push_back(Success);
20199     Results.push_back(EFLAGS.getValue(1));
20200     return;
20201   }
20202   case ISD::ATOMIC_SWAP:
20203   case ISD::ATOMIC_LOAD_ADD:
20204   case ISD::ATOMIC_LOAD_SUB:
20205   case ISD::ATOMIC_LOAD_AND:
20206   case ISD::ATOMIC_LOAD_OR:
20207   case ISD::ATOMIC_LOAD_XOR:
20208   case ISD::ATOMIC_LOAD_NAND:
20209   case ISD::ATOMIC_LOAD_MIN:
20210   case ISD::ATOMIC_LOAD_MAX:
20211   case ISD::ATOMIC_LOAD_UMIN:
20212   case ISD::ATOMIC_LOAD_UMAX:
20213   case ISD::ATOMIC_LOAD: {
20214     // Delegate to generic TypeLegalization. Situations we can really handle
20215     // should have already been dealt with by AtomicExpandPass.cpp.
20216     break;
20217   }
20218   case ISD::BITCAST: {
20219     assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
20220     EVT DstVT = N->getValueType(0);
20221     EVT SrcVT = N->getOperand(0)->getValueType(0);
20222
20223     if (SrcVT != MVT::f64 ||
20224         (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8))
20225       return;
20226
20227     unsigned NumElts = DstVT.getVectorNumElements();
20228     EVT SVT = DstVT.getVectorElementType();
20229     EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
20230     SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
20231                                    MVT::v2f64, N->getOperand(0));
20232     SDValue ToVecInt = DAG.getNode(ISD::BITCAST, dl, WiderVT, Expanded);
20233
20234     if (ExperimentalVectorWideningLegalization) {
20235       // If we are legalizing vectors by widening, we already have the desired
20236       // legal vector type, just return it.
20237       Results.push_back(ToVecInt);
20238       return;
20239     }
20240
20241     SmallVector<SDValue, 8> Elts;
20242     for (unsigned i = 0, e = NumElts; i != e; ++i)
20243       Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT,
20244                                    ToVecInt, DAG.getIntPtrConstant(i)));
20245
20246     Results.push_back(DAG.getNode(ISD::BUILD_VECTOR, dl, DstVT, Elts));
20247   }
20248   }
20249 }
20250
20251 const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
20252   switch (Opcode) {
20253   default: return nullptr;
20254   case X86ISD::BSF:                return "X86ISD::BSF";
20255   case X86ISD::BSR:                return "X86ISD::BSR";
20256   case X86ISD::SHLD:               return "X86ISD::SHLD";
20257   case X86ISD::SHRD:               return "X86ISD::SHRD";
20258   case X86ISD::FAND:               return "X86ISD::FAND";
20259   case X86ISD::FANDN:              return "X86ISD::FANDN";
20260   case X86ISD::FOR:                return "X86ISD::FOR";
20261   case X86ISD::FXOR:               return "X86ISD::FXOR";
20262   case X86ISD::FSRL:               return "X86ISD::FSRL";
20263   case X86ISD::FILD:               return "X86ISD::FILD";
20264   case X86ISD::FILD_FLAG:          return "X86ISD::FILD_FLAG";
20265   case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
20266   case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
20267   case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
20268   case X86ISD::FLD:                return "X86ISD::FLD";
20269   case X86ISD::FST:                return "X86ISD::FST";
20270   case X86ISD::CALL:               return "X86ISD::CALL";
20271   case X86ISD::RDTSC_DAG:          return "X86ISD::RDTSC_DAG";
20272   case X86ISD::RDTSCP_DAG:         return "X86ISD::RDTSCP_DAG";
20273   case X86ISD::RDPMC_DAG:          return "X86ISD::RDPMC_DAG";
20274   case X86ISD::BT:                 return "X86ISD::BT";
20275   case X86ISD::CMP:                return "X86ISD::CMP";
20276   case X86ISD::COMI:               return "X86ISD::COMI";
20277   case X86ISD::UCOMI:              return "X86ISD::UCOMI";
20278   case X86ISD::CMPM:               return "X86ISD::CMPM";
20279   case X86ISD::CMPMU:              return "X86ISD::CMPMU";
20280   case X86ISD::SETCC:              return "X86ISD::SETCC";
20281   case X86ISD::SETCC_CARRY:        return "X86ISD::SETCC_CARRY";
20282   case X86ISD::FSETCC:             return "X86ISD::FSETCC";
20283   case X86ISD::CMOV:               return "X86ISD::CMOV";
20284   case X86ISD::BRCOND:             return "X86ISD::BRCOND";
20285   case X86ISD::RET_FLAG:           return "X86ISD::RET_FLAG";
20286   case X86ISD::REP_STOS:           return "X86ISD::REP_STOS";
20287   case X86ISD::REP_MOVS:           return "X86ISD::REP_MOVS";
20288   case X86ISD::GlobalBaseReg:      return "X86ISD::GlobalBaseReg";
20289   case X86ISD::Wrapper:            return "X86ISD::Wrapper";
20290   case X86ISD::WrapperRIP:         return "X86ISD::WrapperRIP";
20291   case X86ISD::PEXTRB:             return "X86ISD::PEXTRB";
20292   case X86ISD::PEXTRW:             return "X86ISD::PEXTRW";
20293   case X86ISD::INSERTPS:           return "X86ISD::INSERTPS";
20294   case X86ISD::PINSRB:             return "X86ISD::PINSRB";
20295   case X86ISD::PINSRW:             return "X86ISD::PINSRW";
20296   case X86ISD::PSHUFB:             return "X86ISD::PSHUFB";
20297   case X86ISD::ANDNP:              return "X86ISD::ANDNP";
20298   case X86ISD::PSIGN:              return "X86ISD::PSIGN";
20299   case X86ISD::BLENDI:             return "X86ISD::BLENDI";
20300   case X86ISD::SHRUNKBLEND:        return "X86ISD::SHRUNKBLEND";
20301   case X86ISD::SUBUS:              return "X86ISD::SUBUS";
20302   case X86ISD::HADD:               return "X86ISD::HADD";
20303   case X86ISD::HSUB:               return "X86ISD::HSUB";
20304   case X86ISD::FHADD:              return "X86ISD::FHADD";
20305   case X86ISD::FHSUB:              return "X86ISD::FHSUB";
20306   case X86ISD::UMAX:               return "X86ISD::UMAX";
20307   case X86ISD::UMIN:               return "X86ISD::UMIN";
20308   case X86ISD::SMAX:               return "X86ISD::SMAX";
20309   case X86ISD::SMIN:               return "X86ISD::SMIN";
20310   case X86ISD::FMAX:               return "X86ISD::FMAX";
20311   case X86ISD::FMIN:               return "X86ISD::FMIN";
20312   case X86ISD::FMAXC:              return "X86ISD::FMAXC";
20313   case X86ISD::FMINC:              return "X86ISD::FMINC";
20314   case X86ISD::FRSQRT:             return "X86ISD::FRSQRT";
20315   case X86ISD::FRCP:               return "X86ISD::FRCP";
20316   case X86ISD::TLSADDR:            return "X86ISD::TLSADDR";
20317   case X86ISD::TLSBASEADDR:        return "X86ISD::TLSBASEADDR";
20318   case X86ISD::TLSCALL:            return "X86ISD::TLSCALL";
20319   case X86ISD::EH_SJLJ_SETJMP:     return "X86ISD::EH_SJLJ_SETJMP";
20320   case X86ISD::EH_SJLJ_LONGJMP:    return "X86ISD::EH_SJLJ_LONGJMP";
20321   case X86ISD::EH_RETURN:          return "X86ISD::EH_RETURN";
20322   case X86ISD::TC_RETURN:          return "X86ISD::TC_RETURN";
20323   case X86ISD::FNSTCW16m:          return "X86ISD::FNSTCW16m";
20324   case X86ISD::FNSTSW16r:          return "X86ISD::FNSTSW16r";
20325   case X86ISD::LCMPXCHG_DAG:       return "X86ISD::LCMPXCHG_DAG";
20326   case X86ISD::LCMPXCHG8_DAG:      return "X86ISD::LCMPXCHG8_DAG";
20327   case X86ISD::LCMPXCHG16_DAG:     return "X86ISD::LCMPXCHG16_DAG";
20328   case X86ISD::VZEXT_MOVL:         return "X86ISD::VZEXT_MOVL";
20329   case X86ISD::VZEXT_LOAD:         return "X86ISD::VZEXT_LOAD";
20330   case X86ISD::VZEXT:              return "X86ISD::VZEXT";
20331   case X86ISD::VSEXT:              return "X86ISD::VSEXT";
20332   case X86ISD::VTRUNC:             return "X86ISD::VTRUNC";
20333   case X86ISD::VTRUNCM:            return "X86ISD::VTRUNCM";
20334   case X86ISD::VINSERT:            return "X86ISD::VINSERT";
20335   case X86ISD::VFPEXT:             return "X86ISD::VFPEXT";
20336   case X86ISD::VFPROUND:           return "X86ISD::VFPROUND";
20337   case X86ISD::VSHLDQ:             return "X86ISD::VSHLDQ";
20338   case X86ISD::VSRLDQ:             return "X86ISD::VSRLDQ";
20339   case X86ISD::VSHL:               return "X86ISD::VSHL";
20340   case X86ISD::VSRL:               return "X86ISD::VSRL";
20341   case X86ISD::VSRA:               return "X86ISD::VSRA";
20342   case X86ISD::VSHLI:              return "X86ISD::VSHLI";
20343   case X86ISD::VSRLI:              return "X86ISD::VSRLI";
20344   case X86ISD::VSRAI:              return "X86ISD::VSRAI";
20345   case X86ISD::CMPP:               return "X86ISD::CMPP";
20346   case X86ISD::PCMPEQ:             return "X86ISD::PCMPEQ";
20347   case X86ISD::PCMPGT:             return "X86ISD::PCMPGT";
20348   case X86ISD::PCMPEQM:            return "X86ISD::PCMPEQM";
20349   case X86ISD::PCMPGTM:            return "X86ISD::PCMPGTM";
20350   case X86ISD::ADD:                return "X86ISD::ADD";
20351   case X86ISD::SUB:                return "X86ISD::SUB";
20352   case X86ISD::ADC:                return "X86ISD::ADC";
20353   case X86ISD::SBB:                return "X86ISD::SBB";
20354   case X86ISD::SMUL:               return "X86ISD::SMUL";
20355   case X86ISD::UMUL:               return "X86ISD::UMUL";
20356   case X86ISD::SMUL8:              return "X86ISD::SMUL8";
20357   case X86ISD::UMUL8:              return "X86ISD::UMUL8";
20358   case X86ISD::SDIVREM8_SEXT_HREG: return "X86ISD::SDIVREM8_SEXT_HREG";
20359   case X86ISD::UDIVREM8_ZEXT_HREG: return "X86ISD::UDIVREM8_ZEXT_HREG";
20360   case X86ISD::INC:                return "X86ISD::INC";
20361   case X86ISD::DEC:                return "X86ISD::DEC";
20362   case X86ISD::OR:                 return "X86ISD::OR";
20363   case X86ISD::XOR:                return "X86ISD::XOR";
20364   case X86ISD::AND:                return "X86ISD::AND";
20365   case X86ISD::BEXTR:              return "X86ISD::BEXTR";
20366   case X86ISD::MUL_IMM:            return "X86ISD::MUL_IMM";
20367   case X86ISD::PTEST:              return "X86ISD::PTEST";
20368   case X86ISD::TESTP:              return "X86ISD::TESTP";
20369   case X86ISD::TESTM:              return "X86ISD::TESTM";
20370   case X86ISD::TESTNM:             return "X86ISD::TESTNM";
20371   case X86ISD::KORTEST:            return "X86ISD::KORTEST";
20372   case X86ISD::PACKSS:             return "X86ISD::PACKSS";
20373   case X86ISD::PACKUS:             return "X86ISD::PACKUS";
20374   case X86ISD::PALIGNR:            return "X86ISD::PALIGNR";
20375   case X86ISD::VALIGN:             return "X86ISD::VALIGN";
20376   case X86ISD::PSHUFD:             return "X86ISD::PSHUFD";
20377   case X86ISD::PSHUFHW:            return "X86ISD::PSHUFHW";
20378   case X86ISD::PSHUFLW:            return "X86ISD::PSHUFLW";
20379   case X86ISD::SHUFP:              return "X86ISD::SHUFP";
20380   case X86ISD::MOVLHPS:            return "X86ISD::MOVLHPS";
20381   case X86ISD::MOVLHPD:            return "X86ISD::MOVLHPD";
20382   case X86ISD::MOVHLPS:            return "X86ISD::MOVHLPS";
20383   case X86ISD::MOVLPS:             return "X86ISD::MOVLPS";
20384   case X86ISD::MOVLPD:             return "X86ISD::MOVLPD";
20385   case X86ISD::MOVDDUP:            return "X86ISD::MOVDDUP";
20386   case X86ISD::MOVSHDUP:           return "X86ISD::MOVSHDUP";
20387   case X86ISD::MOVSLDUP:           return "X86ISD::MOVSLDUP";
20388   case X86ISD::MOVSD:              return "X86ISD::MOVSD";
20389   case X86ISD::MOVSS:              return "X86ISD::MOVSS";
20390   case X86ISD::UNPCKL:             return "X86ISD::UNPCKL";
20391   case X86ISD::UNPCKH:             return "X86ISD::UNPCKH";
20392   case X86ISD::VBROADCAST:         return "X86ISD::VBROADCAST";
20393   case X86ISD::VBROADCASTM:        return "X86ISD::VBROADCASTM";
20394   case X86ISD::VEXTRACT:           return "X86ISD::VEXTRACT";
20395   case X86ISD::VPERMILPI:          return "X86ISD::VPERMILPI";
20396   case X86ISD::VPERM2X128:         return "X86ISD::VPERM2X128";
20397   case X86ISD::VPERMV:             return "X86ISD::VPERMV";
20398   case X86ISD::VPERMV3:            return "X86ISD::VPERMV3";
20399   case X86ISD::VPERMIV3:           return "X86ISD::VPERMIV3";
20400   case X86ISD::VPERMI:             return "X86ISD::VPERMI";
20401   case X86ISD::PMULUDQ:            return "X86ISD::PMULUDQ";
20402   case X86ISD::PMULDQ:             return "X86ISD::PMULDQ";
20403   case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
20404   case X86ISD::VAARG_64:           return "X86ISD::VAARG_64";
20405   case X86ISD::WIN_ALLOCA:         return "X86ISD::WIN_ALLOCA";
20406   case X86ISD::MEMBARRIER:         return "X86ISD::MEMBARRIER";
20407   case X86ISD::SEG_ALLOCA:         return "X86ISD::SEG_ALLOCA";
20408   case X86ISD::WIN_FTOL:           return "X86ISD::WIN_FTOL";
20409   case X86ISD::SAHF:               return "X86ISD::SAHF";
20410   case X86ISD::RDRAND:             return "X86ISD::RDRAND";
20411   case X86ISD::RDSEED:             return "X86ISD::RDSEED";
20412   case X86ISD::FMADD:              return "X86ISD::FMADD";
20413   case X86ISD::FMSUB:              return "X86ISD::FMSUB";
20414   case X86ISD::FNMADD:             return "X86ISD::FNMADD";
20415   case X86ISD::FNMSUB:             return "X86ISD::FNMSUB";
20416   case X86ISD::FMADDSUB:           return "X86ISD::FMADDSUB";
20417   case X86ISD::FMSUBADD:           return "X86ISD::FMSUBADD";
20418   case X86ISD::PCMPESTRI:          return "X86ISD::PCMPESTRI";
20419   case X86ISD::PCMPISTRI:          return "X86ISD::PCMPISTRI";
20420   case X86ISD::XTEST:              return "X86ISD::XTEST";
20421   case X86ISD::COMPRESS:           return "X86ISD::COMPRESS";
20422   case X86ISD::EXPAND:             return "X86ISD::EXPAND";
20423   case X86ISD::SELECT:             return "X86ISD::SELECT";
20424   case X86ISD::ADDSUB:             return "X86ISD::ADDSUB";
20425   case X86ISD::RCP28:              return "X86ISD::RCP28";
20426   case X86ISD::RSQRT28:            return "X86ISD::RSQRT28";
20427   }
20428 }
20429
20430 // isLegalAddressingMode - Return true if the addressing mode represented
20431 // by AM is legal for this target, for a load/store of the specified type.
20432 bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM,
20433                                               Type *Ty) const {
20434   // X86 supports extremely general addressing modes.
20435   CodeModel::Model M = getTargetMachine().getCodeModel();
20436   Reloc::Model R = getTargetMachine().getRelocationModel();
20437
20438   // X86 allows a sign-extended 32-bit immediate field as a displacement.
20439   if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
20440     return false;
20441
20442   if (AM.BaseGV) {
20443     unsigned GVFlags =
20444       Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine());
20445
20446     // If a reference to this global requires an extra load, we can't fold it.
20447     if (isGlobalStubReference(GVFlags))
20448       return false;
20449
20450     // If BaseGV requires a register for the PIC base, we cannot also have a
20451     // BaseReg specified.
20452     if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
20453       return false;
20454
20455     // If lower 4G is not available, then we must use rip-relative addressing.
20456     if ((M != CodeModel::Small || R != Reloc::Static) &&
20457         Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1))
20458       return false;
20459   }
20460
20461   switch (AM.Scale) {
20462   case 0:
20463   case 1:
20464   case 2:
20465   case 4:
20466   case 8:
20467     // These scales always work.
20468     break;
20469   case 3:
20470   case 5:
20471   case 9:
20472     // These scales are formed with basereg+scalereg.  Only accept if there is
20473     // no basereg yet.
20474     if (AM.HasBaseReg)
20475       return false;
20476     break;
20477   default:  // Other stuff never works.
20478     return false;
20479   }
20480
20481   return true;
20482 }
20483
20484 bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
20485   unsigned Bits = Ty->getScalarSizeInBits();
20486
20487   // 8-bit shifts are always expensive, but versions with a scalar amount aren't
20488   // particularly cheaper than those without.
20489   if (Bits == 8)
20490     return false;
20491
20492   // On AVX2 there are new vpsllv[dq] instructions (and other shifts), that make
20493   // variable shifts just as cheap as scalar ones.
20494   if (Subtarget->hasInt256() && (Bits == 32 || Bits == 64))
20495     return false;
20496
20497   // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
20498   // fully general vector.
20499   return true;
20500 }
20501
20502 bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
20503   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
20504     return false;
20505   unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
20506   unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
20507   return NumBits1 > NumBits2;
20508 }
20509
20510 bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
20511   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
20512     return false;
20513
20514   if (!isTypeLegal(EVT::getEVT(Ty1)))
20515     return false;
20516
20517   assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
20518
20519   // Assuming the caller doesn't have a zeroext or signext return parameter,
20520   // truncation all the way down to i1 is valid.
20521   return true;
20522 }
20523
20524 bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
20525   return isInt<32>(Imm);
20526 }
20527
20528 bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
20529   // Can also use sub to handle negated immediates.
20530   return isInt<32>(Imm);
20531 }
20532
20533 bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
20534   if (!VT1.isInteger() || !VT2.isInteger())
20535     return false;
20536   unsigned NumBits1 = VT1.getSizeInBits();
20537   unsigned NumBits2 = VT2.getSizeInBits();
20538   return NumBits1 > NumBits2;
20539 }
20540
20541 bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
20542   // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
20543   return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit();
20544 }
20545
20546 bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
20547   // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
20548   return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit();
20549 }
20550
20551 bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
20552   EVT VT1 = Val.getValueType();
20553   if (isZExtFree(VT1, VT2))
20554     return true;
20555
20556   if (Val.getOpcode() != ISD::LOAD)
20557     return false;
20558
20559   if (!VT1.isSimple() || !VT1.isInteger() ||
20560       !VT2.isSimple() || !VT2.isInteger())
20561     return false;
20562
20563   switch (VT1.getSimpleVT().SimpleTy) {
20564   default: break;
20565   case MVT::i8:
20566   case MVT::i16:
20567   case MVT::i32:
20568     // X86 has 8, 16, and 32-bit zero-extending loads.
20569     return true;
20570   }
20571
20572   return false;
20573 }
20574
20575 bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const { return true; }
20576
20577 bool
20578 X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
20579   if (!(Subtarget->hasFMA() || Subtarget->hasFMA4()))
20580     return false;
20581
20582   VT = VT.getScalarType();
20583
20584   if (!VT.isSimple())
20585     return false;
20586
20587   switch (VT.getSimpleVT().SimpleTy) {
20588   case MVT::f32:
20589   case MVT::f64:
20590     return true;
20591   default:
20592     break;
20593   }
20594
20595   return false;
20596 }
20597
20598 bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
20599   // i16 instructions are longer (0x66 prefix) and potentially slower.
20600   return !(VT1 == MVT::i32 && VT2 == MVT::i16);
20601 }
20602
20603 /// isShuffleMaskLegal - Targets can use this to indicate that they only
20604 /// support *some* VECTOR_SHUFFLE operations, those with specific masks.
20605 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
20606 /// are assumed to be legal.
20607 bool
20608 X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
20609                                       EVT VT) const {
20610   if (!VT.isSimple())
20611     return false;
20612
20613   MVT SVT = VT.getSimpleVT();
20614
20615   // Very little shuffling can be done for 64-bit vectors right now.
20616   if (VT.getSizeInBits() == 64)
20617     return false;
20618
20619   // This is an experimental legality test that is tailored to match the
20620   // legality test of the experimental lowering more closely. They are gated
20621   // separately to ease testing of performance differences.
20622   if (ExperimentalVectorShuffleLegality)
20623     // We only care that the types being shuffled are legal. The lowering can
20624     // handle any possible shuffle mask that results.
20625     return isTypeLegal(SVT);
20626
20627   // If this is a single-input shuffle with no 128 bit lane crossings we can
20628   // lower it into pshufb.
20629   if ((SVT.is128BitVector() && Subtarget->hasSSSE3()) ||
20630       (SVT.is256BitVector() && Subtarget->hasInt256())) {
20631     bool isLegal = true;
20632     for (unsigned I = 0, E = M.size(); I != E; ++I) {
20633       if (M[I] >= (int)SVT.getVectorNumElements() ||
20634           ShuffleCrosses128bitLane(SVT, I, M[I])) {
20635         isLegal = false;
20636         break;
20637       }
20638     }
20639     if (isLegal)
20640       return true;
20641   }
20642
20643   // FIXME: blends, shifts.
20644   return (SVT.getVectorNumElements() == 2 ||
20645           ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
20646           isMOVLMask(M, SVT) ||
20647           isCommutedMOVLMask(M, SVT) ||
20648           isMOVHLPSMask(M, SVT) ||
20649           isSHUFPMask(M, SVT) ||
20650           isSHUFPMask(M, SVT, /* Commuted */ true) ||
20651           isPSHUFDMask(M, SVT) ||
20652           isPSHUFDMask(M, SVT, /* SecondOperand */ true) ||
20653           isPSHUFHWMask(M, SVT, Subtarget->hasInt256()) ||
20654           isPSHUFLWMask(M, SVT, Subtarget->hasInt256()) ||
20655           isPALIGNRMask(M, SVT, Subtarget) ||
20656           isUNPCKLMask(M, SVT, Subtarget->hasInt256()) ||
20657           isUNPCKHMask(M, SVT, Subtarget->hasInt256()) ||
20658           isUNPCKL_v_undef_Mask(M, SVT, Subtarget->hasInt256()) ||
20659           isUNPCKH_v_undef_Mask(M, SVT, Subtarget->hasInt256()) ||
20660           isBlendMask(M, SVT, Subtarget->hasSSE41(), Subtarget->hasInt256()) ||
20661           (Subtarget->hasSSE41() && isINSERTPSMask(M, SVT)));
20662 }
20663
20664 bool
20665 X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
20666                                           EVT VT) const {
20667   if (!VT.isSimple())
20668     return false;
20669
20670   MVT SVT = VT.getSimpleVT();
20671
20672   // This is an experimental legality test that is tailored to match the
20673   // legality test of the experimental lowering more closely. They are gated
20674   // separately to ease testing of performance differences.
20675   if (ExperimentalVectorShuffleLegality)
20676     // The new vector shuffle lowering is very good at managing zero-inputs.
20677     return isShuffleMaskLegal(Mask, VT);
20678
20679   unsigned NumElts = SVT.getVectorNumElements();
20680   // FIXME: This collection of masks seems suspect.
20681   if (NumElts == 2)
20682     return true;
20683   if (NumElts == 4 && SVT.is128BitVector()) {
20684     return (isMOVLMask(Mask, SVT)  ||
20685             isCommutedMOVLMask(Mask, SVT, true) ||
20686             isSHUFPMask(Mask, SVT) ||
20687             isSHUFPMask(Mask, SVT, /* Commuted */ true) ||
20688             isBlendMask(Mask, SVT, Subtarget->hasSSE41(),
20689                         Subtarget->hasInt256()));
20690   }
20691   return false;
20692 }
20693
20694 //===----------------------------------------------------------------------===//
20695 //                           X86 Scheduler Hooks
20696 //===----------------------------------------------------------------------===//
20697
20698 /// Utility function to emit xbegin specifying the start of an RTM region.
20699 static MachineBasicBlock *EmitXBegin(MachineInstr *MI, MachineBasicBlock *MBB,
20700                                      const TargetInstrInfo *TII) {
20701   DebugLoc DL = MI->getDebugLoc();
20702
20703   const BasicBlock *BB = MBB->getBasicBlock();
20704   MachineFunction::iterator I = MBB;
20705   ++I;
20706
20707   // For the v = xbegin(), we generate
20708   //
20709   // thisMBB:
20710   //  xbegin sinkMBB
20711   //
20712   // mainMBB:
20713   //  eax = -1
20714   //
20715   // sinkMBB:
20716   //  v = eax
20717
20718   MachineBasicBlock *thisMBB = MBB;
20719   MachineFunction *MF = MBB->getParent();
20720   MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
20721   MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
20722   MF->insert(I, mainMBB);
20723   MF->insert(I, sinkMBB);
20724
20725   // Transfer the remainder of BB and its successor edges to sinkMBB.
20726   sinkMBB->splice(sinkMBB->begin(), MBB,
20727                   std::next(MachineBasicBlock::iterator(MI)), MBB->end());
20728   sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
20729
20730   // thisMBB:
20731   //  xbegin sinkMBB
20732   //  # fallthrough to mainMBB
20733   //  # abortion to sinkMBB
20734   BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(sinkMBB);
20735   thisMBB->addSuccessor(mainMBB);
20736   thisMBB->addSuccessor(sinkMBB);
20737
20738   // mainMBB:
20739   //  EAX = -1
20740   BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), X86::EAX).addImm(-1);
20741   mainMBB->addSuccessor(sinkMBB);
20742
20743   // sinkMBB:
20744   // EAX is live into the sinkMBB
20745   sinkMBB->addLiveIn(X86::EAX);
20746   BuildMI(*sinkMBB, sinkMBB->begin(), DL,
20747           TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
20748     .addReg(X86::EAX);
20749
20750   MI->eraseFromParent();
20751   return sinkMBB;
20752 }
20753
20754 // FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
20755 // or XMM0_V32I8 in AVX all of this code can be replaced with that
20756 // in the .td file.
20757 static MachineBasicBlock *EmitPCMPSTRM(MachineInstr *MI, MachineBasicBlock *BB,
20758                                        const TargetInstrInfo *TII) {
20759   unsigned Opc;
20760   switch (MI->getOpcode()) {
20761   default: llvm_unreachable("illegal opcode!");
20762   case X86::PCMPISTRM128REG:  Opc = X86::PCMPISTRM128rr;  break;
20763   case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break;
20764   case X86::PCMPISTRM128MEM:  Opc = X86::PCMPISTRM128rm;  break;
20765   case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break;
20766   case X86::PCMPESTRM128REG:  Opc = X86::PCMPESTRM128rr;  break;
20767   case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break;
20768   case X86::PCMPESTRM128MEM:  Opc = X86::PCMPESTRM128rm;  break;
20769   case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break;
20770   }
20771
20772   DebugLoc dl = MI->getDebugLoc();
20773   MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
20774
20775   unsigned NumArgs = MI->getNumOperands();
20776   for (unsigned i = 1; i < NumArgs; ++i) {
20777     MachineOperand &Op = MI->getOperand(i);
20778     if (!(Op.isReg() && Op.isImplicit()))
20779       MIB.addOperand(Op);
20780   }
20781   if (MI->hasOneMemOperand())
20782     MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
20783
20784   BuildMI(*BB, MI, dl,
20785     TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
20786     .addReg(X86::XMM0);
20787
20788   MI->eraseFromParent();
20789   return BB;
20790 }
20791
20792 // FIXME: Custom handling because TableGen doesn't support multiple implicit
20793 // defs in an instruction pattern
20794 static MachineBasicBlock *EmitPCMPSTRI(MachineInstr *MI, MachineBasicBlock *BB,
20795                                        const TargetInstrInfo *TII) {
20796   unsigned Opc;
20797   switch (MI->getOpcode()) {
20798   default: llvm_unreachable("illegal opcode!");
20799   case X86::PCMPISTRIREG:  Opc = X86::PCMPISTRIrr;  break;
20800   case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break;
20801   case X86::PCMPISTRIMEM:  Opc = X86::PCMPISTRIrm;  break;
20802   case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break;
20803   case X86::PCMPESTRIREG:  Opc = X86::PCMPESTRIrr;  break;
20804   case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break;
20805   case X86::PCMPESTRIMEM:  Opc = X86::PCMPESTRIrm;  break;
20806   case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break;
20807   }
20808
20809   DebugLoc dl = MI->getDebugLoc();
20810   MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
20811
20812   unsigned NumArgs = MI->getNumOperands(); // remove the results
20813   for (unsigned i = 1; i < NumArgs; ++i) {
20814     MachineOperand &Op = MI->getOperand(i);
20815     if (!(Op.isReg() && Op.isImplicit()))
20816       MIB.addOperand(Op);
20817   }
20818   if (MI->hasOneMemOperand())
20819     MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
20820
20821   BuildMI(*BB, MI, dl,
20822     TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
20823     .addReg(X86::ECX);
20824
20825   MI->eraseFromParent();
20826   return BB;
20827 }
20828
20829 static MachineBasicBlock *EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB,
20830                                       const X86Subtarget *Subtarget) {
20831   DebugLoc dl = MI->getDebugLoc();
20832   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
20833   // Address into RAX/EAX, other two args into ECX, EDX.
20834   unsigned MemOpc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r;
20835   unsigned MemReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
20836   MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
20837   for (int i = 0; i < X86::AddrNumOperands; ++i)
20838     MIB.addOperand(MI->getOperand(i));
20839
20840   unsigned ValOps = X86::AddrNumOperands;
20841   BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
20842     .addReg(MI->getOperand(ValOps).getReg());
20843   BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
20844     .addReg(MI->getOperand(ValOps+1).getReg());
20845
20846   // The instruction doesn't actually take any operands though.
20847   BuildMI(*BB, MI, dl, TII->get(X86::MONITORrrr));
20848
20849   MI->eraseFromParent(); // The pseudo is gone now.
20850   return BB;
20851 }
20852
20853 MachineBasicBlock *
20854 X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr *MI,
20855                                                  MachineBasicBlock *MBB) const {
20856   // Emit va_arg instruction on X86-64.
20857
20858   // Operands to this pseudo-instruction:
20859   // 0  ) Output        : destination address (reg)
20860   // 1-5) Input         : va_list address (addr, i64mem)
20861   // 6  ) ArgSize       : Size (in bytes) of vararg type
20862   // 7  ) ArgMode       : 0=overflow only, 1=use gp_offset, 2=use fp_offset
20863   // 8  ) Align         : Alignment of type
20864   // 9  ) EFLAGS (implicit-def)
20865
20866   assert(MI->getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
20867   assert(X86::AddrNumOperands == 5 && "VAARG_64 assumes 5 address operands");
20868
20869   unsigned DestReg = MI->getOperand(0).getReg();
20870   MachineOperand &Base = MI->getOperand(1);
20871   MachineOperand &Scale = MI->getOperand(2);
20872   MachineOperand &Index = MI->getOperand(3);
20873   MachineOperand &Disp = MI->getOperand(4);
20874   MachineOperand &Segment = MI->getOperand(5);
20875   unsigned ArgSize = MI->getOperand(6).getImm();
20876   unsigned ArgMode = MI->getOperand(7).getImm();
20877   unsigned Align = MI->getOperand(8).getImm();
20878
20879   // Memory Reference
20880   assert(MI->hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
20881   MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
20882   MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
20883
20884   // Machine Information
20885   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
20886   MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
20887   const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
20888   const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
20889   DebugLoc DL = MI->getDebugLoc();
20890
20891   // struct va_list {
20892   //   i32   gp_offset
20893   //   i32   fp_offset
20894   //   i64   overflow_area (address)
20895   //   i64   reg_save_area (address)
20896   // }
20897   // sizeof(va_list) = 24
20898   // alignment(va_list) = 8
20899
20900   unsigned TotalNumIntRegs = 6;
20901   unsigned TotalNumXMMRegs = 8;
20902   bool UseGPOffset = (ArgMode == 1);
20903   bool UseFPOffset = (ArgMode == 2);
20904   unsigned MaxOffset = TotalNumIntRegs * 8 +
20905                        (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
20906
20907   /* Align ArgSize to a multiple of 8 */
20908   unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
20909   bool NeedsAlign = (Align > 8);
20910
20911   MachineBasicBlock *thisMBB = MBB;
20912   MachineBasicBlock *overflowMBB;
20913   MachineBasicBlock *offsetMBB;
20914   MachineBasicBlock *endMBB;
20915
20916   unsigned OffsetDestReg = 0;    // Argument address computed by offsetMBB
20917   unsigned OverflowDestReg = 0;  // Argument address computed by overflowMBB
20918   unsigned OffsetReg = 0;
20919
20920   if (!UseGPOffset && !UseFPOffset) {
20921     // If we only pull from the overflow region, we don't create a branch.
20922     // We don't need to alter control flow.
20923     OffsetDestReg = 0; // unused
20924     OverflowDestReg = DestReg;
20925
20926     offsetMBB = nullptr;
20927     overflowMBB = thisMBB;
20928     endMBB = thisMBB;
20929   } else {
20930     // First emit code to check if gp_offset (or fp_offset) is below the bound.
20931     // If so, pull the argument from reg_save_area. (branch to offsetMBB)
20932     // If not, pull from overflow_area. (branch to overflowMBB)
20933     //
20934     //       thisMBB
20935     //         |     .
20936     //         |        .
20937     //     offsetMBB   overflowMBB
20938     //         |        .
20939     //         |     .
20940     //        endMBB
20941
20942     // Registers for the PHI in endMBB
20943     OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
20944     OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
20945
20946     const BasicBlock *LLVM_BB = MBB->getBasicBlock();
20947     MachineFunction *MF = MBB->getParent();
20948     overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
20949     offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
20950     endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
20951
20952     MachineFunction::iterator MBBIter = MBB;
20953     ++MBBIter;
20954
20955     // Insert the new basic blocks
20956     MF->insert(MBBIter, offsetMBB);
20957     MF->insert(MBBIter, overflowMBB);
20958     MF->insert(MBBIter, endMBB);
20959
20960     // Transfer the remainder of MBB and its successor edges to endMBB.
20961     endMBB->splice(endMBB->begin(), thisMBB,
20962                    std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
20963     endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
20964
20965     // Make offsetMBB and overflowMBB successors of thisMBB
20966     thisMBB->addSuccessor(offsetMBB);
20967     thisMBB->addSuccessor(overflowMBB);
20968
20969     // endMBB is a successor of both offsetMBB and overflowMBB
20970     offsetMBB->addSuccessor(endMBB);
20971     overflowMBB->addSuccessor(endMBB);
20972
20973     // Load the offset value into a register
20974     OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
20975     BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
20976       .addOperand(Base)
20977       .addOperand(Scale)
20978       .addOperand(Index)
20979       .addDisp(Disp, UseFPOffset ? 4 : 0)
20980       .addOperand(Segment)
20981       .setMemRefs(MMOBegin, MMOEnd);
20982
20983     // Check if there is enough room left to pull this argument.
20984     BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
20985       .addReg(OffsetReg)
20986       .addImm(MaxOffset + 8 - ArgSizeA8);
20987
20988     // Branch to "overflowMBB" if offset >= max
20989     // Fall through to "offsetMBB" otherwise
20990     BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
20991       .addMBB(overflowMBB);
20992   }
20993
20994   // In offsetMBB, emit code to use the reg_save_area.
20995   if (offsetMBB) {
20996     assert(OffsetReg != 0);
20997
20998     // Read the reg_save_area address.
20999     unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
21000     BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
21001       .addOperand(Base)
21002       .addOperand(Scale)
21003       .addOperand(Index)
21004       .addDisp(Disp, 16)
21005       .addOperand(Segment)
21006       .setMemRefs(MMOBegin, MMOEnd);
21007
21008     // Zero-extend the offset
21009     unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
21010       BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
21011         .addImm(0)
21012         .addReg(OffsetReg)
21013         .addImm(X86::sub_32bit);
21014
21015     // Add the offset to the reg_save_area to get the final address.
21016     BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
21017       .addReg(OffsetReg64)
21018       .addReg(RegSaveReg);
21019
21020     // Compute the offset for the next argument
21021     unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
21022     BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
21023       .addReg(OffsetReg)
21024       .addImm(UseFPOffset ? 16 : 8);
21025
21026     // Store it back into the va_list.
21027     BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
21028       .addOperand(Base)
21029       .addOperand(Scale)
21030       .addOperand(Index)
21031       .addDisp(Disp, UseFPOffset ? 4 : 0)
21032       .addOperand(Segment)
21033       .addReg(NextOffsetReg)
21034       .setMemRefs(MMOBegin, MMOEnd);
21035
21036     // Jump to endMBB
21037     BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
21038       .addMBB(endMBB);
21039   }
21040
21041   //
21042   // Emit code to use overflow area
21043   //
21044
21045   // Load the overflow_area address into a register.
21046   unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
21047   BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
21048     .addOperand(Base)
21049     .addOperand(Scale)
21050     .addOperand(Index)
21051     .addDisp(Disp, 8)
21052     .addOperand(Segment)
21053     .setMemRefs(MMOBegin, MMOEnd);
21054
21055   // If we need to align it, do so. Otherwise, just copy the address
21056   // to OverflowDestReg.
21057   if (NeedsAlign) {
21058     // Align the overflow address
21059     assert((Align & (Align-1)) == 0 && "Alignment must be a power of 2");
21060     unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);
21061
21062     // aligned_addr = (addr + (align-1)) & ~(align-1)
21063     BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
21064       .addReg(OverflowAddrReg)
21065       .addImm(Align-1);
21066
21067     BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
21068       .addReg(TmpReg)
21069       .addImm(~(uint64_t)(Align-1));
21070   } else {
21071     BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
21072       .addReg(OverflowAddrReg);
21073   }
21074
21075   // Compute the next overflow address after this argument.
21076   // (the overflow address should be kept 8-byte aligned)
21077   unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
21078   BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
21079     .addReg(OverflowDestReg)
21080     .addImm(ArgSizeA8);
21081
21082   // Store the new overflow address.
21083   BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
21084     .addOperand(Base)
21085     .addOperand(Scale)
21086     .addOperand(Index)
21087     .addDisp(Disp, 8)
21088     .addOperand(Segment)
21089     .addReg(NextAddrReg)
21090     .setMemRefs(MMOBegin, MMOEnd);
21091
21092   // If we branched, emit the PHI to the front of endMBB.
21093   if (offsetMBB) {
21094     BuildMI(*endMBB, endMBB->begin(), DL,
21095             TII->get(X86::PHI), DestReg)
21096       .addReg(OffsetDestReg).addMBB(offsetMBB)
21097       .addReg(OverflowDestReg).addMBB(overflowMBB);
21098   }
21099
21100   // Erase the pseudo instruction
21101   MI->eraseFromParent();
21102
21103   return endMBB;
21104 }
21105
21106 MachineBasicBlock *
21107 X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
21108                                                  MachineInstr *MI,
21109                                                  MachineBasicBlock *MBB) const {
21110   // Emit code to save XMM registers to the stack. The ABI says that the
21111   // number of registers to save is given in %al, so it's theoretically
21112   // possible to do an indirect jump trick to avoid saving all of them,
21113   // however this code takes a simpler approach and just executes all
21114   // of the stores if %al is non-zero. It's less code, and it's probably
21115   // easier on the hardware branch predictor, and stores aren't all that
21116   // expensive anyway.
21117
21118   // Create the new basic blocks. One block contains all the XMM stores,
21119   // and one block is the final destination regardless of whether any
21120   // stores were performed.
21121   const BasicBlock *LLVM_BB = MBB->getBasicBlock();
21122   MachineFunction *F = MBB->getParent();
21123   MachineFunction::iterator MBBIter = MBB;
21124   ++MBBIter;
21125   MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
21126   MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
21127   F->insert(MBBIter, XMMSaveMBB);
21128   F->insert(MBBIter, EndMBB);
21129
21130   // Transfer the remainder of MBB and its successor edges to EndMBB.
21131   EndMBB->splice(EndMBB->begin(), MBB,
21132                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
21133   EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
21134
21135   // The original block will now fall through to the XMM save block.
21136   MBB->addSuccessor(XMMSaveMBB);
21137   // The XMMSaveMBB will fall through to the end block.
21138   XMMSaveMBB->addSuccessor(EndMBB);
21139
21140   // Now add the instructions.
21141   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
21142   DebugLoc DL = MI->getDebugLoc();
21143
21144   unsigned CountReg = MI->getOperand(0).getReg();
21145   int64_t RegSaveFrameIndex = MI->getOperand(1).getImm();
21146   int64_t VarArgsFPOffset = MI->getOperand(2).getImm();
21147
21148   if (!Subtarget->isTargetWin64()) {
21149     // If %al is 0, branch around the XMM save block.
21150     BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
21151     BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB);
21152     MBB->addSuccessor(EndMBB);
21153   }
21154
21155   // Make sure the last operand is EFLAGS, which gets clobbered by the branch
21156   // that was just emitted, but clearly shouldn't be "saved".
21157   assert((MI->getNumOperands() <= 3 ||
21158           !MI->getOperand(MI->getNumOperands() - 1).isReg() ||
21159           MI->getOperand(MI->getNumOperands() - 1).getReg() == X86::EFLAGS)
21160          && "Expected last argument to be EFLAGS");
21161   unsigned MOVOpc = Subtarget->hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr;
21162   // In the XMM save block, save all the XMM argument registers.
21163   for (int i = 3, e = MI->getNumOperands() - 1; i != e; ++i) {
21164     int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
21165     MachineMemOperand *MMO =
21166       F->getMachineMemOperand(
21167           MachinePointerInfo::getFixedStack(RegSaveFrameIndex, Offset),
21168         MachineMemOperand::MOStore,
21169         /*Size=*/16, /*Align=*/16);
21170     BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
21171       .addFrameIndex(RegSaveFrameIndex)
21172       .addImm(/*Scale=*/1)
21173       .addReg(/*IndexReg=*/0)
21174       .addImm(/*Disp=*/Offset)
21175       .addReg(/*Segment=*/0)
21176       .addReg(MI->getOperand(i).getReg())
21177       .addMemOperand(MMO);
21178   }
21179
21180   MI->eraseFromParent();   // The pseudo instruction is gone now.
21181
21182   return EndMBB;
21183 }
21184
21185 // The EFLAGS operand of SelectItr might be missing a kill marker
21186 // because there were multiple uses of EFLAGS, and ISel didn't know
21187 // which to mark. Figure out whether SelectItr should have had a
21188 // kill marker, and set it if it should. Returns the correct kill
21189 // marker value.
21190 static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
21191                                      MachineBasicBlock* BB,
21192                                      const TargetRegisterInfo* TRI) {
21193   // Scan forward through BB for a use/def of EFLAGS.
21194   MachineBasicBlock::iterator miI(std::next(SelectItr));
21195   for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
21196     const MachineInstr& mi = *miI;
21197     if (mi.readsRegister(X86::EFLAGS))
21198       return false;
21199     if (mi.definesRegister(X86::EFLAGS))
21200       break; // Should have kill-flag - update below.
21201   }
21202
21203   // If we hit the end of the block, check whether EFLAGS is live into a
21204   // successor.
21205   if (miI == BB->end()) {
21206     for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
21207                                           sEnd = BB->succ_end();
21208          sItr != sEnd; ++sItr) {
21209       MachineBasicBlock* succ = *sItr;
21210       if (succ->isLiveIn(X86::EFLAGS))
21211         return false;
21212     }
21213   }
21214
21215   // We found a def, or hit the end of the basic block and EFLAGS wasn't live
21216   // out. SelectMI should have a kill flag on EFLAGS.
21217   SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
21218   return true;
21219 }
21220
21221 MachineBasicBlock *
21222 X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
21223                                      MachineBasicBlock *BB) const {
21224   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
21225   DebugLoc DL = MI->getDebugLoc();
21226
21227   // To "insert" a SELECT_CC instruction, we actually have to insert the
21228   // diamond control-flow pattern.  The incoming instruction knows the
21229   // destination vreg to set, the condition code register to branch on, the
21230   // true/false values to select between, and a branch opcode to use.
21231   const BasicBlock *LLVM_BB = BB->getBasicBlock();
21232   MachineFunction::iterator It = BB;
21233   ++It;
21234
21235   //  thisMBB:
21236   //  ...
21237   //   TrueVal = ...
21238   //   cmpTY ccX, r1, r2
21239   //   bCC copy1MBB
21240   //   fallthrough --> copy0MBB
21241   MachineBasicBlock *thisMBB = BB;
21242   MachineFunction *F = BB->getParent();
21243   MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
21244   MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
21245   F->insert(It, copy0MBB);
21246   F->insert(It, sinkMBB);
21247
21248   // If the EFLAGS register isn't dead in the terminator, then claim that it's
21249   // live into the sink and copy blocks.
21250   const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
21251   if (!MI->killsRegister(X86::EFLAGS) &&
21252       !checkAndUpdateEFLAGSKill(MI, BB, TRI)) {
21253     copy0MBB->addLiveIn(X86::EFLAGS);
21254     sinkMBB->addLiveIn(X86::EFLAGS);
21255   }
21256
21257   // Transfer the remainder of BB and its successor edges to sinkMBB.
21258   sinkMBB->splice(sinkMBB->begin(), BB,
21259                   std::next(MachineBasicBlock::iterator(MI)), BB->end());
21260   sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
21261
21262   // Add the true and fallthrough blocks as its successors.
21263   BB->addSuccessor(copy0MBB);
21264   BB->addSuccessor(sinkMBB);
21265
21266   // Create the conditional branch instruction.
21267   unsigned Opc =
21268     X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm());
21269   BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);
21270
21271   //  copy0MBB:
21272   //   %FalseValue = ...
21273   //   # fallthrough to sinkMBB
21274   copy0MBB->addSuccessor(sinkMBB);
21275
21276   //  sinkMBB:
21277   //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
21278   //  ...
21279   BuildMI(*sinkMBB, sinkMBB->begin(), DL,
21280           TII->get(X86::PHI), MI->getOperand(0).getReg())
21281     .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
21282     .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
21283
21284   MI->eraseFromParent();   // The pseudo instruction is gone now.
21285   return sinkMBB;
21286 }
21287
21288 MachineBasicBlock *
21289 X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI,
21290                                         MachineBasicBlock *BB) const {
21291   MachineFunction *MF = BB->getParent();
21292   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
21293   DebugLoc DL = MI->getDebugLoc();
21294   const BasicBlock *LLVM_BB = BB->getBasicBlock();
21295
21296   assert(MF->shouldSplitStack());
21297
21298   const bool Is64Bit = Subtarget->is64Bit();
21299   const bool IsLP64 = Subtarget->isTarget64BitLP64();
21300
21301   const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
21302   const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
21303
21304   // BB:
21305   //  ... [Till the alloca]
21306   // If stacklet is not large enough, jump to mallocMBB
21307   //
21308   // bumpMBB:
21309   //  Allocate by subtracting from RSP
21310   //  Jump to continueMBB
21311   //
21312   // mallocMBB:
21313   //  Allocate by call to runtime
21314   //
21315   // continueMBB:
21316   //  ...
21317   //  [rest of original BB]
21318   //
21319
21320   MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
21321   MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
21322   MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
21323
21324   MachineRegisterInfo &MRI = MF->getRegInfo();
21325   const TargetRegisterClass *AddrRegClass =
21326     getRegClassFor(getPointerTy());
21327
21328   unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
21329     bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
21330     tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
21331     SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
21332     sizeVReg = MI->getOperand(1).getReg(),
21333     physSPReg = IsLP64 || Subtarget->isTargetNaCl64() ? X86::RSP : X86::ESP;
21334
21335   MachineFunction::iterator MBBIter = BB;
21336   ++MBBIter;
21337
21338   MF->insert(MBBIter, bumpMBB);
21339   MF->insert(MBBIter, mallocMBB);
21340   MF->insert(MBBIter, continueMBB);
21341
21342   continueMBB->splice(continueMBB->begin(), BB,
21343                       std::next(MachineBasicBlock::iterator(MI)), BB->end());
21344   continueMBB->transferSuccessorsAndUpdatePHIs(BB);
21345
21346   // Add code to the main basic block to check if the stack limit has been hit,
21347   // and if so, jump to mallocMBB otherwise to bumpMBB.
21348   BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
21349   BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
21350     .addReg(tmpSPVReg).addReg(sizeVReg);
21351   BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
21352     .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
21353     .addReg(SPLimitVReg);
21354   BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB);
21355
21356   // bumpMBB simply decreases the stack pointer, since we know the current
21357   // stacklet has enough space.
21358   BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
21359     .addReg(SPLimitVReg);
21360   BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
21361     .addReg(SPLimitVReg);
21362   BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
21363
21364   // Calls into a routine in libgcc to allocate more space from the heap.
21365   const uint32_t *RegMask =
21366       Subtarget->getRegisterInfo()->getCallPreservedMask(CallingConv::C);
21367   if (IsLP64) {
21368     BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
21369       .addReg(sizeVReg);
21370     BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
21371       .addExternalSymbol("__morestack_allocate_stack_space")
21372       .addRegMask(RegMask)
21373       .addReg(X86::RDI, RegState::Implicit)
21374       .addReg(X86::RAX, RegState::ImplicitDefine);
21375   } else if (Is64Bit) {
21376     BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
21377       .addReg(sizeVReg);
21378     BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
21379       .addExternalSymbol("__morestack_allocate_stack_space")
21380       .addRegMask(RegMask)
21381       .addReg(X86::EDI, RegState::Implicit)
21382       .addReg(X86::EAX, RegState::ImplicitDefine);
21383   } else {
21384     BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
21385       .addImm(12);
21386     BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
21387     BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
21388       .addExternalSymbol("__morestack_allocate_stack_space")
21389       .addRegMask(RegMask)
21390       .addReg(X86::EAX, RegState::ImplicitDefine);
21391   }
21392
21393   if (!Is64Bit)
21394     BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
21395       .addImm(16);
21396
21397   BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
21398     .addReg(IsLP64 ? X86::RAX : X86::EAX);
21399   BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
21400
21401   // Set up the CFG correctly.
21402   BB->addSuccessor(bumpMBB);
21403   BB->addSuccessor(mallocMBB);
21404   mallocMBB->addSuccessor(continueMBB);
21405   bumpMBB->addSuccessor(continueMBB);
21406
21407   // Take care of the PHI nodes.
21408   BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
21409           MI->getOperand(0).getReg())
21410     .addReg(mallocPtrVReg).addMBB(mallocMBB)
21411     .addReg(bumpSPPtrVReg).addMBB(bumpMBB);
21412
21413   // Delete the original pseudo instruction.
21414   MI->eraseFromParent();
21415
21416   // And we're done.
21417   return continueMBB;
21418 }
21419
21420 MachineBasicBlock *
21421 X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI,
21422                                         MachineBasicBlock *BB) const {
21423   DebugLoc DL = MI->getDebugLoc();
21424
21425   assert(!Subtarget->isTargetMachO());
21426
21427   X86FrameLowering::emitStackProbeCall(*BB->getParent(), *BB, MI, DL);
21428
21429   MI->eraseFromParent();   // The pseudo instruction is gone now.
21430   return BB;
21431 }
21432
21433 MachineBasicBlock *
21434 X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI,
21435                                       MachineBasicBlock *BB) const {
21436   // This is pretty easy.  We're taking the value that we received from
21437   // our load from the relocation, sticking it in either RDI (x86-64)
21438   // or EAX and doing an indirect call.  The return value will then
21439   // be in the normal return register.
21440   MachineFunction *F = BB->getParent();
21441   const X86InstrInfo *TII = Subtarget->getInstrInfo();
21442   DebugLoc DL = MI->getDebugLoc();
21443
21444   assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?");
21445   assert(MI->getOperand(3).isGlobal() && "This should be a global");
21446
21447   // Get a register mask for the lowered call.
21448   // FIXME: The 32-bit calls have non-standard calling conventions. Use a
21449   // proper register mask.
21450   const uint32_t *RegMask =
21451       Subtarget->getRegisterInfo()->getCallPreservedMask(CallingConv::C);
21452   if (Subtarget->is64Bit()) {
21453     MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
21454                                       TII->get(X86::MOV64rm), X86::RDI)
21455     .addReg(X86::RIP)
21456     .addImm(0).addReg(0)
21457     .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
21458                       MI->getOperand(3).getTargetFlags())
21459     .addReg(0);
21460     MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
21461     addDirectMem(MIB, X86::RDI);
21462     MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
21463   } else if (F->getTarget().getRelocationModel() != Reloc::PIC_) {
21464     MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
21465                                       TII->get(X86::MOV32rm), X86::EAX)
21466     .addReg(0)
21467     .addImm(0).addReg(0)
21468     .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
21469                       MI->getOperand(3).getTargetFlags())
21470     .addReg(0);
21471     MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
21472     addDirectMem(MIB, X86::EAX);
21473     MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
21474   } else {
21475     MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
21476                                       TII->get(X86::MOV32rm), X86::EAX)
21477     .addReg(TII->getGlobalBaseReg(F))
21478     .addImm(0).addReg(0)
21479     .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
21480                       MI->getOperand(3).getTargetFlags())
21481     .addReg(0);
21482     MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
21483     addDirectMem(MIB, X86::EAX);
21484     MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
21485   }
21486
21487   MI->eraseFromParent(); // The pseudo instruction is gone now.
21488   return BB;
21489 }
21490
21491 MachineBasicBlock *
21492 X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
21493                                     MachineBasicBlock *MBB) const {
21494   DebugLoc DL = MI->getDebugLoc();
21495   MachineFunction *MF = MBB->getParent();
21496   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
21497   MachineRegisterInfo &MRI = MF->getRegInfo();
21498
21499   const BasicBlock *BB = MBB->getBasicBlock();
21500   MachineFunction::iterator I = MBB;
21501   ++I;
21502
21503   // Memory Reference
21504   MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
21505   MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
21506
21507   unsigned DstReg;
21508   unsigned MemOpndSlot = 0;
21509
21510   unsigned CurOp = 0;
21511
21512   DstReg = MI->getOperand(CurOp++).getReg();
21513   const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
21514   assert(RC->hasType(MVT::i32) && "Invalid destination!");
21515   unsigned mainDstReg = MRI.createVirtualRegister(RC);
21516   unsigned restoreDstReg = MRI.createVirtualRegister(RC);
21517
21518   MemOpndSlot = CurOp;
21519
21520   MVT PVT = getPointerTy();
21521   assert((PVT == MVT::i64 || PVT == MVT::i32) &&
21522          "Invalid Pointer Size!");
21523
21524   // For v = setjmp(buf), we generate
21525   //
21526   // thisMBB:
21527   //  buf[LabelOffset] = restoreMBB
21528   //  SjLjSetup restoreMBB
21529   //
21530   // mainMBB:
21531   //  v_main = 0
21532   //
21533   // sinkMBB:
21534   //  v = phi(main, restore)
21535   //
21536   // restoreMBB:
21537   //  if base pointer being used, load it from frame
21538   //  v_restore = 1
21539
21540   MachineBasicBlock *thisMBB = MBB;
21541   MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
21542   MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
21543   MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
21544   MF->insert(I, mainMBB);
21545   MF->insert(I, sinkMBB);
21546   MF->push_back(restoreMBB);
21547
21548   MachineInstrBuilder MIB;
21549
21550   // Transfer the remainder of BB and its successor edges to sinkMBB.
21551   sinkMBB->splice(sinkMBB->begin(), MBB,
21552                   std::next(MachineBasicBlock::iterator(MI)), MBB->end());
21553   sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
21554
21555   // thisMBB:
21556   unsigned PtrStoreOpc = 0;
21557   unsigned LabelReg = 0;
21558   const int64_t LabelOffset = 1 * PVT.getStoreSize();
21559   Reloc::Model RM = MF->getTarget().getRelocationModel();
21560   bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
21561                      (RM == Reloc::Static || RM == Reloc::DynamicNoPIC);
21562
21563   // Prepare IP either in reg or imm.
21564   if (!UseImmLabel) {
21565     PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
21566     const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
21567     LabelReg = MRI.createVirtualRegister(PtrRC);
21568     if (Subtarget->is64Bit()) {
21569       MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
21570               .addReg(X86::RIP)
21571               .addImm(0)
21572               .addReg(0)
21573               .addMBB(restoreMBB)
21574               .addReg(0);
21575     } else {
21576       const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
21577       MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
21578               .addReg(XII->getGlobalBaseReg(MF))
21579               .addImm(0)
21580               .addReg(0)
21581               .addMBB(restoreMBB, Subtarget->ClassifyBlockAddressReference())
21582               .addReg(0);
21583     }
21584   } else
21585     PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
21586   // Store IP
21587   MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
21588   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
21589     if (i == X86::AddrDisp)
21590       MIB.addDisp(MI->getOperand(MemOpndSlot + i), LabelOffset);
21591     else
21592       MIB.addOperand(MI->getOperand(MemOpndSlot + i));
21593   }
21594   if (!UseImmLabel)
21595     MIB.addReg(LabelReg);
21596   else
21597     MIB.addMBB(restoreMBB);
21598   MIB.setMemRefs(MMOBegin, MMOEnd);
21599   // Setup
21600   MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
21601           .addMBB(restoreMBB);
21602
21603   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
21604   MIB.addRegMask(RegInfo->getNoPreservedMask());
21605   thisMBB->addSuccessor(mainMBB);
21606   thisMBB->addSuccessor(restoreMBB);
21607
21608   // mainMBB:
21609   //  EAX = 0
21610   BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
21611   mainMBB->addSuccessor(sinkMBB);
21612
21613   // sinkMBB:
21614   BuildMI(*sinkMBB, sinkMBB->begin(), DL,
21615           TII->get(X86::PHI), DstReg)
21616     .addReg(mainDstReg).addMBB(mainMBB)
21617     .addReg(restoreDstReg).addMBB(restoreMBB);
21618
21619   // restoreMBB:
21620   if (RegInfo->hasBasePointer(*MF)) {
21621     const bool Uses64BitFramePtr =
21622         Subtarget->isTarget64BitLP64() || Subtarget->isTargetNaCl64();
21623     X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
21624     X86FI->setRestoreBasePointer(MF);
21625     unsigned FramePtr = RegInfo->getFrameRegister(*MF);
21626     unsigned BasePtr = RegInfo->getBaseRegister();
21627     unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
21628     addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
21629                  FramePtr, true, X86FI->getRestoreBasePointerOffset())
21630       .setMIFlag(MachineInstr::FrameSetup);
21631   }
21632   BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
21633   BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
21634   restoreMBB->addSuccessor(sinkMBB);
21635
21636   MI->eraseFromParent();
21637   return sinkMBB;
21638 }
21639
21640 MachineBasicBlock *
21641 X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
21642                                      MachineBasicBlock *MBB) const {
21643   DebugLoc DL = MI->getDebugLoc();
21644   MachineFunction *MF = MBB->getParent();
21645   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
21646   MachineRegisterInfo &MRI = MF->getRegInfo();
21647
21648   // Memory Reference
21649   MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
21650   MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
21651
21652   MVT PVT = getPointerTy();
21653   assert((PVT == MVT::i64 || PVT == MVT::i32) &&
21654          "Invalid Pointer Size!");
21655
21656   const TargetRegisterClass *RC =
21657     (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
21658   unsigned Tmp = MRI.createVirtualRegister(RC);
21659   // Since FP is only updated here but NOT referenced, it's treated as GPR.
21660   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
21661   unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
21662   unsigned SP = RegInfo->getStackRegister();
21663
21664   MachineInstrBuilder MIB;
21665
21666   const int64_t LabelOffset = 1 * PVT.getStoreSize();
21667   const int64_t SPOffset = 2 * PVT.getStoreSize();
21668
21669   unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
21670   unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
21671
21672   // Reload FP
21673   MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP);
21674   for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
21675     MIB.addOperand(MI->getOperand(i));
21676   MIB.setMemRefs(MMOBegin, MMOEnd);
21677   // Reload IP
21678   MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
21679   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
21680     if (i == X86::AddrDisp)
21681       MIB.addDisp(MI->getOperand(i), LabelOffset);
21682     else
21683       MIB.addOperand(MI->getOperand(i));
21684   }
21685   MIB.setMemRefs(MMOBegin, MMOEnd);
21686   // Reload SP
21687   MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP);
21688   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
21689     if (i == X86::AddrDisp)
21690       MIB.addDisp(MI->getOperand(i), SPOffset);
21691     else
21692       MIB.addOperand(MI->getOperand(i));
21693   }
21694   MIB.setMemRefs(MMOBegin, MMOEnd);
21695   // Jump
21696   BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
21697
21698   MI->eraseFromParent();
21699   return MBB;
21700 }
21701
21702 // Replace 213-type (isel default) FMA3 instructions with 231-type for
21703 // accumulator loops. Writing back to the accumulator allows the coalescer
21704 // to remove extra copies in the loop.
21705 MachineBasicBlock *
21706 X86TargetLowering::emitFMA3Instr(MachineInstr *MI,
21707                                  MachineBasicBlock *MBB) const {
21708   MachineOperand &AddendOp = MI->getOperand(3);
21709
21710   // Bail out early if the addend isn't a register - we can't switch these.
21711   if (!AddendOp.isReg())
21712     return MBB;
21713
21714   MachineFunction &MF = *MBB->getParent();
21715   MachineRegisterInfo &MRI = MF.getRegInfo();
21716
21717   // Check whether the addend is defined by a PHI:
21718   assert(MRI.hasOneDef(AddendOp.getReg()) && "Multiple defs in SSA?");
21719   MachineInstr &AddendDef = *MRI.def_instr_begin(AddendOp.getReg());
21720   if (!AddendDef.isPHI())
21721     return MBB;
21722
21723   // Look for the following pattern:
21724   // loop:
21725   //   %addend = phi [%entry, 0], [%loop, %result]
21726   //   ...
21727   //   %result<tied1> = FMA213 %m2<tied0>, %m1, %addend
21728
21729   // Replace with:
21730   //   loop:
21731   //   %addend = phi [%entry, 0], [%loop, %result]
21732   //   ...
21733   //   %result<tied1> = FMA231 %addend<tied0>, %m1, %m2
21734
21735   for (unsigned i = 1, e = AddendDef.getNumOperands(); i < e; i += 2) {
21736     assert(AddendDef.getOperand(i).isReg());
21737     MachineOperand PHISrcOp = AddendDef.getOperand(i);
21738     MachineInstr &PHISrcInst = *MRI.def_instr_begin(PHISrcOp.getReg());
21739     if (&PHISrcInst == MI) {
21740       // Found a matching instruction.
21741       unsigned NewFMAOpc = 0;
21742       switch (MI->getOpcode()) {
21743         case X86::VFMADDPDr213r: NewFMAOpc = X86::VFMADDPDr231r; break;
21744         case X86::VFMADDPSr213r: NewFMAOpc = X86::VFMADDPSr231r; break;
21745         case X86::VFMADDSDr213r: NewFMAOpc = X86::VFMADDSDr231r; break;
21746         case X86::VFMADDSSr213r: NewFMAOpc = X86::VFMADDSSr231r; break;
21747         case X86::VFMSUBPDr213r: NewFMAOpc = X86::VFMSUBPDr231r; break;
21748         case X86::VFMSUBPSr213r: NewFMAOpc = X86::VFMSUBPSr231r; break;
21749         case X86::VFMSUBSDr213r: NewFMAOpc = X86::VFMSUBSDr231r; break;
21750         case X86::VFMSUBSSr213r: NewFMAOpc = X86::VFMSUBSSr231r; break;
21751         case X86::VFNMADDPDr213r: NewFMAOpc = X86::VFNMADDPDr231r; break;
21752         case X86::VFNMADDPSr213r: NewFMAOpc = X86::VFNMADDPSr231r; break;
21753         case X86::VFNMADDSDr213r: NewFMAOpc = X86::VFNMADDSDr231r; break;
21754         case X86::VFNMADDSSr213r: NewFMAOpc = X86::VFNMADDSSr231r; break;
21755         case X86::VFNMSUBPDr213r: NewFMAOpc = X86::VFNMSUBPDr231r; break;
21756         case X86::VFNMSUBPSr213r: NewFMAOpc = X86::VFNMSUBPSr231r; break;
21757         case X86::VFNMSUBSDr213r: NewFMAOpc = X86::VFNMSUBSDr231r; break;
21758         case X86::VFNMSUBSSr213r: NewFMAOpc = X86::VFNMSUBSSr231r; break;
21759         case X86::VFMADDSUBPDr213r: NewFMAOpc = X86::VFMADDSUBPDr231r; break;
21760         case X86::VFMADDSUBPSr213r: NewFMAOpc = X86::VFMADDSUBPSr231r; break;
21761         case X86::VFMSUBADDPDr213r: NewFMAOpc = X86::VFMSUBADDPDr231r; break;
21762         case X86::VFMSUBADDPSr213r: NewFMAOpc = X86::VFMSUBADDPSr231r; break;
21763
21764         case X86::VFMADDPDr213rY: NewFMAOpc = X86::VFMADDPDr231rY; break;
21765         case X86::VFMADDPSr213rY: NewFMAOpc = X86::VFMADDPSr231rY; break;
21766         case X86::VFMSUBPDr213rY: NewFMAOpc = X86::VFMSUBPDr231rY; break;
21767         case X86::VFMSUBPSr213rY: NewFMAOpc = X86::VFMSUBPSr231rY; break;
21768         case X86::VFNMADDPDr213rY: NewFMAOpc = X86::VFNMADDPDr231rY; break;
21769         case X86::VFNMADDPSr213rY: NewFMAOpc = X86::VFNMADDPSr231rY; break;
21770         case X86::VFNMSUBPDr213rY: NewFMAOpc = X86::VFNMSUBPDr231rY; break;
21771         case X86::VFNMSUBPSr213rY: NewFMAOpc = X86::VFNMSUBPSr231rY; break;
21772         case X86::VFMADDSUBPDr213rY: NewFMAOpc = X86::VFMADDSUBPDr231rY; break;
21773         case X86::VFMADDSUBPSr213rY: NewFMAOpc = X86::VFMADDSUBPSr231rY; break;
21774         case X86::VFMSUBADDPDr213rY: NewFMAOpc = X86::VFMSUBADDPDr231rY; break;
21775         case X86::VFMSUBADDPSr213rY: NewFMAOpc = X86::VFMSUBADDPSr231rY; break;
21776         default: llvm_unreachable("Unrecognized FMA variant.");
21777       }
21778
21779       const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
21780       MachineInstrBuilder MIB =
21781         BuildMI(MF, MI->getDebugLoc(), TII.get(NewFMAOpc))
21782         .addOperand(MI->getOperand(0))
21783         .addOperand(MI->getOperand(3))
21784         .addOperand(MI->getOperand(2))
21785         .addOperand(MI->getOperand(1));
21786       MBB->insert(MachineBasicBlock::iterator(MI), MIB);
21787       MI->eraseFromParent();
21788     }
21789   }
21790
21791   return MBB;
21792 }
21793
21794 MachineBasicBlock *
21795 X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
21796                                                MachineBasicBlock *BB) const {
21797   switch (MI->getOpcode()) {
21798   default: llvm_unreachable("Unexpected instr type to insert");
21799   case X86::TAILJMPd64:
21800   case X86::TAILJMPr64:
21801   case X86::TAILJMPm64:
21802   case X86::TAILJMPd64_REX:
21803   case X86::TAILJMPr64_REX:
21804   case X86::TAILJMPm64_REX:
21805     llvm_unreachable("TAILJMP64 would not be touched here.");
21806   case X86::TCRETURNdi64:
21807   case X86::TCRETURNri64:
21808   case X86::TCRETURNmi64:
21809     return BB;
21810   case X86::WIN_ALLOCA:
21811     return EmitLoweredWinAlloca(MI, BB);
21812   case X86::SEG_ALLOCA_32:
21813   case X86::SEG_ALLOCA_64:
21814     return EmitLoweredSegAlloca(MI, BB);
21815   case X86::TLSCall_32:
21816   case X86::TLSCall_64:
21817     return EmitLoweredTLSCall(MI, BB);
21818   case X86::CMOV_GR8:
21819   case X86::CMOV_FR32:
21820   case X86::CMOV_FR64:
21821   case X86::CMOV_V4F32:
21822   case X86::CMOV_V2F64:
21823   case X86::CMOV_V2I64:
21824   case X86::CMOV_V8F32:
21825   case X86::CMOV_V4F64:
21826   case X86::CMOV_V4I64:
21827   case X86::CMOV_V16F32:
21828   case X86::CMOV_V8F64:
21829   case X86::CMOV_V8I64:
21830   case X86::CMOV_GR16:
21831   case X86::CMOV_GR32:
21832   case X86::CMOV_RFP32:
21833   case X86::CMOV_RFP64:
21834   case X86::CMOV_RFP80:
21835     return EmitLoweredSelect(MI, BB);
21836
21837   case X86::FP32_TO_INT16_IN_MEM:
21838   case X86::FP32_TO_INT32_IN_MEM:
21839   case X86::FP32_TO_INT64_IN_MEM:
21840   case X86::FP64_TO_INT16_IN_MEM:
21841   case X86::FP64_TO_INT32_IN_MEM:
21842   case X86::FP64_TO_INT64_IN_MEM:
21843   case X86::FP80_TO_INT16_IN_MEM:
21844   case X86::FP80_TO_INT32_IN_MEM:
21845   case X86::FP80_TO_INT64_IN_MEM: {
21846     MachineFunction *F = BB->getParent();
21847     const TargetInstrInfo *TII = Subtarget->getInstrInfo();
21848     DebugLoc DL = MI->getDebugLoc();
21849
21850     // Change the floating point control register to use "round towards zero"
21851     // mode when truncating to an integer value.
21852     int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false);
21853     addFrameReference(BuildMI(*BB, MI, DL,
21854                               TII->get(X86::FNSTCW16m)), CWFrameIdx);
21855
21856     // Load the old value of the high byte of the control word...
21857     unsigned OldCW =
21858       F->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
21859     addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
21860                       CWFrameIdx);
21861
21862     // Set the high part to be round to zero...
21863     addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
21864       .addImm(0xC7F);
21865
21866     // Reload the modified control word now...
21867     addFrameReference(BuildMI(*BB, MI, DL,
21868                               TII->get(X86::FLDCW16m)), CWFrameIdx);
21869
21870     // Restore the memory image of control word to original value
21871     addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
21872       .addReg(OldCW);
21873
21874     // Get the X86 opcode to use.
21875     unsigned Opc;
21876     switch (MI->getOpcode()) {
21877     default: llvm_unreachable("illegal opcode!");
21878     case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
21879     case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
21880     case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
21881     case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
21882     case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
21883     case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
21884     case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
21885     case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
21886     case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
21887     }
21888
21889     X86AddressMode AM;
21890     MachineOperand &Op = MI->getOperand(0);
21891     if (Op.isReg()) {
21892       AM.BaseType = X86AddressMode::RegBase;
21893       AM.Base.Reg = Op.getReg();
21894     } else {
21895       AM.BaseType = X86AddressMode::FrameIndexBase;
21896       AM.Base.FrameIndex = Op.getIndex();
21897     }
21898     Op = MI->getOperand(1);
21899     if (Op.isImm())
21900       AM.Scale = Op.getImm();
21901     Op = MI->getOperand(2);
21902     if (Op.isImm())
21903       AM.IndexReg = Op.getImm();
21904     Op = MI->getOperand(3);
21905     if (Op.isGlobal()) {
21906       AM.GV = Op.getGlobal();
21907     } else {
21908       AM.Disp = Op.getImm();
21909     }
21910     addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
21911                       .addReg(MI->getOperand(X86::AddrNumOperands).getReg());
21912
21913     // Reload the original control word now.
21914     addFrameReference(BuildMI(*BB, MI, DL,
21915                               TII->get(X86::FLDCW16m)), CWFrameIdx);
21916
21917     MI->eraseFromParent();   // The pseudo instruction is gone now.
21918     return BB;
21919   }
21920     // String/text processing lowering.
21921   case X86::PCMPISTRM128REG:
21922   case X86::VPCMPISTRM128REG:
21923   case X86::PCMPISTRM128MEM:
21924   case X86::VPCMPISTRM128MEM:
21925   case X86::PCMPESTRM128REG:
21926   case X86::VPCMPESTRM128REG:
21927   case X86::PCMPESTRM128MEM:
21928   case X86::VPCMPESTRM128MEM:
21929     assert(Subtarget->hasSSE42() &&
21930            "Target must have SSE4.2 or AVX features enabled");
21931     return EmitPCMPSTRM(MI, BB, Subtarget->getInstrInfo());
21932
21933   // String/text processing lowering.
21934   case X86::PCMPISTRIREG:
21935   case X86::VPCMPISTRIREG:
21936   case X86::PCMPISTRIMEM:
21937   case X86::VPCMPISTRIMEM:
21938   case X86::PCMPESTRIREG:
21939   case X86::VPCMPESTRIREG:
21940   case X86::PCMPESTRIMEM:
21941   case X86::VPCMPESTRIMEM:
21942     assert(Subtarget->hasSSE42() &&
21943            "Target must have SSE4.2 or AVX features enabled");
21944     return EmitPCMPSTRI(MI, BB, Subtarget->getInstrInfo());
21945
21946   // Thread synchronization.
21947   case X86::MONITOR:
21948     return EmitMonitor(MI, BB, Subtarget);
21949
21950   // xbegin
21951   case X86::XBEGIN:
21952     return EmitXBegin(MI, BB, Subtarget->getInstrInfo());
21953
21954   case X86::VASTART_SAVE_XMM_REGS:
21955     return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
21956
21957   case X86::VAARG_64:
21958     return EmitVAARG64WithCustomInserter(MI, BB);
21959
21960   case X86::EH_SjLj_SetJmp32:
21961   case X86::EH_SjLj_SetJmp64:
21962     return emitEHSjLjSetJmp(MI, BB);
21963
21964   case X86::EH_SjLj_LongJmp32:
21965   case X86::EH_SjLj_LongJmp64:
21966     return emitEHSjLjLongJmp(MI, BB);
21967
21968   case TargetOpcode::STATEPOINT:
21969     // As an implementation detail, STATEPOINT shares the STACKMAP format at
21970     // this point in the process.  We diverge later.
21971     return emitPatchPoint(MI, BB);
21972
21973   case TargetOpcode::STACKMAP:
21974   case TargetOpcode::PATCHPOINT:
21975     return emitPatchPoint(MI, BB);
21976
21977   case X86::VFMADDPDr213r:
21978   case X86::VFMADDPSr213r:
21979   case X86::VFMADDSDr213r:
21980   case X86::VFMADDSSr213r:
21981   case X86::VFMSUBPDr213r:
21982   case X86::VFMSUBPSr213r:
21983   case X86::VFMSUBSDr213r:
21984   case X86::VFMSUBSSr213r:
21985   case X86::VFNMADDPDr213r:
21986   case X86::VFNMADDPSr213r:
21987   case X86::VFNMADDSDr213r:
21988   case X86::VFNMADDSSr213r:
21989   case X86::VFNMSUBPDr213r:
21990   case X86::VFNMSUBPSr213r:
21991   case X86::VFNMSUBSDr213r:
21992   case X86::VFNMSUBSSr213r:
21993   case X86::VFMADDSUBPDr213r:
21994   case X86::VFMADDSUBPSr213r:
21995   case X86::VFMSUBADDPDr213r:
21996   case X86::VFMSUBADDPSr213r:
21997   case X86::VFMADDPDr213rY:
21998   case X86::VFMADDPSr213rY:
21999   case X86::VFMSUBPDr213rY:
22000   case X86::VFMSUBPSr213rY:
22001   case X86::VFNMADDPDr213rY:
22002   case X86::VFNMADDPSr213rY:
22003   case X86::VFNMSUBPDr213rY:
22004   case X86::VFNMSUBPSr213rY:
22005   case X86::VFMADDSUBPDr213rY:
22006   case X86::VFMADDSUBPSr213rY:
22007   case X86::VFMSUBADDPDr213rY:
22008   case X86::VFMSUBADDPSr213rY:
22009     return emitFMA3Instr(MI, BB);
22010   }
22011 }
22012
22013 //===----------------------------------------------------------------------===//
22014 //                           X86 Optimization Hooks
22015 //===----------------------------------------------------------------------===//
22016
22017 void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
22018                                                       APInt &KnownZero,
22019                                                       APInt &KnownOne,
22020                                                       const SelectionDAG &DAG,
22021                                                       unsigned Depth) const {
22022   unsigned BitWidth = KnownZero.getBitWidth();
22023   unsigned Opc = Op.getOpcode();
22024   assert((Opc >= ISD::BUILTIN_OP_END ||
22025           Opc == ISD::INTRINSIC_WO_CHAIN ||
22026           Opc == ISD::INTRINSIC_W_CHAIN ||
22027           Opc == ISD::INTRINSIC_VOID) &&
22028          "Should use MaskedValueIsZero if you don't know whether Op"
22029          " is a target node!");
22030
22031   KnownZero = KnownOne = APInt(BitWidth, 0);   // Don't know anything.
22032   switch (Opc) {
22033   default: break;
22034   case X86ISD::ADD:
22035   case X86ISD::SUB:
22036   case X86ISD::ADC:
22037   case X86ISD::SBB:
22038   case X86ISD::SMUL:
22039   case X86ISD::UMUL:
22040   case X86ISD::INC:
22041   case X86ISD::DEC:
22042   case X86ISD::OR:
22043   case X86ISD::XOR:
22044   case X86ISD::AND:
22045     // These nodes' second result is a boolean.
22046     if (Op.getResNo() == 0)
22047       break;
22048     // Fallthrough
22049   case X86ISD::SETCC:
22050     KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
22051     break;
22052   case ISD::INTRINSIC_WO_CHAIN: {
22053     unsigned IntId = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
22054     unsigned NumLoBits = 0;
22055     switch (IntId) {
22056     default: break;
22057     case Intrinsic::x86_sse_movmsk_ps:
22058     case Intrinsic::x86_avx_movmsk_ps_256:
22059     case Intrinsic::x86_sse2_movmsk_pd:
22060     case Intrinsic::x86_avx_movmsk_pd_256:
22061     case Intrinsic::x86_mmx_pmovmskb:
22062     case Intrinsic::x86_sse2_pmovmskb_128:
22063     case Intrinsic::x86_avx2_pmovmskb: {
22064       // High bits of movmskp{s|d}, pmovmskb are known zero.
22065       switch (IntId) {
22066         default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
22067         case Intrinsic::x86_sse_movmsk_ps:      NumLoBits = 4; break;
22068         case Intrinsic::x86_avx_movmsk_ps_256:  NumLoBits = 8; break;
22069         case Intrinsic::x86_sse2_movmsk_pd:     NumLoBits = 2; break;
22070         case Intrinsic::x86_avx_movmsk_pd_256:  NumLoBits = 4; break;
22071         case Intrinsic::x86_mmx_pmovmskb:       NumLoBits = 8; break;
22072         case Intrinsic::x86_sse2_pmovmskb_128:  NumLoBits = 16; break;
22073         case Intrinsic::x86_avx2_pmovmskb:      NumLoBits = 32; break;
22074       }
22075       KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits);
22076       break;
22077     }
22078     }
22079     break;
22080   }
22081   }
22082 }
22083
22084 unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
22085   SDValue Op,
22086   const SelectionDAG &,
22087   unsigned Depth) const {
22088   // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
22089   if (Op.getOpcode() == X86ISD::SETCC_CARRY)
22090     return Op.getValueType().getScalarType().getSizeInBits();
22091
22092   // Fallback case.
22093   return 1;
22094 }
22095
22096 /// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the
22097 /// node is a GlobalAddress + offset.
22098 bool X86TargetLowering::isGAPlusOffset(SDNode *N,
22099                                        const GlobalValue* &GA,
22100                                        int64_t &Offset) const {
22101   if (N->getOpcode() == X86ISD::Wrapper) {
22102     if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
22103       GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
22104       Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
22105       return true;
22106     }
22107   }
22108   return TargetLowering::isGAPlusOffset(N, GA, Offset);
22109 }
22110
22111 /// isShuffleHigh128VectorInsertLow - Checks whether the shuffle node is the
22112 /// same as extracting the high 128-bit part of 256-bit vector and then
22113 /// inserting the result into the low part of a new 256-bit vector
22114 static bool isShuffleHigh128VectorInsertLow(ShuffleVectorSDNode *SVOp) {
22115   EVT VT = SVOp->getValueType(0);
22116   unsigned NumElems = VT.getVectorNumElements();
22117
22118   // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
22119   for (unsigned i = 0, j = NumElems/2; i != NumElems/2; ++i, ++j)
22120     if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
22121         SVOp->getMaskElt(j) >= 0)
22122       return false;
22123
22124   return true;
22125 }
22126
22127 /// isShuffleLow128VectorInsertHigh - Checks whether the shuffle node is the
22128 /// same as extracting the low 128-bit part of 256-bit vector and then
22129 /// inserting the result into the high part of a new 256-bit vector
22130 static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) {
22131   EVT VT = SVOp->getValueType(0);
22132   unsigned NumElems = VT.getVectorNumElements();
22133
22134   // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
22135   for (unsigned i = NumElems/2, j = 0; i != NumElems; ++i, ++j)
22136     if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
22137         SVOp->getMaskElt(j) >= 0)
22138       return false;
22139
22140   return true;
22141 }
22142
22143 /// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors.
22144 static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
22145                                         TargetLowering::DAGCombinerInfo &DCI,
22146                                         const X86Subtarget* Subtarget) {
22147   SDLoc dl(N);
22148   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
22149   SDValue V1 = SVOp->getOperand(0);
22150   SDValue V2 = SVOp->getOperand(1);
22151   EVT VT = SVOp->getValueType(0);
22152   unsigned NumElems = VT.getVectorNumElements();
22153
22154   if (V1.getOpcode() == ISD::CONCAT_VECTORS &&
22155       V2.getOpcode() == ISD::CONCAT_VECTORS) {
22156     //
22157     //                   0,0,0,...
22158     //                      |
22159     //    V      UNDEF    BUILD_VECTOR    UNDEF
22160     //     \      /           \           /
22161     //  CONCAT_VECTOR         CONCAT_VECTOR
22162     //         \                  /
22163     //          \                /
22164     //          RESULT: V + zero extended
22165     //
22166     if (V2.getOperand(0).getOpcode() != ISD::BUILD_VECTOR ||
22167         V2.getOperand(1).getOpcode() != ISD::UNDEF ||
22168         V1.getOperand(1).getOpcode() != ISD::UNDEF)
22169       return SDValue();
22170
22171     if (!ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode()))
22172       return SDValue();
22173
22174     // To match the shuffle mask, the first half of the mask should
22175     // be exactly the first vector, and all the rest a splat with the
22176     // first element of the second one.
22177     for (unsigned i = 0; i != NumElems/2; ++i)
22178       if (!isUndefOrEqual(SVOp->getMaskElt(i), i) ||
22179           !isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems))
22180         return SDValue();
22181
22182     // If V1 is coming from a vector load then just fold to a VZEXT_LOAD.
22183     if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(V1.getOperand(0))) {
22184       if (Ld->hasNUsesOfValue(1, 0)) {
22185         SDVTList Tys = DAG.getVTList(MVT::v4i64, MVT::Other);
22186         SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() };
22187         SDValue ResNode =
22188           DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
22189                                   Ld->getMemoryVT(),
22190                                   Ld->getPointerInfo(),
22191                                   Ld->getAlignment(),
22192                                   false/*isVolatile*/, true/*ReadMem*/,
22193                                   false/*WriteMem*/);
22194
22195         // Make sure the newly-created LOAD is in the same position as Ld in
22196         // terms of dependency. We create a TokenFactor for Ld and ResNode,
22197         // and update uses of Ld's output chain to use the TokenFactor.
22198         if (Ld->hasAnyUseOfValue(1)) {
22199           SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
22200                              SDValue(Ld, 1), SDValue(ResNode.getNode(), 1));
22201           DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
22202           DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(Ld, 1),
22203                                  SDValue(ResNode.getNode(), 1));
22204         }
22205
22206         return DAG.getNode(ISD::BITCAST, dl, VT, ResNode);
22207       }
22208     }
22209
22210     // Emit a zeroed vector and insert the desired subvector on its
22211     // first half.
22212     SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
22213     SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0), 0, DAG, dl);
22214     return DCI.CombineTo(N, InsV);
22215   }
22216
22217   //===--------------------------------------------------------------------===//
22218   // Combine some shuffles into subvector extracts and inserts:
22219   //
22220
22221   // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
22222   if (isShuffleHigh128VectorInsertLow(SVOp)) {
22223     SDValue V = Extract128BitVector(V1, NumElems/2, DAG, dl);
22224     SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, 0, DAG, dl);
22225     return DCI.CombineTo(N, InsV);
22226   }
22227
22228   // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
22229   if (isShuffleLow128VectorInsertHigh(SVOp)) {
22230     SDValue V = Extract128BitVector(V1, 0, DAG, dl);
22231     SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, NumElems/2, DAG, dl);
22232     return DCI.CombineTo(N, InsV);
22233   }
22234
22235   return SDValue();
22236 }
22237
22238 /// \brief Combine an arbitrary chain of shuffles into a single instruction if
22239 /// possible.
22240 ///
/// This is the leaf of the recursive combine below. When we have found some
22242 /// chain of single-use x86 shuffle instructions and accumulated the combined
22243 /// shuffle mask represented by them, this will try to pattern match that mask
22244 /// into either a single instruction if there is a special purpose instruction
22245 /// for this operation, or into a PSHUFB instruction which is a fully general
22246 /// instruction but should only be used to replace chains over a certain depth.
22247 static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
22248                                    int Depth, bool HasPSHUFB, SelectionDAG &DAG,
22249                                    TargetLowering::DAGCombinerInfo &DCI,
22250                                    const X86Subtarget *Subtarget) {
22251   assert(!Mask.empty() && "Cannot combine an empty shuffle mask!");
22252
22253   // Find the operand that enters the chain. Note that multiple uses are OK
22254   // here, we're not going to remove the operand we find.
22255   SDValue Input = Op.getOperand(0);
22256   while (Input.getOpcode() == ISD::BITCAST)
22257     Input = Input.getOperand(0);
22258
22259   MVT VT = Input.getSimpleValueType();
22260   MVT RootVT = Root.getSimpleValueType();
22261   SDLoc DL(Root);
22262
22263   // Just remove no-op shuffle masks.
22264   if (Mask.size() == 1) {
22265     DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Input),
22266                   /*AddTo*/ true);
22267     return true;
22268   }
22269
22270   // Use the float domain if the operand type is a floating point type.
22271   bool FloatDomain = VT.isFloatingPoint();
22272
22273   // For floating point shuffles, we don't have free copies in the shuffle
22274   // instructions or the ability to load as part of the instruction, so
22275   // canonicalize their shuffles to UNPCK or MOV variants.
22276   //
22277   // Note that even with AVX we prefer the PSHUFD form of shuffle for integer
22278   // vectors because it can have a load folded into it that UNPCK cannot. This
22279   // doesn't preclude something switching to the shorter encoding post-RA.
22280   if (FloatDomain) {
22281     if (Mask.equals(0, 0) || Mask.equals(1, 1)) {
22282       bool Lo = Mask.equals(0, 0);
22283       unsigned Shuffle;
22284       MVT ShuffleVT;
22285       // Check if we have SSE3 which will let us use MOVDDUP. That instruction
22286       // is no slower than UNPCKLPD but has the option to fold the input operand
22287       // into even an unaligned memory load.
22288       if (Lo && Subtarget->hasSSE3()) {
22289         Shuffle = X86ISD::MOVDDUP;
22290         ShuffleVT = MVT::v2f64;
22291       } else {
22292         // We have MOVLHPS and MOVHLPS throughout SSE and they encode smaller
22293         // than the UNPCK variants.
22294         Shuffle = Lo ? X86ISD::MOVLHPS : X86ISD::MOVHLPS;
22295         ShuffleVT = MVT::v4f32;
22296       }
22297       if (Depth == 1 && Root->getOpcode() == Shuffle)
22298         return false; // Nothing to do!
22299       Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
22300       DCI.AddToWorklist(Op.getNode());
22301       if (Shuffle == X86ISD::MOVDDUP)
22302         Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op);
22303       else
22304         Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
22305       DCI.AddToWorklist(Op.getNode());
22306       DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
22307                     /*AddTo*/ true);
22308       return true;
22309     }
22310     if (Subtarget->hasSSE3() &&
22311         (Mask.equals(0, 0, 2, 2) || Mask.equals(1, 1, 3, 3))) {
22312       bool Lo = Mask.equals(0, 0, 2, 2);
22313       unsigned Shuffle = Lo ? X86ISD::MOVSLDUP : X86ISD::MOVSHDUP;
22314       MVT ShuffleVT = MVT::v4f32;
22315       if (Depth == 1 && Root->getOpcode() == Shuffle)
22316         return false; // Nothing to do!
22317       Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
22318       DCI.AddToWorklist(Op.getNode());
22319       Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op);
22320       DCI.AddToWorklist(Op.getNode());
22321       DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
22322                     /*AddTo*/ true);
22323       return true;
22324     }
22325     if (Mask.equals(0, 0, 1, 1) || Mask.equals(2, 2, 3, 3)) {
22326       bool Lo = Mask.equals(0, 0, 1, 1);
22327       unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
22328       MVT ShuffleVT = MVT::v4f32;
22329       if (Depth == 1 && Root->getOpcode() == Shuffle)
22330         return false; // Nothing to do!
22331       Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
22332       DCI.AddToWorklist(Op.getNode());
22333       Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
22334       DCI.AddToWorklist(Op.getNode());
22335       DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
22336                     /*AddTo*/ true);
22337       return true;
22338     }
22339   }
22340
22341   // We always canonicalize the 8 x i16 and 16 x i8 shuffles into their UNPCK
22342   // variants as none of these have single-instruction variants that are
22343   // superior to the UNPCK formulation.
22344   if (!FloatDomain &&
22345       (Mask.equals(0, 0, 1, 1, 2, 2, 3, 3) ||
22346        Mask.equals(4, 4, 5, 5, 6, 6, 7, 7) ||
22347        Mask.equals(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7) ||
22348        Mask.equals(8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15,
22349                    15))) {
22350     bool Lo = Mask[0] == 0;
22351     unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
22352     if (Depth == 1 && Root->getOpcode() == Shuffle)
22353       return false; // Nothing to do!
22354     MVT ShuffleVT;
22355     switch (Mask.size()) {
22356     case 8:
22357       ShuffleVT = MVT::v8i16;
22358       break;
22359     case 16:
22360       ShuffleVT = MVT::v16i8;
22361       break;
22362     default:
22363       llvm_unreachable("Impossible mask size!");
22364     };
22365     Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
22366     DCI.AddToWorklist(Op.getNode());
22367     Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
22368     DCI.AddToWorklist(Op.getNode());
22369     DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
22370                   /*AddTo*/ true);
22371     return true;
22372   }
22373
22374   // Don't try to re-form single instruction chains under any circumstances now
22375   // that we've done encoding canonicalization for them.
22376   if (Depth < 2)
22377     return false;
22378
22379   // If we have 3 or more shuffle instructions or a chain involving PSHUFB, we
22380   // can replace them with a single PSHUFB instruction profitably. Intel's
22381   // manuals suggest only using PSHUFB if doing so replacing 5 instructions, but
22382   // in practice PSHUFB tends to be *very* fast so we're more aggressive.
22383   if ((Depth >= 3 || HasPSHUFB) && Subtarget->hasSSSE3()) {
22384     SmallVector<SDValue, 16> PSHUFBMask;
22385     assert(Mask.size() <= 16 && "Can't shuffle elements smaller than bytes!");
22386     int Ratio = 16 / Mask.size();
22387     for (unsigned i = 0; i < 16; ++i) {
22388       if (Mask[i / Ratio] == SM_SentinelUndef) {
22389         PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
22390         continue;
22391       }
22392       int M = Mask[i / Ratio] != SM_SentinelZero
22393                   ? Ratio * Mask[i / Ratio] + i % Ratio
22394                   : 255;
22395       PSHUFBMask.push_back(DAG.getConstant(M, MVT::i8));
22396     }
22397     Op = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Input);
22398     DCI.AddToWorklist(Op.getNode());
22399     SDValue PSHUFBMaskOp =
22400         DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, PSHUFBMask);
22401     DCI.AddToWorklist(PSHUFBMaskOp.getNode());
22402     Op = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, Op, PSHUFBMaskOp);
22403     DCI.AddToWorklist(Op.getNode());
22404     DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
22405                   /*AddTo*/ true);
22406     return true;
22407   }
22408
22409   // Failed to find any combines.
22410   return false;
22411 }
22412
22413 /// \brief Fully generic combining of x86 shuffle instructions.
22414 ///
22415 /// This should be the last combine run over the x86 shuffle instructions. Once
22416 /// they have been fully optimized, this will recursively consider all chains
22417 /// of single-use shuffle instructions, build a generic model of the cumulative
22418 /// shuffle operation, and check for simpler instructions which implement this
22419 /// operation. We use this primarily for two purposes:
22420 ///
22421 /// 1) Collapse generic shuffles to specialized single instructions when
22422 ///    equivalent. In most cases, this is just an encoding size win, but
22423 ///    sometimes we will collapse multiple generic shuffles into a single
22424 ///    special-purpose shuffle.
22425 /// 2) Look for sequences of shuffle instructions with 3 or more total
22426 ///    instructions, and replace them with the slightly more expensive SSSE3
22427 ///    PSHUFB instruction if available. We do this as the last combining step
22428 ///    to ensure we avoid using PSHUFB if we can implement the shuffle with
22429 ///    a suitable short sequence of other instructions. The PHUFB will either
22430 ///    use a register or have to read from memory and so is slightly (but only
22431 ///    slightly) more expensive than the other shuffle instructions.
22432 ///
22433 /// Because this is inherently a quadratic operation (for each shuffle in
22434 /// a chain, we recurse up the chain), the depth is limited to 8 instructions.
22435 /// This should never be an issue in practice as the shuffle lowering doesn't
22436 /// produce sequences of more than 8 instructions.
22437 ///
22438 /// FIXME: We will currently miss some cases where the redundant shuffling
22439 /// would simplify under the threshold for PSHUFB formation because of
22440 /// combine-ordering. To fix this, we should do the redundant instruction
22441 /// combining in this recursive walk.
22442 static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,
22443                                           ArrayRef<int> RootMask,
22444                                           int Depth, bool HasPSHUFB,
22445                                           SelectionDAG &DAG,
22446                                           TargetLowering::DAGCombinerInfo &DCI,
22447                                           const X86Subtarget *Subtarget) {
22448   // Bound the depth of our recursive combine because this is ultimately
22449   // quadratic in nature.
22450   if (Depth > 8)
22451     return false;
22452
22453   // Directly rip through bitcasts to find the underlying operand.
22454   while (Op.getOpcode() == ISD::BITCAST && Op.getOperand(0).hasOneUse())
22455     Op = Op.getOperand(0);
22456
22457   MVT VT = Op.getSimpleValueType();
22458   if (!VT.isVector())
22459     return false; // Bail if we hit a non-vector.
22460   // FIXME: This routine should be taught about 256-bit shuffles, or a 256-bit
22461   // version should be added.
22462   if (VT.getSizeInBits() != 128)
22463     return false;
22464
22465   assert(Root.getSimpleValueType().isVector() &&
22466          "Shuffles operate on vector types!");
22467   assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() &&
22468          "Can only combine shuffles of the same vector register size.");
22469
22470   if (!isTargetShuffle(Op.getOpcode()))
22471     return false;
22472   SmallVector<int, 16> OpMask;
22473   bool IsUnary;
22474   bool HaveMask = getTargetShuffleMask(Op.getNode(), VT, OpMask, IsUnary);
22475   // We only can combine unary shuffles which we can decode the mask for.
22476   if (!HaveMask || !IsUnary)
22477     return false;
22478
22479   assert(VT.getVectorNumElements() == OpMask.size() &&
22480          "Different mask size from vector size!");
22481   assert(((RootMask.size() > OpMask.size() &&
22482            RootMask.size() % OpMask.size() == 0) ||
22483           (OpMask.size() > RootMask.size() &&
22484            OpMask.size() % RootMask.size() == 0) ||
22485           OpMask.size() == RootMask.size()) &&
22486          "The smaller number of elements must divide the larger.");
22487   int RootRatio = std::max<int>(1, OpMask.size() / RootMask.size());
22488   int OpRatio = std::max<int>(1, RootMask.size() / OpMask.size());
22489   assert(((RootRatio == 1 && OpRatio == 1) ||
22490           (RootRatio == 1) != (OpRatio == 1)) &&
22491          "Must not have a ratio for both incoming and op masks!");
22492
22493   SmallVector<int, 16> Mask;
22494   Mask.reserve(std::max(OpMask.size(), RootMask.size()));
22495
22496   // Merge this shuffle operation's mask into our accumulated mask. Note that
22497   // this shuffle's mask will be the first applied to the input, followed by the
22498   // root mask to get us all the way to the root value arrangement. The reason
22499   // for this order is that we are recursing up the operation chain.
22500   for (int i = 0, e = std::max(OpMask.size(), RootMask.size()); i < e; ++i) {
22501     int RootIdx = i / RootRatio;
22502     if (RootMask[RootIdx] < 0) {
22503       // This is a zero or undef lane, we're done.
22504       Mask.push_back(RootMask[RootIdx]);
22505       continue;
22506     }
22507
22508     int RootMaskedIdx = RootMask[RootIdx] * RootRatio + i % RootRatio;
22509     int OpIdx = RootMaskedIdx / OpRatio;
22510     if (OpMask[OpIdx] < 0) {
22511       // The incoming lanes are zero or undef, it doesn't matter which ones we
22512       // are using.
22513       Mask.push_back(OpMask[OpIdx]);
22514       continue;
22515     }
22516
22517     // Ok, we have non-zero lanes, map them through.
22518     Mask.push_back(OpMask[OpIdx] * OpRatio +
22519                    RootMaskedIdx % OpRatio);
22520   }
22521
22522   // See if we can recurse into the operand to combine more things.
22523   switch (Op.getOpcode()) {
22524     case X86ISD::PSHUFB:
22525       HasPSHUFB = true;
22526     case X86ISD::PSHUFD:
22527     case X86ISD::PSHUFHW:
22528     case X86ISD::PSHUFLW:
22529       if (Op.getOperand(0).hasOneUse() &&
22530           combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1,
22531                                         HasPSHUFB, DAG, DCI, Subtarget))
22532         return true;
22533       break;
22534
22535     case X86ISD::UNPCKL:
22536     case X86ISD::UNPCKH:
22537       assert(Op.getOperand(0) == Op.getOperand(1) && "We only combine unary shuffles!");
22538       // We can't check for single use, we have to check that this shuffle is the only user.
22539       if (Op->isOnlyUserOf(Op.getOperand(0).getNode()) &&
22540           combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1,
22541                                         HasPSHUFB, DAG, DCI, Subtarget))
22542           return true;
22543       break;
22544   }
22545
22546   // Minor canonicalization of the accumulated shuffle mask to make it easier
22547   // to match below. All this does is detect masks with squential pairs of
22548   // elements, and shrink them to the half-width mask. It does this in a loop
22549   // so it will reduce the size of the mask to the minimal width mask which
22550   // performs an equivalent shuffle.
22551   SmallVector<int, 16> WidenedMask;
22552   while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
22553     Mask = std::move(WidenedMask);
22554     WidenedMask.clear();
22555   }
22556
22557   return combineX86ShuffleChain(Op, Root, Mask, Depth, HasPSHUFB, DAG, DCI,
22558                                 Subtarget);
22559 }
22560
22561 /// \brief Get the PSHUF-style mask from PSHUF node.
22562 ///
22563 /// This is a very minor wrapper around getTargetShuffleMask to easy forming v4
22564 /// PSHUF-style masks that can be reused with such instructions.
22565 static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
22566   SmallVector<int, 4> Mask;
22567   bool IsUnary;
22568   bool HaveMask = getTargetShuffleMask(N.getNode(), N.getSimpleValueType(), Mask, IsUnary);
22569   (void)HaveMask;
22570   assert(HaveMask);
22571
22572   switch (N.getOpcode()) {
22573   case X86ISD::PSHUFD:
22574     return Mask;
22575   case X86ISD::PSHUFLW:
22576     Mask.resize(4);
22577     return Mask;
22578   case X86ISD::PSHUFHW:
22579     Mask.erase(Mask.begin(), Mask.begin() + 4);
22580     for (int &M : Mask)
22581       M -= 4;
22582     return Mask;
22583   default:
22584     llvm_unreachable("No valid shuffle instruction found!");
22585   }
22586 }
22587
/// \brief Search for a combinable shuffle across a chain ending in pshufd.
///
/// We walk up the chain and look for a combinable shuffle, skipping over
/// shuffles that we could hoist this shuffle's transformation past without
/// altering anything.
///
/// \param N    The PSHUFD node at the bottom of the chain (asserted).
/// \param Mask The mask of \p N; it is indexed as four elements here. When a
///             combinable shuffle is found it is overwritten in place with the
///             merged mask.
/// \param DAG  Used to create the replacement nodes.
/// \param DCI  Not referenced by this routine.
///
/// \returns The value of a freshly built chain meant to replace \p N, or an
/// empty SDValue when nothing was combined.
static SDValue
combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
                             SelectionDAG &DAG,
                             TargetLowering::DAGCombinerInfo &DCI) {
  assert(N.getOpcode() == X86ISD::PSHUFD &&
         "Called with something other than an x86 128-bit half shuffle!");
  SDLoc DL(N);

  // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
  // of the shuffles in the chain so that we can form a fresh chain to replace
  // this one.
  SmallVector<SDValue, 8> Chain;
  SDValue V = N.getOperand(0);
  // Control flow note: a `continue` inside the switch advances V to its first
  // operand and keeps walking; a `break` inside the switch reaches the
  // unconditional `break` after it and stops the walk with V as the candidate.
  for (; V.hasOneUse(); V = V.getOperand(0)) {
    switch (V.getOpcode()) {
    default:
      return SDValue(); // Nothing combined!

    case ISD::BITCAST:
      // Skip bitcasts as we always know the type for the target specific
      // instructions.
      continue;

    case X86ISD::PSHUFD:
      // Found another dword shuffle.
      break;

    case X86ISD::PSHUFLW:
      // Check that the low words (being shuffled) are the identity in the
      // dword shuffle, and the high words are self-contained.
      if (Mask[0] != 0 || Mask[1] != 1 ||
          !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
        return SDValue();

      // Remember the skipped shuffle so it can be re-created further down.
      Chain.push_back(V);
      continue;

    case X86ISD::PSHUFHW:
      // Check that the high words (being shuffled) are the identity in the
      // dword shuffle, and the low words are self-contained.
      if (Mask[2] != 2 || Mask[3] != 3 ||
          !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
        return SDValue();

      // Remember the skipped shuffle so it can be re-created further down.
      Chain.push_back(V);
      continue;

    case X86ISD::UNPCKL:
    case X86ISD::UNPCKH:
      // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
      // shuffle into a preceding word shuffle.
      if (V.getValueType() != MVT::v16i8 && V.getValueType() != MVT::v8i16)
        return SDValue();

      // Search for a half-shuffle which we can combine with.
      // UNPCKL pairs with PSHUFLW and UNPCKH with PSHUFHW.
      unsigned CombineOp =
          V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
      // Only unpacks of a value with itself qualify, and the unpack has to be
      // the sole user of that value.
      if (V.getOperand(0) != V.getOperand(1) ||
          !V->isOnlyUserOf(V.getOperand(0).getNode()))
        return SDValue();
      Chain.push_back(V);
      V = V.getOperand(0);
      // Inner walk above the unpack. Here `continue` jumps to the
      // `V.hasOneUse()` test of the do-while, and a `break` inside the switch
      // reaches the `break` after it, leaving the loop with V at the matching
      // half shuffle.
      do {
        switch (V.getOpcode()) {
        default:
          return SDValue(); // Nothing to combine.

        case X86ISD::PSHUFLW:
        case X86ISD::PSHUFHW:
          if (V.getOpcode() == CombineOp)
            break;

          // The other half shuffle is kept in the chain and stepped over.
          Chain.push_back(V);

          // Fallthrough!
        case ISD::BITCAST:
          V = V.getOperand(0);
          continue;
        }
        break;
      } while (V.hasOneUse());
      break;
    }
    // Break out of the loop if we break out of the switch.
    break;
  }

  if (!V.hasOneUse())
    // We fell out of the loop without finding a viable combining instruction.
    return SDValue();

  // Merge this node's mask and our incoming mask.
  // Each element of the incoming mask is replaced by the entry of V's decoded
  // mask that it selects, and V is re-created with the resulting immediate.
  SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
  for (int &M : Mask)
    M = VMask[M];
  V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
                  getV4X86ShuffleImm8ForMask(Mask, DAG));

  // Rebuild the chain around this new shuffle.
  // Entries were pushed while walking away from N, so popping from the back
  // re-creates the skipped nodes starting next to the new shuffle and ending
  // next to N.
  while (!Chain.empty()) {
    SDValue W = Chain.pop_back_val();

    // Insert a bitcast when the rebuilt value's type differs from the type
    // the original node consumed.
    if (V.getValueType() != W.getOperand(0).getValueType())
      V = DAG.getNode(ISD::BITCAST, DL, W.getOperand(0).getValueType(), V);

    switch (W.getOpcode()) {
    default:
      llvm_unreachable("Only PSHUF and UNPCK instructions get here!");

    case X86ISD::UNPCKL:
    case X86ISD::UNPCKH:
      // Unpacks in the chain had identical operands; keep them that way.
      V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
      break;

    // NOTE(review): the walk above never pushes a PSHUFD onto Chain; the case
    // is handled the same way as the half shuffles regardless.
    case X86ISD::PSHUFD:
    case X86ISD::PSHUFLW:
    case X86ISD::PSHUFHW:
      // Reuse the original immediate operand of the skipped shuffle.
      V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
      break;
    }
  }
  // Make the result type match the node being replaced.
  if (V.getValueType() != N.getValueType())
    V = DAG.getNode(ISD::BITCAST, DL, N.getValueType(), V);

  // Return the new chain to replace N.
  return V;
}
22720
22721 /// \brief Search for a combinable shuffle across a chain ending in pshuflw or pshufhw.
22722 ///
22723 /// We walk up the chain, skipping shuffles of the other half and looking
22724 /// through shuffles which switch halves trying to find a shuffle of the same
22725 /// pair of dwords.
22726 static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask,
22727                                         SelectionDAG &DAG,
22728                                         TargetLowering::DAGCombinerInfo &DCI) {
22729   assert(
22730       (N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) &&
22731       "Called with something other than an x86 128-bit half shuffle!");
22732   SDLoc DL(N);
22733   unsigned CombineOpcode = N.getOpcode();
22734
22735   // Walk up a single-use chain looking for a combinable shuffle.
22736   SDValue V = N.getOperand(0);
22737   for (; V.hasOneUse(); V = V.getOperand(0)) {
22738     switch (V.getOpcode()) {
22739     default:
22740       return false; // Nothing combined!
22741
22742     case ISD::BITCAST:
22743       // Skip bitcasts as we always know the type for the target specific
22744       // instructions.
22745       continue;
22746
22747     case X86ISD::PSHUFLW:
22748     case X86ISD::PSHUFHW:
22749       if (V.getOpcode() == CombineOpcode)
22750         break;
22751
22752       // Other-half shuffles are no-ops.
22753       continue;
22754     }
22755     // Break out of the loop if we break out of the switch.
22756     break;
22757   }
22758
22759   if (!V.hasOneUse())
22760     // We fell out of the loop without finding a viable combining instruction.
22761     return false;
22762
22763   // Combine away the bottom node as its shuffle will be accumulated into
22764   // a preceding shuffle.
22765   DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
22766
22767   // Record the old value.
22768   SDValue Old = V;
22769
22770   // Merge this node's mask and our incoming mask (adjusted to account for all
22771   // the pshufd instructions encountered).
22772   SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
22773   for (int &M : Mask)
22774     M = VMask[M];
22775   V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0),
22776                   getV4X86ShuffleImm8ForMask(Mask, DAG));
22777
22778   // Check that the shuffles didn't cancel each other out. If not, we need to
22779   // combine to the new one.
22780   if (Old != V)
22781     // Replace the combinable shuffle with the combined one, updating all users
22782     // so that we re-evaluate the chain here.
22783     DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true);
22784
22785   return true;
22786 }
22787
22788 /// \brief Try to combine x86 target specific shuffles.
22789 static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,
22790                                            TargetLowering::DAGCombinerInfo &DCI,
22791                                            const X86Subtarget *Subtarget) {
22792   SDLoc DL(N);
22793   MVT VT = N.getSimpleValueType();
22794   SmallVector<int, 4> Mask;
22795
22796   switch (N.getOpcode()) {
22797   case X86ISD::PSHUFD:
22798   case X86ISD::PSHUFLW:
22799   case X86ISD::PSHUFHW:
22800     Mask = getPSHUFShuffleMask(N);
22801     assert(Mask.size() == 4);
22802     break;
22803   default:
22804     return SDValue();
22805   }
22806
22807   // Nuke no-op shuffles that show up after combining.
22808   if (isNoopShuffleMask(Mask))
22809     return DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
22810
22811   // Look for simplifications involving one or two shuffle instructions.
22812   SDValue V = N.getOperand(0);
22813   switch (N.getOpcode()) {
22814   default:
22815     break;
22816   case X86ISD::PSHUFLW:
22817   case X86ISD::PSHUFHW:
22818     assert(VT == MVT::v8i16);
22819     (void)VT;
22820
22821     if (combineRedundantHalfShuffle(N, Mask, DAG, DCI))
22822       return SDValue(); // We combined away this shuffle, so we're done.
22823
22824     // See if this reduces to a PSHUFD which is no more expensive and can
22825     // combine with more operations. Note that it has to at least flip the
22826     // dwords as otherwise it would have been removed as a no-op.
22827     if (Mask[0] == 2 && Mask[1] == 3 && Mask[2] == 0 && Mask[3] == 1) {
22828       int DMask[] = {0, 1, 2, 3};
22829       int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
22830       DMask[DOffset + 0] = DOffset + 1;
22831       DMask[DOffset + 1] = DOffset + 0;
22832       V = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V);
22833       DCI.AddToWorklist(V.getNode());
22834       V = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V,
22835                       getV4X86ShuffleImm8ForMask(DMask, DAG));
22836       DCI.AddToWorklist(V.getNode());
22837       return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V);
22838     }
22839
22840     // Look for shuffle patterns which can be implemented as a single unpack.
22841     // FIXME: This doesn't handle the location of the PSHUFD generically, and
22842     // only works when we have a PSHUFD followed by two half-shuffles.
22843     if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
22844         (V.getOpcode() == X86ISD::PSHUFLW ||
22845          V.getOpcode() == X86ISD::PSHUFHW) &&
22846         V.getOpcode() != N.getOpcode() &&
22847         V.hasOneUse()) {
22848       SDValue D = V.getOperand(0);
22849       while (D.getOpcode() == ISD::BITCAST && D.hasOneUse())
22850         D = D.getOperand(0);
22851       if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {
22852         SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
22853         SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
22854         int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
22855         int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
22856         int WordMask[8];
22857         for (int i = 0; i < 4; ++i) {
22858           WordMask[i + NOffset] = Mask[i] + NOffset;
22859           WordMask[i + VOffset] = VMask[i] + VOffset;
22860         }
22861         // Map the word mask through the DWord mask.
22862         int MappedMask[8];
22863         for (int i = 0; i < 8; ++i)
22864           MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
22865         const int UnpackLoMask[] = {0, 0, 1, 1, 2, 2, 3, 3};
22866         const int UnpackHiMask[] = {4, 4, 5, 5, 6, 6, 7, 7};
22867         if (std::equal(std::begin(MappedMask), std::end(MappedMask),
22868                        std::begin(UnpackLoMask)) ||
22869             std::equal(std::begin(MappedMask), std::end(MappedMask),
22870                        std::begin(UnpackHiMask))) {
22871           // We can replace all three shuffles with an unpack.
22872           V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, D.getOperand(0));
22873           DCI.AddToWorklist(V.getNode());
22874           return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
22875                                                 : X86ISD::UNPCKH,
22876                              DL, MVT::v8i16, V, V);
22877         }
22878       }
22879     }
22880
22881     break;
22882
22883   case X86ISD::PSHUFD:
22884     if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG, DCI))
22885       return NewN;
22886
22887     break;
22888   }
22889
22890   return SDValue();
22891 }
22892
22893 /// \brief Try to combine a shuffle into a target-specific add-sub node.
22894 ///
22895 /// We combine this directly on the abstract vector shuffle nodes so it is
22896 /// easier to generically match. We also insert dummy vector shuffle nodes for
22897 /// the operands which explicitly discard the lanes which are unused by this
22898 /// operation to try to flow through the rest of the combiner the fact that
22899 /// they're unused.
22900 static SDValue combineShuffleToAddSub(SDNode *N, SelectionDAG &DAG) {
22901   SDLoc DL(N);
22902   EVT VT = N->getValueType(0);
22903
22904   // We only handle target-independent shuffles.
22905   // FIXME: It would be easy and harmless to use the target shuffle mask
22906   // extraction tool to support more.
22907   if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
22908     return SDValue();
22909
22910   auto *SVN = cast<ShuffleVectorSDNode>(N);
22911   ArrayRef<int> Mask = SVN->getMask();
22912   SDValue V1 = N->getOperand(0);
22913   SDValue V2 = N->getOperand(1);
22914
22915   // We require the first shuffle operand to be the SUB node, and the second to
22916   // be the ADD node.
22917   // FIXME: We should support the commuted patterns.
22918   if (V1->getOpcode() != ISD::FSUB || V2->getOpcode() != ISD::FADD)
22919     return SDValue();
22920
22921   // If there are other uses of these operations we can't fold them.
22922   if (!V1->hasOneUse() || !V2->hasOneUse())
22923     return SDValue();
22924
22925   // Ensure that both operations have the same operands. Note that we can
22926   // commute the FADD operands.
22927   SDValue LHS = V1->getOperand(0), RHS = V1->getOperand(1);
22928   if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
22929       (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
22930     return SDValue();
22931
22932   // We're looking for blends between FADD and FSUB nodes. We insist on these
22933   // nodes being lined up in a specific expected pattern.
22934   if (!(isShuffleEquivalent(V1, V2, Mask, 0, 3) ||
22935         isShuffleEquivalent(V1, V2, Mask, 0, 5, 2, 7) ||
22936         isShuffleEquivalent(V1, V2, Mask, 0, 9, 2, 11, 4, 13, 6, 15)))
22937     return SDValue();
22938
22939   // Only specific types are legal at this point, assert so we notice if and
22940   // when these change.
22941   assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v8f32 ||
22942           VT == MVT::v4f64) &&
22943          "Unknown vector type encountered!");
22944
22945   return DAG.getNode(X86ISD::ADDSUB, DL, VT, LHS, RHS);
22946 }
22947
22948 /// PerformShuffleCombine - Performs several different shuffle combines.
22949 static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
22950                                      TargetLowering::DAGCombinerInfo &DCI,
22951                                      const X86Subtarget *Subtarget) {
22952   SDLoc dl(N);
22953   SDValue N0 = N->getOperand(0);
22954   SDValue N1 = N->getOperand(1);
22955   EVT VT = N->getValueType(0);
22956
22957   // Don't create instructions with illegal types after legalize types has run.
22958   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
22959   if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType()))
22960     return SDValue();
22961
22962   // If we have legalized the vector types, look for blends of FADD and FSUB
22963   // nodes that we can fuse into an ADDSUB node.
22964   if (TLI.isTypeLegal(VT) && Subtarget->hasSSE3())
22965     if (SDValue AddSub = combineShuffleToAddSub(N, DAG))
22966       return AddSub;
22967
22968   // Combine 256-bit vector shuffles. This is only profitable when in AVX mode
22969   if (Subtarget->hasFp256() && VT.is256BitVector() &&
22970       N->getOpcode() == ISD::VECTOR_SHUFFLE)
22971     return PerformShuffleCombine256(N, DAG, DCI, Subtarget);
22972
22973   // During Type Legalization, when promoting illegal vector types,
22974   // the backend might introduce new shuffle dag nodes and bitcasts.
22975   //
22976   // This code performs the following transformation:
22977   // fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) ->
22978   //       (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>)
22979   //
22980   // We do this only if both the bitcast and the BINOP dag nodes have
22981   // one use. Also, perform this transformation only if the new binary
22982   // operation is legal. This is to avoid introducing dag nodes that
22983   // potentially need to be further expanded (or custom lowered) into a
22984   // less optimal sequence of dag nodes.
22985   if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() &&
22986       N1.getOpcode() == ISD::UNDEF && N0.hasOneUse() &&
22987       N0.getOpcode() == ISD::BITCAST) {
22988     SDValue BC0 = N0.getOperand(0);
22989     EVT SVT = BC0.getValueType();
22990     unsigned Opcode = BC0.getOpcode();
22991     unsigned NumElts = VT.getVectorNumElements();
22992
22993     if (BC0.hasOneUse() && SVT.isVector() &&
22994         SVT.getVectorNumElements() * 2 == NumElts &&
22995         TLI.isOperationLegal(Opcode, VT)) {
22996       bool CanFold = false;
22997       switch (Opcode) {
22998       default : break;
22999       case ISD::ADD :
23000       case ISD::FADD :
23001       case ISD::SUB :
23002       case ISD::FSUB :
23003       case ISD::MUL :
23004       case ISD::FMUL :
23005         CanFold = true;
23006       }
23007
23008       unsigned SVTNumElts = SVT.getVectorNumElements();
23009       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
23010       for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i)
23011         CanFold = SVOp->getMaskElt(i) == (int)(i * 2);
23012       for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i)
23013         CanFold = SVOp->getMaskElt(i) < 0;
23014
23015       if (CanFold) {
23016         SDValue BC00 = DAG.getNode(ISD::BITCAST, dl, VT, BC0.getOperand(0));
23017         SDValue BC01 = DAG.getNode(ISD::BITCAST, dl, VT, BC0.getOperand(1));
23018         SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01);
23019         return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, &SVOp->getMask()[0]);
23020       }
23021     }
23022   }
23023
23024   // Only handle 128 wide vector from here on.
23025   if (!VT.is128BitVector())
23026     return SDValue();
23027
23028   // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
23029   // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
23030   // consecutive, non-overlapping, and in the right order.
23031   SmallVector<SDValue, 16> Elts;
23032   for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
23033     Elts.push_back(getShuffleScalarElt(N, i, DAG, 0));
23034
23035   SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true);
23036   if (LD.getNode())
23037     return LD;
23038
23039   if (isTargetShuffle(N->getOpcode())) {
23040     SDValue Shuffle =
23041         PerformTargetShuffleCombine(SDValue(N, 0), DAG, DCI, Subtarget);
23042     if (Shuffle.getNode())
23043       return Shuffle;
23044
23045     // Try recursively combining arbitrary sequences of x86 shuffle
23046     // instructions into higher-order shuffles. We do this after combining
23047     // specific PSHUF instruction sequences into their minimal form so that we
23048     // can evaluate how many specialized shuffle instructions are involved in
23049     // a particular chain.
23050     SmallVector<int, 1> NonceMask; // Just a placeholder.
23051     NonceMask.push_back(0);
23052     if (combineX86ShufflesRecursively(SDValue(N, 0), SDValue(N, 0), NonceMask,
23053                                       /*Depth*/ 1, /*HasPSHUFB*/ false, DAG,
23054                                       DCI, Subtarget))
23055       return SDValue(); // This routine will use CombineTo to replace N.
23056   }
23057
23058   return SDValue();
23059 }
23060
23061 /// PerformTruncateCombine - Converts truncate operation to
23062 /// a sequence of vector shuffle operations.
23063 /// It is possible when we truncate 256-bit vector to 128-bit vector
23064 static SDValue PerformTruncateCombine(SDNode *N, SelectionDAG &DAG,
23065                                       TargetLowering::DAGCombinerInfo &DCI,
23066                                       const X86Subtarget *Subtarget)  {
23067   return SDValue();
23068 }
23069
23070 /// XFormVExtractWithShuffleIntoLoad - Check if a vector extract from a target
23071 /// specific shuffle of a load can be folded into a single element load.
23072 /// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
23073 /// shuffles have been custom lowered so we need to handle those here.
23074 static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
23075                                          TargetLowering::DAGCombinerInfo &DCI) {
23076   if (DCI.isBeforeLegalizeOps())
23077     return SDValue();
23078
23079   SDValue InVec = N->getOperand(0);
23080   SDValue EltNo = N->getOperand(1);
23081
23082   if (!isa<ConstantSDNode>(EltNo))
23083     return SDValue();
23084
23085   EVT OriginalVT = InVec.getValueType();
23086
23087   if (InVec.getOpcode() == ISD::BITCAST) {
23088     // Don't duplicate a load with other uses.
23089     if (!InVec.hasOneUse())
23090       return SDValue();
23091     EVT BCVT = InVec.getOperand(0).getValueType();
23092     if (BCVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
23093       return SDValue();
23094     InVec = InVec.getOperand(0);
23095   }
23096
23097   EVT CurrentVT = InVec.getValueType();
23098
23099   if (!isTargetShuffle(InVec.getOpcode()))
23100     return SDValue();
23101
23102   // Don't duplicate a load with other uses.
23103   if (!InVec.hasOneUse())
23104     return SDValue();
23105
23106   SmallVector<int, 16> ShuffleMask;
23107   bool UnaryShuffle;
23108   if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(),
23109                             ShuffleMask, UnaryShuffle))
23110     return SDValue();
23111
23112   // Select the input vector, guarding against out of range extract vector.
23113   unsigned NumElems = CurrentVT.getVectorNumElements();
23114   int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
23115   int Idx = (Elt > (int)NumElems) ? -1 : ShuffleMask[Elt];
23116   SDValue LdNode = (Idx < (int)NumElems) ? InVec.getOperand(0)
23117                                          : InVec.getOperand(1);
23118
23119   // If inputs to shuffle are the same for both ops, then allow 2 uses
23120   unsigned AllowedUses = InVec.getNumOperands() > 1 &&
23121                          InVec.getOperand(0) == InVec.getOperand(1) ? 2 : 1;
23122
23123   if (LdNode.getOpcode() == ISD::BITCAST) {
23124     // Don't duplicate a load with other uses.
23125     if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
23126       return SDValue();
23127
23128     AllowedUses = 1; // only allow 1 load use if we have a bitcast
23129     LdNode = LdNode.getOperand(0);
23130   }
23131
23132   if (!ISD::isNormalLoad(LdNode.getNode()))
23133     return SDValue();
23134
23135   LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);
23136
23137   if (!LN0 ||!LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
23138     return SDValue();
23139
23140   EVT EltVT = N->getValueType(0);
23141   // If there's a bitcast before the shuffle, check if the load type and
23142   // alignment is valid.
23143   unsigned Align = LN0->getAlignment();
23144   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23145   unsigned NewAlign = TLI.getDataLayout()->getABITypeAlignment(
23146       EltVT.getTypeForEVT(*DAG.getContext()));
23147
23148   if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT))
23149     return SDValue();
23150
23151   // All checks match so transform back to vector_shuffle so that DAG combiner
23152   // can finish the job
23153   SDLoc dl(N);
23154
23155   // Create shuffle node taking into account the case that its a unary shuffle
23156   SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT)
23157                                    : InVec.getOperand(1);
23158   Shuffle = DAG.getVectorShuffle(CurrentVT, dl,
23159                                  InVec.getOperand(0), Shuffle,
23160                                  &ShuffleMask[0]);
23161   Shuffle = DAG.getNode(ISD::BITCAST, dl, OriginalVT, Shuffle);
23162   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
23163                      EltNo);
23164 }
23165
23166 /// \brief Detect bitcasts between i32 to x86mmx low word. Since MMX types are
23167 /// special and don't usually play with other vector types, it's better to
23168 /// handle them early to be sure we emit efficient code by avoiding
23169 /// store-load conversions.
23170 static SDValue PerformBITCASTCombine(SDNode *N, SelectionDAG &DAG) {
23171   if (N->getValueType(0) != MVT::x86mmx ||
23172       N->getOperand(0)->getOpcode() != ISD::BUILD_VECTOR ||
23173       N->getOperand(0)->getValueType(0) != MVT::v2i32)
23174     return SDValue();
23175
23176   SDValue V = N->getOperand(0);
23177   ConstantSDNode *C = dyn_cast<ConstantSDNode>(V.getOperand(1));
23178   if (C && C->getZExtValue() == 0 && V.getOperand(0).getValueType() == MVT::i32)
23179     return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(V.getOperand(0)),
23180                        N->getValueType(0), V.getOperand(0));
23181
23182   return SDValue();
23183 }
23184
23185 /// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index
23186 /// generation and convert it from being a bunch of shuffles and extracts
23187 /// into a somewhat faster sequence. For i686, the best sequence is apparently
23188 /// storing the value and loading scalars back, while for x64 we should
23189 /// use 64-bit extracts and shifts.
23190 static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
23191                                          TargetLowering::DAGCombinerInfo &DCI) {
23192   SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI);
23193   if (NewOp.getNode())
23194     return NewOp;
23195
23196   SDValue InputVector = N->getOperand(0);
23197
23198   // Detect mmx to i32 conversion through a v2i32 elt extract.
23199   if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
23200       N->getValueType(0) == MVT::i32 &&
23201       InputVector.getValueType() == MVT::v2i32) {
23202
23203     // The bitcast source is a direct mmx result.
23204     SDValue MMXSrc = InputVector.getNode()->getOperand(0);
23205     if (MMXSrc.getValueType() == MVT::x86mmx)
23206       return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector),
23207                          N->getValueType(0),
23208                          InputVector.getNode()->getOperand(0));
23209
23210     // The mmx is indirect: (i64 extract_elt (v1i64 bitcast (x86mmx ...))).
23211     SDValue MMXSrcOp = MMXSrc.getOperand(0);
23212     if (MMXSrc.getOpcode() == ISD::EXTRACT_VECTOR_ELT && MMXSrc.hasOneUse() &&
23213         MMXSrc.getValueType() == MVT::i64 && MMXSrcOp.hasOneUse() &&
23214         MMXSrcOp.getOpcode() == ISD::BITCAST &&
23215         MMXSrcOp.getValueType() == MVT::v1i64 &&
23216         MMXSrcOp.getOperand(0).getValueType() == MVT::x86mmx)
23217       return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector),
23218                          N->getValueType(0),
23219                          MMXSrcOp.getOperand(0));
23220   }
23221
23222   // Only operate on vectors of 4 elements, where the alternative shuffling
23223   // gets to be more expensive.
23224   if (InputVector.getValueType() != MVT::v4i32)
23225     return SDValue();
23226
23227   // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
23228   // single use which is a sign-extend or zero-extend, and all elements are
23229   // used.
23230   SmallVector<SDNode *, 4> Uses;
23231   unsigned ExtractedElements = 0;
23232   for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
23233        UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
23234     if (UI.getUse().getResNo() != InputVector.getResNo())
23235       return SDValue();
23236
23237     SDNode *Extract = *UI;
23238     if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
23239       return SDValue();
23240
23241     if (Extract->getValueType(0) != MVT::i32)
23242       return SDValue();
23243     if (!Extract->hasOneUse())
23244       return SDValue();
23245     if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
23246         Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
23247       return SDValue();
23248     if (!isa<ConstantSDNode>(Extract->getOperand(1)))
23249       return SDValue();
23250
23251     // Record which element was extracted.
23252     ExtractedElements |=
23253       1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue();
23254
23255     Uses.push_back(Extract);
23256   }
23257
23258   // If not all the elements were used, this may not be worthwhile.
23259   if (ExtractedElements != 15)
23260     return SDValue();
23261
23262   // Ok, we've now decided to do the transformation.
23263   // If 64-bit shifts are legal, use the extract-shift sequence,
23264   // otherwise bounce the vector off the cache.
23265   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23266   SDValue Vals[4];
23267   SDLoc dl(InputVector);
23268
23269   if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) {
23270     SDValue Cst = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, InputVector);
23271     EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy();
23272     SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
23273       DAG.getConstant(0, VecIdxTy));
23274     SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
23275       DAG.getConstant(1, VecIdxTy));
23276
23277     SDValue ShAmt = DAG.getConstant(32,
23278       DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64));
23279     Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf);
23280     Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
23281       DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt));
23282     Vals[2] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, TopHalf);
23283     Vals[3] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
23284       DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt));
23285   } else {
23286     // Store the value to a temporary stack slot.
23287     SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
23288     SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
23289       MachinePointerInfo(), false, false, 0);
23290
23291     EVT ElementType = InputVector.getValueType().getVectorElementType();
23292     unsigned EltSize = ElementType.getSizeInBits() / 8;
23293
23294     // Replace each use (extract) with a load of the appropriate element.
23295     for (unsigned i = 0; i < 4; ++i) {
23296       uint64_t Offset = EltSize * i;
23297       SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy());
23298
23299       SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(),
23300                                        StackPtr, OffsetVal);
23301
23302       // Load the scalar.
23303       Vals[i] = DAG.getLoad(ElementType, dl, Ch,
23304                             ScalarAddr, MachinePointerInfo(),
23305                             false, false, false, 0);
23306
23307     }
23308   }
23309
23310   // Replace the extracts
23311   for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
23312     UE = Uses.end(); UI != UE; ++UI) {
23313     SDNode *Extract = *UI;
23314
23315     SDValue Idx = Extract->getOperand(1);
23316     uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
23317     DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]);
23318   }
23319
23320   // The replacement was made in place; don't return anything.
23321   return SDValue();
23322 }
23323
23324 /// \brief Matches a VSELECT onto min/max or return 0 if the node doesn't match.
23325 static std::pair<unsigned, bool>
23326 matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, SDValue RHS,
23327                    SelectionDAG &DAG, const X86Subtarget *Subtarget) {
23328   if (!VT.isVector())
23329     return std::make_pair(0, false);
23330
23331   bool NeedSplit = false;
23332   switch (VT.getSimpleVT().SimpleTy) {
23333   default: return std::make_pair(0, false);
23334   case MVT::v4i64:
23335   case MVT::v2i64:
23336     if (!Subtarget->hasVLX())
23337       return std::make_pair(0, false);
23338     break;
23339   case MVT::v64i8:
23340   case MVT::v32i16:
23341     if (!Subtarget->hasBWI())
23342       return std::make_pair(0, false);
23343     break;
23344   case MVT::v16i32:
23345   case MVT::v8i64:
23346     if (!Subtarget->hasAVX512())
23347       return std::make_pair(0, false);
23348     break;
23349   case MVT::v32i8:
23350   case MVT::v16i16:
23351   case MVT::v8i32:
23352     if (!Subtarget->hasAVX2())
23353       NeedSplit = true;
23354     if (!Subtarget->hasAVX())
23355       return std::make_pair(0, false);
23356     break;
23357   case MVT::v16i8:
23358   case MVT::v8i16:
23359   case MVT::v4i32:
23360     if (!Subtarget->hasSSE2())
23361       return std::make_pair(0, false);
23362   }
23363
23364   // SSE2 has only a small subset of the operations.
23365   bool hasUnsigned = Subtarget->hasSSE41() ||
23366                      (Subtarget->hasSSE2() && VT == MVT::v16i8);
23367   bool hasSigned = Subtarget->hasSSE41() ||
23368                    (Subtarget->hasSSE2() && VT == MVT::v8i16);
23369
23370   ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
23371
23372   unsigned Opc = 0;
23373   // Check for x CC y ? x : y.
23374   if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
23375       DAG.isEqualTo(RHS, Cond.getOperand(1))) {
23376     switch (CC) {
23377     default: break;
23378     case ISD::SETULT:
23379     case ISD::SETULE:
23380       Opc = hasUnsigned ? X86ISD::UMIN : 0; break;
23381     case ISD::SETUGT:
23382     case ISD::SETUGE:
23383       Opc = hasUnsigned ? X86ISD::UMAX : 0; break;
23384     case ISD::SETLT:
23385     case ISD::SETLE:
23386       Opc = hasSigned ? X86ISD::SMIN : 0; break;
23387     case ISD::SETGT:
23388     case ISD::SETGE:
23389       Opc = hasSigned ? X86ISD::SMAX : 0; break;
23390     }
23391   // Check for x CC y ? y : x -- a min/max with reversed arms.
23392   } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
23393              DAG.isEqualTo(RHS, Cond.getOperand(0))) {
23394     switch (CC) {
23395     default: break;
23396     case ISD::SETULT:
23397     case ISD::SETULE:
23398       Opc = hasUnsigned ? X86ISD::UMAX : 0; break;
23399     case ISD::SETUGT:
23400     case ISD::SETUGE:
23401       Opc = hasUnsigned ? X86ISD::UMIN : 0; break;
23402     case ISD::SETLT:
23403     case ISD::SETLE:
23404       Opc = hasSigned ? X86ISD::SMAX : 0; break;
23405     case ISD::SETGT:
23406     case ISD::SETGE:
23407       Opc = hasSigned ? X86ISD::SMIN : 0; break;
23408     }
23409   }
23410
23411   return std::make_pair(Opc, NeedSplit);
23412 }
23413
23414 static SDValue
23415 transformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG,
23416                                       const X86Subtarget *Subtarget) {
23417   SDLoc dl(N);
23418   SDValue Cond = N->getOperand(0);
23419   SDValue LHS = N->getOperand(1);
23420   SDValue RHS = N->getOperand(2);
23421
23422   if (Cond.getOpcode() == ISD::SIGN_EXTEND) {
23423     SDValue CondSrc = Cond->getOperand(0);
23424     if (CondSrc->getOpcode() == ISD::SIGN_EXTEND_INREG)
23425       Cond = CondSrc->getOperand(0);
23426   }
23427
23428   if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
23429     return SDValue();
23430
23431   // A vselect where all conditions and data are constants can be optimized into
23432   // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
23433   if (ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
23434       ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
23435     return SDValue();
23436
23437   unsigned MaskValue = 0;
23438   if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue))
23439     return SDValue();
23440
23441   MVT VT = N->getSimpleValueType(0);
23442   unsigned NumElems = VT.getVectorNumElements();
23443   SmallVector<int, 8> ShuffleMask(NumElems, -1);
23444   for (unsigned i = 0; i < NumElems; ++i) {
23445     // Be sure we emit undef where we can.
23446     if (Cond.getOperand(i)->getOpcode() == ISD::UNDEF)
23447       ShuffleMask[i] = -1;
23448     else
23449       ShuffleMask[i] = i + NumElems * ((MaskValue >> i) & 1);
23450   }
23451
23452   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23453   if (!TLI.isShuffleMaskLegal(ShuffleMask, VT))
23454     return SDValue();
23455   return DAG.getVectorShuffle(VT, dl, LHS, RHS, &ShuffleMask[0]);
23456 }
23457
23458 /// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT
23459 /// nodes.
23460 static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
23461                                     TargetLowering::DAGCombinerInfo &DCI,
23462                                     const X86Subtarget *Subtarget) {
23463   SDLoc DL(N);
23464   SDValue Cond = N->getOperand(0);
23465   // Get the LHS/RHS of the select.
23466   SDValue LHS = N->getOperand(1);
23467   SDValue RHS = N->getOperand(2);
23468   EVT VT = LHS.getValueType();
23469   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23470
23471   // If we have SSE[12] support, try to form min/max nodes. SSE min/max
23472   // instructions match the semantics of the common C idiom x<y?x:y but not
23473   // x<=y?x:y, because of how they handle negative zero (which can be
23474   // ignored in unsafe-math mode).
23475   // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
23476   if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
23477       VT != MVT::f80 && (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
23478       (Subtarget->hasSSE2() ||
23479        (Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) {
23480     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
23481
23482     unsigned Opcode = 0;
23483     // Check for x CC y ? x : y.
23484     if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
23485         DAG.isEqualTo(RHS, Cond.getOperand(1))) {
23486       switch (CC) {
23487       default: break;
23488       case ISD::SETULT:
23489         // Converting this to a min would handle NaNs incorrectly, and swapping
23490         // the operands would cause it to handle comparisons between positive
23491         // and negative zero incorrectly.
23492         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
23493           if (!DAG.getTarget().Options.UnsafeFPMath &&
23494               !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
23495             break;
23496           std::swap(LHS, RHS);
23497         }
23498         Opcode = X86ISD::FMIN;
23499         break;
23500       case ISD::SETOLE:
23501         // Converting this to a min would handle comparisons between positive
23502         // and negative zero incorrectly.
23503         if (!DAG.getTarget().Options.UnsafeFPMath &&
23504             !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
23505           break;
23506         Opcode = X86ISD::FMIN;
23507         break;
23508       case ISD::SETULE:
23509         // Converting this to a min would handle both negative zeros and NaNs
23510         // incorrectly, but we can swap the operands to fix both.
23511         std::swap(LHS, RHS);
23512       case ISD::SETOLT:
23513       case ISD::SETLT:
23514       case ISD::SETLE:
23515         Opcode = X86ISD::FMIN;
23516         break;
23517
23518       case ISD::SETOGE:
23519         // Converting this to a max would handle comparisons between positive
23520         // and negative zero incorrectly.
23521         if (!DAG.getTarget().Options.UnsafeFPMath &&
23522             !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
23523           break;
23524         Opcode = X86ISD::FMAX;
23525         break;
23526       case ISD::SETUGT:
23527         // Converting this to a max would handle NaNs incorrectly, and swapping
23528         // the operands would cause it to handle comparisons between positive
23529         // and negative zero incorrectly.
23530         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
23531           if (!DAG.getTarget().Options.UnsafeFPMath &&
23532               !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
23533             break;
23534           std::swap(LHS, RHS);
23535         }
23536         Opcode = X86ISD::FMAX;
23537         break;
23538       case ISD::SETUGE:
23539         // Converting this to a max would handle both negative zeros and NaNs
23540         // incorrectly, but we can swap the operands to fix both.
23541         std::swap(LHS, RHS);
23542       case ISD::SETOGT:
23543       case ISD::SETGT:
23544       case ISD::SETGE:
23545         Opcode = X86ISD::FMAX;
23546         break;
23547       }
23548     // Check for x CC y ? y : x -- a min/max with reversed arms.
23549     } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
23550                DAG.isEqualTo(RHS, Cond.getOperand(0))) {
23551       switch (CC) {
23552       default: break;
23553       case ISD::SETOGE:
23554         // Converting this to a min would handle comparisons between positive
23555         // and negative zero incorrectly, and swapping the operands would
23556         // cause it to handle NaNs incorrectly.
23557         if (!DAG.getTarget().Options.UnsafeFPMath &&
23558             !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
23559           if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
23560             break;
23561           std::swap(LHS, RHS);
23562         }
23563         Opcode = X86ISD::FMIN;
23564         break;
23565       case ISD::SETUGT:
23566         // Converting this to a min would handle NaNs incorrectly.
23567         if (!DAG.getTarget().Options.UnsafeFPMath &&
23568             (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
23569           break;
23570         Opcode = X86ISD::FMIN;
23571         break;
23572       case ISD::SETUGE:
23573         // Converting this to a min would handle both negative zeros and NaNs
23574         // incorrectly, but we can swap the operands to fix both.
23575         std::swap(LHS, RHS);
23576       case ISD::SETOGT:
23577       case ISD::SETGT:
23578       case ISD::SETGE:
23579         Opcode = X86ISD::FMIN;
23580         break;
23581
23582       case ISD::SETULT:
23583         // Converting this to a max would handle NaNs incorrectly.
23584         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
23585           break;
23586         Opcode = X86ISD::FMAX;
23587         break;
23588       case ISD::SETOLE:
23589         // Converting this to a max would handle comparisons between positive
23590         // and negative zero incorrectly, and swapping the operands would
23591         // cause it to handle NaNs incorrectly.
23592         if (!DAG.getTarget().Options.UnsafeFPMath &&
23593             !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
23594           if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
23595             break;
23596           std::swap(LHS, RHS);
23597         }
23598         Opcode = X86ISD::FMAX;
23599         break;
23600       case ISD::SETULE:
23601         // Converting this to a max would handle both negative zeros and NaNs
23602         // incorrectly, but we can swap the operands to fix both.
23603         std::swap(LHS, RHS);
23604       case ISD::SETOLT:
23605       case ISD::SETLT:
23606       case ISD::SETLE:
23607         Opcode = X86ISD::FMAX;
23608         break;
23609       }
23610     }
23611
23612     if (Opcode)
23613       return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
23614   }
23615
23616   EVT CondVT = Cond.getValueType();
23617   if (Subtarget->hasAVX512() && VT.isVector() && CondVT.isVector() &&
23618       CondVT.getVectorElementType() == MVT::i1) {
23619     // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
23620     // lowering on KNL. In this case we convert it to
23621     // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
23622     // The same situation for all 128 and 256-bit vectors of i8 and i16.
23623     // Since SKX these selects have a proper lowering.
23624     EVT OpVT = LHS.getValueType();
23625     if ((OpVT.is128BitVector() || OpVT.is256BitVector()) &&
23626         (OpVT.getVectorElementType() == MVT::i8 ||
23627          OpVT.getVectorElementType() == MVT::i16) &&
23628         !(Subtarget->hasBWI() && Subtarget->hasVLX())) {
23629       Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, OpVT, Cond);
23630       DCI.AddToWorklist(Cond.getNode());
23631       return DAG.getNode(N->getOpcode(), DL, OpVT, Cond, LHS, RHS);
23632     }
23633   }
23634   // If this is a select between two integer constants, try to do some
23635   // optimizations.
23636   if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) {
23637     if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS))
23638       // Don't do this for crazy integer types.
23639       if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) {
23640         // If this is efficiently invertible, canonicalize the LHSC/RHSC values
23641         // so that TrueC (the true value) is larger than FalseC.
23642         bool NeedsCondInvert = false;
23643
23644         if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
23645             // Efficiently invertible.
23646             (Cond.getOpcode() == ISD::SETCC ||  // setcc -> invertible.
23647              (Cond.getOpcode() == ISD::XOR &&   // xor(X, C) -> invertible.
23648               isa<ConstantSDNode>(Cond.getOperand(1))))) {
23649           NeedsCondInvert = true;
23650           std::swap(TrueC, FalseC);
23651         }
23652
23653         // Optimize C ? 8 : 0 -> zext(C) << 3.  Likewise for any pow2/0.
23654         if (FalseC->getAPIntValue() == 0 &&
23655             TrueC->getAPIntValue().isPowerOf2()) {
23656           if (NeedsCondInvert) // Invert the condition if needed.
23657             Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
23658                                DAG.getConstant(1, Cond.getValueType()));
23659
23660           // Zero extend the condition if needed.
23661           Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);
23662
23663           unsigned ShAmt = TrueC->getAPIntValue().logBase2();
23664           return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
23665                              DAG.getConstant(ShAmt, MVT::i8));
23666         }
23667
23668         // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst.
23669         if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
23670           if (NeedsCondInvert) // Invert the condition if needed.
23671             Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
23672                                DAG.getConstant(1, Cond.getValueType()));
23673
23674           // Zero extend the condition if needed.
23675           Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
23676                              FalseC->getValueType(0), Cond);
23677           return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
23678                              SDValue(FalseC, 0));
23679         }
23680
23681         // Optimize cases that will turn into an LEA instruction.  This requires
23682         // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
23683         if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
23684           uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
23685           if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
23686
23687           bool isFastMultiplier = false;
23688           if (Diff < 10) {
23689             switch ((unsigned char)Diff) {
23690               default: break;
23691               case 1:  // result = add base, cond
23692               case 2:  // result = lea base(    , cond*2)
23693               case 3:  // result = lea base(cond, cond*2)
23694               case 4:  // result = lea base(    , cond*4)
23695               case 5:  // result = lea base(cond, cond*4)
23696               case 8:  // result = lea base(    , cond*8)
23697               case 9:  // result = lea base(cond, cond*8)
23698                 isFastMultiplier = true;
23699                 break;
23700             }
23701           }
23702
23703           if (isFastMultiplier) {
23704             APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
23705             if (NeedsCondInvert) // Invert the condition if needed.
23706               Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
23707                                  DAG.getConstant(1, Cond.getValueType()));
23708
23709             // Zero extend the condition if needed.
23710             Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
23711                                Cond);
23712             // Scale the condition by the difference.
23713             if (Diff != 1)
23714               Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
23715                                  DAG.getConstant(Diff, Cond.getValueType()));
23716
23717             // Add the base if non-zero.
23718             if (FalseC->getAPIntValue() != 0)
23719               Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
23720                                  SDValue(FalseC, 0));
23721             return Cond;
23722           }
23723         }
23724       }
23725   }
23726
23727   // Canonicalize max and min:
23728   // (x > y) ? x : y -> (x >= y) ? x : y
23729   // (x < y) ? x : y -> (x <= y) ? x : y
23730   // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
23731   // the need for an extra compare
23732   // against zero. e.g.
23733   // (x - y) > 0 : (x - y) ? 0 -> (x - y) >= 0 : (x - y) ? 0
23734   // subl   %esi, %edi
23735   // testl  %edi, %edi
23736   // movl   $0, %eax
23737   // cmovgl %edi, %eax
23738   // =>
23739   // xorl   %eax, %eax
23740   // subl   %esi, %edi
23741   // cmovsl %eax, %edi
23742   if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
23743       DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
23744       DAG.isEqualTo(RHS, Cond.getOperand(1))) {
23745     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
23746     switch (CC) {
23747     default: break;
23748     case ISD::SETLT:
23749     case ISD::SETGT: {
23750       ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
23751       Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
23752                           Cond.getOperand(0), Cond.getOperand(1), NewCC);
23753       return DAG.getNode(ISD::SELECT, DL, VT, Cond, LHS, RHS);
23754     }
23755     }
23756   }
23757
23758   // Early exit check
23759   if (!TLI.isTypeLegal(VT))
23760     return SDValue();
23761
23762   // Match VSELECTs into subs with unsigned saturation.
23763   if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
23764       // psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
23765       ((Subtarget->hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
23766        (Subtarget->hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
23767     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
23768
23769     // Check if one of the arms of the VSELECT is a zero vector. If it's on the
23770     // left side invert the predicate to simplify logic below.
23771     SDValue Other;
23772     if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
23773       Other = RHS;
23774       CC = ISD::getSetCCInverse(CC, true);
23775     } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
23776       Other = LHS;
23777     }
23778
23779     if (Other.getNode() && Other->getNumOperands() == 2 &&
23780         DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) {
23781       SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
23782       SDValue CondRHS = Cond->getOperand(1);
23783
23784       // Look for a general sub with unsigned saturation first.
23785       // x >= y ? x-y : 0 --> subus x, y
23786       // x >  y ? x-y : 0 --> subus x, y
23787       if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
23788           Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))
23789         return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
23790
23791       if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS))
23792         if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) {
23793           if (auto *CondRHSBV = dyn_cast<BuildVectorSDNode>(CondRHS))
23794             if (auto *CondRHSConst = CondRHSBV->getConstantSplatNode())
23795               // If the RHS is a constant we have to reverse the const
23796               // canonicalization.
23797               // x > C-1 ? x+-C : 0 --> subus x, C
23798               if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
23799                   CondRHSConst->getAPIntValue() ==
23800                       (-OpRHSConst->getAPIntValue() - 1))
23801                 return DAG.getNode(
23802                     X86ISD::SUBUS, DL, VT, OpLHS,
23803                     DAG.getConstant(-OpRHSConst->getAPIntValue(), VT));
23804
23805           // Another special case: If C was a sign bit, the sub has been
23806           // canonicalized into a xor.
23807           // FIXME: Would it be better to use computeKnownBits to determine
23808           //        whether it's safe to decanonicalize the xor?
23809           // x s< 0 ? x^C : 0 --> subus x, C
23810           if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
23811               ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
23812               OpRHSConst->getAPIntValue().isSignBit())
23813             // Note that we have to rebuild the RHS constant here to ensure we
23814             // don't rely on particular values of undef lanes.
23815             return DAG.getNode(
23816                 X86ISD::SUBUS, DL, VT, OpLHS,
23817                 DAG.getConstant(OpRHSConst->getAPIntValue(), VT));
23818         }
23819     }
23820   }
23821
23822   // Try to match a min/max vector operation.
23823   if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC) {
23824     std::pair<unsigned, bool> ret = matchIntegerMINMAX(Cond, VT, LHS, RHS, DAG, Subtarget);
23825     unsigned Opc = ret.first;
23826     bool NeedSplit = ret.second;
23827
23828     if (Opc && NeedSplit) {
23829       unsigned NumElems = VT.getVectorNumElements();
23830       // Extract the LHS vectors
23831       SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, DL);
23832       SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, DL);
23833
23834       // Extract the RHS vectors
23835       SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, DL);
23836       SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, DL);
23837
23838       // Create min/max for each subvector
23839       LHS = DAG.getNode(Opc, DL, LHS1.getValueType(), LHS1, RHS1);
23840       RHS = DAG.getNode(Opc, DL, LHS2.getValueType(), LHS2, RHS2);
23841
23842       // Merge the result
23843       return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LHS, RHS);
23844     } else if (Opc)
23845       return DAG.getNode(Opc, DL, VT, LHS, RHS);
23846   }
23847
23848   // Simplify vector selection if condition value type matches vselect
23849   // operand type
23850   if (N->getOpcode() == ISD::VSELECT && CondVT == VT) {
23851     assert(Cond.getValueType().isVector() &&
23852            "vector select expects a vector selector!");
23853
23854     bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
23855     bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
23856
23857     // Try invert the condition if true value is not all 1s and false value
23858     // is not all 0s.
23859     if (!TValIsAllOnes && !FValIsAllZeros &&
23860         // Check if the selector will be produced by CMPP*/PCMP*
23861         Cond.getOpcode() == ISD::SETCC &&
23862         // Check if SETCC has already been promoted
23863         TLI.getSetCCResultType(*DAG.getContext(), VT) == CondVT) {
23864       bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
23865       bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
23866
23867       if (TValIsAllZeros || FValIsAllOnes) {
23868         SDValue CC = Cond.getOperand(2);
23869         ISD::CondCode NewCC =
23870           ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
23871                                Cond.getOperand(0).getValueType().isInteger());
23872         Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1), NewCC);
23873         std::swap(LHS, RHS);
23874         TValIsAllOnes = FValIsAllOnes;
23875         FValIsAllZeros = TValIsAllZeros;
23876       }
23877     }
23878
23879     if (TValIsAllOnes || FValIsAllZeros) {
23880       SDValue Ret;
23881
23882       if (TValIsAllOnes && FValIsAllZeros)
23883         Ret = Cond;
23884       else if (TValIsAllOnes)
23885         Ret = DAG.getNode(ISD::OR, DL, CondVT, Cond,
23886                           DAG.getNode(ISD::BITCAST, DL, CondVT, RHS));
23887       else if (FValIsAllZeros)
23888         Ret = DAG.getNode(ISD::AND, DL, CondVT, Cond,
23889                           DAG.getNode(ISD::BITCAST, DL, CondVT, LHS));
23890
23891       return DAG.getNode(ISD::BITCAST, DL, VT, Ret);
23892     }
23893   }
23894
23895   // If we know that this node is legal then we know that it is going to be
23896   // matched by one of the SSE/AVX BLEND instructions. These instructions only
23897   // depend on the highest bit in each word. Try to use SimplifyDemandedBits
23898   // to simplify previous instructions.
23899   if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
23900       !DCI.isBeforeLegalize() &&
23901       // We explicitly check against v8i16 and v16i16 because, although
23902       // they're marked as Custom, they might only be legal when Cond is a
23903       // build_vector of constants. This will be taken care in a later
23904       // condition.
23905       (TLI.isOperationLegalOrCustom(ISD::VSELECT, VT) && VT != MVT::v16i16 &&
23906        VT != MVT::v8i16) &&
23907       // Don't optimize vector of constants. Those are handled by
23908       // the generic code and all the bits must be properly set for
23909       // the generic optimizer.
23910       !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
23911     unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits();
23912
23913     // Don't optimize vector selects that map to mask-registers.
23914     if (BitWidth == 1)
23915       return SDValue();
23916
23917     assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
23918     APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1);
23919
23920     APInt KnownZero, KnownOne;
23921     TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
23922                                           DCI.isBeforeLegalizeOps());
23923     if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) ||
23924         TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne,
23925                                  TLO)) {
23926       // If we changed the computation somewhere in the DAG, this change
23927       // will affect all users of Cond.
23928       // Make sure it is fine and update all the nodes so that we do not
23929       // use the generic VSELECT anymore. Otherwise, we may perform
23930       // wrong optimizations as we messed up with the actual expectation
23931       // for the vector boolean values.
23932       if (Cond != TLO.Old) {
23933         // Check all uses of that condition operand to check whether it will be
23934         // consumed by non-BLEND instructions, which may depend on all bits are
23935         // set properly.
23936         for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
23937              I != E; ++I)
23938           if (I->getOpcode() != ISD::VSELECT)
23939             // TODO: Add other opcodes eventually lowered into BLEND.
23940             return SDValue();
23941
23942         // Update all the users of the condition, before committing the change,
23943         // so that the VSELECT optimizations that expect the correct vector
23944         // boolean value will not be triggered.
23945         for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
23946              I != E; ++I)
23947           DAG.ReplaceAllUsesOfValueWith(
23948               SDValue(*I, 0),
23949               DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(*I), I->getValueType(0),
23950                           Cond, I->getOperand(1), I->getOperand(2)));
23951         DCI.CommitTargetLoweringOpt(TLO);
23952         return SDValue();
23953       }
23954       // At this point, only Cond is changed. Change the condition
23955       // just for N to keep the opportunity to optimize all other
23956       // users their own way.
23957       DAG.ReplaceAllUsesOfValueWith(
23958           SDValue(N, 0),
23959           DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(N), N->getValueType(0),
23960                       TLO.New, N->getOperand(1), N->getOperand(2)));
23961       return SDValue();
23962     }
23963   }
23964
23965   // We should generate an X86ISD::BLENDI from a vselect if its argument
23966   // is a sign_extend_inreg of an any_extend of a BUILD_VECTOR of
23967   // constants. This specific pattern gets generated when we split a
23968   // selector for a 512 bit vector in a machine without AVX512 (but with
23969   // 256-bit vectors), during legalization:
23970   //
23971   // (vselect (sign_extend (any_extend (BUILD_VECTOR)) i1) LHS RHS)
23972   //
23973   // Iff we find this pattern and the build_vectors are built from
23974   // constants, we translate the vselect into a shuffle_vector that we
23975   // know will be matched by LowerVECTOR_SHUFFLEtoBlend.
23976   if ((N->getOpcode() == ISD::VSELECT ||
23977        N->getOpcode() == X86ISD::SHRUNKBLEND) &&
23978       !DCI.isBeforeLegalize()) {
23979     SDValue Shuffle = transformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget);
23980     if (Shuffle.getNode())
23981       return Shuffle;
23982   }
23983
23984   return SDValue();
23985 }
23986
23987 // Check whether a boolean test is testing a boolean value generated by
23988 // X86ISD::SETCC (or SETCC_CARRY, or a CMOV choosing between 0 and 1). If so,
23989 // return the flags operand of that node and store the condition to use in CC.
23990 //
23991 // Simplify the following patterns:
23992 // (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
23993 // (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
23994 // to (Op EFLAGS Cond)
23995 //
23996 // (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
23997 // (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
23998 // to (Op EFLAGS !Cond)
23999 //
24000 // where Op could be BRCOND or CMOV. CC is read as the condition applied to Cmp
24001 // and is only overwritten on success; on failure a null SDValue is returned.
24002 static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
24003   // Quit unless this is a CMP, or a SUB whose value result (value 0) is unused.
24004   if (Cmp.getOpcode() != X86ISD::CMP &&
24005       (Cmp.getOpcode() != X86ISD::SUB || Cmp.getNode()->hasAnyUseOfValue(0)))
24006       return SDValue();
24007
24008   // Quit if not used as a boolean value, i.e. not an equality test.
24009   if (CC != X86::COND_E && CC != X86::COND_NE)
24010     return SDValue();
24011
24012   // Check CMP operands. One of them should be the constant 0 or 1 and the
24013   // other should be a SetCC or a value extended/truncated from it.
24014   SDValue Op1 = Cmp.getOperand(0);
24015   SDValue Op2 = Cmp.getOperand(1);
24016
24017   SDValue SetCC;
24018   const ConstantSDNode* C = nullptr;
24019   bool needOppositeCond = (CC == X86::COND_E); // flipped below if C is 1
24020   bool checkAgainstTrue = false; // Is it a comparison against 1?
24021
24022   if ((C = dyn_cast<ConstantSDNode>(Op1)))
24023     SetCC = Op2;
24024   else if ((C = dyn_cast<ConstantSDNode>(Op2)))
24025     SetCC = Op1;
24026   else // Quit if neither operand is a constant.
24027     return SDValue();
24028
24029   if (C->getZExtValue() == 1) {
24030     needOppositeCond = !needOppositeCond;
24031     checkAgainstTrue = true;
24032   } else if (C->getZExtValue() != 0)
24033     // Quit if the constant is neither 0 nor 1.
24034     return SDValue();
24035
24036   bool truncatedToBoolWithAnd = false;
24037   // Peel off any chain of (zext $x), (trunc $x), or (and $x, 1) nodes.
24038   while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
24039          SetCC.getOpcode() == ISD::TRUNCATE ||
24040          SetCC.getOpcode() == ISD::AND) {
24041     if (SetCC.getOpcode() == ISD::AND) {
24042       int OpIdx = -1; // operand to continue with; -1 if neither side is 1
24043       ConstantSDNode *CS;
24044       if ((CS = dyn_cast<ConstantSDNode>(SetCC.getOperand(0))) &&
24045           CS->getZExtValue() == 1)
24046         OpIdx = 1;
24047       if ((CS = dyn_cast<ConstantSDNode>(SetCC.getOperand(1))) &&
24048           CS->getZExtValue() == 1)
24049         OpIdx = 0;
24050       if (OpIdx == -1) // an AND with something other than 1: stop peeling
24051         break;
24052       SetCC = SetCC.getOperand(OpIdx);
24053       truncatedToBoolWithAnd = true;
24054     } else
24055       SetCC = SetCC.getOperand(0);
24056   }
24057
24058   switch (SetCC.getOpcode()) {
24059   case X86ISD::SETCC_CARRY:
24060     // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
24061     // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
24062     // i.e. it's a comparison against true but the result of SETCC_CARRY is not
24063     // truncated to i1 using 'and'.
24064     if (checkAgainstTrue && !truncatedToBoolWithAnd)
24065       break;
24066     assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
24067            "Invalid use of SETCC_CARRY!");
24068     // FALL THROUGH: from here on SETCC_CARRY is handled exactly like SETCC.
24069   case X86ISD::SETCC:
24070     // Take the SETCC's condition code (operand 0), or its opposite if needed.
24071     CC = X86::CondCode(SetCC.getConstantOperandVal(0));
24072     if (needOppositeCond)
24073       CC = X86::GetOppositeBranchCondition(CC);
24074     return SetCC.getOperand(1);
24075   case X86ISD::CMOV: {
24076     // Check whether the false/true values are the canonical ones, i.e. 0 or 1.
24077     ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
24078     ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
24079     // Quit if true value is not a constant.
24080     if (!TVal)
24081       return SDValue();
24082     // A non-constant false value is only accepted for rdrand/rdseed (below).
24083     if (!FVal) {
24084       SDValue Op = SetCC.getOperand(0);
24085       // Skip 'zext' or 'trunc' node.
24086       if (Op.getOpcode() == ISD::ZERO_EXTEND ||
24087           Op.getOpcode() == ISD::TRUNCATE)
24088         Op = Op.getOperand(0);
24089       // A special case for rdrand/rdseed, where 0 is set if false cond is
24090       // found. Only result 0 of such a node qualifies.
24091       if ((Op.getOpcode() != X86ISD::RDRAND &&
24092            Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
24093         return SDValue();
24094     }
24095     // Quit if false value is a constant other than 0 or 1.
24096     bool FValIsFalse = true;
24097     if (FVal && FVal->getZExtValue() != 0) {
24098       if (FVal->getZExtValue() != 1)
24099         return SDValue();
24100       // If FVal is 1, opposite cond is needed.
24101       needOppositeCond = !needOppositeCond;
24102       FValIsFalse = false;
24103     }
24104     // Quit if TVal is not the constant opposite of FVal.
24105     if (FValIsFalse && TVal->getZExtValue() != 1)
24106       return SDValue();
24107     if (!FValIsFalse && TVal->getZExtValue() != 0)
24108       return SDValue();
24109     CC = X86::CondCode(SetCC.getConstantOperandVal(2)); // the CMOV's own cond
24110     if (needOppositeCond)
24111       CC = X86::GetOppositeBranchCondition(CC);
24112     return SetCC.getOperand(3);
24113   }
24114   }
24115
24116   return SDValue();
24117 }
24118
24119 /// Optimize X86ISD::CMOV [FalseOp, TrueOp, CC (e.g. X86::COND_NE), CONDVAL]
24120 static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
24121                                   TargetLowering::DAGCombinerInfo &DCI,
24122                                   const X86Subtarget *Subtarget) {
24123   SDLoc DL(N);
24124
24125   // If the flag result (value 1) is still used, don't touch this CMOV.
24126   if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
24127     return SDValue();
24128
24129   SDValue FalseOp = N->getOperand(0);
24130   SDValue TrueOp = N->getOperand(1);
24131   X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
24132   SDValue Cond = N->getOperand(3);
24133
24134   if (CC == X86::COND_E || CC == X86::COND_NE) {
24135     switch (Cond.getOpcode()) {
24136     default: break;
24137     case X86ISD::BSR:
24138     case X86ISD::BSF:
24139       // BSR / BSF of a known non-zero operand never sets ZF: fold the CMOV.
24140       if (DAG.isKnownNeverZero(Cond.getOperand(0)))
24141         return (CC == X86::COND_E) ? FalseOp : TrueOp;
24142     }
24143   }
24144
24145   SDValue Flags;
24146
24147   Flags = checkBoolTestSetCCCombine(Cond, CC); // may rewrite CC on success
24148   if (Flags.getNode() &&
24149       // Extra check as FCMOV only supports a subset of X86 cond.
24150       (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC))) {
24151     SDValue Ops[] = { FalseOp, TrueOp,
24152                       DAG.getConstant(CC, MVT::i8), Flags };
24153     return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
24154   }
24155
24156   // If this is a select between two integer constants, try to do some
24157   // optimizations.  Note that the operands are ordered the opposite of SELECT
24158   // operands.
24159   if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
24160     if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
24161       // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
24162       // not smaller (unsigned) than FalseC, inverting CC to compensate.
24163       if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
24164         CC = X86::GetOppositeBranchCondition(CC);
24165         std::swap(TrueC, FalseC);
24166         std::swap(TrueOp, FalseOp);
24167       }
24168
24169       // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3.  Likewise for any pow2/0.
24170       // This is efficient for any integer data type (including i8/i16) and
24171       // shift amount.
24172       if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
24173         Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
24174                            DAG.getConstant(CC, MVT::i8), Cond);
24175
24176         // Zero extend the condition if needed.
24177         Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
24178
24179         unsigned ShAmt = TrueC->getAPIntValue().logBase2();
24180         Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
24181                            DAG.getConstant(ShAmt, MVT::i8));
24182         if (N->getNumValues() == 2)  // Also replace the (dead) flag result.
24183           return DCI.CombineTo(N, Cond, SDValue());
24184         return Cond;
24185       }
24186
24187       // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst.  This is efficient
24188       // for any integer data type, including i8/i16.
24189       if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
24190         Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
24191                            DAG.getConstant(CC, MVT::i8), Cond);
24192
24193         // Zero extend the condition if needed.
24194         Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
24195                            FalseC->getValueType(0), Cond);
24196         Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
24197                            SDValue(FalseC, 0));
24198
24199         if (N->getNumValues() == 2)  // Also replace the (dead) flag result.
24200           return DCI.CombineTo(N, Cond, SDValue());
24201         return Cond;
24202       }
24203
24204       // Optimize cases that will turn into an LEA instruction.  This requires
24205       // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
24206       if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
24207         uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
24208         if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
24209
24210         bool isFastMultiplier = false;
24211         if (Diff < 10) {
24212           switch ((unsigned char)Diff) {
24213           default: break;
24214           case 1:  // result = add base, cond
24215           case 2:  // result = lea base(    , cond*2)
24216           case 3:  // result = lea base(cond, cond*2)
24217           case 4:  // result = lea base(    , cond*4)
24218           case 5:  // result = lea base(cond, cond*4)
24219           case 8:  // result = lea base(    , cond*8)
24220           case 9:  // result = lea base(cond, cond*8)
24221             isFastMultiplier = true;
24222             break;
24223           }
24224         }
24225
24226         if (isFastMultiplier) {
24227           APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
24228           Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
24229                              DAG.getConstant(CC, MVT::i8), Cond);
24230           // Zero extend the condition if needed.
24231           Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
24232                              Cond);
24233           // Scale by the APInt Diff, which shadows the outer uint64_t Diff.
24234           if (Diff != 1)
24235             Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
24236                                DAG.getConstant(Diff, Cond.getValueType()));
24237
24238           // Add the base if non-zero.
24239           if (FalseC->getAPIntValue() != 0)
24240             Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
24241                                SDValue(FalseC, 0));
24242           if (N->getNumValues() == 2)  // Also replace the (dead) flag result.
24243             return DCI.CombineTo(N, Cond, SDValue());
24244           return Cond;
24245         }
24246       }
24247     }
24248   }
24249
24250   // Handle these cases:
24251   //   (select (x != c), e, c) -> (select (x != c), e, x),
24252   //   (select (x == c), c, e) -> (select (x == c), x, e)
24253   // where the c is an integer constant, and the "select" is the combination
24254   // of CMOV and CMP.
24255   //
24256   // The rationale for this change is that the conditional-move from a constant
24257   // needs two instructions, however, conditional-move from a register needs
24258   // only one instruction.
24259   //
24260   // CAVEAT: By replacing a constant with a symbolic value, it may obscure
24261   //  some instruction-combining opportunities. This opt needs to be
24262   //  postponed as late as possible.
24263   //
24264   if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
24265     // The DCI.xxxx conditions are provided to postpone the optimization as
24266     // late as possible.
24267
24268     ConstantSDNode *CmpAgainst = nullptr;
24269     if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
24270         (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
24271         !isa<ConstantSDNode>(Cond.getOperand(0))) {
24272
24273       // Turn the "x != c" form into the "x == c" form handled just below.
24274       if (CC == X86::COND_NE &&
24275           CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
24276         CC = X86::GetOppositeBranchCondition(CC);
24277         std::swap(TrueOp, FalseOp);
24278       }
24279
24280       // The true value is the very constant x is compared with: use x itself.
24281       if (CC == X86::COND_E &&
24282           CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
24283         SDValue Ops[] = { FalseOp, Cond.getOperand(0),
24284                           DAG.getConstant(CC, MVT::i8), Cond };
24285         return DAG.getNode(X86ISD::CMOV, DL, N->getVTList (), Ops);
24286       }
24287     }
24288   }
24289
24290   return SDValue();
24291 }
24290
24291 static SDValue PerformINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG,
24292                                                 const X86Subtarget *Subtarget) {
24293   unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
24294   switch (IntNo) {
24295   default: return SDValue();
24296   // SSE/AVX/AVX2 blend intrinsics.
24297   case Intrinsic::x86_avx2_pblendvb:
24298   case Intrinsic::x86_avx2_pblendw:
24299   case Intrinsic::x86_avx2_pblendd_128:
24300   case Intrinsic::x86_avx2_pblendd_256:
24301     // Don't try to simplify this intrinsic if we don't have AVX2.
24302     if (!Subtarget->hasAVX2())
24303       return SDValue();
24304     // FALL-THROUGH
24305   case Intrinsic::x86_avx_blend_pd_256:
24306   case Intrinsic::x86_avx_blend_ps_256:
24307   case Intrinsic::x86_avx_blendv_pd_256:
24308   case Intrinsic::x86_avx_blendv_ps_256:
24309     // Don't try to simplify this intrinsic if we don't have AVX.
24310     if (!Subtarget->hasAVX())
24311       return SDValue();
24312     // FALL-THROUGH
24313   case Intrinsic::x86_sse41_pblendw:
24314   case Intrinsic::x86_sse41_blendpd:
24315   case Intrinsic::x86_sse41_blendps:
24316   case Intrinsic::x86_sse41_blendvps:
24317   case Intrinsic::x86_sse41_blendvpd:
24318   case Intrinsic::x86_sse41_pblendvb: {
24319     SDValue Op0 = N->getOperand(1);
24320     SDValue Op1 = N->getOperand(2);
24321     SDValue Mask = N->getOperand(3);
24322
24323     // Don't try to simplify this intrinsic if we don't have SSE4.1.
24324     if (!Subtarget->hasSSE41())
24325       return SDValue();
24326
24327     // fold (blend A, A, Mask) -> A
24328     if (Op0 == Op1)
24329       return Op0;
24330     // fold (blend A, B, allZeros) -> A
24331     if (ISD::isBuildVectorAllZeros(Mask.getNode()))
24332       return Op0;
24333     // fold (blend A, B, allOnes) -> B
24334     if (ISD::isBuildVectorAllOnes(Mask.getNode()))
24335       return Op1;
24336
24337     // Simplify the case where the mask is a constant i32 value.
24338     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Mask)) {
24339       if (C->isNullValue())
24340         return Op0;
24341       if (C->isAllOnesValue())
24342         return Op1;
24343     }
24344
24345     return SDValue();
24346   }
24347
24348   // Packed SSE2/AVX2 arithmetic shift immediate intrinsics.
24349   case Intrinsic::x86_sse2_psrai_w:
24350   case Intrinsic::x86_sse2_psrai_d:
24351   case Intrinsic::x86_avx2_psrai_w:
24352   case Intrinsic::x86_avx2_psrai_d:
24353   case Intrinsic::x86_sse2_psra_w:
24354   case Intrinsic::x86_sse2_psra_d:
24355   case Intrinsic::x86_avx2_psra_w:
24356   case Intrinsic::x86_avx2_psra_d: {
24357     SDValue Op0 = N->getOperand(1);
24358     SDValue Op1 = N->getOperand(2);
24359     EVT VT = Op0.getValueType();
24360     assert(VT.isVector() && "Expected a vector type!");
24361
24362     if (isa<BuildVectorSDNode>(Op1))
24363       Op1 = Op1.getOperand(0);
24364
24365     if (!isa<ConstantSDNode>(Op1))
24366       return SDValue();
24367
24368     EVT SVT = VT.getVectorElementType();
24369     unsigned SVTBits = SVT.getSizeInBits();
24370
24371     ConstantSDNode *CND = cast<ConstantSDNode>(Op1);
24372     const APInt &C = APInt(SVTBits, CND->getAPIntValue().getZExtValue());
24373     uint64_t ShAmt = C.getZExtValue();
24374
24375     // Don't try to convert this shift into a ISD::SRA if the shift
24376     // count is bigger than or equal to the element size.
24377     if (ShAmt >= SVTBits)
24378       return SDValue();
24379
24380     // Trivial case: if the shift count is zero, then fold this
24381     // into the first operand.
24382     if (ShAmt == 0)
24383       return Op0;
24384
24385     // Replace this packed shift intrinsic with a target independent
24386     // shift dag node.
24387     SDValue Splat = DAG.getConstant(C, VT);
24388     return DAG.getNode(ISD::SRA, SDLoc(N), VT, Op0, Splat);
24389   }
24390   }
24391 }
24392
24393 /// PerformMulCombine - Split a multiply by a constant into two cheaper
24394 /// operations (e.g. LEA + SHL, LEA + LEA) when the constant factors suitably.
24395 /// The node is replaced through DCI.CombineTo; a null SDValue is returned.
24396 static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
24397                                  TargetLowering::DAGCombinerInfo &DCI) {
24398   if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
24399     return SDValue();
24400
24401   EVT VT = N->getValueType(0);
24402   if (VT != MVT::i64 && VT != MVT::i32)
24403     return SDValue();
24404
24405   ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
24406   if (!C)
24407     return SDValue();
24408   uint64_t MulAmt = C->getZExtValue();
24409   if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
24410     return SDValue(); // these amounts are left alone, nothing to split
24411
24412   uint64_t MulAmt1 = 0;
24413   uint64_t MulAmt2 = 0; // stays 0 when MulAmt is not divisible by 9, 5 or 3
24414   if ((MulAmt % 9) == 0) {
24415     MulAmt1 = 9;
24416     MulAmt2 = MulAmt / 9;
24417   } else if ((MulAmt % 5) == 0) {
24418     MulAmt1 = 5;
24419     MulAmt2 = MulAmt / 5;
24420   } else if ((MulAmt % 3) == 0) {
24421     MulAmt1 = 3;
24422     MulAmt2 = MulAmt / 3;
24423   }
24424   if (MulAmt2 &&
24425       (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
24426     SDLoc DL(N);
24427
24428     if (isPowerOf2_64(MulAmt2) &&
24429         !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
24430       // If second multiplier is pow2, issue it first. We want the multiply by
24431       // 3, 5, or 9 to be folded into the addressing mode unless the lone use
24432       // is an add.
24433       std::swap(MulAmt1, MulAmt2);
24434
24435     SDValue NewMul;
24436     if (isPowerOf2_64(MulAmt1)) // first step: shift or MUL_IMM of operand 0
24437       NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
24438                            DAG.getConstant(Log2_64(MulAmt1), MVT::i8));
24439     else
24440       NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
24441                            DAG.getConstant(MulAmt1, VT));
24442
24443     if (isPowerOf2_64(MulAmt2)) // second step, applied to the first result
24444       NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
24445                            DAG.getConstant(Log2_64(MulAmt2), MVT::i8));
24446     else
24447       NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
24448                            DAG.getConstant(MulAmt2, VT));
24449
24450     // Do not add new nodes to DAG combiner worklist (the 'false' argument).
24451     DCI.CombineTo(N, NewMul, false);
24452   }
24453   return SDValue();
24454 }
24455
24456 static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
24457   SDValue N0 = N->getOperand(0);
24458   SDValue N1 = N->getOperand(1);
24459   ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
24460   EVT VT = N0.getValueType();
24461
24462   // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
24463   // since the result of setcc_c is all zero's or all ones.
24464   if (VT.isInteger() && !VT.isVector() &&
24465       N1C && N0.getOpcode() == ISD::AND &&
24466       N0.getOperand(1).getOpcode() == ISD::Constant) {
24467     SDValue N00 = N0.getOperand(0);
24468     if (N00.getOpcode() == X86ISD::SETCC_CARRY ||
24469         ((N00.getOpcode() == ISD::ANY_EXTEND ||
24470           N00.getOpcode() == ISD::ZERO_EXTEND) &&
24471          N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) {
24472       APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
24473       APInt ShAmt = N1C->getAPIntValue();
24474       Mask = Mask.shl(ShAmt);
24475       if (Mask != 0)
24476         return DAG.getNode(ISD::AND, SDLoc(N), VT,
24477                            N00, DAG.getConstant(Mask, VT));
24478     }
24479   }
24480
24481   // Hardware support for vector shifts is sparse which makes us scalarize the
24482   // vector operations in many cases. Also, on sandybridge ADD is faster than
24483   // shl.
24484   // (shl V, 1) -> add V,V
24485   if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
24486     if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
24487       assert(N0.getValueType().isVector() && "Invalid vector shift type");
24488       // We shift all of the values by one. In many cases we do not have
24489       // hardware support for this operation. This is better expressed as an ADD
24490       // of two values.
24491       if (N1SplatC->getZExtValue() == 1)
24492         return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
24493     }
24494
24495   return SDValue();
24496 }
24497
24498 /// \brief Returns a vector of 0s if the node in input is a vector logical
24499 /// shift by a constant amount which is known to be bigger than or equal
24500 /// to the vector element size in bits.
24501 static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
24502                                       const X86Subtarget *Subtarget) {
24503   EVT VT = N->getValueType(0);
24504
24505   if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 &&
24506       (!Subtarget->hasInt256() ||
24507        (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16)))
24508     return SDValue();
24509
24510   SDValue Amt = N->getOperand(1);
24511   SDLoc DL(N);
24512   if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Amt))
24513     if (auto *AmtSplat = AmtBV->getConstantSplatNode()) {
24514       APInt ShiftAmt = AmtSplat->getAPIntValue();
24515       unsigned MaxAmount = VT.getVectorElementType().getSizeInBits();
24516
24517       // SSE2/AVX2 logical shifts always return a vector of 0s
24518       // if the shift amount is bigger than or equal to
24519       // the element size. The constant shift amount will be
24520       // encoded as a 8-bit immediate.
24521       if (ShiftAmt.trunc(8).uge(MaxAmount))
24522         return getZeroVector(VT, Subtarget, DAG, DL);
24523     }
24524
24525   return SDValue();
24526 }
24527
24528 /// PerformShiftCombine - Combine shifts.
24529 static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
24530                                    TargetLowering::DAGCombinerInfo &DCI,
24531                                    const X86Subtarget *Subtarget) {
24532   if (N->getOpcode() == ISD::SHL) {
24533     SDValue V = PerformSHLCombine(N, DAG);
24534     if (V.getNode()) return V;
24535   }
24536
24537   if (N->getOpcode() != ISD::SRA) {
24538     // Try to fold this logical shift into a zero vector.
24539     SDValue V = performShiftToAllZeros(N, DAG, Subtarget);
24540     if (V.getNode()) return V;
24541   }
24542
24543   return SDValue();
24544 }
24545
24546 // CMPEQCombine - Recognize the distinctive  (AND (setcc ...) (setcc ..))
24547 // where both setccs reference the same FP CMP, and rewrite for CMPEQSS
24548 // and friends.  Likewise for OR -> CMPNEQSS.
24549 static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG,
24550                             TargetLowering::DAGCombinerInfo &DCI,
24551                             const X86Subtarget *Subtarget) {
24552   unsigned opcode;
24553
24554   // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
24555   // we're requiring SSE2 for both.
24556   if (Subtarget->hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
24557     SDValue N0 = N->getOperand(0);
24558     SDValue N1 = N->getOperand(1);
24559     SDValue CMP0 = N0->getOperand(1);
24560     SDValue CMP1 = N1->getOperand(1);
24561     SDLoc DL(N);
24562
24563     // The SETCCs should both refer to the same CMP.
24564     if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
24565       return SDValue();
24566
24567     SDValue CMP00 = CMP0->getOperand(0);
24568     SDValue CMP01 = CMP0->getOperand(1);
24569     EVT     VT    = CMP00.getValueType();
24570
24571     if (VT == MVT::f32 || VT == MVT::f64) {
24572       bool ExpectingFlags = false;
24573       // Check for any users that want flags:
24574       for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
24575            !ExpectingFlags && UI != UE; ++UI)
24576         switch (UI->getOpcode()) {
24577         default:
24578         case ISD::BR_CC:
24579         case ISD::BRCOND:
24580         case ISD::SELECT:
24581           ExpectingFlags = true;
24582           break;
24583         case ISD::CopyToReg:
24584         case ISD::SIGN_EXTEND:
24585         case ISD::ZERO_EXTEND:
24586         case ISD::ANY_EXTEND:
24587           break;
24588         }
24589
24590       if (!ExpectingFlags) {
24591         enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
24592         enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
24593
24594         if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
24595           X86::CondCode tmp = cc0;
24596           cc0 = cc1;
24597           cc1 = tmp;
24598         }
24599
24600         if ((cc0 == X86::COND_E  && cc1 == X86::COND_NP) ||
24601             (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
24602           // FIXME: need symbolic constants for these magic numbers.
24603           // See X86ATTInstPrinter.cpp:printSSECC().
24604           unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
24605           if (Subtarget->hasAVX512()) {
24606             SDValue FSetCC = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CMP00,
24607                                          CMP01, DAG.getConstant(x86cc, MVT::i8));
24608             if (N->getValueType(0) != MVT::i1)
24609               return DAG.getNode(ISD::ZERO_EXTEND, DL, N->getValueType(0),
24610                                  FSetCC);
24611             return FSetCC;
24612           }
24613           SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
24614                                               CMP00.getValueType(), CMP00, CMP01,
24615                                               DAG.getConstant(x86cc, MVT::i8));
24616
24617           bool is64BitFP = (CMP00.getValueType() == MVT::f64);
24618           MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
24619
24620           if (is64BitFP && !Subtarget->is64Bit()) {
24621             // On a 32-bit target, we cannot bitcast the 64-bit float to a
24622             // 64-bit integer, since that's not a legal type. Since
24623             // OnesOrZeroesF is all ones of all zeroes, we don't need all the
24624             // bits, but can do this little dance to extract the lowest 32 bits
24625             // and work with those going forward.
24626             SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
24627                                            OnesOrZeroesF);
24628             SDValue Vector32 = DAG.getNode(ISD::BITCAST, DL, MVT::v4f32,
24629                                            Vector64);
24630             OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
24631                                         Vector32, DAG.getIntPtrConstant(0));
24632             IntVT = MVT::i32;
24633           }
24634
24635           SDValue OnesOrZeroesI = DAG.getNode(ISD::BITCAST, DL, IntVT, OnesOrZeroesF);
24636           SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
24637                                       DAG.getConstant(1, IntVT));
24638           SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ANDed);
24639           return OneBitOfTruth;
24640         }
24641       }
24642     }
24643   }
24644   return SDValue();
24645 }
24646
24647 /// CanFoldXORWithAllOnes - Test whether the XOR operand is a AllOnes vector
24648 /// so it can be folded inside ANDNP.
24649 static bool CanFoldXORWithAllOnes(const SDNode *N) {
24650   EVT VT = N->getValueType(0);
24651
24652   // Match direct AllOnes for 128 and 256-bit vectors
24653   if (ISD::isBuildVectorAllOnes(N))
24654     return true;
24655
24656   // Look through a bit convert.
24657   if (N->getOpcode() == ISD::BITCAST)
24658     N = N->getOperand(0).getNode();
24659
24660   // Sometimes the operand may come from a insert_subvector building a 256-bit
24661   // allones vector
24662   if (VT.is256BitVector() &&
24663       N->getOpcode() == ISD::INSERT_SUBVECTOR) {
24664     SDValue V1 = N->getOperand(0);
24665     SDValue V2 = N->getOperand(1);
24666
24667     if (V1.getOpcode() == ISD::INSERT_SUBVECTOR &&
24668         V1.getOperand(0).getOpcode() == ISD::UNDEF &&
24669         ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) &&
24670         ISD::isBuildVectorAllOnes(V2.getNode()))
24671       return true;
24672   }
24673
24674   return false;
24675 }
24676
24677 // On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
24678 // register. In most cases we actually compare or select YMM-sized registers
24679 // and mixing the two types creates horrible code. This method optimizes
24680 // some of the transition sequences.
24681 static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
24682                                  TargetLowering::DAGCombinerInfo &DCI,
24683                                  const X86Subtarget *Subtarget) {
24684   EVT VT = N->getValueType(0);
24685   if (!VT.is256BitVector())
24686     return SDValue();
24687
24688   assert((N->getOpcode() == ISD::ANY_EXTEND ||
24689           N->getOpcode() == ISD::ZERO_EXTEND ||
24690           N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
24691
24692   SDValue Narrow = N->getOperand(0);
24693   EVT NarrowVT = Narrow->getValueType(0);
24694   if (!NarrowVT.is128BitVector())
24695     return SDValue();
24696
24697   if (Narrow->getOpcode() != ISD::XOR &&
24698       Narrow->getOpcode() != ISD::AND &&
24699       Narrow->getOpcode() != ISD::OR)
24700     return SDValue();
24701
24702   SDValue N0  = Narrow->getOperand(0);
24703   SDValue N1  = Narrow->getOperand(1);
24704   SDLoc DL(Narrow);
24705
24706   // The Left side has to be a trunc.
24707   if (N0.getOpcode() != ISD::TRUNCATE)
24708     return SDValue();
24709
24710   // The type of the truncated inputs.
24711   EVT WideVT = N0->getOperand(0)->getValueType(0);
24712   if (WideVT != VT)
24713     return SDValue();
24714
24715   // The right side has to be a 'trunc' or a constant vector.
24716   bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE;
24717   ConstantSDNode *RHSConstSplat = nullptr;
24718   if (auto *RHSBV = dyn_cast<BuildVectorSDNode>(N1))
24719     RHSConstSplat = RHSBV->getConstantSplatNode();
24720   if (!RHSTrunc && !RHSConstSplat)
24721     return SDValue();
24722
24723   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24724
24725   if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), WideVT))
24726     return SDValue();
24727
24728   // Set N0 and N1 to hold the inputs to the new wide operation.
24729   N0 = N0->getOperand(0);
24730   if (RHSConstSplat) {
24731     N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getScalarType(),
24732                      SDValue(RHSConstSplat, 0));
24733     SmallVector<SDValue, 8> C(WideVT.getVectorNumElements(), N1);
24734     N1 = DAG.getNode(ISD::BUILD_VECTOR, DL, WideVT, C);
24735   } else if (RHSTrunc) {
24736     N1 = N1->getOperand(0);
24737   }
24738
24739   // Generate the wide operation.
24740   SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, WideVT, N0, N1);
24741   unsigned Opcode = N->getOpcode();
24742   switch (Opcode) {
24743   case ISD::ANY_EXTEND:
24744     return Op;
24745   case ISD::ZERO_EXTEND: {
24746     unsigned InBits = NarrowVT.getScalarType().getSizeInBits();
24747     APInt Mask = APInt::getAllOnesValue(InBits);
24748     Mask = Mask.zext(VT.getScalarType().getSizeInBits());
24749     return DAG.getNode(ISD::AND, DL, VT,
24750                        Op, DAG.getConstant(Mask, VT));
24751   }
24752   case ISD::SIGN_EXTEND:
24753     return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
24754                        Op, DAG.getValueType(NarrowVT));
24755   default:
24756     llvm_unreachable("Unexpected opcode");
24757   }
24758 }
24759
24760 static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
24761                                  TargetLowering::DAGCombinerInfo &DCI,
24762                                  const X86Subtarget *Subtarget) {
24763   EVT VT = N->getValueType(0);
24764   if (DCI.isBeforeLegalizeOps())
24765     return SDValue();
24766
24767   SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget);
24768   if (R.getNode())
24769     return R;
24770
24771   // Create BEXTR instructions
24772   // BEXTR is ((X >> imm) & (2**size-1))
24773   if (VT == MVT::i32 || VT == MVT::i64) {
24774     SDValue N0 = N->getOperand(0);
24775     SDValue N1 = N->getOperand(1);
24776     SDLoc DL(N);
24777
24778     // Check for BEXTR.
24779     if ((Subtarget->hasBMI() || Subtarget->hasTBM()) &&
24780         (N0.getOpcode() == ISD::SRA || N0.getOpcode() == ISD::SRL)) {
24781       ConstantSDNode *MaskNode = dyn_cast<ConstantSDNode>(N1);
24782       ConstantSDNode *ShiftNode = dyn_cast<ConstantSDNode>(N0.getOperand(1));
24783       if (MaskNode && ShiftNode) {
24784         uint64_t Mask = MaskNode->getZExtValue();
24785         uint64_t Shift = ShiftNode->getZExtValue();
24786         if (isMask_64(Mask)) {
24787           uint64_t MaskSize = countPopulation(Mask);
24788           if (Shift + MaskSize <= VT.getSizeInBits())
24789             return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0),
24790                                DAG.getConstant(Shift | (MaskSize << 8), VT));
24791         }
24792       }
24793     } // BEXTR
24794
24795     return SDValue();
24796   }
24797
24798   // Want to form ANDNP nodes:
24799   // 1) In the hopes of then easily combining them with OR and AND nodes
24800   //    to form PBLEND/PSIGN.
24801   // 2) To match ANDN packed intrinsics
24802   if (VT != MVT::v2i64 && VT != MVT::v4i64)
24803     return SDValue();
24804
24805   SDValue N0 = N->getOperand(0);
24806   SDValue N1 = N->getOperand(1);
24807   SDLoc DL(N);
24808
24809   // Check LHS for vnot
24810   if (N0.getOpcode() == ISD::XOR &&
24811       //ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
24812       CanFoldXORWithAllOnes(N0.getOperand(1).getNode()))
24813     return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1);
24814
24815   // Check RHS for vnot
24816   if (N1.getOpcode() == ISD::XOR &&
24817       //ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
24818       CanFoldXORWithAllOnes(N1.getOperand(1).getNode()))
24819     return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0);
24820
24821   return SDValue();
24822 }
24823
24824 static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
24825                                 TargetLowering::DAGCombinerInfo &DCI,
24826                                 const X86Subtarget *Subtarget) {
24827   if (DCI.isBeforeLegalizeOps())
24828     return SDValue();
24829
24830   SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget);
24831   if (R.getNode())
24832     return R;
24833
24834   SDValue N0 = N->getOperand(0);
24835   SDValue N1 = N->getOperand(1);
24836   EVT VT = N->getValueType(0);
24837
24838   // look for psign/blend
24839   if (VT == MVT::v2i64 || VT == MVT::v4i64) {
24840     if (!Subtarget->hasSSSE3() ||
24841         (VT == MVT::v4i64 && !Subtarget->hasInt256()))
24842       return SDValue();
24843
24844     // Canonicalize pandn to RHS
24845     if (N0.getOpcode() == X86ISD::ANDNP)
24846       std::swap(N0, N1);
24847     // or (and (m, y), (pandn m, x))
24848     if (N0.getOpcode() == ISD::AND && N1.getOpcode() == X86ISD::ANDNP) {
24849       SDValue Mask = N1.getOperand(0);
24850       SDValue X    = N1.getOperand(1);
24851       SDValue Y;
24852       if (N0.getOperand(0) == Mask)
24853         Y = N0.getOperand(1);
24854       if (N0.getOperand(1) == Mask)
24855         Y = N0.getOperand(0);
24856
24857       // Check to see if the mask appeared in both the AND and ANDNP and
24858       if (!Y.getNode())
24859         return SDValue();
24860
24861       // Validate that X, Y, and Mask are BIT_CONVERTS, and see through them.
24862       // Look through mask bitcast.
24863       if (Mask.getOpcode() == ISD::BITCAST)
24864         Mask = Mask.getOperand(0);
24865       if (X.getOpcode() == ISD::BITCAST)
24866         X = X.getOperand(0);
24867       if (Y.getOpcode() == ISD::BITCAST)
24868         Y = Y.getOperand(0);
24869
24870       EVT MaskVT = Mask.getValueType();
24871
24872       // Validate that the Mask operand is a vector sra node.
24873       // FIXME: what to do for bytes, since there is a psignb/pblendvb, but
24874       // there is no psrai.b
24875       unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits();
24876       unsigned SraAmt = ~0;
24877       if (Mask.getOpcode() == ISD::SRA) {
24878         if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Mask.getOperand(1)))
24879           if (auto *AmtConst = AmtBV->getConstantSplatNode())
24880             SraAmt = AmtConst->getZExtValue();
24881       } else if (Mask.getOpcode() == X86ISD::VSRAI) {
24882         SDValue SraC = Mask.getOperand(1);
24883         SraAmt  = cast<ConstantSDNode>(SraC)->getZExtValue();
24884       }
24885       if ((SraAmt + 1) != EltBits)
24886         return SDValue();
24887
24888       SDLoc DL(N);
24889
24890       // Now we know we at least have a plendvb with the mask val.  See if
24891       // we can form a psignb/w/d.
24892       // psign = x.type == y.type == mask.type && y = sub(0, x);
24893       if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X &&
24894           ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) &&
24895           X.getValueType() == MaskVT && Y.getValueType() == MaskVT) {
24896         assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
24897                "Unsupported VT for PSIGN");
24898         Mask = DAG.getNode(X86ISD::PSIGN, DL, MaskVT, X, Mask.getOperand(0));
24899         return DAG.getNode(ISD::BITCAST, DL, VT, Mask);
24900       }
24901       // PBLENDVB only available on SSE 4.1
24902       if (!Subtarget->hasSSE41())
24903         return SDValue();
24904
24905       EVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;
24906
24907       X = DAG.getNode(ISD::BITCAST, DL, BlendVT, X);
24908       Y = DAG.getNode(ISD::BITCAST, DL, BlendVT, Y);
24909       Mask = DAG.getNode(ISD::BITCAST, DL, BlendVT, Mask);
24910       Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X);
24911       return DAG.getNode(ISD::BITCAST, DL, VT, Mask);
24912     }
24913   }
24914
24915   if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
24916     return SDValue();
24917
24918   // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
24919   MachineFunction &MF = DAG.getMachineFunction();
24920   bool OptForSize =
24921       MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize);
24922
24923   // SHLD/SHRD instructions have lower register pressure, but on some
24924   // platforms they have higher latency than the equivalent
24925   // series of shifts/or that would otherwise be generated.
24926   // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
24927   // have higher latencies and we are not optimizing for size.
24928   if (!OptForSize && Subtarget->isSHLDSlow())
24929     return SDValue();
24930
24931   if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
24932     std::swap(N0, N1);
24933   if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
24934     return SDValue();
24935   if (!N0.hasOneUse() || !N1.hasOneUse())
24936     return SDValue();
24937
24938   SDValue ShAmt0 = N0.getOperand(1);
24939   if (ShAmt0.getValueType() != MVT::i8)
24940     return SDValue();
24941   SDValue ShAmt1 = N1.getOperand(1);
24942   if (ShAmt1.getValueType() != MVT::i8)
24943     return SDValue();
24944   if (ShAmt0.getOpcode() == ISD::TRUNCATE)
24945     ShAmt0 = ShAmt0.getOperand(0);
24946   if (ShAmt1.getOpcode() == ISD::TRUNCATE)
24947     ShAmt1 = ShAmt1.getOperand(0);
24948
24949   SDLoc DL(N);
24950   unsigned Opc = X86ISD::SHLD;
24951   SDValue Op0 = N0.getOperand(0);
24952   SDValue Op1 = N1.getOperand(0);
24953   if (ShAmt0.getOpcode() == ISD::SUB) {
24954     Opc = X86ISD::SHRD;
24955     std::swap(Op0, Op1);
24956     std::swap(ShAmt0, ShAmt1);
24957   }
24958
24959   unsigned Bits = VT.getSizeInBits();
24960   if (ShAmt1.getOpcode() == ISD::SUB) {
24961     SDValue Sum = ShAmt1.getOperand(0);
24962     if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
24963       SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
24964       if (ShAmt1Op1.getNode()->getOpcode() == ISD::TRUNCATE)
24965         ShAmt1Op1 = ShAmt1Op1.getOperand(0);
24966       if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
24967         return DAG.getNode(Opc, DL, VT,
24968                            Op0, Op1,
24969                            DAG.getNode(ISD::TRUNCATE, DL,
24970                                        MVT::i8, ShAmt0));
24971     }
24972   } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
24973     ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
24974     if (ShAmt0C &&
24975         ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == Bits)
24976       return DAG.getNode(Opc, DL, VT,
24977                          N0.getOperand(0), N1.getOperand(0),
24978                          DAG.getNode(ISD::TRUNCATE, DL,
24979                                        MVT::i8, ShAmt0));
24980   }
24981
24982   return SDValue();
24983 }
24984
24985 // Generate NEG and CMOV for integer abs.
24986 static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
24987   EVT VT = N->getValueType(0);
24988
24989   // Since X86 does not have CMOV for 8-bit integer, we don't convert
24990   // 8-bit integer abs to NEG and CMOV.
24991   if (VT.isInteger() && VT.getSizeInBits() == 8)
24992     return SDValue();
24993
24994   SDValue N0 = N->getOperand(0);
24995   SDValue N1 = N->getOperand(1);
24996   SDLoc DL(N);
24997
24998   // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
24999   // and change it to SUB and CMOV.
25000   if (VT.isInteger() && N->getOpcode() == ISD::XOR &&
25001       N0.getOpcode() == ISD::ADD &&
25002       N0.getOperand(1) == N1 &&
25003       N1.getOpcode() == ISD::SRA &&
25004       N1.getOperand(0) == N0.getOperand(0))
25005     if (ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1)))
25006       if (Y1C->getAPIntValue() == VT.getSizeInBits()-1) {
25007         // Generate SUB & CMOV.
25008         SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
25009                                   DAG.getConstant(0, VT), N0.getOperand(0));
25010
25011         SDValue Ops[] = { N0.getOperand(0), Neg,
25012                           DAG.getConstant(X86::COND_GE, MVT::i8),
25013                           SDValue(Neg.getNode(), 1) };
25014         return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), Ops);
25015       }
25016   return SDValue();
25017 }
25018
25019 // PerformXorCombine - Attempts to turn XOR nodes into BLSMSK nodes
25020 static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG,
25021                                  TargetLowering::DAGCombinerInfo &DCI,
25022                                  const X86Subtarget *Subtarget) {
25023   if (DCI.isBeforeLegalizeOps())
25024     return SDValue();
25025
25026   if (Subtarget->hasCMov()) {
25027     SDValue RV = performIntegerAbsCombine(N, DAG);
25028     if (RV.getNode())
25029       return RV;
25030   }
25031
25032   return SDValue();
25033 }
25034
25035 /// PerformLOADCombine - Do target-specific dag combines on LOAD nodes.
25036 static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
25037                                   TargetLowering::DAGCombinerInfo &DCI,
25038                                   const X86Subtarget *Subtarget) {
25039   LoadSDNode *Ld = cast<LoadSDNode>(N);
25040   EVT RegVT = Ld->getValueType(0);
25041   EVT MemVT = Ld->getMemoryVT();
25042   SDLoc dl(Ld);
25043   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25044
25045   // For chips with slow 32-byte unaligned loads, break the 32-byte operation
25046   // into two 16-byte operations.
25047   ISD::LoadExtType Ext = Ld->getExtensionType();
25048   unsigned Alignment = Ld->getAlignment();
25049   bool IsAligned = Alignment == 0 || Alignment >= MemVT.getSizeInBits()/8;
25050   if (RegVT.is256BitVector() && Subtarget->isUnalignedMem32Slow() &&
25051       !DCI.isBeforeLegalizeOps() && !IsAligned && Ext == ISD::NON_EXTLOAD) {
25052     unsigned NumElems = RegVT.getVectorNumElements();
25053     if (NumElems < 2)
25054       return SDValue();
25055
25056     SDValue Ptr = Ld->getBasePtr();
25057     SDValue Increment = DAG.getConstant(16, TLI.getPointerTy());
25058
25059     EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
25060                                   NumElems/2);
25061     SDValue Load1 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
25062                                 Ld->getPointerInfo(), Ld->isVolatile(),
25063                                 Ld->isNonTemporal(), Ld->isInvariant(),
25064                                 Alignment);
25065     Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
25066     SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
25067                                 Ld->getPointerInfo(), Ld->isVolatile(),
25068                                 Ld->isNonTemporal(), Ld->isInvariant(),
25069                                 std::min(16U, Alignment));
25070     SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
25071                              Load1.getValue(1),
25072                              Load2.getValue(1));
25073
25074     SDValue NewVec = DAG.getUNDEF(RegVT);
25075     NewVec = Insert128BitVector(NewVec, Load1, 0, DAG, dl);
25076     NewVec = Insert128BitVector(NewVec, Load2, NumElems/2, DAG, dl);
25077     return DCI.CombineTo(N, NewVec, TF, true);
25078   }
25079
25080   return SDValue();
25081 }
25082
25083 /// PerformMLOADCombine - Resolve extending loads
25084 static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG,
25085                                    TargetLowering::DAGCombinerInfo &DCI,
25086                                    const X86Subtarget *Subtarget) {
25087   MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N);
25088   if (Mld->getExtensionType() != ISD::SEXTLOAD)
25089     return SDValue();
25090
25091   EVT VT = Mld->getValueType(0);
25092   unsigned NumElems = VT.getVectorNumElements();
25093   EVT LdVT = Mld->getMemoryVT();
25094   SDLoc dl(Mld);
25095
25096   assert(LdVT != VT && "Cannot extend to the same type");
25097   unsigned ToSz = VT.getVectorElementType().getSizeInBits();
25098   unsigned FromSz = LdVT.getVectorElementType().getSizeInBits();
25099   // From, To sizes and ElemCount must be pow of two
25100   assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
25101     "Unexpected size for extending masked load");
25102
25103   unsigned SizeRatio  = ToSz / FromSz;
25104   assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits());
25105
25106   // Create a type on which we perform the shuffle
25107   EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
25108           LdVT.getScalarType(), NumElems*SizeRatio);
25109   assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
25110
25111   // Convert Src0 value
25112   SDValue WideSrc0 = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mld->getSrc0());
25113   if (Mld->getSrc0().getOpcode() != ISD::UNDEF) {
25114     SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
25115     for (unsigned i = 0; i != NumElems; ++i)
25116       ShuffleVec[i] = i * SizeRatio;
25117
25118     // Can't shuffle using an illegal type.
25119     assert (DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT)
25120             && "WideVecVT should be legal");
25121     WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0,
25122                                     DAG.getUNDEF(WideVecVT), &ShuffleVec[0]);
25123   }
25124   // Prepare the new mask
25125   SDValue NewMask;
25126   SDValue Mask = Mld->getMask();
25127   if (Mask.getValueType() == VT) {
25128     // Mask and original value have the same type
25129     NewMask = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mask);
25130     SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
25131     for (unsigned i = 0; i != NumElems; ++i)
25132       ShuffleVec[i] = i * SizeRatio;
25133     for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
25134       ShuffleVec[i] = NumElems*SizeRatio;
25135     NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
25136                                    DAG.getConstant(0, WideVecVT),
25137                                    &ShuffleVec[0]);
25138   }
25139   else {
25140     assert(Mask.getValueType().getVectorElementType() == MVT::i1);
25141     unsigned WidenNumElts = NumElems*SizeRatio;
25142     unsigned MaskNumElts = VT.getVectorNumElements();
25143     EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(),  MVT::i1,
25144                                      WidenNumElts);
25145
25146     unsigned NumConcat = WidenNumElts / MaskNumElts;
25147     SmallVector<SDValue, 16> Ops(NumConcat);
25148     SDValue ZeroVal = DAG.getConstant(0, Mask.getValueType());
25149     Ops[0] = Mask;
25150     for (unsigned i = 1; i != NumConcat; ++i)
25151       Ops[i] = ZeroVal;
25152
25153     NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
25154   }
25155
25156   SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(),
25157                                      Mld->getBasePtr(), NewMask, WideSrc0,
25158                                      Mld->getMemoryVT(), Mld->getMemOperand(),
25159                                      ISD::NON_EXTLOAD);
25160   SDValue NewVec = DAG.getNode(X86ISD::VSEXT, dl, VT, WideLd);
25161   return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
25162
25163 }
25164 /// PerformMSTORECombine - Resolve truncating stores
25165 static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG,
25166                                     const X86Subtarget *Subtarget) {
25167   MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
25168   if (!Mst->isTruncatingStore())
25169     return SDValue();
25170
25171   EVT VT = Mst->getValue().getValueType();
25172   unsigned NumElems = VT.getVectorNumElements();
25173   EVT StVT = Mst->getMemoryVT();
25174   SDLoc dl(Mst);
25175
25176   assert(StVT != VT && "Cannot truncate to the same type");
25177   unsigned FromSz = VT.getVectorElementType().getSizeInBits();
25178   unsigned ToSz = StVT.getVectorElementType().getSizeInBits();
25179
25180   // From, To sizes and ElemCount must be pow of two
25181   assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
25182     "Unexpected size for truncating masked store");
25183   // We are going to use the original vector elt for storing.
25184   // Accumulated smaller vector elements must be a multiple of the store size.
25185   assert (((NumElems * FromSz) % ToSz) == 0 &&
25186           "Unexpected ratio for truncating masked store");
25187
25188   unsigned SizeRatio  = FromSz / ToSz;
25189   assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
25190
25191   // Create a type on which we perform the shuffle
25192   EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
25193           StVT.getScalarType(), NumElems*SizeRatio);
25194
25195   assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
25196
25197   SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mst->getValue());
25198   SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
25199   for (unsigned i = 0; i != NumElems; ++i)
25200     ShuffleVec[i] = i * SizeRatio;
25201
25202   // Can't shuffle using an illegal type.
25203   assert (DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT)
25204           && "WideVecVT should be legal");
25205
25206   SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
25207                                         DAG.getUNDEF(WideVecVT),
25208                                         &ShuffleVec[0]);
25209
25210   SDValue NewMask;
25211   SDValue Mask = Mst->getMask();
25212   if (Mask.getValueType() == VT) {
25213     // Mask and original value have the same type
25214     NewMask = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mask);
25215     for (unsigned i = 0; i != NumElems; ++i)
25216       ShuffleVec[i] = i * SizeRatio;
25217     for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
25218       ShuffleVec[i] = NumElems*SizeRatio;
25219     NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
25220                                    DAG.getConstant(0, WideVecVT),
25221                                    &ShuffleVec[0]);
25222   }
25223   else {
25224     assert(Mask.getValueType().getVectorElementType() == MVT::i1);
25225     unsigned WidenNumElts = NumElems*SizeRatio;
25226     unsigned MaskNumElts = VT.getVectorNumElements();
25227     EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(),  MVT::i1,
25228                                      WidenNumElts);
25229
25230     unsigned NumConcat = WidenNumElts / MaskNumElts;
25231     SmallVector<SDValue, 16> Ops(NumConcat);
25232     SDValue ZeroVal = DAG.getConstant(0, Mask.getValueType());
25233     Ops[0] = Mask;
25234     for (unsigned i = 1; i != NumConcat; ++i)
25235       Ops[i] = ZeroVal;
25236
25237     NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
25238   }
25239
25240   return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal, Mst->getBasePtr(),
25241                             NewMask, StVT, Mst->getMemOperand(), false);
25242 }
25243 /// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
25244 static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
25245                                    const X86Subtarget *Subtarget) {
25246   StoreSDNode *St = cast<StoreSDNode>(N);
25247   EVT VT = St->getValue().getValueType();
25248   EVT StVT = St->getMemoryVT();
25249   SDLoc dl(St);
25250   SDValue StoredVal = St->getOperand(1);
25251   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25252
25253   // If we are saving a concatenation of two XMM registers and 32-byte stores
25254   // are slow, such as on Sandy Bridge, perform two 16-byte stores.
25255   unsigned Alignment = St->getAlignment();
25256   bool IsAligned = Alignment == 0 || Alignment >= VT.getSizeInBits()/8;
25257   if (VT.is256BitVector() && Subtarget->isUnalignedMem32Slow() &&
25258       StVT == VT && !IsAligned) {
25259     unsigned NumElems = VT.getVectorNumElements();
25260     if (NumElems < 2)
25261       return SDValue();
25262
25263     SDValue Value0 = Extract128BitVector(StoredVal, 0, DAG, dl);
25264     SDValue Value1 = Extract128BitVector(StoredVal, NumElems/2, DAG, dl);
25265
25266     SDValue Stride = DAG.getConstant(16, TLI.getPointerTy());
25267     SDValue Ptr0 = St->getBasePtr();
25268     SDValue Ptr1 = DAG.getNode(ISD::ADD, dl, Ptr0.getValueType(), Ptr0, Stride);
25269
25270     SDValue Ch0 = DAG.getStore(St->getChain(), dl, Value0, Ptr0,
25271                                 St->getPointerInfo(), St->isVolatile(),
25272                                 St->isNonTemporal(), Alignment);
25273     SDValue Ch1 = DAG.getStore(St->getChain(), dl, Value1, Ptr1,
25274                                 St->getPointerInfo(), St->isVolatile(),
25275                                 St->isNonTemporal(),
25276                                 std::min(16U, Alignment));
25277     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
25278   }
25279
25280   // Optimize trunc store (of multiple scalars) to shuffle and store.
25281   // First, pack all of the elements in one place. Next, store to memory
25282   // in fewer chunks.
25283   if (St->isTruncatingStore() && VT.isVector()) {
25284     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25285     unsigned NumElems = VT.getVectorNumElements();
25286     assert(StVT != VT && "Cannot truncate to the same type");
25287     unsigned FromSz = VT.getVectorElementType().getSizeInBits();
25288     unsigned ToSz = StVT.getVectorElementType().getSizeInBits();
25289
25290     // From, To sizes and ElemCount must be pow of two
25291     if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
25292     // We are going to use the original vector elt for storing.
25293     // Accumulated smaller vector elements must be a multiple of the store size.
25294     if (0 != (NumElems * FromSz) % ToSz) return SDValue();
25295
25296     unsigned SizeRatio  = FromSz / ToSz;
25297
25298     assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
25299
25300     // Create a type on which we perform the shuffle
25301     EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
25302             StVT.getScalarType(), NumElems*SizeRatio);
25303
25304     assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
25305
25306     SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, St->getValue());
25307     SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
25308     for (unsigned i = 0; i != NumElems; ++i)
25309       ShuffleVec[i] = i * SizeRatio;
25310
25311     // Can't shuffle using an illegal type.
25312     if (!TLI.isTypeLegal(WideVecVT))
25313       return SDValue();
25314
25315     SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
25316                                          DAG.getUNDEF(WideVecVT),
25317                                          &ShuffleVec[0]);
25318     // At this point all of the data is stored at the bottom of the
25319     // register. We now need to save it to mem.
25320
25321     // Find the largest store unit
25322     MVT StoreType = MVT::i8;
25323     for (MVT Tp : MVT::integer_valuetypes()) {
25324       if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
25325         StoreType = Tp;
25326     }
25327
25328     // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
25329     if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
25330         (64 <= NumElems * ToSz))
25331       StoreType = MVT::f64;
25332
25333     // Bitcast the original vector into a vector of store-size units
25334     EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
25335             StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
25336     assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
25337     SDValue ShuffWide = DAG.getNode(ISD::BITCAST, dl, StoreVecVT, Shuff);
25338     SmallVector<SDValue, 8> Chains;
25339     SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8,
25340                                         TLI.getPointerTy());
25341     SDValue Ptr = St->getBasePtr();
25342
25343     // Perform one or more big stores into memory.
25344     for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) {
25345       SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
25346                                    StoreType, ShuffWide,
25347                                    DAG.getIntPtrConstant(i));
25348       SDValue Ch = DAG.getStore(St->getChain(), dl, SubVec, Ptr,
25349                                 St->getPointerInfo(), St->isVolatile(),
25350                                 St->isNonTemporal(), St->getAlignment());
25351       Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
25352       Chains.push_back(Ch);
25353     }
25354
25355     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
25356   }
25357
25358   // Turn load->store of MMX types into GPR load/stores.  This avoids clobbering
25359   // the FP state in cases where an emms may be missing.
25360   // A preferable solution to the general problem is to figure out the right
25361   // places to insert EMMS.  This qualifies as a quick hack.
25362
25363   // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
25364   if (VT.getSizeInBits() != 64)
25365     return SDValue();
25366
25367   const Function *F = DAG.getMachineFunction().getFunction();
25368   bool NoImplicitFloatOps = F->hasFnAttribute(Attribute::NoImplicitFloat);
25369   bool F64IsLegal = !DAG.getTarget().Options.UseSoftFloat && !NoImplicitFloatOps
25370                      && Subtarget->hasSSE2();
25371   if ((VT.isVector() ||
25372        (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) &&
25373       isa<LoadSDNode>(St->getValue()) &&
25374       !cast<LoadSDNode>(St->getValue())->isVolatile() &&
25375       St->getChain().hasOneUse() && !St->isVolatile()) {
25376     SDNode* LdVal = St->getValue().getNode();
25377     LoadSDNode *Ld = nullptr;
25378     int TokenFactorIndex = -1;
25379     SmallVector<SDValue, 8> Ops;
25380     SDNode* ChainVal = St->getChain().getNode();
25381     // Must be a store of a load.  We currently handle two cases:  the load
25382     // is a direct child, and it's under an intervening TokenFactor.  It is
25383     // possible to dig deeper under nested TokenFactors.
25384     if (ChainVal == LdVal)
25385       Ld = cast<LoadSDNode>(St->getChain());
25386     else if (St->getValue().hasOneUse() &&
25387              ChainVal->getOpcode() == ISD::TokenFactor) {
25388       for (unsigned i = 0, e = ChainVal->getNumOperands(); i != e; ++i) {
25389         if (ChainVal->getOperand(i).getNode() == LdVal) {
25390           TokenFactorIndex = i;
25391           Ld = cast<LoadSDNode>(St->getValue());
25392         } else
25393           Ops.push_back(ChainVal->getOperand(i));
25394       }
25395     }
25396
25397     if (!Ld || !ISD::isNormalLoad(Ld))
25398       return SDValue();
25399
25400     // If this is not the MMX case, i.e. we are just turning i64 load/store
25401     // into f64 load/store, avoid the transformation if there are multiple
25402     // uses of the loaded value.
25403     if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
25404       return SDValue();
25405
25406     SDLoc LdDL(Ld);
25407     SDLoc StDL(N);
25408     // If we are a 64-bit capable x86, lower to a single movq load/store pair.
25409     // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
25410     // pair instead.
25411     if (Subtarget->is64Bit() || F64IsLegal) {
25412       EVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64;
25413       SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
25414                                   Ld->getPointerInfo(), Ld->isVolatile(),
25415                                   Ld->isNonTemporal(), Ld->isInvariant(),
25416                                   Ld->getAlignment());
25417       SDValue NewChain = NewLd.getValue(1);
25418       if (TokenFactorIndex != -1) {
25419         Ops.push_back(NewChain);
25420         NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
25421       }
25422       return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
25423                           St->getPointerInfo(),
25424                           St->isVolatile(), St->isNonTemporal(),
25425                           St->getAlignment());
25426     }
25427
25428     // Otherwise, lower to two pairs of 32-bit loads / stores.
25429     SDValue LoAddr = Ld->getBasePtr();
25430     SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr,
25431                                  DAG.getConstant(4, MVT::i32));
25432
25433     SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
25434                                Ld->getPointerInfo(),
25435                                Ld->isVolatile(), Ld->isNonTemporal(),
25436                                Ld->isInvariant(), Ld->getAlignment());
25437     SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
25438                                Ld->getPointerInfo().getWithOffset(4),
25439                                Ld->isVolatile(), Ld->isNonTemporal(),
25440                                Ld->isInvariant(),
25441                                MinAlign(Ld->getAlignment(), 4));
25442
25443     SDValue NewChain = LoLd.getValue(1);
25444     if (TokenFactorIndex != -1) {
25445       Ops.push_back(LoLd);
25446       Ops.push_back(HiLd);
25447       NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
25448     }
25449
25450     LoAddr = St->getBasePtr();
25451     HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr,
25452                          DAG.getConstant(4, MVT::i32));
25453
25454     SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr,
25455                                 St->getPointerInfo(),
25456                                 St->isVolatile(), St->isNonTemporal(),
25457                                 St->getAlignment());
25458     SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr,
25459                                 St->getPointerInfo().getWithOffset(4),
25460                                 St->isVolatile(),
25461                                 St->isNonTemporal(),
25462                                 MinAlign(St->getAlignment(), 4));
25463     return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
25464   }
25465   return SDValue();
25466 }
25467
25468 /// Return 'true' if this vector operation is "horizontal"
25469 /// and return the operands for the horizontal operation in LHS and RHS.  A
25470 /// horizontal operation performs the binary operation on successive elements
25471 /// of its first operand, then on successive elements of its second operand,
25472 /// returning the resulting values in a vector.  For example, if
25473 ///   A = < float a0, float a1, float a2, float a3 >
25474 /// and
25475 ///   B = < float b0, float b1, float b2, float b3 >
25476 /// then the result of doing a horizontal operation on A and B is
25477 ///   A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
25478 /// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
25479 /// A horizontal-op B, for some already available A and B, and if so then LHS is
25480 /// set to A, RHS to B, and the routine returns 'true'.
25481 /// Note that the binary operation should have the property that if one of the
25482 /// operands is UNDEF then the result is UNDEF.
25483 static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
25484   // Look for the following pattern: if
25485   //   A = < float a0, float a1, float a2, float a3 >
25486   //   B = < float b0, float b1, float b2, float b3 >
25487   // and
25488   //   LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
25489   //   RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
25490   // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
25491   // which is A horizontal-op B.
25492
25493   // At least one of the operands should be a vector shuffle.
25494   if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
25495       RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
25496     return false;
25497
25498   MVT VT = LHS.getSimpleValueType();
25499
25500   assert((VT.is128BitVector() || VT.is256BitVector()) &&
25501          "Unsupported vector type for horizontal add/sub");
25502
25503   // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to
25504   // operate independently on 128-bit lanes.
25505   unsigned NumElts = VT.getVectorNumElements();
25506   unsigned NumLanes = VT.getSizeInBits()/128;
25507   unsigned NumLaneElts = NumElts / NumLanes;
25508   assert((NumLaneElts % 2 == 0) &&
25509          "Vector type should have an even number of elements in each lane");
25510   unsigned HalfLaneElts = NumLaneElts/2;
25511
25512   // View LHS in the form
25513   //   LHS = VECTOR_SHUFFLE A, B, LMask
25514   // If LHS is not a shuffle then pretend it is the shuffle
25515   //   LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
25516   // NOTE: in what follows a default initialized SDValue represents an UNDEF of
25517   // type VT.
25518   SDValue A, B;
25519   SmallVector<int, 16> LMask(NumElts);
25520   if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
25521     if (LHS.getOperand(0).getOpcode() != ISD::UNDEF)
25522       A = LHS.getOperand(0);
25523     if (LHS.getOperand(1).getOpcode() != ISD::UNDEF)
25524       B = LHS.getOperand(1);
25525     ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
25526     std::copy(Mask.begin(), Mask.end(), LMask.begin());
25527   } else {
25528     if (LHS.getOpcode() != ISD::UNDEF)
25529       A = LHS;
25530     for (unsigned i = 0; i != NumElts; ++i)
25531       LMask[i] = i;
25532   }
25533
25534   // Likewise, view RHS in the form
25535   //   RHS = VECTOR_SHUFFLE C, D, RMask
25536   SDValue C, D;
25537   SmallVector<int, 16> RMask(NumElts);
25538   if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
25539     if (RHS.getOperand(0).getOpcode() != ISD::UNDEF)
25540       C = RHS.getOperand(0);
25541     if (RHS.getOperand(1).getOpcode() != ISD::UNDEF)
25542       D = RHS.getOperand(1);
25543     ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
25544     std::copy(Mask.begin(), Mask.end(), RMask.begin());
25545   } else {
25546     if (RHS.getOpcode() != ISD::UNDEF)
25547       C = RHS;
25548     for (unsigned i = 0; i != NumElts; ++i)
25549       RMask[i] = i;
25550   }
25551
25552   // Check that the shuffles are both shuffling the same vectors.
25553   if (!(A == C && B == D) && !(A == D && B == C))
25554     return false;
25555
25556   // If everything is UNDEF then bail out: it would be better to fold to UNDEF.
25557   if (!A.getNode() && !B.getNode())
25558     return false;
25559
25560   // If A and B occur in reverse order in RHS, then "swap" them (which means
25561   // rewriting the mask).
25562   if (A != C)
25563     CommuteVectorShuffleMask(RMask, NumElts);
25564
25565   // At this point LHS and RHS are equivalent to
25566   //   LHS = VECTOR_SHUFFLE A, B, LMask
25567   //   RHS = VECTOR_SHUFFLE A, B, RMask
25568   // Check that the masks correspond to performing a horizontal operation.
25569   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
25570     for (unsigned i = 0; i != NumLaneElts; ++i) {
25571       int LIdx = LMask[i+l], RIdx = RMask[i+l];
25572
25573       // Ignore any UNDEF components.
25574       if (LIdx < 0 || RIdx < 0 ||
25575           (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
25576           (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
25577         continue;
25578
25579       // Check that successive elements are being operated on.  If not, this is
25580       // not a horizontal operation.
25581       unsigned Src = (i/HalfLaneElts); // each lane is split between srcs
25582       int Index = 2*(i%HalfLaneElts) + NumElts*Src + l;
25583       if (!(LIdx == Index && RIdx == Index + 1) &&
25584           !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
25585         return false;
25586     }
25587   }
25588
25589   LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
25590   RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
25591   return true;
25592 }
25593
25594 /// Do target-specific dag combines on floating point adds.
25595 static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG,
25596                                   const X86Subtarget *Subtarget) {
25597   EVT VT = N->getValueType(0);
25598   SDValue LHS = N->getOperand(0);
25599   SDValue RHS = N->getOperand(1);
25600
25601   // Try to synthesize horizontal adds from adds of shuffles.
25602   if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
25603        (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
25604       isHorizontalBinOp(LHS, RHS, true))
25605     return DAG.getNode(X86ISD::FHADD, SDLoc(N), VT, LHS, RHS);
25606   return SDValue();
25607 }
25608
25609 /// Do target-specific dag combines on floating point subs.
25610 static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG,
25611                                   const X86Subtarget *Subtarget) {
25612   EVT VT = N->getValueType(0);
25613   SDValue LHS = N->getOperand(0);
25614   SDValue RHS = N->getOperand(1);
25615
25616   // Try to synthesize horizontal subs from subs of shuffles.
25617   if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
25618        (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
25619       isHorizontalBinOp(LHS, RHS, false))
25620     return DAG.getNode(X86ISD::FHSUB, SDLoc(N), VT, LHS, RHS);
25621   return SDValue();
25622 }
25623
25624 /// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
25625 static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) {
25626   assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
25627
25628   // F[X]OR(0.0, x) -> x
25629   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
25630     if (C->getValueAPF().isPosZero())
25631       return N->getOperand(1);
25632
25633   // F[X]OR(x, 0.0) -> x
25634   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
25635     if (C->getValueAPF().isPosZero())
25636       return N->getOperand(0);
25637   return SDValue();
25638 }
25639
25640 /// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
25641 static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) {
25642   assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
25643
25644   // Only perform optimizations if UnsafeMath is used.
25645   if (!DAG.getTarget().Options.UnsafeFPMath)
25646     return SDValue();
25647
25648   // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
25649   // into FMINC and FMAXC, which are Commutative operations.
25650   unsigned NewOp = 0;
25651   switch (N->getOpcode()) {
25652     default: llvm_unreachable("unknown opcode");
25653     case X86ISD::FMIN:  NewOp = X86ISD::FMINC; break;
25654     case X86ISD::FMAX:  NewOp = X86ISD::FMAXC; break;
25655   }
25656
25657   return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
25658                      N->getOperand(0), N->getOperand(1));
25659 }
25660
25661 /// Do target-specific dag combines on X86ISD::FAND nodes.
25662 static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
25663   // FAND(0.0, x) -> 0.0
25664   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
25665     if (C->getValueAPF().isPosZero())
25666       return N->getOperand(0);
25667
25668   // FAND(x, 0.0) -> 0.0
25669   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
25670     if (C->getValueAPF().isPosZero())
25671       return N->getOperand(1);
25672
25673   return SDValue();
25674 }
25675
25676 /// Do target-specific dag combines on X86ISD::FANDN nodes
25677 static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG) {
25678   // FANDN(0.0, x) -> x
25679   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
25680     if (C->getValueAPF().isPosZero())
25681       return N->getOperand(1);
25682
25683   // FANDN(x, 0.0) -> 0.0
25684   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
25685     if (C->getValueAPF().isPosZero())
25686       return N->getOperand(1);
25687
25688   return SDValue();
25689 }
25690
25691 static SDValue PerformBTCombine(SDNode *N,
25692                                 SelectionDAG &DAG,
25693                                 TargetLowering::DAGCombinerInfo &DCI) {
25694   // BT ignores high bits in the bit index operand.
25695   SDValue Op1 = N->getOperand(1);
25696   if (Op1.hasOneUse()) {
25697     unsigned BitWidth = Op1.getValueSizeInBits();
25698     APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
25699     APInt KnownZero, KnownOne;
25700     TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
25701                                           !DCI.isBeforeLegalizeOps());
25702     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25703     if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) ||
25704         TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO))
25705       DCI.CommitTargetLoweringOpt(TLO);
25706   }
25707   return SDValue();
25708 }
25709
25710 static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) {
25711   SDValue Op = N->getOperand(0);
25712   if (Op.getOpcode() == ISD::BITCAST)
25713     Op = Op.getOperand(0);
25714   EVT VT = N->getValueType(0), OpVT = Op.getValueType();
25715   if (Op.getOpcode() == X86ISD::VZEXT_LOAD &&
25716       VT.getVectorElementType().getSizeInBits() ==
25717       OpVT.getVectorElementType().getSizeInBits()) {
25718     return DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
25719   }
25720   return SDValue();
25721 }
25722
25723 static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG,
25724                                                const X86Subtarget *Subtarget) {
25725   EVT VT = N->getValueType(0);
25726   if (!VT.isVector())
25727     return SDValue();
25728
25729   SDValue N0 = N->getOperand(0);
25730   SDValue N1 = N->getOperand(1);
25731   EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
25732   SDLoc dl(N);
25733
25734   // The SIGN_EXTEND_INREG to v4i64 is expensive operation on the
25735   // both SSE and AVX2 since there is no sign-extended shift right
25736   // operation on a vector with 64-bit elements.
25737   //(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) ->
25738   // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT)))
25739   if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
25740       N0.getOpcode() == ISD::SIGN_EXTEND)) {
25741     SDValue N00 = N0.getOperand(0);
25742
25743     // EXTLOAD has a better solution on AVX2,
25744     // it may be replaced with X86ISD::VSEXT node.
25745     if (N00.getOpcode() == ISD::LOAD && Subtarget->hasInt256())
25746       if (!ISD::isNormalLoad(N00.getNode()))
25747         return SDValue();
25748
25749     if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
25750         SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
25751                                   N00, N1);
25752       return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
25753     }
25754   }
25755   return SDValue();
25756 }
25757
25758 static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
25759                                   TargetLowering::DAGCombinerInfo &DCI,
25760                                   const X86Subtarget *Subtarget) {
25761   SDValue N0 = N->getOperand(0);
25762   EVT VT = N->getValueType(0);
25763
25764   // (i8,i32 sext (sdivrem (i8 x, i8 y)) ->
25765   // (i8,i32 (sdivrem_sext_hreg (i8 x, i8 y)
25766   // This exposes the sext to the sdivrem lowering, so that it directly extends
25767   // from AH (which we otherwise need to do contortions to access).
25768   if (N0.getOpcode() == ISD::SDIVREM && N0.getResNo() == 1 &&
25769       N0.getValueType() == MVT::i8 && VT == MVT::i32) {
25770     SDLoc dl(N);
25771     SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
25772     SDValue R = DAG.getNode(X86ISD::SDIVREM8_SEXT_HREG, dl, NodeTys,
25773                             N0.getOperand(0), N0.getOperand(1));
25774     DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
25775     return R.getValue(1);
25776   }
25777
25778   if (!DCI.isBeforeLegalizeOps())
25779     return SDValue();
25780
25781   if (!Subtarget->hasFp256())
25782     return SDValue();
25783
25784   if (VT.isVector() && VT.getSizeInBits() == 256) {
25785     SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget);
25786     if (R.getNode())
25787       return R;
25788   }
25789
25790   return SDValue();
25791 }
25792
25793 static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG,
25794                                  const X86Subtarget* Subtarget) {
25795   SDLoc dl(N);
25796   EVT VT = N->getValueType(0);
25797
25798   // Let legalize expand this if it isn't a legal type yet.
25799   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
25800     return SDValue();
25801
25802   EVT ScalarVT = VT.getScalarType();
25803   if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
25804       (!Subtarget->hasFMA() && !Subtarget->hasFMA4()))
25805     return SDValue();
25806
25807   SDValue A = N->getOperand(0);
25808   SDValue B = N->getOperand(1);
25809   SDValue C = N->getOperand(2);
25810
25811   bool NegA = (A.getOpcode() == ISD::FNEG);
25812   bool NegB = (B.getOpcode() == ISD::FNEG);
25813   bool NegC = (C.getOpcode() == ISD::FNEG);
25814
25815   // Negative multiplication when NegA xor NegB
25816   bool NegMul = (NegA != NegB);
25817   if (NegA)
25818     A = A.getOperand(0);
25819   if (NegB)
25820     B = B.getOperand(0);
25821   if (NegC)
25822     C = C.getOperand(0);
25823
25824   unsigned Opcode;
25825   if (!NegMul)
25826     Opcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB;
25827   else
25828     Opcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;
25829
25830   return DAG.getNode(Opcode, dl, VT, A, B, C);
25831 }
25832
25833 static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG,
25834                                   TargetLowering::DAGCombinerInfo &DCI,
25835                                   const X86Subtarget *Subtarget) {
25836   // (i32 zext (and (i8  x86isd::setcc_carry), 1)) ->
25837   //           (and (i32 x86isd::setcc_carry), 1)
25838   // This eliminates the zext. This transformation is necessary because
25839   // ISD::SETCC is always legalized to i8.
25840   SDLoc dl(N);
25841   SDValue N0 = N->getOperand(0);
25842   EVT VT = N->getValueType(0);
25843
25844   if (N0.getOpcode() == ISD::AND &&
25845       N0.hasOneUse() &&
25846       N0.getOperand(0).hasOneUse()) {
25847     SDValue N00 = N0.getOperand(0);
25848     if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
25849       ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
25850       if (!C || C->getZExtValue() != 1)
25851         return SDValue();
25852       return DAG.getNode(ISD::AND, dl, VT,
25853                          DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
25854                                      N00.getOperand(0), N00.getOperand(1)),
25855                          DAG.getConstant(1, VT));
25856     }
25857   }
25858
25859   if (N0.getOpcode() == ISD::TRUNCATE &&
25860       N0.hasOneUse() &&
25861       N0.getOperand(0).hasOneUse()) {
25862     SDValue N00 = N0.getOperand(0);
25863     if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
25864       return DAG.getNode(ISD::AND, dl, VT,
25865                          DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
25866                                      N00.getOperand(0), N00.getOperand(1)),
25867                          DAG.getConstant(1, VT));
25868     }
25869   }
25870   if (VT.is256BitVector()) {
25871     SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget);
25872     if (R.getNode())
25873       return R;
25874   }
25875
25876   // (i8,i32 zext (udivrem (i8 x, i8 y)) ->
25877   // (i8,i32 (udivrem_zext_hreg (i8 x, i8 y)
25878   // This exposes the zext to the udivrem lowering, so that it directly extends
25879   // from AH (which we otherwise need to do contortions to access).
25880   if (N0.getOpcode() == ISD::UDIVREM &&
25881       N0.getResNo() == 1 && N0.getValueType() == MVT::i8 &&
25882       (VT == MVT::i32 || VT == MVT::i64)) {
25883     SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
25884     SDValue R = DAG.getNode(X86ISD::UDIVREM8_ZEXT_HREG, dl, NodeTys,
25885                             N0.getOperand(0), N0.getOperand(1));
25886     DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
25887     return R.getValue(1);
25888   }
25889
25890   return SDValue();
25891 }
25892
25893 // Optimize x == -y --> x+y == 0
25894 //          x != -y --> x+y != 0
25895 static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG,
25896                                       const X86Subtarget* Subtarget) {
25897   ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
25898   SDValue LHS = N->getOperand(0);
25899   SDValue RHS = N->getOperand(1);
25900   EVT VT = N->getValueType(0);
25901   SDLoc DL(N);
25902
25903   if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB)
25904     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(LHS.getOperand(0)))
25905       if (C->getAPIntValue() == 0 && LHS.hasOneUse()) {
25906         SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N),
25907                                    LHS.getValueType(), RHS, LHS.getOperand(1));
25908         return DAG.getSetCC(SDLoc(N), N->getValueType(0),
25909                             addV, DAG.getConstant(0, addV.getValueType()), CC);
25910       }
25911   if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB)
25912     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS.getOperand(0)))
25913       if (C->getAPIntValue() == 0 && RHS.hasOneUse()) {
25914         SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N),
25915                                    RHS.getValueType(), LHS, RHS.getOperand(1));
25916         return DAG.getSetCC(SDLoc(N), N->getValueType(0),
25917                             addV, DAG.getConstant(0, addV.getValueType()), CC);
25918       }
25919
25920   if (VT.getScalarType() == MVT::i1) {
25921     bool IsSEXT0 = (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
25922       (LHS.getOperand(0).getValueType().getScalarType() ==  MVT::i1);
25923     bool IsVZero0 = ISD::isBuildVectorAllZeros(LHS.getNode());
25924     if (!IsSEXT0 && !IsVZero0)
25925       return SDValue();
25926     bool IsSEXT1 = (RHS.getOpcode() == ISD::SIGN_EXTEND) &&
25927       (RHS.getOperand(0).getValueType().getScalarType() ==  MVT::i1);
25928     bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
25929
25930     if (!IsSEXT1 && !IsVZero1)
25931       return SDValue();
25932
25933     if (IsSEXT0 && IsVZero1) {
25934       assert(VT == LHS.getOperand(0).getValueType() && "Uexpected operand type");
25935       if (CC == ISD::SETEQ)
25936         return DAG.getNOT(DL, LHS.getOperand(0), VT);
25937       return LHS.getOperand(0);
25938     }
25939     if (IsSEXT1 && IsVZero0) {
25940       assert(VT == RHS.getOperand(0).getValueType() && "Uexpected operand type");
25941       if (CC == ISD::SETEQ)
25942         return DAG.getNOT(DL, RHS.getOperand(0), VT);
25943       return RHS.getOperand(0);
25944     }
25945   }
25946
25947   return SDValue();
25948 }
25949
25950 static SDValue PerformINSERTPSCombine(SDNode *N, SelectionDAG &DAG,
25951                                       const X86Subtarget *Subtarget) {
25952   SDLoc dl(N);
25953   MVT VT = N->getOperand(1)->getSimpleValueType(0);
25954   assert((VT == MVT::v4f32 || VT == MVT::v4i32) &&
25955          "X86insertps is only defined for v4x32");
25956
25957   SDValue Ld = N->getOperand(1);
25958   if (MayFoldLoad(Ld)) {
25959     // Extract the countS bits from the immediate so we can get the proper
25960     // address when narrowing the vector load to a specific element.
25961     // When the second source op is a memory address, interps doesn't use
25962     // countS and just gets an f32 from that address.
25963     unsigned DestIndex =
25964         cast<ConstantSDNode>(N->getOperand(2))->getZExtValue() >> 6;
25965     Ld = NarrowVectorLoadToElement(cast<LoadSDNode>(Ld), DestIndex, DAG);
25966   } else
25967     return SDValue();
25968
25969   // Create this as a scalar to vector to match the instruction pattern.
25970   SDValue LoadScalarToVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Ld);
25971   // countS bits are ignored when loading from memory on insertps, which
25972   // means we don't need to explicitly set them to 0.
25973   return DAG.getNode(X86ISD::INSERTPS, dl, VT, N->getOperand(0),
25974                      LoadScalarToVector, N->getOperand(2));
25975 }
25976
25977 // Helper function of PerformSETCCCombine. It is to materialize "setb reg"
25978 // as "sbb reg,reg", since it can be extended without zext and produces
25979 // an all-ones bit which is more useful than 0/1 in some cases.
25980 static SDValue MaterializeSETB(SDLoc DL, SDValue EFLAGS, SelectionDAG &DAG,
25981                                MVT VT) {
25982   if (VT == MVT::i8)
25983     return DAG.getNode(ISD::AND, DL, VT,
25984                        DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
25985                                    DAG.getConstant(X86::COND_B, MVT::i8), EFLAGS),
25986                        DAG.getConstant(1, VT));
25987   assert (VT == MVT::i1 && "Unexpected type for SECCC node");
25988   return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1,
25989                      DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
25990                                  DAG.getConstant(X86::COND_B, MVT::i8), EFLAGS));
25991 }
25992
25993 // Optimize  RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
25994 static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG,
25995                                    TargetLowering::DAGCombinerInfo &DCI,
25996                                    const X86Subtarget *Subtarget) {
25997   SDLoc DL(N);
25998   X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
25999   SDValue EFLAGS = N->getOperand(1);
26000
26001   if (CC == X86::COND_A) {
26002     // Try to convert COND_A into COND_B in an attempt to facilitate
26003     // materializing "setb reg".
26004     //
26005     // Do not flip "e > c", where "c" is a constant, because Cmp instruction
26006     // cannot take an immediate as its first operand.
26007     //
26008     if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
26009         EFLAGS.getValueType().isInteger() &&
26010         !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
26011       SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
26012                                    EFLAGS.getNode()->getVTList(),
26013                                    EFLAGS.getOperand(1), EFLAGS.getOperand(0));
26014       SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
26015       return MaterializeSETB(DL, NewEFLAGS, DAG, N->getSimpleValueType(0));
26016     }
26017   }
26018
26019   // Materialize "setb reg" as "sbb reg,reg", since it can be extended without
26020   // a zext and produces an all-ones bit which is more useful than 0/1 in some
26021   // cases.
26022   if (CC == X86::COND_B)
26023     return MaterializeSETB(DL, EFLAGS, DAG, N->getSimpleValueType(0));
26024
26025   SDValue Flags;
26026
26027   Flags = checkBoolTestSetCCCombine(EFLAGS, CC);
26028   if (Flags.getNode()) {
26029     SDValue Cond = DAG.getConstant(CC, MVT::i8);
26030     return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags);
26031   }
26032
26033   return SDValue();
26034 }
26035
26036 // Optimize branch condition evaluation.
26037 //
26038 static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG,
26039                                     TargetLowering::DAGCombinerInfo &DCI,
26040                                     const X86Subtarget *Subtarget) {
26041   SDLoc DL(N);
26042   SDValue Chain = N->getOperand(0);
26043   SDValue Dest = N->getOperand(1);
26044   SDValue EFLAGS = N->getOperand(3);
26045   X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
26046
26047   SDValue Flags;
26048
26049   Flags = checkBoolTestSetCCCombine(EFLAGS, CC);
26050   if (Flags.getNode()) {
26051     SDValue Cond = DAG.getConstant(CC, MVT::i8);
26052     return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), Chain, Dest, Cond,
26053                        Flags);
26054   }
26055
26056   return SDValue();
26057 }
26058
26059 static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
26060                                                          SelectionDAG &DAG) {
26061   // Take advantage of vector comparisons producing 0 or -1 in each lane to
26062   // optimize away operation when it's from a constant.
26063   //
26064   // The general transformation is:
26065   //    UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
26066   //       AND(VECTOR_CMP(x,y), constant2)
26067   //    constant2 = UNARYOP(constant)
26068
26069   // Early exit if this isn't a vector operation, the operand of the
26070   // unary operation isn't a bitwise AND, or if the sizes of the operations
26071   // aren't the same.
26072   EVT VT = N->getValueType(0);
26073   if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
26074       N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
26075       VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
26076     return SDValue();
26077
26078   // Now check that the other operand of the AND is a constant. We could
26079   // make the transformation for non-constant splats as well, but it's unclear
26080   // that would be a benefit as it would not eliminate any operations, just
26081   // perform one more step in scalar code before moving to the vector unit.
26082   if (BuildVectorSDNode *BV =
26083           dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
26084     // Bail out if the vector isn't a constant.
26085     if (!BV->isConstant())
26086       return SDValue();
26087
26088     // Everything checks out. Build up the new and improved node.
26089     SDLoc DL(N);
26090     EVT IntVT = BV->getValueType(0);
26091     // Create a new constant of the appropriate type for the transformed
26092     // DAG.
26093     SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
26094     // The AND node needs bitcasts to/from an integer vector type around it.
26095     SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
26096     SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
26097                                  N->getOperand(0)->getOperand(0), MaskConst);
26098     SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
26099     return Res;
26100   }
26101
26102   return SDValue();
26103 }
26104
26105 static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
26106                                         const X86Subtarget *Subtarget) {
26107   // First try to optimize away the conversion entirely when it's
26108   // conditionally from a constant. Vectors only.
26109   SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG);
26110   if (Res != SDValue())
26111     return Res;
26112
26113   // Now move on to more general possibilities.
26114   SDValue Op0 = N->getOperand(0);
26115   EVT InVT = Op0->getValueType(0);
26116
26117   // SINT_TO_FP(v4i8) -> SINT_TO_FP(SEXT(v4i8 to v4i32))
26118   if (InVT == MVT::v8i8 || InVT == MVT::v4i8) {
26119     SDLoc dl(N);
26120     MVT DstVT = InVT == MVT::v4i8 ? MVT::v4i32 : MVT::v8i32;
26121     SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
26122     return DAG.getNode(ISD::SINT_TO_FP, dl, N->getValueType(0), P);
26123   }
26124
26125   // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
26126   // a 32-bit target where SSE doesn't support i64->FP operations.
26127   if (Op0.getOpcode() == ISD::LOAD) {
26128     LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
26129     EVT VT = Ld->getValueType(0);
26130     if (!Ld->isVolatile() && !N->getValueType(0).isVector() &&
26131         ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
26132         !Subtarget->is64Bit() && VT == MVT::i64) {
26133       SDValue FILDChain = Subtarget->getTargetLowering()->BuildFILD(
26134           SDValue(N, 0), Ld->getValueType(0), Ld->getChain(), Op0, DAG);
26135       DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
26136       return FILDChain;
26137     }
26138   }
26139   return SDValue();
26140 }
26141
26142 // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
26143 static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG,
26144                                  X86TargetLowering::DAGCombinerInfo &DCI) {
26145   // If the LHS and RHS of the ADC node are zero, then it can't overflow and
26146   // the result is either zero or one (depending on the input carry bit).
26147   // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
26148   if (X86::isZeroNode(N->getOperand(0)) &&
26149       X86::isZeroNode(N->getOperand(1)) &&
26150       // We don't have a good way to replace an EFLAGS use, so only do this when
26151       // dead right now.
26152       SDValue(N, 1).use_empty()) {
26153     SDLoc DL(N);
26154     EVT VT = N->getValueType(0);
26155     SDValue CarryOut = DAG.getConstant(0, N->getValueType(1));
26156     SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
26157                                DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
26158                                            DAG.getConstant(X86::COND_B,MVT::i8),
26159                                            N->getOperand(2)),
26160                                DAG.getConstant(1, VT));
26161     return DCI.CombineTo(N, Res1, CarryOut);
26162   }
26163
26164   return SDValue();
26165 }
26166
26167 // fold (add Y, (sete  X, 0)) -> adc  0, Y
26168 //      (add Y, (setne X, 0)) -> sbb -1, Y
26169 //      (sub (sete  X, 0), Y) -> sbb  0, Y
26170 //      (sub (setne X, 0), Y) -> adc -1, Y
26171 static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) {
26172   SDLoc DL(N);
26173
26174   // Look through ZExts.
26175   SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 1 : 0);
26176   if (Ext.getOpcode() != ISD::ZERO_EXTEND || !Ext.hasOneUse())
26177     return SDValue();
26178
26179   SDValue SetCC = Ext.getOperand(0);
26180   if (SetCC.getOpcode() != X86ISD::SETCC || !SetCC.hasOneUse())
26181     return SDValue();
26182
26183   X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0);
26184   if (CC != X86::COND_E && CC != X86::COND_NE)
26185     return SDValue();
26186
26187   SDValue Cmp = SetCC.getOperand(1);
26188   if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
26189       !X86::isZeroNode(Cmp.getOperand(1)) ||
26190       !Cmp.getOperand(0).getValueType().isInteger())
26191     return SDValue();
26192
26193   SDValue CmpOp0 = Cmp.getOperand(0);
26194   SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0,
26195                                DAG.getConstant(1, CmpOp0.getValueType()));
26196
26197   SDValue OtherVal = N->getOperand(N->getOpcode() == ISD::SUB ? 0 : 1);
26198   if (CC == X86::COND_NE)
26199     return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::ADC : X86ISD::SBB,
26200                        DL, OtherVal.getValueType(), OtherVal,
26201                        DAG.getConstant(-1ULL, OtherVal.getValueType()), NewCmp);
26202   return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::SBB : X86ISD::ADC,
26203                      DL, OtherVal.getValueType(), OtherVal,
26204                      DAG.getConstant(0, OtherVal.getValueType()), NewCmp);
26205 }
26206
26207 /// PerformADDCombine - Do target-specific dag combines on integer adds.
26208 static SDValue PerformAddCombine(SDNode *N, SelectionDAG &DAG,
26209                                  const X86Subtarget *Subtarget) {
26210   EVT VT = N->getValueType(0);
26211   SDValue Op0 = N->getOperand(0);
26212   SDValue Op1 = N->getOperand(1);
26213
26214   // Try to synthesize horizontal adds from adds of shuffles.
26215   if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
26216        (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
26217       isHorizontalBinOp(Op0, Op1, true))
26218     return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);
26219
26220   return OptimizeConditionalInDecrement(N, DAG);
26221 }
26222
26223 static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG,
26224                                  const X86Subtarget *Subtarget) {
26225   SDValue Op0 = N->getOperand(0);
26226   SDValue Op1 = N->getOperand(1);
26227
26228   // X86 can't encode an immediate LHS of a sub. See if we can push the
26229   // negation into a preceding instruction.
26230   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
26231     // If the RHS of the sub is a XOR with one use and a constant, invert the
26232     // immediate. Then add one to the LHS of the sub so we can turn
26233     // X-Y -> X+~Y+1, saving one register.
26234     if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
26235         isa<ConstantSDNode>(Op1.getOperand(1))) {
26236       APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
26237       EVT VT = Op0.getValueType();
26238       SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
26239                                    Op1.getOperand(0),
26240                                    DAG.getConstant(~XorC, VT));
26241       return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
26242                          DAG.getConstant(C->getAPIntValue()+1, VT));
26243     }
26244   }
26245
26246   // Try to synthesize horizontal adds from adds of shuffles.
26247   EVT VT = N->getValueType(0);
26248   if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
26249        (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
26250       isHorizontalBinOp(Op0, Op1, true))
26251     return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);
26252
26253   return OptimizeConditionalInDecrement(N, DAG);
26254 }
26255
26256 /// performVZEXTCombine - Performs build vector combines
26257 static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG,
26258                                    TargetLowering::DAGCombinerInfo &DCI,
26259                                    const X86Subtarget *Subtarget) {
26260   SDLoc DL(N);
26261   MVT VT = N->getSimpleValueType(0);
26262   SDValue Op = N->getOperand(0);
26263   MVT OpVT = Op.getSimpleValueType();
26264   MVT OpEltVT = OpVT.getVectorElementType();
26265   unsigned InputBits = OpEltVT.getSizeInBits() * VT.getVectorNumElements();
26266
26267   // (vzext (bitcast (vzext (x)) -> (vzext x)
26268   SDValue V = Op;
26269   while (V.getOpcode() == ISD::BITCAST)
26270     V = V.getOperand(0);
26271
26272   if (V != Op && V.getOpcode() == X86ISD::VZEXT) {
26273     MVT InnerVT = V.getSimpleValueType();
26274     MVT InnerEltVT = InnerVT.getVectorElementType();
26275
26276     // If the element sizes match exactly, we can just do one larger vzext. This
26277     // is always an exact type match as vzext operates on integer types.
26278     if (OpEltVT == InnerEltVT) {
26279       assert(OpVT == InnerVT && "Types must match for vzext!");
26280       return DAG.getNode(X86ISD::VZEXT, DL, VT, V.getOperand(0));
26281     }
26282
26283     // The only other way we can combine them is if only a single element of the
26284     // inner vzext is used in the input to the outer vzext.
26285     if (InnerEltVT.getSizeInBits() < InputBits)
26286       return SDValue();
26287
26288     // In this case, the inner vzext is completely dead because we're going to
26289     // only look at bits inside of the low element. Just do the outer vzext on
26290     // a bitcast of the input to the inner.
26291     return DAG.getNode(X86ISD::VZEXT, DL, VT,
26292                        DAG.getNode(ISD::BITCAST, DL, OpVT, V));
26293   }
26294
26295   // Check if we can bypass extracting and re-inserting an element of an input
26296   // vector. Essentialy:
26297   // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
26298   if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
26299       V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
26300       V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) {
26301     SDValue ExtractedV = V.getOperand(0);
26302     SDValue OrigV = ExtractedV.getOperand(0);
26303     if (auto *ExtractIdx = dyn_cast<ConstantSDNode>(ExtractedV.getOperand(1)))
26304       if (ExtractIdx->getZExtValue() == 0) {
26305         MVT OrigVT = OrigV.getSimpleValueType();
26306         // Extract a subvector if necessary...
26307         if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) {
26308           int Ratio = OrigVT.getSizeInBits() / OpVT.getSizeInBits();
26309           OrigVT = MVT::getVectorVT(OrigVT.getVectorElementType(),
26310                                     OrigVT.getVectorNumElements() / Ratio);
26311           OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV,
26312                               DAG.getIntPtrConstant(0));
26313         }
26314         Op = DAG.getNode(ISD::BITCAST, DL, OpVT, OrigV);
26315         return DAG.getNode(X86ISD::VZEXT, DL, VT, Op);
26316       }
26317   }
26318
26319   return SDValue();
26320 }
26321
26322 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
26323                                              DAGCombinerInfo &DCI) const {
26324   SelectionDAG &DAG = DCI.DAG;
26325   switch (N->getOpcode()) {
26326   default: break;
26327   case ISD::EXTRACT_VECTOR_ELT:
26328     return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, DCI);
26329   case ISD::VSELECT:
26330   case ISD::SELECT:
26331   case X86ISD::SHRUNKBLEND:
26332     return PerformSELECTCombine(N, DAG, DCI, Subtarget);
26333   case ISD::BITCAST:        return PerformBITCASTCombine(N, DAG);
26334   case X86ISD::CMOV:        return PerformCMOVCombine(N, DAG, DCI, Subtarget);
26335   case ISD::ADD:            return PerformAddCombine(N, DAG, Subtarget);
26336   case ISD::SUB:            return PerformSubCombine(N, DAG, Subtarget);
26337   case X86ISD::ADC:         return PerformADCCombine(N, DAG, DCI);
26338   case ISD::MUL:            return PerformMulCombine(N, DAG, DCI);
26339   case ISD::SHL:
26340   case ISD::SRA:
26341   case ISD::SRL:            return PerformShiftCombine(N, DAG, DCI, Subtarget);
26342   case ISD::AND:            return PerformAndCombine(N, DAG, DCI, Subtarget);
26343   case ISD::OR:             return PerformOrCombine(N, DAG, DCI, Subtarget);
26344   case ISD::XOR:            return PerformXorCombine(N, DAG, DCI, Subtarget);
26345   case ISD::LOAD:           return PerformLOADCombine(N, DAG, DCI, Subtarget);
26346   case ISD::MLOAD:          return PerformMLOADCombine(N, DAG, DCI, Subtarget);
26347   case ISD::STORE:          return PerformSTORECombine(N, DAG, Subtarget);
26348   case ISD::MSTORE:         return PerformMSTORECombine(N, DAG, Subtarget);
26349   case ISD::SINT_TO_FP:     return PerformSINT_TO_FPCombine(N, DAG, Subtarget);
26350   case ISD::FADD:           return PerformFADDCombine(N, DAG, Subtarget);
26351   case ISD::FSUB:           return PerformFSUBCombine(N, DAG, Subtarget);
26352   case X86ISD::FXOR:
26353   case X86ISD::FOR:         return PerformFORCombine(N, DAG);
26354   case X86ISD::FMIN:
26355   case X86ISD::FMAX:        return PerformFMinFMaxCombine(N, DAG);
26356   case X86ISD::FAND:        return PerformFANDCombine(N, DAG);
26357   case X86ISD::FANDN:       return PerformFANDNCombine(N, DAG);
26358   case X86ISD::BT:          return PerformBTCombine(N, DAG, DCI);
26359   case X86ISD::VZEXT_MOVL:  return PerformVZEXT_MOVLCombine(N, DAG);
26360   case ISD::ANY_EXTEND:
26361   case ISD::ZERO_EXTEND:    return PerformZExtCombine(N, DAG, DCI, Subtarget);
26362   case ISD::SIGN_EXTEND:    return PerformSExtCombine(N, DAG, DCI, Subtarget);
26363   case ISD::SIGN_EXTEND_INREG:
26364     return PerformSIGN_EXTEND_INREGCombine(N, DAG, Subtarget);
26365   case ISD::TRUNCATE:       return PerformTruncateCombine(N, DAG,DCI,Subtarget);
26366   case ISD::SETCC:          return PerformISDSETCCCombine(N, DAG, Subtarget);
26367   case X86ISD::SETCC:       return PerformSETCCCombine(N, DAG, DCI, Subtarget);
26368   case X86ISD::BRCOND:      return PerformBrCondCombine(N, DAG, DCI, Subtarget);
26369   case X86ISD::VZEXT:       return performVZEXTCombine(N, DAG, DCI, Subtarget);
26370   case X86ISD::SHUFP:       // Handle all target specific shuffles
26371   case X86ISD::PALIGNR:
26372   case X86ISD::UNPCKH:
26373   case X86ISD::UNPCKL:
26374   case X86ISD::MOVHLPS:
26375   case X86ISD::MOVLHPS:
26376   case X86ISD::PSHUFB:
26377   case X86ISD::PSHUFD:
26378   case X86ISD::PSHUFHW:
26379   case X86ISD::PSHUFLW:
26380   case X86ISD::MOVSS:
26381   case X86ISD::MOVSD:
26382   case X86ISD::VPERMILPI:
26383   case X86ISD::VPERM2X128:
26384   case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget);
26385   case ISD::FMA:            return PerformFMACombine(N, DAG, Subtarget);
26386   case ISD::INTRINSIC_WO_CHAIN:
26387     return PerformINTRINSIC_WO_CHAINCombine(N, DAG, Subtarget);
26388   case X86ISD::INSERTPS: {
26389     if (getTargetMachine().getOptLevel() > CodeGenOpt::None)
26390       return PerformINSERTPSCombine(N, DAG, Subtarget);
26391     break;
26392   }
26393   case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DAG, Subtarget);
26394   }
26395
26396   return SDValue();
26397 }
26398
26399 /// isTypeDesirableForOp - Return true if the target has native support for
26400 /// the specified value type and it is 'desirable' to use the type for the
26401 /// given node type. e.g. On x86 i16 is legal, but undesirable since i16
26402 /// instruction encodings are longer and some i16 instructions are slow.
26403 bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
26404   if (!isTypeLegal(VT))
26405     return false;
26406   if (VT != MVT::i16)
26407     return true;
26408
26409   switch (Opc) {
26410   default:
26411     return true;
26412   case ISD::LOAD:
26413   case ISD::SIGN_EXTEND:
26414   case ISD::ZERO_EXTEND:
26415   case ISD::ANY_EXTEND:
26416   case ISD::SHL:
26417   case ISD::SRL:
26418   case ISD::SUB:
26419   case ISD::ADD:
26420   case ISD::MUL:
26421   case ISD::AND:
26422   case ISD::OR:
26423   case ISD::XOR:
26424     return false;
26425   }
26426 }
26427
26428 /// IsDesirableToPromoteOp - This method query the target whether it is
26429 /// beneficial for dag combiner to promote the specified node. If true, it
26430 /// should return the desired promotion type by reference.
26431 bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
26432   EVT VT = Op.getValueType();
26433   if (VT != MVT::i16)
26434     return false;
26435
26436   bool Promote = false;
26437   bool Commute = false;
26438   switch (Op.getOpcode()) {
26439   default: break;
26440   case ISD::LOAD: {
26441     LoadSDNode *LD = cast<LoadSDNode>(Op);
26442     // If the non-extending load has a single use and it's not live out, then it
26443     // might be folded.
26444     if (LD->getExtensionType() == ISD::NON_EXTLOAD /*&&
26445                                                      Op.hasOneUse()*/) {
26446       for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
26447              UE = Op.getNode()->use_end(); UI != UE; ++UI) {
26448         // The only case where we'd want to promote LOAD (rather then it being
26449         // promoted as an operand is when it's only use is liveout.
26450         if (UI->getOpcode() != ISD::CopyToReg)
26451           return false;
26452       }
26453     }
26454     Promote = true;
26455     break;
26456   }
26457   case ISD::SIGN_EXTEND:
26458   case ISD::ZERO_EXTEND:
26459   case ISD::ANY_EXTEND:
26460     Promote = true;
26461     break;
26462   case ISD::SHL:
26463   case ISD::SRL: {
26464     SDValue N0 = Op.getOperand(0);
26465     // Look out for (store (shl (load), x)).
26466     if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
26467       return false;
26468     Promote = true;
26469     break;
26470   }
26471   case ISD::ADD:
26472   case ISD::MUL:
26473   case ISD::AND:
26474   case ISD::OR:
26475   case ISD::XOR:
26476     Commute = true;
26477     // fallthrough
26478   case ISD::SUB: {
26479     SDValue N0 = Op.getOperand(0);
26480     SDValue N1 = Op.getOperand(1);
26481     if (!Commute && MayFoldLoad(N1))
26482       return false;
26483     // Avoid disabling potential load folding opportunities.
26484     if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
26485       return false;
26486     if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
26487       return false;
26488     Promote = true;
26489   }
26490   }
26491
26492   PVT = MVT::i32;
26493   return Promote;
26494 }
26495
26496 //===----------------------------------------------------------------------===//
26497 //                           X86 Inline Assembly Support
26498 //===----------------------------------------------------------------------===//
26499
26500 namespace {
26501   // Helper to match a string separated by whitespace.
26502   bool matchAsmImpl(StringRef s, ArrayRef<const StringRef *> args) {
26503     s = s.substr(s.find_first_not_of(" \t")); // Skip leading whitespace.
26504
26505     for (unsigned i = 0, e = args.size(); i != e; ++i) {
26506       StringRef piece(*args[i]);
26507       if (!s.startswith(piece)) // Check if the piece matches.
26508         return false;
26509
26510       s = s.substr(piece.size());
26511       StringRef::size_type pos = s.find_first_not_of(" \t");
26512       if (pos == 0) // We matched a prefix.
26513         return false;
26514
26515       s = s.substr(pos);
26516     }
26517
26518     return s.empty();
26519   }
26520   const VariadicFunction1<bool, StringRef, StringRef, matchAsmImpl> matchAsm={};
26521 }
26522
26523 static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
26524
26525   if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
26526     if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
26527         std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
26528         std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {
26529
26530       if (AsmPieces.size() == 3)
26531         return true;
26532       else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
26533         return true;
26534     }
26535   }
26536   return false;
26537 }
26538
26539 bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
26540   InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
26541
26542   std::string AsmStr = IA->getAsmString();
26543
26544   IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
26545   if (!Ty || Ty->getBitWidth() % 16 != 0)
26546     return false;
26547
26548   // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
26549   SmallVector<StringRef, 4> AsmPieces;
26550   SplitString(AsmStr, AsmPieces, ";\n");
26551
26552   switch (AsmPieces.size()) {
26553   default: return false;
26554   case 1:
26555     // FIXME: this should verify that we are targeting a 486 or better.  If not,
26556     // we will turn this bswap into something that will be lowered to logical
26557     // ops instead of emitting the bswap asm.  For now, we don't support 486 or
26558     // lower so don't worry about this.
26559     // bswap $0
26560     if (matchAsm(AsmPieces[0], "bswap", "$0") ||
26561         matchAsm(AsmPieces[0], "bswapl", "$0") ||
26562         matchAsm(AsmPieces[0], "bswapq", "$0") ||
26563         matchAsm(AsmPieces[0], "bswap", "${0:q}") ||
26564         matchAsm(AsmPieces[0], "bswapl", "${0:q}") ||
26565         matchAsm(AsmPieces[0], "bswapq", "${0:q}")) {
26566       // No need to check constraints, nothing other than the equivalent of
26567       // "=r,0" would be valid here.
26568       return IntrinsicLowering::LowerToByteSwap(CI);
26569     }
26570
26571     // rorw $$8, ${0:w}  -->  llvm.bswap.i16
26572     if (CI->getType()->isIntegerTy(16) &&
26573         IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
26574         (matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") ||
26575          matchAsm(AsmPieces[0], "rolw", "$$8,", "${0:w}"))) {
26576       AsmPieces.clear();
26577       const std::string &ConstraintsStr = IA->getConstraintString();
26578       SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
26579       array_pod_sort(AsmPieces.begin(), AsmPieces.end());
26580       if (clobbersFlagRegisters(AsmPieces))
26581         return IntrinsicLowering::LowerToByteSwap(CI);
26582     }
26583     break;
26584   case 3:
26585     if (CI->getType()->isIntegerTy(32) &&
26586         IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
26587         matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") &&
26588         matchAsm(AsmPieces[1], "rorl", "$$16,", "$0") &&
26589         matchAsm(AsmPieces[2], "rorw", "$$8,", "${0:w}")) {
26590       AsmPieces.clear();
26591       const std::string &ConstraintsStr = IA->getConstraintString();
26592       SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
26593       array_pod_sort(AsmPieces.begin(), AsmPieces.end());
26594       if (clobbersFlagRegisters(AsmPieces))
26595         return IntrinsicLowering::LowerToByteSwap(CI);
26596     }
26597
26598     if (CI->getType()->isIntegerTy(64)) {
26599       InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
26600       if (Constraints.size() >= 2 &&
26601           Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
26602           Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
26603         // bswap %eax / bswap %edx / xchgl %eax, %edx  -> llvm.bswap.i64
26604         if (matchAsm(AsmPieces[0], "bswap", "%eax") &&
26605             matchAsm(AsmPieces[1], "bswap", "%edx") &&
26606             matchAsm(AsmPieces[2], "xchgl", "%eax,", "%edx"))
26607           return IntrinsicLowering::LowerToByteSwap(CI);
26608       }
26609     }
26610     break;
26611   }
26612   return false;
26613 }
26614
26615 /// getConstraintType - Given a constraint letter, return the type of
26616 /// constraint it is for this target.
26617 X86TargetLowering::ConstraintType
26618 X86TargetLowering::getConstraintType(const std::string &Constraint) const {
26619   if (Constraint.size() == 1) {
26620     switch (Constraint[0]) {
26621     case 'R':
26622     case 'q':
26623     case 'Q':
26624     case 'f':
26625     case 't':
26626     case 'u':
26627     case 'y':
26628     case 'x':
26629     case 'Y':
26630     case 'l':
26631       return C_RegisterClass;
26632     case 'a':
26633     case 'b':
26634     case 'c':
26635     case 'd':
26636     case 'S':
26637     case 'D':
26638     case 'A':
26639       return C_Register;
26640     case 'I':
26641     case 'J':
26642     case 'K':
26643     case 'L':
26644     case 'M':
26645     case 'N':
26646     case 'G':
26647     case 'C':
26648     case 'e':
26649     case 'Z':
26650       return C_Other;
26651     default:
26652       break;
26653     }
26654   }
26655   return TargetLowering::getConstraintType(Constraint);
26656 }
26657
26658 /// Examine constraint type and operand type and determine a weight value.
26659 /// This object must already have been set up with the operand type
26660 /// and the current alternative constraint selected.
26661 TargetLowering::ConstraintWeight
26662   X86TargetLowering::getSingleConstraintMatchWeight(
26663     AsmOperandInfo &info, const char *constraint) const {
26664   ConstraintWeight weight = CW_Invalid;
26665   Value *CallOperandVal = info.CallOperandVal;
26666     // If we don't have a value, we can't do a match,
26667     // but allow it at the lowest weight.
26668   if (!CallOperandVal)
26669     return CW_Default;
26670   Type *type = CallOperandVal->getType();
26671   // Look at the constraint type.
26672   switch (*constraint) {
26673   default:
26674     weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
26675   case 'R':
26676   case 'q':
26677   case 'Q':
26678   case 'a':
26679   case 'b':
26680   case 'c':
26681   case 'd':
26682   case 'S':
26683   case 'D':
26684   case 'A':
26685     if (CallOperandVal->getType()->isIntegerTy())
26686       weight = CW_SpecificReg;
26687     break;
26688   case 'f':
26689   case 't':
26690   case 'u':
26691     if (type->isFloatingPointTy())
26692       weight = CW_SpecificReg;
26693     break;
26694   case 'y':
26695     if (type->isX86_MMXTy() && Subtarget->hasMMX())
26696       weight = CW_SpecificReg;
26697     break;
26698   case 'x':
26699   case 'Y':
26700     if (((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasSSE1()) ||
26701         ((type->getPrimitiveSizeInBits() == 256) && Subtarget->hasFp256()))
26702       weight = CW_Register;
26703     break;
26704   case 'I':
26705     if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
26706       if (C->getZExtValue() <= 31)
26707         weight = CW_Constant;
26708     }
26709     break;
26710   case 'J':
26711     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26712       if (C->getZExtValue() <= 63)
26713         weight = CW_Constant;
26714     }
26715     break;
26716   case 'K':
26717     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26718       if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
26719         weight = CW_Constant;
26720     }
26721     break;
26722   case 'L':
26723     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26724       if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
26725         weight = CW_Constant;
26726     }
26727     break;
26728   case 'M':
26729     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26730       if (C->getZExtValue() <= 3)
26731         weight = CW_Constant;
26732     }
26733     break;
26734   case 'N':
26735     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26736       if (C->getZExtValue() <= 0xff)
26737         weight = CW_Constant;
26738     }
26739     break;
26740   case 'G':
26741   case 'C':
26742     if (dyn_cast<ConstantFP>(CallOperandVal)) {
26743       weight = CW_Constant;
26744     }
26745     break;
26746   case 'e':
26747     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26748       if ((C->getSExtValue() >= -0x80000000LL) &&
26749           (C->getSExtValue() <= 0x7fffffffLL))
26750         weight = CW_Constant;
26751     }
26752     break;
26753   case 'Z':
26754     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26755       if (C->getZExtValue() <= 0xffffffff)
26756         weight = CW_Constant;
26757     }
26758     break;
26759   }
26760   return weight;
26761 }
26762
26763 /// LowerXConstraint - try to replace an X constraint, which matches anything,
26764 /// with another that has more specific requirements based on the type of the
26765 /// corresponding operand.
26766 const char *X86TargetLowering::
26767 LowerXConstraint(EVT ConstraintVT) const {
26768   // FP X constraints get lowered to SSE1/2 registers if available, otherwise
26769   // 'f' like normal targets.
26770   if (ConstraintVT.isFloatingPoint()) {
26771     if (Subtarget->hasSSE2())
26772       return "Y";
26773     if (Subtarget->hasSSE1())
26774       return "x";
26775   }
26776
26777   return TargetLowering::LowerXConstraint(ConstraintVT);
26778 }
26779
26780 /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
26781 /// vector.  If it is invalid, don't add anything to Ops.
26782 void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
26783                                                      std::string &Constraint,
26784                                                      std::vector<SDValue>&Ops,
26785                                                      SelectionDAG &DAG) const {
26786   SDValue Result;
26787
26788   // Only support length 1 constraints for now.
26789   if (Constraint.length() > 1) return;
26790
26791   char ConstraintLetter = Constraint[0];
26792   switch (ConstraintLetter) {
26793   default: break;
26794   case 'I':
26795     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26796       if (C->getZExtValue() <= 31) {
26797         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26798         break;
26799       }
26800     }
26801     return;
26802   case 'J':
26803     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26804       if (C->getZExtValue() <= 63) {
26805         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26806         break;
26807       }
26808     }
26809     return;
26810   case 'K':
26811     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26812       if (isInt<8>(C->getSExtValue())) {
26813         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26814         break;
26815       }
26816     }
26817     return;
26818   case 'L':
26819     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26820       if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
26821           (Subtarget->is64Bit() && C->getZExtValue() == 0xffffffff)) {
26822         Result = DAG.getTargetConstant(C->getSExtValue(), Op.getValueType());
26823         break;
26824       }
26825     }
26826     return;
26827   case 'M':
26828     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26829       if (C->getZExtValue() <= 3) {
26830         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26831         break;
26832       }
26833     }
26834     return;
26835   case 'N':
26836     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26837       if (C->getZExtValue() <= 255) {
26838         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26839         break;
26840       }
26841     }
26842     return;
26843   case 'O':
26844     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26845       if (C->getZExtValue() <= 127) {
26846         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26847         break;
26848       }
26849     }
26850     return;
26851   case 'e': {
26852     // 32-bit signed value
26853     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26854       if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
26855                                            C->getSExtValue())) {
26856         // Widen to 64 bits here to get it sign extended.
26857         Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64);
26858         break;
26859       }
26860     // FIXME gcc accepts some relocatable values here too, but only in certain
26861     // memory models; it's complicated.
26862     }
26863     return;
26864   }
26865   case 'Z': {
26866     // 32-bit unsigned value
26867     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26868       if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
26869                                            C->getZExtValue())) {
26870         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26871         break;
26872       }
26873     }
26874     // FIXME gcc accepts some relocatable values here too, but only in certain
26875     // memory models; it's complicated.
26876     return;
26877   }
26878   case 'i': {
26879     // Literal immediates are always ok.
26880     if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
26881       // Widen to 64 bits here to get it sign extended.
26882       Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64);
26883       break;
26884     }
26885
26886     // In any sort of PIC mode addresses need to be computed at runtime by
26887     // adding in a register or some sort of table lookup.  These can't
26888     // be used as immediates.
26889     if (Subtarget->isPICStyleGOT() || Subtarget->isPICStyleStubPIC())
26890       return;
26891
26892     // If we are in non-pic codegen mode, we allow the address of a global (with
26893     // an optional displacement) to be used with 'i'.
26894     GlobalAddressSDNode *GA = nullptr;
26895     int64_t Offset = 0;
26896
26897     // Match either (GA), (GA+C), (GA+C1+C2), etc.
26898     while (1) {
26899       if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
26900         Offset += GA->getOffset();
26901         break;
26902       } else if (Op.getOpcode() == ISD::ADD) {
26903         if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
26904           Offset += C->getZExtValue();
26905           Op = Op.getOperand(0);
26906           continue;
26907         }
26908       } else if (Op.getOpcode() == ISD::SUB) {
26909         if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
26910           Offset += -C->getZExtValue();
26911           Op = Op.getOperand(0);
26912           continue;
26913         }
26914       }
26915
26916       // Otherwise, this isn't something we can handle, reject it.
26917       return;
26918     }
26919
26920     const GlobalValue *GV = GA->getGlobal();
26921     // If we require an extra load to get this address, as in PIC mode, we
26922     // can't accept it.
26923     if (isGlobalStubReference(
26924             Subtarget->ClassifyGlobalReference(GV, DAG.getTarget())))
26925       return;
26926
26927     Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
26928                                         GA->getValueType(0), Offset);
26929     break;
26930   }
26931   }
26932
26933   if (Result.getNode()) {
26934     Ops.push_back(Result);
26935     return;
26936   }
26937   return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
26938 }
26939
26940 std::pair<unsigned, const TargetRegisterClass*>
26941 X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
26942                                                 MVT VT) const {
26943   // First, see if this is a constraint that directly corresponds to an LLVM
26944   // register class.
26945   if (Constraint.size() == 1) {
26946     // GCC Constraint Letters
26947     switch (Constraint[0]) {
26948     default: break;
26949       // TODO: Slight differences here in allocation order and leaving
26950       // RIP in the class. Do they matter any more here than they do
26951       // in the normal allocation?
26952     case 'q':   // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
26953       if (Subtarget->is64Bit()) {
26954         if (VT == MVT::i32 || VT == MVT::f32)
26955           return std::make_pair(0U, &X86::GR32RegClass);
26956         if (VT == MVT::i16)
26957           return std::make_pair(0U, &X86::GR16RegClass);
26958         if (VT == MVT::i8 || VT == MVT::i1)
26959           return std::make_pair(0U, &X86::GR8RegClass);
26960         if (VT == MVT::i64 || VT == MVT::f64)
26961           return std::make_pair(0U, &X86::GR64RegClass);
26962         break;
26963       }
26964       // 32-bit fallthrough
26965     case 'Q':   // Q_REGS
26966       if (VT == MVT::i32 || VT == MVT::f32)
26967         return std::make_pair(0U, &X86::GR32_ABCDRegClass);
26968       if (VT == MVT::i16)
26969         return std::make_pair(0U, &X86::GR16_ABCDRegClass);
26970       if (VT == MVT::i8 || VT == MVT::i1)
26971         return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
26972       if (VT == MVT::i64)
26973         return std::make_pair(0U, &X86::GR64_ABCDRegClass);
26974       break;
26975     case 'r':   // GENERAL_REGS
26976     case 'l':   // INDEX_REGS
26977       if (VT == MVT::i8 || VT == MVT::i1)
26978         return std::make_pair(0U, &X86::GR8RegClass);
26979       if (VT == MVT::i16)
26980         return std::make_pair(0U, &X86::GR16RegClass);
26981       if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget->is64Bit())
26982         return std::make_pair(0U, &X86::GR32RegClass);
26983       return std::make_pair(0U, &X86::GR64RegClass);
26984     case 'R':   // LEGACY_REGS
26985       if (VT == MVT::i8 || VT == MVT::i1)
26986         return std::make_pair(0U, &X86::GR8_NOREXRegClass);
26987       if (VT == MVT::i16)
26988         return std::make_pair(0U, &X86::GR16_NOREXRegClass);
26989       if (VT == MVT::i32 || !Subtarget->is64Bit())
26990         return std::make_pair(0U, &X86::GR32_NOREXRegClass);
26991       return std::make_pair(0U, &X86::GR64_NOREXRegClass);
26992     case 'f':  // FP Stack registers.
26993       // If SSE is enabled for this VT, use f80 to ensure the isel moves the
26994       // value to the correct fpstack register class.
26995       if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
26996         return std::make_pair(0U, &X86::RFP32RegClass);
26997       if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
26998         return std::make_pair(0U, &X86::RFP64RegClass);
26999       return std::make_pair(0U, &X86::RFP80RegClass);
27000     case 'y':   // MMX_REGS if MMX allowed.
27001       if (!Subtarget->hasMMX()) break;
27002       return std::make_pair(0U, &X86::VR64RegClass);
27003     case 'Y':   // SSE_REGS if SSE2 allowed
27004       if (!Subtarget->hasSSE2()) break;
27005       // FALL THROUGH.
27006     case 'x':   // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
27007       if (!Subtarget->hasSSE1()) break;
27008
27009       switch (VT.SimpleTy) {
27010       default: break;
27011       // Scalar SSE types.
27012       case MVT::f32:
27013       case MVT::i32:
27014         return std::make_pair(0U, &X86::FR32RegClass);
27015       case MVT::f64:
27016       case MVT::i64:
27017         return std::make_pair(0U, &X86::FR64RegClass);
27018       // Vector types.
27019       case MVT::v16i8:
27020       case MVT::v8i16:
27021       case MVT::v4i32:
27022       case MVT::v2i64:
27023       case MVT::v4f32:
27024       case MVT::v2f64:
27025         return std::make_pair(0U, &X86::VR128RegClass);
27026       // AVX types.
27027       case MVT::v32i8:
27028       case MVT::v16i16:
27029       case MVT::v8i32:
27030       case MVT::v4i64:
27031       case MVT::v8f32:
27032       case MVT::v4f64:
27033         return std::make_pair(0U, &X86::VR256RegClass);
27034       case MVT::v8f64:
27035       case MVT::v16f32:
27036       case MVT::v16i32:
27037       case MVT::v8i64:
27038         return std::make_pair(0U, &X86::VR512RegClass);
27039       }
27040       break;
27041     }
27042   }
27043
27044   // Use the default implementation in TargetLowering to convert the register
27045   // constraint into a member of a register class.
27046   std::pair<unsigned, const TargetRegisterClass*> Res;
27047   Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
27048
27049   // Not found as a standard register?
27050   if (!Res.second) {
27051     // Map st(0) -> st(7) -> ST0
27052     if (Constraint.size() == 7 && Constraint[0] == '{' &&
27053         tolower(Constraint[1]) == 's' &&
27054         tolower(Constraint[2]) == 't' &&
27055         Constraint[3] == '(' &&
27056         (Constraint[4] >= '0' && Constraint[4] <= '7') &&
27057         Constraint[5] == ')' &&
27058         Constraint[6] == '}') {
27059
27060       Res.first = X86::FP0+Constraint[4]-'0';
27061       Res.second = &X86::RFP80RegClass;
27062       return Res;
27063     }
27064
27065     // GCC allows "st(0)" to be called just plain "st".
27066     if (StringRef("{st}").equals_lower(Constraint)) {
27067       Res.first = X86::FP0;
27068       Res.second = &X86::RFP80RegClass;
27069       return Res;
27070     }
27071
27072     // flags -> EFLAGS
27073     if (StringRef("{flags}").equals_lower(Constraint)) {
27074       Res.first = X86::EFLAGS;
27075       Res.second = &X86::CCRRegClass;
27076       return Res;
27077     }
27078
27079     // 'A' means EAX + EDX.
27080     if (Constraint == "A") {
27081       Res.first = X86::EAX;
27082       Res.second = &X86::GR32_ADRegClass;
27083       return Res;
27084     }
27085     return Res;
27086   }
27087
27088   // Otherwise, check to see if this is a register class of the wrong value
27089   // type.  For example, we want to map "{ax},i32" -> {eax}, we don't want it to
27090   // turn into {ax},{dx}.
27091   if (Res.second->hasType(VT))
27092     return Res;   // Correct type already, nothing to do.
27093
27094   // All of the single-register GCC register classes map their values onto
27095   // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp".  If we
27096   // really want an 8-bit or 32-bit register, map to the appropriate register
27097   // class and return the appropriate register.
27098   if (Res.second == &X86::GR16RegClass) {
27099     if (VT == MVT::i8 || VT == MVT::i1) {
27100       unsigned DestReg = 0;
27101       switch (Res.first) {
27102       default: break;
27103       case X86::AX: DestReg = X86::AL; break;
27104       case X86::DX: DestReg = X86::DL; break;
27105       case X86::CX: DestReg = X86::CL; break;
27106       case X86::BX: DestReg = X86::BL; break;
27107       }
27108       if (DestReg) {
27109         Res.first = DestReg;
27110         Res.second = &X86::GR8RegClass;
27111       }
27112     } else if (VT == MVT::i32 || VT == MVT::f32) {
27113       unsigned DestReg = 0;
27114       switch (Res.first) {
27115       default: break;
27116       case X86::AX: DestReg = X86::EAX; break;
27117       case X86::DX: DestReg = X86::EDX; break;
27118       case X86::CX: DestReg = X86::ECX; break;
27119       case X86::BX: DestReg = X86::EBX; break;
27120       case X86::SI: DestReg = X86::ESI; break;
27121       case X86::DI: DestReg = X86::EDI; break;
27122       case X86::BP: DestReg = X86::EBP; break;
27123       case X86::SP: DestReg = X86::ESP; break;
27124       }
27125       if (DestReg) {
27126         Res.first = DestReg;
27127         Res.second = &X86::GR32RegClass;
27128       }
27129     } else if (VT == MVT::i64 || VT == MVT::f64) {
27130       unsigned DestReg = 0;
27131       switch (Res.first) {
27132       default: break;
27133       case X86::AX: DestReg = X86::RAX; break;
27134       case X86::DX: DestReg = X86::RDX; break;
27135       case X86::CX: DestReg = X86::RCX; break;
27136       case X86::BX: DestReg = X86::RBX; break;
27137       case X86::SI: DestReg = X86::RSI; break;
27138       case X86::DI: DestReg = X86::RDI; break;
27139       case X86::BP: DestReg = X86::RBP; break;
27140       case X86::SP: DestReg = X86::RSP; break;
27141       }
27142       if (DestReg) {
27143         Res.first = DestReg;
27144         Res.second = &X86::GR64RegClass;
27145       }
27146     }
27147   } else if (Res.second == &X86::FR32RegClass ||
27148              Res.second == &X86::FR64RegClass ||
27149              Res.second == &X86::VR128RegClass ||
27150              Res.second == &X86::VR256RegClass ||
27151              Res.second == &X86::FR32XRegClass ||
27152              Res.second == &X86::FR64XRegClass ||
27153              Res.second == &X86::VR128XRegClass ||
27154              Res.second == &X86::VR256XRegClass ||
27155              Res.second == &X86::VR512RegClass) {
27156     // Handle references to XMM physical registers that got mapped into the
27157     // wrong class.  This can happen with constraints like {xmm0} where the
27158     // target independent register mapper will just pick the first match it can
27159     // find, ignoring the required type.
27160
27161     if (VT == MVT::f32 || VT == MVT::i32)
27162       Res.second = &X86::FR32RegClass;
27163     else if (VT == MVT::f64 || VT == MVT::i64)
27164       Res.second = &X86::FR64RegClass;
27165     else if (X86::VR128RegClass.hasType(VT))
27166       Res.second = &X86::VR128RegClass;
27167     else if (X86::VR256RegClass.hasType(VT))
27168       Res.second = &X86::VR256RegClass;
27169     else if (X86::VR512RegClass.hasType(VT))
27170       Res.second = &X86::VR512RegClass;
27171   }
27172
27173   return Res;
27174 }
27175
27176 int X86TargetLowering::getScalingFactorCost(const AddrMode &AM,
27177                                             Type *Ty) const {
27178   // Scaling factors are not free at all.
27179   // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
27180   // will take 2 allocations in the out of order engine instead of 1
27181   // for plain addressing mode, i.e. inst (reg1).
27182   // E.g.,
27183   // vaddps (%rsi,%drx), %ymm0, %ymm1
27184   // Requires two allocations (one for the load, one for the computation)
27185   // whereas:
27186   // vaddps (%rsi), %ymm0, %ymm1
27187   // Requires just 1 allocation, i.e., freeing allocations for other operations
27188   // and having less micro operations to execute.
27189   //
27190   // For some X86 architectures, this is even worse because for instance for
27191   // stores, the complex addressing mode forces the instruction to use the
27192   // "load" ports instead of the dedicated "store" port.
27193   // E.g., on Haswell:
27194   // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
27195   // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
27196   if (isLegalAddressingMode(AM, Ty))
27197     // Scale represents reg2 * scale, thus account for 1
27198     // as soon as we use a second register.
27199     return AM.Scale != 0;
27200   return -1;
27201 }
27202
27203 bool X86TargetLowering::isTargetFTOL() const {
27204   return Subtarget->isTargetKnownWindowsMSVC() && !Subtarget->is64Bit();
27205 }