lib/Target/X86/X86ISelLowering.cpp

   1 //===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 // This file defines the interfaces that X86 uses to lower LLVM code into a
  11 // selection DAG.
  12 //
  13 //===----------------------------------------------------------------------===//
  14
  15 #include "X86ISelLowering.h"
  16 #include "Utils/X86ShuffleDecode.h"
  17 #include "X86CallingConv.h"
  18 #include "X86InstrBuilder.h"
  19 #include "X86MachineFunctionInfo.h"
  20 #include "X86TargetMachine.h"
  21 #include "X86TargetObjectFile.h"
  22 #include "llvm/ADT/SmallBitVector.h"
  23 #include "llvm/ADT/SmallSet.h"
  24 #include "llvm/ADT/Statistic.h"
  25 #include "llvm/ADT/StringExtras.h"
  26 #include "llvm/ADT/StringSwitch.h"
  27 #include "llvm/ADT/VariadicFunction.h"
  28 #include "llvm/CodeGen/IntrinsicLowering.h"
  29 #include "llvm/CodeGen/MachineFrameInfo.h"
  30 #include "llvm/CodeGen/MachineFunction.h"
  31 #include "llvm/CodeGen/MachineInstrBuilder.h"
  32 #include "llvm/CodeGen/MachineJumpTableInfo.h"
  33 #include "llvm/CodeGen/MachineModuleInfo.h"
  34 #include "llvm/CodeGen/MachineRegisterInfo.h"
  35 #include "llvm/IR/CallSite.h"
  36 #include "llvm/IR/CallingConv.h"
  37 #include "llvm/IR/Constants.h"
  38 #include "llvm/IR/DerivedTypes.h"
  39 #include "llvm/IR/Function.h"
  40 #include "llvm/IR/GlobalAlias.h"
  41 #include "llvm/IR/GlobalVariable.h"
  42 #include "llvm/IR/Instructions.h"
  43 #include "llvm/IR/Intrinsics.h"
  44 #include "llvm/MC/MCAsmInfo.h"
  45 #include "llvm/MC/MCContext.h"
  46 #include "llvm/MC/MCExpr.h"
  47 #include "llvm/MC/MCSymbol.h"
  48 #include "llvm/Support/CommandLine.h"
  49 #include "llvm/Support/Debug.h"
  50 #include "llvm/Support/ErrorHandling.h"
  51 #include "llvm/Support/MathExtras.h"
  52 #include "llvm/Target/TargetOptions.h"
  53 #include "X86IntrinsicsInfo.h"
  54 #include <bitset>
  55 #include <numeric>
  56 #include <cctype>
  57 using namespace llvm;
  58
  59 #define DEBUG_TYPE "x86-isel"
   60
      // Statistic counter for tail calls; the STATISTIC macro registers it under
      // DEBUG_TYPE ("x86-isel").
   61 STATISTIC(NumTailCalls, "Number of tail calls");
   62
      // Hidden boolean flag, off by default.  Per its description it switches
      // vector type legalization from promotion to widening.
   63 static cl::opt<bool> ExperimentalVectorWideningLegalization(
   64     "x86-experimental-vector-widening-legalization", cl::init(false),
   65     cl::desc("Enable an experimental vector type legalization through widening "
   66              "rather than promotion."),
   67     cl::Hidden);
   68
      // Hidden boolean flag, on by default, that enables the experimental vector
      // shuffle lowering code path.
   69 static cl::opt<bool> ExperimentalVectorShuffleLowering(
   70     "x86-experimental-vector-shuffle-lowering", cl::init(true),
   71     cl::desc("Enable an experimental vector shuffle lowering code path."),
   72     cl::Hidden);
   73
      // Hidden boolean flag, off by default.  Its description states that it
      // should only be used together with the experimental shuffle lowering.
   74 static cl::opt<bool> ExperimentalVectorShuffleLegality(
   75     "x86-experimental-vector-shuffle-legality", cl::init(false),
   76     cl::desc("Enable experimental shuffle legality based on the experimental "
   77              "shuffle lowering. Should only be used with the experimental "
   78              "shuffle lowering."),
   79     cl::Hidden);
   80
      // Integer option marked cl::NotHidden, default 1: the number of
      // Newton-Raphson iterations applied to the hardware reciprocal estimate.
   81 static cl::opt<int> ReciprocalEstimateRefinementSteps(
   82     "x86-recip-refinement-steps", cl::init(1),
   83     cl::desc("Specify the number of Newton-Raphson iterations applied to the "
   84              "result of the hardware reciprocal estimate instruction."),
   85     cl::NotHidden);
   86
   87 // Forward declarations.
   88 static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
   89                        SDValue V2);
  90
  91 static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal,
  92                                 SelectionDAG &DAG, SDLoc dl,
  93                                 unsigned vectorWidth) {
  94   assert((vectorWidth == 128 || vectorWidth == 256) &&
  95          "Unsupported vector width");
  96   EVT VT = Vec.getValueType();
  97   EVT ElVT = VT.getVectorElementType();
  98   unsigned Factor = VT.getSizeInBits()/vectorWidth;
  99   EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
 100                                   VT.getVectorNumElements()/Factor);
 101
 102   // Extract from UNDEF is UNDEF.
 103   if (Vec.getOpcode() == ISD::UNDEF)
 104     return DAG.getUNDEF(ResultVT);
 105
 106   // Extract the relevant vectorWidth bits.  Generate an EXTRACT_SUBVECTOR
 107   unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
 108
 109   // This is the index of the first element of the vectorWidth-bit chunk
 110   // we want.
 111   unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / vectorWidth)
 112                                * ElemsPerChunk);
 113
 114   // If the input is a buildvector just emit a smaller one.
 115   if (Vec.getOpcode() == ISD::BUILD_VECTOR)
 116     return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
 117                        makeArrayRef(Vec->op_begin() + NormalizedIdxVal,
 118                                     ElemsPerChunk));
 119
 120   SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
 121   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
 122 }
 123
 124 /// Generate a DAG to grab 128-bits from a vector > 128 bits.  This
 125 /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
 126 /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
 127 /// instructions or a simple subregister reference. Idx is an index in the
 128 /// 128 bits we want.  It need not be aligned to a 128-bit boundary.  That makes
 129 /// lowering EXTRACT_VECTOR_ELT operations easier.
 130 static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
 131                                    SelectionDAG &DAG, SDLoc dl) {
 132   assert((Vec.getValueType().is256BitVector() ||
 133           Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
 134   return ExtractSubVector(Vec, IdxVal, DAG, dl, 128);
 135 }
 136
  137 /// Generate a DAG to grab 256-bits from a 512-bit vector.
      /// IdxVal is an element index inside the wanted 256 bits; ExtractSubVector
      /// rounds it down to the first element of the enclosing 256-bit chunk.
      /// Asserts that Vec is a 512-bit vector.
  138 static SDValue Extract256BitVector(SDValue Vec, unsigned IdxVal,
  139                                    SelectionDAG &DAG, SDLoc dl) {
  140   assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
  141   return ExtractSubVector(Vec, IdxVal, DAG, dl, 256);
  142 }
 143
 144 static SDValue InsertSubVector(SDValue Result, SDValue Vec,
 145                                unsigned IdxVal, SelectionDAG &DAG,
 146                                SDLoc dl, unsigned vectorWidth) {
 147   assert((vectorWidth == 128 || vectorWidth == 256) &&
 148          "Unsupported vector width");
 149   // Inserting UNDEF is Result
 150   if (Vec.getOpcode() == ISD::UNDEF)
 151     return Result;
 152   EVT VT = Vec.getValueType();
 153   EVT ElVT = VT.getVectorElementType();
 154   EVT ResultVT = Result.getValueType();
 155
 156   // Insert the relevant vectorWidth bits.
 157   unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
 158
 159   // This is the index of the first element of the vectorWidth-bit chunk
 160   // we want.
 161   unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/vectorWidth)
 162                                * ElemsPerChunk);
 163
 164   SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
 165   return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
 166 }
 167
 168 /// Generate a DAG to put 128-bits into a vector > 128 bits.  This
 169 /// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
 170 /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
 171 /// simple superregister reference.  Idx is an index in the 128 bits
 172 /// we want.  It need not be aligned to a 128-bit boundary.  That makes
 173 /// lowering INSERT_VECTOR_ELT operations easier.
 174 static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
 175                                   SelectionDAG &DAG,SDLoc dl) {
 176   assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
 177   return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
 178 }
  179
      /// Generate a DAG to put the 256-bit vector Vec into Result.  IdxVal is an
      /// element index inside the destination 256 bits; InsertSubVector rounds it
      /// down to the first element of the enclosing 256-bit chunk.  Asserts that
      /// Vec is a 256-bit vector.
  180 static SDValue Insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
  181                                   SelectionDAG &DAG, SDLoc dl) {
  182   assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
  183   return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
  184 }
 185
 186 /// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128
 187 /// instructions. This is used because creating CONCAT_VECTOR nodes of
 188 /// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
 189 /// large BUILD_VECTORS.
 190 static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
 191                                    unsigned NumElems, SelectionDAG &DAG,
 192                                    SDLoc dl) {
 193   SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
 194   return Insert128BitVector(V, V2, NumElems/2, DAG, dl);
 195 }
  196
      /// Concat two 256-bit vectors into a vector of type VT using
      /// Insert256BitVector: starting from an undefined VT value, V1 is inserted
      /// at element 0 and V2 at element NumElems/2.
  197 static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
  198                                    unsigned NumElems, SelectionDAG &DAG,
  199                                    SDLoc dl) {
  200   SDValue V = Insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
  201   return Insert256BitVector(V, V2, NumElems/2, DAG, dl);
  202 }
 203
 204 // FIXME: This should stop caching the target machine as soon as
 205 // we can remove resetOperationActions et al.
 206 X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM)
 207     : TargetLowering(TM) {
 208   Subtarget = &TM.getSubtarget<X86Subtarget>();
 209   X86ScalarSSEf64 = Subtarget->hasSSE2();
 210   X86ScalarSSEf32 = Subtarget->hasSSE1();
 211   TD = getDataLayout();
 212
 213   resetOperationActions();
 214 }
 215
 216 void X86TargetLowering::resetOperationActions() {
 217   const TargetMachine &TM = getTargetMachine();
 218   static bool FirstTimeThrough = true;
 219
 220   // If none of the target options have changed, then we don't need to reset the
 221   // operation actions.
 222   if (!FirstTimeThrough && TO == TM.Options) return;
 223
 224   if (!FirstTimeThrough) {
 225     // Reinitialize the actions.
 226     initActions();
 227     FirstTimeThrough = false;
 228   }
 229
 230   TO = TM.Options;
 231
 232   // Set up the TargetLowering object.
 233   static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };
 234
 235   // X86 is weird. It always uses i8 for shift amounts and setcc results.
 236   setBooleanContents(ZeroOrOneBooleanContent);
 237   // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
 238   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
 239
 240   // For 64-bit, since we have so many registers, use the ILP scheduler.
 241   // For 32-bit, use the register pressure specific scheduling.
 242   // For Atom, always use ILP scheduling.
 243   if (Subtarget->isAtom())
 244     setSchedulingPreference(Sched::ILP);
 245   else if (Subtarget->is64Bit())
 246     setSchedulingPreference(Sched::ILP);
 247   else
 248     setSchedulingPreference(Sched::RegPressure);
 249   const X86RegisterInfo *RegInfo =
 250       TM.getSubtarget<X86Subtarget>().getRegisterInfo();
 251   setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
 252
 253   // Bypass expensive divides on Atom when compiling with O2.
 254   if (TM.getOptLevel() >= CodeGenOpt::Default) {
 255     if (Subtarget->hasSlowDivide32())
 256       addBypassSlowDiv(32, 8);
 257     if (Subtarget->hasSlowDivide64() && Subtarget->is64Bit())
 258       addBypassSlowDiv(64, 16);
 259   }
 260
 261   if (Subtarget->isTargetKnownWindowsMSVC()) {
 262     // Setup Windows compiler runtime calls.
 263     setLibcallName(RTLIB::SDIV_I64, "_alldiv");
 264     setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
 265     setLibcallName(RTLIB::SREM_I64, "_allrem");
 266     setLibcallName(RTLIB::UREM_I64, "_aullrem");
 267     setLibcallName(RTLIB::MUL_I64, "_allmul");
 268     setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
 269     setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
 270     setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
 271     setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
 272     setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
 273
 274     // The _ftol2 runtime function has an unusual calling conv, which
 275     // is modeled by a special pseudo-instruction.
 276     setLibcallName(RTLIB::FPTOUINT_F64_I64, nullptr);
 277     setLibcallName(RTLIB::FPTOUINT_F32_I64, nullptr);
 278     setLibcallName(RTLIB::FPTOUINT_F64_I32, nullptr);
 279     setLibcallName(RTLIB::FPTOUINT_F32_I32, nullptr);
 280   }
 281
 282   if (Subtarget->isTargetDarwin()) {
 283     // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
 284     setUseUnderscoreSetJmp(false);
 285     setUseUnderscoreLongJmp(false);
 286   } else if (Subtarget->isTargetWindowsGNU()) {
 287     // MS runtime is weird: it exports _setjmp, but longjmp!
 288     setUseUnderscoreSetJmp(true);
 289     setUseUnderscoreLongJmp(false);
 290   } else {
 291     setUseUnderscoreSetJmp(true);
 292     setUseUnderscoreLongJmp(true);
 293   }
 294
 295   // Set up the register classes.
 296   addRegisterClass(MVT::i8, &X86::GR8RegClass);
 297   addRegisterClass(MVT::i16, &X86::GR16RegClass);
 298   addRegisterClass(MVT::i32, &X86::GR32RegClass);
 299   if (Subtarget->is64Bit())
 300     addRegisterClass(MVT::i64, &X86::GR64RegClass);
 301
 302   for (MVT VT : MVT::integer_valuetypes())
 303     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
 304
 305   // We don't accept any truncstore of integer registers.
 306   setTruncStoreAction(MVT::i64, MVT::i32, Expand);
 307   setTruncStoreAction(MVT::i64, MVT::i16, Expand);
 308   setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
 309   setTruncStoreAction(MVT::i32, MVT::i16, Expand);
 310   setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
 311   setTruncStoreAction(MVT::i16, MVT::i8,  Expand);
 312
 313   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
 314
 315   // SETOEQ and SETUNE require checking two conditions.
 316   setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
 317   setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
 318   setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
 319   setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
 320   setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
 321   setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
 322
 323   // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
 324   // operation.
 325   setOperationAction(ISD::UINT_TO_FP       , MVT::i1   , Promote);
 326   setOperationAction(ISD::UINT_TO_FP       , MVT::i8   , Promote);
 327   setOperationAction(ISD::UINT_TO_FP       , MVT::i16  , Promote);
 328
 329   if (Subtarget->is64Bit()) {
 330     setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Promote);
 331     setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
 332   } else if (!TM.Options.UseSoftFloat) {
 333     // We have an algorithm for SSE2->double, and we turn this into a
 334     // 64-bit FILD followed by conditional FADD for other targets.
 335     setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
 336     // We have an algorithm for SSE2, and we turn this into a 64-bit
 337     // FILD for other targets.
 338     setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Custom);
 339   }
 340
 341   // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
 342   // this operation.
 343   setOperationAction(ISD::SINT_TO_FP       , MVT::i1   , Promote);
 344   setOperationAction(ISD::SINT_TO_FP       , MVT::i8   , Promote);
 345
 346   if (!TM.Options.UseSoftFloat) {
 347     // SSE has no i16 to fp conversion, only i32
 348     if (X86ScalarSSEf32) {
 349       setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
 350       // f32 and f64 cases are Legal, f80 case is not
 351       setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
 352     } else {
 353       setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Custom);
 354       setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
 355     }
 356   } else {
 357     setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
 358     setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Promote);
 359   }
 360
 361   // In 32-bit mode these are custom lowered.  In 64-bit mode F32 and F64
 362   // are Legal, f80 is custom lowered.
 363   setOperationAction(ISD::FP_TO_SINT     , MVT::i64  , Custom);
 364   setOperationAction(ISD::SINT_TO_FP     , MVT::i64  , Custom);
 365
 366   // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
 367   // this operation.
 368   setOperationAction(ISD::FP_TO_SINT       , MVT::i1   , Promote);
 369   setOperationAction(ISD::FP_TO_SINT       , MVT::i8   , Promote);
 370
 371   if (X86ScalarSSEf32) {
 372     setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
 373     // f32 and f64 cases are Legal, f80 case is not
 374     setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
 375   } else {
 376     setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Custom);
 377     setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
 378   }
 379
 380   // Handle FP_TO_UINT by promoting the destination to a larger signed
 381   // conversion.
 382   setOperationAction(ISD::FP_TO_UINT       , MVT::i1   , Promote);
 383   setOperationAction(ISD::FP_TO_UINT       , MVT::i8   , Promote);
 384   setOperationAction(ISD::FP_TO_UINT       , MVT::i16  , Promote);
 385
 386   if (Subtarget->is64Bit()) {
 387     setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Expand);
 388     setOperationAction(ISD::FP_TO_UINT     , MVT::i32  , Promote);
 389   } else if (!TM.Options.UseSoftFloat) {
 390     // Since AVX is a superset of SSE3, only check for SSE here.
 391     if (Subtarget->hasSSE1() && !Subtarget->hasSSE3())
 392       // Expand FP_TO_UINT into a select.
 393       // FIXME: We would like to use a Custom expander here eventually to do
 394       // the optimal thing for SSE vs. the default expansion in the legalizer.
 395       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Expand);
 396     else
 397       // With SSE3 we can use fisttpll to convert to a signed i64; without
 398       // SSE, we're stuck with a fistpll.
 399       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Custom);
 400   }
 401
 402   if (isTargetFTOL()) {
 403     // Use the _ftol2 runtime function, which has a pseudo-instruction
 404     // to handle its weird calling convention.
 405     setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Custom);
 406   }
 407
 408   // TODO: when we have SSE, these could be more efficient, by using movd/movq.
 409   if (!X86ScalarSSEf64) {
 410     setOperationAction(ISD::BITCAST        , MVT::f32  , Expand);
 411     setOperationAction(ISD::BITCAST        , MVT::i32  , Expand);
 412     if (Subtarget->is64Bit()) {
 413       setOperationAction(ISD::BITCAST      , MVT::f64  , Expand);
 414       // Without SSE, i64->f64 goes through memory.
 415       setOperationAction(ISD::BITCAST      , MVT::i64  , Expand);
 416     }
 417   }
 418
 419   // Scalar integer divide and remainder are lowered to use operations that
 420   // produce two results, to match the available instructions. This exposes
 421   // the two-result form to trivial CSE, which is able to combine x/y and x%y
 422   // into a single instruction.
 423   //
 424   // Scalar integer multiply-high is also lowered to use two-result
 425   // operations, to match the available instructions. However, plain multiply
 426   // (low) operations are left as Legal, as there are single-result
 427   // instructions for this in x86. Using the two-result multiply instructions
 428   // when both high and low results are needed must be arranged by dagcombine.
 429   for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
 430     MVT VT = IntVTs[i];
 431     setOperationAction(ISD::MULHS, VT, Expand);
 432     setOperationAction(ISD::MULHU, VT, Expand);
 433     setOperationAction(ISD::SDIV, VT, Expand);
 434     setOperationAction(ISD::UDIV, VT, Expand);
 435     setOperationAction(ISD::SREM, VT, Expand);
 436     setOperationAction(ISD::UREM, VT, Expand);
 437
 438     // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences.
 439     setOperationAction(ISD::ADDC, VT, Custom);
 440     setOperationAction(ISD::ADDE, VT, Custom);
 441     setOperationAction(ISD::SUBC, VT, Custom);
 442     setOperationAction(ISD::SUBE, VT, Custom);
 443   }
 444
 445   setOperationAction(ISD::BR_JT            , MVT::Other, Expand);
 446   setOperationAction(ISD::BRCOND           , MVT::Other, Custom);
 447   setOperationAction(ISD::BR_CC            , MVT::f32,   Expand);
 448   setOperationAction(ISD::BR_CC            , MVT::f64,   Expand);
 449   setOperationAction(ISD::BR_CC            , MVT::f80,   Expand);
 450   setOperationAction(ISD::BR_CC            , MVT::i8,    Expand);
 451   setOperationAction(ISD::BR_CC            , MVT::i16,   Expand);
 452   setOperationAction(ISD::BR_CC            , MVT::i32,   Expand);
 453   setOperationAction(ISD::BR_CC            , MVT::i64,   Expand);
 454   setOperationAction(ISD::SELECT_CC        , MVT::f32,   Expand);
 455   setOperationAction(ISD::SELECT_CC        , MVT::f64,   Expand);
 456   setOperationAction(ISD::SELECT_CC        , MVT::f80,   Expand);
 457   setOperationAction(ISD::SELECT_CC        , MVT::i8,    Expand);
 458   setOperationAction(ISD::SELECT_CC        , MVT::i16,   Expand);
 459   setOperationAction(ISD::SELECT_CC        , MVT::i32,   Expand);
 460   setOperationAction(ISD::SELECT_CC        , MVT::i64,   Expand);
 461   if (Subtarget->is64Bit())
 462     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
 463   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16  , Legal);
 464   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8   , Legal);
 465   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1   , Expand);
 466   setOperationAction(ISD::FP_ROUND_INREG   , MVT::f32  , Expand);
 467   setOperationAction(ISD::FREM             , MVT::f32  , Expand);
 468   setOperationAction(ISD::FREM             , MVT::f64  , Expand);
 469   setOperationAction(ISD::FREM             , MVT::f80  , Expand);
 470   setOperationAction(ISD::FLT_ROUNDS_      , MVT::i32  , Custom);
 471
 472   // Promote the i8 variants and force them on up to i32 which has a shorter
 473   // encoding.
 474   setOperationAction(ISD::CTTZ             , MVT::i8   , Promote);
 475   AddPromotedToType (ISD::CTTZ             , MVT::i8   , MVT::i32);
 476   setOperationAction(ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , Promote);
 477   AddPromotedToType (ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , MVT::i32);
 478   if (Subtarget->hasBMI()) {
 479     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16  , Expand);
 480     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32  , Expand);
 481     if (Subtarget->is64Bit())
 482       setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
 483   } else {
 484     setOperationAction(ISD::CTTZ           , MVT::i16  , Custom);
 485     setOperationAction(ISD::CTTZ           , MVT::i32  , Custom);
 486     if (Subtarget->is64Bit())
 487       setOperationAction(ISD::CTTZ         , MVT::i64  , Custom);
 488   }
 489
 490   if (Subtarget->hasLZCNT()) {
 491     // When promoting the i8 variants, force them to i32 for a shorter
 492     // encoding.
 493     setOperationAction(ISD::CTLZ           , MVT::i8   , Promote);
 494     AddPromotedToType (ISD::CTLZ           , MVT::i8   , MVT::i32);
 495     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Promote);
 496     AddPromotedToType (ISD::CTLZ_ZERO_UNDEF, MVT::i8   , MVT::i32);
 497     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Expand);
 498     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Expand);
 499     if (Subtarget->is64Bit())
 500       setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
 501   } else {
 502     setOperationAction(ISD::CTLZ           , MVT::i8   , Custom);
 503     setOperationAction(ISD::CTLZ           , MVT::i16  , Custom);
 504     setOperationAction(ISD::CTLZ           , MVT::i32  , Custom);
 505     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Custom);
 506     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Custom);
 507     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Custom);
 508     if (Subtarget->is64Bit()) {
 509       setOperationAction(ISD::CTLZ         , MVT::i64  , Custom);
 510       setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
 511     }
 512   }
 513
 514   // Special handling for half-precision floating point conversions.
 515   // If we don't have F16C support, then lower half float conversions
 516   // into library calls.
 517   if (TM.Options.UseSoftFloat || !Subtarget->hasF16C()) {
 518     setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
 519     setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
 520   }
 521
 522   // There's never any support for operations beyond MVT::f32.
 523   setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
 524   setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
 525   setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
 526   setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);
 527
 528   setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
 529   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
 530   setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
 531   setTruncStoreAction(MVT::f32, MVT::f16, Expand);
 532   setTruncStoreAction(MVT::f64, MVT::f16, Expand);
 533   setTruncStoreAction(MVT::f80, MVT::f16, Expand);
 534
 535   if (Subtarget->hasPOPCNT()) {
 536     setOperationAction(ISD::CTPOP          , MVT::i8   , Promote);
 537   } else {
 538     setOperationAction(ISD::CTPOP          , MVT::i8   , Expand);
 539     setOperationAction(ISD::CTPOP          , MVT::i16  , Expand);
 540     setOperationAction(ISD::CTPOP          , MVT::i32  , Expand);
 541     if (Subtarget->is64Bit())
 542       setOperationAction(ISD::CTPOP        , MVT::i64  , Expand);
 543   }
 544
 545   setOperationAction(ISD::READCYCLECOUNTER , MVT::i64  , Custom);
 546
 547   if (!Subtarget->hasMOVBE())
 548     setOperationAction(ISD::BSWAP          , MVT::i16  , Expand);
 549
 550   // These should be promoted to a larger select which is supported.
 551   setOperationAction(ISD::SELECT          , MVT::i1   , Promote);
 552   // X86 wants to expand cmov itself.
 553   setOperationAction(ISD::SELECT          , MVT::i8   , Custom);
 554   setOperationAction(ISD::SELECT          , MVT::i16  , Custom);
 555   setOperationAction(ISD::SELECT          , MVT::i32  , Custom);
 556   setOperationAction(ISD::SELECT          , MVT::f32  , Custom);
 557   setOperationAction(ISD::SELECT          , MVT::f64  , Custom);
 558   setOperationAction(ISD::SELECT          , MVT::f80  , Custom);
 559   setOperationAction(ISD::SETCC           , MVT::i8   , Custom);
 560   setOperationAction(ISD::SETCC           , MVT::i16  , Custom);
 561   setOperationAction(ISD::SETCC           , MVT::i32  , Custom);
 562   setOperationAction(ISD::SETCC           , MVT::f32  , Custom);
 563   setOperationAction(ISD::SETCC           , MVT::f64  , Custom);
 564   setOperationAction(ISD::SETCC           , MVT::f80  , Custom);
 565   if (Subtarget->is64Bit()) {
 566     setOperationAction(ISD::SELECT        , MVT::i64  , Custom);
 567     setOperationAction(ISD::SETCC         , MVT::i64  , Custom);
 568   }
 569   setOperationAction(ISD::EH_RETURN       , MVT::Other, Custom);
 570   // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support
 571   // SjLj exception handling but a light-weight setjmp/longjmp replacement to
 572   // support continuation, user-level threading, and etc.. As a result, no
 573   // other SjLj exception interfaces are implemented and please don't build
 574   // your own exception handling based on them.
 575   // LLVM/Clang supports zero-cost DWARF exception handling.
 576   setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
 577   setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
 578
 579   // Darwin ABI issue.
 580   setOperationAction(ISD::ConstantPool    , MVT::i32  , Custom);
 581   setOperationAction(ISD::JumpTable       , MVT::i32  , Custom);
 582   setOperationAction(ISD::GlobalAddress   , MVT::i32  , Custom);
 583   setOperationAction(ISD::GlobalTLSAddress, MVT::i32  , Custom);
 584   if (Subtarget->is64Bit())
 585     setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
 586   setOperationAction(ISD::ExternalSymbol  , MVT::i32  , Custom);
 587   setOperationAction(ISD::BlockAddress    , MVT::i32  , Custom);
 588   if (Subtarget->is64Bit()) {
 589     setOperationAction(ISD::ConstantPool  , MVT::i64  , Custom);
 590     setOperationAction(ISD::JumpTable     , MVT::i64  , Custom);
 591     setOperationAction(ISD::GlobalAddress , MVT::i64  , Custom);
 592     setOperationAction(ISD::ExternalSymbol, MVT::i64  , Custom);
 593     setOperationAction(ISD::BlockAddress  , MVT::i64  , Custom);
 594   }
  595   // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
 596   setOperationAction(ISD::SHL_PARTS       , MVT::i32  , Custom);
 597   setOperationAction(ISD::SRA_PARTS       , MVT::i32  , Custom);
 598   setOperationAction(ISD::SRL_PARTS       , MVT::i32  , Custom);
 599   if (Subtarget->is64Bit()) {
 600     setOperationAction(ISD::SHL_PARTS     , MVT::i64  , Custom);
 601     setOperationAction(ISD::SRA_PARTS     , MVT::i64  , Custom);
 602     setOperationAction(ISD::SRL_PARTS     , MVT::i64  , Custom);
 603   }
 604
 605   if (Subtarget->hasSSE1())
 606     setOperationAction(ISD::PREFETCH      , MVT::Other, Legal);
 607
 608   setOperationAction(ISD::ATOMIC_FENCE  , MVT::Other, Custom);
 609
 610   // Expand certain atomics
 611   for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
 612     MVT VT = IntVTs[i];
 613     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
 614     setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
 615     setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
 616   }
 617
 618   if (Subtarget->hasCmpxchg16b()) {
 619     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
 620   }
 621
 622   // FIXME - use subtarget debug flags
 623   if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetELF() &&
 624       !Subtarget->isTargetCygMing() && !Subtarget->isTargetWin64()) {
 625     setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
 626   }
 627
 628   if (Subtarget->is64Bit()) {
 629     setExceptionPointerRegister(X86::RAX);
 630     setExceptionSelectorRegister(X86::RDX);
 631   } else {
 632     setExceptionPointerRegister(X86::EAX);
 633     setExceptionSelectorRegister(X86::EDX);
 634   }
 635   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
 636   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
 637
 638   setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
 639   setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
 640
 641   setOperationAction(ISD::TRAP, MVT::Other, Legal);
 642   setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
 643
 644   // VASTART needs to be custom lowered to use the VarArgsFrameIndex
 645   setOperationAction(ISD::VASTART           , MVT::Other, Custom);
 646   setOperationAction(ISD::VAEND             , MVT::Other, Expand);
 647   if (Subtarget->is64Bit() && !Subtarget->isTargetWin64()) {
 648     // TargetInfo::X86_64ABIBuiltinVaList
 649     setOperationAction(ISD::VAARG           , MVT::Other, Custom);
 650     setOperationAction(ISD::VACOPY          , MVT::Other, Custom);
 651   } else {
 652     // TargetInfo::CharPtrBuiltinVaList
 653     setOperationAction(ISD::VAARG           , MVT::Other, Expand);
 654     setOperationAction(ISD::VACOPY          , MVT::Other, Expand);
 655   }
 656
 657   setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
 658   setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
 659
 660   setOperationAction(ISD::DYNAMIC_STACKALLOC, getPointerTy(), Custom);
 661
 662   if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) {
 663     // f32 and f64 use SSE.
 664     // Set up the FP register classes.
 665     addRegisterClass(MVT::f32, &X86::FR32RegClass);
 666     addRegisterClass(MVT::f64, &X86::FR64RegClass);
 667
 668     // Use ANDPD to simulate FABS.
 669     setOperationAction(ISD::FABS , MVT::f64, Custom);
 670     setOperationAction(ISD::FABS , MVT::f32, Custom);
 671
 672     // Use XORP to simulate FNEG.
 673     setOperationAction(ISD::FNEG , MVT::f64, Custom);
 674     setOperationAction(ISD::FNEG , MVT::f32, Custom);
 675
 676     // Use ANDPD and ORPD to simulate FCOPYSIGN.
 677     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
 678     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
 679
 680     // Lower this to FGETSIGNx86 plus an AND.
 681     setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
 682     setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
 683
 684     // We don't support sin/cos/fmod
 685     setOperationAction(ISD::FSIN   , MVT::f64, Expand);
 686     setOperationAction(ISD::FCOS   , MVT::f64, Expand);
 687     setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
 688     setOperationAction(ISD::FSIN   , MVT::f32, Expand);
 689     setOperationAction(ISD::FCOS   , MVT::f32, Expand);
 690     setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
 691
 692     // Expand FP immediates into loads from the stack, except for the special
 693     // cases we handle.
 694     addLegalFPImmediate(APFloat(+0.0)); // xorpd
 695     addLegalFPImmediate(APFloat(+0.0f)); // xorps
 696   } else if (!TM.Options.UseSoftFloat && X86ScalarSSEf32) {
 697     // Use SSE for f32, x87 for f64.
 698     // Set up the FP register classes.
 699     addRegisterClass(MVT::f32, &X86::FR32RegClass);
 700     addRegisterClass(MVT::f64, &X86::RFP64RegClass);
 701
 702     // Use ANDPS to simulate FABS.
 703     setOperationAction(ISD::FABS , MVT::f32, Custom);
 704
 705     // Use XORP to simulate FNEG.
 706     setOperationAction(ISD::FNEG , MVT::f32, Custom);
 707
 708     setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
 709
 710     // Use ANDPS and ORPS to simulate FCOPYSIGN.
 711     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
 712     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
 713
 714     // We don't support sin/cos/fmod
 715     setOperationAction(ISD::FSIN   , MVT::f32, Expand);
 716     setOperationAction(ISD::FCOS   , MVT::f32, Expand);
 717     setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
 718
 719     // Special cases we handle for FP constants.
 720     addLegalFPImmediate(APFloat(+0.0f)); // xorps
 721     addLegalFPImmediate(APFloat(+0.0)); // FLD0
 722     addLegalFPImmediate(APFloat(+1.0)); // FLD1
 723     addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
 724     addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
 725
 726     if (!TM.Options.UnsafeFPMath) {
 727       setOperationAction(ISD::FSIN   , MVT::f64, Expand);
 728       setOperationAction(ISD::FCOS   , MVT::f64, Expand);
 729       setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
 730     }
 731   } else if (!TM.Options.UseSoftFloat) {
 732     // f32 and f64 in x87.
 733     // Set up the FP register classes.
 734     addRegisterClass(MVT::f64, &X86::RFP64RegClass);
 735     addRegisterClass(MVT::f32, &X86::RFP32RegClass);
 736
 737     setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
 738     setOperationAction(ISD::UNDEF,     MVT::f32, Expand);
 739     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
 740     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
 741
 742     if (!TM.Options.UnsafeFPMath) {
 743       setOperationAction(ISD::FSIN   , MVT::f64, Expand);
 744       setOperationAction(ISD::FSIN   , MVT::f32, Expand);
 745       setOperationAction(ISD::FCOS   , MVT::f64, Expand);
 746       setOperationAction(ISD::FCOS   , MVT::f32, Expand);
 747       setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
 748       setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
 749     }
 750     addLegalFPImmediate(APFloat(+0.0)); // FLD0
 751     addLegalFPImmediate(APFloat(+1.0)); // FLD1
 752     addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
 753     addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
 754     addLegalFPImmediate(APFloat(+0.0f)); // FLD0
 755     addLegalFPImmediate(APFloat(+1.0f)); // FLD1
 756     addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
 757     addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
 758   }
 759
 760   // We don't support FMA.
 761   setOperationAction(ISD::FMA, MVT::f64, Expand);
 762   setOperationAction(ISD::FMA, MVT::f32, Expand);
 763
 764   // Long double always uses X87.
 765   if (!TM.Options.UseSoftFloat) {
 766     addRegisterClass(MVT::f80, &X86::RFP80RegClass);
 767     setOperationAction(ISD::UNDEF,     MVT::f80, Expand);
 768     setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
 769     {
 770       APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended);
 771       addLegalFPImmediate(TmpFlt);  // FLD0
 772       TmpFlt.changeSign();
 773       addLegalFPImmediate(TmpFlt);  // FLD0/FCHS
 774
 775       bool ignored;
 776       APFloat TmpFlt2(+1.0);
 777       TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
 778                       &ignored);
 779       addLegalFPImmediate(TmpFlt2);  // FLD1
 780       TmpFlt2.changeSign();
 781       addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
 782     }
 783
 784     if (!TM.Options.UnsafeFPMath) {
 785       setOperationAction(ISD::FSIN   , MVT::f80, Expand);
 786       setOperationAction(ISD::FCOS   , MVT::f80, Expand);
 787       setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
 788     }
 789
 790     setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
 791     setOperationAction(ISD::FCEIL,  MVT::f80, Expand);
 792     setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
 793     setOperationAction(ISD::FRINT,  MVT::f80, Expand);
 794     setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
 795     setOperationAction(ISD::FMA, MVT::f80, Expand);
 796   }
 797
 798   // Always use a library call for pow.
 799   setOperationAction(ISD::FPOW             , MVT::f32  , Expand);
 800   setOperationAction(ISD::FPOW             , MVT::f64  , Expand);
 801   setOperationAction(ISD::FPOW             , MVT::f80  , Expand);
 802
 803   setOperationAction(ISD::FLOG, MVT::f80, Expand);
 804   setOperationAction(ISD::FLOG2, MVT::f80, Expand);
 805   setOperationAction(ISD::FLOG10, MVT::f80, Expand);
 806   setOperationAction(ISD::FEXP, MVT::f80, Expand);
 807   setOperationAction(ISD::FEXP2, MVT::f80, Expand);
 808   setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
 809   setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
 810
 811   // First set operation action for all vector types to either promote
 812   // (for widening) or expand (for scalarization). Then we will selectively
 813   // turn on ones that can be effectively codegen'd.
 814   for (MVT VT : MVT::vector_valuetypes()) {
 815     setOperationAction(ISD::ADD , VT, Expand);
 816     setOperationAction(ISD::SUB , VT, Expand);
 817     setOperationAction(ISD::FADD, VT, Expand);
 818     setOperationAction(ISD::FNEG, VT, Expand);
 819     setOperationAction(ISD::FSUB, VT, Expand);
 820     setOperationAction(ISD::MUL , VT, Expand);
 821     setOperationAction(ISD::FMUL, VT, Expand);
 822     setOperationAction(ISD::SDIV, VT, Expand);
 823     setOperationAction(ISD::UDIV, VT, Expand);
 824     setOperationAction(ISD::FDIV, VT, Expand);
 825     setOperationAction(ISD::SREM, VT, Expand);
 826     setOperationAction(ISD::UREM, VT, Expand);
 827     setOperationAction(ISD::LOAD, VT, Expand);
 828     setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
 829     setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
 830     setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
 831     setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
 832     setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
 833     setOperationAction(ISD::FABS, VT, Expand);
 834     setOperationAction(ISD::FSIN, VT, Expand);
 835     setOperationAction(ISD::FSINCOS, VT, Expand);
 836     setOperationAction(ISD::FCOS, VT, Expand);
 837     setOperationAction(ISD::FSINCOS, VT, Expand);
 838     setOperationAction(ISD::FREM, VT, Expand);
 839     setOperationAction(ISD::FMA,  VT, Expand);
 840     setOperationAction(ISD::FPOWI, VT, Expand);
 841     setOperationAction(ISD::FSQRT, VT, Expand);
 842     setOperationAction(ISD::FCOPYSIGN, VT, Expand);
 843     setOperationAction(ISD::FFLOOR, VT, Expand);
 844     setOperationAction(ISD::FCEIL, VT, Expand);
 845     setOperationAction(ISD::FTRUNC, VT, Expand);
 846     setOperationAction(ISD::FRINT, VT, Expand);
 847     setOperationAction(ISD::FNEARBYINT, VT, Expand);
 848     setOperationAction(ISD::SMUL_LOHI, VT, Expand);
 849     setOperationAction(ISD::MULHS, VT, Expand);
 850     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
 851     setOperationAction(ISD::MULHU, VT, Expand);
 852     setOperationAction(ISD::SDIVREM, VT, Expand);
 853     setOperationAction(ISD::UDIVREM, VT, Expand);
 854     setOperationAction(ISD::FPOW, VT, Expand);
 855     setOperationAction(ISD::CTPOP, VT, Expand);
 856     setOperationAction(ISD::CTTZ, VT, Expand);
 857     setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
 858     setOperationAction(ISD::CTLZ, VT, Expand);
 859     setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
 860     setOperationAction(ISD::SHL, VT, Expand);
 861     setOperationAction(ISD::SRA, VT, Expand);
 862     setOperationAction(ISD::SRL, VT, Expand);
 863     setOperationAction(ISD::ROTL, VT, Expand);
 864     setOperationAction(ISD::ROTR, VT, Expand);
 865     setOperationAction(ISD::BSWAP, VT, Expand);
 866     setOperationAction(ISD::SETCC, VT, Expand);
 867     setOperationAction(ISD::FLOG, VT, Expand);
 868     setOperationAction(ISD::FLOG2, VT, Expand);
 869     setOperationAction(ISD::FLOG10, VT, Expand);
 870     setOperationAction(ISD::FEXP, VT, Expand);
 871     setOperationAction(ISD::FEXP2, VT, Expand);
 872     setOperationAction(ISD::FP_TO_UINT, VT, Expand);
 873     setOperationAction(ISD::FP_TO_SINT, VT, Expand);
 874     setOperationAction(ISD::UINT_TO_FP, VT, Expand);
 875     setOperationAction(ISD::SINT_TO_FP, VT, Expand);
 876     setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
 877     setOperationAction(ISD::TRUNCATE, VT, Expand);
 878     setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
 879     setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
 880     setOperationAction(ISD::ANY_EXTEND, VT, Expand);
 881     setOperationAction(ISD::VSELECT, VT, Expand);
 882     setOperationAction(ISD::SELECT_CC, VT, Expand);
 883     for (MVT InnerVT : MVT::vector_valuetypes()) {
 884       setTruncStoreAction(InnerVT, VT, Expand);
 885
 886       setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
 887       setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
 888
 889       // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
 890       // types, we have to deal with them whether we ask for Expansion or not.
 891       // Setting Expand causes its own optimisation problems though, so leave
 892       // them legal.
 893       if (VT.getVectorElementType() == MVT::i1)
 894         setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
 895     }
 896   }
 897
 898   // FIXME: In order to prevent SSE instructions being expanded to MMX ones
 899   // with -msoft-float, disable use of MMX as well.
 900   if (!TM.Options.UseSoftFloat && Subtarget->hasMMX()) {
 901     addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
 902     // No operations on x86mmx supported, everything uses intrinsics.
 903   }
 904
 905   // MMX-sized vectors (other than x86mmx) are expected to be expanded
 906   // into smaller operations.
 907   setOperationAction(ISD::MULHS,              MVT::v8i8,  Expand);
 908   setOperationAction(ISD::MULHS,              MVT::v4i16, Expand);
 909   setOperationAction(ISD::MULHS,              MVT::v2i32, Expand);
 910   setOperationAction(ISD::MULHS,              MVT::v1i64, Expand);
 911   setOperationAction(ISD::AND,                MVT::v8i8,  Expand);
 912   setOperationAction(ISD::AND,                MVT::v4i16, Expand);
 913   setOperationAction(ISD::AND,                MVT::v2i32, Expand);
 914   setOperationAction(ISD::AND,                MVT::v1i64, Expand);
 915   setOperationAction(ISD::OR,                 MVT::v8i8,  Expand);
 916   setOperationAction(ISD::OR,                 MVT::v4i16, Expand);
 917   setOperationAction(ISD::OR,                 MVT::v2i32, Expand);
 918   setOperationAction(ISD::OR,                 MVT::v1i64, Expand);
 919   setOperationAction(ISD::XOR,                MVT::v8i8,  Expand);
 920   setOperationAction(ISD::XOR,                MVT::v4i16, Expand);
 921   setOperationAction(ISD::XOR,                MVT::v2i32, Expand);
 922   setOperationAction(ISD::XOR,                MVT::v1i64, Expand);
 923   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i8,  Expand);
 924   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v4i16, Expand);
 925   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v2i32, Expand);
 926   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v1i64, Expand);
 927   setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v1i64, Expand);
 928   setOperationAction(ISD::SELECT,             MVT::v8i8,  Expand);
 929   setOperationAction(ISD::SELECT,             MVT::v4i16, Expand);
 930   setOperationAction(ISD::SELECT,             MVT::v2i32, Expand);
 931   setOperationAction(ISD::SELECT,             MVT::v1i64, Expand);
 932   setOperationAction(ISD::BITCAST,            MVT::v8i8,  Expand);
 933   setOperationAction(ISD::BITCAST,            MVT::v4i16, Expand);
 934   setOperationAction(ISD::BITCAST,            MVT::v2i32, Expand);
 935   setOperationAction(ISD::BITCAST,            MVT::v1i64, Expand);
 936
 937   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE1()) {
 938     addRegisterClass(MVT::v4f32, &X86::VR128RegClass);
 939
 940     setOperationAction(ISD::FADD,               MVT::v4f32, Legal);
 941     setOperationAction(ISD::FSUB,               MVT::v4f32, Legal);
 942     setOperationAction(ISD::FMUL,               MVT::v4f32, Legal);
 943     setOperationAction(ISD::FDIV,               MVT::v4f32, Legal);
 944     setOperationAction(ISD::FSQRT,              MVT::v4f32, Legal);
 945     setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
 946     setOperationAction(ISD::FABS,               MVT::v4f32, Custom);
 947     setOperationAction(ISD::LOAD,               MVT::v4f32, Legal);
 948     setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
 949     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
 950     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
 951     setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
 952     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Custom);
 953   }
 954
 955   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE2()) {
 956     addRegisterClass(MVT::v2f64, &X86::VR128RegClass);
 957
 958     // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
 959     // registers cannot be used even for integer operations.
 960     addRegisterClass(MVT::v16i8, &X86::VR128RegClass);
 961     addRegisterClass(MVT::v8i16, &X86::VR128RegClass);
 962     addRegisterClass(MVT::v4i32, &X86::VR128RegClass);
 963     addRegisterClass(MVT::v2i64, &X86::VR128RegClass);
 964
 965     setOperationAction(ISD::ADD,                MVT::v16i8, Legal);
 966     setOperationAction(ISD::ADD,                MVT::v8i16, Legal);
 967     setOperationAction(ISD::ADD,                MVT::v4i32, Legal);
 968     setOperationAction(ISD::ADD,                MVT::v2i64, Legal);
 969     setOperationAction(ISD::MUL,                MVT::v4i32, Custom);
 970     setOperationAction(ISD::MUL,                MVT::v2i64, Custom);
 971     setOperationAction(ISD::UMUL_LOHI,          MVT::v4i32, Custom);
 972     setOperationAction(ISD::SMUL_LOHI,          MVT::v4i32, Custom);
 973     setOperationAction(ISD::MULHU,              MVT::v8i16, Legal);
 974     setOperationAction(ISD::MULHS,              MVT::v8i16, Legal);
 975     setOperationAction(ISD::SUB,                MVT::v16i8, Legal);
 976     setOperationAction(ISD::SUB,                MVT::v8i16, Legal);
 977     setOperationAction(ISD::SUB,                MVT::v4i32, Legal);
 978     setOperationAction(ISD::SUB,                MVT::v2i64, Legal);
 979     setOperationAction(ISD::MUL,                MVT::v8i16, Legal);
 980     setOperationAction(ISD::FADD,               MVT::v2f64, Legal);
 981     setOperationAction(ISD::FSUB,               MVT::v2f64, Legal);
 982     setOperationAction(ISD::FMUL,               MVT::v2f64, Legal);
 983     setOperationAction(ISD::FDIV,               MVT::v2f64, Legal);
 984     setOperationAction(ISD::FSQRT,              MVT::v2f64, Legal);
 985     setOperationAction(ISD::FNEG,               MVT::v2f64, Custom);
 986     setOperationAction(ISD::FABS,               MVT::v2f64, Custom);
 987
 988     setOperationAction(ISD::SETCC,              MVT::v2i64, Custom);
 989     setOperationAction(ISD::SETCC,              MVT::v16i8, Custom);
 990     setOperationAction(ISD::SETCC,              MVT::v8i16, Custom);
 991     setOperationAction(ISD::SETCC,              MVT::v4i32, Custom);
 992
 993     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v16i8, Custom);
 994     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i16, Custom);
 995     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
 996     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
 997     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
 998
 999     // Only provide customized ctpop vector bit twiddling for vector types we
1000     // know to perform better than using the popcnt instructions on each vector
1001     // element. If popcnt isn't supported, always provide the custom version.
1002     if (!Subtarget->hasPOPCNT()) {
1003       setOperationAction(ISD::CTPOP,            MVT::v4i32, Custom);
1004       setOperationAction(ISD::CTPOP,            MVT::v2i64, Custom);
1005     }
1006
1007     // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
1008     for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
1009       MVT VT = (MVT::SimpleValueType)i;
1010       // Do not attempt to custom lower non-power-of-2 vectors
1011       if (!isPowerOf2_32(VT.getVectorNumElements()))
1012         continue;
1013       // Do not attempt to custom lower non-128-bit vectors
1014       if (!VT.is128BitVector())
1015         continue;
1016       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
1017       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
1018       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1019     }
1020
1021     // We support custom legalizing of sext and anyext loads for specific
1022     // memory vector types which we can load as a scalar (or sequence of
1023     // scalars) and extend in-register to a legal 128-bit vector type. For sext
1024     // loads these must work with a single scalar load.
1025     for (MVT VT : MVT::integer_vector_valuetypes()) {
1026       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom);
1027       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom);
1028       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom);
1029       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom);
1030       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom);
1031       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom);
1032       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom);
1033       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom);
1034       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom);
1035     }
1036
1037     setOperationAction(ISD::BUILD_VECTOR,       MVT::v2f64, Custom);
1038     setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i64, Custom);
1039     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2f64, Custom);
1040     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i64, Custom);
1041     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2f64, Custom);
1042     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
1043
1044     if (Subtarget->is64Bit()) {
1045       setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
1046       setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
1047     }
1048
1049     // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
1050     for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
1051       MVT VT = (MVT::SimpleValueType)i;
1052
1053       // Do not attempt to promote non-128-bit vectors
1054       if (!VT.is128BitVector())
1055         continue;
1056
1057       setOperationAction(ISD::AND,    VT, Promote);
1058       AddPromotedToType (ISD::AND,    VT, MVT::v2i64);
1059       setOperationAction(ISD::OR,     VT, Promote);
1060       AddPromotedToType (ISD::OR,     VT, MVT::v2i64);
1061       setOperationAction(ISD::XOR,    VT, Promote);
1062       AddPromotedToType (ISD::XOR,    VT, MVT::v2i64);
1063       setOperationAction(ISD::LOAD,   VT, Promote);
1064       AddPromotedToType (ISD::LOAD,   VT, MVT::v2i64);
1065       setOperationAction(ISD::SELECT, VT, Promote);
1066       AddPromotedToType (ISD::SELECT, VT, MVT::v2i64);
1067     }
1068
1069     // Custom lower v2i64 and v2f64 selects.
1070     setOperationAction(ISD::LOAD,               MVT::v2f64, Legal);
1071     setOperationAction(ISD::LOAD,               MVT::v2i64, Legal);
1072     setOperationAction(ISD::SELECT,             MVT::v2f64, Custom);
1073     setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);
1074
1075     setOperationAction(ISD::FP_TO_SINT,         MVT::v4i32, Legal);
1076     setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Legal);
1077
1078     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i8,  Custom);
1079     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i16, Custom);
1080     // As there is no 64-bit GPR available, we need to build a special custom
1081     // sequence to convert from v2i32 to v2f32.
1082     if (!Subtarget->is64Bit())
1083       setOperationAction(ISD::UINT_TO_FP,       MVT::v2f32, Custom);
1084
1085     setOperationAction(ISD::FP_EXTEND,          MVT::v2f32, Custom);
1086     setOperationAction(ISD::FP_ROUND,           MVT::v2f32, Custom);
1087
1088     for (MVT VT : MVT::fp_vector_valuetypes())
1089       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);
1090
1091     setOperationAction(ISD::BITCAST,            MVT::v2i32, Custom);
1092     setOperationAction(ISD::BITCAST,            MVT::v4i16, Custom);
1093     setOperationAction(ISD::BITCAST,            MVT::v8i8,  Custom);
1094   }
1095
1096   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE41()) {
1097     setOperationAction(ISD::FFLOOR,             MVT::f32,   Legal);
1098     setOperationAction(ISD::FCEIL,              MVT::f32,   Legal);
1099     setOperationAction(ISD::FTRUNC,             MVT::f32,   Legal);
1100     setOperationAction(ISD::FRINT,              MVT::f32,   Legal);
1101     setOperationAction(ISD::FNEARBYINT,         MVT::f32,   Legal);
1102     setOperationAction(ISD::FFLOOR,             MVT::f64,   Legal);
1103     setOperationAction(ISD::FCEIL,              MVT::f64,   Legal);
1104     setOperationAction(ISD::FTRUNC,             MVT::f64,   Legal);
1105     setOperationAction(ISD::FRINT,              MVT::f64,   Legal);
1106     setOperationAction(ISD::FNEARBYINT,         MVT::f64,   Legal);
1107
1108     setOperationAction(ISD::FFLOOR,             MVT::v4f32, Legal);
1109     setOperationAction(ISD::FCEIL,              MVT::v4f32, Legal);
1110     setOperationAction(ISD::FTRUNC,             MVT::v4f32, Legal);
1111     setOperationAction(ISD::FRINT,              MVT::v4f32, Legal);
1112     setOperationAction(ISD::FNEARBYINT,         MVT::v4f32, Legal);
1113     setOperationAction(ISD::FFLOOR,             MVT::v2f64, Legal);
1114     setOperationAction(ISD::FCEIL,              MVT::v2f64, Legal);
1115     setOperationAction(ISD::FTRUNC,             MVT::v2f64, Legal);
1116     setOperationAction(ISD::FRINT,              MVT::v2f64, Legal);
1117     setOperationAction(ISD::FNEARBYINT,         MVT::v2f64, Legal);
1118
1119     // FIXME: Do we need to handle scalar-to-vector here?
1120     setOperationAction(ISD::MUL,                MVT::v4i32, Legal);
1121
1122     setOperationAction(ISD::VSELECT,            MVT::v2f64, Custom);
1123     setOperationAction(ISD::VSELECT,            MVT::v2i64, Custom);
1124     setOperationAction(ISD::VSELECT,            MVT::v4i32, Custom);
1125     setOperationAction(ISD::VSELECT,            MVT::v4f32, Custom);
1126     setOperationAction(ISD::VSELECT,            MVT::v8i16, Custom);
1127     // There is no BLENDI for byte vectors. We don't need to custom lower
1128     // some vselects for now.
1129     setOperationAction(ISD::VSELECT,            MVT::v16i8, Legal);
1130
1131     // SSE41 brings specific instructions for doing vector sign extend even in
1132     // cases where we don't have SRA.
1133     for (MVT VT : MVT::integer_vector_valuetypes()) {
1134       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
1135       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
1136       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
1137     }
1138
1139     // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
1140     setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, MVT::v8i8,  Legal);
1141     setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8,  Legal);
1142     setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i8,  Legal);
1143     setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
1144     setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
1145     setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
1146
1147     setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i16, MVT::v8i8,  Legal);
1148     setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8,  Legal);
1149     setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i8,  Legal);
1150     setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
1151     setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
1152     setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
1153
1154     // i8 and i16 vectors are custom because the source register and
1155     // source memory operand types are not the same width.  f32 vectors are
1156     // custom since the immediate controlling the insert encodes additional
1157     // information.
1158     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i8, Custom);
1159     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
1160     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
1161     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
1162
1163     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
1164     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
1165     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
1166     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
1167
1168     // FIXME: these should be Legal, but that's only for the case where
1169     // the index is constant.  For now custom expand to deal with that.
1170     if (Subtarget->is64Bit()) {
1171       setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
1172       setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
1173     }
1174   }
1175
1176   if (Subtarget->hasSSE2()) {
1177     setOperationAction(ISD::SRL,               MVT::v8i16, Custom);
1178     setOperationAction(ISD::SRL,               MVT::v16i8, Custom);
1179
1180     setOperationAction(ISD::SHL,               MVT::v8i16, Custom);
1181     setOperationAction(ISD::SHL,               MVT::v16i8, Custom);
1182
1183     setOperationAction(ISD::SRA,               MVT::v8i16, Custom);
1184     setOperationAction(ISD::SRA,               MVT::v16i8, Custom);
1185
1186     // In the customized shift lowering, the legal cases in AVX2 will be
1187     // recognized.
1188     setOperationAction(ISD::SRL,               MVT::v2i64, Custom);
1189     setOperationAction(ISD::SRL,               MVT::v4i32, Custom);
1190
1191     setOperationAction(ISD::SHL,               MVT::v2i64, Custom);
1192     setOperationAction(ISD::SHL,               MVT::v4i32, Custom);
1193
1194     setOperationAction(ISD::SRA,               MVT::v4i32, Custom);
1195   }
1196
1197   if (!TM.Options.UseSoftFloat && Subtarget->hasFp256()) {
1198     addRegisterClass(MVT::v32i8,  &X86::VR256RegClass);
1199     addRegisterClass(MVT::v16i16, &X86::VR256RegClass);
1200     addRegisterClass(MVT::v8i32,  &X86::VR256RegClass);
1201     addRegisterClass(MVT::v8f32,  &X86::VR256RegClass);
1202     addRegisterClass(MVT::v4i64,  &X86::VR256RegClass);
1203     addRegisterClass(MVT::v4f64,  &X86::VR256RegClass);
1204
1205     setOperationAction(ISD::LOAD,               MVT::v8f32, Legal);
1206     setOperationAction(ISD::LOAD,               MVT::v4f64, Legal);
1207     setOperationAction(ISD::LOAD,               MVT::v4i64, Legal);
1208
1209     setOperationAction(ISD::FADD,               MVT::v8f32, Legal);
1210     setOperationAction(ISD::FSUB,               MVT::v8f32, Legal);
1211     setOperationAction(ISD::FMUL,               MVT::v8f32, Legal);
1212     setOperationAction(ISD::FDIV,               MVT::v8f32, Legal);
1213     setOperationAction(ISD::FSQRT,              MVT::v8f32, Legal);
1214     setOperationAction(ISD::FFLOOR,             MVT::v8f32, Legal);
1215     setOperationAction(ISD::FCEIL,              MVT::v8f32, Legal);
1216     setOperationAction(ISD::FTRUNC,             MVT::v8f32, Legal);
1217     setOperationAction(ISD::FRINT,              MVT::v8f32, Legal);
1218     setOperationAction(ISD::FNEARBYINT,         MVT::v8f32, Legal);
1219     setOperationAction(ISD::FNEG,               MVT::v8f32, Custom);
1220     setOperationAction(ISD::FABS,               MVT::v8f32, Custom);
1221
1222     setOperationAction(ISD::FADD,               MVT::v4f64, Legal);
1223     setOperationAction(ISD::FSUB,               MVT::v4f64, Legal);
1224     setOperationAction(ISD::FMUL,               MVT::v4f64, Legal);
1225     setOperationAction(ISD::FDIV,               MVT::v4f64, Legal);
1226     setOperationAction(ISD::FSQRT,              MVT::v4f64, Legal);
1227     setOperationAction(ISD::FFLOOR,             MVT::v4f64, Legal);
1228     setOperationAction(ISD::FCEIL,              MVT::v4f64, Legal);
1229     setOperationAction(ISD::FTRUNC,             MVT::v4f64, Legal);
1230     setOperationAction(ISD::FRINT,              MVT::v4f64, Legal);
1231     setOperationAction(ISD::FNEARBYINT,         MVT::v4f64, Legal);
1232     setOperationAction(ISD::FNEG,               MVT::v4f64, Custom);
1233     setOperationAction(ISD::FABS,               MVT::v4f64, Custom);
1234
1235     // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1236     // even though v8i16 is a legal type.
1237     setOperationAction(ISD::FP_TO_SINT,         MVT::v8i16, Promote);
1238     setOperationAction(ISD::FP_TO_UINT,         MVT::v8i16, Promote);
1239     setOperationAction(ISD::FP_TO_SINT,         MVT::v8i32, Legal);
1240
1241     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i16, Promote);
1242     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i32, Legal);
1243     setOperationAction(ISD::FP_ROUND,           MVT::v4f32, Legal);
1244
1245     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i8,  Custom);
1246     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i16, Custom);
1247
1248     for (MVT VT : MVT::fp_vector_valuetypes())
1249       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);
1250
1251     setOperationAction(ISD::SRL,               MVT::v16i16, Custom);
1252     setOperationAction(ISD::SRL,               MVT::v32i8, Custom);
1253
1254     setOperationAction(ISD::SHL,               MVT::v16i16, Custom);
1255     setOperationAction(ISD::SHL,               MVT::v32i8, Custom);
1256
1257     setOperationAction(ISD::SRA,               MVT::v16i16, Custom);
1258     setOperationAction(ISD::SRA,               MVT::v32i8, Custom);
1259
1260     setOperationAction(ISD::SETCC,             MVT::v32i8, Custom);
1261     setOperationAction(ISD::SETCC,             MVT::v16i16, Custom);
1262     setOperationAction(ISD::SETCC,             MVT::v8i32, Custom);
1263     setOperationAction(ISD::SETCC,             MVT::v4i64, Custom);
1264
1265     setOperationAction(ISD::SELECT,            MVT::v4f64, Custom);
1266     setOperationAction(ISD::SELECT,            MVT::v4i64, Custom);
1267     setOperationAction(ISD::SELECT,            MVT::v8f32, Custom);
1268
1269     setOperationAction(ISD::VSELECT,           MVT::v4f64, Custom);
1270     setOperationAction(ISD::VSELECT,           MVT::v4i64, Custom);
1271     setOperationAction(ISD::VSELECT,           MVT::v8i32, Custom);
1272     setOperationAction(ISD::VSELECT,           MVT::v8f32, Custom);
1273
1274     setOperationAction(ISD::SIGN_EXTEND,       MVT::v4i64, Custom);
1275     setOperationAction(ISD::SIGN_EXTEND,       MVT::v8i32, Custom);
1276     setOperationAction(ISD::SIGN_EXTEND,       MVT::v16i16, Custom);
1277     setOperationAction(ISD::ZERO_EXTEND,       MVT::v4i64, Custom);
1278     setOperationAction(ISD::ZERO_EXTEND,       MVT::v8i32, Custom);
1279     setOperationAction(ISD::ZERO_EXTEND,       MVT::v16i16, Custom);
1280     setOperationAction(ISD::ANY_EXTEND,        MVT::v4i64, Custom);
1281     setOperationAction(ISD::ANY_EXTEND,        MVT::v8i32, Custom);
1282     setOperationAction(ISD::ANY_EXTEND,        MVT::v16i16, Custom);
1283     setOperationAction(ISD::TRUNCATE,          MVT::v16i8, Custom);
1284     setOperationAction(ISD::TRUNCATE,          MVT::v8i16, Custom);
1285     setOperationAction(ISD::TRUNCATE,          MVT::v4i32, Custom);
1286
1287     if (Subtarget->hasFMA() || Subtarget->hasFMA4()) {
1288       setOperationAction(ISD::FMA,             MVT::v8f32, Legal);
1289       setOperationAction(ISD::FMA,             MVT::v4f64, Legal);
1290       setOperationAction(ISD::FMA,             MVT::v4f32, Legal);
1291       setOperationAction(ISD::FMA,             MVT::v2f64, Legal);
1292       setOperationAction(ISD::FMA,             MVT::f32, Legal);
1293       setOperationAction(ISD::FMA,             MVT::f64, Legal);
1294     }
1295
1296     if (Subtarget->hasInt256()) {
1297       setOperationAction(ISD::ADD,             MVT::v4i64, Legal);
1298       setOperationAction(ISD::ADD,             MVT::v8i32, Legal);
1299       setOperationAction(ISD::ADD,             MVT::v16i16, Legal);
1300       setOperationAction(ISD::ADD,             MVT::v32i8, Legal);
1301
1302       setOperationAction(ISD::SUB,             MVT::v4i64, Legal);
1303       setOperationAction(ISD::SUB,             MVT::v8i32, Legal);
1304       setOperationAction(ISD::SUB,             MVT::v16i16, Legal);
1305       setOperationAction(ISD::SUB,             MVT::v32i8, Legal);
1306
1307       setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
1308       setOperationAction(ISD::MUL,             MVT::v8i32, Legal);
1309       setOperationAction(ISD::MUL,             MVT::v16i16, Legal);
1310       // Don't lower v32i8 because there is no 128-bit byte mul
1311
1312       setOperationAction(ISD::UMUL_LOHI,       MVT::v8i32, Custom);
1313       setOperationAction(ISD::SMUL_LOHI,       MVT::v8i32, Custom);
1314       setOperationAction(ISD::MULHU,           MVT::v16i16, Legal);
1315       setOperationAction(ISD::MULHS,           MVT::v16i16, Legal);
1316
1317       setOperationAction(ISD::VSELECT,         MVT::v16i16, Custom);
1318       setOperationAction(ISD::VSELECT,         MVT::v32i8, Legal);
1319
1320       // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1321       // when we have a 256bit-wide blend with immediate.
1322       setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
1323
1324       // Only provide customized ctpop vector bit twiddling for vector types we
1325       // know to perform better than using the popcnt instructions on each
1326       // vector element. If popcnt isn't supported, always provide the custom
1327       // version.
1328       if (!Subtarget->hasPOPCNT())
1329         setOperationAction(ISD::CTPOP,           MVT::v4i64, Custom);
1330
1331       // Custom CTPOP always performs better on natively supported v8i32
1332       setOperationAction(ISD::CTPOP,             MVT::v8i32, Custom);
1333
1334       // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1335       setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
1336       setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32,  MVT::v8i8,  Legal);
1337       setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64,  MVT::v4i8,  Legal);
1338       setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32,  MVT::v8i16, Legal);
1339       setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64,  MVT::v4i16, Legal);
1340       setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64,  MVT::v4i32, Legal);
1341
1342       setLoadExtAction(ISD::ZEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
1343       setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32,  MVT::v8i8,  Legal);
1344       setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64,  MVT::v4i8,  Legal);
1345       setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32,  MVT::v8i16, Legal);
1346       setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64,  MVT::v4i16, Legal);
1347       setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64,  MVT::v4i32, Legal);
1348     } else {
1349       setOperationAction(ISD::ADD,             MVT::v4i64, Custom);
1350       setOperationAction(ISD::ADD,             MVT::v8i32, Custom);
1351       setOperationAction(ISD::ADD,             MVT::v16i16, Custom);
1352       setOperationAction(ISD::ADD,             MVT::v32i8, Custom);
1353
1354       setOperationAction(ISD::SUB,             MVT::v4i64, Custom);
1355       setOperationAction(ISD::SUB,             MVT::v8i32, Custom);
1356       setOperationAction(ISD::SUB,             MVT::v16i16, Custom);
1357       setOperationAction(ISD::SUB,             MVT::v32i8, Custom);
1358
1359       setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
1360       setOperationAction(ISD::MUL,             MVT::v8i32, Custom);
1361       setOperationAction(ISD::MUL,             MVT::v16i16, Custom);
1362       // Don't lower v32i8 because there is no 128-bit byte mul
1363     }
1364
1365     // In the customized shift lowering, the legal cases in AVX2 will be
1366     // recognized.
1367     setOperationAction(ISD::SRL,               MVT::v4i64, Custom);
1368     setOperationAction(ISD::SRL,               MVT::v8i32, Custom);
1369
1370     setOperationAction(ISD::SHL,               MVT::v4i64, Custom);
1371     setOperationAction(ISD::SHL,               MVT::v8i32, Custom);
1372
1373     setOperationAction(ISD::SRA,               MVT::v8i32, Custom);
1374
1375     // Custom lower several nodes for 256-bit types.
1376     for (MVT VT : MVT::vector_valuetypes()) {
1377       if (VT.getScalarSizeInBits() >= 32) {
1378         setOperationAction(ISD::MLOAD,  VT, Legal);
1379         setOperationAction(ISD::MSTORE, VT, Legal);
1380       }
1381       // Extract subvector is special because the value type
1382       // (result) is 128-bit but the source is 256-bit wide.
1383       if (VT.is128BitVector()) {
1384         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1385       }
1386       // Do not attempt to custom lower other non-256-bit vectors
1387       if (!VT.is256BitVector())
1388         continue;
1389
1390       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
1391       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
1392       setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
1393       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1394       setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
1395       setOperationAction(ISD::INSERT_SUBVECTOR,   VT, Custom);
1396       setOperationAction(ISD::CONCAT_VECTORS,     VT, Custom);
1397     }
1398
1399     // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
1400     for (int i = MVT::v32i8; i != MVT::v4i64; ++i) {
1401       MVT VT = (MVT::SimpleValueType)i;
1402
1403       // Do not attempt to promote non-256-bit vectors
1404       if (!VT.is256BitVector())
1405         continue;
1406
1407       setOperationAction(ISD::AND,    VT, Promote);
1408       AddPromotedToType (ISD::AND,    VT, MVT::v4i64);
1409       setOperationAction(ISD::OR,     VT, Promote);
1410       AddPromotedToType (ISD::OR,     VT, MVT::v4i64);
1411       setOperationAction(ISD::XOR,    VT, Promote);
1412       AddPromotedToType (ISD::XOR,    VT, MVT::v4i64);
1413       setOperationAction(ISD::LOAD,   VT, Promote);
1414       AddPromotedToType (ISD::LOAD,   VT, MVT::v4i64);
1415       setOperationAction(ISD::SELECT, VT, Promote);
1416       AddPromotedToType (ISD::SELECT, VT, MVT::v4i64);
1417     }
1418   }
1419
1420   if (!TM.Options.UseSoftFloat && Subtarget->hasAVX512()) {
1421     addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1422     addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1423     addRegisterClass(MVT::v8i64,  &X86::VR512RegClass);
1424     addRegisterClass(MVT::v8f64,  &X86::VR512RegClass);
1425
1426     addRegisterClass(MVT::i1,     &X86::VK1RegClass);
1427     addRegisterClass(MVT::v8i1,   &X86::VK8RegClass);
1428     addRegisterClass(MVT::v16i1,  &X86::VK16RegClass);
1429
1430     for (MVT VT : MVT::fp_vector_valuetypes())
1431       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);
1432
1433     setOperationAction(ISD::BR_CC,              MVT::i1,    Expand);
1434     setOperationAction(ISD::SETCC,              MVT::i1,    Custom);
1435     setOperationAction(ISD::XOR,                MVT::i1,    Legal);
1436     setOperationAction(ISD::OR,                 MVT::i1,    Legal);
1437     setOperationAction(ISD::AND,                MVT::i1,    Legal);
1438     setOperationAction(ISD::LOAD,               MVT::v16f32, Legal);
1439     setOperationAction(ISD::LOAD,               MVT::v8f64, Legal);
1440     setOperationAction(ISD::LOAD,               MVT::v8i64, Legal);
1441     setOperationAction(ISD::LOAD,               MVT::v16i32, Legal);
1442     setOperationAction(ISD::LOAD,               MVT::v16i1, Legal);
1443
1444     setOperationAction(ISD::FADD,               MVT::v16f32, Legal);
1445     setOperationAction(ISD::FSUB,               MVT::v16f32, Legal);
1446     setOperationAction(ISD::FMUL,               MVT::v16f32, Legal);
1447     setOperationAction(ISD::FDIV,               MVT::v16f32, Legal);
1448     setOperationAction(ISD::FSQRT,              MVT::v16f32, Legal);
1449     setOperationAction(ISD::FNEG,               MVT::v16f32, Custom);
1450
1451     setOperationAction(ISD::FADD,               MVT::v8f64, Legal);
1452     setOperationAction(ISD::FSUB,               MVT::v8f64, Legal);
1453     setOperationAction(ISD::FMUL,               MVT::v8f64, Legal);
1454     setOperationAction(ISD::FDIV,               MVT::v8f64, Legal);
1455     setOperationAction(ISD::FSQRT,              MVT::v8f64, Legal);
1456     setOperationAction(ISD::FNEG,               MVT::v8f64, Custom);
1457     setOperationAction(ISD::FMA,                MVT::v8f64, Legal);
1458     setOperationAction(ISD::FMA,                MVT::v16f32, Legal);
1459
1460     setOperationAction(ISD::FP_TO_SINT,         MVT::i32, Legal);
1461     setOperationAction(ISD::FP_TO_UINT,         MVT::i32, Legal);
1462     setOperationAction(ISD::SINT_TO_FP,         MVT::i32, Legal);
1463     setOperationAction(ISD::UINT_TO_FP,         MVT::i32, Legal);
1464     if (Subtarget->is64Bit()) {
1465       setOperationAction(ISD::FP_TO_UINT,       MVT::i64, Legal);
1466       setOperationAction(ISD::FP_TO_SINT,       MVT::i64, Legal);
1467       setOperationAction(ISD::SINT_TO_FP,       MVT::i64, Legal);
1468       setOperationAction(ISD::UINT_TO_FP,       MVT::i64, Legal);
1469     }
1470     setOperationAction(ISD::FP_TO_SINT,         MVT::v16i32, Legal);
1471     setOperationAction(ISD::FP_TO_UINT,         MVT::v16i32, Legal);
1472     setOperationAction(ISD::FP_TO_UINT,         MVT::v8i32, Legal);
1473     setOperationAction(ISD::FP_TO_UINT,         MVT::v4i32, Legal);
1474     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i32, Legal);
1475     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i1,   Custom);
1476     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i1,  Custom);
1477     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i8,  Promote);
1478     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i16, Promote);
1479     setOperationAction(ISD::UINT_TO_FP,         MVT::v16i32, Legal);
1480     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i32, Legal);
1481     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Legal);
1482     setOperationAction(ISD::FP_ROUND,           MVT::v8f32, Legal);
1483     setOperationAction(ISD::FP_EXTEND,          MVT::v8f32, Legal);
1484
1485     setOperationAction(ISD::TRUNCATE,           MVT::i1, Custom);
1486     setOperationAction(ISD::TRUNCATE,           MVT::v16i8, Custom);
1487     setOperationAction(ISD::TRUNCATE,           MVT::v8i32, Custom);
1488     setOperationAction(ISD::TRUNCATE,           MVT::v8i1, Custom);
1489     setOperationAction(ISD::TRUNCATE,           MVT::v16i1, Custom);
1490     setOperationAction(ISD::TRUNCATE,           MVT::v16i16, Custom);
1491     setOperationAction(ISD::ZERO_EXTEND,        MVT::v16i32, Custom);
1492     setOperationAction(ISD::ZERO_EXTEND,        MVT::v8i64, Custom);
1493     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i32, Custom);
1494     setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i64, Custom);
1495     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i8, Custom);
1496     setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i16, Custom);
1497     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i16, Custom);
1498
1499     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8f64,  Custom);
1500     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i64,  Custom);
1501     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16f32,  Custom);
1502     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i32,  Custom);
1503     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i1,    Custom);
1504     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i1, Legal);
1505
1506     setOperationAction(ISD::SETCC,              MVT::v16i1, Custom);
1507     setOperationAction(ISD::SETCC,              MVT::v8i1, Custom);
1508
1509     setOperationAction(ISD::MUL,              MVT::v8i64, Custom);
1510
1511     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1,  Custom);
1512     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom);
1513     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i1, Custom);
1514     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i1, Custom);
1515     setOperationAction(ISD::BUILD_VECTOR,       MVT::v8i1, Custom);
1516     setOperationAction(ISD::BUILD_VECTOR,       MVT::v16i1, Custom);
1517     setOperationAction(ISD::SELECT,             MVT::v8f64, Custom);
1518     setOperationAction(ISD::SELECT,             MVT::v8i64, Custom);
1519     setOperationAction(ISD::SELECT,             MVT::v16f32, Custom);
1520
1521     setOperationAction(ISD::ADD,                MVT::v8i64, Legal);
1522     setOperationAction(ISD::ADD,                MVT::v16i32, Legal);
1523
1524     setOperationAction(ISD::SUB,                MVT::v8i64, Legal);
1525     setOperationAction(ISD::SUB,                MVT::v16i32, Legal);
1526
1527     setOperationAction(ISD::MUL,                MVT::v16i32, Legal);
1528
1529     setOperationAction(ISD::SRL,                MVT::v8i64, Custom);
1530     setOperationAction(ISD::SRL,                MVT::v16i32, Custom);
1531
1532     setOperationAction(ISD::SHL,                MVT::v8i64, Custom);
1533     setOperationAction(ISD::SHL,                MVT::v16i32, Custom);
1534
1535     setOperationAction(ISD::SRA,                MVT::v8i64, Custom);
1536     setOperationAction(ISD::SRA,                MVT::v16i32, Custom);
1537
1538     setOperationAction(ISD::AND,                MVT::v8i64, Legal);
1539     setOperationAction(ISD::OR,                 MVT::v8i64, Legal);
1540     setOperationAction(ISD::XOR,                MVT::v8i64, Legal);
1541     setOperationAction(ISD::AND,                MVT::v16i32, Legal);
1542     setOperationAction(ISD::OR,                 MVT::v16i32, Legal);
1543     setOperationAction(ISD::XOR,                MVT::v16i32, Legal);
1544
1545     if (Subtarget->hasCDI()) {
1546       setOperationAction(ISD::CTLZ,             MVT::v8i64, Legal);
1547       setOperationAction(ISD::CTLZ,             MVT::v16i32, Legal);
1548     }
1549
1550     // Custom lower several nodes.
1551     for (MVT VT : MVT::vector_valuetypes()) {
1552       unsigned EltSize = VT.getVectorElementType().getSizeInBits();
1553       // Extract subvector is special because the value type
1554       // (result) is 256/128-bit but the source is 512-bit wide.
1555       if (VT.is128BitVector() || VT.is256BitVector()) {
1556         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1557       }
1558       if (VT.getVectorElementType() == MVT::i1)
1559         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1560
1561       // Do not attempt to custom lower other non-512-bit vectors
1562       if (!VT.is512BitVector())
1563         continue;
1564
1565       if ( EltSize >= 32) {
1566         setOperationAction(ISD::VECTOR_SHUFFLE,      VT, Custom);
1567         setOperationAction(ISD::INSERT_VECTOR_ELT,   VT, Custom);
1568         setOperationAction(ISD::BUILD_VECTOR,        VT, Custom);
1569         setOperationAction(ISD::VSELECT,             VT, Legal);
1570         setOperationAction(ISD::EXTRACT_VECTOR_ELT,  VT, Custom);
1571         setOperationAction(ISD::SCALAR_TO_VECTOR,    VT, Custom);
1572         setOperationAction(ISD::INSERT_SUBVECTOR,    VT, Custom);
1573         setOperationAction(ISD::MLOAD,               VT, Legal);
1574         setOperationAction(ISD::MSTORE,              VT, Legal);
1575       }
1576     }
1577     for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
1578       MVT VT = (MVT::SimpleValueType)i;
1579
1580       // Do not attempt to promote non-512-bit vectors.
1581       if (!VT.is512BitVector())
1582         continue;
1583
1584       setOperationAction(ISD::SELECT, VT, Promote);
1585       AddPromotedToType (ISD::SELECT, VT, MVT::v8i64);
1586     }
1587   }// has  AVX-512
1588
1589   if (!TM.Options.UseSoftFloat && Subtarget->hasBWI()) {
1590     addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1591     addRegisterClass(MVT::v64i8,  &X86::VR512RegClass);
1592
1593     addRegisterClass(MVT::v32i1,  &X86::VK32RegClass);
1594     addRegisterClass(MVT::v64i1,  &X86::VK64RegClass);
1595
1596     setOperationAction(ISD::LOAD,               MVT::v32i16, Legal);
1597     setOperationAction(ISD::LOAD,               MVT::v64i8, Legal);
1598     setOperationAction(ISD::SETCC,              MVT::v32i1, Custom);
1599     setOperationAction(ISD::SETCC,              MVT::v64i1, Custom);
1600     setOperationAction(ISD::ADD,                MVT::v32i16, Legal);
1601     setOperationAction(ISD::ADD,                MVT::v64i8, Legal);
1602     setOperationAction(ISD::SUB,                MVT::v32i16, Legal);
1603     setOperationAction(ISD::SUB,                MVT::v64i8, Legal);
1604     setOperationAction(ISD::MUL,                MVT::v32i16, Legal);
1605
1606     for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
1607       const MVT VT = (MVT::SimpleValueType)i;
1608
1609       const unsigned EltSize = VT.getVectorElementType().getSizeInBits();
1610
1611       // Do not attempt to promote non-512-bit vectors.
1612       if (!VT.is512BitVector())
1613         continue;
1614
1615       if (EltSize < 32) {
1616         setOperationAction(ISD::BUILD_VECTOR,        VT, Custom);
1617         setOperationAction(ISD::VSELECT,             VT, Legal);
1618       }
1619     }
1620   }
1621
1622   if (!TM.Options.UseSoftFloat && Subtarget->hasVLX()) {
1623     addRegisterClass(MVT::v4i1,   &X86::VK4RegClass);
1624     addRegisterClass(MVT::v2i1,   &X86::VK2RegClass);
1625
1626     setOperationAction(ISD::SETCC,              MVT::v4i1, Custom);
1627     setOperationAction(ISD::SETCC,              MVT::v2i1, Custom);
1628     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v8i1, Legal);
1629
1630     setOperationAction(ISD::AND,                MVT::v8i32, Legal);
1631     setOperationAction(ISD::OR,                 MVT::v8i32, Legal);
1632     setOperationAction(ISD::XOR,                MVT::v8i32, Legal);
1633     setOperationAction(ISD::AND,                MVT::v4i32, Legal);
1634     setOperationAction(ISD::OR,                 MVT::v4i32, Legal);
1635     setOperationAction(ISD::XOR,                MVT::v4i32, Legal);
1636   }
1637
1638   // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion
1639   // of this type with custom code.
1640   for (MVT VT : MVT::vector_valuetypes())
1641     setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Custom);
1642
1643   // We want to custom lower some of our intrinsics.
1644   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
1645   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
1646   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
1647   if (!Subtarget->is64Bit())
1648     setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
1649
1650   // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
1651   // handle type legalization for these operations here.
1652   //
1653   // FIXME: We really should do custom legalization for addition and
1654   // subtraction on x86-32 once PR3203 is fixed.  We really can't do much better
1655   // than generic legalization for 64-bit multiplication-with-overflow, though.
1656   for (unsigned i = 0, e = 3+Subtarget->is64Bit(); i != e; ++i) {
1657     // Add/Sub/Mul with overflow operations are custom lowered.
1658     MVT VT = IntVTs[i];
1659     setOperationAction(ISD::SADDO, VT, Custom);
1660     setOperationAction(ISD::UADDO, VT, Custom);
1661     setOperationAction(ISD::SSUBO, VT, Custom);
1662     setOperationAction(ISD::USUBO, VT, Custom);
1663     setOperationAction(ISD::SMULO, VT, Custom);
1664     setOperationAction(ISD::UMULO, VT, Custom);
1665   }
1666
1667
1668   if (!Subtarget->is64Bit()) {
1669     // These libcalls are not available in 32-bit.
1670     setLibcallName(RTLIB::SHL_I128, nullptr);
1671     setLibcallName(RTLIB::SRL_I128, nullptr);
1672     setLibcallName(RTLIB::SRA_I128, nullptr);
1673   }
1674
1675   // Combine sin / cos into one node or libcall if possible.
1676   if (Subtarget->hasSinCos()) {
1677     setLibcallName(RTLIB::SINCOS_F32, "sincosf");
1678     setLibcallName(RTLIB::SINCOS_F64, "sincos");
1679     if (Subtarget->isTargetDarwin()) {
1680       // For MacOSX, we don't want the normal expansion of a libcall to sincos.
1681       // We want to issue a libcall to __sincos_stret to avoid memory traffic.
1682       setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
1683       setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
1684     }
1685   }
1686
1687   if (Subtarget->isTargetWin64()) {
1688     setOperationAction(ISD::SDIV, MVT::i128, Custom);
1689     setOperationAction(ISD::UDIV, MVT::i128, Custom);
1690     setOperationAction(ISD::SREM, MVT::i128, Custom);
1691     setOperationAction(ISD::UREM, MVT::i128, Custom);
1692     setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
1693     setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
1694   }
1695
1696   // We have target-specific dag combine patterns for the following nodes:
1697   setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
1698   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
1699   setTargetDAGCombine(ISD::VSELECT);
1700   setTargetDAGCombine(ISD::SELECT);
1701   setTargetDAGCombine(ISD::SHL);
1702   setTargetDAGCombine(ISD::SRA);
1703   setTargetDAGCombine(ISD::SRL);
1704   setTargetDAGCombine(ISD::OR);
1705   setTargetDAGCombine(ISD::AND);
1706   setTargetDAGCombine(ISD::ADD);
1707   setTargetDAGCombine(ISD::FADD);
1708   setTargetDAGCombine(ISD::FSUB);
1709   setTargetDAGCombine(ISD::FMA);
1710   setTargetDAGCombine(ISD::SUB);
1711   setTargetDAGCombine(ISD::LOAD);
1712   setTargetDAGCombine(ISD::STORE);
1713   setTargetDAGCombine(ISD::ZERO_EXTEND);
1714   setTargetDAGCombine(ISD::ANY_EXTEND);
1715   setTargetDAGCombine(ISD::SIGN_EXTEND);
1716   setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
1717   setTargetDAGCombine(ISD::TRUNCATE);
1718   setTargetDAGCombine(ISD::SINT_TO_FP);
1719   setTargetDAGCombine(ISD::SETCC);
1720   setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
1721   setTargetDAGCombine(ISD::BUILD_VECTOR);
1722   if (Subtarget->is64Bit())
1723     setTargetDAGCombine(ISD::MUL);
1724   setTargetDAGCombine(ISD::XOR);
1725
1726   computeRegisterProperties();
1727
1728   // On Darwin, -Os means optimize for size without hurting performance,
1729   // do not reduce the limit.
1730   MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
1731   MaxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8;
1732   MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
1733   MaxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
1734   MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
1735   MaxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
1736   setPrefLoopAlignment(4); // 2^4 bytes.
1737
1738   // Predictable cmov don't hurt on atom because it's in-order.
1739   PredictableSelectIsExpensive = !Subtarget->isAtom();
1740   EnableExtLdPromotion = true;
1741   setPrefFunctionAlignment(4); // 2^4 bytes.
1742
1743   verifyIntrinsicTables();
1744 }
1745
1746 // This has so far only been implemented for 64-bit MachO.
1747 bool X86TargetLowering::useLoadStackGuardNode() const {
1748   return Subtarget->isTargetMachO() && Subtarget->is64Bit();
1749 }
1750
1751 TargetLoweringBase::LegalizeTypeAction
1752 X86TargetLowering::getPreferredVectorAction(EVT VT) const {
1753   if (ExperimentalVectorWideningLegalization &&
1754       VT.getVectorNumElements() != 1 &&
1755       VT.getVectorElementType().getSimpleVT() != MVT::i1)
1756     return TypeWidenVector;
1757
1758   return TargetLoweringBase::getPreferredVectorAction(VT);
1759 }
1760
1761 EVT X86TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
1762   if (!VT.isVector())
1763     return Subtarget->hasAVX512() ? MVT::i1: MVT::i8;
1764
1765   const unsigned NumElts = VT.getVectorNumElements();
1766   const EVT EltVT = VT.getVectorElementType();
1767   if (VT.is512BitVector()) {
1768     if (Subtarget->hasAVX512())
1769       if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
1770           EltVT == MVT::f32 || EltVT == MVT::f64)
1771         switch(NumElts) {
1772         case  8: return MVT::v8i1;
1773         case 16: return MVT::v16i1;
1774       }
1775     if (Subtarget->hasBWI())
1776       if (EltVT == MVT::i8 || EltVT == MVT::i16)
1777         switch(NumElts) {
1778         case 32: return MVT::v32i1;
1779         case 64: return MVT::v64i1;
1780       }
1781   }
1782
1783   if (VT.is256BitVector() || VT.is128BitVector()) {
1784     if (Subtarget->hasVLX())
1785       if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
1786           EltVT == MVT::f32 || EltVT == MVT::f64)
1787         switch(NumElts) {
1788         case 2: return MVT::v2i1;
1789         case 4: return MVT::v4i1;
1790         case 8: return MVT::v8i1;
1791       }
1792     if (Subtarget->hasBWI() && Subtarget->hasVLX())
1793       if (EltVT == MVT::i8 || EltVT == MVT::i16)
1794         switch(NumElts) {
1795         case  8: return MVT::v8i1;
1796         case 16: return MVT::v16i1;
1797         case 32: return MVT::v32i1;
1798       }
1799   }
1800
1801   return VT.changeVectorElementTypeToInteger();
1802 }
1803
1804 /// Helper for getByValTypeAlignment to determine
1805 /// the desired ByVal argument alignment.
1806 static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
1807   if (MaxAlign == 16)
1808     return;
1809   if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
1810     if (VTy->getBitWidth() == 128)
1811       MaxAlign = 16;
1812   } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
1813     unsigned EltAlign = 0;
1814     getMaxByValAlign(ATy->getElementType(), EltAlign);
1815     if (EltAlign > MaxAlign)
1816       MaxAlign = EltAlign;
1817   } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
1818     for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
1819       unsigned EltAlign = 0;
1820       getMaxByValAlign(STy->getElementType(i), EltAlign);
1821       if (EltAlign > MaxAlign)
1822         MaxAlign = EltAlign;
1823       if (MaxAlign == 16)
1824         break;
1825     }
1826   }
1827 }
1828
1829 /// Return the desired alignment for ByVal aggregate
1830 /// function arguments in the caller parameter area. For X86, aggregates
1831 /// that contain SSE vectors are placed at 16-byte boundaries while the rest
1832 /// are at 4-byte boundaries.
1833 unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const {
1834   if (Subtarget->is64Bit()) {
1835     // Max of 8 and alignment of type.
1836     unsigned TyAlign = TD->getABITypeAlignment(Ty);
1837     if (TyAlign > 8)
1838       return TyAlign;
1839     return 8;
1840   }
1841
1842   unsigned Align = 4;
1843   if (Subtarget->hasSSE1())
1844     getMaxByValAlign(Ty, Align);
1845   return Align;
1846 }
1847
1848 /// Returns the target specific optimal type for load
1849 /// and store operations as a result of memset, memcpy, and memmove
1850 /// lowering. If DstAlign is zero that means it's safe to destination
1851 /// alignment can satisfy any constraint. Similarly if SrcAlign is zero it
1852 /// means there isn't a need to check it against alignment requirement,
1853 /// probably because the source does not need to be loaded. If 'IsMemset' is
1854 /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
1855 /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
1856 /// source is constant so it does not need to be loaded.
1857 /// It returns EVT::Other if the type should be determined using generic
1858 /// target-independent logic.
1859 EVT
1860 X86TargetLowering::getOptimalMemOpType(uint64_t Size,
1861                                        unsigned DstAlign, unsigned SrcAlign,
1862                                        bool IsMemset, bool ZeroMemset,
1863                                        bool MemcpyStrSrc,
1864                                        MachineFunction &MF) const {
1865   const Function *F = MF.getFunction();
1866   if ((!IsMemset || ZeroMemset) &&
1867       !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
1868                                        Attribute::NoImplicitFloat)) {
1869     if (Size >= 16 &&
1870         (Subtarget->isUnalignedMemAccessFast() ||
1871          ((DstAlign == 0 || DstAlign >= 16) &&
1872           (SrcAlign == 0 || SrcAlign >= 16)))) {
1873       if (Size >= 32) {
1874         if (Subtarget->hasInt256())
1875           return MVT::v8i32;
1876         if (Subtarget->hasFp256())
1877           return MVT::v8f32;
1878       }
1879       if (Subtarget->hasSSE2())
1880         return MVT::v4i32;
1881       if (Subtarget->hasSSE1())
1882         return MVT::v4f32;
1883     } else if (!MemcpyStrSrc && Size >= 8 &&
1884                !Subtarget->is64Bit() &&
1885                Subtarget->hasSSE2()) {
1886       // Do not use f64 to lower memcpy if source is string constant. It's
1887       // better to use i32 to avoid the loads.
1888       return MVT::f64;
1889     }
1890   }
1891   if (Subtarget->is64Bit() && Size >= 8)
1892     return MVT::i64;
1893   return MVT::i32;
1894 }
1895
1896 bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
1897   if (VT == MVT::f32)
1898     return X86ScalarSSEf32;
1899   else if (VT == MVT::f64)
1900     return X86ScalarSSEf64;
1901   return true;
1902 }
1903
1904 bool
1905 X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
1906                                                   unsigned,
1907                                                   unsigned,
1908                                                   bool *Fast) const {
1909   if (Fast)
1910     *Fast = Subtarget->isUnalignedMemAccessFast();
1911   return true;
1912 }
1913
1914 /// Return the entry encoding for a jump table in the
1915 /// current function.  The returned value is a member of the
1916 /// MachineJumpTableInfo::JTEntryKind enum.
1917 unsigned X86TargetLowering::getJumpTableEncoding() const {
1918   // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
1919   // symbol.
1920   if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
1921       Subtarget->isPICStyleGOT())
1922     return MachineJumpTableInfo::EK_Custom32;
1923
1924   // Otherwise, use the normal jump table encoding heuristics.
1925   return TargetLowering::getJumpTableEncoding();
1926 }
1927
1928 const MCExpr *
1929 X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
1930                                              const MachineBasicBlock *MBB,
1931                                              unsigned uid,MCContext &Ctx) const{
1932   assert(MBB->getParent()->getTarget().getRelocationModel() == Reloc::PIC_ &&
1933          Subtarget->isPICStyleGOT());
1934   // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
1935   // entries.
1936   return MCSymbolRefExpr::Create(MBB->getSymbol(),
1937                                  MCSymbolRefExpr::VK_GOTOFF, Ctx);
1938 }
1939
1940 /// Returns relocation base for the given PIC jumptable.
1941 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
1942                                                     SelectionDAG &DAG) const {
1943   if (!Subtarget->is64Bit())
1944     // This doesn't have SDLoc associated with it, but is not really the
1945     // same as a Register.
1946     return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy());
1947   return Table;
1948 }
1949
1950 /// This returns the relocation base for the given PIC jumptable,
1951 /// the same as getPICJumpTableRelocBase, but as an MCExpr.
1952 const MCExpr *X86TargetLowering::
1953 getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
1954                              MCContext &Ctx) const {
1955   // X86-64 uses RIP relative addressing based on the jump table label.
1956   if (Subtarget->isPICStyleRIPRel())
1957     return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
1958
1959   // Otherwise, the reference is relative to the PIC base.
1960   return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx);
1961 }
1962
1963 // FIXME: Why this routine is here? Move to RegInfo!
1964 std::pair<const TargetRegisterClass*, uint8_t>
1965 X86TargetLowering::findRepresentativeClass(MVT VT) const{
1966   const TargetRegisterClass *RRC = nullptr;
1967   uint8_t Cost = 1;
1968   switch (VT.SimpleTy) {
1969   default:
1970     return TargetLowering::findRepresentativeClass(VT);
1971   case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
1972     RRC = Subtarget->is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
1973     break;
1974   case MVT::x86mmx:
1975     RRC = &X86::VR64RegClass;
1976     break;
1977   case MVT::f32: case MVT::f64:
1978   case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
1979   case MVT::v4f32: case MVT::v2f64:
1980   case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32:
1981   case MVT::v4f64:
1982     RRC = &X86::VR128RegClass;
1983     break;
1984   }
1985   return std::make_pair(RRC, Cost);
1986 }
1987
1988 bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
1989                                                unsigned &Offset) const {
1990   if (!Subtarget->isTargetLinux())
1991     return false;
1992
1993   if (Subtarget->is64Bit()) {
1994     // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs:
1995     Offset = 0x28;
1996     if (getTargetMachine().getCodeModel() == CodeModel::Kernel)
1997       AddressSpace = 256;
1998     else
1999       AddressSpace = 257;
2000   } else {
2001     // %gs:0x14 on i386
2002     Offset = 0x14;
2003     AddressSpace = 256;
2004   }
2005   return true;
2006 }
2007
2008 bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
2009                                             unsigned DestAS) const {
2010   assert(SrcAS != DestAS && "Expected different address spaces!");
2011
2012   return SrcAS < 256 && DestAS < 256;
2013 }
2014
2015 //===----------------------------------------------------------------------===//
2016 //               Return Value Calling Convention Implementation
2017 //===----------------------------------------------------------------------===//
2018
2019 #include "X86GenCallingConv.inc"
2020
2021 bool
2022 X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv,
2023                                   MachineFunction &MF, bool isVarArg,
2024                         const SmallVectorImpl<ISD::OutputArg> &Outs,
2025                         LLVMContext &Context) const {
2026   SmallVector<CCValAssign, 16> RVLocs;
2027   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
2028   return CCInfo.CheckReturn(Outs, RetCC_X86);
2029 }
2030
2031 const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
2032   static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
2033   return ScratchRegs;
2034 }
2035
2036 SDValue
2037 X86TargetLowering::LowerReturn(SDValue Chain,
2038                                CallingConv::ID CallConv, bool isVarArg,
2039                                const SmallVectorImpl<ISD::OutputArg> &Outs,
2040                                const SmallVectorImpl<SDValue> &OutVals,
2041                                SDLoc dl, SelectionDAG &DAG) const {
2042   MachineFunction &MF = DAG.getMachineFunction();
2043   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2044
2045   SmallVector<CCValAssign, 16> RVLocs;
2046   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
2047   CCInfo.AnalyzeReturn(Outs, RetCC_X86);
2048
2049   SDValue Flag;
2050   SmallVector<SDValue, 6> RetOps;
2051   RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2052   // Operand #1 = Bytes To Pop
2053   RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(),
2054                    MVT::i16));
2055
2056   // Copy the result values into the output registers.
2057   for (unsigned i = 0; i != RVLocs.size(); ++i) {
2058     CCValAssign &VA = RVLocs[i];
2059     assert(VA.isRegLoc() && "Can only return in registers!");
2060     SDValue ValToCopy = OutVals[i];
2061     EVT ValVT = ValToCopy.getValueType();
2062
2063     // Promote values to the appropriate types.
2064     if (VA.getLocInfo() == CCValAssign::SExt)
2065       ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
2066     else if (VA.getLocInfo() == CCValAssign::ZExt)
2067       ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
2068     else if (VA.getLocInfo() == CCValAssign::AExt)
2069       ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
2070     else if (VA.getLocInfo() == CCValAssign::BCvt)
2071       ValToCopy = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), ValToCopy);
2072
2073     assert(VA.getLocInfo() != CCValAssign::FPExt &&
2074            "Unexpected FP-extend for return value.");
2075
2076     // If this is x86-64, and we disabled SSE, we can't return FP values,
2077     // or SSE or MMX vectors.
2078     if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
2079          VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
2080           (Subtarget->is64Bit() && !Subtarget->hasSSE1())) {
2081       report_fatal_error("SSE register return with SSE disabled");
2082     }
2083     // Likewise we can't return F64 values with SSE1 only.  gcc does so, but
2084     // llvm-gcc has never done it right and no one has noticed, so this
2085     // should be OK for now.
2086     if (ValVT == MVT::f64 &&
2087         (Subtarget->is64Bit() && !Subtarget->hasSSE2()))
2088       report_fatal_error("SSE2 register return with SSE2 disabled");
2089
2090     // Returns in ST0/ST1 are handled specially: these are pushed as operands to
2091     // the RET instruction and handled by the FP Stackifier.
2092     if (VA.getLocReg() == X86::FP0 ||
2093         VA.getLocReg() == X86::FP1) {
2094       // If this is a copy from an xmm register to ST(0), use an FPExtend to
2095       // change the value to the FP stack register class.
2096       if (isScalarFPTypeInSSEReg(VA.getValVT()))
2097         ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
2098       RetOps.push_back(ValToCopy);
2099       // Don't emit a copytoreg.
2100       continue;
2101     }
2102
2103     // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
2104     // which is returned in RAX / RDX.
2105     if (Subtarget->is64Bit()) {
2106       if (ValVT == MVT::x86mmx) {
2107         if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
2108           ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ValToCopy);
2109           ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
2110                                   ValToCopy);
2111           // If we don't have SSE2 available, convert to v4f32 so the generated
2112           // register is legal.
2113           if (!Subtarget->hasSSE2())
2114             ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,ValToCopy);
2115         }
2116       }
2117     }
2118
2119     Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
2120     Flag = Chain.getValue(1);
2121     RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
2122   }
2123
2124   // The x86-64 ABIs require that for returning structs by value we copy
2125   // the sret argument into %rax/%eax (depending on ABI) for the return.
2126   // Win32 requires us to put the sret argument to %eax as well.
2127   // We saved the argument into a virtual register in the entry block,
2128   // so now we copy the value out and into %rax/%eax.
2129   if (DAG.getMachineFunction().getFunction()->hasStructRetAttr() &&
2130       (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC())) {
2131     MachineFunction &MF = DAG.getMachineFunction();
2132     X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2133     unsigned Reg = FuncInfo->getSRetReturnReg();
2134     assert(Reg &&
2135            "SRetReturnReg should have been set in LowerFormalArguments().");
2136     SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy());
2137
2138     unsigned RetValReg
2139         = (Subtarget->is64Bit() && !Subtarget->isTarget64BitILP32()) ?
2140           X86::RAX : X86::EAX;
2141     Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
2142     Flag = Chain.getValue(1);
2143
2144     // RAX/EAX now acts like a return value.
2145     RetOps.push_back(DAG.getRegister(RetValReg, getPointerTy()));
2146   }
2147
2148   RetOps[0] = Chain;  // Update chain.
2149
2150   // Add the flag if we have it.
2151   if (Flag.getNode())
2152     RetOps.push_back(Flag);
2153
2154   return DAG.getNode(X86ISD::RET_FLAG, dl, MVT::Other, RetOps);
2155 }
2156
2157 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
2158   if (N->getNumValues() != 1)
2159     return false;
2160   if (!N->hasNUsesOfValue(1, 0))
2161     return false;
2162
2163   SDValue TCChain = Chain;
2164   SDNode *Copy = *N->use_begin();
2165   if (Copy->getOpcode() == ISD::CopyToReg) {
2166     // If the copy has a glue operand, we conservatively assume it isn't safe to
2167     // perform a tail call.
2168     if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
2169       return false;
2170     TCChain = Copy->getOperand(0);
2171   } else if (Copy->getOpcode() != ISD::FP_EXTEND)
2172     return false;
2173
2174   bool HasRet = false;
2175   for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
2176        UI != UE; ++UI) {
2177     if (UI->getOpcode() != X86ISD::RET_FLAG)
2178       return false;
2179     // If we are returning more than one value, we can definitely
2180     // not make a tail call see PR19530
2181     if (UI->getNumOperands() > 4)
2182       return false;
2183     if (UI->getNumOperands() == 4 &&
2184         UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
2185       return false;
2186     HasRet = true;
2187   }
2188
2189   if (!HasRet)
2190     return false;
2191
2192   Chain = TCChain;
2193   return true;
2194 }
2195
2196 EVT
2197 X86TargetLowering::getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT,
2198                                             ISD::NodeType ExtendKind) const {
2199   MVT ReturnMVT;
2200   // TODO: Is this also valid on 32-bit?
2201   if (Subtarget->is64Bit() && VT == MVT::i1 && ExtendKind == ISD::ZERO_EXTEND)
2202     ReturnMVT = MVT::i8;
2203   else
2204     ReturnMVT = MVT::i32;
2205
2206   EVT MinVT = getRegisterType(Context, ReturnMVT);
2207   return VT.bitsLT(MinVT) ? MinVT : VT;
2208 }
2209
2210 /// Lower the result values of a call into the
2211 /// appropriate copies out of appropriate physical registers.
2212 ///
2213 SDValue
2214 X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
2215                                    CallingConv::ID CallConv, bool isVarArg,
2216                                    const SmallVectorImpl<ISD::InputArg> &Ins,
2217                                    SDLoc dl, SelectionDAG &DAG,
2218                                    SmallVectorImpl<SDValue> &InVals) const {
2219
2220   // Assign locations to each value returned by this call.
2221   SmallVector<CCValAssign, 16> RVLocs;
2222   bool Is64Bit = Subtarget->is64Bit();
2223   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2224                  *DAG.getContext());
2225   CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
2226
2227   // Copy all of the result registers out of their specified physreg.
2228   for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
2229     CCValAssign &VA = RVLocs[i];
2230     EVT CopyVT = VA.getValVT();
2231
2232     // If this is x86-64, and we disabled SSE, we can't return FP values
2233     if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
2234         ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
2235       report_fatal_error("SSE register return with SSE disabled");
2236     }
2237
2238     // If we prefer to use the value in xmm registers, copy it out as f80 and
2239     // use a truncate to move it from fp stack reg to xmm reg.
2240     if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
2241         isScalarFPTypeInSSEReg(VA.getValVT()))
2242       CopyVT = MVT::f80;
2243
2244     Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
2245                                CopyVT, InFlag).getValue(1);
2246     SDValue Val = Chain.getValue(0);
2247
2248     if (CopyVT != VA.getValVT())
2249       Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
2250                         // This truncation won't change the value.
2251                         DAG.getIntPtrConstant(1));
2252
2253     InFlag = Chain.getValue(2);
2254     InVals.push_back(Val);
2255   }
2256
2257   return Chain;
2258 }
2259
2260 //===----------------------------------------------------------------------===//
2261 //                C & StdCall & Fast Calling Convention implementation
2262 //===----------------------------------------------------------------------===//
2263 //  StdCall calling convention seems to be standard for many Windows' API
2264 //  routines and around. It differs from C calling convention just a little:
2265 //  callee should clean up the stack, not caller. Symbols should be also
2266 //  decorated in some fancy way :) It doesn't support any vector arguments.
2267 //  For info on fast calling convention see Fast Calling Convention (tail call)
2268 //  implementation LowerX86_32FastCCCallTo.
2269
/// Classification of struct-return (sret) usage, as produced by
/// callIsStructReturn() and argsAreStructReturn().
enum StructReturnType {
  NotStructReturn,  // No arguments, or the first argument is not sret.
  RegStructReturn,  // The first argument is sret and also marked inreg.
  StackStructReturn // The first argument is sret and not marked inreg.
};
2277 static StructReturnType
2278 callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
2279   if (Outs.empty())
2280     return NotStructReturn;
2281
2282   const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
2283   if (!Flags.isSRet())
2284     return NotStructReturn;
2285   if (Flags.isInReg())
2286     return RegStructReturn;
2287   return StackStructReturn;
2288 }
2289
2290 /// Determines whether a function uses struct return semantics.
2291 static StructReturnType
2292 argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
2293   if (Ins.empty())
2294     return NotStructReturn;
2295
2296   const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
2297   if (!Flags.isSRet())
2298     return NotStructReturn;
2299   if (Flags.isInReg())
2300     return RegStructReturn;
2301   return StackStructReturn;
2302 }
2303
2304 /// Make a copy of an aggregate at address specified by "Src" to address
2305 /// "Dst" with size and alignment information specified by the specific
2306 /// parameter attribute. The copy will be passed as a byval function parameter.
2307 static SDValue
2308 CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
2309                           ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
2310                           SDLoc dl) {
2311   SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);
2312
2313   return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
2314                        /*isVolatile*/false, /*AlwaysInline=*/true,
2315                        MachinePointerInfo(), MachinePointerInfo());
2316 }
2317
2318 /// Return true if the calling convention is one that
2319 /// supports tail call optimization.
2320 static bool IsTailCallConvention(CallingConv::ID CC) {
2321   return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
2322           CC == CallingConv::HiPE);
2323 }
2324
2325 /// \brief Return true if the calling convention is a C calling convention.
2326 static bool IsCCallConvention(CallingConv::ID CC) {
2327   return (CC == CallingConv::C || CC == CallingConv::X86_64_Win64 ||
2328           CC == CallingConv::X86_64_SysV);
2329 }
2330
2331 bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
2332   if (!CI->isTailCall() || getTargetMachine().Options.DisableTailCalls)
2333     return false;
2334
2335   CallSite CS(CI);
2336   CallingConv::ID CalleeCC = CS.getCallingConv();
2337   if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
2338     return false;
2339
2340   return true;
2341 }
2342
2343 /// Return true if the function is being made into
2344 /// a tailcall target by changing its ABI.
2345 static bool FuncIsMadeTailCallSafe(CallingConv::ID CC,
2346                                    bool GuaranteedTailCallOpt) {
2347   return GuaranteedTailCallOpt && IsTailCallConvention(CC);
2348 }
2349
2350 SDValue
2351 X86TargetLowering::LowerMemArgument(SDValue Chain,
2352                                     CallingConv::ID CallConv,
2353                                     const SmallVectorImpl<ISD::InputArg> &Ins,
2354                                     SDLoc dl, SelectionDAG &DAG,
2355                                     const CCValAssign &VA,
2356                                     MachineFrameInfo *MFI,
2357                                     unsigned i) const {
2358   // Create the nodes corresponding to a load from this parameter slot.
2359   ISD::ArgFlagsTy Flags = Ins[i].Flags;
2360   bool AlwaysUseMutable = FuncIsMadeTailCallSafe(
2361       CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
2362   bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
2363   EVT ValVT;
2364
2365   // If value is passed by pointer we have address passed instead of the value
2366   // itself.
2367   if (VA.getLocInfo() == CCValAssign::Indirect)
2368     ValVT = VA.getLocVT();
2369   else
2370     ValVT = VA.getValVT();
2371
2372   // FIXME: For now, all byval parameter objects are marked mutable. This can be
2373   // changed with more analysis.
2374   // In case of tail call optimization mark all arguments mutable. Since they
2375   // could be overwritten by lowering of arguments in case of a tail call.
2376   if (Flags.isByVal()) {
2377     unsigned Bytes = Flags.getByValSize();
2378     if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
2379     int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
2380     return DAG.getFrameIndex(FI, getPointerTy());
2381   } else {
2382     int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
2383                                     VA.getLocMemOffset(), isImmutable);
2384     SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
2385     return DAG.getLoad(ValVT, dl, Chain, FIN,
2386                        MachinePointerInfo::getFixedStack(FI),
2387                        false, false, false, 0);
2388   }
2389 }
2390
2391 // FIXME: Get this from tablegen.
2392 static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
2393                                                 const X86Subtarget *Subtarget) {
2394   assert(Subtarget->is64Bit());
2395
2396   if (Subtarget->isCallingConvWin64(CallConv)) {
2397     static const MCPhysReg GPR64ArgRegsWin64[] = {
2398       X86::RCX, X86::RDX, X86::R8,  X86::R9
2399     };
2400     return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
2401   }
2402
2403   static const MCPhysReg GPR64ArgRegs64Bit[] = {
2404     X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
2405   };
2406   return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
2407 }
2408
2409 // FIXME: Get this from tablegen.
2410 static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
2411                                                 CallingConv::ID CallConv,
2412                                                 const X86Subtarget *Subtarget) {
2413   assert(Subtarget->is64Bit());
2414   if (Subtarget->isCallingConvWin64(CallConv)) {
2415     // The XMM registers which might contain var arg parameters are shadowed
2416     // in their paired GPR.  So we only need to save the GPR to their home
2417     // slots.
2418     // TODO: __vectorcall will change this.
2419     return None;
2420   }
2421
2422   const Function *Fn = MF.getFunction();
2423   bool NoImplicitFloatOps = Fn->getAttributes().
2424       hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat);
2425   assert(!(MF.getTarget().Options.UseSoftFloat && NoImplicitFloatOps) &&
2426          "SSE register cannot be used when SSE is disabled!");
2427   if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps ||
2428       !Subtarget->hasSSE1())
2429     // Kernel mode asks for SSE to be disabled, so there are no XMM argument
2430     // registers.
2431     return None;
2432
2433   static const MCPhysReg XMMArgRegs64Bit[] = {
2434     X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2435     X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2436   };
2437   return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
2438 }
2439
2440 SDValue
2441 X86TargetLowering::LowerFormalArguments(SDValue Chain,
2442                                         CallingConv::ID CallConv,
2443                                         bool isVarArg,
2444                                       const SmallVectorImpl<ISD::InputArg> &Ins,
2445                                         SDLoc dl,
2446                                         SelectionDAG &DAG,
2447                                         SmallVectorImpl<SDValue> &InVals)
2448                                           const {
2449   MachineFunction &MF = DAG.getMachineFunction();
2450   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2451
2452   const Function* Fn = MF.getFunction();
2453   if (Fn->hasExternalLinkage() &&
2454       Subtarget->isTargetCygMing() &&
2455       Fn->getName() == "main")
2456     FuncInfo->setForceFramePointer(true);
2457
2458   MachineFrameInfo *MFI = MF.getFrameInfo();
2459   bool Is64Bit = Subtarget->is64Bit();
2460   bool IsWin64 = Subtarget->isCallingConvWin64(CallConv);
2461
2462   assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
2463          "Var args not supported with calling convention fastcc, ghc or hipe");
2464
2465   // Assign locations to all of the incoming arguments.
2466   SmallVector<CCValAssign, 16> ArgLocs;
2467   CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
2468
2469   // Allocate shadow area for Win64
2470   if (IsWin64)
2471     CCInfo.AllocateStack(32, 8);
2472
2473   CCInfo.AnalyzeFormalArguments(Ins, CC_X86);
2474
2475   unsigned LastVal = ~0U;
2476   SDValue ArgValue;
2477   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2478     CCValAssign &VA = ArgLocs[i];
2479     // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
2480     // places.
2481     assert(VA.getValNo() != LastVal &&
2482            "Don't support value assigned to multiple locs yet");
2483     (void)LastVal;
2484     LastVal = VA.getValNo();
2485
2486     if (VA.isRegLoc()) {
2487       EVT RegVT = VA.getLocVT();
2488       const TargetRegisterClass *RC;
2489       if (RegVT == MVT::i32)
2490         RC = &X86::GR32RegClass;
2491       else if (Is64Bit && RegVT == MVT::i64)
2492         RC = &X86::GR64RegClass;
2493       else if (RegVT == MVT::f32)
2494         RC = &X86::FR32RegClass;
2495       else if (RegVT == MVT::f64)
2496         RC = &X86::FR64RegClass;
2497       else if (RegVT.is512BitVector())
2498         RC = &X86::VR512RegClass;
2499       else if (RegVT.is256BitVector())
2500         RC = &X86::VR256RegClass;
2501       else if (RegVT.is128BitVector())
2502         RC = &X86::VR128RegClass;
2503       else if (RegVT == MVT::x86mmx)
2504         RC = &X86::VR64RegClass;
2505       else if (RegVT == MVT::i1)
2506         RC = &X86::VK1RegClass;
2507       else if (RegVT == MVT::v8i1)
2508         RC = &X86::VK8RegClass;
2509       else if (RegVT == MVT::v16i1)
2510         RC = &X86::VK16RegClass;
2511       else if (RegVT == MVT::v32i1)
2512         RC = &X86::VK32RegClass;
2513       else if (RegVT == MVT::v64i1)
2514         RC = &X86::VK64RegClass;
2515       else
2516         llvm_unreachable("Unknown argument type!");
2517
2518       unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
2519       ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
2520
2521       // If this is an 8 or 16-bit value, it is really passed promoted to 32
2522       // bits.  Insert an assert[sz]ext to capture this, then truncate to the
2523       // right size.
2524       if (VA.getLocInfo() == CCValAssign::SExt)
2525         ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
2526                                DAG.getValueType(VA.getValVT()));
2527       else if (VA.getLocInfo() == CCValAssign::ZExt)
2528         ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
2529                                DAG.getValueType(VA.getValVT()));
2530       else if (VA.getLocInfo() == CCValAssign::BCvt)
2531         ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
2532
2533       if (VA.isExtInLoc()) {
2534         // Handle MMX values passed in XMM regs.
2535         if (RegVT.isVector())
2536           ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
2537         else
2538           ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
2539       }
2540     } else {
2541       assert(VA.isMemLoc());
2542       ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i);
2543     }
2544
2545     // If value is passed via pointer - do a load.
2546     if (VA.getLocInfo() == CCValAssign::Indirect)
2547       ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue,
2548                              MachinePointerInfo(), false, false, false, 0);
2549
2550     InVals.push_back(ArgValue);
2551   }
2552
2553   if (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC()) {
2554     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2555       // The x86-64 ABIs require that for returning structs by value we copy
2556       // the sret argument into %rax/%eax (depending on ABI) for the return.
2557       // Win32 requires us to put the sret argument to %eax as well.
2558       // Save the argument into a virtual register so that we can access it
2559       // from the return points.
2560       if (Ins[i].Flags.isSRet()) {
2561         unsigned Reg = FuncInfo->getSRetReturnReg();
2562         if (!Reg) {
2563           MVT PtrTy = getPointerTy();
2564           Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
2565           FuncInfo->setSRetReturnReg(Reg);
2566         }
2567         SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[i]);
2568         Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
2569         break;
2570       }
2571     }
2572   }
2573
2574   unsigned StackSize = CCInfo.getNextStackOffset();
2575   // Align stack specially for tail calls.
2576   if (FuncIsMadeTailCallSafe(CallConv,
2577                              MF.getTarget().Options.GuaranteedTailCallOpt))
2578     StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
2579
2580   // If the function takes variable number of arguments, make a frame index for
2581   // the start of the first vararg value... for expansion of llvm.va_start. We
2582   // can skip this if there are no va_start calls.
2583   if (MFI->hasVAStart() &&
2584       (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
2585                    CallConv != CallingConv::X86_ThisCall))) {
2586     FuncInfo->setVarArgsFrameIndex(
2587         MFI->CreateFixedObject(1, StackSize, true));
2588   }
2589
2590   // Figure out if XMM registers are in use.
2591   assert(!(MF.getTarget().Options.UseSoftFloat &&
2592            Fn->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
2593                                             Attribute::NoImplicitFloat)) &&
2594          "SSE register cannot be used when SSE is disabled!");
2595
2596   // 64-bit calling conventions support varargs and register parameters, so we
2597   // have to do extra work to spill them in the prologue.
2598   if (Is64Bit && isVarArg && MFI->hasVAStart()) {
2599     // Find the first unallocated argument registers.
2600     ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
2601     ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
2602     unsigned NumIntRegs =
2603         CCInfo.getFirstUnallocated(ArgGPRs.data(), ArgGPRs.size());
2604     unsigned NumXMMRegs =
2605         CCInfo.getFirstUnallocated(ArgXMMs.data(), ArgXMMs.size());
2606     assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
2607            "SSE register cannot be used when SSE is disabled!");
2608
2609     // Gather all the live in physical registers.
2610     SmallVector<SDValue, 6> LiveGPRs;
2611     SmallVector<SDValue, 8> LiveXMMRegs;
2612     SDValue ALVal;
2613     for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
2614       unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
2615       LiveGPRs.push_back(
2616           DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
2617     }
2618     if (!ArgXMMs.empty()) {
2619       unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
2620       ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
2621       for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
2622         unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
2623         LiveXMMRegs.push_back(
2624             DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
2625       }
2626     }
2627
2628     if (IsWin64) {
2629       const TargetFrameLowering &TFI = *MF.getSubtarget().getFrameLowering();
2630       // Get to the caller-allocated home save location.  Add 8 to account
2631       // for the return address.
2632       int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
2633       FuncInfo->setRegSaveFrameIndex(
2634           MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
2635       // Fixup to set vararg frame on shadow area (4 x i64).
2636       if (NumIntRegs < 4)
2637         FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
2638     } else {
2639       // For X86-64, if there are vararg parameters that are passed via
2640       // registers, then we must store them to their spots on the stack so
2641       // they may be loaded by deferencing the result of va_next.
2642       FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
2643       FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
2644       FuncInfo->setRegSaveFrameIndex(MFI->CreateStackObject(
2645           ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
2646     }
2647
2648     // Store the integer parameter registers.
2649     SmallVector<SDValue, 8> MemOps;
2650     SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
2651                                       getPointerTy());
2652     unsigned Offset = FuncInfo->getVarArgsGPOffset();
2653     for (SDValue Val : LiveGPRs) {
2654       SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
2655                                 DAG.getIntPtrConstant(Offset));
2656       SDValue Store =
2657         DAG.getStore(Val.getValue(1), dl, Val, FIN,
2658                      MachinePointerInfo::getFixedStack(
2659                        FuncInfo->getRegSaveFrameIndex(), Offset),
2660                      false, false, 0);
2661       MemOps.push_back(Store);
2662       Offset += 8;
2663     }
2664
2665     if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
2666       // Now store the XMM (fp + vector) parameter registers.
2667       SmallVector<SDValue, 12> SaveXMMOps;
2668       SaveXMMOps.push_back(Chain);
2669       SaveXMMOps.push_back(ALVal);
2670       SaveXMMOps.push_back(DAG.getIntPtrConstant(
2671                              FuncInfo->getRegSaveFrameIndex()));
2672       SaveXMMOps.push_back(DAG.getIntPtrConstant(
2673                              FuncInfo->getVarArgsFPOffset()));
2674       SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
2675                         LiveXMMRegs.end());
2676       MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
2677                                    MVT::Other, SaveXMMOps));
2678     }
2679
2680     if (!MemOps.empty())
2681       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
2682   }
2683
2684   if (isVarArg && MFI->hasMustTailInVarArgFunc()) {
2685     // Find the largest legal vector type.
2686     MVT VecVT = MVT::Other;
2687     // FIXME: Only some x86_32 calling conventions support AVX512.
2688     if (Subtarget->hasAVX512() &&
2689         (Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
2690                      CallConv == CallingConv::Intel_OCL_BI)))
2691       VecVT = MVT::v16f32;
2692     else if (Subtarget->hasAVX())
2693       VecVT = MVT::v8f32;
2694     else if (Subtarget->hasSSE2())
2695       VecVT = MVT::v4f32;
2696
2697     // We forward some GPRs and some vector types.
2698     SmallVector<MVT, 2> RegParmTypes;
2699     MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
2700     RegParmTypes.push_back(IntVT);
2701     if (VecVT != MVT::Other)
2702       RegParmTypes.push_back(VecVT);
2703
2704     // Compute the set of forwarded registers. The rest are scratch.
2705     SmallVectorImpl<ForwardedRegister> &Forwards =
2706         FuncInfo->getForwardedMustTailRegParms();
2707     CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
2708
2709     // Conservatively forward AL on x86_64, since it might be used for varargs.
2710     if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
2711       unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
2712       Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
2713     }
2714
2715     // Copy all forwards from physical to virtual registers.
2716     for (ForwardedRegister &F : Forwards) {
2717       // FIXME: Can we use a less constrained schedule?
2718       SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
2719       F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT));
2720       Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal);
2721     }
2722   }
2723
2724   // Some CCs need callee pop.
2725   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
2726                        MF.getTarget().Options.GuaranteedTailCallOpt)) {
2727     FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
2728   } else {
2729     FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
2730     // If this is an sret function, the return should pop the hidden pointer.
2731     if (!Is64Bit && !IsTailCallConvention(CallConv) &&
2732         !Subtarget->getTargetTriple().isOSMSVCRT() &&
2733         argsAreStructReturn(Ins) == StackStructReturn)
2734       FuncInfo->setBytesToPopOnReturn(4);
2735   }
2736
2737   if (!Is64Bit) {
2738     // RegSaveFrameIndex is X86-64 only.
2739     FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
2740     if (CallConv == CallingConv::X86_FastCall ||
2741         CallConv == CallingConv::X86_ThisCall)
2742       // fastcc functions can't have varargs.
2743       FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
2744   }
2745
2746   FuncInfo->setArgumentStackSize(StackSize);
2747
2748   return Chain;
2749 }
2750
2751 SDValue
2752 X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
2753                                     SDValue StackPtr, SDValue Arg,
2754                                     SDLoc dl, SelectionDAG &DAG,
2755                                     const CCValAssign &VA,
2756                                     ISD::ArgFlagsTy Flags) const {
2757   unsigned LocMemOffset = VA.getLocMemOffset();
2758   SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
2759   PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
2760   if (Flags.isByVal())
2761     return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
2762
2763   return DAG.getStore(Chain, dl, Arg, PtrOff,
2764                       MachinePointerInfo::getStack(LocMemOffset),
2765                       false, false, 0);
2766 }
2767
2768 /// Emit a load of return address if tail call
2769 /// optimization is performed and it is required.
2770 SDValue
2771 X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
2772                                            SDValue &OutRetAddr, SDValue Chain,
2773                                            bool IsTailCall, bool Is64Bit,
2774                                            int FPDiff, SDLoc dl) const {
2775   // Adjust the Return address stack slot.
2776   EVT VT = getPointerTy();
2777   OutRetAddr = getReturnAddressFrameIndex(DAG);
2778
2779   // Load the "old" Return address.
2780   OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(),
2781                            false, false, false, 0);
2782   return SDValue(OutRetAddr.getNode(), 1);
2783 }
2784
2785 /// Emit a store of the return address if tail call
2786 /// optimization is performed and it is required (FPDiff!=0).
2787 static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
2788                                         SDValue Chain, SDValue RetAddrFrIdx,
2789                                         EVT PtrVT, unsigned SlotSize,
2790                                         int FPDiff, SDLoc dl) {
2791   // Store the return address to the appropriate stack slot.
2792   if (!FPDiff) return Chain;
2793   // Calculate the new stack slot for the return address.
2794   int NewReturnAddrFI =
2795     MF.getFrameInfo()->CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
2796                                          false);
2797   SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
2798   Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
2799                        MachinePointerInfo::getFixedStack(NewReturnAddrFI),
2800                        false, false, 0);
2801   return Chain;
2802 }
2803
2804 SDValue
2805 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
2806                              SmallVectorImpl<SDValue> &InVals) const {
       // Lower the outgoing call described by CLI into SelectionDAG nodes.
       //
       // The steps, in order:
       //   1. Decide whether the call is emitted as a tail call or a sibcall.
       //   2. Assign a location to every operand with CC_X86.
       //   3. Open the call sequence with CALLSEQ_START (skipped for sibcalls).
       //   4. Put each argument in its register or stack slot.
       //   5. Rewrite the callee into a target-specific node where applicable.
       //   6. Emit X86ISD::TC_RETURN for tail calls; otherwise emit X86ISD::CALL,
       //      close the sequence with CALLSEQ_END and hand InVals to
       //      LowerCallResult.
       //
       // isTailCall is a reference to CLI.IsTailCall, so the decision taken in
       // step 1 is written back into CLI.
2807   SelectionDAG &DAG                     = CLI.DAG;
2808   SDLoc &dl                             = CLI.DL;
2809   SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
2810   SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
2811   SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
2812   SDValue Chain                         = CLI.Chain;
2813   SDValue Callee                        = CLI.Callee;
2814   CallingConv::ID CallConv              = CLI.CallConv;
2815   bool &isTailCall                      = CLI.IsTailCall;
2816   bool isVarArg                         = CLI.IsVarArg;
2817
2818   MachineFunction &MF = DAG.getMachineFunction();
2819   bool Is64Bit        = Subtarget->is64Bit();
2820   bool IsWin64        = Subtarget->isCallingConvWin64(CallConv);
2821   StructReturnType SR = callIsStructReturn(Outs);
2822   bool IsSibcall      = false;
2823   X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
2824
2825   if (MF.getTarget().Options.DisableTailCalls)
2826     isTailCall = false;
2827
2828   bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall();
2829   if (IsMustTail) {
2830     // Force this to be a tail call.  The verifier rules are enough to ensure
2831     // that we can lower this successfully without moving the return address
2832     // around.
2833     isTailCall = true;
2834   } else if (isTailCall) {
2835     // Check if it's really possible to do a tail call.
2836     isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
2837                     isVarArg, SR != NotStructReturn,
2838                     MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
2839                     Outs, OutVals, Ins, DAG);
2840
2841     // Sibcalls are automatically detected tailcalls which do not require
2842     // ABI changes.
2843     if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
2844       IsSibcall = true;
2845
2846     if (isTailCall)
2847       ++NumTailCalls;
2848   }
2849
2850   assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
2851          "Var args not supported with calling convention fastcc, ghc or hipe");
2852
2853   // Analyze operands of the call, assigning locations to each operand.
2854   SmallVector<CCValAssign, 16> ArgLocs;
2855   CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
2856
2857   // Allocate shadow area for Win64
2858   if (IsWin64)
2859     CCInfo.AllocateStack(32, 8);
2860
2861   CCInfo.AnalyzeCallOperands(Outs, CC_X86);
2862
2863   // Get a count of how many bytes are to be pushed on the stack.
2864   unsigned NumBytes = CCInfo.getNextStackOffset();
2865   if (IsSibcall)
2866     // This is a sibcall. The memory operands are available in caller's
2867     // own caller's stack.
2868     NumBytes = 0;
2869   else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
2870            IsTailCallConvention(CallConv))
2871     NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
2872
       // FPDiff is the number of bytes this function pops on return minus the
       // number of bytes the outgoing arguments need. It stays 0 for ordinary
       // calls, sibcalls and musttail calls.
2873   int FPDiff = 0;
2874   if (isTailCall && !IsSibcall && !IsMustTail) {
2875     // Lower arguments at fp - stackoffset + fpdiff.
2876     unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
2877
2878     FPDiff = NumBytesCallerPushed - NumBytes;
2879
2880     // Set the delta of movement of the returnaddr stackslot.
2881     // But only set if FPDiff is below the delta recorded so far.
2882     if (FPDiff < X86Info->getTCReturnAddrDelta())
2883       X86Info->setTCReturnAddrDelta(FPDiff);
2884   }
2885
       // NumBytesToPush is the amount given to CALLSEQ_START and NumBytesToPop
       // the amount given to CALLSEQ_END. They differ only for inalloca calls,
       // where nothing is pushed here.
2886   unsigned NumBytesToPush = NumBytes;
2887   unsigned NumBytesToPop = NumBytes;
2888
2889   // If we have an inalloca argument, all stack space has already been allocated
2890   // for us and be right at the top of the stack.  We don't support multiple
2891   // arguments passed in memory when using inalloca.
2892   if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
2893     NumBytesToPush = 0;
2894     if (!ArgLocs.back().isMemLoc())
2895       report_fatal_error("cannot use inalloca attribute on a register "
2896                          "parameter");
2897     if (ArgLocs.back().getLocMemOffset() != 0)
2898       report_fatal_error("any parameter with the inalloca attribute must be "
2899                          "the only memory argument");
2900   }
2901
2902   if (!IsSibcall)
2903     Chain = DAG.getCALLSEQ_START(
2904         Chain, DAG.getIntPtrConstant(NumBytesToPush, true), dl);
2905
2906   SDValue RetAddrFrIdx;
2907   // Load return address for tail calls.
2908   if (isTailCall && FPDiff)
2909     Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
2910                                     Is64Bit, FPDiff, dl);
2911
2912   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
2913   SmallVector<SDValue, 8> MemOpChains;
2914   SDValue StackPtr;
2915
2916   // Walk the register/memloc assignments, inserting copies/loads.  In the case
2917   // of tail call optimization arguments are handled later.
2918   const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
2919       DAG.getSubtarget().getRegisterInfo());
2920   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2921     // Skip inalloca arguments, they have already been written.
2922     ISD::ArgFlagsTy Flags = Outs[i].Flags;
2923     if (Flags.isInAlloca())
2924       continue;
2925
2926     CCValAssign &VA = ArgLocs[i];
2927     EVT RegVT = VA.getLocVT();
2928     SDValue Arg = OutVals[i];
2929     bool isByVal = Flags.isByVal();
2930
2931     // Promote the value if needed.
2932     switch (VA.getLocInfo()) {
2933     default: llvm_unreachable("Unknown loc info!");
2934     case CCValAssign::Full: break;
2935     case CCValAssign::SExt:
2936       Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
2937       break;
2938     case CCValAssign::ZExt:
2939       Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
2940       break;
2941     case CCValAssign::AExt:
2942       if (RegVT.is128BitVector()) {
2943         // Special case: passing MMX values in XMM registers.
2944         Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
2945         Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
2946         Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
2947       } else
2948         Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
2949       break;
2950     case CCValAssign::BCvt:
2951       Arg = DAG.getNode(ISD::BITCAST, dl, RegVT, Arg);
2952       break;
2953     case CCValAssign::Indirect: {
2954       // Store the argument.
               // The value is spilled to a stack temporary and the address of that
               // temporary is passed in its place.
2955       SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
2956       int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
2957       Chain = DAG.getStore(Chain, dl, Arg, SpillSlot,
2958                            MachinePointerInfo::getFixedStack(FI),
2959                            false, false, 0);
2960       Arg = SpillSlot;
2961       break;
2962     }
2963     }
2964
2965     if (VA.isRegLoc()) {
2966       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
2967       if (isVarArg && IsWin64) {
2968         // Win64 ABI requires argument XMM reg to be copied to the corresponding
2969         // shadow reg if callee is a varargs function.
2970         unsigned ShadowReg = 0;
2971         switch (VA.getLocReg()) {
2972         case X86::XMM0: ShadowReg = X86::RCX; break;
2973         case X86::XMM1: ShadowReg = X86::RDX; break;
2974         case X86::XMM2: ShadowReg = X86::R8; break;
2975         case X86::XMM3: ShadowReg = X86::R9; break;
2976         }
2977         if (ShadowReg)
2978           RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
2979       }
2980     } else if (!IsSibcall && (!isTailCall || isByVal)) {
             // Memory arguments of ordinary calls are written relative to the stack
             // pointer here. For tail calls only byval arguments take this path;
             // the tail-call block further down copies them again from this
             // location into their final frame slot and stores all other memory
             // arguments directly.
2981       assert(VA.isMemLoc());
2982       if (!StackPtr.getNode())
2983         StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
2984                                       getPointerTy());
2985       MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
2986                                              dl, DAG, VA, Flags));
2987     }
2988   }
2989
2990   if (!MemOpChains.empty())
2991     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
2992
2993   if (Subtarget->isPICStyleGOT()) {
2994     // ELF / PIC requires GOT in the EBX register before function calls via PLT
2995     // GOT pointer.
2996     if (!isTailCall) {
2997       RegsToPass.push_back(std::make_pair(unsigned(X86::EBX),
2998                DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy())));
2999     } else {
3000       // If we are tail calling and generating PIC/GOT style code load the
3001       // address of the callee into ECX. The value in ecx is used as target of
3002       // the tail jump. This is done to circumvent the ebx/callee-saved problem
3003       // for tail calls on PIC/GOT architectures. Normally we would just put the
3004       // address of GOT into ebx and then call target@PLT. But for tail calls
3005       // ebx would be restored (since ebx is callee saved) before jumping to the
3006       // target@PLT.
3007
3008       // Note: The actual moving to ECX is done further down.
3009       GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
3010       if (G && !G->getGlobal()->hasHiddenVisibility() &&
3011           !G->getGlobal()->hasProtectedVisibility())
3012         Callee = LowerGlobalAddress(Callee, DAG);
3013       else if (isa<ExternalSymbolSDNode>(Callee))
3014         Callee = LowerExternalSymbol(Callee, DAG);
3015     }
3016   }
3017
3018   if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
3019     // From AMD64 ABI document:
3020     // For calls that may call functions that use varargs or stdargs
3021     // (prototype-less calls or calls to functions containing ellipsis (...) in
3022     // the declaration) %al is used as hidden argument to specify the number
3023     // of SSE registers used. The contents of %al do not need to match exactly
3024     // the number of registers, but must be an upper bound on the number of SSE
3025     // registers used and is in the range 0 - 8 inclusive.
3026
3027     // Count the number of XMM registers allocated.
3028     static const MCPhysReg XMMArgRegs[] = {
3029       X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3030       X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3031     };
3032     unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
3033     assert((Subtarget->hasSSE1() || !NumXMMRegs)
3034            && "SSE registers cannot be used when SSE is disabled");
3035
3036     RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
3037                                         DAG.getConstant(NumXMMRegs, MVT::i8)));
3038   }
3039
       // A musttail call in a varargs function re-passes every register listed
       // in getForwardedMustTailRegParms(): the value held in the recorded
       // virtual register is copied back to its physical register.
3040   if (isVarArg && IsMustTail) {
3041     const auto &Forwards = X86Info->getForwardedMustTailRegParms();
3042     for (const auto &F : Forwards) {
3043       SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
3044       RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
3045     }
3046   }
3047
3048   // For tail calls lower the arguments to the 'real' stack slots.  Sibcalls
3049   // don't need this because the eligibility check rejects calls that require
3050   // shuffling arguments passed in memory.
3051   if (!IsSibcall && isTailCall) {
3052     // Force all the incoming stack arguments to be loaded from the stack
3053     // before any new outgoing arguments are stored to the stack, because the
3054     // outgoing stack slots may alias the incoming argument stack slots, and
3055     // the alias isn't otherwise explicit. This is slightly more conservative
3056     // than necessary, because it means that each store effectively depends
3057     // on every argument instead of just those arguments it would clobber.
3058     SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
3059
3060     SmallVector<SDValue, 8> MemOpChains2;
3061     SDValue FIN;
3062     int FI = 0;
3063     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3064       CCValAssign &VA = ArgLocs[i];
3065       if (VA.isRegLoc())
3066         continue;
3067       assert(VA.isMemLoc());
3068       SDValue Arg = OutVals[i];
3069       ISD::ArgFlagsTy Flags = Outs[i].Flags;
3070       // Skip inalloca arguments.  They don't require any work.
3071       if (Flags.isInAlloca())
3072         continue;
3073       // Create frame index.
               // The slot is a fixed object at the argument's offset shifted by
               // FPDiff; its size is the location type's size rounded up to bytes.
3074       int32_t Offset = VA.getLocMemOffset()+FPDiff;
3075       uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
3076       FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
3077       FIN = DAG.getFrameIndex(FI, getPointerTy());
3078
3079       if (Flags.isByVal()) {
3080         // Copy relative to framepointer.
                 // The source is the copy made earlier at StackPtr + offset.
3081         SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
3082         if (!StackPtr.getNode())
3083           StackPtr = DAG.getCopyFromReg(Chain, dl,
3084                                         RegInfo->getStackRegister(),
3085                                         getPointerTy());
3086         Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
3087
3088         MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
3089                                                          ArgChain,
3090                                                          Flags, DAG, dl));
3091       } else {
3092         // Store relative to framepointer.
3093         MemOpChains2.push_back(
3094           DAG.getStore(ArgChain, dl, Arg, FIN,
3095                        MachinePointerInfo::getFixedStack(FI),
3096                        false, false, 0));
3097       }
3098     }
3099
3100     if (!MemOpChains2.empty())
3101       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
3102
3103     // Store the return address to the appropriate stack slot.
3104     Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
3105                                      getPointerTy(), RegInfo->getSlotSize(),
3106                                      FPDiff, dl);
3107   }
3108
3109   // Build a sequence of copy-to-reg nodes chained together with token chain
3110   // and flag operands which copy the outgoing args into registers.
3111   SDValue InFlag;
3112   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
3113     Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
3114                              RegsToPass[i].second, InFlag);
3115     InFlag = Chain.getValue(1);
3116   }
3117
3118   if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
3119     assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
3120     // In the 64-bit large code model, we have to make all calls
3121     // through a register, since the call instruction's 32-bit
3122     // pc-relative offset may not be large enough to hold the whole
3123     // address.
           // The callee node is left unchanged on this path.
3124   } else if (Callee->getOpcode() == ISD::GlobalAddress) {
3125     // If the callee is a GlobalAddress node (quite common, every direct call
3126     // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
3127     // it.
3128     GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee);
3129
3130     // We should use extra load for direct calls to dllimported functions in
3131     // non-JIT mode.
           // dllimport callees skip the rewrite below and keep their GlobalAddress
           // node.
3132     const GlobalValue *GV = G->getGlobal();
3133     if (!GV->hasDLLImportStorageClass()) {
3134       unsigned char OpFlags = 0;
3135       bool ExtraLoad = false;
3136       unsigned WrapperKind = ISD::DELETED_NODE;
3137
3138       // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
3139       // external symbols must go through the PLT in PIC mode.  If the symbol
3140       // has hidden or protected visibility, or if it is static or local, then
3141       // we don't need to use the PLT - we can directly call it.
3142       if (Subtarget->isTargetELF() &&
3143           DAG.getTarget().getRelocationModel() == Reloc::PIC_ &&
3144           GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
3145         OpFlags = X86II::MO_PLT;
3146       } else if (Subtarget->isPICStyleStubAny() &&
3147                  (GV->isDeclaration() || GV->isWeakForLinker()) &&
3148                  (!Subtarget->getTargetTriple().isMacOSX() ||
3149                   Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
3150         // PC-relative references to external symbols should go through $stub,
3151         // unless we're building with the leopard linker or later, which
3152         // automatically synthesizes these stubs.
3153         OpFlags = X86II::MO_DARWIN_STUB;
3154       } else if (Subtarget->isPICStyleRIPRel() &&
3155                  isa<Function>(GV) &&
3156                  cast<Function>(GV)->getAttributes().
3157                    hasAttribute(AttributeSet::FunctionIndex,
3158                                 Attribute::NonLazyBind)) {
3159         // If the function is marked as non-lazy, generate an indirect call
3160         // which loads from the GOT directly. This avoids runtime overhead
3161         // at the cost of eager binding (and one extra byte of encoding).
3162         OpFlags = X86II::MO_GOTPCREL;
3163         WrapperKind = X86ISD::WrapperRIP;
3164         ExtraLoad = true;
3165       }
3166
3167       Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(),
3168                                           G->getOffset(), OpFlags);
3169
3170       // Add a wrapper if needed.
             // (WrapperKind is only ever set to X86ISD::WrapperRIP above, which is
             // the opcode used here.)
3171       if (WrapperKind != ISD::DELETED_NODE)
3172         Callee = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Callee);
3173       // Add extra indirection if needed.
3174       if (ExtraLoad)
3175         Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee,
3176                              MachinePointerInfo::getGOT(),
3177                              false, false, false, 0);
3178     }
3179   } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
3180     unsigned char OpFlags = 0;
3181
3182     // On ELF targets, in either X86-64 or X86-32 mode, direct calls to
3183     // external symbols should go through the PLT.
3184     if (Subtarget->isTargetELF() &&
3185         DAG.getTarget().getRelocationModel() == Reloc::PIC_) {
3186       OpFlags = X86II::MO_PLT;
3187     } else if (Subtarget->isPICStyleStubAny() &&
3188                (!Subtarget->getTargetTriple().isMacOSX() ||
3189                 Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
3190       // PC-relative references to external symbols should go through $stub,
3191       // unless we're building with the leopard linker or later, which
3192       // automatically synthesizes these stubs.
3193       OpFlags = X86II::MO_DARWIN_STUB;
3194     }
3195
3196     Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
3197                                          OpFlags);
3198   } else if (Subtarget->isTarget64BitILP32() && Callee->getValueType(0) == MVT::i32) {
3199     // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI
3200     Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
3201   }
3202
3203   // Returns a chain & a flag for retval copy to use.
3204   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3205   SmallVector<SDValue, 8> Ops;
3206
       // For non-sibcall tail calls the call sequence is closed here, before the
       // TC_RETURN node is built, with 0 passed as the callee-pop amount.
3207   if (!IsSibcall && isTailCall) {
3208     Chain = DAG.getCALLSEQ_END(Chain,
3209                                DAG.getIntPtrConstant(NumBytesToPop, true),
3210                                DAG.getIntPtrConstant(0, true), InFlag, dl);
3211     InFlag = Chain.getValue(1);
3212   }
3213
3214   Ops.push_back(Chain);
3215   Ops.push_back(Callee);
3216
       // Tail calls carry FPDiff as an extra i32 operand.
3217   if (isTailCall)
3218     Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));
3219
3220   // Add argument registers to the end of the list so that they are known live
3221   // into the call.
3222   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
3223     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
3224                                   RegsToPass[i].second.getValueType()));
3225
3226   // Add a register mask operand representing the call-preserved registers.
3227   const TargetRegisterInfo *TRI = DAG.getSubtarget().getRegisterInfo();
3228   const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
3229   assert(Mask && "Missing call preserved mask for calling convention");
3230   Ops.push_back(DAG.getRegisterMask(Mask));
3231
3232   if (InFlag.getNode())
3233     Ops.push_back(InFlag);
3234
3235   if (isTailCall) {
3236     // We used to do:
3237     //// If this is the first return lowered for this function, add the regs
3238     //// to the liveout set for the function.
3239     // This isn't right, although it's probably harmless on x86; liveouts
3240     // should be computed from returns not tail calls.  Consider a void
3241     // function making a tail call to a function returning int.
3242     return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
3243   }
3244
3245   Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
3246   InFlag = Chain.getValue(1);
3247
3248   // Create the CALLSEQ_END node.
3249   unsigned NumBytesForCalleeToPop;
3250   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
3251                        DAG.getTarget().Options.GuaranteedTailCallOpt))
3252     NumBytesForCalleeToPop = NumBytes;    // Callee pops everything
3253   else if (!Is64Bit && !IsTailCallConvention(CallConv) &&
3254            !Subtarget->getTargetTriple().isOSMSVCRT() &&
3255            SR == StackStructReturn)
3256     // If this is a call to a struct-return function, the callee
3257     // pops the hidden struct pointer, so we have to push it back.
3258     // This is common for Darwin/X86, Linux & Mingw32 targets.
3259     // For MSVC Win32 targets, the caller pops the hidden struct pointer.
3260     NumBytesForCalleeToPop = 4;
3261   else
3262     NumBytesForCalleeToPop = 0;  // Callee pops nothing.
3263
3264   // Returns a flag for retval copy to use.
3265   if (!IsSibcall) {
3266     Chain = DAG.getCALLSEQ_END(Chain,
3267                                DAG.getIntPtrConstant(NumBytesToPop, true),
3268                                DAG.getIntPtrConstant(NumBytesForCalleeToPop,
3269                                                      true),
3270                                InFlag, dl);
3271     InFlag = Chain.getValue(1);
3272   }
3273
3274   // Handle result values, copying them out of physregs into vregs that we
3275   // return.
3276   return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
3277                          Ins, dl, DAG, InVals);
3278 }
3279
3280 //===----------------------------------------------------------------------===//
3281 //                Fast Calling Convention (tail call) implementation
3282 //===----------------------------------------------------------------------===//
3283
//  Like stdcall, this is a callee-cleans-arguments convention, except that ECX
//  is reserved for storing the tail called function address. Only 2 registers
//  are free for argument passing (inreg). Tail call optimization is performed
//  provided:
3288 //                * tailcallopt is enabled
3289 //                * caller/callee are fastcc
3290 //  On X86_64 architecture with GOT-style position independent code only local
3291 //  (within module) calls are supported at the moment.
3292 //  To keep the stack aligned according to platform abi the function
3293 //  GetAlignedArgumentStackSize ensures that argument delta is always multiples
3294 //  of stack alignment. (Dynamic linkers need this - darwin's dyld for example)
3295 //  If a tail called function callee has more arguments than the caller the
3296 //  caller needs to make sure that there is room to move the RETADDR to. This is
3297 //  achieved by reserving an area the size of the argument delta right after the
3298 //  original RETADDR, but before the saved framepointer or the spilled registers
3299 //  e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
3300 //  stack layout:
3301 //    arg1
3302 //    arg2
3303 //    RETADDR
3304 //    [ new RETADDR
3305 //      move area ]
3306 //    (possible EBP)
3307 //    ESI
3308 //    EDI
3309 //    local1 ..
3310
3311 /// GetAlignedArgumentStackSize - Make the stack size align e.g 16n + 12 aligned
3312 /// for a 16 byte align requirement.
3313 unsigned
3314 X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
3315                                                SelectionDAG& DAG) const {
3316   MachineFunction &MF = DAG.getMachineFunction();
3317   const TargetMachine &TM = MF.getTarget();
3318   const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
3319       TM.getSubtargetImpl()->getRegisterInfo());
3320   const TargetFrameLowering &TFI = *TM.getSubtargetImpl()->getFrameLowering();
3321   unsigned StackAlignment = TFI.getStackAlignment();
3322   uint64_t AlignMask = StackAlignment - 1;
3323   int64_t Offset = StackSize;
3324   unsigned SlotSize = RegInfo->getSlotSize();
3325   if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
3326     // Number smaller than 12 so just add the difference.
3327     Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
3328   } else {
3329     // Mask out lower bits, add stackalignment once plus the 12 bytes.
3330     Offset = ((~AlignMask) & Offset) + StackAlignment +
3331       (StackAlignment-SlotSize);
3332   }
3333   return Offset;
3334 }
3335
3336 /// MatchingStackOffset - Return true if the given stack call argument is
3337 /// already available in the same position (relatively) of the caller's
3338 /// incoming argument stack.
3339 static
3340 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
3341                          MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
3342                          const X86InstrInfo *TII) {
3343   unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
3344   int FI = INT_MAX;
3345   if (Arg.getOpcode() == ISD::CopyFromReg) {
3346     unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
3347     if (!TargetRegisterInfo::isVirtualRegister(VR))
3348       return false;
3349     MachineInstr *Def = MRI->getVRegDef(VR);
3350     if (!Def)
3351       return false;
3352     if (!Flags.isByVal()) {
3353       if (!TII->isLoadFromStackSlot(Def, FI))
3354         return false;
3355     } else {
3356       unsigned Opcode = Def->getOpcode();
3357       if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) &&
3358           Def->getOperand(1).isFI()) {
3359         FI = Def->getOperand(1).getIndex();
3360         Bytes = Flags.getByValSize();
3361       } else
3362         return false;
3363     }
3364   } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
3365     if (Flags.isByVal())
3366       // ByVal argument is passed in as a pointer but it's now being
3367       // dereferenced. e.g.
3368       // define @foo(%struct.X* %A) {
3369       //   tail call @bar(%struct.X* byval %A)
3370       // }
3371       return false;
3372     SDValue Ptr = Ld->getBasePtr();
3373     FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
3374     if (!FINode)
3375       return false;
3376     FI = FINode->getIndex();
3377   } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
3378     FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
3379     FI = FINode->getIndex();
3380     Bytes = Flags.getByValSize();
3381   } else
3382     return false;
3383
3384   assert(FI != INT_MAX);
3385   if (!MFI->isFixedObjectIndex(FI))
3386     return false;
3387   return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI);
3388 }
3389
/// IsEligibleForTailCallOptimization - Check whether the call is eligible
/// for tail call optimization. Targets which want to do tail call
/// optimization should implement this function.
///
/// \param Callee             the call target.
/// \param CalleeCC           calling convention of the callee.
/// \param isVarArg           true if the call is a vararg call.
/// \param isCalleeStructRet  true if the callee uses struct return semantics.
/// \param isCallerStructRet  true if the caller uses struct return semantics.
/// \param RetTy              return type of the call.
/// \param Outs               descriptions of the outgoing arguments.
/// \param OutVals            values of the outgoing arguments (parallel to
///                           the argument locations computed from Outs).
/// \param Ins                descriptions of the values returned by the call.
/// \param DAG                the selection DAG being built.
/// \returns true if none of the checks below rejects the call.
bool
X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
                                                     CallingConv::ID CalleeCC,
                                                     bool isVarArg,
                                                     bool isCalleeStructRet,
                                                     bool isCallerStructRet,
                                                     Type *RetTy,
                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
                                    const SmallVectorImpl<SDValue> &OutVals,
                                    const SmallVectorImpl<ISD::InputArg> &Ins,
                                                     SelectionDAG &DAG) const {
  // Only callees accepted by IsTailCallConvention or IsCCallConvention are
  // considered at all.
  if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
    return false;

  // Look up the calling function; its return type and calling convention are
  // compared against the callee's below.
  const MachineFunction &MF = DAG.getMachineFunction();
  const Function *CallerF = MF.getFunction();

  // If the function return type is x86_fp80 and the callee return type is not,
  // then the FP_EXTEND of the call result is not a nop. It's not safe to
  // perform a tailcall optimization here.
  if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
    return false;

  CallingConv::ID CallerCC = CallerF->getCallingConv();
  bool CCMatch = CallerCC == CalleeCC;
  bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC);
  bool IsCallerWin64 = Subtarget->isCallingConvWin64(CallerCC);

  // If -tailcallopt is specified, make fastcc functions tail-callable: the
  // answer then depends only on the callee using a tail call convention that
  // matches the caller's; none of the sibcall checks below are run.
  if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
    if (IsTailCallConvention(CalleeCC) && CCMatch)
      return true;
    return false;
  }

  // Look for obvious safe cases to perform tail call optimization that do not
  // require ABI changes. This is what gcc calls sibcall.

  // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
  // emit a special epilogue.
  const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
      DAG.getSubtarget().getRegisterInfo());
  if (RegInfo->needsStackRealignment(MF))
    return false;

  // Also avoid sibcall optimization if either caller or callee uses struct
  // return semantics.
  if (isCalleeStructRet || isCallerStructRet)
    return false;

  // An stdcall/thiscall caller is expected to clean up its arguments; the
  // callee isn't going to do that.
  // FIXME: this is more restrictive than needed. We could produce a tailcall
  // when the stack adjustment matches. For example, with a thiscall that takes
  // only one argument.
  if (!CCMatch && (CallerCC == CallingConv::X86_StdCall ||
                   CallerCC == CallingConv::X86_ThisCall))
    return false;

  // Do not sibcall optimize vararg calls unless all arguments are passed via
  // registers.
  if (isVarArg && !Outs.empty()) {

    // Optimizing for varargs on Win64 is unlikely to be safe without
    // additional testing.
    if (IsCalleeWin64 || IsCallerWin64)
      return false;

    // Assign locations to the outgoing arguments and reject the call as soon
    // as one of them is not in a register.
    SmallVector<CCValAssign, 16> ArgLocs;
    CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
                   *DAG.getContext());

    CCInfo.AnalyzeCallOperands(Outs, CC_X86);
    for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
      if (!ArgLocs[i].isRegLoc())
        return false;
  }

  // If the call result is in ST0 / ST1, it needs to be popped off the x87
  // stack.  Therefore, if it's not used by the call it is not safe to optimize
  // this into a sibcall.
  bool Unused = false;
  for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
    if (!Ins[i].Used) {
      Unused = true;
      break;
    }
  }
  if (Unused) {
    // At least one result is unused: reject the call if any result location
    // is FP0 or FP1.
    SmallVector<CCValAssign, 16> RVLocs;
    CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(), RVLocs,
                   *DAG.getContext());
    CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
    for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
      CCValAssign &VA = RVLocs[i];
      if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
        return false;
    }
  }

  // If the calling conventions do not match, then we'd better make sure the
  // results are returned in the same way as what the caller expects.
  if (!CCMatch) {
    // Result locations under the callee's convention ...
    SmallVector<CCValAssign, 16> RVLocs1;
    CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), RVLocs1,
                    *DAG.getContext());
    CCInfo1.AnalyzeCallResult(Ins, RetCC_X86);

    // ... and under the caller's convention.
    SmallVector<CCValAssign, 16> RVLocs2;
    CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), RVLocs2,
                    *DAG.getContext());
    CCInfo2.AnalyzeCallResult(Ins, RetCC_X86);

    // Both assignments must agree element by element: same kind of location,
    // same LocInfo, and same register or same memory offset.
    if (RVLocs1.size() != RVLocs2.size())
      return false;
    for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
      if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
        return false;
      if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
        return false;
      if (RVLocs1[i].isRegLoc()) {
        if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
          return false;
      } else {
        if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
          return false;
      }
    }
  }

  // If the callee takes no arguments then go on to check the results of the
  // call.
  if (!Outs.empty()) {
    // Check if stack adjustment is needed. For now, do not do this if any
    // argument is passed on the stack.
    SmallVector<CCValAssign, 16> ArgLocs;
    CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
                   *DAG.getContext());

    // Allocate shadow area for Win64
    if (IsCalleeWin64)
      CCInfo.AllocateStack(32, 8);

    CCInfo.AnalyzeCallOperands(Outs, CC_X86);
    // A non-zero next stack offset means stack space was allocated for the
    // call (arguments and/or the Win64 shadow area).
    if (CCInfo.getNextStackOffset()) {
      // NOTE: this non-const MF shadows the const MF declared at the top of
      // the function.
      MachineFunction &MF = DAG.getMachineFunction();
      // Give up if the caller itself pops bytes on return.
      if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn())
        return false;

      // Check if the arguments are already laid out in the right way as
      // the caller's fixed stack objects.
      MachineFrameInfo *MFI = MF.getFrameInfo();
      const MachineRegisterInfo *MRI = &MF.getRegInfo();
      const X86InstrInfo *TII =
          static_cast<const X86InstrInfo *>(DAG.getSubtarget().getInstrInfo());
      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
        CCValAssign &VA = ArgLocs[i];
        SDValue Arg = OutVals[i];
        ISD::ArgFlagsTy Flags = Outs[i].Flags;
        // Arguments passed indirectly are never accepted.
        if (VA.getLocInfo() == CCValAssign::Indirect)
          return false;
        // Every stack argument must already sit at its destination offset.
        if (!VA.isRegLoc()) {
          if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
                                   MFI, MRI, TII))
            return false;
        }
      }
    }

    // If the tailcall address may be in a register, then make sure it's
    // possible to register allocate for it. In 32-bit, the call address can
    // only target EAX, EDX, or ECX since the tail call must be scheduled after
    // callee-saved registers are restored. These happen to be the same
    // registers used to pass 'inreg' arguments so watch out for those.
    if (!Subtarget->is64Bit() &&
        ((!isa<GlobalAddressSDNode>(Callee) &&
          !isa<ExternalSymbolSDNode>(Callee)) ||
         DAG.getTarget().getRelocationModel() == Reloc::PIC_)) {
      unsigned NumInRegs = 0;
      // In PIC we need an extra register to formulate the address computation
      // for the callee.
      unsigned MaxInRegs =
        (DAG.getTarget().getRelocationModel() == Reloc::PIC_) ? 2 : 3;

      // Count the arguments assigned to EAX, EDX or ECX and reject the call
      // once the count reaches MaxInRegs.
      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
        CCValAssign &VA = ArgLocs[i];
        if (!VA.isRegLoc())
          continue;
        unsigned Reg = VA.getLocReg();
        switch (Reg) {
        default: break;
        case X86::EAX: case X86::EDX: case X86::ECX:
          if (++NumInRegs == MaxInRegs)
            return false;
          break;
        }
      }
    }
  }

  return true;
}
3595
3596 FastISel *
3597 X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
3598                                   const TargetLibraryInfo *libInfo) const {
3599   return X86::createFastISel(funcInfo, libInfo);
3600 }
3601
3602 //===----------------------------------------------------------------------===//
3603 //                           Other Lowering Hooks
3604 //===----------------------------------------------------------------------===//
3605
3606 static bool MayFoldLoad(SDValue Op) {
3607   return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
3608 }
3609
3610 static bool MayFoldIntoStore(SDValue Op) {
3611   return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
3612 }
3613
3614 static bool isTargetShuffle(unsigned Opcode) {
3615   switch(Opcode) {
3616   default: return false;
3617   case X86ISD::BLENDI:
3618   case X86ISD::PSHUFB:
3619   case X86ISD::PSHUFD:
3620   case X86ISD::PSHUFHW:
3621   case X86ISD::PSHUFLW:
3622   case X86ISD::SHUFP:
3623   case X86ISD::PALIGNR:
3624   case X86ISD::MOVLHPS:
3625   case X86ISD::MOVLHPD:
3626   case X86ISD::MOVHLPS:
3627   case X86ISD::MOVLPS:
3628   case X86ISD::MOVLPD:
3629   case X86ISD::MOVSHDUP:
3630   case X86ISD::MOVSLDUP:
3631   case X86ISD::MOVDDUP:
3632   case X86ISD::MOVSS:
3633   case X86ISD::MOVSD:
3634   case X86ISD::UNPCKL:
3635   case X86ISD::UNPCKH:
3636   case X86ISD::VPERMILPI:
3637   case X86ISD::VPERM2X128:
3638   case X86ISD::VPERMI:
3639     return true;
3640   }
3641 }
3642
3643 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
3644                                     SDValue V1, SelectionDAG &DAG) {
3645   switch(Opc) {
3646   default: llvm_unreachable("Unknown x86 shuffle node");
3647   case X86ISD::MOVSHDUP:
3648   case X86ISD::MOVSLDUP:
3649   case X86ISD::MOVDDUP:
3650     return DAG.getNode(Opc, dl, VT, V1);
3651   }
3652 }
3653
3654 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
3655                                     SDValue V1, unsigned TargetMask,
3656                                     SelectionDAG &DAG) {
3657   switch(Opc) {
3658   default: llvm_unreachable("Unknown x86 shuffle node");
3659   case X86ISD::PSHUFD:
3660   case X86ISD::PSHUFHW:
3661   case X86ISD::PSHUFLW:
3662   case X86ISD::VPERMILPI:
3663   case X86ISD::VPERMI:
3664     return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8));
3665   }
3666 }
3667
3668 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
3669                                     SDValue V1, SDValue V2, unsigned TargetMask,
3670                                     SelectionDAG &DAG) {
3671   switch(Opc) {
3672   default: llvm_unreachable("Unknown x86 shuffle node");
3673   case X86ISD::PALIGNR:
3674   case X86ISD::VALIGN:
3675   case X86ISD::SHUFP:
3676   case X86ISD::VPERM2X128:
3677     return DAG.getNode(Opc, dl, VT, V1, V2,
3678                        DAG.getConstant(TargetMask, MVT::i8));
3679   }
3680 }
3681
3682 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
3683                                     SDValue V1, SDValue V2, SelectionDAG &DAG) {
3684   switch(Opc) {
3685   default: llvm_unreachable("Unknown x86 shuffle node");
3686   case X86ISD::MOVLHPS:
3687   case X86ISD::MOVLHPD:
3688   case X86ISD::MOVHLPS:
3689   case X86ISD::MOVLPS:
3690   case X86ISD::MOVLPD:
3691   case X86ISD::MOVSS:
3692   case X86ISD::MOVSD:
3693   case X86ISD::UNPCKL:
3694   case X86ISD::UNPCKH:
3695     return DAG.getNode(Opc, dl, VT, V1, V2);
3696   }
3697 }
3698
3699 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
3700   MachineFunction &MF = DAG.getMachineFunction();
3701   const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
3702       DAG.getSubtarget().getRegisterInfo());
3703   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
3704   int ReturnAddrIndex = FuncInfo->getRAIndex();
3705
3706   if (ReturnAddrIndex == 0) {
3707     // Set up a frame object for the return address.
3708     unsigned SlotSize = RegInfo->getSlotSize();
3709     ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize,
3710                                                            -(int64_t)SlotSize,
3711                                                            false);
3712     FuncInfo->setRAIndex(ReturnAddrIndex);
3713   }
3714
3715   return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
3716 }
3717
3718 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
3719                                        bool hasSymbolicDisplacement) {
3720   // Offset should fit into 32 bit immediate field.
3721   if (!isInt<32>(Offset))
3722     return false;
3723
3724   // If we don't have a symbolic displacement - we don't have any extra
3725   // restrictions.
3726   if (!hasSymbolicDisplacement)
3727     return true;
3728
3729   // FIXME: Some tweaks might be needed for medium code model.
3730   if (M != CodeModel::Small && M != CodeModel::Kernel)
3731     return false;
3732
3733   // For small code model we assume that latest object is 16MB before end of 31
3734   // bits boundary. We may also accept pretty large negative constants knowing
3735   // that all objects are in the positive half of address space.
3736   if (M == CodeModel::Small && Offset < 16*1024*1024)
3737     return true;
3738
3739   // For kernel code model we know that all object resist in the negative half
3740   // of 32bits address space. We may not accept negative offsets, since they may
3741   // be just off and we may accept pretty large positive ones.
3742   if (M == CodeModel::Kernel && Offset >= 0)
3743     return true;
3744
3745   return false;
3746 }
3747
3748 /// isCalleePop - Determines whether the callee is required to pop its
3749 /// own arguments. Callee pop is necessary to support tail calls.
3750 bool X86::isCalleePop(CallingConv::ID CallingConv,
3751                       bool is64Bit, bool IsVarArg, bool TailCallOpt) {
3752   switch (CallingConv) {
3753   default:
3754     return false;
3755   case CallingConv::X86_StdCall:
3756   case CallingConv::X86_FastCall:
3757   case CallingConv::X86_ThisCall:
3758     return !is64Bit;
3759   case CallingConv::Fast:
3760   case CallingConv::GHC:
3761   case CallingConv::HiPE:
3762     if (IsVarArg)
3763       return false;
3764     return TailCallOpt;
3765   }
3766 }
3767
3768 /// \brief Return true if the condition is an unsigned comparison operation.
3769 static bool isX86CCUnsigned(unsigned X86CC) {
3770   switch (X86CC) {
3771   default: llvm_unreachable("Invalid integer condition!");
3772   case X86::COND_E:     return true;
3773   case X86::COND_G:     return false;
3774   case X86::COND_GE:    return false;
3775   case X86::COND_L:     return false;
3776   case X86::COND_LE:    return false;
3777   case X86::COND_NE:    return true;
3778   case X86::COND_B:     return true;
3779   case X86::COND_A:     return true;
3780   case X86::COND_BE:    return true;
3781   case X86::COND_AE:    return true;
3782   }
3783   llvm_unreachable("covered switch fell through?!");
3784 }
3785
/// TranslateX86CC - do a one to one translation of a ISD::CondCode to the X86
/// specific condition code, returning the condition code and the LHS/RHS of the
/// comparison to make.
///
/// \param SetCCOpcode  the ISD condition code to translate.
/// \param isFP         true for a floating point comparison, false for an
///                     integer comparison.
/// \param LHS, RHS     the comparison operands, passed by reference. The
///                     integer path may replace RHS with the constant 0; the
///                     floating point path may swap the two operands.
/// \param DAG          used to create the replacement constant.
/// \returns the X86 condition code. On the floating point path SETOEQ and
/// SETUNE yield X86::COND_INVALID.
static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
                               SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
  if (!isFP) {
    // Integer comparisons against a few specific constants are rewritten so
    // that the sign flag or a comparison against zero can be used instead.
    if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
      if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
        // X > -1   -> compare X with 0, jump on !sign.
        RHS = DAG.getConstant(0, RHS.getValueType());
        return X86::COND_NS;
      }
      if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
        // X < 0   -> compare X with 0 (RHS is already 0), jump on sign.
        return X86::COND_S;
      }
      if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
        // X < 1   -> X <= 0
        RHS = DAG.getConstant(0, RHS.getValueType());
        return X86::COND_LE;
      }
    }

    // Plain one-to-one mapping for the remaining integer conditions.
    switch (SetCCOpcode) {
    default: llvm_unreachable("Invalid integer condition!");
    case ISD::SETEQ:  return X86::COND_E;
    case ISD::SETGT:  return X86::COND_G;
    case ISD::SETGE:  return X86::COND_GE;
    case ISD::SETLT:  return X86::COND_L;
    case ISD::SETLE:  return X86::COND_LE;
    case ISD::SETNE:  return X86::COND_NE;
    case ISD::SETULT: return X86::COND_B;
    case ISD::SETUGT: return X86::COND_A;
    case ISD::SETULE: return X86::COND_BE;
    case ISD::SETUGE: return X86::COND_AE;
    }
  }

  // Floating point comparison from here on.
  // First determine if it is required or is profitable to flip the operands.

  // If LHS is a foldable load, but RHS is not, flip the condition.
  if (ISD::isNON_EXTLoad(LHS.getNode()) &&
      !ISD::isNON_EXTLoad(RHS.getNode())) {
    SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
    std::swap(LHS, RHS);
  }

  // These four conditions are handled by swapping the operands (again, if
  // they were swapped above) and then using the condition code of the
  // mirrored comparison; see the cases marked "flipped" below.
  switch (SetCCOpcode) {
  default: break;
  case ISD::SETOLT:
  case ISD::SETOLE:
  case ISD::SETUGT:
  case ISD::SETUGE:
    std::swap(LHS, RHS);
    break;
  }

  // On a floating point condition, the flags are set as follows:
  // ZF  PF  CF   op
  //  0 | 0 | 0 | X > Y
  //  0 | 0 | 1 | X < Y
  //  1 | 0 | 0 | X == Y
  //  1 | 1 | 1 | unordered
  switch (SetCCOpcode) {
  default: llvm_unreachable("Condcode should be pre-legalized away");
  case ISD::SETUEQ:
  case ISD::SETEQ:   return X86::COND_E;
  case ISD::SETOLT:              // flipped
  case ISD::SETOGT:
  case ISD::SETGT:   return X86::COND_A;
  case ISD::SETOLE:              // flipped
  case ISD::SETOGE:
  case ISD::SETGE:   return X86::COND_AE;
  case ISD::SETUGT:              // flipped
  case ISD::SETULT:
  case ISD::SETLT:   return X86::COND_B;
  case ISD::SETUGE:              // flipped
  case ISD::SETULE:
  case ISD::SETLE:   return X86::COND_BE;
  case ISD::SETONE:
  case ISD::SETNE:   return X86::COND_NE;
  case ISD::SETUO:   return X86::COND_P;
  case ISD::SETO:    return X86::COND_NP;
  // No single condition code is returned for these two.
  case ISD::SETOEQ:
  case ISD::SETUNE:  return X86::COND_INVALID;
  }
}
3873
3874 /// hasFPCMov - is there a floating point cmov for the specific X86 condition
3875 /// code. Current x86 isa includes the following FP cmov instructions:
3876 /// fcmovb, fcomvbe, fcomve, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
3877 static bool hasFPCMov(unsigned X86CC) {
3878   switch (X86CC) {
3879   default:
3880     return false;
3881   case X86::COND_B:
3882   case X86::COND_BE:
3883   case X86::COND_E:
3884   case X86::COND_P:
3885   case X86::COND_A:
3886   case X86::COND_AE:
3887   case X86::COND_NE:
3888   case X86::COND_NP:
3889     return true;
3890   }
3891 }
3892
3893 /// isFPImmLegal - Returns true if the target can instruction select the
3894 /// specified FP immediate natively. If false, the legalizer will
3895 /// materialize the FP immediate as a load from a constant pool.
3896 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
3897   for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
3898     if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
3899       return true;
3900   }
3901   return false;
3902 }
3903
3904 bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
3905                                               ISD::LoadExtType ExtTy,
3906                                               EVT NewVT) const {
3907   // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
3908   // relocation target a movq or addq instruction: don't let the load shrink.
3909   SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
3910   if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
3911     if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
3912       return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
3913   return true;
3914 }
3915
3916 /// \brief Returns true if it is beneficial to convert a load of a constant
3917 /// to just the constant itself.
3918 bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
3919                                                           Type *Ty) const {
3920   assert(Ty->isIntegerTy());
3921
3922   unsigned BitSize = Ty->getPrimitiveSizeInBits();
3923   if (BitSize == 0 || BitSize > 64)
3924     return false;
3925   return true;
3926 }
3927
3928 bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT,
3929                                                 unsigned Index) const {
3930   if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
3931     return false;
3932
3933   return (Index == 0 || Index == ResVT.getVectorNumElements());
3934 }
3935
3936 bool X86TargetLowering::isCheapToSpeculateCttz() const {
3937   // Speculate cttz only if we can directly use TZCNT.
3938   return Subtarget->hasBMI();
3939 }
3940
3941 bool X86TargetLowering::isCheapToSpeculateCtlz() const {
3942   // Speculate ctlz only if we can directly use LZCNT.
3943   return Subtarget->hasLZCNT();
3944 }
3945
/// isUndefOrInRange - Return true if Val is undef (negative) or if its value
/// falls within the half-open range [Low, Hi).
static bool isUndefOrInRange(int Val, int Low, int Hi) {
  if (Val < 0)
    return true;
  return Low <= Val && Val < Hi;
}
3951
/// isUndefOrEqual - Return true if Val is undef (less than zero) or equal to
/// CmpVal.
static bool isUndefOrEqual(int Val, int CmpVal) {
  if (Val < 0)
    return true;
  return Val == CmpVal;
}
3957
3958 /// isSequentialOrUndefInRange - Return true if every element in Mask, beginning
3959 /// from position Pos and ending in Pos+Size, falls within the specified
3960 /// sequential range (Low, Low+Size]. or is undef.
3961 static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
3962                                        unsigned Pos, unsigned Size, int Low) {
3963   for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
3964     if (!isUndefOrEqual(Mask[i], Low))
3965       return false;
3966   return true;
3967 }
3968
3969 /// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
3970 /// is suitable for input to PSHUFD. That is, it doesn't reference the other
3971 /// operand - by default will match for first operand.
3972 static bool isPSHUFDMask(ArrayRef<int> Mask, MVT VT,
3973                          bool TestSecondOperand = false) {
3974   if (VT != MVT::v4f32 && VT != MVT::v4i32 &&
3975       VT != MVT::v2f64 && VT != MVT::v2i64)
3976     return false;
3977
3978   unsigned NumElems = VT.getVectorNumElements();
3979   unsigned Lo = TestSecondOperand ? NumElems : 0;
3980   unsigned Hi = Lo + NumElems;
3981
3982   for (unsigned i = 0; i < NumElems; ++i)
3983     if (!isUndefOrInRange(Mask[i], (int)Lo, (int)Hi))
3984       return false;
3985
3986   return true;
3987 }
3988
3989 /// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that
3990 /// is suitable for input to PSHUFHW.
3991 static bool isPSHUFHWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
3992   if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
3993     return false;
3994
3995   // Lower quadword copied in order or undef.
3996   if (!isSequentialOrUndefInRange(Mask, 0, 4, 0))
3997     return false;
3998
3999   // Upper quadword shuffled.
4000   for (unsigned i = 4; i != 8; ++i)
4001     if (!isUndefOrInRange(Mask[i], 4, 8))
4002       return false;
4003
4004   if (VT == MVT::v16i16) {
4005     // Lower quadword copied in order or undef.
4006     if (!isSequentialOrUndefInRange(Mask, 8, 4, 8))
4007       return false;
4008
4009     // Upper quadword shuffled.
4010     for (unsigned i = 12; i != 16; ++i)
4011       if (!isUndefOrInRange(Mask[i], 12, 16))
4012         return false;
4013   }
4014
4015   return true;
4016 }
4017
4018 /// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that
4019 /// is suitable for input to PSHUFLW.
4020 static bool isPSHUFLWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
4021   if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
4022     return false;
4023
4024   // Upper quadword copied in order.
4025   if (!isSequentialOrUndefInRange(Mask, 4, 4, 4))
4026     return false;
4027
4028   // Lower quadword shuffled.
4029   for (unsigned i = 0; i != 4; ++i)
4030     if (!isUndefOrInRange(Mask[i], 0, 4))
4031       return false;
4032
4033   if (VT == MVT::v16i16) {
4034     // Upper quadword copied in order.
4035     if (!isSequentialOrUndefInRange(Mask, 12, 4, 12))
4036       return false;
4037
4038     // Lower quadword shuffled.
4039     for (unsigned i = 8; i != 12; ++i)
4040       if (!isUndefOrInRange(Mask[i], 8, 12))
4041         return false;
4042   }
4043
4044   return true;
4045 }
4046
/// \brief Return true if the mask specifies a shuffle of elements that is
/// suitable for input to intralane (palignr) or interlane (valign) vector
/// right-shift.
///
/// \param Mask       shuffle mask; negative entries are undef, entries in
///                   [0, NumElts) refer to the first source and entries in
///                   [NumElts, 2*NumElts) to the second source.
/// \param VT         vector type being shuffled.
/// \param InterLane  when true the whole vector is treated as a single lane;
///                   otherwise the vector is split into 128-bit lanes.
static bool isAlignrMask(ArrayRef<int> Mask, MVT VT, bool InterLane) {
  unsigned NumElts = VT.getVectorNumElements();
  unsigned NumLanes = InterLane ? 1: VT.getSizeInBits()/128;
  unsigned NumLaneElts = NumElts/NumLanes;

  // Do not handle 64-bit element shuffles with palignr.
  // (Any configuration with exactly two elements per lane is rejected.)
  if (NumLaneElts == 2)
    return false;

  // l is the index of the first element of the lane being checked.
  for (unsigned l = 0; l != NumElts; l+=NumLaneElts) {
    // Find the first defined (non-undef) element of this lane.
    unsigned i;
    for (i = 0; i != NumLaneElts; ++i) {
      if (Mask[i+l] >= 0)
        break;
    }

    // Lane is all undef, go to next lane
    if (i == NumLaneElts)
      continue;

    int Start = Mask[i+l];

    // Make sure its in this lane in one of the sources
    if (!isUndefOrInRange(Start, l, l+NumLaneElts) &&
        !isUndefOrInRange(Start, l+NumElts, l+NumElts+NumLaneElts))
      return false;

    // If not lane 0, then we must match lane 0
    // (compared against lane 0's entry at the same position, offset by l, and
    // only when that entry is defined).
    if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Start, Mask[i]+l))
      return false;

    // Correct second source to be contiguous with first source
    // (an index into this lane of the second source becomes an index that
    // directly follows this lane of the first source).
    if (Start >= (int)NumElts)
      Start -= NumElts - NumLaneElts;

    // Make sure we're shifting in the right direction.
    // (the source index must be strictly greater than the destination index).
    if (Start <= (int)(i+l))
      return false;

    // Start now holds the index that element 0 of this lane would select, so
    // that element i is expected to select Start+i.
    Start -= i;

    // Check the rest of the elements to see if they are consecutive.
    for (++i; i != NumLaneElts; ++i) {
      int Idx = Mask[i+l];

      // Make sure its in this lane
      if (!isUndefOrInRange(Idx, l, l+NumLaneElts) &&
          !isUndefOrInRange(Idx, l+NumElts, l+NumElts+NumLaneElts))
        return false;

      // If not lane 0, then we must match lane 0
      if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Idx, Mask[i]+l))
        return false;

      // Same second-source correction as applied to Start above.
      if (Idx >= (int)NumElts)
        Idx -= NumElts - NumLaneElts;

      if (!isUndefOrEqual(Idx, Start+i))
        return false;

    }
  }

  return true;
}
4115
4116 /// \brief Return true if the node specifies a shuffle of elements that is
4117 /// suitable for input to PALIGNR.
4118 static bool isPALIGNRMask(ArrayRef<int> Mask, MVT VT,
4119                           const X86Subtarget *Subtarget) {
4120   if ((VT.is128BitVector() && !Subtarget->hasSSSE3()) ||
4121       (VT.is256BitVector() && !Subtarget->hasInt256()) ||
4122       VT.is512BitVector())
4123     // FIXME: Add AVX512BW.
4124     return false;
4125
4126   return isAlignrMask(Mask, VT, false);
4127 }
4128
4129 /// \brief Return true if the node specifies a shuffle of elements that is
4130 /// suitable for input to VALIGN.
4131 static bool isVALIGNMask(ArrayRef<int> Mask, MVT VT,
4132                           const X86Subtarget *Subtarget) {
4133   // FIXME: Add AVX512VL.
4134   if (!VT.is512BitVector() || !Subtarget->hasAVX512())
4135     return false;
4136   return isAlignrMask(Mask, VT, true);
4137 }
4138
4139 /// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
4140 /// the two vector operands have swapped position.
4141 static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask,
4142                                      unsigned NumElems) {
4143   for (unsigned i = 0; i != NumElems; ++i) {
4144     int idx = Mask[i];
4145     if (idx < 0)
4146       continue;
4147     else if (idx < (int)NumElems)
4148       Mask[i] = idx + NumElems;
4149     else
4150       Mask[i] = idx - NumElems;
4151   }
4152 }
4153
4154 /// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
4155 /// specifies a shuffle of elements that is suitable for input to 128/256-bit
4156 /// SHUFPS and SHUFPD. If Commuted is true, then it checks for sources to be
4157 /// reverse of what x86 shuffles want.
4158 static bool isSHUFPMask(ArrayRef<int> Mask, MVT VT, bool Commuted = false) {
4159
4160   unsigned NumElems = VT.getVectorNumElements();
4161   unsigned NumLanes = VT.getSizeInBits()/128;
4162   unsigned NumLaneElems = NumElems/NumLanes;
4163
4164   if (NumLaneElems != 2 && NumLaneElems != 4)
4165     return false;
4166
4167   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
4168   bool symetricMaskRequired =
4169     (VT.getSizeInBits() >= 256) && (EltSize == 32);
4170
4171   // VSHUFPSY divides the resulting vector into 4 chunks.
4172   // The sources are also splitted into 4 chunks, and each destination
4173   // chunk must come from a different source chunk.
4174   //
4175   //  SRC1 =>   X7    X6    X5    X4    X3    X2    X1    X0
4176   //  SRC2 =>   Y7    Y6    Y5    Y4    Y3    Y2    Y1    Y9
4177   //
4178   //  DST  =>  Y7..Y4,   Y7..Y4,   X7..X4,   X7..X4,
4179   //           Y3..Y0,   Y3..Y0,   X3..X0,   X3..X0
4180   //
4181   // VSHUFPDY divides the resulting vector into 4 chunks.
4182   // The sources are also splitted into 4 chunks, and each destination
4183   // chunk must come from a different source chunk.
4184   //
4185   //  SRC1 =>      X3       X2       X1       X0
4186   //  SRC2 =>      Y3       Y2       Y1       Y0
4187   //
4188   //  DST  =>  Y3..Y2,  X3..X2,  Y1..Y0,  X1..X0
4189   //
4190   SmallVector<int, 4> MaskVal(NumLaneElems, -1);
4191   unsigned HalfLaneElems = NumLaneElems/2;
4192   for (unsigned l = 0; l != NumElems; l += NumLaneElems) {
4193     for (unsigned i = 0; i != NumLaneElems; ++i) {
4194       int Idx = Mask[i+l];
4195       unsigned RngStart = l + ((Commuted == (i<HalfLaneElems)) ? NumElems : 0);
4196       if (!isUndefOrInRange(Idx, RngStart, RngStart+NumLaneElems))
4197         return false;
4198       // For VSHUFPSY, the mask of the second half must be the same as the
4199       // first but with the appropriate offsets. This works in the same way as
4200       // VPERMILPS works with masks.
4201       if (!symetricMaskRequired || Idx < 0)
4202         continue;
4203       if (MaskVal[i] < 0) {
4204         MaskVal[i] = Idx - l;
4205         continue;
4206       }
4207       if ((signed)(Idx - l) != MaskVal[i])
4208         return false;
4209     }
4210   }
4211
4212   return true;
4213 }
4214
4215 /// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
4216 /// specifies a shuffle of elements that is suitable for input to MOVHLPS.
4217 static bool isMOVHLPSMask(ArrayRef<int> Mask, MVT VT) {
4218   if (!VT.is128BitVector())
4219     return false;
4220
4221   unsigned NumElems = VT.getVectorNumElements();
4222
4223   if (NumElems != 4)
4224     return false;
4225
4226   // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3
4227   return isUndefOrEqual(Mask[0], 6) &&
4228          isUndefOrEqual(Mask[1], 7) &&
4229          isUndefOrEqual(Mask[2], 2) &&
4230          isUndefOrEqual(Mask[3], 3);
4231 }
4232
4233 /// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
4234 /// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
4235 /// <2, 3, 2, 3>
4236 static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, MVT VT) {
4237   if (!VT.is128BitVector())
4238     return false;
4239
4240   unsigned NumElems = VT.getVectorNumElements();
4241
4242   if (NumElems != 4)
4243     return false;
4244
4245   return isUndefOrEqual(Mask[0], 2) &&
4246          isUndefOrEqual(Mask[1], 3) &&
4247          isUndefOrEqual(Mask[2], 2) &&
4248          isUndefOrEqual(Mask[3], 3);
4249 }
4250
4251 /// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
4252 /// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
4253 static bool isMOVLPMask(ArrayRef<int> Mask, MVT VT) {
4254   if (!VT.is128BitVector())
4255     return false;
4256
4257   unsigned NumElems = VT.getVectorNumElements();
4258
4259   if (NumElems != 2 && NumElems != 4)
4260     return false;
4261
4262   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
4263     if (!isUndefOrEqual(Mask[i], i + NumElems))
4264       return false;
4265
4266   for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
4267     if (!isUndefOrEqual(Mask[i], i))
4268       return false;
4269
4270   return true;
4271 }
4272
4273 /// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand
4274 /// specifies a shuffle of elements that is suitable for input to MOVLHPS.
4275 static bool isMOVLHPSMask(ArrayRef<int> Mask, MVT VT) {
4276   if (!VT.is128BitVector())
4277     return false;
4278
4279   unsigned NumElems = VT.getVectorNumElements();
4280
4281   if (NumElems != 2 && NumElems != 4)
4282     return false;
4283
4284   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
4285     if (!isUndefOrEqual(Mask[i], i))
4286       return false;
4287
4288   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
4289     if (!isUndefOrEqual(Mask[i + e], i + NumElems))
4290       return false;
4291
4292   return true;
4293 }
4294
4295 /// isINSERTPSMask - Return true if the specified VECTOR_SHUFFLE operand
4296 /// specifies a shuffle of elements that is suitable for input to INSERTPS.
4297 /// i. e: If all but one element come from the same vector.
4298 static bool isINSERTPSMask(ArrayRef<int> Mask, MVT VT) {
4299   // TODO: Deal with AVX's VINSERTPS
4300   if (!VT.is128BitVector() || (VT != MVT::v4f32 && VT != MVT::v4i32))
4301     return false;
4302
4303   unsigned CorrectPosV1 = 0;
4304   unsigned CorrectPosV2 = 0;
4305   for (int i = 0, e = (int)VT.getVectorNumElements(); i != e; ++i) {
4306     if (Mask[i] == -1) {
4307       ++CorrectPosV1;
4308       ++CorrectPosV2;
4309       continue;
4310     }
4311
4312     if (Mask[i] == i)
4313       ++CorrectPosV1;
4314     else if (Mask[i] == i + 4)
4315       ++CorrectPosV2;
4316   }
4317
4318   if (CorrectPosV1 == 3 || CorrectPosV2 == 3)
4319     // We have 3 elements (undefs count as elements from any vector) from one
4320     // vector, and one from another.
4321     return true;
4322
4323   return false;
4324 }
4325
4326 //
4327 // Some special combinations that can be optimized.
4328 //
4329 static
4330 SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp,
4331                                SelectionDAG &DAG) {
4332   MVT VT = SVOp->getSimpleValueType(0);
4333   SDLoc dl(SVOp);
4334
4335   if (VT != MVT::v8i32 && VT != MVT::v8f32)
4336     return SDValue();
4337
4338   ArrayRef<int> Mask = SVOp->getMask();
4339
4340   // These are the special masks that may be optimized.
4341   static const int MaskToOptimizeEven[] = {0, 8, 2, 10, 4, 12, 6, 14};
4342   static const int MaskToOptimizeOdd[]  = {1, 9, 3, 11, 5, 13, 7, 15};
4343   bool MatchEvenMask = true;
4344   bool MatchOddMask  = true;
4345   for (int i=0; i<8; ++i) {
4346     if (!isUndefOrEqual(Mask[i], MaskToOptimizeEven[i]))
4347       MatchEvenMask = false;
4348     if (!isUndefOrEqual(Mask[i], MaskToOptimizeOdd[i]))
4349       MatchOddMask = false;
4350   }
4351
4352   if (!MatchEvenMask && !MatchOddMask)
4353     return SDValue();
4354
4355   SDValue UndefNode = DAG.getNode(ISD::UNDEF, dl, VT);
4356
4357   SDValue Op0 = SVOp->getOperand(0);
4358   SDValue Op1 = SVOp->getOperand(1);
4359
4360   if (MatchEvenMask) {
4361     // Shift the second operand right to 32 bits.
4362     static const int ShiftRightMask[] = {-1, 0, -1, 2, -1, 4, -1, 6 };
4363     Op1 = DAG.getVectorShuffle(VT, dl, Op1, UndefNode, ShiftRightMask);
4364   } else {
4365     // Shift the first operand left to 32 bits.
4366     static const int ShiftLeftMask[] = {1, -1, 3, -1, 5, -1, 7, -1 };
4367     Op0 = DAG.getVectorShuffle(VT, dl, Op0, UndefNode, ShiftLeftMask);
4368   }
4369   static const int BlendMask[] = {0, 9, 2, 11, 4, 13, 6, 15};
4370   return DAG.getVectorShuffle(VT, dl, Op0, Op1, BlendMask);
4371 }
4372
4373 /// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
4374 /// specifies a shuffle of elements that is suitable for input to UNPCKL.
4375 static bool isUNPCKLMask(ArrayRef<int> Mask, MVT VT,
4376                          bool HasInt256, bool V2IsSplat = false) {
4377
4378   assert(VT.getSizeInBits() >= 128 &&
4379          "Unsupported vector type for unpckl");
4380
4381   unsigned NumElts = VT.getVectorNumElements();
4382   if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
4383       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
4384     return false;
4385
4386   assert((!VT.is512BitVector() || VT.getScalarType().getSizeInBits() >= 32) &&
4387          "Unsupported vector type for unpckh");
4388
4389   // AVX defines UNPCK* to operate independently on 128-bit lanes.
4390   unsigned NumLanes = VT.getSizeInBits()/128;
4391   unsigned NumLaneElts = NumElts/NumLanes;
4392
4393   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
4394     for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
4395       int BitI  = Mask[l+i];
4396       int BitI1 = Mask[l+i+1];
4397       if (!isUndefOrEqual(BitI, j))
4398         return false;
4399       if (V2IsSplat) {
4400         if (!isUndefOrEqual(BitI1, NumElts))
4401           return false;
4402       } else {
4403         if (!isUndefOrEqual(BitI1, j + NumElts))
4404           return false;
4405       }
4406     }
4407   }
4408
4409   return true;
4410 }
4411
4412 /// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
4413 /// specifies a shuffle of elements that is suitable for input to UNPCKH.
4414 static bool isUNPCKHMask(ArrayRef<int> Mask, MVT VT,
4415                          bool HasInt256, bool V2IsSplat = false) {
4416   assert(VT.getSizeInBits() >= 128 &&
4417          "Unsupported vector type for unpckh");
4418
4419   unsigned NumElts = VT.getVectorNumElements();
4420   if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
4421       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
4422     return false;
4423
4424   assert((!VT.is512BitVector() || VT.getScalarType().getSizeInBits() >= 32) &&
4425          "Unsupported vector type for unpckh");
4426
4427   // AVX defines UNPCK* to operate independently on 128-bit lanes.
4428   unsigned NumLanes = VT.getSizeInBits()/128;
4429   unsigned NumLaneElts = NumElts/NumLanes;
4430
4431   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
4432     for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
4433       int BitI  = Mask[l+i];
4434       int BitI1 = Mask[l+i+1];
4435       if (!isUndefOrEqual(BitI, j))
4436         return false;
4437       if (V2IsSplat) {
4438         if (isUndefOrEqual(BitI1, NumElts))
4439           return false;
4440       } else {
4441         if (!isUndefOrEqual(BitI1, j+NumElts))
4442           return false;
4443       }
4444     }
4445   }
4446   return true;
4447 }
4448
4449 /// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
4450 /// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
4451 /// <0, 0, 1, 1>
4452 static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
4453   unsigned NumElts = VT.getVectorNumElements();
4454   bool Is256BitVec = VT.is256BitVector();
4455
4456   if (VT.is512BitVector())
4457     return false;
4458   assert((VT.is128BitVector() || VT.is256BitVector()) &&
4459          "Unsupported vector type for unpckh");
4460
4461   if (Is256BitVec && NumElts != 4 && NumElts != 8 &&
4462       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
4463     return false;
4464
4465   // For 256-bit i64/f64, use MOVDDUPY instead, so reject the matching pattern
4466   // FIXME: Need a better way to get rid of this, there's no latency difference
4467   // between UNPCKLPD and MOVDDUP, the later should always be checked first and
4468   // the former later. We should also remove the "_undef" special mask.
4469   if (NumElts == 4 && Is256BitVec)
4470     return false;
4471
4472   // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
4473   // independently on 128-bit lanes.
4474   unsigned NumLanes = VT.getSizeInBits()/128;
4475   unsigned NumLaneElts = NumElts/NumLanes;
4476
4477   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
4478     for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
4479       int BitI  = Mask[l+i];
4480       int BitI1 = Mask[l+i+1];
4481
4482       if (!isUndefOrEqual(BitI, j))
4483         return false;
4484       if (!isUndefOrEqual(BitI1, j))
4485         return false;
4486     }
4487   }
4488
4489   return true;
4490 }
4491
4492 /// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
4493 /// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
4494 /// <2, 2, 3, 3>
4495 static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
4496   unsigned NumElts = VT.getVectorNumElements();
4497
4498   if (VT.is512BitVector())
4499     return false;
4500
4501   assert((VT.is128BitVector() || VT.is256BitVector()) &&
4502          "Unsupported vector type for unpckh");
4503
4504   if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
4505       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
4506     return false;
4507
4508   // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
4509   // independently on 128-bit lanes.
4510   unsigned NumLanes = VT.getSizeInBits()/128;
4511   unsigned NumLaneElts = NumElts/NumLanes;
4512
4513   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
4514     for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
4515       int BitI  = Mask[l+i];
4516       int BitI1 = Mask[l+i+1];
4517       if (!isUndefOrEqual(BitI, j))
4518         return false;
4519       if (!isUndefOrEqual(BitI1, j))
4520         return false;
4521     }
4522   }
4523   return true;
4524 }
4525
4526 // Match for INSERTI64x4 INSERTF64x4 instructions (src0[0], src1[0]) or
4527 // (src1[0], src0[1]), manipulation with 256-bit sub-vectors
4528 static bool isINSERT64x4Mask(ArrayRef<int> Mask, MVT VT, unsigned int *Imm) {
4529   if (!VT.is512BitVector())
4530     return false;
4531
4532   unsigned NumElts = VT.getVectorNumElements();
4533   unsigned HalfSize = NumElts/2;
4534   if (isSequentialOrUndefInRange(Mask, 0, HalfSize, 0)) {
4535     if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, NumElts)) {
4536       *Imm = 1;
4537       return true;
4538     }
4539   }
4540   if (isSequentialOrUndefInRange(Mask, 0, HalfSize, NumElts)) {
4541     if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, HalfSize)) {
4542       *Imm = 0;
4543       return true;
4544     }
4545   }
4546   return false;
4547 }
4548
4549 /// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
4550 /// specifies a shuffle of elements that is suitable for input to MOVSS,
4551 /// MOVSD, and MOVD, i.e. setting the lowest element.
4552 static bool isMOVLMask(ArrayRef<int> Mask, EVT VT) {
4553   if (VT.getVectorElementType().getSizeInBits() < 32)
4554     return false;
4555   if (!VT.is128BitVector())
4556     return false;
4557
4558   unsigned NumElts = VT.getVectorNumElements();
4559
4560   if (!isUndefOrEqual(Mask[0], NumElts))
4561     return false;
4562
4563   for (unsigned i = 1; i != NumElts; ++i)
4564     if (!isUndefOrEqual(Mask[i], i))
4565       return false;
4566
4567   return true;
4568 }
4569
4570 /// isVPERM2X128Mask - Match 256-bit shuffles where the elements are considered
4571 /// as permutations between 128-bit chunks or halves. As an example: this
4572 /// shuffle bellow:
4573 ///   vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15>
4574 /// The first half comes from the second half of V1 and the second half from the
4575 /// the second half of V2.
4576 static bool isVPERM2X128Mask(ArrayRef<int> Mask, MVT VT, bool HasFp256) {
4577   if (!HasFp256 || !VT.is256BitVector())
4578     return false;
4579
4580   // The shuffle result is divided into half A and half B. In total the two
4581   // sources have 4 halves, namely: C, D, E, F. The final values of A and
4582   // B must come from C, D, E or F.
4583   unsigned HalfSize = VT.getVectorNumElements()/2;
4584   bool MatchA = false, MatchB = false;
4585
4586   // Check if A comes from one of C, D, E, F.
4587   for (unsigned Half = 0; Half != 4; ++Half) {
4588     if (isSequentialOrUndefInRange(Mask, 0, HalfSize, Half*HalfSize)) {
4589       MatchA = true;
4590       break;
4591     }
4592   }
4593
4594   // Check if B comes from one of C, D, E, F.
4595   for (unsigned Half = 0; Half != 4; ++Half) {
4596     if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, Half*HalfSize)) {
4597       MatchB = true;
4598       break;
4599     }
4600   }
4601
4602   return MatchA && MatchB;
4603 }
4604
4605 /// getShuffleVPERM2X128Immediate - Return the appropriate immediate to shuffle
4606 /// the specified VECTOR_MASK mask with VPERM2F128/VPERM2I128 instructions.
4607 static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) {
4608   MVT VT = SVOp->getSimpleValueType(0);
4609
4610   unsigned HalfSize = VT.getVectorNumElements()/2;
4611
4612   unsigned FstHalf = 0, SndHalf = 0;
4613   for (unsigned i = 0; i < HalfSize; ++i) {
4614     if (SVOp->getMaskElt(i) > 0) {
4615       FstHalf = SVOp->getMaskElt(i)/HalfSize;
4616       break;
4617     }
4618   }
4619   for (unsigned i = HalfSize; i < HalfSize*2; ++i) {
4620     if (SVOp->getMaskElt(i) > 0) {
4621       SndHalf = SVOp->getMaskElt(i)/HalfSize;
4622       break;
4623     }
4624   }
4625
4626   return (FstHalf | (SndHalf << 4));
4627 }
4628
4629 // Symetric in-lane mask. Each lane has 4 elements (for imm8)
4630 static bool isPermImmMask(ArrayRef<int> Mask, MVT VT, unsigned& Imm8) {
4631   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
4632   if (EltSize < 32)
4633     return false;
4634
4635   unsigned NumElts = VT.getVectorNumElements();
4636   Imm8 = 0;
4637   if (VT.is128BitVector() || (VT.is256BitVector() && EltSize == 64)) {
4638     for (unsigned i = 0; i != NumElts; ++i) {
4639       if (Mask[i] < 0)
4640         continue;
4641       Imm8 |= Mask[i] << (i*2);
4642     }
4643     return true;
4644   }
4645
4646   unsigned LaneSize = 4;
4647   SmallVector<int, 4> MaskVal(LaneSize, -1);
4648
4649   for (unsigned l = 0; l != NumElts; l += LaneSize) {
4650     for (unsigned i = 0; i != LaneSize; ++i) {
4651       if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
4652         return false;
4653       if (Mask[i+l] < 0)
4654         continue;
4655       if (MaskVal[i] < 0) {
4656         MaskVal[i] = Mask[i+l] - l;
4657         Imm8 |= MaskVal[i] << (i*2);
4658         continue;
4659       }
4660       if (Mask[i+l] != (signed)(MaskVal[i]+l))
4661         return false;
4662     }
4663   }
4664   return true;
4665 }
4666
4667 /// isVPERMILPMask - Return true if the specified VECTOR_SHUFFLE operand
4668 /// specifies a shuffle of elements that is suitable for input to VPERMILPD*.
4669 /// Note that VPERMIL mask matching is different depending whether theunderlying
4670 /// type is 32 or 64. In the VPERMILPS the high half of the mask should point
4671 /// to the same elements of the low, but to the higher half of the source.
4672 /// In VPERMILPD the two lanes could be shuffled independently of each other
4673 /// with the same restriction that lanes can't be crossed. Also handles PSHUFDY.
4674 static bool isVPERMILPMask(ArrayRef<int> Mask, MVT VT) {
4675   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
4676   if (VT.getSizeInBits() < 256 || EltSize < 32)
4677     return false;
4678   bool symetricMaskRequired = (EltSize == 32);
4679   unsigned NumElts = VT.getVectorNumElements();
4680
4681   unsigned NumLanes = VT.getSizeInBits()/128;
4682   unsigned LaneSize = NumElts/NumLanes;
4683   // 2 or 4 elements in one lane
4684
4685   SmallVector<int, 4> ExpectedMaskVal(LaneSize, -1);
4686   for (unsigned l = 0; l != NumElts; l += LaneSize) {
4687     for (unsigned i = 0; i != LaneSize; ++i) {
4688       if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
4689         return false;
4690       if (symetricMaskRequired) {
4691         if (ExpectedMaskVal[i] < 0 && Mask[i+l] >= 0) {
4692           ExpectedMaskVal[i] = Mask[i+l] - l;
4693           continue;
4694         }
4695         if (!isUndefOrEqual(Mask[i+l], ExpectedMaskVal[i]+l))
4696           return false;
4697       }
4698     }
4699   }
4700   return true;
4701 }
4702
4703 /// isCommutedMOVLMask - Returns true if the shuffle mask is except the reverse
4704 /// of what x86 movss want. X86 movs requires the lowest  element to be lowest
4705 /// element of vector 2 and the other elements to come from vector 1 in order.
4706 static bool isCommutedMOVLMask(ArrayRef<int> Mask, MVT VT,
4707                                bool V2IsSplat = false, bool V2IsUndef = false) {
4708   if (!VT.is128BitVector())
4709     return false;
4710
4711   unsigned NumOps = VT.getVectorNumElements();
4712   if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
4713     return false;
4714
4715   if (!isUndefOrEqual(Mask[0], 0))
4716     return false;
4717
4718   for (unsigned i = 1; i != NumOps; ++i)
4719     if (!(isUndefOrEqual(Mask[i], i+NumOps) ||
4720           (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) ||
4721           (V2IsSplat && isUndefOrEqual(Mask[i], NumOps))))
4722       return false;
4723
4724   return true;
4725 }
4726
4727 /// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
4728 /// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
4729 /// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7>
4730 static bool isMOVSHDUPMask(ArrayRef<int> Mask, MVT VT,
4731                            const X86Subtarget *Subtarget) {
4732   if (!Subtarget->hasSSE3())
4733     return false;
4734
4735   unsigned NumElems = VT.getVectorNumElements();
4736
4737   if ((VT.is128BitVector() && NumElems != 4) ||
4738       (VT.is256BitVector() && NumElems != 8) ||
4739       (VT.is512BitVector() && NumElems != 16))
4740     return false;
4741
4742   // "i+1" is the value the indexed mask element must have
4743   for (unsigned i = 0; i != NumElems; i += 2)
4744     if (!isUndefOrEqual(Mask[i], i+1) ||
4745         !isUndefOrEqual(Mask[i+1], i+1))
4746       return false;
4747
4748   return true;
4749 }
4750
4751 /// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
4752 /// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
4753 /// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6>
4754 static bool isMOVSLDUPMask(ArrayRef<int> Mask, MVT VT,
4755                            const X86Subtarget *Subtarget) {
4756   if (!Subtarget->hasSSE3())
4757     return false;
4758
4759   unsigned NumElems = VT.getVectorNumElements();
4760
4761   if ((VT.is128BitVector() && NumElems != 4) ||
4762       (VT.is256BitVector() && NumElems != 8) ||
4763       (VT.is512BitVector() && NumElems != 16))
4764     return false;
4765
4766   // "i" is the value the indexed mask element must have
4767   for (unsigned i = 0; i != NumElems; i += 2)
4768     if (!isUndefOrEqual(Mask[i], i) ||
4769         !isUndefOrEqual(Mask[i+1], i))
4770       return false;
4771
4772   return true;
4773 }
4774
4775 /// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand
4776 /// specifies a shuffle of elements that is suitable for input to 256-bit
4777 /// version of MOVDDUP.
4778 static bool isMOVDDUPYMask(ArrayRef<int> Mask, MVT VT, bool HasFp256) {
4779   if (!HasFp256 || !VT.is256BitVector())
4780     return false;
4781
4782   unsigned NumElts = VT.getVectorNumElements();
4783   if (NumElts != 4)
4784     return false;
4785
4786   for (unsigned i = 0; i != NumElts/2; ++i)
4787     if (!isUndefOrEqual(Mask[i], 0))
4788       return false;
4789   for (unsigned i = NumElts/2; i != NumElts; ++i)
4790     if (!isUndefOrEqual(Mask[i], NumElts/2))
4791       return false;
4792   return true;
4793 }
4794
4795 /// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand
4796 /// specifies a shuffle of elements that is suitable for input to 128-bit
4797 /// version of MOVDDUP.
4798 static bool isMOVDDUPMask(ArrayRef<int> Mask, MVT VT) {
4799   if (!VT.is128BitVector())
4800     return false;
4801
4802   unsigned e = VT.getVectorNumElements() / 2;
4803   for (unsigned i = 0; i != e; ++i)
4804     if (!isUndefOrEqual(Mask[i], i))
4805       return false;
4806   for (unsigned i = 0; i != e; ++i)
4807     if (!isUndefOrEqual(Mask[e+i], i))
4808       return false;
4809   return true;
4810 }
4811
4812 /// isVEXTRACTIndex - Return true if the specified
4813 /// EXTRACT_SUBVECTOR operand specifies a vector extract that is
4814 /// suitable for instruction that extract 128 or 256 bit vectors
4815 static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
4816   assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
4817   if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
4818     return false;
4819
4820   // The index should be aligned on a vecWidth-bit boundary.
4821   uint64_t Index =
4822     cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
4823
4824   MVT VT = N->getSimpleValueType(0);
4825   unsigned ElSize = VT.getVectorElementType().getSizeInBits();
4826   bool Result = (Index * ElSize) % vecWidth == 0;
4827
4828   return Result;
4829 }
4830
4831 /// isVINSERTIndex - Return true if the specified INSERT_SUBVECTOR
4832 /// operand specifies a subvector insert that is suitable for input to
4833 /// insertion of 128 or 256-bit subvectors
4834 static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) {
4835   assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
4836   if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
4837     return false;
4838   // The index should be aligned on a vecWidth-bit boundary.
4839   uint64_t Index =
4840     cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
4841
4842   MVT VT = N->getSimpleValueType(0);
4843   unsigned ElSize = VT.getVectorElementType().getSizeInBits();
4844   bool Result = (Index * ElSize) % vecWidth == 0;
4845
4846   return Result;
4847 }
4848
4849 bool X86::isVINSERT128Index(SDNode *N) {
4850   return isVINSERTIndex(N, 128);
4851 }
4852
4853 bool X86::isVINSERT256Index(SDNode *N) {
4854   return isVINSERTIndex(N, 256);
4855 }
4856
4857 bool X86::isVEXTRACT128Index(SDNode *N) {
4858   return isVEXTRACTIndex(N, 128);
4859 }
4860
4861 bool X86::isVEXTRACT256Index(SDNode *N) {
4862   return isVEXTRACTIndex(N, 256);
4863 }
4864
4865 /// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
4866 /// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions.
4867 /// Handles 128-bit and 256-bit.
4868 static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) {
4869   MVT VT = N->getSimpleValueType(0);
4870
4871   assert((VT.getSizeInBits() >= 128) &&
4872          "Unsupported vector type for PSHUF/SHUFP");
4873
4874   // Handle 128 and 256-bit vector lengths. AVX defines PSHUF/SHUFP to operate
4875   // independently on 128-bit lanes.
4876   unsigned NumElts = VT.getVectorNumElements();
4877   unsigned NumLanes = VT.getSizeInBits()/128;
4878   unsigned NumLaneElts = NumElts/NumLanes;
4879
4880   assert((NumLaneElts == 2 || NumLaneElts == 4 || NumLaneElts == 8) &&
4881          "Only supports 2, 4 or 8 elements per lane");
4882
4883   unsigned Shift = (NumLaneElts >= 4) ? 1 : 0;
4884   unsigned Mask = 0;
4885   for (unsigned i = 0; i != NumElts; ++i) {
4886     int Elt = N->getMaskElt(i);
4887     if (Elt < 0) continue;
4888     Elt &= NumLaneElts - 1;
4889     unsigned ShAmt = (i << Shift) % 8;
4890     Mask |= Elt << ShAmt;
4891   }
4892
4893   return Mask;
4894 }
4895
4896 /// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
4897 /// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction.
4898 static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) {
4899   MVT VT = N->getSimpleValueType(0);
4900
4901   assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
4902          "Unsupported vector type for PSHUFHW");
4903
4904   unsigned NumElts = VT.getVectorNumElements();
4905
4906   unsigned Mask = 0;
4907   for (unsigned l = 0; l != NumElts; l += 8) {
4908     // 8 nodes per lane, but we only care about the last 4.
4909     for (unsigned i = 0; i < 4; ++i) {
4910       int Elt = N->getMaskElt(l+i+4);
4911       if (Elt < 0) continue;
4912       Elt &= 0x3; // only 2-bits.
4913       Mask |= Elt << (i * 2);
4914     }
4915   }
4916
4917   return Mask;
4918 }
4919
4920 /// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
4921 /// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction.
4922 static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) {
4923   MVT VT = N->getSimpleValueType(0);
4924
4925   assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
4926          "Unsupported vector type for PSHUFHW");
4927
4928   unsigned NumElts = VT.getVectorNumElements();
4929
4930   unsigned Mask = 0;
4931   for (unsigned l = 0; l != NumElts; l += 8) {
4932     // 8 nodes per lane, but we only care about the first 4.
4933     for (unsigned i = 0; i < 4; ++i) {
4934       int Elt = N->getMaskElt(l+i);
4935       if (Elt < 0) continue;
4936       Elt &= 0x3; // only 2-bits
4937       Mask |= Elt << (i * 2);
4938     }
4939   }
4940
4941   return Mask;
4942 }
4943
4944 /// \brief Return the appropriate immediate to shuffle the specified
4945 /// VECTOR_SHUFFLE mask with the PALIGNR (if InterLane is false) or with
4946 /// VALIGN (if Interlane is true) instructions.
4947 static unsigned getShuffleAlignrImmediate(ShuffleVectorSDNode *SVOp,
4948                                            bool InterLane) {
4949   MVT VT = SVOp->getSimpleValueType(0);
4950   unsigned EltSize = InterLane ? 1 :
4951     VT.getVectorElementType().getSizeInBits() >> 3;
4952
4953   unsigned NumElts = VT.getVectorNumElements();
4954   unsigned NumLanes = VT.is512BitVector() ? 1 : VT.getSizeInBits()/128;
4955   unsigned NumLaneElts = NumElts/NumLanes;
4956
4957   int Val = 0;
4958   unsigned i;
4959   for (i = 0; i != NumElts; ++i) {
4960     Val = SVOp->getMaskElt(i);
4961     if (Val >= 0)
4962       break;
4963   }
4964   if (Val >= (int)NumElts)
4965     Val -= NumElts - NumLaneElts;
4966
4967   assert(Val - i > 0 && "PALIGNR imm should be positive");
4968   return (Val - i) * EltSize;
4969 }
4970
/// \brief Return the appropriate immediate to shuffle the specified
/// VECTOR_SHUFFLE mask with the PALIGNR instruction.
///
/// Thin wrapper: calls getShuffleAlignrImmediate with InterLane = false.
static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) {
  return getShuffleAlignrImmediate(SVOp, false);
}
4976
/// \brief Return the appropriate immediate to shuffle the specified
/// VECTOR_SHUFFLE mask with the VALIGN instruction.
///
/// Thin wrapper: calls getShuffleAlignrImmediate with InterLane = true.
static unsigned getShuffleVALIGNImmediate(ShuffleVectorSDNode *SVOp) {
  return getShuffleAlignrImmediate(SVOp, true);
}
4982
4983
4984 static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
4985   assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
4986   if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
4987     llvm_unreachable("Illegal extract subvector for VEXTRACT");
4988
4989   uint64_t Index =
4990     cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
4991
4992   MVT VecVT = N->getOperand(0).getSimpleValueType();
4993   MVT ElVT = VecVT.getVectorElementType();
4994
4995   unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
4996   return Index / NumElemsPerChunk;
4997 }
4998
4999 static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) {
5000   assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
5001   if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
5002     llvm_unreachable("Illegal insert subvector for VINSERT");
5003
5004   uint64_t Index =
5005     cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
5006
5007   MVT VecVT = N->getSimpleValueType(0);
5008   MVT ElVT = VecVT.getVectorElementType();
5009
5010   unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
5011   return Index / NumElemsPerChunk;
5012 }
5013
/// getExtractVEXTRACT128Immediate - Return the appropriate immediate
/// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128
/// and VEXTRACTI128 instructions.
/// Forwards to getExtractVEXTRACTImmediate with a width of 128.
unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) {
  return getExtractVEXTRACTImmediate(N, 128);
}
5020
/// getExtractVEXTRACT256Immediate - Return the appropriate immediate
/// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF64x4
/// and VEXTRACTI64x4 instructions.
/// Forwards to getExtractVEXTRACTImmediate with a width of 256.
unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) {
  return getExtractVEXTRACTImmediate(N, 256);
}
5027
/// getInsertVINSERT128Immediate - Return the appropriate immediate
/// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128
/// and VINSERTI128 instructions.
/// Forwards to getInsertVINSERTImmediate with a width of 128.
unsigned X86::getInsertVINSERT128Immediate(SDNode *N) {
  return getInsertVINSERTImmediate(N, 128);
}
5034
/// getInsertVINSERT256Immediate - Return the appropriate immediate
/// to insert at the specified INSERT_SUBVECTOR index with VINSERTF64x4
/// and VINSERTI64x4 instructions.
/// Forwards to getInsertVINSERTImmediate with a width of 256.
unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
  return getInsertVINSERTImmediate(N, 256);
}
5041
5042 /// isZero - Returns true if Elt is a constant integer zero
5043 static bool isZero(SDValue V) {
5044   ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
5045   return C && C->isNullValue();
5046 }
5047
5048 /// isZeroNode - Returns true if Elt is a constant zero or a floating point
5049 /// constant +0.0.
5050 bool X86::isZeroNode(SDValue Elt) {
5051   if (isZero(Elt))
5052     return true;
5053   if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Elt))
5054     return CFP->getValueAPF().isPosZero();
5055   return false;
5056 }
5057
5058 /// ShouldXformToMOVHLPS - Return true if the node should be transformed to
5059 /// match movhlps. The lower half elements should come from upper half of
5060 /// V1 (and in order), and the upper half elements should come from the upper
5061 /// half of V2 (and in order).
5062 static bool ShouldXformToMOVHLPS(ArrayRef<int> Mask, MVT VT) {
5063   if (!VT.is128BitVector())
5064     return false;
5065   if (VT.getVectorNumElements() != 4)
5066     return false;
5067   for (unsigned i = 0, e = 2; i != e; ++i)
5068     if (!isUndefOrEqual(Mask[i], i+2))
5069       return false;
5070   for (unsigned i = 2; i != 4; ++i)
5071     if (!isUndefOrEqual(Mask[i], i+4))
5072       return false;
5073   return true;
5074 }
5075
5076 /// isScalarLoadToVector - Returns true if the node is a scalar load that
5077 /// is promoted to a vector. It also returns the LoadSDNode by reference if
5078 /// required.
5079 static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = nullptr) {
5080   if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
5081     return false;
5082   N = N->getOperand(0).getNode();
5083   if (!ISD::isNON_EXTLoad(N))
5084     return false;
5085   if (LD)
5086     *LD = cast<LoadSDNode>(N);
5087   return true;
5088 }
5089
5090 // Test whether the given value is a vector value which will be legalized
5091 // into a load.
5092 static bool WillBeConstantPoolLoad(SDNode *N) {
5093   if (N->getOpcode() != ISD::BUILD_VECTOR)
5094     return false;
5095
5096   // Check for any non-constant elements.
5097   for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
5098     switch (N->getOperand(i).getNode()->getOpcode()) {
5099     case ISD::UNDEF:
5100     case ISD::ConstantFP:
5101     case ISD::Constant:
5102       break;
5103     default:
5104       return false;
5105     }
5106
5107   // Vectors of all-zeros and all-ones are materialized with special
5108   // instructions rather than being loaded.
5109   return !ISD::isBuildVectorAllZeros(N) &&
5110          !ISD::isBuildVectorAllOnes(N);
5111 }
5112
5113 /// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
5114 /// match movlp{s|d}. The lower half elements should come from lower half of
5115 /// V1 (and in order), and the upper half elements should come from the upper
5116 /// half of V2 (and in order). And since V1 will become the source of the
5117 /// MOVLP, it must be either a vector load or a scalar load to vector.
5118 static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
5119                                ArrayRef<int> Mask, MVT VT) {
5120   if (!VT.is128BitVector())
5121     return false;
5122
5123   if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
5124     return false;
5125   // Is V2 is a vector load, don't do this transformation. We will try to use
5126   // load folding shufps op.
5127   if (ISD::isNON_EXTLoad(V2) || WillBeConstantPoolLoad(V2))
5128     return false;
5129
5130   unsigned NumElems = VT.getVectorNumElements();
5131
5132   if (NumElems != 2 && NumElems != 4)
5133     return false;
5134   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
5135     if (!isUndefOrEqual(Mask[i], i))
5136       return false;
5137   for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
5138     if (!isUndefOrEqual(Mask[i], i+NumElems))
5139       return false;
5140   return true;
5141 }
5142
5143 /// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
5144 /// to an zero vector.
5145 /// FIXME: move to dag combiner / method on ShuffleVectorSDNode
5146 static bool isZeroShuffle(ShuffleVectorSDNode *N) {
5147   SDValue V1 = N->getOperand(0);
5148   SDValue V2 = N->getOperand(1);
5149   unsigned NumElems = N->getValueType(0).getVectorNumElements();
5150   for (unsigned i = 0; i != NumElems; ++i) {
5151     int Idx = N->getMaskElt(i);
5152     if (Idx >= (int)NumElems) {
5153       unsigned Opc = V2.getOpcode();
5154       if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode()))
5155         continue;
5156       if (Opc != ISD::BUILD_VECTOR ||
5157           !X86::isZeroNode(V2.getOperand(Idx-NumElems)))
5158         return false;
5159     } else if (Idx >= 0) {
5160       unsigned Opc = V1.getOpcode();
5161       if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode()))
5162         continue;
5163       if (Opc != ISD::BUILD_VECTOR ||
5164           !X86::isZeroNode(V1.getOperand(Idx)))
5165         return false;
5166     }
5167   }
5168   return true;
5169 }
5170
5171 /// getZeroVector - Returns a vector of specified type with all zero elements.
5172 ///
5173 static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
5174                              SelectionDAG &DAG, SDLoc dl) {
5175   assert(VT.isVector() && "Expected a vector type");
5176
5177   // Always build SSE zero vectors as <4 x i32> bitcasted
5178   // to their dest type. This ensures they get CSE'd.
5179   SDValue Vec;
5180   if (VT.is128BitVector()) {  // SSE
5181     if (Subtarget->hasSSE2()) {  // SSE2
5182       SDValue Cst = DAG.getConstant(0, MVT::i32);
5183       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
5184     } else { // SSE1
5185       SDValue Cst = DAG.getConstantFP(+0.0, MVT::f32);
5186       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
5187     }
5188   } else if (VT.is256BitVector()) { // AVX
5189     if (Subtarget->hasInt256()) { // AVX2
5190       SDValue Cst = DAG.getConstant(0, MVT::i32);
5191       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
5192       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
5193     } else {
5194       // 256-bit logic and arithmetic instructions in AVX are all
5195       // floating-point, no support for integer ops. Emit fp zeroed vectors.
5196       SDValue Cst = DAG.getConstantFP(+0.0, MVT::f32);
5197       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
5198       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops);
5199     }
5200   } else if (VT.is512BitVector()) { // AVX-512
5201       SDValue Cst = DAG.getConstant(0, MVT::i32);
5202       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
5203                         Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
5204       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops);
5205   } else if (VT.getScalarType() == MVT::i1) {
5206     assert(VT.getVectorNumElements() <= 16 && "Unexpected vector type");
5207     SDValue Cst = DAG.getConstant(0, MVT::i1);
5208     SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
5209     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
5210   } else
5211     llvm_unreachable("Unexpected vector type");
5212
5213   return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
5214 }
5215
5216 /// getOnesVector - Returns a vector of specified type with all bits set.
5217 /// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
5218 /// no AVX2 supprt, use two <4 x i32> inserted in a <8 x i32> appropriately.
5219 /// Then bitcast to their original type, ensuring they get CSE'd.
5220 static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG,
5221                              SDLoc dl) {
5222   assert(VT.isVector() && "Expected a vector type");
5223
5224   SDValue Cst = DAG.getConstant(~0U, MVT::i32);
5225   SDValue Vec;
5226   if (VT.is256BitVector()) {
5227     if (HasInt256) { // AVX2
5228       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
5229       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
5230     } else { // AVX
5231       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
5232       Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
5233     }
5234   } else if (VT.is128BitVector()) {
5235     Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
5236   } else
5237     llvm_unreachable("Unexpected vector type");
5238
5239   return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
5240 }
5241
5242 /// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements
5243 /// that point to V2 points to its first element.
5244 static void NormalizeMask(SmallVectorImpl<int> &Mask, unsigned NumElems) {
5245   for (unsigned i = 0; i != NumElems; ++i) {
5246     if (Mask[i] > (int)NumElems) {
5247       Mask[i] = NumElems;
5248     }
5249   }
5250 }
5251
5252 /// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd
5253 /// operation of specified width.
5254 static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
5255                        SDValue V2) {
5256   unsigned NumElems = VT.getVectorNumElements();
5257   SmallVector<int, 8> Mask;
5258   Mask.push_back(NumElems);
5259   for (unsigned i = 1; i != NumElems; ++i)
5260     Mask.push_back(i);
5261   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
5262 }
5263
5264 /// getUnpackl - Returns a vector_shuffle node for an unpackl operation.
5265 static SDValue getUnpackl(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
5266                           SDValue V2) {
5267   unsigned NumElems = VT.getVectorNumElements();
5268   SmallVector<int, 8> Mask;
5269   for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
5270     Mask.push_back(i);
5271     Mask.push_back(i + NumElems);
5272   }
5273   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
5274 }
5275
5276 /// getUnpackh - Returns a vector_shuffle node for an unpackh operation.
5277 static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
5278                           SDValue V2) {
5279   unsigned NumElems = VT.getVectorNumElements();
5280   SmallVector<int, 8> Mask;
5281   for (unsigned i = 0, Half = NumElems/2; i != Half; ++i) {
5282     Mask.push_back(i + Half);
5283     Mask.push_back(i + NumElems + Half);
5284   }
5285   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
5286 }
5287
5288 // PromoteSplati8i16 - All i16 and i8 vector types can't be used directly by
5289 // a generic shuffle instruction because the target has no such instructions.
5290 // Generate shuffles which repeat i16 and i8 several times until they can be
5291 // represented by v4f32 and then be manipulated by target suported shuffles.
5292 static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) {
5293   MVT VT = V.getSimpleValueType();
5294   int NumElems = VT.getVectorNumElements();
5295   SDLoc dl(V);
5296
5297   while (NumElems > 4) {
5298     if (EltNo < NumElems/2) {
5299       V = getUnpackl(DAG, dl, VT, V, V);
5300     } else {
5301       V = getUnpackh(DAG, dl, VT, V, V);
5302       EltNo -= NumElems/2;
5303     }
5304     NumElems >>= 1;
5305   }
5306   return V;
5307 }
5308
5309 /// getLegalSplat - Generate a legal splat with supported x86 shuffles
5310 static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
5311   MVT VT = V.getSimpleValueType();
5312   SDLoc dl(V);
5313
5314   if (VT.is128BitVector()) {
5315     V = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V);
5316     int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
5317     V = DAG.getVectorShuffle(MVT::v4f32, dl, V, DAG.getUNDEF(MVT::v4f32),
5318                              &SplatMask[0]);
5319   } else if (VT.is256BitVector()) {
5320     // To use VPERMILPS to splat scalars, the second half of indicies must
5321     // refer to the higher part, which is a duplication of the lower one,
5322     // because VPERMILPS can only handle in-lane permutations.
5323     int SplatMask[8] = { EltNo, EltNo, EltNo, EltNo,
5324                          EltNo+4, EltNo+4, EltNo+4, EltNo+4 };
5325
5326     V = DAG.getNode(ISD::BITCAST, dl, MVT::v8f32, V);
5327     V = DAG.getVectorShuffle(MVT::v8f32, dl, V, DAG.getUNDEF(MVT::v8f32),
5328                              &SplatMask[0]);
5329   } else
5330     llvm_unreachable("Vector size not supported");
5331
5332   return DAG.getNode(ISD::BITCAST, dl, VT, V);
5333 }
5334
5335 /// PromoteSplat - Splat is promoted to target supported vector shuffles.
5336 static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
5337   MVT SrcVT = SV->getSimpleValueType(0);
5338   SDValue V1 = SV->getOperand(0);
5339   SDLoc dl(SV);
5340
5341   int EltNo = SV->getSplatIndex();
5342   int NumElems = SrcVT.getVectorNumElements();
5343   bool Is256BitVec = SrcVT.is256BitVector();
5344
5345   assert(((SrcVT.is128BitVector() && NumElems > 4) || Is256BitVec) &&
5346          "Unknown how to promote splat for type");
5347
5348   // Extract the 128-bit part containing the splat element and update
5349   // the splat element index when it refers to the higher register.
5350   if (Is256BitVec) {
5351     V1 = Extract128BitVector(V1, EltNo, DAG, dl);
5352     if (EltNo >= NumElems/2)
5353       EltNo -= NumElems/2;
5354   }
5355
5356   // All i16 and i8 vector types can't be used directly by a generic shuffle
5357   // instruction because the target has no such instruction. Generate shuffles
5358   // which repeat i16 and i8 several times until they fit in i32, and then can
5359   // be manipulated by target suported shuffles.
5360   MVT EltVT = SrcVT.getVectorElementType();
5361   if (EltVT == MVT::i8 || EltVT == MVT::i16)
5362     V1 = PromoteSplati8i16(V1, DAG, EltNo);
5363
5364   // Recreate the 256-bit vector and place the same 128-bit vector
5365   // into the low and high part. This is necessary because we want
5366   // to use VPERM* to shuffle the vectors
5367   if (Is256BitVec) {
5368     V1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT, V1, V1);
5369   }
5370
5371   return getLegalSplat(DAG, V1, EltNo);
5372 }
5373
5374 /// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
5375 /// vector of zero or undef vector.  This produces a shuffle where the low
5376 /// element of V2 is swizzled into the zero/undef vector, landing at element
5377 /// Idx.  This produces a shuffle mask like 4,1,2,3 (idx=0) or  0,1,2,4 (idx=3).
5378 static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
5379                                            bool IsZero,
5380                                            const X86Subtarget *Subtarget,
5381                                            SelectionDAG &DAG) {
5382   MVT VT = V2.getSimpleValueType();
5383   SDValue V1 = IsZero
5384     ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
5385   unsigned NumElems = VT.getVectorNumElements();
5386   SmallVector<int, 16> MaskVec;
5387   for (unsigned i = 0; i != NumElems; ++i)
5388     // If this is the insertion idx, put the low elt of V2 here.
5389     MaskVec.push_back(i == Idx ? NumElems : i);
5390   return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, &MaskVec[0]);
5391 }
5392
/// getTargetShuffleMask - Calculates the shuffle mask corresponding to the
/// target specific opcode. Returns true if the Mask could be calculated. Sets
/// IsUnary to true if only uses one source. Note that this will set IsUnary for
/// shuffles which use a single input multiple times, and in those cases it will
/// adjust the mask to only have indices within that single input.
///
/// Returns false when the mask cannot be decoded: a PSHUFB whose mask operand
/// is neither an integer BUILD_VECTOR of constants/undefs nor a load from a
/// plain constant-pool entry, a VPERM2X128 whose decoded mask is empty, and
/// the not-yet-implemented MOVLHPD / MOVLPD / MOVLPS opcodes. Any opcode not
/// listed in the switch hits llvm_unreachable.
static bool getTargetShuffleMask(SDNode *N, MVT VT,
                                 SmallVectorImpl<int> &Mask, bool &IsUnary) {
  unsigned NumElems = VT.getVectorNumElements();
  SDValue ImmN;

  IsUnary = false;
  // Set for two-operand shuffles whose two operands are the same node; the
  // mask is remapped to the first input after the switch.
  bool IsFakeUnary = false;
  switch(N->getOpcode()) {
  case X86ISD::BLENDI:
    // Immediate-controlled shuffles read the immediate from the last operand.
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    break;
  case X86ISD::SHUFP:
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::UNPCKH:
    DecodeUNPCKHMask(VT, Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::UNPCKL:
    DecodeUNPCKLMask(VT, Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::MOVHLPS:
    DecodeMOVHLPSMask(NumElems, Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::MOVLHPS:
    DecodeMOVLHPSMask(NumElems, Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::PALIGNR:
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    break;
  case X86ISD::PSHUFD:
  case X86ISD::VPERMILPI:
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = true;
    break;
  case X86ISD::PSHUFHW:
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = true;
    break;
  case X86ISD::PSHUFLW:
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = true;
    break;
  case X86ISD::PSHUFB: {
    IsUnary = true;
    // The byte-shuffle control is operand 1; look through any bitcasts to
    // find the node that actually defines it.
    SDValue MaskNode = N->getOperand(1);
    while (MaskNode->getOpcode() == ISD::BITCAST)
      MaskNode = MaskNode->getOperand(0);

    if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) {
      // If we have a build-vector, then things are easy.
      // Note: this VT shadows the function parameter and is the type of the
      // mask node, not of the shuffle result.
      EVT VT = MaskNode.getValueType();
      assert(VT.isVector() &&
             "Can't produce a non-vector with a build_vector!");
      if (!VT.isInteger())
        return false;

      int NumBytesPerElement = VT.getVectorElementType().getSizeInBits() / 8;

      SmallVector<uint64_t, 32> RawMask;
      for (int i = 0, e = MaskNode->getNumOperands(); i < e; ++i) {
        SDValue Op = MaskNode->getOperand(i);
        if (Op->getOpcode() == ISD::UNDEF) {
          // An undef operand contributes a single sentinel entry.
          RawMask.push_back((uint64_t)SM_SentinelUndef);
          continue;
        }
        auto *CN = dyn_cast<ConstantSDNode>(Op.getNode());
        if (!CN)
          return false;
        APInt MaskElement = CN->getAPIntValue();

        // We now have to decode the element which could be any integer size and
        // extract each byte of it.
        for (int j = 0; j < NumBytesPerElement; ++j) {
          // Note that this is x86 and so always little endian: the low byte is
          // the first byte of the mask.
          RawMask.push_back(MaskElement.getLoBits(8).getZExtValue());
          MaskElement = MaskElement.lshr(8);
        }
      }
      DecodePSHUFBMask(RawMask, Mask);
      break;
    }

    // Otherwise the mask must be loaded from a constant-pool entry.
    auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode);
    if (!MaskLoad)
      return false;

    // Strip an X86ISD::Wrapper around the address, if present.
    SDValue Ptr = MaskLoad->getBasePtr();
    if (Ptr->getOpcode() == X86ISD::Wrapper)
      Ptr = Ptr->getOperand(0);

    auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr);
    if (!MaskCP || MaskCP->isMachineConstantPoolEntry())
      return false;

    if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) {
      DecodePSHUFBMask(C, Mask);
      break;
    }

    return false;
  }
  case X86ISD::VPERMI:
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodeVPERMMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = true;
    break;
  case X86ISD::MOVSS:
  case X86ISD::MOVSD: {
    // The index 0 always comes from the first element of the second source,
    // this is why MOVSS and MOVSD are used in the first place. The other
    // elements come from the other positions of the first source vector
    Mask.push_back(NumElems);
    for (unsigned i = 1; i != NumElems; ++i) {
      Mask.push_back(i);
    }
    break;
  }
  case X86ISD::VPERM2X128:
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    // An empty mask after decoding is reported as failure.
    if (Mask.empty()) return false;
    break;
  case X86ISD::MOVSLDUP:
    DecodeMOVSLDUPMask(VT, Mask);
    IsUnary = true;
    break;
  case X86ISD::MOVSHDUP:
    DecodeMOVSHDUPMask(VT, Mask);
    IsUnary = true;
    break;
  case X86ISD::MOVDDUP:
    DecodeMOVDDUPMask(VT, Mask);
    IsUnary = true;
    break;
  case X86ISD::MOVLHPD:
  case X86ISD::MOVLPD:
  case X86ISD::MOVLPS:
    // Not yet implemented
    return false;
  default: llvm_unreachable("unknown target shuffle node");
  }

  // If we have a fake unary shuffle, the shuffle mask is spread across two
  // inputs that are actually the same node. Re-map the mask to always point
  // into the first input.
  if (IsFakeUnary)
    for (int &M : Mask)
      if (M >= (int)Mask.size())
        M -= Mask.size();

  return true;
}
5562
/// getShuffleScalarElt - Returns the scalar element that will make up the ith
/// element of the result of the vector shuffle.
///
/// N is the node to look through, Index the element of its result to
/// resolve, and Depth the current recursion depth (the search gives up once
/// Depth reaches 6). Returns an empty SDValue when the element cannot be
/// determined, and an UNDEF node of the element type when the selected mask
/// entry is undef.
static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
                                   unsigned Depth) {
  if (Depth == 6)
    return SDValue();  // Limit search depth.

  SDValue V = SDValue(N, 0);
  EVT VT = V.getValueType();
  unsigned Opcode = V.getOpcode();

  // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
  if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
    int Elt = SV->getMaskElt(Index);

    if (Elt < 0)
      return DAG.getUNDEF(VT.getVectorElementType());

    // Mask values below NumElems select from operand 0, the rest from
    // operand 1; the index within the chosen operand is Elt % NumElems.
    unsigned NumElems = VT.getVectorNumElements();
    SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
                                         : SV->getOperand(1);
    return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
  }

  // Recurse into target specific vector shuffles to find scalars.
  if (isTargetShuffle(Opcode)) {
    MVT ShufVT = V.getSimpleValueType();
    unsigned NumElems = ShufVT.getVectorNumElements();
    SmallVector<int, 16> ShuffleMask;
    bool IsUnary;

    // Give up if the target shuffle's mask cannot be decoded.
    if (!getTargetShuffleMask(N, ShufVT, ShuffleMask, IsUnary))
      return SDValue();

    int Elt = ShuffleMask[Index];
    if (Elt < 0)
      return DAG.getUNDEF(ShufVT.getVectorElementType());

    SDValue NewV = (Elt < (int)NumElems) ? N->getOperand(0)
                                         : N->getOperand(1);
    return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
                               Depth+1);
  }

  // Actual nodes that may contain scalar elements
  if (Opcode == ISD::BITCAST) {
    // Look through the bitcast only when its source is a vector with the
    // same number of elements, so that Index still addresses the same lane.
    V = V.getOperand(0);
    EVT SrcVT = V.getValueType();
    unsigned NumElems = VT.getVectorNumElements();

    if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
      return SDValue();
  }

  // SCALAR_TO_VECTOR only defines element 0; every other lane is undef.
  if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
    return (Index == 0) ? V.getOperand(0)
                        : DAG.getUNDEF(VT.getVectorElementType());

  if (V.getOpcode() == ISD::BUILD_VECTOR)
    return V.getOperand(Index);

  return SDValue();
}
5626
5627 /// getNumOfConsecutiveZeros - Return the number of elements of a vector
5628 /// shuffle operation which come from a consecutively from a zero. The
5629 /// search can start in two different directions, from left or right.
5630 /// We count undefs as zeros until PreferredNum is reached.
5631 static unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp,
5632                                          unsigned NumElems, bool ZerosFromLeft,
5633                                          SelectionDAG &DAG,
5634                                          unsigned PreferredNum = -1U) {
5635   unsigned NumZeros = 0;
5636   for (unsigned i = 0; i != NumElems; ++i) {
5637     unsigned Index = ZerosFromLeft ? i : NumElems - i - 1;
5638     SDValue Elt = getShuffleScalarElt(SVOp, Index, DAG, 0);
5639     if (!Elt.getNode())
5640       break;
5641
5642     if (X86::isZeroNode(Elt))
5643       ++NumZeros;
5644     else if (Elt.getOpcode() == ISD::UNDEF) // Undef as zero up to PreferredNum.
5645       NumZeros = std::min(NumZeros + 1, PreferredNum);
5646     else
5647       break;
5648   }
5649
5650   return NumZeros;
5651 }
5652
5653 /// isShuffleMaskConsecutive - Check if the shuffle mask indicies [MaskI, MaskE)
5654 /// correspond consecutively to elements from one of the vector operands,
5655 /// starting from its index OpIdx. Also tell OpNum which source vector operand.
5656 static
5657 bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp,
5658                               unsigned MaskI, unsigned MaskE, unsigned OpIdx,
5659                               unsigned NumElems, unsigned &OpNum) {
5660   bool SeenV1 = false;
5661   bool SeenV2 = false;
5662
5663   for (unsigned i = MaskI; i != MaskE; ++i, ++OpIdx) {
5664     int Idx = SVOp->getMaskElt(i);
5665     // Ignore undef indicies
5666     if (Idx < 0)
5667       continue;
5668
5669     if (Idx < (int)NumElems)
5670       SeenV1 = true;
5671     else
5672       SeenV2 = true;
5673
5674     // Only accept consecutive elements from the same vector
5675     if ((Idx % NumElems != OpIdx) || (SeenV1 && SeenV2))
5676       return false;
5677   }
5678
5679   OpNum = SeenV1 ? 0 : 1;
5680   return true;
5681 }
5682
5683 /// isVectorShiftRight - Returns true if the shuffle can be implemented as a
5684 /// logical left shift of a vector.
5685 static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
5686                                bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
5687   unsigned NumElems =
5688     SVOp->getSimpleValueType(0).getVectorNumElements();
5689   unsigned NumZeros = getNumOfConsecutiveZeros(
5690       SVOp, NumElems, false /* check zeros from right */, DAG,
5691       SVOp->getMaskElt(0));
5692   unsigned OpSrc;
5693
5694   if (!NumZeros)
5695     return false;
5696
5697   // Considering the elements in the mask that are not consecutive zeros,
5698   // check if they consecutively come from only one of the source vectors.
5699   //
5700   //               V1 = {X, A, B, C}     0
5701   //                         \  \  \    /
5702   //   vector_shuffle V1, V2 <1, 2, 3, X>
5703   //
5704   if (!isShuffleMaskConsecutive(SVOp,
5705             0,                   // Mask Start Index
5706             NumElems-NumZeros,   // Mask End Index(exclusive)
5707             NumZeros,            // Where to start looking in the src vector
5708             NumElems,            // Number of elements in vector
5709             OpSrc))              // Which source operand ?
5710     return false;
5711
5712   isLeft = false;
5713   ShAmt = NumZeros;
5714   ShVal = SVOp->getOperand(OpSrc);
5715   return true;
5716 }
5717
5718 /// isVectorShiftLeft - Returns true if the shuffle can be implemented as a
5719 /// logical left shift of a vector.
5720 static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
5721                               bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
5722   unsigned NumElems =
5723     SVOp->getSimpleValueType(0).getVectorNumElements();
5724   unsigned NumZeros = getNumOfConsecutiveZeros(
5725       SVOp, NumElems, true /* check zeros from left */, DAG,
5726       NumElems - SVOp->getMaskElt(NumElems - 1) - 1);
5727   unsigned OpSrc;
5728
5729   if (!NumZeros)
5730     return false;
5731
5732   // Considering the elements in the mask that are not consecutive zeros,
5733   // check if they consecutively come from only one of the source vectors.
5734   //
5735   //                           0    { A, B, X, X } = V2
5736   //                          / \    /  /
5737   //   vector_shuffle V1, V2 <X, X, 4, 5>
5738   //
5739   if (!isShuffleMaskConsecutive(SVOp,
5740             NumZeros,     // Mask Start Index
5741             NumElems,     // Mask End Index(exclusive)
5742             0,            // Where to start looking in the src vector
5743             NumElems,     // Number of elements in vector
5744             OpSrc))       // Which source operand ?
5745     return false;
5746
5747   isLeft = true;
5748   ShAmt = NumZeros;
5749   ShVal = SVOp->getOperand(OpSrc);
5750   return true;
5751 }
5752
5753 /// isVectorShift - Returns true if the shuffle can be implemented as a
5754 /// logical left or right shift of a vector.
5755 static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
5756                           bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
5757   // Although the logic below support any bitwidth size, there are no
5758   // shift instructions which handle more than 128-bit vectors.
5759   if (!SVOp->getSimpleValueType(0).is128BitVector())
5760     return false;
5761
5762   if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) ||
5763       isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt))
5764     return true;
5765
5766   return false;
5767 }
5768
5769 /// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
5770 ///
5771 static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
5772                                        unsigned NumNonZero, unsigned NumZero,
5773                                        SelectionDAG &DAG,
5774                                        const X86Subtarget* Subtarget,
5775                                        const TargetLowering &TLI) {
5776   if (NumNonZero > 8)
5777     return SDValue();
5778
5779   SDLoc dl(Op);
5780   SDValue V;
5781   bool First = true;
5782   for (unsigned i = 0; i < 16; ++i) {
5783     bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
5784     if (ThisIsNonZero && First) {
5785       if (NumZero)
5786         V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
5787       else
5788         V = DAG.getUNDEF(MVT::v8i16);
5789       First = false;
5790     }
5791
5792     if ((i & 1) != 0) {
5793       SDValue ThisElt, LastElt;
5794       bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
5795       if (LastIsNonZero) {
5796         LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
5797                               MVT::i16, Op.getOperand(i-1));
5798       }
5799       if (ThisIsNonZero) {
5800         ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
5801         ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
5802                               ThisElt, DAG.getConstant(8, MVT::i8));
5803         if (LastIsNonZero)
5804           ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
5805       } else
5806         ThisElt = LastElt;
5807
5808       if (ThisElt.getNode())
5809         V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
5810                         DAG.getIntPtrConstant(i/2));
5811     }
5812   }
5813
5814   return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V);
5815 }
5816
5817 /// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16.
5818 ///
5819 static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
5820                                      unsigned NumNonZero, unsigned NumZero,
5821                                      SelectionDAG &DAG,
5822                                      const X86Subtarget* Subtarget,
5823                                      const TargetLowering &TLI) {
5824   if (NumNonZero > 4)
5825     return SDValue();
5826
5827   SDLoc dl(Op);
5828   SDValue V;
5829   bool First = true;
5830   for (unsigned i = 0; i < 8; ++i) {
5831     bool isNonZero = (NonZeros & (1 << i)) != 0;
5832     if (isNonZero) {
5833       if (First) {
5834         if (NumZero)
5835           V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
5836         else
5837           V = DAG.getUNDEF(MVT::v8i16);
5838         First = false;
5839       }
5840       V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
5841                       MVT::v8i16, V, Op.getOperand(i),
5842                       DAG.getIntPtrConstant(i));
5843     }
5844   }
5845
5846   return V;
5847 }
5848
5849 /// LowerBuildVectorv4x32 - Custom lower build_vector of v4i32 or v4f32.
5850 static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
5851                                      const X86Subtarget *Subtarget,
5852                                      const TargetLowering &TLI) {
5853   // Find all zeroable elements.
5854   bool Zeroable[4];
5855   for (int i=0; i < 4; ++i) {
5856     SDValue Elt = Op->getOperand(i);
5857     Zeroable[i] = (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt));
5858   }
5859   assert(std::count_if(&Zeroable[0], &Zeroable[4],
5860                        [](bool M) { return !M; }) > 1 &&
5861          "We expect at least two non-zero elements!");
5862
5863   // We only know how to deal with build_vector nodes where elements are either
5864   // zeroable or extract_vector_elt with constant index.
5865   SDValue FirstNonZero;
5866   unsigned FirstNonZeroIdx;
5867   for (unsigned i=0; i < 4; ++i) {
5868     if (Zeroable[i])
5869       continue;
5870     SDValue Elt = Op->getOperand(i);
5871     if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
5872         !isa<ConstantSDNode>(Elt.getOperand(1)))
5873       return SDValue();
5874     // Make sure that this node is extracting from a 128-bit vector.
5875     MVT VT = Elt.getOperand(0).getSimpleValueType();
5876     if (!VT.is128BitVector())
5877       return SDValue();
5878     if (!FirstNonZero.getNode()) {
5879       FirstNonZero = Elt;
5880       FirstNonZeroIdx = i;
5881     }
5882   }
5883
5884   assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
5885   SDValue V1 = FirstNonZero.getOperand(0);
5886   MVT VT = V1.getSimpleValueType();
5887
5888   // See if this build_vector can be lowered as a blend with zero.
5889   SDValue Elt;
5890   unsigned EltMaskIdx, EltIdx;
5891   int Mask[4];
5892   for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
5893     if (Zeroable[EltIdx]) {
5894       // The zero vector will be on the right hand side.
5895       Mask[EltIdx] = EltIdx+4;
5896       continue;
5897     }
5898
5899     Elt = Op->getOperand(EltIdx);
5900     // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
5901     EltMaskIdx = cast<ConstantSDNode>(Elt.getOperand(1))->getZExtValue();
5902     if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
5903       break;
5904     Mask[EltIdx] = EltIdx;
5905   }
5906
5907   if (EltIdx == 4) {
5908     // Let the shuffle legalizer deal with blend operations.
5909     SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
5910     if (V1.getSimpleValueType() != VT)
5911       V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), VT, V1);
5912     return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, &Mask[0]);
5913   }
5914
5915   // See if we can lower this build_vector to a INSERTPS.
5916   if (!Subtarget->hasSSE41())
5917     return SDValue();
5918
5919   SDValue V2 = Elt.getOperand(0);
5920   if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
5921     V1 = SDValue();
5922
5923   bool CanFold = true;
5924   for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
5925     if (Zeroable[i])
5926       continue;
5927
5928     SDValue Current = Op->getOperand(i);
5929     SDValue SrcVector = Current->getOperand(0);
5930     if (!V1.getNode())
5931       V1 = SrcVector;
5932     CanFold = SrcVector == V1 &&
5933       cast<ConstantSDNode>(Current.getOperand(1))->getZExtValue() == i;
5934   }
5935
5936   if (!CanFold)
5937     return SDValue();
5938
5939   assert(V1.getNode() && "Expected at least two non-zero elements!");
5940   if (V1.getSimpleValueType() != MVT::v4f32)
5941     V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), MVT::v4f32, V1);
5942   if (V2.getSimpleValueType() != MVT::v4f32)
5943     V2 = DAG.getNode(ISD::BITCAST, SDLoc(V2), MVT::v4f32, V2);
5944
5945   // Ok, we can emit an INSERTPS instruction.
5946   unsigned ZMask = 0;
5947   for (int i = 0; i < 4; ++i)
5948     if (Zeroable[i])
5949       ZMask |= 1 << i;
5950
5951   unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
5952   assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
5953   SDValue Result = DAG.getNode(X86ISD::INSERTPS, SDLoc(Op), MVT::v4f32, V1, V2,
5954                                DAG.getIntPtrConstant(InsertPSMask));
5955   return DAG.getNode(ISD::BITCAST, SDLoc(Op), VT, Result);
5956 }
5957
5958 /// getVShift - Return a vector logical shift node.
5959 ///
5960 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
5961                          unsigned NumBits, SelectionDAG &DAG,
5962                          const TargetLowering &TLI, SDLoc dl) {
5963   assert(VT.is128BitVector() && "Unknown type for VShift");
5964   EVT ShVT = MVT::v2i64;
5965   unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
5966   SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp);
5967   return DAG.getNode(ISD::BITCAST, dl, VT,
5968                      DAG.getNode(Opc, dl, ShVT, SrcOp,
5969                              DAG.getConstant(NumBits,
5970                                   TLI.getScalarShiftAmountTy(SrcOp.getValueType()))));
5971 }
5972
5973 static SDValue
5974 LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) {
5975
5976   // Check if the scalar load can be widened into a vector load. And if
5977   // the address is "base + cst" see if the cst can be "absorbed" into
5978   // the shuffle mask.
5979   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
5980     SDValue Ptr = LD->getBasePtr();
5981     if (!ISD::isNormalLoad(LD) || LD->isVolatile())
5982       return SDValue();
5983     EVT PVT = LD->getValueType(0);
5984     if (PVT != MVT::i32 && PVT != MVT::f32)
5985       return SDValue();
5986
5987     int FI = -1;
5988     int64_t Offset = 0;
5989     if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
5990       FI = FINode->getIndex();
5991       Offset = 0;
5992     } else if (DAG.isBaseWithConstantOffset(Ptr) &&
5993                isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
5994       FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
5995       Offset = Ptr.getConstantOperandVal(1);
5996       Ptr = Ptr.getOperand(0);
5997     } else {
5998       return SDValue();
5999     }
6000
6001     // FIXME: 256-bit vector instructions don't require a strict alignment,
6002     // improve this code to support it better.
6003     unsigned RequiredAlign = VT.getSizeInBits()/8;
6004     SDValue Chain = LD->getChain();
6005     // Make sure the stack object alignment is at least 16 or 32.
6006     MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
6007     if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
6008       if (MFI->isFixedObjectIndex(FI)) {
6009         // Can't change the alignment. FIXME: It's possible to compute
6010         // the exact stack offset and reference FI + adjust offset instead.
6011         // If someone *really* cares about this. That's the way to implement it.
6012         return SDValue();
6013       } else {
6014         MFI->setObjectAlignment(FI, RequiredAlign);
6015       }
6016     }
6017
6018     // (Offset % 16 or 32) must be multiple of 4. Then address is then
6019     // Ptr + (Offset & ~15).
6020     if (Offset < 0)
6021       return SDValue();
6022     if ((Offset % RequiredAlign) & 3)
6023       return SDValue();
6024     int64_t StartOffset = Offset & ~(RequiredAlign-1);
6025     if (StartOffset)
6026       Ptr = DAG.getNode(ISD::ADD, SDLoc(Ptr), Ptr.getValueType(),
6027                         Ptr,DAG.getConstant(StartOffset, Ptr.getValueType()));
6028
6029     int EltNo = (Offset - StartOffset) >> 2;
6030     unsigned NumElems = VT.getVectorNumElements();
6031
6032     EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
6033     SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
6034                              LD->getPointerInfo().getWithOffset(StartOffset),
6035                              false, false, false, 0);
6036
6037     SmallVector<int, 8> Mask;
6038     for (unsigned i = 0; i != NumElems; ++i)
6039       Mask.push_back(EltNo);
6040
6041     return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]);
6042   }
6043
6044   return SDValue();
6045 }
6046
6047 /// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a
6048 /// vector of type 'VT', see if the elements can be replaced by a single large
6049 /// load which has the same value as a build_vector whose operands are 'elts'.
6050 ///
6051 /// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a
6052 ///
6053 /// FIXME: we'd also like to handle the case where the last elements are zero
6054 /// rather than undef via VZEXT_LOAD, but we do not detect that case today.
6055 /// There's even a handy isZeroNode for that purpose.
6056 static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
6057                                         SDLoc &DL, SelectionDAG &DAG,
6058                                         bool isAfterLegalize) {
6059   EVT EltVT = VT.getVectorElementType();
6060   unsigned NumElems = Elts.size();
6061
6062   LoadSDNode *LDBase = nullptr;
6063   unsigned LastLoadedElt = -1U;
6064
6065   // For each element in the initializer, see if we've found a load or an undef.
6066   // If we don't find an initial load element, or later load elements are
6067   // non-consecutive, bail out.
6068   for (unsigned i = 0; i < NumElems; ++i) {
6069     SDValue Elt = Elts[i];
6070
6071     if (!Elt.getNode() ||
6072         (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
6073       return SDValue();
6074     if (!LDBase) {
6075       if (Elt.getNode()->getOpcode() == ISD::UNDEF)
6076         return SDValue();
6077       LDBase = cast<LoadSDNode>(Elt.getNode());
6078       LastLoadedElt = i;
6079       continue;
6080     }
6081     if (Elt.getOpcode() == ISD::UNDEF)
6082       continue;
6083
6084     LoadSDNode *LD = cast<LoadSDNode>(Elt);
6085     if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i))
6086       return SDValue();
6087     LastLoadedElt = i;
6088   }
6089
6090   // If we have found an entire vector of loads and undefs, then return a large
6091   // load of the entire vector width starting at the base pointer.  If we found
6092   // consecutive loads for the low half, generate a vzext_load node.
6093   if (LastLoadedElt == NumElems - 1) {
6094
6095     if (isAfterLegalize &&
6096         !DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, VT))
6097       return SDValue();
6098
6099     SDValue NewLd = SDValue();
6100
6101     NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
6102                         LDBase->getPointerInfo(), LDBase->isVolatile(),
6103                         LDBase->isNonTemporal(), LDBase->isInvariant(),
6104                         LDBase->getAlignment());
6105
6106     if (LDBase->hasAnyUseOfValue(1)) {
6107       SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
6108                                      SDValue(LDBase, 1),
6109                                      SDValue(NewLd.getNode(), 1));
6110       DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
6111       DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
6112                              SDValue(NewLd.getNode(), 1));
6113     }
6114
6115     return NewLd;
6116   }
6117
6118   //TODO: The code below fires only for for loading the low v2i32 / v2f32
6119   //of a v4i32 / v4f32. It's probably worth generalizing.
6120   if (NumElems == 4 && LastLoadedElt == 1 && (EltVT.getSizeInBits() == 32) &&
6121       DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
6122     SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
6123     SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
6124     SDValue ResNode =
6125         DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, MVT::i64,
6126                                 LDBase->getPointerInfo(),
6127                                 LDBase->getAlignment(),
6128                                 false/*isVolatile*/, true/*ReadMem*/,
6129                                 false/*WriteMem*/);
6130
6131     // Make sure the newly-created LOAD is in the same position as LDBase in
6132     // terms of dependency. We create a TokenFactor for LDBase and ResNode, and
6133     // update uses of LDBase's output chain to use the TokenFactor.
6134     if (LDBase->hasAnyUseOfValue(1)) {
6135       SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
6136                              SDValue(LDBase, 1), SDValue(ResNode.getNode(), 1));
6137       DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
6138       DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
6139                              SDValue(ResNode.getNode(), 1));
6140     }
6141
6142     return DAG.getNode(ISD::BITCAST, DL, VT, ResNode);
6143   }
6144   return SDValue();
6145 }
6146
6147 /// LowerVectorBroadcast - Attempt to use the vbroadcast instruction
6148 /// to generate a splat value for the following cases:
6149 /// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant.
6150 /// 2. A splat shuffle which uses a scalar_to_vector node which comes from
6151 /// a scalar load, or a constant.
6152 /// The VBROADCAST node is returned when a pattern is found,
6153 /// or SDValue() otherwise.
6154 static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
6155                                     SelectionDAG &DAG) {
6156   // VBROADCAST requires AVX.
6157   // TODO: Splats could be generated for non-AVX CPUs using SSE
6158   // instructions, but there's less potential gain for only 128-bit vectors.
6159   if (!Subtarget->hasAVX())
6160     return SDValue();
6161
6162   MVT VT = Op.getSimpleValueType();
6163   SDLoc dl(Op);
6164
6165   assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
6166          "Unsupported vector type for broadcast.");
6167
6168   SDValue Ld;
6169   bool ConstSplatVal;
6170
6171   switch (Op.getOpcode()) {
6172     default:
6173       // Unknown pattern found.
6174       return SDValue();
6175
6176     case ISD::BUILD_VECTOR: {
6177       auto *BVOp = cast<BuildVectorSDNode>(Op.getNode());
6178       BitVector UndefElements;
6179       SDValue Splat = BVOp->getSplatValue(&UndefElements);
6180
6181       // We need a splat of a single value to use broadcast, and it doesn't
6182       // make any sense if the value is only in one element of the vector.
6183       if (!Splat || (VT.getVectorNumElements() - UndefElements.count()) <= 1)
6184         return SDValue();
6185
6186       Ld = Splat;
6187       ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
6188                        Ld.getOpcode() == ISD::ConstantFP);
6189
6190       // Make sure that all of the users of a non-constant load are from the
6191       // BUILD_VECTOR node.
6192       if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
6193         return SDValue();
6194       break;
6195     }
6196
6197     case ISD::VECTOR_SHUFFLE: {
6198       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
6199
6200       // Shuffles must have a splat mask where the first element is
6201       // broadcasted.
6202       if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0)
6203         return SDValue();
6204
6205       SDValue Sc = Op.getOperand(0);
6206       if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR &&
6207           Sc.getOpcode() != ISD::BUILD_VECTOR) {
6208
6209         if (!Subtarget->hasInt256())
6210           return SDValue();
6211
6212         // Use the register form of the broadcast instruction available on AVX2.
6213         if (VT.getSizeInBits() >= 256)
6214           Sc = Extract128BitVector(Sc, 0, DAG, dl);
6215         return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc);
6216       }
6217
6218       Ld = Sc.getOperand(0);
6219       ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
6220                        Ld.getOpcode() == ISD::ConstantFP);
6221
6222       // The scalar_to_vector node and the suspected
6223       // load node must have exactly one user.
6224       // Constants may have multiple users.
6225
6226       // AVX-512 has register version of the broadcast
6227       bool hasRegVer = Subtarget->hasAVX512() && VT.is512BitVector() &&
6228         Ld.getValueType().getSizeInBits() >= 32;
6229       if (!ConstSplatVal && ((!Sc.hasOneUse() || !Ld.hasOneUse()) &&
6230           !hasRegVer))
6231         return SDValue();
6232       break;
6233     }
6234   }
6235
6236   unsigned ScalarSize = Ld.getValueType().getSizeInBits();
6237   bool IsGE256 = (VT.getSizeInBits() >= 256);
6238
6239   // When optimizing for size, generate up to 5 extra bytes for a broadcast
6240   // instruction to save 8 or more bytes of constant pool data.
6241   // TODO: If multiple splats are generated to load the same constant,
6242   // it may be detrimental to overall size. There needs to be a way to detect
6243   // that condition to know if this is truly a size win.
6244   const Function *F = DAG.getMachineFunction().getFunction();
6245   bool OptForSize = F->getAttributes().
6246     hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
6247
6248   // Handle broadcasting a single constant scalar from the constant pool
6249   // into a vector.
6250   // On Sandybridge (no AVX2), it is still better to load a constant vector
6251   // from the constant pool and not to broadcast it from a scalar.
6252   // But override that restriction when optimizing for size.
6253   // TODO: Check if splatting is recommended for other AVX-capable CPUs.
6254   if (ConstSplatVal && (Subtarget->hasAVX2() || OptForSize)) {
6255     EVT CVT = Ld.getValueType();
6256     assert(!CVT.isVector() && "Must not broadcast a vector type");
6257
6258     // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
6259     // For size optimization, also splat v2f64 and v2i64, and for size opt
6260     // with AVX2, also splat i8 and i16.
6261     // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
6262     if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
6263         (OptForSize && (ScalarSize == 64 || Subtarget->hasAVX2()))) {
6264       const Constant *C = nullptr;
6265       if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
6266         C = CI->getConstantIntValue();
6267       else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
6268         C = CF->getConstantFPValue();
6269
6270       assert(C && "Invalid constant type");
6271
6272       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6273       SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
6274       unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
6275       Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP,
6276                        MachinePointerInfo::getConstantPool(),
6277                        false, false, false, Alignment);
6278
6279       return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6280     }
6281   }
6282
6283   bool IsLoad = ISD::isNormalLoad(Ld.getNode());
6284
6285   // Handle AVX2 in-register broadcasts.
6286   if (!IsLoad && Subtarget->hasInt256() &&
6287       (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
6288     return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6289
6290   // The scalar source must be a normal load.
6291   if (!IsLoad)
6292     return SDValue();
6293
6294   if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
6295       (Subtarget->hasVLX() && ScalarSize == 64))
6296     return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6297
6298   // The integer check is needed for the 64-bit into 128-bit so it doesn't match
6299   // double since there is no vbroadcastsd xmm
6300   if (Subtarget->hasInt256() && Ld.getValueType().isInteger()) {
6301     if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
6302       return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6303   }
6304
6305   // Unsupported broadcast.
6306   return SDValue();
6307 }
6308
6309 /// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real
6310 /// underlying vector and index.
6311 ///
6312 /// Modifies \p ExtractedFromVec to the real vector and returns the real
6313 /// index.
6314 static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
6315                                          SDValue ExtIdx) {
6316   int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
6317   if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
6318     return Idx;
6319
6320   // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
6321   // lowered this:
6322   //   (extract_vector_elt (v8f32 %vreg1), Constant<6>)
6323   // to:
6324   //   (extract_vector_elt (vector_shuffle<2,u,u,u>
6325   //                           (extract_subvector (v8f32 %vreg0), Constant<4>),
6326   //                           undef)
6327   //                       Constant<0>)
6328   // In this case the vector is the extract_subvector expression and the index
6329   // is 2, as specified by the shuffle.
6330   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
6331   SDValue ShuffleVec = SVOp->getOperand(0);
6332   MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
6333   assert(ShuffleVecVT.getVectorElementType() ==
6334          ExtractedFromVec.getSimpleValueType().getVectorElementType());
6335
6336   int ShuffleIdx = SVOp->getMaskElt(Idx);
6337   if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
6338     ExtractedFromVec = ShuffleVec;
6339     return ShuffleIdx;
6340   }
6341   return Idx;
6342 }
6343
6344 static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
6345   MVT VT = Op.getSimpleValueType();
6346
6347   // Skip if insert_vec_elt is not supported.
6348   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6349   if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
6350     return SDValue();
6351
6352   SDLoc DL(Op);
6353   unsigned NumElems = Op.getNumOperands();
6354
6355   SDValue VecIn1;
6356   SDValue VecIn2;
6357   SmallVector<unsigned, 4> InsertIndices;
6358   SmallVector<int, 8> Mask(NumElems, -1);
6359
6360   for (unsigned i = 0; i != NumElems; ++i) {
6361     unsigned Opc = Op.getOperand(i).getOpcode();
6362
6363     if (Opc == ISD::UNDEF)
6364       continue;
6365
6366     if (Opc != ISD::EXTRACT_VECTOR_ELT) {
6367       // Quit if more than 1 elements need inserting.
6368       if (InsertIndices.size() > 1)
6369         return SDValue();
6370
6371       InsertIndices.push_back(i);
6372       continue;
6373     }
6374
6375     SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
6376     SDValue ExtIdx = Op.getOperand(i).getOperand(1);
6377     // Quit if non-constant index.
6378     if (!isa<ConstantSDNode>(ExtIdx))
6379       return SDValue();
6380     int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
6381
6382     // Quit if extracted from vector of different type.
6383     if (ExtractedFromVec.getValueType() != VT)
6384       return SDValue();
6385
6386     if (!VecIn1.getNode())
6387       VecIn1 = ExtractedFromVec;
6388     else if (VecIn1 != ExtractedFromVec) {
6389       if (!VecIn2.getNode())
6390         VecIn2 = ExtractedFromVec;
6391       else if (VecIn2 != ExtractedFromVec)
6392         // Quit if more than 2 vectors to shuffle
6393         return SDValue();
6394     }
6395
6396     if (ExtractedFromVec == VecIn1)
6397       Mask[i] = Idx;
6398     else if (ExtractedFromVec == VecIn2)
6399       Mask[i] = Idx + NumElems;
6400   }
6401
6402   if (!VecIn1.getNode())
6403     return SDValue();
6404
6405   VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
6406   SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, &Mask[0]);
6407   for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) {
6408     unsigned Idx = InsertIndices[i];
6409     NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
6410                      DAG.getIntPtrConstant(Idx));
6411   }
6412
6413   return NV;
6414 }
6415
6416 // Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
6417 SDValue
6418 X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
6419
6420   MVT VT = Op.getSimpleValueType();
6421   assert((VT.getVectorElementType() == MVT::i1) && (VT.getSizeInBits() <= 16) &&
6422          "Unexpected type in LowerBUILD_VECTORvXi1!");
6423
6424   SDLoc dl(Op);
6425   if (ISD::isBuildVectorAllZeros(Op.getNode())) {
6426     SDValue Cst = DAG.getTargetConstant(0, MVT::i1);
6427     SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
6428     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
6429   }
6430
6431   if (ISD::isBuildVectorAllOnes(Op.getNode())) {
6432     SDValue Cst = DAG.getTargetConstant(1, MVT::i1);
6433     SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
6434     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
6435   }
6436
6437   bool AllContants = true;
6438   uint64_t Immediate = 0;
6439   int NonConstIdx = -1;
6440   bool IsSplat = true;
6441   unsigned NumNonConsts = 0;
6442   unsigned NumConsts = 0;
6443   for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
6444     SDValue In = Op.getOperand(idx);
6445     if (In.getOpcode() == ISD::UNDEF)
6446       continue;
6447     if (!isa<ConstantSDNode>(In)) {
6448       AllContants = false;
6449       NonConstIdx = idx;
6450       NumNonConsts++;
6451     } else {
6452       NumConsts++;
6453       if (cast<ConstantSDNode>(In)->getZExtValue())
6454       Immediate |= (1ULL << idx);
6455     }
6456     if (In != Op.getOperand(0))
6457       IsSplat = false;
6458   }
6459
6460   if (AllContants) {
6461     SDValue FullMask = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1,
6462       DAG.getConstant(Immediate, MVT::i16));
6463     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, FullMask,
6464                        DAG.getIntPtrConstant(0));
6465   }
6466
6467   if (NumNonConsts == 1 && NonConstIdx != 0) {
6468     SDValue DstVec;
6469     if (NumConsts) {
6470       SDValue VecAsImm = DAG.getConstant(Immediate,
6471                                          MVT::getIntegerVT(VT.getSizeInBits()));
6472       DstVec = DAG.getNode(ISD::BITCAST, dl, VT, VecAsImm);
6473     }
6474     else
6475       DstVec = DAG.getUNDEF(VT);
6476     return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
6477                        Op.getOperand(NonConstIdx),
6478                        DAG.getIntPtrConstant(NonConstIdx));
6479   }
6480   if (!IsSplat && (NonConstIdx != 0))
6481     llvm_unreachable("Unsupported BUILD_VECTOR operation");
6482   MVT SelectVT = (VT == MVT::v16i1)? MVT::i16 : MVT::i8;
6483   SDValue Select;
6484   if (IsSplat)
6485     Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0),
6486                           DAG.getConstant(-1, SelectVT),
6487                           DAG.getConstant(0, SelectVT));
6488   else
6489     Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0),
6490                          DAG.getConstant((Immediate | 1), SelectVT),
6491                          DAG.getConstant(Immediate, SelectVT));
6492   return DAG.getNode(ISD::BITCAST, dl, VT, Select);
6493 }
6494
6495 /// \brief Return true if \p N implements a horizontal binop and return the
6496 /// operands for the horizontal binop into V0 and V1.
6497 ///
6498 /// This is a helper function of PerformBUILD_VECTORCombine.
6499 /// This function checks that the build_vector \p N in input implements a
6500 /// horizontal operation. Parameter \p Opcode defines the kind of horizontal
6501 /// operation to match.
6502 /// For example, if \p Opcode is equal to ISD::ADD, then this function
6503 /// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
6504 /// is equal to ISD::SUB, then this function checks if this is a horizontal
6505 /// arithmetic sub.
6506 ///
6507 /// This function only analyzes elements of \p N whose indices are
6508 /// in range [BaseIdx, LastIdx).
6509 static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
6510                               SelectionDAG &DAG,
6511                               unsigned BaseIdx, unsigned LastIdx,
6512                               SDValue &V0, SDValue &V1) {
6513   EVT VT = N->getValueType(0);
6514
6515   assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
6516   assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
6517          "Invalid Vector in input!");
6518
6519   bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
6520   bool CanFold = true;
6521   unsigned ExpectedVExtractIdx = BaseIdx;
6522   unsigned NumElts = LastIdx - BaseIdx;
6523   V0 = DAG.getUNDEF(VT);
6524   V1 = DAG.getUNDEF(VT);
6525
6526   // Check if N implements a horizontal binop.
6527   for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
6528     SDValue Op = N->getOperand(i + BaseIdx);
6529
6530     // Skip UNDEFs.
6531     if (Op->getOpcode() == ISD::UNDEF) {
6532       // Update the expected vector extract index.
6533       if (i * 2 == NumElts)
6534         ExpectedVExtractIdx = BaseIdx;
6535       ExpectedVExtractIdx += 2;
6536       continue;
6537     }
6538
6539     CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
6540
6541     if (!CanFold)
6542       break;
6543
6544     SDValue Op0 = Op.getOperand(0);
6545     SDValue Op1 = Op.getOperand(1);
6546
6547     // Try to match the following pattern:
6548     // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
6549     CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
6550         Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
6551         Op0.getOperand(0) == Op1.getOperand(0) &&
6552         isa<ConstantSDNode>(Op0.getOperand(1)) &&
6553         isa<ConstantSDNode>(Op1.getOperand(1)));
6554     if (!CanFold)
6555       break;
6556
6557     unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
6558     unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
6559
6560     if (i * 2 < NumElts) {
6561       if (V0.getOpcode() == ISD::UNDEF)
6562         V0 = Op0.getOperand(0);
6563     } else {
6564       if (V1.getOpcode() == ISD::UNDEF)
6565         V1 = Op0.getOperand(0);
6566       if (i * 2 == NumElts)
6567         ExpectedVExtractIdx = BaseIdx;
6568     }
6569
6570     SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
6571     if (I0 == ExpectedVExtractIdx)
6572       CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
6573     else if (IsCommutable && I1 == ExpectedVExtractIdx) {
6574       // Try to match the following dag sequence:
6575       // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
6576       CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
6577     } else
6578       CanFold = false;
6579
6580     ExpectedVExtractIdx += 2;
6581   }
6582
6583   return CanFold;
6584 }
6585
6586 /// \brief Emit a sequence of two 128-bit horizontal add/sub followed by
6587 /// a concat_vector.
6588 ///
6589 /// This is a helper function of PerformBUILD_VECTORCombine.
6590 /// This function expects two 256-bit vectors called V0 and V1.
6591 /// At first, each vector is split into two separate 128-bit vectors.
6592 /// Then, the resulting 128-bit vectors are used to implement two
6593 /// horizontal binary operations.
6594 ///
6595 /// The kind of horizontal binary operation is defined by \p X86Opcode.
6596 ///
6597 /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
6598 /// the two new horizontal binop.
6599 /// When Mode is set, the first horizontal binop dag node would take as input
6600 /// the lower 128-bit of V0 and the upper 128-bit of V0. The second
6601 /// horizontal binop dag node would take as input the lower 128-bit of V1
6602 /// and the upper 128-bit of V1.
6603 ///   Example:
6604 ///     HADD V0_LO, V0_HI
6605 ///     HADD V1_LO, V1_HI
6606 ///
6607 /// Otherwise, the first horizontal binop dag node takes as input the lower
6608 /// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
6609 /// dag node takes the the upper 128-bit of V0 and the upper 128-bit of V1.
6610 ///   Example:
6611 ///     HADD V0_LO, V1_LO
6612 ///     HADD V0_HI, V1_HI
6613 ///
6614 /// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
6615 /// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
6616 /// the upper 128-bits of the result.
6617 static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
6618                                      SDLoc DL, SelectionDAG &DAG,
6619                                      unsigned X86Opcode, bool Mode,
6620                                      bool isUndefLO, bool isUndefHI) {
6621   EVT VT = V0.getValueType();
6622   assert(VT.is256BitVector() && VT == V1.getValueType() &&
6623          "Invalid nodes in input!");
6624
6625   unsigned NumElts = VT.getVectorNumElements();
6626   SDValue V0_LO = Extract128BitVector(V0, 0, DAG, DL);
6627   SDValue V0_HI = Extract128BitVector(V0, NumElts/2, DAG, DL);
6628   SDValue V1_LO = Extract128BitVector(V1, 0, DAG, DL);
6629   SDValue V1_HI = Extract128BitVector(V1, NumElts/2, DAG, DL);
6630   EVT NewVT = V0_LO.getValueType();
6631
6632   SDValue LO = DAG.getUNDEF(NewVT);
6633   SDValue HI = DAG.getUNDEF(NewVT);
6634
6635   if (Mode) {
6636     // Don't emit a horizontal binop if the result is expected to be UNDEF.
6637     if (!isUndefLO && V0->getOpcode() != ISD::UNDEF)
6638       LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
6639     if (!isUndefHI && V1->getOpcode() != ISD::UNDEF)
6640       HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
6641   } else {
6642     // Don't emit a horizontal binop if the result is expected to be UNDEF.
6643     if (!isUndefLO && (V0_LO->getOpcode() != ISD::UNDEF ||
6644                        V1_LO->getOpcode() != ISD::UNDEF))
6645       LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
6646
6647     if (!isUndefHI && (V0_HI->getOpcode() != ISD::UNDEF ||
6648                        V1_HI->getOpcode() != ISD::UNDEF))
6649       HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
6650   }
6651
6652   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
6653 }
6654
6655 /// \brief Try to fold a build_vector that performs an 'addsub' into the
6656 /// sequence of 'vadd + vsub + blendi'.
6657 static SDValue matchAddSub(const BuildVectorSDNode *BV, SelectionDAG &DAG,
6658                            const X86Subtarget *Subtarget) {
6659   SDLoc DL(BV);
6660   EVT VT = BV->getValueType(0);
6661   unsigned NumElts = VT.getVectorNumElements();
6662   SDValue InVec0 = DAG.getUNDEF(VT);
6663   SDValue InVec1 = DAG.getUNDEF(VT);
6664
6665   assert((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
6666           VT == MVT::v2f64) && "build_vector with an invalid type found!");
6667
6668   // Odd-numbered elements in the input build vector are obtained from
6669   // adding two integer/float elements.
6670   // Even-numbered elements in the input build vector are obtained from
6671   // subtracting two integer/float elements.
6672   unsigned ExpectedOpcode = ISD::FSUB;
6673   unsigned NextExpectedOpcode = ISD::FADD;
6674   bool AddFound = false;
6675   bool SubFound = false;
6676
6677   for (unsigned i = 0, e = NumElts; i != e; i++) {
6678     SDValue Op = BV->getOperand(i);
6679
6680     // Skip 'undef' values.
6681     unsigned Opcode = Op.getOpcode();
6682     if (Opcode == ISD::UNDEF) {
6683       std::swap(ExpectedOpcode, NextExpectedOpcode);
6684       continue;
6685     }
6686
6687     // Early exit if we found an unexpected opcode.
6688     if (Opcode != ExpectedOpcode)
6689       return SDValue();
6690
6691     SDValue Op0 = Op.getOperand(0);
6692     SDValue Op1 = Op.getOperand(1);
6693
6694     // Try to match the following pattern:
6695     // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
6696     // Early exit if we cannot match that sequence.
6697     if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6698         Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6699         !isa<ConstantSDNode>(Op0.getOperand(1)) ||
6700         !isa<ConstantSDNode>(Op1.getOperand(1)) ||
6701         Op0.getOperand(1) != Op1.getOperand(1))
6702       return SDValue();
6703
6704     unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
6705     if (I0 != i)
6706       return SDValue();
6707
6708     // We found a valid add/sub node. Update the information accordingly.
6709     if (i & 1)
6710       AddFound = true;
6711     else
6712       SubFound = true;
6713
6714     // Update InVec0 and InVec1.
6715     if (InVec0.getOpcode() == ISD::UNDEF)
6716       InVec0 = Op0.getOperand(0);
6717     if (InVec1.getOpcode() == ISD::UNDEF)
6718       InVec1 = Op1.getOperand(0);
6719
6720     // Make sure that operands in input to each add/sub node always
6721     // come from a same pair of vectors.
6722     if (InVec0 != Op0.getOperand(0)) {
6723       if (ExpectedOpcode == ISD::FSUB)
6724         return SDValue();
6725
6726       // FADD is commutable. Try to commute the operands
6727       // and then test again.
6728       std::swap(Op0, Op1);
6729       if (InVec0 != Op0.getOperand(0))
6730         return SDValue();
6731     }
6732
6733     if (InVec1 != Op1.getOperand(0))
6734       return SDValue();
6735
6736     // Update the pair of expected opcodes.
6737     std::swap(ExpectedOpcode, NextExpectedOpcode);
6738   }
6739
6740   // Don't try to fold this build_vector into an ADDSUB if the inputs are undef.
6741   if (AddFound && SubFound && InVec0.getOpcode() != ISD::UNDEF &&
6742       InVec1.getOpcode() != ISD::UNDEF)
6743     return DAG.getNode(X86ISD::ADDSUB, DL, VT, InVec0, InVec1);
6744
6745   return SDValue();
6746 }
6747
6748 static SDValue PerformBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG,
6749                                           const X86Subtarget *Subtarget) {
6750   SDLoc DL(N);
6751   EVT VT = N->getValueType(0);
6752   unsigned NumElts = VT.getVectorNumElements();
6753   BuildVectorSDNode *BV = cast<BuildVectorSDNode>(N);
6754   SDValue InVec0, InVec1;
6755
6756   // Try to match an ADDSUB.
6757   if ((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
6758       (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
6759     SDValue Value = matchAddSub(BV, DAG, Subtarget);
6760     if (Value.getNode())
6761       return Value;
6762   }
6763
6764   // Try to match horizontal ADD/SUB.
6765   unsigned NumUndefsLO = 0;
6766   unsigned NumUndefsHI = 0;
6767   unsigned Half = NumElts/2;
6768
6769   // Count the number of UNDEF operands in the build_vector in input.
6770   for (unsigned i = 0, e = Half; i != e; ++i)
6771     if (BV->getOperand(i)->getOpcode() == ISD::UNDEF)
6772       NumUndefsLO++;
6773
6774   for (unsigned i = Half, e = NumElts; i != e; ++i)
6775     if (BV->getOperand(i)->getOpcode() == ISD::UNDEF)
6776       NumUndefsHI++;
6777
6778   // Early exit if this is either a build_vector of all UNDEFs or all the
6779   // operands but one are UNDEF.
6780   if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
6781     return SDValue();
6782
6783   if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget->hasSSE3()) {
6784     // Try to match an SSE3 float HADD/HSUB.
6785     if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
6786       return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
6787
6788     if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
6789       return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
6790   } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget->hasSSSE3()) {
6791     // Try to match an SSSE3 integer HADD/HSUB.
6792     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
6793       return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);
6794
6795     if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
6796       return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
6797   }
6798
6799   if (!Subtarget->hasAVX())
6800     return SDValue();
6801
6802   if ((VT == MVT::v8f32 || VT == MVT::v4f64)) {
6803     // Try to match an AVX horizontal add/sub of packed single/double
6804     // precision floating point values from 256-bit vectors.
6805     SDValue InVec2, InVec3;
6806     if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
6807         isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
6808         ((InVec0.getOpcode() == ISD::UNDEF ||
6809           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
6810         ((InVec1.getOpcode() == ISD::UNDEF ||
6811           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
6812       return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
6813
6814     if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
6815         isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
6816         ((InVec0.getOpcode() == ISD::UNDEF ||
6817           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
6818         ((InVec1.getOpcode() == ISD::UNDEF ||
6819           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
6820       return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
6821   } else if (VT == MVT::v8i32 || VT == MVT::v16i16) {
6822     // Try to match an AVX2 horizontal add/sub of signed integers.
6823     SDValue InVec2, InVec3;
6824     unsigned X86Opcode;
6825     bool CanFold = true;
6826
6827     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
6828         isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
6829         ((InVec0.getOpcode() == ISD::UNDEF ||
6830           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
6831         ((InVec1.getOpcode() == ISD::UNDEF ||
6832           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
6833       X86Opcode = X86ISD::HADD;
6834     else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
6835         isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) &&
6836         ((InVec0.getOpcode() == ISD::UNDEF ||
6837           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
6838         ((InVec1.getOpcode() == ISD::UNDEF ||
6839           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
6840       X86Opcode = X86ISD::HSUB;
6841     else
6842       CanFold = false;
6843
6844     if (CanFold) {
6845       // Fold this build_vector into a single horizontal add/sub.
6846       // Do this only if the target has AVX2.
6847       if (Subtarget->hasAVX2())
6848         return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);
6849
6850       // Do not try to expand this build_vector into a pair of horizontal
6851       // add/sub if we can emit a pair of scalar add/sub.
6852       if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
6853         return SDValue();
6854
6855       // Convert this build_vector into a pair of horizontal binop followed by
6856       // a concat vector.
6857       bool isUndefLO = NumUndefsLO == Half;
6858       bool isUndefHI = NumUndefsHI == Half;
6859       return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
6860                                    isUndefLO, isUndefHI);
6861     }
6862   }
6863
6864   if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
6865        VT == MVT::v16i16) && Subtarget->hasAVX()) {
6866     unsigned X86Opcode;
6867     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
6868       X86Opcode = X86ISD::HADD;
6869     else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
6870       X86Opcode = X86ISD::HSUB;
6871     else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
6872       X86Opcode = X86ISD::FHADD;
6873     else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
6874       X86Opcode = X86ISD::FHSUB;
6875     else
6876       return SDValue();
6877
6878     // Don't try to expand this build_vector into a pair of horizontal add/sub
6879     // if we can simply emit a pair of scalar add/sub.
6880     if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
6881       return SDValue();
6882
6883     // Convert this build_vector into two horizontal add/sub followed by
6884     // a concat vector.
6885     bool isUndefLO = NumUndefsLO == Half;
6886     bool isUndefHI = NumUndefsHI == Half;
6887     return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
6888                                  isUndefLO, isUndefHI);
6889   }
6890
6891   return SDValue();
6892 }
6893
6894 SDValue
6895 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
6896   SDLoc dl(Op);
6897
6898   MVT VT = Op.getSimpleValueType();
6899   MVT ExtVT = VT.getVectorElementType();
6900   unsigned NumElems = Op.getNumOperands();
6901
6902   // Generate vectors for predicate vectors.
6903   if (VT.getScalarType() == MVT::i1 && Subtarget->hasAVX512())
6904     return LowerBUILD_VECTORvXi1(Op, DAG);
6905
6906   // Vectors containing all zeros can be matched by pxor and xorps later
6907   if (ISD::isBuildVectorAllZeros(Op.getNode())) {
6908     // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
6909     // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
6910     if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
6911       return Op;
6912
6913     return getZeroVector(VT, Subtarget, DAG, dl);
6914   }
6915
6916   // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
6917   // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
6918   // vpcmpeqd on 256-bit vectors.
6919   if (Subtarget->hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
6920     if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasInt256()))
6921       return Op;
6922
6923     if (!VT.is512BitVector())
6924       return getOnesVector(VT, Subtarget->hasInt256(), DAG, dl);
6925   }
6926
6927   SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG);
6928   if (Broadcast.getNode())
6929     return Broadcast;
6930
6931   unsigned EVTBits = ExtVT.getSizeInBits();
6932
6933   unsigned NumZero  = 0;
6934   unsigned NumNonZero = 0;
6935   unsigned NonZeros = 0;
6936   bool IsAllConstants = true;
6937   SmallSet<SDValue, 8> Values;
6938   for (unsigned i = 0; i < NumElems; ++i) {
6939     SDValue Elt = Op.getOperand(i);
6940     if (Elt.getOpcode() == ISD::UNDEF)
6941       continue;
6942     Values.insert(Elt);
6943     if (Elt.getOpcode() != ISD::Constant &&
6944         Elt.getOpcode() != ISD::ConstantFP)
6945       IsAllConstants = false;
6946     if (X86::isZeroNode(Elt))
6947       NumZero++;
6948     else {
6949       NonZeros |= (1 << i);
6950       NumNonZero++;
6951     }
6952   }
6953
6954   // All undef vector. Return an UNDEF.  All zero vectors were handled above.
6955   if (NumNonZero == 0)
6956     return DAG.getUNDEF(VT);
6957
6958   // Special case for single non-zero, non-undef, element.
6959   if (NumNonZero == 1) {
6960     unsigned Idx = countTrailingZeros(NonZeros);
6961     SDValue Item = Op.getOperand(Idx);
6962
6963     // If this is an insertion of an i64 value on x86-32, and if the top bits of
6964     // the value are obviously zero, truncate the value to i32 and do the
6965     // insertion that way.  Only do this if the value is non-constant or if the
6966     // value is a constant being inserted into element 0.  It is cheaper to do
6967     // a constant pool load than it is to do a movd + shuffle.
6968     if (ExtVT == MVT::i64 && !Subtarget->is64Bit() &&
6969         (!IsAllConstants || Idx == 0)) {
6970       if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
6971         // Handle SSE only.
6972         assert(VT == MVT::v2i64 && "Expected an SSE value type!");
6973         EVT VecVT = MVT::v4i32;
6974         unsigned VecElts = 4;
6975
6976         // Truncate the value (which may itself be a constant) to i32, and
6977         // convert it to a vector with movd (S2V+shuffle to zero extend).
6978         Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
6979         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
6980
6981         // If using the new shuffle lowering, just directly insert this.
6982         if (ExperimentalVectorShuffleLowering)
6983           return DAG.getNode(
6984               ISD::BITCAST, dl, VT,
6985               getShuffleVectorZeroOrUndef(Item, Idx * 2, true, Subtarget, DAG));
6986
6987         Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
6988
6989         // Now we have our 32-bit value zero extended in the low element of
6990         // a vector.  If Idx != 0, swizzle it into place.
6991         if (Idx != 0) {
6992           SmallVector<int, 4> Mask;
6993           Mask.push_back(Idx);
6994           for (unsigned i = 1; i != VecElts; ++i)
6995             Mask.push_back(i);
6996           Item = DAG.getVectorShuffle(VecVT, dl, Item, DAG.getUNDEF(VecVT),
6997                                       &Mask[0]);
6998         }
6999         return DAG.getNode(ISD::BITCAST, dl, VT, Item);
7000       }
7001     }
7002
7003     // If we have a constant or non-constant insertion into the low element of
7004     // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
7005     // the rest of the elements.  This will be matched as movd/movq/movss/movsd
7006     // depending on what the source datatype is.
7007     if (Idx == 0) {
7008       if (NumZero == 0)
7009         return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
7010
7011       if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
7012           (ExtVT == MVT::i64 && Subtarget->is64Bit())) {
7013         if (VT.is256BitVector() || VT.is512BitVector()) {
7014           SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl);
7015           return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec,
7016                              Item, DAG.getIntPtrConstant(0));
7017         }
7018         assert(VT.is128BitVector() && "Expected an SSE value type!");
7019         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
7020         // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
7021         return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
7022       }
7023
7024       if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
7025         Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
7026         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
7027         if (VT.is256BitVector()) {
7028           SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl);
7029           Item = Insert128BitVector(ZeroVec, Item, 0, DAG, dl);
7030         } else {
7031           assert(VT.is128BitVector() && "Expected an SSE value type!");
7032           Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
7033         }
7034         return DAG.getNode(ISD::BITCAST, dl, VT, Item);
7035       }
7036     }
7037
7038     // Is it a vector logical left shift?
7039     if (NumElems == 2 && Idx == 1 &&
7040         X86::isZeroNode(Op.getOperand(0)) &&
7041         !X86::isZeroNode(Op.getOperand(1))) {
7042       unsigned NumBits = VT.getSizeInBits();
7043       return getVShift(true, VT,
7044                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
7045                                    VT, Op.getOperand(1)),
7046                        NumBits/2, DAG, *this, dl);
7047     }
7048
7049     if (IsAllConstants) // Otherwise, it's better to do a constpool load.
7050       return SDValue();
7051
7052     // Otherwise, if this is a vector with i32 or f32 elements, and the element
7053     // is a non-constant being inserted into an element other than the low one,
7054     // we can't use a constant pool load.  Instead, use SCALAR_TO_VECTOR (aka
7055     // movd/movss) to move this into the low element, then shuffle it into
7056     // place.
7057     if (EVTBits == 32) {
7058       Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
7059
7060       // If using the new shuffle lowering, just directly insert this.
7061       if (ExperimentalVectorShuffleLowering)
7062         return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
7063
7064       // Turn it into a shuffle of zero and zero-extended scalar to vector.
7065       Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, Subtarget, DAG);
7066       SmallVector<int, 8> MaskVec;
7067       for (unsigned i = 0; i != NumElems; ++i)
7068         MaskVec.push_back(i == Idx ? 0 : 1);
7069       return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]);
7070     }
7071   }
7072
7073   // Splat is obviously ok. Let legalizer expand it to a shuffle.
7074   if (Values.size() == 1) {
7075     if (EVTBits == 32) {
7076       // Instead of a shuffle like this:
7077       // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
7078       // Check if it's possible to issue this instead.
7079       // shuffle (vload ptr)), undef, <1, 1, 1, 1>
7080       unsigned Idx = countTrailingZeros(NonZeros);
7081       SDValue Item = Op.getOperand(Idx);
7082       if (Op.getNode()->isOnlyUserOf(Item.getNode()))
7083         return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
7084     }
7085     return SDValue();
7086   }
7087
7088   // A vector full of immediates; various special cases are already
7089   // handled, so this is best done with a single constant-pool load.
7090   if (IsAllConstants)
7091     return SDValue();
7092
7093   // For AVX-length vectors, see if we can use a vector load to get all of the
7094   // elements, otherwise build the individual 128-bit pieces and use
7095   // shuffles to put them in place.
7096   if (VT.is256BitVector() || VT.is512BitVector()) {
7097     SmallVector<SDValue, 64> V;
7098     for (unsigned i = 0; i != NumElems; ++i)
7099       V.push_back(Op.getOperand(i));
7100
7101     // Check for a build vector of consecutive loads.
7102     if (SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false))
7103       return LD;
7104
7105     EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
7106
7107     // Build both the lower and upper subvector.
7108     SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
7109                                 makeArrayRef(&V[0], NumElems/2));
7110     SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
7111                                 makeArrayRef(&V[NumElems / 2], NumElems/2));
7112
7113     // Recreate the wider vector with the lower and upper part.
7114     if (VT.is256BitVector())
7115       return Concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
7116     return Concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
7117   }
7118
7119   // Let legalizer expand 2-wide build_vectors.
7120   if (EVTBits == 64) {
7121     if (NumNonZero == 1) {
7122       // One half is zero or undef.
7123       unsigned Idx = countTrailingZeros(NonZeros);
7124       SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
7125                                  Op.getOperand(Idx));
7126       return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
7127     }
7128     return SDValue();
7129   }
7130
7131   // If element VT is < 32 bits, convert it to inserts into a zero vector.
7132   if (EVTBits == 8 && NumElems == 16) {
7133     SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG,
7134                                         Subtarget, *this);
7135     if (V.getNode()) return V;
7136   }
7137
7138   if (EVTBits == 16 && NumElems == 8) {
7139     SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG,
7140                                       Subtarget, *this);
7141     if (V.getNode()) return V;
7142   }
7143
7144   // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
7145   if (EVTBits == 32 && NumElems == 4) {
7146     SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget, *this);
7147     if (V.getNode())
7148       return V;
7149   }
7150
7151   // If element VT is == 32 bits, turn it into a number of shuffles.
7152   SmallVector<SDValue, 8> V(NumElems);
7153   if (NumElems == 4 && NumZero > 0) {
7154     for (unsigned i = 0; i < 4; ++i) {
7155       bool isZero = !(NonZeros & (1 << i));
7156       if (isZero)
7157         V[i] = getZeroVector(VT, Subtarget, DAG, dl);
7158       else
7159         V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
7160     }
7161
7162     for (unsigned i = 0; i < 2; ++i) {
7163       switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
7164         default: break;
7165         case 0:
7166           V[i] = V[i*2];  // Must be a zero vector.
7167           break;
7168         case 1:
7169           V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]);
7170           break;
7171         case 2:
7172           V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]);
7173           break;
7174         case 3:
7175           V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]);
7176           break;
7177       }
7178     }
7179
7180     bool Reverse1 = (NonZeros & 0x3) == 2;
7181     bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
7182     int MaskVec[] = {
7183       Reverse1 ? 1 : 0,
7184       Reverse1 ? 0 : 1,
7185       static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
7186       static_cast<int>(Reverse2 ? NumElems   : NumElems+1)
7187     };
7188     return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]);
7189   }
7190
7191   if (Values.size() > 1 && VT.is128BitVector()) {
7192     // Check for a build vector of consecutive loads.
7193     for (unsigned i = 0; i < NumElems; ++i)
7194       V[i] = Op.getOperand(i);
7195
7196     // Check for elements which are consecutive loads.
7197     SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false);
7198     if (LD.getNode())
7199       return LD;
7200
7201     // Check for a build vector from mostly shuffle plus few inserting.
7202     SDValue Sh = buildFromShuffleMostly(Op, DAG);
7203     if (Sh.getNode())
7204       return Sh;
7205
7206     // For SSE 4.1, use insertps to put the high elements into the low element.
7207     if (getSubtarget()->hasSSE41()) {
7208       SDValue Result;
7209       if (Op.getOperand(0).getOpcode() != ISD::UNDEF)
7210         Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
7211       else
7212         Result = DAG.getUNDEF(VT);
7213
7214       for (unsigned i = 1; i < NumElems; ++i) {
7215         if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue;
7216         Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
7217                              Op.getOperand(i), DAG.getIntPtrConstant(i));
7218       }
7219       return Result;
7220     }
7221
7222     // Otherwise, expand into a number of unpckl*, start by extending each of
7223     // our (non-undef) elements to the full vector width with the element in the
7224     // bottom slot of the vector (which generates no code for SSE).
7225     for (unsigned i = 0; i < NumElems; ++i) {
7226       if (Op.getOperand(i).getOpcode() != ISD::UNDEF)
7227         V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
7228       else
7229         V[i] = DAG.getUNDEF(VT);
7230     }
7231
7232     // Next, we iteratively mix elements, e.g. for v4f32:
7233     //   Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
7234     //         : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
7235     //   Step 2: unpcklps X, Y ==>    <3, 2, 1, 0>
7236     unsigned EltStride = NumElems >> 1;
7237     while (EltStride != 0) {
7238       for (unsigned i = 0; i < EltStride; ++i) {
7239         // If V[i+EltStride] is undef and this is the first round of mixing,
7240         // then it is safe to just drop this shuffle: V[i] is already in the
7241         // right place, the one element (since it's the first round) being
7242         // inserted as undef can be dropped.  This isn't safe for successive
7243         // rounds because they will permute elements within both vectors.
7244         if (V[i+EltStride].getOpcode() == ISD::UNDEF &&
7245             EltStride == NumElems/2)
7246           continue;
7247
7248         V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]);
7249       }
7250       EltStride >>= 1;
7251     }
7252     return V[0];
7253   }
7254   return SDValue();
7255 }
7256
7257 // LowerAVXCONCAT_VECTORS - 256-bit AVX can use the vinsertf128 instruction
7258 // to create 256-bit vectors from two other 128-bit ones.
7259 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
7260   SDLoc dl(Op);
7261   MVT ResVT = Op.getSimpleValueType();
7262
7263   assert((ResVT.is256BitVector() ||
7264           ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
7265
7266   SDValue V1 = Op.getOperand(0);
7267   SDValue V2 = Op.getOperand(1);
7268   unsigned NumElems = ResVT.getVectorNumElements();
7269   if(ResVT.is256BitVector())
7270     return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
7271
7272   if (Op.getNumOperands() == 4) {
7273     MVT HalfVT = MVT::getVectorVT(ResVT.getScalarType(),
7274                                 ResVT.getVectorNumElements()/2);
7275     SDValue V3 = Op.getOperand(2);
7276     SDValue V4 = Op.getOperand(3);
7277     return Concat256BitVectors(Concat128BitVectors(V1, V2, HalfVT, NumElems/2, DAG, dl),
7278       Concat128BitVectors(V3, V4, HalfVT, NumElems/2, DAG, dl), ResVT, NumElems, DAG, dl);
7279   }
7280   return Concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
7281 }
7282
7283 static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
7284   MVT LLVM_ATTRIBUTE_UNUSED VT = Op.getSimpleValueType();
7285   assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
7286          (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
7287           Op.getNumOperands() == 4)));
7288
7289   // AVX can use the vinsertf128 instruction to create 256-bit vectors
7290   // from two other 128-bit ones.
7291
7292   // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
7293   return LowerAVXCONCAT_VECTORS(Op, DAG);
7294 }
7295
7296
7297 //===----------------------------------------------------------------------===//
7298 // Vector shuffle lowering
7299 //
// This is an experimental code path for lowering vector shuffles on x86. It is
// designed to handle arbitrary vector shuffles and blends, gracefully
// degrading performance as necessary. It works hard to recognize idiomatic
// shuffles and lower them to optimal instruction patterns, while staying
// within a framework that allows reasonably efficient handling of all vector
// shuffle patterns.
7306 //===----------------------------------------------------------------------===//
7307
7308 /// \brief Tiny helper function to identify a no-op mask.
7309 ///
7310 /// This is a somewhat boring predicate function. It checks whether the mask
7311 /// array input, which is assumed to be a single-input shuffle mask of the kind
7312 /// used by the X86 shuffle instructions (not a fully general
7313 /// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
7314 /// in-place shuffle are 'no-op's.
7315 static bool isNoopShuffleMask(ArrayRef<int> Mask) {
7316   for (int i = 0, Size = Mask.size(); i < Size; ++i)
7317     if (Mask[i] != -1 && Mask[i] != i)
7318       return false;
7319   return true;
7320 }
7321
7322 /// \brief Helper function to classify a mask as a single-input mask.
7323 ///
7324 /// This isn't a generic single-input test because in the vector shuffle
7325 /// lowering we canonicalize single inputs to be the first input operand. This
7326 /// means we can more quickly test for a single input by only checking whether
7327 /// an input from the second operand exists. We also assume that the size of
7328 /// mask corresponds to the size of the input vectors which isn't true in the
7329 /// fully general case.
7330 static bool isSingleInputShuffleMask(ArrayRef<int> Mask) {
7331   for (int M : Mask)
7332     if (M >= (int)Mask.size())
7333       return false;
7334   return true;
7335 }
7336
7337 /// \brief Test whether there are elements crossing 128-bit lanes in this
7338 /// shuffle mask.
7339 ///
7340 /// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
7341 /// and we routinely test for these.
7342 static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
7343   int LaneSize = 128 / VT.getScalarSizeInBits();
7344   int Size = Mask.size();
7345   for (int i = 0; i < Size; ++i)
7346     if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
7347       return true;
7348   return false;
7349 }
7350
7351 /// \brief Test whether a shuffle mask is equivalent within each 128-bit lane.
7352 ///
7353 /// This checks a shuffle mask to see if it is performing the same
7354 /// 128-bit lane-relative shuffle in each 128-bit lane. This trivially implies
7355 /// that it is also not lane-crossing. It may however involve a blend from the
7356 /// same lane of a second vector.
7357 ///
7358 /// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
7359 /// non-trivial to compute in the face of undef lanes. The representation is
7360 /// *not* suitable for use with existing 128-bit shuffles as it will contain
7361 /// entries from both V1 and V2 inputs to the wider mask.
7362 static bool
7363 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
7364                                 SmallVectorImpl<int> &RepeatedMask) {
7365   int LaneSize = 128 / VT.getScalarSizeInBits();
7366   RepeatedMask.resize(LaneSize, -1);
7367   int Size = Mask.size();
7368   for (int i = 0; i < Size; ++i) {
7369     if (Mask[i] < 0)
7370       continue;
7371     if ((Mask[i] % Size) / LaneSize != i / LaneSize)
7372       // This entry crosses lanes, so there is no way to model this shuffle.
7373       return false;
7374
7375     // Ok, handle the in-lane shuffles by detecting if and when they repeat.
7376     if (RepeatedMask[i % LaneSize] == -1)
7377       // This is the first non-undef entry in this slot of a 128-bit lane.
7378       RepeatedMask[i % LaneSize] =
7379           Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + Size;
7380     else if (RepeatedMask[i % LaneSize] + (i / LaneSize) * LaneSize != Mask[i])
7381       // Found a mismatch with the repeated mask.
7382       return false;
7383   }
7384   return true;
7385 }
7386
7387 // Hide this symbol with an anonymous namespace instead of 'static' so that MSVC
7388 // 2013 will allow us to use it as a non-type template parameter.
7389 namespace {
7390
7391 /// \brief Implementation of the \c isShuffleEquivalent variadic functor.
7392 ///
7393 /// See its documentation for details.
7394 bool isShuffleEquivalentImpl(ArrayRef<int> Mask, ArrayRef<const int *> Args) {
7395   if (Mask.size() != Args.size())
7396     return false;
7397   for (int i = 0, e = Mask.size(); i < e; ++i) {
7398     assert(*Args[i] >= 0 && "Arguments must be positive integers!");
7399     if (Mask[i] != -1 && Mask[i] != *Args[i])
7400       return false;
7401   }
7402   return true;
7403 }
7404
7405 } // namespace
7406
/// \brief Checks whether a shuffle mask is equivalent to an explicit list of
/// arguments.
///
/// This is a fast way to test a shuffle mask against a fixed pattern:
///
///   if (isShuffleEquivalent(Mask, 3, 2, 1, 0)) { ... }
///
/// It returns true if the mask is exactly as wide as the argument list, and
/// each element of the mask is either -1 (signifying undef) or the value given
/// in the argument.
///
/// The listed pattern values must be non-negative; the implementation
/// (\c isShuffleEquivalentImpl) asserts this, so an undef slot cannot be
/// spelled as -1 in the pattern itself.
static const VariadicFunction1<
    bool, ArrayRef<int>, int, isShuffleEquivalentImpl> isShuffleEquivalent = {};
7419
7420 /// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
7421 ///
7422 /// This helper function produces an 8-bit shuffle immediate corresponding to
7423 /// the ubiquitous shuffle encoding scheme used in x86 instructions for
7424 /// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
7425 /// example.
7426 ///
7427 /// NB: We rely heavily on "undef" masks preserving the input lane.
7428 static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask,
7429                                           SelectionDAG &DAG) {
7430   assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
7431   assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
7432   assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
7433   assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
7434   assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
7435
7436   unsigned Imm = 0;
7437   Imm |= (Mask[0] == -1 ? 0 : Mask[0]) << 0;
7438   Imm |= (Mask[1] == -1 ? 1 : Mask[1]) << 2;
7439   Imm |= (Mask[2] == -1 ? 2 : Mask[2]) << 4;
7440   Imm |= (Mask[3] == -1 ? 3 : Mask[3]) << 6;
7441   return DAG.getConstant(Imm, MVT::i8);
7442 }
7443
/// \brief Try to emit a blend instruction for a shuffle.
///
/// This doesn't do any checks for the availability of instructions for blending
/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
/// be matched in the backend with the type given. What it does check for is
/// that the shuffle mask is in fact a blend.
///
/// A mask is a blend when every non-undef element i is either i (taken from
/// \p V1) or i + Size (taken from \p V2), i.e. no element changes position.
///
/// \returns an empty SDValue when \p Mask is not a blend. Otherwise returns
/// the lowered node, whose form depends on \p VT:
///   - v2f64/v4f32/v4f64/v8f32: X86ISD::BLENDI on \p VT directly.
///   - v2i64/v4i32/v4i64/v8i32 with AVX2: X86ISD::BLENDI on v4i32/v8i32 with
///     the mask scaled to 32-bit dwords.
///   - v2i64/v4i32 without AVX2, and v8i16: X86ISD::BLENDI on v8i16 with the
///     mask scaled to 16-bit words.
///   - v16i16 with a lane-repeated mask: X86ISD::BLENDI on v16i16.
///   - v16i16 otherwise, and v32i8: an ISD::VSELECT on v32i8 with a constant
///     byte mask.
/// Any other type hits llvm_unreachable.
static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
                                         SDValue V2, ArrayRef<int> Mask,
                                         const X86Subtarget *Subtarget,
                                         SelectionDAG &DAG) {

  // First verify the mask is a pure blend and, while doing so, collect one bit
  // per element: bit i is set when element i comes from V2.
  unsigned BlendMask = 0;
  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
    if (Mask[i] >= Size) {
      if (Mask[i] != i + Size)
        return SDValue(); // Shuffled V2 input!
      BlendMask |= 1u << i;
      continue;
    }
    if (Mask[i] >= 0 && Mask[i] != i)
      return SDValue(); // Shuffled V1 input!
  }
  switch (VT.SimpleTy) {
  case MVT::v2f64:
  case MVT::v4f32:
  case MVT::v4f64:
  case MVT::v8f32:
    // Floating point types blend at their native element width, so the
    // per-element mask computed above is used as-is.
    return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
                       DAG.getConstant(BlendMask, MVT::i8));

  case MVT::v4i64:
  case MVT::v8i32:
    assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
    // FALLTHROUGH
  case MVT::v2i64:
  case MVT::v4i32:
    // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
    // that instruction.
    if (Subtarget->hasAVX2()) {
      // Scale the blend by the number of 32-bit dwords per element.
      int Scale =  VT.getScalarSizeInBits() / 32;
      BlendMask = 0;
      for (int i = 0, Size = Mask.size(); i < Size; ++i)
        if (Mask[i] >= Size)
          for (int j = 0; j < Scale; ++j)
            BlendMask |= 1u << (i * Scale + j);

      // Blend as a vector of dwords of the same total width, then cast back.
      MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
      V1 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V1);
      V2 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V2);
      return DAG.getNode(ISD::BITCAST, DL, VT,
                         DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
                                     DAG.getConstant(BlendMask, MVT::i8)));
    }
    // Without AVX2 the 128-bit integer types are handled as v8i16 below.
    // FALLTHROUGH
  case MVT::v8i16: {
    // For integer shuffles we need to expand the mask and cast the inputs to
    // v8i16s prior to blending.
    int Scale = 8 / VT.getVectorNumElements();
    BlendMask = 0;
    for (int i = 0, Size = Mask.size(); i < Size; ++i)
      if (Mask[i] >= Size)
        for (int j = 0; j < Scale; ++j)
          BlendMask |= 1u << (i * Scale + j);

    V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1);
    V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V2);
    return DAG.getNode(ISD::BITCAST, DL, VT,
                       DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
                                   DAG.getConstant(BlendMask, MVT::i8)));
  }

  case MVT::v16i16: {
    assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
    SmallVector<int, 8> RepeatedMask;
    if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
      // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
      assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
      BlendMask = 0;
      // Entries of 16 or more in the repeated mask denote elements of V2.
      for (int i = 0; i < 8; ++i)
        if (RepeatedMask[i] >= 16)
          BlendMask |= 1u << i;
      return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
                         DAG.getConstant(BlendMask, MVT::i8));
    }
  }
    // The two lanes blend differently, so an 8-bit immediate cannot express
    // the mask; use the byte-wise VSELECT lowering below instead.
    // FALLTHROUGH
  case MVT::v32i8: {
    assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
    // Scale the blend by the number of bytes per element.
    int Scale =  VT.getScalarSizeInBits() / 8;
    assert(Mask.size() * Scale == 32 && "Not a 256-bit vector!");

    // Compute the VSELECT mask. Note that VSELECT is really confusing in the
    // mix of LLVM's code generator and the x86 backend. We tell the code
    // generator that boolean values in the elements of an x86 vector register
    // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
    // mapping a select to operand #1, and 'false' mapping to operand #2. The
    // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
    // of the element (the remaining are ignored) and 0 in that high bit would
    // mean operand #1 while 1 in the high bit would mean operand #2. So while
    // the LLVM model for boolean values in vector elements gets the relevant
    // bit set, it is set backwards and over constrained relative to x86's
    // actual model.
    //
    // Concretely: each byte of the mask is undef for an undef shuffle element,
    // -1 (select V1) for an element taken from V1 and 0 (select V2) for an
    // element taken from V2.
    SDValue VSELECTMask[32];
    for (int i = 0, Size = Mask.size(); i < Size; ++i)
      for (int j = 0; j < Scale; ++j)
        VSELECTMask[Scale * i + j] =
            Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
                        : DAG.getConstant(Mask[i] < Size ? -1 : 0, MVT::i8);

    V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V1);
    V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V2);
    return DAG.getNode(
        ISD::BITCAST, DL, VT,
        DAG.getNode(ISD::VSELECT, DL, MVT::v32i8,
                    DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, VSELECTMask),
                    V1, V2));
  }

  default:
    llvm_unreachable("Not a supported integer vector type!");
  }
}
7568
7569 /// \brief Generic routine to lower a shuffle and blend as a decomposed set of
7570 /// unblended shuffles followed by an unshuffled blend.
7571 ///
7572 /// This matches the extremely common pattern for handling combined
7573 /// shuffle+blend operations on newer X86 ISAs where we have very fast blend
7574 /// operations.
7575 static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(SDLoc DL, MVT VT,
7576                                                           SDValue V1,
7577                                                           SDValue V2,
7578                                                           ArrayRef<int> Mask,
7579                                                           SelectionDAG &DAG) {
7580   // Shuffle the input elements into the desired positions in V1 and V2 and
7581   // blend them together.
7582   SmallVector<int, 32> V1Mask(Mask.size(), -1);
7583   SmallVector<int, 32> V2Mask(Mask.size(), -1);
7584   SmallVector<int, 32> BlendMask(Mask.size(), -1);
7585   for (int i = 0, Size = Mask.size(); i < Size; ++i)
7586     if (Mask[i] >= 0 && Mask[i] < Size) {
7587       V1Mask[i] = Mask[i];
7588       BlendMask[i] = i;
7589     } else if (Mask[i] >= Size) {
7590       V2Mask[i] = Mask[i] - Size;
7591       BlendMask[i] = i + Size;
7592     }
7593
7594   V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
7595   V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
7596   return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
7597 }
7598
7599 /// \brief Try to lower a vector shuffle as a byte rotation.
7600 ///
7601 /// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
7602 /// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
7603 /// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
7604 /// try to generically lower a vector shuffle through such an pattern. It
7605 /// does not check for the profitability of lowering either as PALIGNR or
7606 /// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
7607 /// This matches shuffle vectors that look like:
7608 ///
7609 ///   v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
7610 ///
7611 /// Essentially it concatenates V1 and V2, shifts right by some number of
7612 /// elements, and takes the low elements as the result. Note that while this is
7613 /// specified as a *right shift* because x86 is little-endian, it is a *left
7614 /// rotate* of the vector lanes.
7615 ///
7616 /// Note that this only handles 128-bit vector widths currently.
7617 static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1,
7618                                               SDValue V2,
7619                                               ArrayRef<int> Mask,
7620                                               const X86Subtarget *Subtarget,
7621                                               SelectionDAG &DAG) {
7622   assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
7623
7624   // We need to detect various ways of spelling a rotation:
7625   //   [11, 12, 13, 14, 15,  0,  1,  2]
7626   //   [-1, 12, 13, 14, -1, -1,  1, -1]
7627   //   [-1, -1, -1, -1, -1, -1,  1,  2]
7628   //   [ 3,  4,  5,  6,  7,  8,  9, 10]
7629   //   [-1,  4,  5,  6, -1, -1,  9, -1]
7630   //   [-1,  4,  5,  6, -1, -1, -1, -1]
7631   int Rotation = 0;
7632   SDValue Lo, Hi;
7633   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7634     if (Mask[i] == -1)
7635       continue;
7636     assert(Mask[i] >= 0 && "Only -1 is a valid negative mask element!");
7637
7638     // Based on the mod-Size value of this mask element determine where
7639     // a rotated vector would have started.
7640     int StartIdx = i - (Mask[i] % Size);
7641     if (StartIdx == 0)
7642       // The identity rotation isn't interesting, stop.
7643       return SDValue();
7644
7645     // If we found the tail of a vector the rotation must be the missing
7646     // front. If we found the head of a vector, it must be how much of the head.
7647     int CandidateRotation = StartIdx < 0 ? -StartIdx : Size - StartIdx;
7648
7649     if (Rotation == 0)
7650       Rotation = CandidateRotation;
7651     else if (Rotation != CandidateRotation)
7652       // The rotations don't match, so we can't match this mask.
7653       return SDValue();
7654
7655     // Compute which value this mask is pointing at.
7656     SDValue MaskV = Mask[i] < Size ? V1 : V2;
7657
7658     // Compute which of the two target values this index should be assigned to.
7659     // This reflects whether the high elements are remaining or the low elements
7660     // are remaining.
7661     SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
7662
7663     // Either set up this value if we've not encountered it before, or check
7664     // that it remains consistent.
7665     if (!TargetV)
7666       TargetV = MaskV;
7667     else if (TargetV != MaskV)
7668       // This may be a rotation, but it pulls from the inputs in some
7669       // unsupported interleaving.
7670       return SDValue();
7671   }
7672
7673   // Check that we successfully analyzed the mask, and normalize the results.
7674   assert(Rotation != 0 && "Failed to locate a viable rotation!");
7675   assert((Lo || Hi) && "Failed to find a rotated input vector!");
7676   if (!Lo)
7677     Lo = Hi;
7678   else if (!Hi)
7679     Hi = Lo;
7680
7681   assert(VT.getSizeInBits() == 128 &&
7682          "Rotate-based lowering only supports 128-bit lowering!");
7683   assert(Mask.size() <= 16 &&
7684          "Can shuffle at most 16 bytes in a 128-bit vector!");
7685
7686   // The actual rotate instruction rotates bytes, so we need to scale the
7687   // rotation based on how many bytes are in the vector.
7688   int Scale = 16 / Mask.size();
7689
7690   // SSSE3 targets can use the palignr instruction
7691   if (Subtarget->hasSSSE3()) {
7692     // Cast the inputs to v16i8 to match PALIGNR.
7693     Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Lo);
7694     Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Hi);
7695
7696     return DAG.getNode(ISD::BITCAST, DL, VT,
7697                        DAG.getNode(X86ISD::PALIGNR, DL, MVT::v16i8, Hi, Lo,
7698                                    DAG.getConstant(Rotation * Scale, MVT::i8)));
7699   }
7700
7701   // Default SSE2 implementation
7702   int LoByteShift = 16 - Rotation * Scale;
7703   int HiByteShift = Rotation * Scale;
7704
7705   // Cast the inputs to v2i64 to match PSLLDQ/PSRLDQ.
7706   Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Lo);
7707   Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Hi);
7708
7709   SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, Lo,
7710                                 DAG.getConstant(8 * LoByteShift, MVT::i8));
7711   SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, Hi,
7712                                 DAG.getConstant(8 * HiByteShift, MVT::i8));
7713   return DAG.getNode(ISD::BITCAST, DL, VT,
7714                      DAG.getNode(ISD::OR, DL, MVT::v2i64, LoShift, HiShift));
7715 }
7716
7717 /// \brief Compute whether each element of a shuffle is zeroable.
7718 ///
7719 /// A "zeroable" vector shuffle element is one which can be lowered to zero.
7720 /// Either it is an undef element in the shuffle mask, the element of the input
7721 /// referenced is undef, or the element of the input referenced is known to be
7722 /// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
7723 /// as many lanes with this technique as possible to simplify the remaining
7724 /// shuffle.
7725 static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
7726                                                      SDValue V1, SDValue V2) {
7727   SmallBitVector Zeroable(Mask.size(), false);
7728
7729   bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
7730   bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
7731
7732   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7733     int M = Mask[i];
7734     // Handle the easy cases.
7735     if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
7736       Zeroable[i] = true;
7737       continue;
7738     }
7739
7740     // If this is an index into a build_vector node, dig out the input value and
7741     // use it.
7742     SDValue V = M < Size ? V1 : V2;
7743     if (V.getOpcode() != ISD::BUILD_VECTOR)
7744       continue;
7745
7746     SDValue Input = V.getOperand(M % Size);
7747     // The UNDEF opcode check really should be dead code here, but not quite
7748     // worth asserting on (it isn't invalid, just unexpected).
7749     if (Input.getOpcode() == ISD::UNDEF || X86::isZeroNode(Input))
7750       Zeroable[i] = true;
7751   }
7752
7753   return Zeroable;
7754 }
7755
7756 /// \brief Try to lower a vector shuffle as a byte shift (shifts in zeros).
7757 ///
7758 /// Attempts to match a shuffle mask against the PSRLDQ and PSLLDQ SSE2
7759 /// byte-shift instructions. The mask must consist of a shifted sequential
7760 /// shuffle from one of the input vectors and zeroable elements for the
7761 /// remaining 'shifted in' elements.
7762 ///
7763 /// Note that this only handles 128-bit vector widths currently.
7764 static SDValue lowerVectorShuffleAsByteShift(SDLoc DL, MVT VT, SDValue V1,
7765                                              SDValue V2, ArrayRef<int> Mask,
7766                                              SelectionDAG &DAG) {
7767   assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
7768
7769   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
7770
7771   int Size = Mask.size();
7772   int Scale = 16 / Size;
7773
7774   for (int Shift = 1; Shift < Size; Shift++) {
7775     int ByteShift = Shift * Scale;
7776
7777     // PSRLDQ : (little-endian) right byte shift
7778     // [ 5,  6,  7, zz, zz, zz, zz, zz]
7779     // [ -1, 5,  6,  7, zz, zz, zz, zz]
7780     // [  1, 2, -1, -1, -1, -1, zz, zz]
7781     bool ZeroableRight = true;
7782     for (int i = Size - Shift; i < Size; i++) {
7783       ZeroableRight &= Zeroable[i];
7784     }
7785
7786     if (ZeroableRight) {
7787       bool ValidShiftRight1 =
7788           isSequentialOrUndefInRange(Mask, 0, Size - Shift, Shift);
7789       bool ValidShiftRight2 =
7790           isSequentialOrUndefInRange(Mask, 0, Size - Shift, Size + Shift);
7791
7792       if (ValidShiftRight1 || ValidShiftRight2) {
7793         // Cast the inputs to v2i64 to match PSRLDQ.
7794         SDValue &TargetV = ValidShiftRight1 ? V1 : V2;
7795         SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, TargetV);
7796         SDValue Shifted = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, V,
7797                                       DAG.getConstant(ByteShift * 8, MVT::i8));
7798         return DAG.getNode(ISD::BITCAST, DL, VT, Shifted);
7799       }
7800     }
7801
7802     // PSLLDQ : (little-endian) left byte shift
7803     // [ zz,  0,  1,  2,  3,  4,  5,  6]
7804     // [ zz, zz, -1, -1,  2,  3,  4, -1]
7805     // [ zz, zz, zz, zz, zz, zz, -1,  1]
7806     bool ZeroableLeft = true;
7807     for (int i = 0; i < Shift; i++) {
7808       ZeroableLeft &= Zeroable[i];
7809     }
7810
7811     if (ZeroableLeft) {
7812       bool ValidShiftLeft1 =
7813           isSequentialOrUndefInRange(Mask, Shift, Size - Shift, 0);
7814       bool ValidShiftLeft2 =
7815           isSequentialOrUndefInRange(Mask, Shift, Size - Shift, Size);
7816
7817       if (ValidShiftLeft1 || ValidShiftLeft2) {
7818         // Cast the inputs to v2i64 to match PSLLDQ.
7819         SDValue &TargetV = ValidShiftLeft1 ? V1 : V2;
7820         SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, TargetV);
7821         SDValue Shifted = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, V,
7822                                       DAG.getConstant(ByteShift * 8, MVT::i8));
7823         return DAG.getNode(ISD::BITCAST, DL, VT, Shifted);
7824       }
7825     }
7826   }
7827
7828   return SDValue();
7829 }
7830
7831 /// \brief Lower a vector shuffle as a zero or any extension.
7832 ///
7833 /// Given a specific number of elements, element bit width, and extension
7834 /// stride, produce either a zero or any extension based on the available
7835 /// features of the subtarget.
7836 static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
7837     SDLoc DL, MVT VT, int NumElements, int Scale, bool AnyExt, SDValue InputV,
7838     const X86Subtarget *Subtarget, SelectionDAG &DAG) {
7839   assert(Scale > 1 && "Need a scale to extend.");
7840   int EltBits = VT.getSizeInBits() / NumElements;
7841   assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
7842          "Only 8, 16, and 32 bit elements can be extended.");
7843   assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
7844
7845   // Found a valid zext mask! Try various lowering strategies based on the
7846   // input type and available ISA extensions.
7847   if (Subtarget->hasSSE41()) {
7848     MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
7849     MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
7850                                  NumElements / Scale);
7851     InputV = DAG.getNode(ISD::BITCAST, DL, InputVT, InputV);
7852     return DAG.getNode(ISD::BITCAST, DL, VT,
7853                        DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV));
7854   }
7855
7856   // For any extends we can cheat for larger element sizes and use shuffle
7857   // instructions that can fold with a load and/or copy.
7858   if (AnyExt && EltBits == 32) {
7859     int PSHUFDMask[4] = {0, -1, 1, -1};
7860     return DAG.getNode(
7861         ISD::BITCAST, DL, VT,
7862         DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
7863                     DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, InputV),
7864                     getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
7865   }
7866   if (AnyExt && EltBits == 16 && Scale > 2) {
7867     int PSHUFDMask[4] = {0, -1, 0, -1};
7868     InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
7869                          DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, InputV),
7870                          getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG));
7871     int PSHUFHWMask[4] = {1, -1, -1, -1};
7872     return DAG.getNode(
7873         ISD::BITCAST, DL, VT,
7874         DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16,
7875                     DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, InputV),
7876                     getV4X86ShuffleImm8ForMask(PSHUFHWMask, DAG)));
7877   }
7878
7879   // If this would require more than 2 unpack instructions to expand, use
7880   // pshufb when available. We can only use more than 2 unpack instructions
7881   // when zero extending i8 elements which also makes it easier to use pshufb.
7882   if (Scale > 4 && EltBits == 8 && Subtarget->hasSSSE3()) {
7883     assert(NumElements == 16 && "Unexpected byte vector width!");
7884     SDValue PSHUFBMask[16];
7885     for (int i = 0; i < 16; ++i)
7886       PSHUFBMask[i] =
7887           DAG.getConstant((i % Scale == 0) ? i / Scale : 0x80, MVT::i8);
7888     InputV = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, InputV);
7889     return DAG.getNode(ISD::BITCAST, DL, VT,
7890                        DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
7891                                    DAG.getNode(ISD::BUILD_VECTOR, DL,
7892                                                MVT::v16i8, PSHUFBMask)));
7893   }
7894
7895   // Otherwise emit a sequence of unpacks.
7896   do {
7897     MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
7898     SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
7899                          : getZeroVector(InputVT, Subtarget, DAG, DL);
7900     InputV = DAG.getNode(ISD::BITCAST, DL, InputVT, InputV);
7901     InputV = DAG.getNode(X86ISD::UNPCKL, DL, InputVT, InputV, Ext);
7902     Scale /= 2;
7903     EltBits *= 2;
7904     NumElements /= 2;
7905   } while (Scale > 1);
7906   return DAG.getNode(ISD::BITCAST, DL, VT, InputV);
7907 }
7908
7909 /// \brief Try to lower a vector shuffle as a zero extension on any micrarch.
7910 ///
7911 /// This routine will try to do everything in its power to cleverly lower
7912 /// a shuffle which happens to match the pattern of a zero extend. It doesn't
7913 /// check for the profitability of this lowering,  it tries to aggressively
7914 /// match this pattern. It will use all of the micro-architectural details it
7915 /// can to emit an efficient lowering. It handles both blends with all-zero
7916 /// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
7917 /// masking out later).
7918 ///
7919 /// The reason we have dedicated lowering for zext-style shuffles is that they
7920 /// are both incredibly common and often quite performance sensitive.
7921 static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
7922     SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
7923     const X86Subtarget *Subtarget, SelectionDAG &DAG) {
7924   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
7925
7926   int Bits = VT.getSizeInBits();
7927   int NumElements = Mask.size();
7928
7929   // Define a helper function to check a particular ext-scale and lower to it if
7930   // valid.
7931   auto Lower = [&](int Scale) -> SDValue {
7932     SDValue InputV;
7933     bool AnyExt = true;
7934     for (int i = 0; i < NumElements; ++i) {
7935       if (Mask[i] == -1)
7936         continue; // Valid anywhere but doesn't tell us anything.
7937       if (i % Scale != 0) {
7938         // Each of the extend elements needs to be zeroable.
7939         if (!Zeroable[i])
7940           return SDValue();
7941
7942         // We no lorger are in the anyext case.
7943         AnyExt = false;
7944         continue;
7945       }
7946
7947       // Each of the base elements needs to be consecutive indices into the
7948       // same input vector.
7949       SDValue V = Mask[i] < NumElements ? V1 : V2;
7950       if (!InputV)
7951         InputV = V;
7952       else if (InputV != V)
7953         return SDValue(); // Flip-flopping inputs.
7954
7955       if (Mask[i] % NumElements != i / Scale)
7956         return SDValue(); // Non-consecutive strided elemenst.
7957     }
7958
7959     // If we fail to find an input, we have a zero-shuffle which should always
7960     // have already been handled.
7961     // FIXME: Maybe handle this here in case during blending we end up with one?
7962     if (!InputV)
7963       return SDValue();
7964
7965     return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
7966         DL, VT, NumElements, Scale, AnyExt, InputV, Subtarget, DAG);
7967   };
7968
7969   // The widest scale possible for extending is to a 64-bit integer.
7970   assert(Bits % 64 == 0 &&
7971          "The number of bits in a vector must be divisible by 64 on x86!");
7972   int NumExtElements = Bits / 64;
7973
7974   // Each iteration, try extending the elements half as much, but into twice as
7975   // many elements.
7976   for (; NumExtElements < NumElements; NumExtElements *= 2) {
7977     assert(NumElements % NumExtElements == 0 &&
7978            "The input vector size must be divisble by the extended size.");
7979     if (SDValue V = Lower(NumElements / NumExtElements))
7980       return V;
7981   }
7982
7983   // No viable ext lowering found.
7984   return SDValue();
7985 }
7986
7987 /// \brief Try to get a scalar value for a specific element of a vector.
7988 ///
7989 /// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
7990 static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
7991                                               SelectionDAG &DAG) {
7992   MVT VT = V.getSimpleValueType();
7993   MVT EltVT = VT.getVectorElementType();
7994   while (V.getOpcode() == ISD::BITCAST)
7995     V = V.getOperand(0);
7996   // If the bitcasts shift the element size, we can't extract an equivalent
7997   // element from it.
7998   MVT NewVT = V.getSimpleValueType();
7999   if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
8000     return SDValue();
8001
8002   if (V.getOpcode() == ISD::BUILD_VECTOR ||
8003       (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR))
8004     return DAG.getNode(ISD::BITCAST, SDLoc(V), EltVT, V.getOperand(Idx));
8005
8006   return SDValue();
8007 }
8008
8009 /// \brief Helper to test for a load that can be folded with x86 shuffles.
8010 ///
8011 /// This is particularly important because the set of instructions varies
8012 /// significantly based on whether the operand is a load or not.
8013 static bool isShuffleFoldableLoad(SDValue V) {
8014   while (V.getOpcode() == ISD::BITCAST)
8015     V = V.getOperand(0);
8016
8017   return ISD::isNON_EXTLoad(V.getNode());
8018 }
8019
8020 /// \brief Try to lower insertion of a single element into a zero vector.
8021 ///
8022 /// This is a common pattern that we have especially efficient patterns to lower
8023 /// across all subtarget feature sets.
8024 static SDValue lowerVectorShuffleAsElementInsertion(
8025     MVT VT, SDLoc DL, SDValue V1, SDValue V2, ArrayRef<int> Mask,
8026     const X86Subtarget *Subtarget, SelectionDAG &DAG) {
8027   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
8028   MVT ExtVT = VT;
8029   MVT EltVT = VT.getVectorElementType();
8030
8031   int V2Index = std::find_if(Mask.begin(), Mask.end(),
8032                              [&Mask](int M) { return M >= (int)Mask.size(); }) -
8033                 Mask.begin();
8034   bool IsV1Zeroable = true;
8035   for (int i = 0, Size = Mask.size(); i < Size; ++i)
8036     if (i != V2Index && !Zeroable[i]) {
8037       IsV1Zeroable = false;
8038       break;
8039     }
8040
8041   // Check for a single input from a SCALAR_TO_VECTOR node.
8042   // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
8043   // all the smarts here sunk into that routine. However, the current
8044   // lowering of BUILD_VECTOR makes that nearly impossible until the old
8045   // vector shuffle lowering is dead.
8046   if (SDValue V2S = getScalarValueForVectorElement(
8047           V2, Mask[V2Index] - Mask.size(), DAG)) {
8048     // We need to zext the scalar if it is smaller than an i32.
8049     V2S = DAG.getNode(ISD::BITCAST, DL, EltVT, V2S);
8050     if (EltVT == MVT::i8 || EltVT == MVT::i16) {
8051       // Using zext to expand a narrow element won't work for non-zero
8052       // insertions.
8053       if (!IsV1Zeroable)
8054         return SDValue();
8055
8056       // Zero-extend directly to i32.
8057       ExtVT = MVT::v4i32;
8058       V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
8059     }
8060     V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
8061   } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
8062              EltVT == MVT::i16) {
8063     // Either not inserting from the low element of the input or the input
8064     // element size is too small to use VZEXT_MOVL to clear the high bits.
8065     return SDValue();
8066   }
8067
8068   if (!IsV1Zeroable) {
8069     // If V1 can't be treated as a zero vector we have fewer options to lower
8070     // this. We can't support integer vectors or non-zero targets cheaply, and
8071     // the V1 elements can't be permuted in any way.
8072     assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
8073     if (!VT.isFloatingPoint() || V2Index != 0)
8074       return SDValue();
8075     SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
8076     V1Mask[V2Index] = -1;
8077     if (!isNoopShuffleMask(V1Mask))
8078       return SDValue();
8079     // This is essentially a special case blend operation, but if we have
8080     // general purpose blend operations, they are always faster. Bail and let
8081     // the rest of the lowering handle these as blends.
8082     if (Subtarget->hasSSE41())
8083       return SDValue();
8084
8085     // Otherwise, use MOVSD or MOVSS.
8086     assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
8087            "Only two types of floating point element types to handle!");
8088     return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
8089                        ExtVT, V1, V2);
8090   }
8091
8092   V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
8093   if (ExtVT != VT)
8094     V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2);
8095
8096   if (V2Index != 0) {
8097     // If we have 4 or fewer lanes we can cheaply shuffle the element into
8098     // the desired position. Otherwise it is more efficient to do a vector
8099     // shift left. We know that we can do a vector shift left because all
8100     // the inputs are zero.
8101     if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
8102       SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
8103       V2Shuffle[V2Index] = 0;
8104       V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
8105     } else {
8106       V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, V2);
8107       V2 = DAG.getNode(
8108           X86ISD::VSHLDQ, DL, MVT::v2i64, V2,
8109           DAG.getConstant(
8110               V2Index * EltVT.getSizeInBits(),
8111               DAG.getTargetLoweringInfo().getScalarShiftAmountTy(MVT::v2i64)));
8112       V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2);
8113     }
8114   }
8115   return V2;
8116 }
8117
8118 /// \brief Try to lower broadcast of a single element.
8119 ///
8120 /// For convenience, this code also bundles all of the subtarget feature set
8121 /// filtering. While a little annoying to re-dispatch on type here, there isn't
8122 /// a convenient way to factor it out.
8123 static SDValue lowerVectorShuffleAsBroadcast(MVT VT, SDLoc DL, SDValue V,
8124                                              ArrayRef<int> Mask,
8125                                              const X86Subtarget *Subtarget,
8126                                              SelectionDAG &DAG) {
8127   if (!Subtarget->hasAVX())
8128     return SDValue();
8129   if (VT.isInteger() && !Subtarget->hasAVX2())
8130     return SDValue();
8131
8132   // Check that the mask is a broadcast.
8133   int BroadcastIdx = -1;
8134   for (int M : Mask)
8135     if (M >= 0 && BroadcastIdx == -1)
8136       BroadcastIdx = M;
8137     else if (M >= 0 && M != BroadcastIdx)
8138       return SDValue();
8139
8140   assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
8141                                             "a sorted mask where the broadcast "
8142                                             "comes from V1.");
8143
8144   // Go up the chain of (vector) values to try and find a scalar load that
8145   // we can combine with the broadcast.
8146   for (;;) {
8147     switch (V.getOpcode()) {
8148     case ISD::CONCAT_VECTORS: {
8149       int OperandSize = Mask.size() / V.getNumOperands();
8150       V = V.getOperand(BroadcastIdx / OperandSize);
8151       BroadcastIdx %= OperandSize;
8152       continue;
8153     }
8154
8155     case ISD::INSERT_SUBVECTOR: {
8156       SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
8157       auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
8158       if (!ConstantIdx)
8159         break;
8160
8161       int BeginIdx = (int)ConstantIdx->getZExtValue();
8162       int EndIdx =
8163           BeginIdx + (int)VInner.getValueType().getVectorNumElements();
8164       if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) {
8165         BroadcastIdx -= BeginIdx;
8166         V = VInner;
8167       } else {
8168         V = VOuter;
8169       }
8170       continue;
8171     }
8172     }
8173     break;
8174   }
8175
8176   // Check if this is a broadcast of a scalar. We special case lowering
8177   // for scalars so that we can more effectively fold with loads.
8178   if (V.getOpcode() == ISD::BUILD_VECTOR ||
8179       (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
8180     V = V.getOperand(BroadcastIdx);
8181
8182     // If the scalar isn't a load we can't broadcast from it in AVX1, only with
8183     // AVX2.
8184     if (!Subtarget->hasAVX2() && !isShuffleFoldableLoad(V))
8185       return SDValue();
8186   } else if (BroadcastIdx != 0 || !Subtarget->hasAVX2()) {
8187     // We can't broadcast from a vector register w/o AVX2, and we can only
8188     // broadcast from the zero-element of a vector register.
8189     return SDValue();
8190   }
8191
8192   return DAG.getNode(X86ISD::VBROADCAST, DL, VT, V);
8193 }
8194
8195 // Check for whether we can use INSERTPS to perform the shuffle. We only use
8196 // INSERTPS when the V1 elements are already in the correct locations
8197 // because otherwise we can just always use two SHUFPS instructions which
8198 // are much smaller to encode than a SHUFPS and an INSERTPS. We can also
8199 // perform INSERTPS if a single V1 element is out of place and all V2
8200 // elements are zeroable.
8201 static SDValue lowerVectorShuffleAsInsertPS(SDValue Op, SDValue V1, SDValue V2,
8202                                             ArrayRef<int> Mask,
8203                                             SelectionDAG &DAG) {
8204   assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!");
8205   assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
8206   assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
8207   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
8208
8209   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
8210
8211   unsigned ZMask = 0;
8212   int V1DstIndex = -1;
8213   int V2DstIndex = -1;
8214   bool V1UsedInPlace = false;
8215
8216   for (int i = 0; i < 4; i++) {
8217     // Synthesize a zero mask from the zeroable elements (includes undefs).
8218     if (Zeroable[i]) {
8219       ZMask |= 1 << i;
8220       continue;
8221     }
8222
8223     // Flag if we use any V1 inputs in place.
8224     if (i == Mask[i]) {
8225       V1UsedInPlace = true;
8226       continue;
8227     }
8228
8229     // We can only insert a single non-zeroable element.
8230     if (V1DstIndex != -1 || V2DstIndex != -1)
8231       return SDValue();
8232
8233     if (Mask[i] < 4) {
8234       // V1 input out of place for insertion.
8235       V1DstIndex = i;
8236     } else {
8237       // V2 input for insertion.
8238       V2DstIndex = i;
8239     }
8240   }
8241
8242   // Don't bother if we have no (non-zeroable) element for insertion.
8243   if (V1DstIndex == -1 && V2DstIndex == -1)
8244     return SDValue();
8245
8246   // Determine element insertion src/dst indices. The src index is from the
8247   // start of the inserted vector, not the start of the concatenated vector.
8248   unsigned V2SrcIndex = 0;
8249   if (V1DstIndex != -1) {
8250     // If we have a V1 input out of place, we use V1 as the V2 element insertion
8251     // and don't use the original V2 at all.
8252     V2SrcIndex = Mask[V1DstIndex];
8253     V2DstIndex = V1DstIndex;
8254     V2 = V1;
8255   } else {
8256     V2SrcIndex = Mask[V2DstIndex] - 4;
8257   }
8258
8259   // If no V1 inputs are used in place, then the result is created only from
8260   // the zero mask and the V2 insertion - so remove V1 dependency.
8261   if (!V1UsedInPlace)
8262     V1 = DAG.getUNDEF(MVT::v4f32);
8263
8264   unsigned InsertPSMask = V2SrcIndex << 6 | V2DstIndex << 4 | ZMask;
8265   assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
8266
8267   // Insert the V2 element into the desired position.
8268   SDLoc DL(Op);
8269   return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
8270                      DAG.getConstant(InsertPSMask, MVT::i8));
8271 }
8272
8273 /// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
8274 ///
8275 /// This is the basis function for the 2-lane 64-bit shuffles as we have full
8276 /// support for floating point shuffles but not integer shuffles. These
8277 /// instructions will incur a domain crossing penalty on some chips though so
8278 /// it is better to avoid lowering through this for integer vectors where
8279 /// possible.
8280 static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
8281                                        const X86Subtarget *Subtarget,
8282                                        SelectionDAG &DAG) {
8283   SDLoc DL(Op);
8284   assert(Op.getSimpleValueType() == MVT::v2f64 && "Bad shuffle type!");
8285   assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
8286   assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
8287   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
8288   ArrayRef<int> Mask = SVOp->getMask();
8289   assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
8290
8291   if (isSingleInputShuffleMask(Mask)) {
8292     // Use low duplicate instructions for masks that match their pattern.
8293     if (Subtarget->hasSSE3())
8294       if (isShuffleEquivalent(Mask, 0, 0))
8295         return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, V1);
8296
8297     // Straight shuffle of a single input vector. Simulate this by using the
8298     // single input as both of the "inputs" to this instruction..
8299     unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
8300
8301     if (Subtarget->hasAVX()) {
8302       // If we have AVX, we can use VPERMILPS which will allow folding a load
8303       // into the shuffle.
8304       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
8305                          DAG.getConstant(SHUFPDMask, MVT::i8));
8306     }
8307
8308     return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V1,
8309                        DAG.getConstant(SHUFPDMask, MVT::i8));
8310   }
8311   assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!");
8312   assert(Mask[1] >= 2 && "Non-canonicalized blend!");
8313
8314   // Use dedicated unpack instructions for masks that match their pattern.
8315   if (isShuffleEquivalent(Mask, 0, 2))
8316     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2f64, V1, V2);
8317   if (isShuffleEquivalent(Mask, 1, 3))
8318     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2f64, V1, V2);
8319
8320   // If we have a single input, insert that into V1 if we can do so cheaply.
8321   if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
8322     if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
8323             MVT::v2f64, DL, V1, V2, Mask, Subtarget, DAG))
8324       return Insertion;
8325     // Try inverting the insertion since for v2 masks it is easy to do and we
8326     // can't reliably sort the mask one way or the other.
8327     int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
8328                           Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
8329     if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
8330             MVT::v2f64, DL, V2, V1, InverseMask, Subtarget, DAG))
8331       return Insertion;
8332   }
8333
8334   // Try to use one of the special instruction patterns to handle two common
8335   // blend patterns if a zero-blend above didn't work.
8336   if (isShuffleEquivalent(Mask, 0, 3) || isShuffleEquivalent(Mask, 1, 3))
8337     if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
8338       // We can either use a special instruction to load over the low double or
8339       // to move just the low double.
8340       return DAG.getNode(
8341           isShuffleFoldableLoad(V1S) ? X86ISD::MOVLPD : X86ISD::MOVSD,
8342           DL, MVT::v2f64, V2,
8343           DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
8344
8345   if (Subtarget->hasSSE41())
8346     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
8347                                                   Subtarget, DAG))
8348       return Blend;
8349
8350   unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
8351   return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V2,
8352                      DAG.getConstant(SHUFPDMask, MVT::i8));
8353 }
8354
8355 /// \brief Handle lowering of 2-lane 64-bit integer shuffles.
8356 ///
8357 /// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
8358 /// the integer unit to minimize domain crossing penalties. However, for blends
8359 /// it falls back to the floating point shuffle operation with appropriate bit
8360 /// casting.
8361 static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
8362                                        const X86Subtarget *Subtarget,
8363                                        SelectionDAG &DAG) {
8364   SDLoc DL(Op);
8365   assert(Op.getSimpleValueType() == MVT::v2i64 && "Bad shuffle type!");
8366   assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
8367   assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
8368   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
8369   ArrayRef<int> Mask = SVOp->getMask();
8370   assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
8371
8372   if (isSingleInputShuffleMask(Mask)) {
8373     // Check for being able to broadcast a single element.
8374     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v2i64, DL, V1,
8375                                                           Mask, Subtarget, DAG))
8376       return Broadcast;
8377
8378     // Straight shuffle of a single input vector. For everything from SSE2
8379     // onward this has a single fast instruction with no scary immediates.
8380     // We have to map the mask as it is actually a v4i32 shuffle instruction.
8381     V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V1);
8382     int WidenedMask[4] = {
8383         std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
8384         std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
8385     return DAG.getNode(
8386         ISD::BITCAST, DL, MVT::v2i64,
8387         DAG.getNode(X86ISD::PSHUFD, SDLoc(Op), MVT::v4i32, V1,
8388                     getV4X86ShuffleImm8ForMask(WidenedMask, DAG)));
8389   }
8390
8391   // Try to use byte shift instructions.
8392   if (SDValue Shift = lowerVectorShuffleAsByteShift(
8393           DL, MVT::v2i64, V1, V2, Mask, DAG))
8394     return Shift;
8395
8396   // If we have a single input from V2 insert that into V1 if we can do so
8397   // cheaply.
8398   if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
8399     if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
8400             MVT::v2i64, DL, V1, V2, Mask, Subtarget, DAG))
8401       return Insertion;
8402     // Try inverting the insertion since for v2 masks it is easy to do and we
8403     // can't reliably sort the mask one way or the other.
8404     int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
8405                           Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
8406     if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
8407             MVT::v2i64, DL, V2, V1, InverseMask, Subtarget, DAG))
8408       return Insertion;
8409   }
8410
8411   // Use dedicated unpack instructions for masks that match their pattern.
8412   if (isShuffleEquivalent(Mask, 0, 2))
8413     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, V1, V2);
8414   if (isShuffleEquivalent(Mask, 1, 3))
8415     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2i64, V1, V2);
8416
8417   if (Subtarget->hasSSE41())
8418     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
8419                                                   Subtarget, DAG))
8420       return Blend;
8421
8422   // Try to use byte rotation instructions.
8423   // Its more profitable for pre-SSSE3 to use shuffles/unpacks.
8424   if (Subtarget->hasSSSE3())
8425     if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
8426             DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
8427       return Rotate;
8428
8429   // We implement this with SHUFPD which is pretty lame because it will likely
8430   // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
8431   // However, all the alternatives are still more cycles and newer chips don't
8432   // have this problem. It would be really nice if x86 had better shuffles here.
8433   V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, V1);
8434   V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, V2);
8435   return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64,
8436                      DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
8437 }
8438
/// \brief Lower a vector shuffle using the SHUFPS instruction.
///
/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
/// It makes no assumptions about whether this is the *best* lowering, it simply
/// uses it.
///
/// Only the first four entries of \p Mask are read; entries >= 4 are treated
/// as selecting from \p V2. Masks with exactly one or exactly two V2 entries
/// get their operands and mask rewritten first (possibly emitting an extra
/// SHUFP to pre-blend the inputs). Any other mask goes straight to the final
/// SHUFP with \p V1 and \p V2 as the low and high operands and the mask
/// unchanged.
static SDValue lowerVectorShuffleWithSHUFPS(SDLoc DL, MVT VT,
                                            ArrayRef<int> Mask, SDValue V1,
                                            SDValue V2, SelectionDAG &DAG) {
  // LowV/HighV are the operands of the final SHUFP and NewMask is its
  // (per-operand, 0-3 based) mask. The cases below adjust all three.
  SDValue LowV = V1, HighV = V2;
  int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};

  int NumV2Elements =
      std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });

  if (NumV2Elements == 1) {
    // Position of the single V2 element within the mask.
    int V2Index =
        std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) -
        Mask.begin();

    // Compute the index adjacent to V2Index and in the same half by toggling
    // the low bit.
    int V2AdjIndex = V2Index ^ 1;

    if (Mask[V2AdjIndex] == -1) {
      // Handles all the cases where we have a single V2 element and an undef.
      // This will only ever happen in the high lanes because we commute the
      // vector otherwise.
      if (V2Index < 2)
        std::swap(LowV, HighV);
      // Rebase the V2 element's index so it is relative to V2 itself.
      NewMask[V2Index] -= 4;
    } else {
      // Handle the case where the V2 element ends up adjacent to a V1 element.
      // To make this work, blend them together as the first step.
      int V1Index = V2AdjIndex;
      // Note the operand order of this SHUFP: V2 is the first operand and V1
      // the second, so the wanted V2 element lands in lane 0 and the wanted
      // V1 element in lane 2 of the blended vector.
      int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
      V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
                       getV4X86ShuffleImm8ForMask(BlendMask, DAG));

      // Now proceed to reconstruct the final blend as we have the necessary
      // high or low half formed.
      if (V2Index < 2) {
        LowV = V2;
        HighV = V1;
      } else {
        HighV = V2;
      }
      NewMask[V1Index] = 2; // We put the V1 element in V2[2].
      NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
    }
  } else if (NumV2Elements == 2) {
    if (Mask[0] < 4 && Mask[1] < 4) {
      // Handle the easy case where we have V1 in the low lanes and V2 in the
      // high lanes.
      NewMask[2] -= 4;
      NewMask[3] -= 4;
    } else if (Mask[2] < 4 && Mask[3] < 4) {
      // We also handle the reversed case because this utility may get called
      // when we detect a SHUFPS pattern but can't easily commute the shuffle to
      // arrange things in the right direction.
      NewMask[0] -= 4;
      NewMask[1] -= 4;
      HighV = V1;
      LowV = V2;
    } else {
      // We have a mixture of V1 and V2 in both low and high lanes. Rather than
      // trying to place elements directly, just blend them and set up the final
      // shuffle to place them.

      // The first two blend mask elements are for V1, the second two are for
      // V2.
      int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
                          Mask[2] < 4 ? Mask[2] : Mask[3],
                          (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
                          (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
      V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
                       getV4X86ShuffleImm8ForMask(BlendMask, DAG));

      // Now we do a normal shuffle of V1 by giving V1 as both operands to
      // a blend.
      LowV = HighV = V1;
      // After the blend, lanes 0/1 hold the V1 elements for the low/high half
      // and lanes 2/3 hold the V2 elements for the low/high half; pick them
      // according to which input each original lane came from.
      NewMask[0] = Mask[0] < 4 ? 0 : 2;
      NewMask[1] = Mask[0] < 4 ? 2 : 0;
      NewMask[2] = Mask[2] < 4 ? 1 : 3;
      NewMask[3] = Mask[2] < 4 ? 3 : 1;
    }
  }
  // Emit the final shuffle with whatever operands and mask were settled on.
  return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
                     getV4X86ShuffleImm8ForMask(NewMask, DAG));
}
8528
8529 /// \brief Lower 4-lane 32-bit floating point shuffles.
8530 ///
8531 /// Uses instructions exclusively from the floating point unit to minimize
8532 /// domain crossing penalties, as these are sufficient to implement all v4f32
8533 /// shuffles.
8534 static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
8535                                        const X86Subtarget *Subtarget,
8536                                        SelectionDAG &DAG) {
       // Op is the v4f32 shuffle node being lowered (it is cast to
       // ShuffleVectorSDNode below to read its mask); V1 and V2 are its two
       // v4f32 inputs. Subtarget is consulted for SSE3, SSE4.1 and AVX support
       // and DAG is used to build the replacement nodes. The strategies below
       // are tried in order and the first one that produces a node is returned.
8537   SDLoc DL(Op);
8538   assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!");
8539   assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
8540   assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
8541   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
8542   ArrayRef<int> Mask = SVOp->getMask();
8543   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
8544
       // Count the mask entries that are >= 4, i.e. the ones taken from V2.
8545   int NumV2Elements =
8546       std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
8547
       // Single-input case: nothing is read from V2, so every path inside this
       // block builds its result from V1 alone and returns.
8548   if (NumV2Elements == 0) {
8549     // Check for being able to broadcast a single element.
8550     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4f32, DL, V1,
8551                                                           Mask, Subtarget, DAG))
8552       return Broadcast;
8553
8554     // Use even/odd duplicate instructions for masks that match their pattern.
8555     if (Subtarget->hasSSE3()) {
8556       if (isShuffleEquivalent(Mask, 0, 0, 2, 2))
8557         return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
8558       if (isShuffleEquivalent(Mask, 1, 1, 3, 3))
8559         return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
8560     }
8561
8562     if (Subtarget->hasAVX()) {
8563       // If we have AVX, we can use VPERMILPS which will allow folding a load
8564       // into the shuffle.
8565       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
8566                          getV4X86ShuffleImm8ForMask(Mask, DAG));
8567     }
8568
8569     // Otherwise, use a straight shuffle of a single input vector. We pass the
8570     // input vector to both operands to simulate this with a SHUFPS.
8571     return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
8572                        getV4X86ShuffleImm8ForMask(Mask, DAG));
8573   }
8574
       // From here on at least one element is taken from V2.
8575   // Use dedicated unpack instructions for masks that match their pattern.
8576   if (isShuffleEquivalent(Mask, 0, 4, 1, 5))
8577     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f32, V1, V2);
8578   if (isShuffleEquivalent(Mask, 2, 6, 3, 7))
8579     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V1, V2);
8580
8581   // There are special ways we can lower some single-element blends. However, we
8582   // have custom ways we can lower more complex single-element blends below that
8583   // we defer to if both this and BLENDPS fail to match, so restrict this to
8584   // when the V2 input is targeting element 0 of the mask -- that is the fast
8585   // case here.
8586   if (NumV2Elements == 1 && Mask[0] >= 4)
8587     if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v4f32, DL, V1, V2,
8588                                                          Mask, Subtarget, DAG))
8589       return V;
8590
       // Blend and INSERTPS lowerings are only attempted when SSE4.1 is
       // available; the blend is tried before INSERTPS.
8591   if (Subtarget->hasSSE41()) {
8592     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
8593                                                   Subtarget, DAG))
8594       return Blend;
8595
8596     // Use INSERTPS if we can complete the shuffle efficiently.
8597     if (SDValue V = lowerVectorShuffleAsInsertPS(Op, V1, V2, Mask, DAG))
8598       return V;
8599   }
8600
8601   // Otherwise fall back to a SHUFPS lowering strategy.
8602   return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
8603 }
8604
8605 /// \brief Lower 4-lane i32 vector shuffles.
8606 ///
8607 /// We try to handle these with integer-domain shuffles where we can, but for
8608 /// blends we use the floating point domain blend instructions.
     ///
     /// \p Op must be a v4i32 ShuffleVectorSDNode and \p V1 / \p V2 its v4i32
     /// inputs (all three are asserted). The lowering strategies in the body are
     /// attempted in order and the first one that yields a node is returned; the
     /// last resort re-issues the shuffle as a v4f32 shuffle between bitcasts.
8609 static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
8610                                        const X86Subtarget *Subtarget,
8611                                        SelectionDAG &DAG) {
8612   SDLoc DL(Op);
8613   assert(Op.getSimpleValueType() == MVT::v4i32 && "Bad shuffle type!");
8614   assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
8615   assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
8616   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
8617   ArrayRef<int> Mask = SVOp->getMask();
8618   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
8619
8620   // Whenever we can lower this as a zext, that instruction is strictly faster
8621   // than any alternative. It also allows us to fold memory operands into the
8622   // shuffle in many cases.
8623   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2,
8624                                                          Mask, Subtarget, DAG))
8625     return ZExt;
8626
       // Count the mask entries that are >= 4, i.e. the ones taken from V2.
8627   int NumV2Elements =
8628       std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
8629
       // Single-input case: nothing is read from V2; this block always returns.
8630   if (NumV2Elements == 0) {
8631     // Check for being able to broadcast a single element.
8632     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4i32, DL, V1,
8633                                                           Mask, Subtarget, DAG))
8634       return Broadcast;
8635
8636     // Straight shuffle of a single input vector. For everything from SSE2
8637     // onward this has a single fast instruction with no scary immediates.
8638     // We coerce the shuffle pattern to be compatible with UNPCK instructions
8639     // but we aren't actually going to use the UNPCK instruction because doing
8640     // so prevents folding a load into this instruction or making a copy.
         // Mask is re-pointed at one of these local arrays when it matches; both
         // arrays stay in scope until the return at the end of this block.
8641     const int UnpackLoMask[] = {0, 0, 1, 1};
8642     const int UnpackHiMask[] = {2, 2, 3, 3};
8643     if (isShuffleEquivalent(Mask, 0, 0, 1, 1))
8644       Mask = UnpackLoMask;
8645     else if (isShuffleEquivalent(Mask, 2, 2, 3, 3))
8646       Mask = UnpackHiMask;
8647
8648     return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
8649                        getV4X86ShuffleImm8ForMask(Mask, DAG));
8650   }
8651
8652   // Try to use byte shift instructions.
8653   if (SDValue Shift = lowerVectorShuffleAsByteShift(
8654           DL, MVT::v4i32, V1, V2, Mask, DAG))
8655     return Shift;
8656
8657   // There are special ways we can lower some single-element blends.
8658   if (NumV2Elements == 1)
8659     if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v4i32, DL, V1, V2,
8660                                                          Mask, Subtarget, DAG))
8661       return V;
8662
8663   // Use dedicated unpack instructions for masks that match their pattern.
8664   if (isShuffleEquivalent(Mask, 0, 4, 1, 5))
8665     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i32, V1, V2);
8666   if (isShuffleEquivalent(Mask, 2, 6, 3, 7))
8667     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V1, V2);
8668
       // The blend lowering is only attempted when SSE4.1 is available.
8669   if (Subtarget->hasSSE41())
8670     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
8671                                                   Subtarget, DAG))
8672       return Blend;
8673
8674   // Try to use byte rotation instructions.
8675   // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
8676   if (Subtarget->hasSSSE3())
8677     if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
8678             DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
8679       return Rotate;
8680
8681   // We implement this with SHUFPS because it can blend from two vectors.
8682   // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
8683   // up the inputs, bypassing domain shift penalties that we would incur if we
8684   // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
8685   // relevant.
       // Concretely: bitcast both inputs to v4f32, emit a generic v4f32 vector
       // shuffle with the same mask, and bitcast the result back to v4i32.
8686   return DAG.getNode(ISD::BITCAST, DL, MVT::v4i32,
8687                      DAG.getVectorShuffle(
8688                          MVT::v4f32, DL,
8689                          DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, V1),
8690                          DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, V2), Mask));
8691 }
8692
8693 /// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
8694 /// shuffle lowering, and the most complex part.
8695 ///
8696 /// The lowering strategy is to try to form pairs of input lanes which are
8697 /// targeted at the same half of the final vector, and then use a dword shuffle
8698 /// to place them onto the right half, and finally unpack the paired lanes into
8699 /// their final position.
8700 ///
8701 /// The exact breakdown of how to form these dword pairs and align them on the
8702 /// correct sides is really tricky. See the comments within the function for
8703 /// more of the details.
     ///
     /// \p V is the single v8i16 input (asserted) and \p Mask its shuffle mask,
     /// handled as two halves of four entries. Negative mask entries are ignored
     /// when collecting inputs. \p Mask is rewritten in place as the lowering
     /// proceeds. Returns the node that produces the shuffled vector.
8704 static SDValue lowerV8I16SingleInputVectorShuffle(
8705     SDLoc DL, SDValue V, MutableArrayRef<int> Mask,
8706     const X86Subtarget *Subtarget, SelectionDAG &DAG) {
8707   assert(V.getSimpleValueType() == MVT::v8i16 && "Bad input type!");
       // LoMask covers Mask[0..3] and HiMask covers Mask[4..7].
8708   MutableArrayRef<int> LoMask = Mask.slice(0, 4);
8709   MutableArrayRef<int> HiMask = Mask.slice(4, 4);
8710
       // Collect the distinct, non-negative source indices referenced by each
       // half of the mask, in ascending order.
8711   SmallVector<int, 4> LoInputs;
8712   std::copy_if(LoMask.begin(), LoMask.end(), std::back_inserter(LoInputs),
8713                [](int M) { return M >= 0; });
8714   std::sort(LoInputs.begin(), LoInputs.end());
8715   LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
8716   SmallVector<int, 4> HiInputs;
8717   std::copy_if(HiMask.begin(), HiMask.end(), std::back_inserter(HiInputs),
8718                [](int M) { return M >= 0; });
8719   std::sort(HiInputs.begin(), HiInputs.end());
8720   HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
       // Because the lists are sorted, lower_bound(..., 4) splits each one into
       // the indices below 4 (low half of V) and the indices >= 4 (high half).
       // The "XToY" names read "inputs from half X that feed result half Y".
8721   int NumLToL =
8722       std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin();
8723   int NumHToL = LoInputs.size() - NumLToL;
8724   int NumLToH =
8725       std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin();
8726   int NumHToH = HiInputs.size() - NumLToH;
       // These four refs point directly into LoInputs / HiInputs storage.
8727   MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
8728   MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
8729   MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
8730   MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
8731
8732   // Check for being able to broadcast a single element.
8733   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8i16, DL, V,
8734                                                         Mask, Subtarget, DAG))
8735     return Broadcast;
8736
8737   // Try to use byte shift instructions.
8738   if (SDValue Shift = lowerVectorShuffleAsByteShift(
8739           DL, MVT::v8i16, V, V, Mask, DAG))
8740     return Shift;
8741
8742   // Use dedicated unpack instructions for masks that match their pattern.
8743   if (isShuffleEquivalent(Mask, 0, 0, 1, 1, 2, 2, 3, 3))
8744     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V, V);
8745   if (isShuffleEquivalent(Mask, 4, 4, 5, 5, 6, 6, 7, 7))
8746     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V, V);
8747
8748   // Try to use byte rotation instructions.
8749   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
8750           DL, MVT::v8i16, V, V, Mask, Subtarget, DAG))
8751     return Rotate;
8752
8753   // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
8754   // such inputs we can swap two of the dwords across the half mark and end up
8755   // with <=2 inputs to each half in each half. Once there, we can fall through
8756   // to the generic code below. For example:
8757   //
8758   // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
8759   // Mask:  [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
8760   //
8761   // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
8762   // and an existing 2-into-2 on the other half. In this case we may have to
8763   // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
8764   // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
8765   // Fortunately, we don't have to handle anything but a 2-into-2 pattern
8766   // because any other situation (including a 3-into-1 or 1-into-3 in the other
8767   // half than the one we target for fixing) will be fixed when we re-enter this
8768   // path. We will also combine away any sequence of PSHUFD instructions that
8769   // result into a single instruction. Here is an example of the tricky case:
8770   //
8771   // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
8772   // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
8773   //
8774   // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
8775   //
8776   // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
8777   // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
8778   //
8779   // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
8780   // Mask:  [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
8781   //
8782   // The result is fine to be handled by the generic logic.
       //
       // In balanceSides, "A" is the result half being fixed and "B" the other
       // one. AToAInputs / BToAInputs are the inputs feeding half A from each
       // source half, BToBInputs / AToBInputs those feeding half B. AOffset and
       // BOffset are the word index at which each half starts (0 and 4 at the
       // call sites below). The lambda updates V and Mask through its by-reference
       // capture and returns a new v8i16 shuffle of the updated V.
8783   auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
8784                           ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
8785                           int AOffset, int BOffset) {
8786     assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
8787            "Must call this with A having 3 or 1 inputs from the A half.");
8788     assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
8789            "Must call this with B having 1 or 3 inputs from the B half.");
8790     assert(AToAInputs.size() + BToAInputs.size() == 4 &&
8791            "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
8792
8793     // Compute the index of dword with only one word among the three inputs in
8794     // a half by taking the sum of the half with three inputs and subtracting
8795     // the sum of the actual three inputs. The difference is the remaining
8796     // slot.
         // TripleDWord and OneInputDWord are references that alias ADWord /
         // BDWord, so assigning through them below initializes both of those.
8797     int ADWord, BDWord;
8798     int &TripleDWord = AToAInputs.size() == 3 ? ADWord : BDWord;
8799     int &OneInputDWord = AToAInputs.size() == 3 ? BDWord : ADWord;
8800     int TripleInputOffset = AToAInputs.size() == 3 ? AOffset : BOffset;
8801     ArrayRef<int> TripleInputs = AToAInputs.size() == 3 ? AToAInputs : BToAInputs;
8802     int OneInput = AToAInputs.size() == 3 ? BToAInputs[0] : AToAInputs[0];
8803     int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
8804     int TripleNonInputIdx =
8805         TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
8806     TripleDWord = TripleNonInputIdx / 2;
8807
8808     // We use xor with one to compute the adjacent DWord to whichever one the
8809     // OneInput is in.
8810     OneInputDWord = (OneInput / 2) ^ 1;
8811
8812     // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
8813     // and BToA inputs. If there is also such a problem with the BToB and AToB
8814     // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
8815     // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
8816     // is essential that we don't *create* a 3<-1 as then we might oscillate.
8817     if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
8818       // Compute how many inputs will be flipped by swapping these DWords. We
8819       // need to balance this to ensure we don't form a 3-1 shuffle in the
8820       // other half: an input counts as flipped when it is one of the two
8821       // words of the dword being swapped (2 * DWord and 2 * DWord + 1).
8822       int NumFlippedAToBInputs =
8823           std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
8824           std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
8825       int NumFlippedBToBInputs =
8826           std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
8827           std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
8828       if ((NumFlippedAToBInputs == 1 &&
8829            (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
8830           (NumFlippedBToBInputs == 1 &&
8831            (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
8832         // We choose whether to fix the A half or B half based on whether that
8833         // half has zero flipped inputs. At zero, we may not be able to fix it
8834         // with that half. We also bias towards fixing the B half because that
8835         // will more commonly be the high half, and we have to bias one way.
             //
             // FixFlippedInputs swaps the word next to PinnedIdx (FixIdx) with a
             // word in the other dword of the same half (FixFreeIdx) using one
             // PSHUFLW or PSHUFHW, then swaps those two indices in Mask.
8836         auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
8837                                                        ArrayRef<int> Inputs) {
8838           int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
8839           bool IsFixIdxInput = std::find(Inputs.begin(), Inputs.end(),
8840                                          PinnedIdx ^ 1) != Inputs.end();
8841           // Determine whether the free index is in the flipped dword or the
8842           // unflipped dword based on where the pinned index is. We use this bit
8843           // in an xor to conditionally select the adjacent dword.
8844           int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
8845           bool IsFixFreeIdxInput = std::find(Inputs.begin(), Inputs.end(),
8846                                              FixFreeIdx) != Inputs.end();
               // If the even word of that dword has the same input-ness as
               // FixIdx, use the odd word instead and re-evaluate.
8847           if (IsFixIdxInput == IsFixFreeIdxInput)
8848             FixFreeIdx += 1;
8849           IsFixFreeIdxInput = std::find(Inputs.begin(), Inputs.end(),
8850                                         FixFreeIdx) != Inputs.end();
8851           assert(IsFixIdxInput != IsFixFreeIdxInput &&
8852                  "We need to be changing the number of flipped inputs!");
8853           int PSHUFHalfMask[] = {0, 1, 2, 3};
8854           std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
8855           V = DAG.getNode(FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
8856                           MVT::v8i16, V,
8857                           getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DAG));
8858
               // Mirror the word swap in the shuffle mask.
8859           for (int &M : Mask)
8860             if (M != -1 && M == FixIdx)
8861               M = FixFreeIdx;
8862             else if (M != -1 && M == FixFreeIdx)
8863               M = FixIdx;
8864         };
8865         if (NumFlippedBToBInputs != 0) {
8866           int BPinnedIdx =
8867               BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
8868           FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
8869         } else {
8870           assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
8871           int APinnedIdx =
8872               AToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
8873           FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
8874         }
8875       }
8876     }
8877
         // Swap dwords ADWord and BDWord of V with a PSHUFD (via v4i32 bitcasts).
8878     int PSHUFDMask[] = {0, 1, 2, 3};
8879     PSHUFDMask[ADWord] = BDWord;
8880     PSHUFDMask[BDWord] = ADWord;
8881     V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
8882                     DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
8883                                 DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V),
8884                                 getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
8885
8886     // Adjust the mask to match the new locations of A and B.
8887     for (int &M : Mask)
8888       if (M != -1 && M/2 == ADWord)
8889         M = 2 * BDWord + M % 2;
8890       else if (M != -1 && M/2 == BDWord)
8891         M = 2 * ADWord + M % 2;
8892
8893     // Recurse back into this routine to re-compute state now that this isn't
8894     // a 3 and 1 problem.
         // NOTE(review): this is not a direct call; it emits a new v8i16 shuffle
         // of V with an undef second operand, which is presumably lowered
         // through this routine again -- confirm against the caller.
8895     return DAG.getVectorShuffle(MVT::v8i16, DL, V, DAG.getUNDEF(MVT::v8i16),
8896                                 Mask);
8897   };
8898   if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
8899     return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
8900   else if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
8901     return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
8902
8903   // At this point there are at most two inputs to the low and high halves from
8904   // each half. That means the inputs can always be grouped into dwords and
8905   // those dwords can then be moved to the correct half with a dword shuffle.
8906   // We use at most one low and one high word shuffle to collect these paired
8907   // inputs into dwords, and finally a dword shuffle to place them.
       // In all three masks -1 marks a slot that has not been assigned yet.
8908   int PSHUFLMask[4] = {-1, -1, -1, -1};
8909   int PSHUFHMask[4] = {-1, -1, -1, -1};
8910   int PSHUFDMask[4] = {-1, -1, -1, -1};
8911
8912   // First fix the masks for all the inputs that are staying in their
8913   // original halves. This will then dictate the targets of the cross-half
8914   // shuffles.
       //
       // InPlaceInputs are the inputs that already live in the half they feed and
       // IncomingInputs those arriving from the other half. SourceHalfMask is the
       // word-shuffle mask being built for this half (indices relative to the
       // half), HalfMask is this half's slice of the shuffle mask and HalfOffset
       // is the word index at which the half starts (0 or 4 at the calls below).
8915   auto fixInPlaceInputs =
8916       [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
8917                     MutableArrayRef<int> SourceHalfMask,
8918                     MutableArrayRef<int> HalfMask, int HalfOffset) {
8919     if (InPlaceInputs.empty())
8920       return;
         // A single in-place input is pinned where it is, and so is its dword.
8921     if (InPlaceInputs.size() == 1) {
8922       SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
8923           InPlaceInputs[0] - HalfOffset;
8924       PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
8925       return;
8926     }
8927     if (IncomingInputs.empty()) {
8928       // Just fix all of the in place inputs.
8929       for (int Input : InPlaceInputs) {
8930         SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
8931         PSHUFDMask[Input / 2] = Input / 2;
8932       }
8933       return;
8934     }
8935
8936     assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
8937     SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
8938         InPlaceInputs[0] - HalfOffset;
8939     // Put the second input next to the first so that they are packed into
8940     // a dword. We find the adjacent index by toggling the low bit.
8941     int AdjIndex = InPlaceInputs[0] ^ 1;
8942     SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
8943     std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
8944     PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
8945   };
8946   fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
8947   fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
8948
8949   // Now gather the cross-half inputs and place them into a free dword of
8950   // their target half.
8951   // FIXME: This operation could almost certainly be simplified dramatically to
8952   // look more like the 3-1 fixing operation.
       //
       // IncomingInputs are the inputs that must cross into the destination half
       // and ExistingInputs the ones already there. SourceHalfMask is the
       // word-shuffle mask of the half the inputs come from, HalfMask is the
       // destination half's slice of the shuffle mask and FinalSourceHalfMask the
       // source half's own slice. SourceOffset / DestOffset are the starting word
       // indices of the two halves. IncomingInputs may be rewritten in place.
8953   auto moveInputsToRightHalf = [&PSHUFDMask](
8954       MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
8955       MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
8956       MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
8957       int DestOffset) {
         // A word is "clobbered" when the source half mask already puts a
         // different word into that slot.
8958     auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
8959       return SourceHalfMask[Word] != -1 && SourceHalfMask[Word] != Word;
8960     };
         // A dword is clobbered when either of its two words is.
8961     auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
8962                                                int Word) {
8963       int LowWord = Word & ~1;
8964       int HighWord = Word | 1;
8965       return isWordClobbered(SourceHalfMask, LowWord) ||
8966              isWordClobbered(SourceHalfMask, HighWord);
8967     };
8968
8969     if (IncomingInputs.empty())
8970       return;
8971
8972     if (ExistingInputs.empty()) {
8973       // Map any dwords with inputs from them into the right half.
8974       for (int Input : IncomingInputs) {
8975         // If the source half mask maps over the inputs, turn those into
8976         // swaps and use the swapped lane.
8977         if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
8978           if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] == -1) {
8979             SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
8980                 Input - SourceOffset;
8981             // We have to swap the uses in our half mask in one sweep.
8982             for (int &M : HalfMask)
8983               if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
8984                 M = Input;
8985               else if (M == Input)
8986                 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
8987           } else {
8988             assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
8989                        Input - SourceOffset &&
8990                    "Previous placement doesn't match!");
8991           }
8992           // Note that this correctly re-maps both when we do a swap and when
8993           // we observe the other side of the swap above. We rely on that to
8994           // avoid swapping the members of the input list directly.
8995           Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
8996         }
8997
8998         // Map the input's dword into the correct half.
8999         if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] == -1)
9000           PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
9001         else
9002           assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
9003                      Input / 2 &&
9004                  "Previous placement doesn't match!");
9005       }
9006
9007       // And just directly shift any other-half mask elements to be same-half
9008       // as we will have mirrored the dword containing the element into the
9009       // same position within that half.
9010       for (int &M : HalfMask)
9011         if (M >= SourceOffset && M < SourceOffset + 4) {
9012           M = M - SourceOffset + DestOffset;
9013           assert(M >= 0 && "This should never wrap below zero!");
9014         }
9015       return;
9016     }
9017
9018     // Ensure we have the input in a viable dword of its current half. This
9019     // is particularly tricky because the original position may be clobbered
9020     // by inputs being moved and *staying* in that half.
9021     if (IncomingInputs.size() == 1) {
9022       if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
             // Relocate the input to the first unassigned (-1) slot of the source
             // half mask.
             // NOTE(review): assumes such a slot exists; if std::find returned
             // the end iterator the index below would be one past the mask --
             // confirm this cannot happen.
9023         int InputFixed = std::find(std::begin(SourceHalfMask),
9024                                    std::end(SourceHalfMask), -1) -
9025                          std::begin(SourceHalfMask) + SourceOffset;
9026         SourceHalfMask[InputFixed - SourceOffset] =
9027             IncomingInputs[0] - SourceOffset;
9028         std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
9029                      InputFixed);
9030         IncomingInputs[0] = InputFixed;
9031       }
9032     } else if (IncomingInputs.size() == 2) {
9033       if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
9034           isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
9035         // We have two non-adjacent or clobbered inputs we need to extract from
9036         // the source half. To do this, we need to map them into some adjacent
9037         // dword slot in the source mask.
             // InputsFixed holds the half-relative positions of the two inputs.
9038         int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
9039                               IncomingInputs[1] - SourceOffset};
9040
9041         // If there is a free slot in the source half mask adjacent to one of
9042         // the inputs, place the other input in it. We use (Index XOR 1) to
9043         // compute an adjacent index.
9044         if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
9045             SourceHalfMask[InputsFixed[0] ^ 1] == -1) {
9046           SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
9047           SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
9048           InputsFixed[1] = InputsFixed[0] ^ 1;
9049         } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
9050                    SourceHalfMask[InputsFixed[1] ^ 1] == -1) {
9051           SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
9052           SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
9053           InputsFixed[0] = InputsFixed[1] ^ 1;
9054         } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] == -1 &&
9055                    SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] == -1) {
9056           // The two inputs are in the same DWord but it is clobbered and the
9057           // adjacent DWord isn't used at all. Move both inputs to the free
9058           // slot.
9059           SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
9060           SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
9061           InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
               // NOTE(review): InputsFixed[0] was overwritten on the previous
               // line, so the expression below evaluates to the odd word of the
               // *original* dword, not of the free dword written above. The code
               // after this only uses InputsFixed[1] as a tag and for its parity
               // (the dword comes from IncomingInputs[0] / 2), so this looks
               // benign -- confirm before relying on the value.
9062           InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
9063         } else {
9064           // The only way we hit this point is if there is no clobbering
9065           // (because there are no off-half inputs to this half) and there is no
9066           // free slot adjacent to one of the inputs. In this case, we have to
9067           // swap an input with a non-input.
9068           for (int i = 0; i < 4; ++i)
9069             assert((SourceHalfMask[i] == -1 || SourceHalfMask[i] == i) &&
9070                    "We can't handle any clobbers here!");
9071           assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
9072                  "Cannot have adjacent inputs here!");
9073
9074           SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
9075           SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
9076
9077           // We also have to update the final source mask in this case because
9078           // it may need to undo the above swap.
9079           for (int &M : FinalSourceHalfMask)
9080             if (M == (InputsFixed[0] ^ 1) + SourceOffset)
9081               M = InputsFixed[1] + SourceOffset;
9082             else if (M == InputsFixed[1] + SourceOffset)
9083               M = (InputsFixed[0] ^ 1) + SourceOffset;
9084
9085           InputsFixed[1] = InputsFixed[0] ^ 1;
9086         }
9087
9088         // Point everything at the fixed inputs.
9089         for (int &M : HalfMask)
9090           if (M == IncomingInputs[0])
9091             M = InputsFixed[0] + SourceOffset;
9092           else if (M == IncomingInputs[1])
9093             M = InputsFixed[1] + SourceOffset;
9094
9095         IncomingInputs[0] = InputsFixed[0] + SourceOffset;
9096         IncomingInputs[1] = InputsFixed[1] + SourceOffset;
9097       }
9098     } else {
9099       llvm_unreachable("Unhandled input size!");
9100     }
9101
9102     // Now hoist the DWord down to the right half.
         // Use the first dword of the destination half if it is still unassigned,
         // otherwise the second, and retarget the mask entries at it while
         // keeping each input's position (even/odd) within the dword.
9103     int FreeDWord = (PSHUFDMask[DestOffset / 2] == -1 ? 0 : 1) + DestOffset / 2;
9104     assert(PSHUFDMask[FreeDWord] == -1 && "DWord not free");
9105     PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
9106     for (int &M : HalfMask)
9107       for (int Input : IncomingInputs)
9108         if (M == Input)
9109           M = FreeDWord * 2 + Input % 2;
9110   };
9111   moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
9112                         /*SourceOffset*/ 4, /*DestOffset*/ 0);
9113   moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
9114                         /*SourceOffset*/ 0, /*DestOffset*/ 4);
9115
9116   // Now enact all the shuffles we've computed to move the inputs into their
9117   // target half.
       // Each step is skipped when its mask is a no-op. The order is low-word
       // shuffle, high-word shuffle, then the dword shuffle.
9118   if (!isNoopShuffleMask(PSHUFLMask))
9119     V = DAG.getNode(X86ISD::PSHUFLW, DL, MVT::v8i16, V,
9120                     getV4X86ShuffleImm8ForMask(PSHUFLMask, DAG));
9121   if (!isNoopShuffleMask(PSHUFHMask))
9122     V = DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16, V,
9123                     getV4X86ShuffleImm8ForMask(PSHUFHMask, DAG));
9124   if (!isNoopShuffleMask(PSHUFDMask))
9125     V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
9126                     DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
9127                                 DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V),
9128                                 getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
9129
9130   // At this point, each half should contain all its inputs, and we can then
9131   // just shuffle them into their final position.
9132   assert(std::count_if(LoMask.begin(), LoMask.end(),
9133                        [](int M) { return M >= 4; }) == 0 &&
9134          "Failed to lift all the high half inputs to the low mask!");
9135   assert(std::count_if(HiMask.begin(), HiMask.end(),
9136                        [](int M) { return M >= 0 && M < 4; }) == 0 &&
9137          "Failed to lift all the low half inputs to the high mask!");
9138
9139   // Do a half shuffle for the low mask.
9140   if (!isNoopShuffleMask(LoMask))
9141     V = DAG.getNode(X86ISD::PSHUFLW, DL, MVT::v8i16, V,
9142                     getV4X86ShuffleImm8ForMask(LoMask, DAG));
9143
9144   // Do a half shuffle with the high mask after shifting its values down.
       // Negative entries are left untouched by the rebasing loop.
9145   for (int &M : HiMask)
9146     if (M >= 0)
9147       M -= 4;
9148   if (!isNoopShuffleMask(HiMask))
9149     V = DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16, V,
9150                     getV4X86ShuffleImm8ForMask(HiMask, DAG));
9151
9152   return V;
9153 }
9154
9155 /// \brief Detect whether the mask pattern should be lowered through
9156 /// interleaving.
9157 ///
9158 /// This essentially tests whether viewing the mask as an interleaving of two
9159 /// sub-sequences reduces the cross-input traffic of a blend operation. If so,
9160 /// lowering it through interleaving is a significantly better strategy.
     ///
     /// Mask entries below zero are skipped. An entry >= Mask.size() is counted
     /// as coming from the second input, any other entry as coming from the
     /// first. Returns true only when the interleaved view needs strictly fewer
     /// cross-input elements than the low/high split view.
9161 static bool shouldLowerAsInterleaving(ArrayRef<int> Mask) {
       // Each counter pair is indexed by input: [0] = first input, [1] = second.
9162   int NumEvenInputs[2] = {0, 0};
9163   int NumOddInputs[2] = {0, 0};
9164   int NumLoInputs[2] = {0, 0};
9165   int NumHiInputs[2] = {0, 0};
9166   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
9167     if (Mask[i] < 0)
9168       continue;
9169
         // 0 when this element reads the first input, 1 when it reads the second.
9170     int InputIdx = Mask[i] >= Size;
9171
         // Classify by result position: low vs. high half ...
9172     if (i < Size / 2)
9173       ++NumLoInputs[InputIdx];
9174     else
9175       ++NumHiInputs[InputIdx];
9176
         // ... and even vs. odd lane.
9177     if ((i % 2) == 0)
9178       ++NumEvenInputs[InputIdx];
9179     else
9180       ++NumOddInputs[InputIdx];
9181   }
9182
9183   // The minimum number of cross-input results for both the interleaved and
9184   // split cases. If interleaving results in fewer cross-input results, return
9185   // true.
       // For each view both assignments of the two inputs to the two sides are
       // considered and the cheaper one is kept.
9186   int InterleavedCrosses = std::min(NumEvenInputs[1] + NumOddInputs[0],
9187                                     NumEvenInputs[0] + NumOddInputs[1]);
9188   int SplitCrosses = std::min(NumLoInputs[1] + NumHiInputs[0],
9189                               NumLoInputs[0] + NumHiInputs[1]);
9190   return InterleavedCrosses < SplitCrosses;
9191 }
9192
9193 /// \brief Blend two v8i16 vectors using a naive unpack strategy.
9194 ///
9195 /// This strategy only works when the inputs from each vector fit into a single
9196 /// half of that vector, and generally there are not so many inputs as to leave
9197 /// the in-place shuffles required highly constrained (and thus expensive). It
9198 /// shifts all the inputs into a single side of both input vectors and then
9199 /// uses an unpack to interleave these inputs in a single vector. At that
9200 /// point, we will fall back on the generic single input shuffle lowering.
9201 static SDValue lowerV8I16BasicBlendVectorShuffle(SDLoc DL, SDValue V1,
9202                                                  SDValue V2,
9203                                                  MutableArrayRef<int> Mask,
9204                                                  const X86Subtarget *Subtarget,
9205                                                  SelectionDAG &DAG) {
9206   assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad input type!");
9207   assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad input type!");
9208   SmallVector<int, 3> LoV1Inputs, HiV1Inputs, LoV2Inputs, HiV2Inputs;
9209   for (int i = 0; i < 8; ++i)
9210     if (Mask[i] >= 0 && Mask[i] < 4)
9211       LoV1Inputs.push_back(i);
9212     else if (Mask[i] >= 4 && Mask[i] < 8)
9213       HiV1Inputs.push_back(i);
9214     else if (Mask[i] >= 8 && Mask[i] < 12)
9215       LoV2Inputs.push_back(i);
9216     else if (Mask[i] >= 12)
9217       HiV2Inputs.push_back(i);
9218
9219   int NumV1Inputs = LoV1Inputs.size() + HiV1Inputs.size();
9220   int NumV2Inputs = LoV2Inputs.size() + HiV2Inputs.size();
9221   (void)NumV1Inputs;
9222   (void)NumV2Inputs;
9223   assert(NumV1Inputs > 0 && NumV1Inputs <= 3 && "At most 3 inputs supported");
9224   assert(NumV2Inputs > 0 && NumV2Inputs <= 3 && "At most 3 inputs supported");
9225   assert(NumV1Inputs + NumV2Inputs <= 4 && "At most 4 combined inputs");
9226
9227   bool MergeFromLo = LoV1Inputs.size() + LoV2Inputs.size() >=
9228                      HiV1Inputs.size() + HiV2Inputs.size();
9229
9230   auto moveInputsToHalf = [&](SDValue V, ArrayRef<int> LoInputs,
9231                               ArrayRef<int> HiInputs, bool MoveToLo,
9232                               int MaskOffset) {
9233     ArrayRef<int> GoodInputs = MoveToLo ? LoInputs : HiInputs;
9234     ArrayRef<int> BadInputs = MoveToLo ? HiInputs : LoInputs;
9235     if (BadInputs.empty())
9236       return V;
9237
9238     int MoveMask[] = {-1, -1, -1, -1, -1, -1, -1, -1};
9239     int MoveOffset = MoveToLo ? 0 : 4;
9240
9241     if (GoodInputs.empty()) {
9242       for (int BadInput : BadInputs) {
9243         MoveMask[Mask[BadInput] % 4 + MoveOffset] = Mask[BadInput] - MaskOffset;
9244         Mask[BadInput] = Mask[BadInput] % 4 + MoveOffset + MaskOffset;
9245       }
9246     } else {
9247       if (GoodInputs.size() == 2) {
9248         // If the low inputs are spread across two dwords, pack them into
9249         // a single dword.
9250         MoveMask[MoveOffset] = Mask[GoodInputs[0]] - MaskOffset;
9251         MoveMask[MoveOffset + 1] = Mask[GoodInputs[1]] - MaskOffset;
9252         Mask[GoodInputs[0]] = MoveOffset + MaskOffset;
9253         Mask[GoodInputs[1]] = MoveOffset + 1 + MaskOffset;
9254       } else {
9255         // Otherwise pin the good inputs.
9256         for (int GoodInput : GoodInputs)
9257           MoveMask[Mask[GoodInput] - MaskOffset] = Mask[GoodInput] - MaskOffset;
9258       }
9259
9260       if (BadInputs.size() == 2) {
9261         // If we have two bad inputs then there may be either one or two good
9262         // inputs fixed in place. Find a fixed input, and then find the *other*
9263         // two adjacent indices by using modular arithmetic.
9264         int GoodMaskIdx =
9265             std::find_if(std::begin(MoveMask) + MoveOffset, std::end(MoveMask),
9266                          [](int M) { return M >= 0; }) -
9267             std::begin(MoveMask);
9268         int MoveMaskIdx =
9269             ((((GoodMaskIdx - MoveOffset) & ~1) + 2) % 4) + MoveOffset;
9270         assert(MoveMask[MoveMaskIdx] == -1 && "Expected empty slot");
9271         assert(MoveMask[MoveMaskIdx + 1] == -1 && "Expected empty slot");
9272         MoveMask[MoveMaskIdx] = Mask[BadInputs[0]] - MaskOffset;
9273         MoveMask[MoveMaskIdx + 1] = Mask[BadInputs[1]] - MaskOffset;
9274         Mask[BadInputs[0]] = MoveMaskIdx + MaskOffset;
9275         Mask[BadInputs[1]] = MoveMaskIdx + 1 + MaskOffset;
9276       } else {
9277         assert(BadInputs.size() == 1 && "All sizes handled");
9278         int MoveMaskIdx = std::find(std::begin(MoveMask) + MoveOffset,
9279                                     std::end(MoveMask), -1) -
9280                           std::begin(MoveMask);
9281         MoveMask[MoveMaskIdx] = Mask[BadInputs[0]] - MaskOffset;
9282         Mask[BadInputs[0]] = MoveMaskIdx + MaskOffset;
9283       }
9284     }
9285
9286     return DAG.getVectorShuffle(MVT::v8i16, DL, V, DAG.getUNDEF(MVT::v8i16),
9287                                 MoveMask);
9288   };
9289   V1 = moveInputsToHalf(V1, LoV1Inputs, HiV1Inputs, MergeFromLo,
9290                         /*MaskOffset*/ 0);
9291   V2 = moveInputsToHalf(V2, LoV2Inputs, HiV2Inputs, MergeFromLo,
9292                         /*MaskOffset*/ 8);
9293
9294   // FIXME: Select an interleaving of the merge of V1 and V2 that minimizes
9295   // cross-half traffic in the final shuffle.
9296
9297   // Munge the mask to be a single-input mask after the unpack merges the
9298   // results.
9299   for (int &M : Mask)
9300     if (M != -1)
9301       M = 2 * (M % 4) + (M / 8);
9302
9303   return DAG.getVectorShuffle(
9304       MVT::v8i16, DL, DAG.getNode(MergeFromLo ? X86ISD::UNPCKL : X86ISD::UNPCKH,
9305                                   DL, MVT::v8i16, V1, V2),
9306       DAG.getUNDEF(MVT::v8i16), Mask);
9307 }
9308
9309 /// \brief Generic lowering of 8-lane i16 shuffles.
9310 ///
9311 /// This handles both single-input shuffles and combined shuffle/blends with
9312 /// two inputs. The single input shuffles are immediately delegated to
9313 /// a dedicated lowering routine.
9314 ///
9315 /// The blends are lowered in one of three fundamental ways. If there are few
9316 /// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
9317 /// of the input is significantly cheaper when lowered as an interleaving of
9318 /// the two inputs, try to interleave them. Otherwise, blend the low and high
9319 /// halves of the inputs separately (making them have relatively few inputs)
9320 /// and then concatenate them.
9321 static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
9322                                        const X86Subtarget *Subtarget,
9323                                        SelectionDAG &DAG) {
9324   SDLoc DL(Op);
9325   assert(Op.getSimpleValueType() == MVT::v8i16 && "Bad shuffle type!");
9326   assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
9327   assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
9328   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
9329   ArrayRef<int> OrigMask = SVOp->getMask();
9330   int MaskStorage[8] = {OrigMask[0], OrigMask[1], OrigMask[2], OrigMask[3],
9331                         OrigMask[4], OrigMask[5], OrigMask[6], OrigMask[7]};
9332   MutableArrayRef<int> Mask(MaskStorage);
9333
9334   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
9335
9336   // Whenever we can lower this as a zext, that instruction is strictly faster
9337   // than any alternative.
9338   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
9339           DL, MVT::v8i16, V1, V2, OrigMask, Subtarget, DAG))
9340     return ZExt;
9341
9342   auto isV1 = [](int M) { return M >= 0 && M < 8; };
9343   auto isV2 = [](int M) { return M >= 8; };
9344
9345   int NumV1Inputs = std::count_if(Mask.begin(), Mask.end(), isV1);
9346   int NumV2Inputs = std::count_if(Mask.begin(), Mask.end(), isV2);
9347
9348   if (NumV2Inputs == 0)
9349     return lowerV8I16SingleInputVectorShuffle(DL, V1, Mask, Subtarget, DAG);
9350
9351   assert(NumV1Inputs > 0 && "All single-input shuffles should be canonicalized "
9352                             "to be V1-input shuffles.");
9353
9354   // Try to use byte shift instructions.
9355   if (SDValue Shift = lowerVectorShuffleAsByteShift(
9356           DL, MVT::v8i16, V1, V2, Mask, DAG))
9357     return Shift;
9358
9359   // There are special ways we can lower some single-element blends.
9360   if (NumV2Inputs == 1)
9361     if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v8i16, DL, V1, V2,
9362                                                          Mask, Subtarget, DAG))
9363       return V;
9364
9365   // Use dedicated unpack instructions for masks that match their pattern.
9366   if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 2, 10, 3, 11))
9367     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V1, V2);
9368   if (isShuffleEquivalent(Mask, 4, 12, 5, 13, 6, 14, 7, 15))
9369     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V1, V2);
9370
9371   if (Subtarget->hasSSE41())
9372     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
9373                                                   Subtarget, DAG))
9374       return Blend;
9375
9376   // Try to use byte rotation instructions.
9377   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
9378           DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
9379     return Rotate;
9380
9381   if (NumV1Inputs + NumV2Inputs <= 4)
9382     return lowerV8I16BasicBlendVectorShuffle(DL, V1, V2, Mask, Subtarget, DAG);
9383
9384   // Check whether an interleaving lowering is likely to be more efficient.
9385   // This isn't perfect but it is a strong heuristic that tends to work well on
9386   // the kinds of shuffles that show up in practice.
9387   //
9388   // FIXME: Handle 1x, 2x, and 4x interleaving.
9389   if (shouldLowerAsInterleaving(Mask)) {
9390     // FIXME: Figure out whether we should pack these into the low or high
9391     // halves.
9392
9393     int EMask[8], OMask[8];
9394     for (int i = 0; i < 4; ++i) {
9395       EMask[i] = Mask[2*i];
9396       OMask[i] = Mask[2*i + 1];
9397       EMask[i + 4] = -1;
9398       OMask[i + 4] = -1;
9399     }
9400
9401     SDValue Evens = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, EMask);
9402     SDValue Odds = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, OMask);
9403
9404     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, Evens, Odds);
9405   }
9406
9407   int LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9408   int HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9409
9410   for (int i = 0; i < 4; ++i) {
9411     LoBlendMask[i] = Mask[i];
9412     HiBlendMask[i] = Mask[i + 4];
9413   }
9414
9415   SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, LoBlendMask);
9416   SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, HiBlendMask);
9417   LoV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, LoV);
9418   HiV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, HiV);
9419
9420   return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
9421                      DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, LoV, HiV));
9422 }
9423
9424 /// \brief Check whether a compaction lowering can be done by dropping even
9425 /// elements and compute how many times even elements must be dropped.
9426 ///
9427 /// This handles shuffles which take every Nth element where N is a power of
9428 /// two. Example shuffle masks:
9429 ///
9430 ///  N = 1:  0,  2,  4,  6,  8, 10, 12, 14,  0,  2,  4,  6,  8, 10, 12, 14
9431 ///  N = 1:  0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
9432 ///  N = 2:  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12
9433 ///  N = 2:  0,  4,  8, 12, 16, 20, 24, 28,  0,  4,  8, 12, 16, 20, 24, 28
9434 ///  N = 3:  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8
9435 ///  N = 3:  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24
9436 ///
9437 /// Any of these lanes can of course be undef.
9438 ///
9439 /// This routine only supports N <= 3.
9440 /// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
9441 /// for larger N.
9442 ///
9443 /// \returns N above, or the number of times even elements must be dropped if
9444 /// there is such a number. Otherwise returns zero.
9445 static int canLowerByDroppingEvenElements(ArrayRef<int> Mask) {
9446   // Figure out whether we're looping over two inputs or just one.
9447   bool IsSingleInput = isSingleInputShuffleMask(Mask);
9448
9449   // The modulus for the shuffle vector entries is based on whether this is
9450   // a single input or not.
9451   int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
9452   assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
9453          "We should only be called with masks with a power-of-2 size!");
9454
9455   uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
9456
9457   // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
9458   // and 2^3 simultaneously. This is because we may have ambiguity with
9459   // partially undef inputs.
9460   bool ViableForN[3] = {true, true, true};
9461
9462   for (int i = 0, e = Mask.size(); i < e; ++i) {
9463     // Ignore undef lanes, we'll optimistically collapse them to the pattern we
9464     // want.
9465     if (Mask[i] == -1)
9466       continue;
9467
9468     bool IsAnyViable = false;
9469     for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
9470       if (ViableForN[j]) {
9471         uint64_t N = j + 1;
9472
9473         // The shuffle mask must be equal to (i * 2^N) % M.
9474         if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
9475           IsAnyViable = true;
9476         else
9477           ViableForN[j] = false;
9478       }
9479     // Early exit if we exhaust the possible powers of two.
9480     if (!IsAnyViable)
9481       break;
9482   }
9483
9484   for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
9485     if (ViableForN[j])
9486       return j + 1;
9487
9488   // Return 0 as there is no viable power of two.
9489   return 0;
9490 }
9491
9492 /// \brief Generic lowering of v16i8 shuffles.
9493 ///
9494 /// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
9495 /// detect any complexity reducing interleaving. If that doesn't help, it uses
9496 /// UNPCK to spread the i8 elements across two i16-element vectors, and uses
9497 /// the existing lowering for v8i16 blends on each half, finally PACK-ing them
9498 /// back together.
9499 static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
9500                                        const X86Subtarget *Subtarget,
9501                                        SelectionDAG &DAG) {
9502   SDLoc DL(Op);
9503   assert(Op.getSimpleValueType() == MVT::v16i8 && "Bad shuffle type!");
9504   assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
9505   assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
9506   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
9507   ArrayRef<int> OrigMask = SVOp->getMask();
9508   assert(OrigMask.size() == 16 && "Unexpected mask size for v16 shuffle!");
9509
9510   // Try to use byte shift instructions.
9511   if (SDValue Shift = lowerVectorShuffleAsByteShift(
9512           DL, MVT::v16i8, V1, V2, OrigMask, DAG))
9513     return Shift;
9514
9515   // Try to use byte rotation instructions.
9516   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
9517           DL, MVT::v16i8, V1, V2, OrigMask, Subtarget, DAG))
9518     return Rotate;
9519
9520   // Try to use a zext lowering.
9521   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
9522           DL, MVT::v16i8, V1, V2, OrigMask, Subtarget, DAG))
9523     return ZExt;
9524
9525   int MaskStorage[16] = {
9526       OrigMask[0],  OrigMask[1],  OrigMask[2],  OrigMask[3],
9527       OrigMask[4],  OrigMask[5],  OrigMask[6],  OrigMask[7],
9528       OrigMask[8],  OrigMask[9],  OrigMask[10], OrigMask[11],
9529       OrigMask[12], OrigMask[13], OrigMask[14], OrigMask[15]};
9530   MutableArrayRef<int> Mask(MaskStorage);
9531   MutableArrayRef<int> LoMask = Mask.slice(0, 8);
9532   MutableArrayRef<int> HiMask = Mask.slice(8, 8);
9533
9534   int NumV2Elements =
9535       std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 16; });
9536
9537   // For single-input shuffles, there are some nicer lowering tricks we can use.
9538   if (NumV2Elements == 0) {
9539     // Check for being able to broadcast a single element.
9540     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v16i8, DL, V1,
9541                                                           Mask, Subtarget, DAG))
9542       return Broadcast;
9543
9544     // Check whether we can widen this to an i16 shuffle by duplicating bytes.
9545     // Notably, this handles splat and partial-splat shuffles more efficiently.
9546     // However, it only makes sense if the pre-duplication shuffle simplifies
9547     // things significantly. Currently, this means we need to be able to
9548     // express the pre-duplication shuffle as an i16 shuffle.
9549     //
9550     // FIXME: We should check for other patterns which can be widened into an
9551     // i16 shuffle as well.
9552     auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
9553       for (int i = 0; i < 16; i += 2)
9554         if (Mask[i] != -1 && Mask[i + 1] != -1 && Mask[i] != Mask[i + 1])
9555           return false;
9556
9557       return true;
9558     };
9559     auto tryToWidenViaDuplication = [&]() -> SDValue {
9560       if (!canWidenViaDuplication(Mask))
9561         return SDValue();
9562       SmallVector<int, 4> LoInputs;
9563       std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(LoInputs),
9564                    [](int M) { return M >= 0 && M < 8; });
9565       std::sort(LoInputs.begin(), LoInputs.end());
9566       LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
9567                      LoInputs.end());
9568       SmallVector<int, 4> HiInputs;
9569       std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(HiInputs),
9570                    [](int M) { return M >= 8; });
9571       std::sort(HiInputs.begin(), HiInputs.end());
9572       HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
9573                      HiInputs.end());
9574
9575       bool TargetLo = LoInputs.size() >= HiInputs.size();
9576       ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
9577       ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
9578
9579       int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
9580       SmallDenseMap<int, int, 8> LaneMap;
9581       for (int I : InPlaceInputs) {
9582         PreDupI16Shuffle[I/2] = I/2;
9583         LaneMap[I] = I;
9584       }
9585       int j = TargetLo ? 0 : 4, je = j + 4;
9586       for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
9587         // Check if j is already a shuffle of this input. This happens when
9588         // there are two adjacent bytes after we move the low one.
9589         if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
9590           // If we haven't yet mapped the input, search for a slot into which
9591           // we can map it.
9592           while (j < je && PreDupI16Shuffle[j] != -1)
9593             ++j;
9594
9595           if (j == je)
9596             // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
9597             return SDValue();
9598
9599           // Map this input with the i16 shuffle.
9600           PreDupI16Shuffle[j] = MovingInputs[i] / 2;
9601         }
9602
9603         // Update the lane map based on the mapping we ended up with.
9604         LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
9605       }
9606       V1 = DAG.getNode(
9607           ISD::BITCAST, DL, MVT::v16i8,
9608           DAG.getVectorShuffle(MVT::v8i16, DL,
9609                                DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1),
9610                                DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
9611
9612       // Unpack the bytes to form the i16s that will be shuffled into place.
9613       V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
9614                        MVT::v16i8, V1, V1);
9615
9616       int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9617       for (int i = 0; i < 16; ++i)
9618         if (Mask[i] != -1) {
9619           int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
9620           assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
9621           if (PostDupI16Shuffle[i / 2] == -1)
9622             PostDupI16Shuffle[i / 2] = MappedMask;
9623           else
9624             assert(PostDupI16Shuffle[i / 2] == MappedMask &&
9625                    "Conflicting entrties in the original shuffle!");
9626         }
9627       return DAG.getNode(
9628           ISD::BITCAST, DL, MVT::v16i8,
9629           DAG.getVectorShuffle(MVT::v8i16, DL,
9630                                DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1),
9631                                DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
9632     };
9633     if (SDValue V = tryToWidenViaDuplication())
9634       return V;
9635   }
9636
9637   // Check whether an interleaving lowering is likely to be more efficient.
9638   // This isn't perfect but it is a strong heuristic that tends to work well on
9639   // the kinds of shuffles that show up in practice.
9640   //
9641   // FIXME: We need to handle other interleaving widths (i16, i32, ...).
9642   if (shouldLowerAsInterleaving(Mask)) {
9643     int NumLoHalf = std::count_if(Mask.begin(), Mask.end(), [](int M) {
9644       return (M >= 0 && M < 8) || (M >= 16 && M < 24);
9645     });
9646     int NumHiHalf = std::count_if(Mask.begin(), Mask.end(), [](int M) {
9647       return (M >= 8 && M < 16) || M >= 24;
9648     });
9649     int EMask[16] = {-1, -1, -1, -1, -1, -1, -1, -1,
9650                      -1, -1, -1, -1, -1, -1, -1, -1};
9651     int OMask[16] = {-1, -1, -1, -1, -1, -1, -1, -1,
9652                      -1, -1, -1, -1, -1, -1, -1, -1};
9653     bool UnpackLo = NumLoHalf >= NumHiHalf;
9654     MutableArrayRef<int> TargetEMask(UnpackLo ? EMask : EMask + 8, 8);
9655     MutableArrayRef<int> TargetOMask(UnpackLo ? OMask : OMask + 8, 8);
9656     for (int i = 0; i < 8; ++i) {
9657       TargetEMask[i] = Mask[2 * i];
9658       TargetOMask[i] = Mask[2 * i + 1];
9659     }
9660
9661     SDValue Evens = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, EMask);
9662     SDValue Odds = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, OMask);
9663
9664     return DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
9665                        MVT::v16i8, Evens, Odds);
9666   }
9667
9668   // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
9669   // with PSHUFB. It is important to do this before we attempt to generate any
9670   // blends but after all of the single-input lowerings. If the single input
9671   // lowerings can find an instruction sequence that is faster than a PSHUFB, we
9672   // want to preserve that and we can DAG combine any longer sequences into
9673   // a PSHUFB in the end. But once we start blending from multiple inputs,
9674   // the complexity of DAG combining bad patterns back into PSHUFB is too high,
9675   // and there are *very* few patterns that would actually be faster than the
9676   // PSHUFB approach because of its ability to zero lanes.
9677   //
9678   // FIXME: The only exceptions to the above are blends which are exact
9679   // interleavings with direct instructions supporting them. We currently don't
9680   // handle those well here.
9681   if (Subtarget->hasSSSE3()) {
9682     SDValue V1Mask[16];
9683     SDValue V2Mask[16];
9684     bool V1InUse = false;
9685     bool V2InUse = false;
9686     SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
9687
9688     for (int i = 0; i < 16; ++i) {
9689       if (Mask[i] == -1) {
9690         V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8);
9691       } else {
9692         const int ZeroMask = 0x80;
9693         int V1Idx = (Mask[i] < 16 ? Mask[i] : ZeroMask);
9694         int V2Idx = (Mask[i] < 16 ? ZeroMask : Mask[i] - 16);
9695         if (Zeroable[i])
9696           V1Idx = V2Idx = ZeroMask;
9697         V1Mask[i] = DAG.getConstant(V1Idx, MVT::i8);
9698         V2Mask[i] = DAG.getConstant(V2Idx, MVT::i8);
9699         V1InUse |= (ZeroMask != V1Idx);
9700         V2InUse |= (ZeroMask != V2Idx);
9701       }
9702     }
9703
9704     if (V1InUse)
9705       V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, V1,
9706                        DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V1Mask));
9707     if (V2InUse)
9708       V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, V2,
9709                        DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V2Mask));
9710
9711     // If we need shuffled inputs from both, blend the two.
9712     if (V1InUse && V2InUse)
9713       return DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
9714     if (V1InUse)
9715       return V1; // Single inputs are easy.
9716     if (V2InUse)
9717       return V2; // Single inputs are easy.
9718     // Shuffling to a zeroable vector.
9719     return getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
9720   }
9721
9722   // There are special ways we can lower some single-element blends.
9723   if (NumV2Elements == 1)
9724     if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v16i8, DL, V1, V2,
9725                                                          Mask, Subtarget, DAG))
9726       return V;
9727
9728   // Check whether a compaction lowering can be done. This handles shuffles
9729   // which take every Nth element for some even N. See the helper function for
9730   // details.
9731   //
9732   // We special case these as they can be particularly efficiently handled with
9733   // the PACKUSB instruction on x86 and they show up in common patterns of
9734   // rearranging bytes to truncate wide elements.
9735   if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask)) {
9736     // NumEvenDrops is the power of two stride of the elements. Another way of
9737     // thinking about it is that we need to drop the even elements this many
9738     // times to get the original input.
9739     bool IsSingleInput = isSingleInputShuffleMask(Mask);
9740
9741     // First we need to zero all the dropped bytes.
9742     assert(NumEvenDrops <= 3 &&
9743            "No support for dropping even elements more than 3 times.");
9744     // We use the mask type to pick which bytes are preserved based on how many
9745     // elements are dropped.
9746     MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 };
9747     SDValue ByteClearMask =
9748         DAG.getNode(ISD::BITCAST, DL, MVT::v16i8,
9749                     DAG.getConstant(0xFF, MaskVTs[NumEvenDrops - 1]));
9750     V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);
9751     if (!IsSingleInput)
9752       V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);
9753
9754     // Now pack things back together.
9755     V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1);
9756     V2 = IsSingleInput ? V1 : DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V2);
9757     SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2);
9758     for (int i = 1; i < NumEvenDrops; ++i) {
9759       Result = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, Result);
9760       Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
9761     }
9762
9763     return Result;
9764   }
9765
9766   int V1LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9767   int V1HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9768   int V2LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9769   int V2HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9770
9771   auto buildBlendMasks = [](MutableArrayRef<int> HalfMask,
9772                             MutableArrayRef<int> V1HalfBlendMask,
9773                             MutableArrayRef<int> V2HalfBlendMask) {
9774     for (int i = 0; i < 8; ++i)
9775       if (HalfMask[i] >= 0 && HalfMask[i] < 16) {
9776         V1HalfBlendMask[i] = HalfMask[i];
9777         HalfMask[i] = i;
9778       } else if (HalfMask[i] >= 16) {
9779         V2HalfBlendMask[i] = HalfMask[i] - 16;
9780         HalfMask[i] = i + 8;
9781       }
9782   };
9783   buildBlendMasks(LoMask, V1LoBlendMask, V2LoBlendMask);
9784   buildBlendMasks(HiMask, V1HiBlendMask, V2HiBlendMask);
9785
9786   SDValue Zero = getZeroVector(MVT::v8i16, Subtarget, DAG, DL);
9787
9788   auto buildLoAndHiV8s = [&](SDValue V, MutableArrayRef<int> LoBlendMask,
9789                              MutableArrayRef<int> HiBlendMask) {
9790     SDValue V1, V2;
9791     // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
9792     // them out and avoid using UNPCK{L,H} to extract the elements of V as
9793     // i16s.
9794     if (std::none_of(LoBlendMask.begin(), LoBlendMask.end(),
9795                      [](int M) { return M >= 0 && M % 2 == 1; }) &&
9796         std::none_of(HiBlendMask.begin(), HiBlendMask.end(),
9797                      [](int M) { return M >= 0 && M % 2 == 1; })) {
9798       // Use a mask to drop the high bytes.
9799       V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V);
9800       V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, V1,
9801                        DAG.getConstant(0x00FF, MVT::v8i16));
9802
9803       // This will be a single vector shuffle instead of a blend so nuke V2.
9804       V2 = DAG.getUNDEF(MVT::v8i16);
9805
9806       // Squash the masks to point directly into V1.
9807       for (int &M : LoBlendMask)
9808         if (M >= 0)
9809           M /= 2;
9810       for (int &M : HiBlendMask)
9811         if (M >= 0)
9812           M /= 2;
9813     } else {
9814       // Otherwise just unpack the low half of V into V1 and the high half into
9815       // V2 so that we can blend them as i16s.
9816       V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
9817                        DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
9818       V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
9819                        DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
9820     }
9821
9822     SDValue BlendedLo = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, LoBlendMask);
9823     SDValue BlendedHi = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, HiBlendMask);
9824     return std::make_pair(BlendedLo, BlendedHi);
9825   };
9826   SDValue V1Lo, V1Hi, V2Lo, V2Hi;
9827   std::tie(V1Lo, V1Hi) = buildLoAndHiV8s(V1, V1LoBlendMask, V1HiBlendMask);
9828   std::tie(V2Lo, V2Hi) = buildLoAndHiV8s(V2, V2LoBlendMask, V2HiBlendMask);
9829
9830   SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, V1Lo, V2Lo, LoMask);
9831   SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, V1Hi, V2Hi, HiMask);
9832
9833   return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
9834 }
9835
9836 /// \brief Dispatching routine to lower various 128-bit x86 vector shuffles.
9837 ///
9838 /// This routine breaks down the specific type of 128-bit shuffle and
9839 /// dispatches to the lowering routines accordingly.
     ///
     /// \p Op, \p V1, \p V2, \p Subtarget and \p DAG are forwarded unchanged to the
     /// routine selected by \p VT, and that routine's result is returned. Only
     /// v2i64, v2f64, v4i32, v4f32, v8i16 and v16i8 are handled here.
9840 static SDValue lower128BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
9841                                         MVT VT, const X86Subtarget *Subtarget,
9842                                         SelectionDAG &DAG) {
9843   switch (VT.SimpleTy) {
9844   case MVT::v2i64:
9845     return lowerV2I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
9846   case MVT::v2f64:
9847     return lowerV2F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
9848   case MVT::v4i32:
9849     return lowerV4I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
9850   case MVT::v4f32:
9851     return lowerV4F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
9852   case MVT::v8i16:
9853     return lowerV8I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
9854   case MVT::v16i8:
9855     return lowerV16I8VectorShuffle(Op, V1, V2, Subtarget, DAG);
9856
9857   default:
         // Any other value type is not handled by this dispatcher.
9858     llvm_unreachable("Unimplemented!");
9859   }
9860 }
9861
9862 /// \brief Helper function to test whether a shuffle mask could be
9863 /// simplified by widening the elements being shuffled.
9864 ///
9865 /// Appends the mask for wider elements in WidenedMask if valid. Otherwise
9866 /// leaves it in an unspecified state.
9867 ///
9868 /// NOTE: This must handle normal vector shuffle masks and *target* vector
9869 /// shuffle masks. The latter have the special property of a '-2' representing
9870 /// a zero-ed lane of a vector.
     ///
     /// \param Mask the mask to inspect. It is consumed as adjacent pairs
     /// (Mask[i], Mask[i + 1]), so its size is expected to be even.
     /// \param WidenedMask receives one entry per pair: the index of the wide
     /// element, SM_SentinelUndef, or SM_SentinelZero. Entries are appended, and
     /// the closing assert requires exactly Mask.size() / 2 entries afterwards,
     /// so callers are expected to pass it in empty.
     /// \returns true if every pair could be widened; false as soon as one pair
     /// cannot (WidenedMask may then be partially filled).
9871 static bool canWidenShuffleElements(ArrayRef<int> Mask,
9872                                     SmallVectorImpl<int> &WidenedMask) {
9873   for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
9874     // If both elements are undef, it's trivial.
9875     if (Mask[i] == SM_SentinelUndef && Mask[i + 1] == SM_SentinelUndef) {
9876       WidenedMask.push_back(SM_SentinelUndef);
9877       continue;
9878     }
9879
9880     // Check for an undef mask and a mask value properly aligned to fit with
9881     // a pair of values. If we find such a case, use the non-undef mask's value.
         // The defined entry must sit in the matching slot of its pair: an odd
         // index when it is the second element, an even index when it is the first.
9882     if (Mask[i] == SM_SentinelUndef && Mask[i + 1] >= 0 && Mask[i + 1] % 2 == 1) {
9883       WidenedMask.push_back(Mask[i + 1] / 2);
9884       continue;
9885     }
9886     if (Mask[i + 1] == SM_SentinelUndef && Mask[i] >= 0 && Mask[i] % 2 == 0) {
9887       WidenedMask.push_back(Mask[i] / 2);
9888       continue;
9889     }
9890
9891     // When zeroing, we need to spread the zeroing across both lanes to widen.
         // A zero entry paired with a real index cannot be widened, hence the
         // 'return false' when the other entry is neither zero nor undef.
9892     if (Mask[i] == SM_SentinelZero || Mask[i + 1] == SM_SentinelZero) {
9893       if ((Mask[i] == SM_SentinelZero || Mask[i] == SM_SentinelUndef) &&
9894           (Mask[i + 1] == SM_SentinelZero || Mask[i + 1] == SM_SentinelUndef)) {
9895         WidenedMask.push_back(SM_SentinelZero);
9896         continue;
9897       }
9898       return false;
9899     }
9900
9901     // Finally check if the two mask values are adjacent and aligned with
9902     // a pair.
9903     if (Mask[i] != SM_SentinelUndef && Mask[i] % 2 == 0 && Mask[i] + 1 == Mask[i + 1]) {
9904       WidenedMask.push_back(Mask[i] / 2);
9905       continue;
9906     }
9907
9908     // Otherwise we can't safely widen the elements used in this shuffle.
9909     return false;
9910   }
9911   assert(WidenedMask.size() == Mask.size() / 2 &&
9912          "Incorrect size of mask after widening the elements!");
9913
9914   return true;
9915 }
9916
9917 /// \brief Generic routine to split vector shuffle into half-sized shuffles.
9918 ///
9919 /// This routine just extracts two subvectors, shuffles them independently, and
9920 /// then concatenates them back together. This should work effectively with all
9921 /// AVX vector shuffle types.
     ///
     /// \param VT type of both inputs and of the result; asserted to be at least
     /// 256 bits wide.
     /// \param Mask shuffle mask indexing the concatenation of \p V1 and \p V2;
     /// negative entries are treated as undef.
     /// \returns a CONCAT_VECTORS node joining the two half-width shuffles.
9922 static SDValue splitAndLowerVectorShuffle(SDLoc DL, MVT VT, SDValue V1,
9923                                           SDValue V2, ArrayRef<int> Mask,
9924                                           SelectionDAG &DAG) {
9925   assert(VT.getSizeInBits() >= 256 &&
9926          "Only for 256-bit or wider vector shuffles!");
9927   assert(V1.getSimpleValueType() == VT && "Bad operand type!");
9928   assert(V2.getSimpleValueType() == VT && "Bad operand type!");
9929
       // Each half of the mask describes one half of the result.
9930   ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
9931   ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
9932
9933   int NumElements = VT.getVectorNumElements();
9934   int SplitNumElements = NumElements / 2;
9935   MVT ScalarVT = VT.getScalarType();
9936   MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2);
9937
       // Pull the low and high halves out of both inputs.
9938   SDValue LoV1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V1,
9939                              DAG.getIntPtrConstant(0));
9940   SDValue HiV1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V1,
9941                              DAG.getIntPtrConstant(SplitNumElements));
9942   SDValue LoV2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V2,
9943                              DAG.getIntPtrConstant(0));
9944   SDValue HiV2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V2,
9945                              DAG.getIntPtrConstant(SplitNumElements));
9946
9947   // Now create two 4-way blends of these half-width vectors.
9948   auto HalfBlend = [&](ArrayRef<int> HalfMask) {
9949     bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
         // V1BlendMask / V2BlendMask shuffle the two halves of one input into
         // position (indices are relative to that input). BlendMask then picks,
         // per element, the V1 result (index i) or the V2 result
         // (index SplitNumElements + i).
9950     SmallVector<int, 32> V1BlendMask, V2BlendMask, BlendMask;
9951     for (int i = 0; i < SplitNumElements; ++i) {
9952       int M = HalfMask[i];
9953       if (M >= NumElements) {
             // Element comes from V2; record which half of V2 it lives in.
9954         if (M >= NumElements + SplitNumElements)
9955           UseHiV2 = true;
9956         else
9957           UseLoV2 = true;
9958         V2BlendMask.push_back(M - NumElements);
9959         V1BlendMask.push_back(-1);
9960         BlendMask.push_back(SplitNumElements + i);
9961       } else if (M >= 0) {
             // Element comes from V1; record which half of V1 it lives in.
9962         if (M >= SplitNumElements)
9963           UseHiV1 = true;
9964         else
9965           UseLoV1 = true;
9966         V2BlendMask.push_back(-1);
9967         V1BlendMask.push_back(M);
9968         BlendMask.push_back(i);
9969       } else {
             // Undef element: leave it undef in all three masks.
9970         V2BlendMask.push_back(-1);
9971         V1BlendMask.push_back(-1);
9972         BlendMask.push_back(-1);
9973       }
9974     }
9975
9976     // Because the lowering happens after all combining takes place, we need to
9977     // manually combine these blend masks as much as possible so that we create
9978     // a minimal number of high-level vector shuffle nodes.
9979
9980     // First try just blending the halves of V1 or V2.
9981     if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
9982       return DAG.getUNDEF(SplitVT);
9983     if (!UseLoV2 && !UseHiV2)
9984       return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
9985     if (!UseLoV1 && !UseHiV1)
9986       return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
9987
         // Both inputs contribute. Build (or bypass) a per-input shuffle for each,
         // then blend the two results with BlendMask.
9988     SDValue V1Blend, V2Blend;
9989     if (UseLoV1 && UseHiV1) {
9990       V1Blend =
9991         DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
9992     } else {
9993       // We only use half of V1 so map the usage down into the final blend mask.
           // Entries below SplitNumElements are the ones that select from V1; they
           // are rewritten to index directly into the single half that is used.
9994       V1Blend = UseLoV1 ? LoV1 : HiV1;
9995       for (int i = 0; i < SplitNumElements; ++i)
9996         if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
9997           BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
9998     }
9999     if (UseLoV2 && UseHiV2) {
10000       V2Blend =
10001         DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
10002     } else {
10003       // We only use half of V2 so map the usage down into the final blend mask.
            // Entries at or above SplitNumElements select from the second operand.
            // Indices into the low half need the SplitNumElements offset added;
            // indices into the high half already carry it.
10004       V2Blend = UseLoV2 ? LoV2 : HiV2;
10005       for (int i = 0; i < SplitNumElements; ++i)
10006         if (BlendMask[i] >= SplitNumElements)
10007           BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
10008     }
10009     return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
10010   };
10011   SDValue Lo = HalfBlend(LoMask);
10012   SDValue Hi = HalfBlend(HiMask);
10013   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
10014 }
10015
10016 /// \brief Either split a vector in halves or decompose the shuffles and the
10017 /// blend.
10018 ///
10019 /// This is provided as a good fallback for many lowerings of non-single-input
10020 /// shuffles with more than one 128-bit lane. In those cases, we want to select
10021 /// between splitting the shuffle into 128-bit components and stitching those
10022 /// back together vs. extracting the single-input shuffles and blending those
10023 /// results.
      ///
      /// \param Mask shuffle mask over the concatenation of \p V1 and \p V2;
      /// asserted not to be a single-input mask.
      /// \returns the result of either splitAndLowerVectorShuffle or
      /// lowerVectorShuffleAsDecomposedShuffleBlend.
10024 static SDValue lowerVectorShuffleAsSplitOrBlend(SDLoc DL, MVT VT, SDValue V1,
10025                                                 SDValue V2, ArrayRef<int> Mask,
10026                                                 SelectionDAG &DAG) {
10027   assert(!isSingleInputShuffleMask(Mask) && "This routine must not be used to "
10028                                             "lower single-input shuffles as it "
10029                                             "could then recurse on itself.");
10030   int Size = Mask.size();
10031
10032   // If this can be modeled as a broadcast of two elements followed by a blend,
10033   // prefer that lowering. This is especially important because broadcasts can
10034   // often fold with memory operands.
        // The lambda returns true when every defined entry taken from V1 names the
        // same V1 element and every defined entry taken from V2 names the same V2
        // element.
10035   auto DoBothBroadcast = [&] {
10036     int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
10037     for (int M : Mask)
10038       if (M >= Size) {
10039         if (V2BroadcastIdx == -1)
10040           V2BroadcastIdx = M - Size;
10041         else if (M - Size != V2BroadcastIdx)
10042           return false;
10043       } else if (M >= 0) {
10044         if (V1BroadcastIdx == -1)
10045           V1BroadcastIdx = M;
10046         else if (M != V1BroadcastIdx)
10047           return false;
10048       }
10049     return true;
10050   };
10051   if (DoBothBroadcast())
10052     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
10053                                                       DAG);
10054
10055   // If the inputs all stem from a single 128-bit lane of each input, then we
10056   // split them rather than blending because the split will decompose to
10057   // unusually few instructions.
        // LaneInputs[k] has one bit per 128-bit lane of input k (0 = V1, 1 = V2);
        // a bit is set when some mask entry reads from that lane.
10058   int LaneCount = VT.getSizeInBits() / 128;
10059   int LaneSize = Size / LaneCount;
10060   SmallBitVector LaneInputs[2];
10061   LaneInputs[0].resize(LaneCount, false);
10062   LaneInputs[1].resize(LaneCount, false);
10063   for (int i = 0; i < Size; ++i)
10064     if (Mask[i] >= 0)
10065       LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
10066   if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
10067     return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
10068
10069   // Otherwise, just fall back to decomposed shuffles and a blend. This requires
10070   // that the decomposed single-input shuffles don't end up here.
10071   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
10072 }
10073
10074 /// \brief Lower a vector shuffle crossing multiple 128-bit lanes as
10075 /// a permutation and blend of those lanes.
10076 ///
10077 /// This essentially blends the out-of-lane inputs to each lane into the lane
10078 /// from a permuted copy of the vector. This lowering strategy results in four
10079 /// instructions in the worst case for a single-input cross lane shuffle which
10080 /// is lower than any other fully general cross-lane shuffle strategy I'm aware
10081 /// of. Special cases for each particular shuffle pattern should be handled
10082 /// prior to trying this lowering.
      ///
      /// \param VT asserted to be a 256-bit vector type.
      /// \param Mask shuffle mask over the concatenation of \p V1 and \p V2;
      /// negative entries are treated as undef.
10083 static SDValue lowerVectorShuffleAsLanePermuteAndBlend(SDLoc DL, MVT VT,
10084                                                        SDValue V1, SDValue V2,
10085                                                        ArrayRef<int> Mask,
10086                                                        SelectionDAG &DAG) {
10087   // FIXME: This should probably be generalized for 512-bit vectors as well.
10088   assert(VT.getSizeInBits() == 256 && "Only for 256-bit vector shuffles!");
        // A 256-bit vector has two 128-bit lanes, so a lane is half the mask.
10089   int LaneSize = Mask.size() / 2;
10090
10091   // If there are only inputs from one 128-bit lane, splitting will in fact be
10092   // less expensive. The flags track whether the given lane contains an element
10093   // that crosses to another lane.
        // LaneCrossing is indexed by the *source* lane of the crossing element.
10094   bool LaneCrossing[2] = {false, false};
10095   for (int i = 0, Size = Mask.size(); i < Size; ++i)
10096     if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
10097       LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
10098   if (!LaneCrossing[0] || !LaneCrossing[1])
10099     return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
10100
10101   if (isSingleInputShuffleMask(Mask)) {
          // Build a two-operand mask over (V1, Flipped): in-lane elements keep
          // their index; lane-crossing elements are redirected to the second
          // operand (+ Size) at the same in-lane offset, inside the destination
          // lane.
10102     SmallVector<int, 32> FlippedBlendMask;
10103     for (int i = 0, Size = Mask.size(); i < Size; ++i)
10104       FlippedBlendMask.push_back(
10105           Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize)
10106                                   ? Mask[i]
10107                                   : Mask[i] % LaneSize +
10108                                         (i / LaneSize) * LaneSize + Size));
10109
10110     // Flip the vector, and blend the results which should now be in-lane. The
10111     // VPERM2X128 mask uses the low 2 bits for the low source and bits 4 and
10112     // 5 for the high source. The value 3 selects the high half of source 2 and
10113     // the value 2 selects the low half of source 2. We only use source 2 to
10114     // allow folding it into a memory operand.
10115     unsigned PERMMask = 3 | 2 << 4;
10116     SDValue Flipped = DAG.getNode(X86ISD::VPERM2X128, DL, VT, DAG.getUNDEF(VT),
10117                                   V1, DAG.getConstant(PERMMask, MVT::i8));
10118     return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask);
10119   }
10120
10121   // This now reduces to two single-input shuffles of V1 and V2 which at worst
10122   // will be handled by the above logic and a blend of the results, much like
10123   // other patterns in AVX.
10124   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
10125 }
10126
10127 /// \brief Handle lowering 2-lane 128-bit shuffles.
10128 static SDValue lowerV2X128VectorShuffle(SDLoc DL, MVT VT, SDValue V1,
10129                                         SDValue V2, ArrayRef<int> Mask,
10130                                         const X86Subtarget *Subtarget,
10131                                         SelectionDAG &DAG) {
10132   // Blends are faster and handle all the non-lane-crossing cases.
10133   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
10134                                                 Subtarget, DAG))
10135     return Blend;
10136
10137   MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
10138                                VT.getVectorNumElements() / 2);
10139   // Check for patterns which can be matched with a single insert of a 128-bit
10140   // subvector.
10141   if (isShuffleEquivalent(Mask, 0, 1, 0, 1) ||
10142       isShuffleEquivalent(Mask, 0, 1, 4, 5)) {
10143     SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
10144                               DAG.getIntPtrConstant(0));
10145     SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
10146                               Mask[2] < 4 ? V1 : V2, DAG.getIntPtrConstant(0));
10147     return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
10148   }
10149   if (isShuffleEquivalent(Mask, 0, 1, 6, 7)) {
10150     SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
10151                               DAG.getIntPtrConstant(0));
10152     SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
10153                               DAG.getIntPtrConstant(2));
10154     return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
10155   }
10156
10157   // Otherwise form a 128-bit permutation.
10158   // FIXME: Detect zero-vector inputs and use the VPERM2X128 to zero that half.
10159   unsigned PermMask = Mask[0] / 2 | (Mask[2] / 2) << 4;
10160   return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
10161                      DAG.getConstant(PermMask, MVT::i8));
10162 }
10163
10164 /// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then
10165 /// shuffling each lane.
10166 ///
10167 /// This will only succeed when the result of fixing the 128-bit lanes results
10168 /// in a single-input non-lane-crossing shuffle with a repeating shuffle mask in
10169 /// each 128-bit lane. This handles many cases where we can quickly blend away
10170 /// the lane crosses early and then use simpler shuffles within each lane.
10171 ///
10172 /// FIXME: It might be worthwhile at some point to support this without
10173 /// requiring the 128-bit lane-relative shuffles to be repeating, but currently
10174 /// in x86 only floating point has interesting non-repeating shuffles, and even
10175 /// those are still *marginally* more expensive.
      ///
      /// \param Mask shuffle mask over the concatenation of \p V1 and \p V2;
      /// asserted not to be a single-input mask.
      /// \returns the lowered shuffle, or an empty SDValue when the mask does not
      /// fit the pattern described above.
10176 static SDValue lowerVectorShuffleByMerging128BitLanes(
10177     SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
10178     const X86Subtarget *Subtarget, SelectionDAG &DAG) {
10179   assert(!isSingleInputShuffleMask(Mask) &&
10180          "This is only useful with multiple inputs.");
10181
10182   int Size = Mask.size();
10183   int LaneSize = 128 / VT.getScalarSizeInBits();
10184   int NumLanes = Size / LaneSize;
10185   assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles.");
10186
10187   // See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also
10188   // check whether the in-128-bit lane shuffles share a repeating pattern.
        // Lanes[j] is the source lane (counted across V1 then V2) feeding result
        // lane j; InLaneMask[k] is the lane-relative source offset for position k
        // of every lane. -1 means "not yet constrained".
10189   SmallVector<int, 4> Lanes;
10190   Lanes.resize(NumLanes, -1);
10191   SmallVector<int, 4> InLaneMask;
10192   InLaneMask.resize(LaneSize, -1);
10193   for (int i = 0; i < Size; ++i) {
10194     if (Mask[i] < 0)
10195       continue;
10196
10197     int j = i / LaneSize;
10198
10199     if (Lanes[j] < 0) {
10200       // First entry we've seen for this lane.
10201       Lanes[j] = Mask[i] / LaneSize;
10202     } else if (Lanes[j] != Mask[i] / LaneSize) {
10203       // This doesn't match the lane selected previously!
10204       return SDValue();
10205     }
10206
10207     // Check that within each lane we have a consistent shuffle mask.
10208     int k = i % LaneSize;
10209     if (InLaneMask[k] < 0) {
10210       InLaneMask[k] = Mask[i] % LaneSize;
10211     } else if (InLaneMask[k] != Mask[i] % LaneSize) {
10212       // This doesn't fit a repeating in-lane mask.
10213       return SDValue();
10214     }
10215   }
10216
10217   // First shuffle the lanes into place.
        // The lane shuffle is done on 64-bit elements, so every 128-bit lane is a
        // pair of consecutive entries in LaneMask.
10218   MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64,
10219                                 VT.getSizeInBits() / 64);
10220   SmallVector<int, 8> LaneMask;
10221   LaneMask.resize(NumLanes * 2, -1);
10222   for (int i = 0; i < NumLanes; ++i)
10223     if (Lanes[i] >= 0) {
10224       LaneMask[2 * i + 0] = 2*Lanes[i] + 0;
10225       LaneMask[2 * i + 1] = 2*Lanes[i] + 1;
10226     }
10227
10228   V1 = DAG.getNode(ISD::BITCAST, DL, LaneVT, V1);
10229   V2 = DAG.getNode(ISD::BITCAST, DL, LaneVT, V2);
10230   SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask);
10231
10232   // Cast it back to the type we actually want.
10233   LaneShuffle = DAG.getNode(ISD::BITCAST, DL, VT, LaneShuffle);
10234
10235   // Now do a simple shuffle that isn't lane crossing.
        // Each element keeps its lane-relative source offset but now reads from
        // its own lane of LaneShuffle.
10236   SmallVector<int, 8> NewMask;
10237   NewMask.resize(Size, -1);
10238   for (int i = 0; i < Size; ++i)
10239     if (Mask[i] >= 0)
10240       NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize;
10241   assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) &&
10242          "Must not introduce lane crosses at this point!");
10243
10244   return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask);
10245 }
10246
10247 /// \brief Test whether the specified input (0 or 1) is in-place blended by the
10248 /// given mask.
10249 ///
10250 /// This returns true if the elements from a particular input are already in the
10251 /// slot required by the given mask and require no permutation.
      ///
      /// \param Input which shuffle operand to test; asserted to be 0 or 1.
      /// \param Mask shuffle mask over both operands; negative entries are ignored.
      /// \returns false if any entry taken from \p Input sits at a position
      /// different from its index within that input, true otherwise (including
      /// when the mask takes nothing from \p Input).
10252 static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
10253   assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
10254   int Size = Mask.size();
10255   for (int i = 0; i < Size; ++i)
          // Mask[i] / Size identifies the operand, Mask[i] % Size the element.
10256     if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
10257       return false;
10258
10259   return true;
10260 }
10261
10262 /// \brief Handle lowering of 4-lane 64-bit floating point shuffles.
10263 ///
10264 /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
10265 /// isn't available.
      ///
      /// \param Op the shuffle node; its mask is read through ShuffleVectorSDNode.
      /// \param V1, V2 the shuffle operands, both asserted to be v4f64.
      ///
      /// The lowering strategies below are tried in order and the first one that
      /// applies is returned.
10266 static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10267                                        const X86Subtarget *Subtarget,
10268                                        SelectionDAG &DAG) {
10269   SDLoc DL(Op);
10270   assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
10271   assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
10272   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10273   ArrayRef<int> Mask = SVOp->getMask();
10274   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
10275
        // If the mask moves whole 128-bit halves, use the dedicated 2x128 lowering.
        // Only the yes/no answer is used; the original Mask is what gets passed on.
10276   SmallVector<int, 4> WidenedMask;
10277   if (canWidenShuffleElements(Mask, WidenedMask))
10278     return lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask, Subtarget,
10279                                     DAG);
10280
10281   if (isSingleInputShuffleMask(Mask)) {
10282     // Check for being able to broadcast a single element.
10283     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4f64, DL, V1,
10284                                                           Mask, Subtarget, DAG))
10285       return Broadcast;
10286
10287     // Use low duplicate instructions for masks that match their pattern.
10288     if (isShuffleEquivalent(Mask, 0, 0, 2, 2))
10289       return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
10290
10291     if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
10292       // Non-half-crossing single input shuffles can be lowered with an
10293       // interleaved permutation.
            // Bit i of the immediate is set when result element i takes the odd
            // (second) element of its own 128-bit lane.
10294       unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
10295                               ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
10296       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
10297                          DAG.getConstant(VPERMILPMask, MVT::i8));
10298     }
10299
10300     // With AVX2 we have direct support for this permutation.
10301     if (Subtarget->hasAVX2())
10302       return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
10303                          getV4X86ShuffleImm8ForMask(Mask, DAG));
10304
10305     // Otherwise, fall back.
10306     return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask,
10307                                                    DAG);
10308   }
10309
10310   // X86 has dedicated unpack instructions that can handle specific blend
10311   // operations: UNPCKH and UNPCKL.
10312   if (isShuffleEquivalent(Mask, 0, 4, 2, 6))
10313     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f64, V1, V2);
10314   if (isShuffleEquivalent(Mask, 1, 5, 3, 7))
10315     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f64, V1, V2);
10316
10317   // If we have a single input to the zero element, insert that into V1 if we
10318   // can do so cheaply.
10319   int NumV2Elements =
10320       std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
10321   if (NumV2Elements == 1 && Mask[0] >= 4)
10322     if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
10323             MVT::v4f64, DL, V1, V2, Mask, Subtarget, DAG))
10324       return Insertion;
10325
10326   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
10327                                                 Subtarget, DAG))
10328     return Blend;
10329
10330   // Check if the blend happens to exactly fit that of SHUFPD.
        // First form: even result elements come from V1 and odd ones from V2, each
        // staying within its own 128-bit lane; undef entries are accepted. Each
        // immediate bit is set when the odd element of the lane is chosen.
10331   if ((Mask[0] == -1 || Mask[0] < 2) &&
10332       (Mask[1] == -1 || (Mask[1] >= 4 && Mask[1] < 6)) &&
10333       (Mask[2] == -1 || (Mask[2] >= 2 && Mask[2] < 4)) &&
10334       (Mask[3] == -1 || Mask[3] >= 6)) {
10335     unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 5) << 1) |
10336                           ((Mask[2] == 3) << 2) | ((Mask[3] == 7) << 3);
10337     return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f64, V1, V2,
10338                        DAG.getConstant(SHUFPDMask, MVT::i8));
10339   }
        // Second form: the same pattern with the roles of V1 and V2 exchanged, so
        // the operands are passed in swapped order.
10340   if ((Mask[0] == -1 || (Mask[0] >= 4 && Mask[0] < 6)) &&
10341       (Mask[1] == -1 || Mask[1] < 2) &&
10342       (Mask[2] == -1 || Mask[2] >= 6) &&
10343       (Mask[3] == -1 || (Mask[3] >= 2 && Mask[3] < 4))) {
10344     unsigned SHUFPDMask = (Mask[0] == 5) | ((Mask[1] == 1) << 1) |
10345                           ((Mask[2] == 7) << 2) | ((Mask[3] == 3) << 3);
10346     return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f64, V2, V1,
10347                        DAG.getConstant(SHUFPDMask, MVT::i8));
10348   }
10349
10350   // Try to simplify this by merging 128-bit lanes to enable a lane-based
10351   // shuffle. However, if we have AVX2 and either inputs are already in place,
10352   // we will be able to shuffle even across lanes the other input in a single
10353   // instruction so skip this pattern.
10354   if (!(Subtarget->hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
10355                                  isShuffleMaskInputInPlace(1, Mask))))
10356     if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
10357             DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
10358       return Result;
10359
10360   // If we have AVX2 then we always want to lower with a blend because on v4 we
10361   // can fully permute the elements.
10362   if (Subtarget->hasAVX2())
10363     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,
10364                                                       Mask, DAG);
10365
10366   // Otherwise fall back on generic lowering.
10367   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG);
10368 }
10369
10370 /// \brief Handle lowering of 4-lane 64-bit integer shuffles.
10371 ///
10372 /// This routine is only called when we have AVX2 and thus a reasonable
10373 /// instruction set for v4i64 shuffling.
      ///
      /// \param Op the shuffle node; its mask is read through ShuffleVectorSDNode.
      /// \param V1, V2 the shuffle operands, both asserted to be v4i64.
      /// \param Subtarget asserted to have AVX2.
      ///
      /// The lowering strategies below are tried in order and the first one that
      /// applies is returned.
10374 static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10375                                        const X86Subtarget *Subtarget,
10376                                        SelectionDAG &DAG) {
10377   SDLoc DL(Op);
10378   assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
10379   assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
10380   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10381   ArrayRef<int> Mask = SVOp->getMask();
10382   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
10383   assert(Subtarget->hasAVX2() && "We can only lower v4i64 with AVX2!");
10384
        // If the mask moves whole 128-bit halves, use the dedicated 2x128 lowering.
        // Only the yes/no answer is used; the original Mask is what gets passed on.
10385   SmallVector<int, 4> WidenedMask;
10386   if (canWidenShuffleElements(Mask, WidenedMask))
10387     return lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask, Subtarget,
10388                                     DAG);
10389
10390   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
10391                                                 Subtarget, DAG))
10392     return Blend;
10393
10394   // Check for being able to broadcast a single element.
10395   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4i64, DL, V1,
10396                                                         Mask, Subtarget, DAG))
10397     return Broadcast;
10398
10399   // When the shuffle is mirrored between the 128-bit lanes of the unit, we can
10400   // use lower latency instructions that will operate on both 128-bit lanes.
10401   SmallVector<int, 2> RepeatedMask;
10402   if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
10403     if (isSingleInputShuffleMask(Mask)) {
            // The shuffle is performed on the v8i32 bitcast of V1, so each 64-bit
            // index r of the repeated mask expands to the 32-bit pair
            // (2 * r, 2 * r + 1); undef entries stay -1.
10404       int PSHUFDMask[] = {-1, -1, -1, -1};
10405       for (int i = 0; i < 2; ++i)
10406         if (RepeatedMask[i] >= 0) {
10407           PSHUFDMask[2 * i] = 2 * RepeatedMask[i];
10408           PSHUFDMask[2 * i + 1] = 2 * RepeatedMask[i] + 1;
10409         }
10410       return DAG.getNode(
10411           ISD::BITCAST, DL, MVT::v4i64,
10412           DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
10413                       DAG.getNode(ISD::BITCAST, DL, MVT::v8i32, V1),
10414                       getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
10415     }
10416
10417     // Use dedicated unpack instructions for masks that match their pattern.
10418     if (isShuffleEquivalent(Mask, 0, 4, 2, 6))
10419       return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V1, V2);
10420     if (isShuffleEquivalent(Mask, 1, 5, 3, 7))
10421       return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V1, V2);
10422   }
10423
10424   // AVX2 provides a direct instruction for permuting a single input across
10425   // lanes.
10426   if (isSingleInputShuffleMask(Mask))
10427     return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
10428                        getV4X86ShuffleImm8ForMask(Mask, DAG));
10429
10430   // Try to simplify this by merging 128-bit lanes to enable a lane-based
10431   // shuffle. However, if we have AVX2 and either inputs are already in place,
10432   // we will be able to shuffle even across lanes the other input in a single
10433   // instruction so skip this pattern.
        // AVX2 is already asserted at the top of this function, so in asserts
        // builds the outcome here depends only on the two in-place tests.
10434   if (!(Subtarget->hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
10435                                  isShuffleMaskInputInPlace(1, Mask))))
10436     if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
10437             DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
10438       return Result;
10439
10440   // Otherwise fall back on generic blend lowering.
10441   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2,
10442                                                     Mask, DAG);
10443 }
10444
10445 /// \brief Handle lowering of 8-lane 32-bit floating point shuffles.
10446 ///
10447 /// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
10448 /// isn't available.
10449 static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10450                                        const X86Subtarget *Subtarget,
10451                                        SelectionDAG &DAG) {
10452   SDLoc DL(Op);
10453   assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
10454   assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
10455   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10456   ArrayRef<int> Mask = SVOp->getMask();
10457   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
10458
10459   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
10460                                                 Subtarget, DAG))
10461     return Blend;
10462
10463   // Check for being able to broadcast a single element.
10464   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8f32, DL, V1,
10465                                                         Mask, Subtarget, DAG))
10466     return Broadcast;
10467
10468   // If the shuffle mask is repeated in each 128-bit lane, we have many more
10469   // options to efficiently lower the shuffle.
10470   SmallVector<int, 4> RepeatedMask;
10471   if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
10472     assert(RepeatedMask.size() == 4 &&
10473            "Repeated masks must be half the mask width!");
10474
10475     // Use even/odd duplicate instructions for masks that match their pattern.
10476     if (isShuffleEquivalent(Mask, 0, 0, 2, 2, 4, 4, 6, 6))
10477       return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
10478     if (isShuffleEquivalent(Mask, 1, 1, 3, 3, 5, 5, 7, 7))
10479       return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
10480
10481     if (isSingleInputShuffleMask(Mask))
10482       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
10483                          getV4X86ShuffleImm8ForMask(RepeatedMask, DAG));
10484
10485     // Use dedicated unpack instructions for masks that match their pattern.
10486     if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 4, 12, 5, 13))
10487       return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f32, V1, V2);
10488     if (isShuffleEquivalent(Mask, 2, 10, 3, 11, 6, 14, 7, 15))
10489       return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f32, V1, V2);
10490
10491     // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
10492     // have already handled any direct blends. We also need to squash the
10493     // repeated mask into a simulated v4f32 mask.
10494     for (int i = 0; i < 4; ++i)
10495       if (RepeatedMask[i] >= 8)
10496         RepeatedMask[i] -= 4;
10497     return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
10498   }
10499
10500   // If we have a single input shuffle with different shuffle patterns in the
10501   // two 128-bit lanes use the variable mask to VPERMILPS.
10502   if (isSingleInputShuffleMask(Mask)) {
10503     SDValue VPermMask[8];
10504     for (int i = 0; i < 8; ++i)
10505       VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32)
10506                                  : DAG.getConstant(Mask[i], MVT::i32);
10507     if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))
10508       return DAG.getNode(
10509           X86ISD::VPERMILPV, DL, MVT::v8f32, V1,
10510           DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask));
10511
10512     if (Subtarget->hasAVX2())
10513       return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32,
10514                          DAG.getNode(ISD::BITCAST, DL, MVT::v8f32,
10515                                      DAG.getNode(ISD::BUILD_VECTOR, DL,
10516                                                  MVT::v8i32, VPermMask)),
10517                          V1);
10518
10519     // Otherwise, fall back.
10520     return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
10521                                                    DAG);
10522   }
10523
10524   // Try to simplify this by merging 128-bit lanes to enable a lane-based
10525   // shuffle.
10526   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
10527           DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
10528     return Result;
10529
10530   // If we have AVX2 then we always want to lower with a blend because at v8 we
10531   // can fully permute the elements.
10532   if (Subtarget->hasAVX2())
10533     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
10534                                                       Mask, DAG);
10535
10536   // Otherwise fall back on generic lowering.
10537   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG);
10538 }
10539
10540 /// \brief Handle lowering of 8-lane 32-bit integer shuffles.
10541 ///
10542 /// This routine is only called when we have AVX2 and thus a reasonable
10543 /// instruction set for v8i32 shuffling..
10544 static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10545                                        const X86Subtarget *Subtarget,
10546                                        SelectionDAG &DAG) {
10547   SDLoc DL(Op);
10548   assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
10549   assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
10550   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10551   ArrayRef<int> Mask = SVOp->getMask();
10552   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
10553   assert(Subtarget->hasAVX2() && "We can only lower v8i32 with AVX2!");
10554
10555   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
10556                                                 Subtarget, DAG))
10557     return Blend;
10558
10559   // Check for being able to broadcast a single element.
10560   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8i32, DL, V1,
10561                                                         Mask, Subtarget, DAG))
10562     return Broadcast;
10563
10564   // If the shuffle mask is repeated in each 128-bit lane we can use more
10565   // efficient instructions that mirror the shuffles across the two 128-bit
10566   // lanes.
10567   SmallVector<int, 4> RepeatedMask;
10568   if (is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask)) {
10569     assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
10570     if (isSingleInputShuffleMask(Mask))
10571       return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
10572                          getV4X86ShuffleImm8ForMask(RepeatedMask, DAG));
10573
10574     // Use dedicated unpack instructions for masks that match their pattern.
10575     if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 4, 12, 5, 13))
10576       return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i32, V1, V2);
10577     if (isShuffleEquivalent(Mask, 2, 10, 3, 11, 6, 14, 7, 15))
10578       return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i32, V1, V2);
10579   }
10580
10581   // If the shuffle patterns aren't repeated but it is a single input, directly
10582   // generate a cross-lane VPERMD instruction.
10583   if (isSingleInputShuffleMask(Mask)) {
10584     SDValue VPermMask[8];
10585     for (int i = 0; i < 8; ++i)
10586       VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32)
10587                                  : DAG.getConstant(Mask[i], MVT::i32);
10588     return DAG.getNode(
10589         X86ISD::VPERMV, DL, MVT::v8i32,
10590         DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask), V1);
10591   }
10592
10593   // Try to simplify this by merging 128-bit lanes to enable a lane-based
10594   // shuffle.
10595   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
10596           DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
10597     return Result;
10598
10599   // Otherwise fall back on generic blend lowering.
10600   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2,
10601                                                     Mask, DAG);
10602 }
10603
10604 /// \brief Handle lowering of 16-lane 16-bit integer shuffles.
10605 ///
10606 /// This routine is only called when we have AVX2 and thus a reasonable
10607 /// instruction set for v16i16 shuffling..
10608 static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10609                                         const X86Subtarget *Subtarget,
10610                                         SelectionDAG &DAG) {
10611   SDLoc DL(Op);
10612   assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
10613   assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
10614   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10615   ArrayRef<int> Mask = SVOp->getMask();
10616   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
10617   assert(Subtarget->hasAVX2() && "We can only lower v16i16 with AVX2!");
10618
10619   // Check for being able to broadcast a single element.
10620   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v16i16, DL, V1,
10621                                                         Mask, Subtarget, DAG))
10622     return Broadcast;
10623
10624   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
10625                                                 Subtarget, DAG))
10626     return Blend;
10627
10628   // Use dedicated unpack instructions for masks that match their pattern.
10629   if (isShuffleEquivalent(Mask,
10630                           // First 128-bit lane:
10631                           0, 16, 1, 17, 2, 18, 3, 19,
10632                           // Second 128-bit lane:
10633                           8, 24, 9, 25, 10, 26, 11, 27))
10634     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i16, V1, V2);
10635   if (isShuffleEquivalent(Mask,
10636                           // First 128-bit lane:
10637                           4, 20, 5, 21, 6, 22, 7, 23,
10638                           // Second 128-bit lane:
10639                           12, 28, 13, 29, 14, 30, 15, 31))
10640     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i16, V1, V2);
10641
10642   if (isSingleInputShuffleMask(Mask)) {
10643     // There are no generalized cross-lane shuffle operations available on i16
10644     // element types.
10645     if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
10646       return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
10647                                                      Mask, DAG);
10648
10649     SDValue PSHUFBMask[32];
10650     for (int i = 0; i < 16; ++i) {
10651       if (Mask[i] == -1) {
10652         PSHUFBMask[2 * i] = PSHUFBMask[2 * i + 1] = DAG.getUNDEF(MVT::i8);
10653         continue;
10654       }
10655
10656       int M = i < 8 ? Mask[i] : Mask[i] - 8;
10657       assert(M >= 0 && M < 8 && "Invalid single-input mask!");
10658       PSHUFBMask[2 * i] = DAG.getConstant(2 * M, MVT::i8);
10659       PSHUFBMask[2 * i + 1] = DAG.getConstant(2 * M + 1, MVT::i8);
10660     }
10661     return DAG.getNode(
10662         ISD::BITCAST, DL, MVT::v16i16,
10663         DAG.getNode(
10664             X86ISD::PSHUFB, DL, MVT::v32i8,
10665             DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V1),
10666             DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask)));
10667   }
10668
10669   // Try to simplify this by merging 128-bit lanes to enable a lane-based
10670   // shuffle.
10671   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
10672           DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
10673     return Result;
10674
10675   // Otherwise fall back on generic lowering.
10676   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG);
10677 }
10678
10679 /// \brief Handle lowering of 32-lane 8-bit integer shuffles.
10680 ///
10681 /// This routine is only called when we have AVX2 and thus a reasonable
10682 /// instruction set for v32i8 shuffling..
10683 static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10684                                        const X86Subtarget *Subtarget,
10685                                        SelectionDAG &DAG) {
10686   SDLoc DL(Op);
10687   assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
10688   assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
10689   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10690   ArrayRef<int> Mask = SVOp->getMask();
10691   assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
10692   assert(Subtarget->hasAVX2() && "We can only lower v32i8 with AVX2!");
10693
10694   // Check for being able to broadcast a single element.
10695   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v32i8, DL, V1,
10696                                                         Mask, Subtarget, DAG))
10697     return Broadcast;
10698
10699   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
10700                                                 Subtarget, DAG))
10701     return Blend;
10702
10703   // Use dedicated unpack instructions for masks that match their pattern.
10704   // Note that these are repeated 128-bit lane unpacks, not unpacks across all
10705   // 256-bit lanes.
10706   if (isShuffleEquivalent(
10707           Mask,
10708           // First 128-bit lane:
10709           0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39,
10710           // Second 128-bit lane:
10711           16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55))
10712     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v32i8, V1, V2);
10713   if (isShuffleEquivalent(
10714           Mask,
10715           // First 128-bit lane:
10716           8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47,
10717           // Second 128-bit lane:
10718           24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63))
10719     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v32i8, V1, V2);
10720
10721   if (isSingleInputShuffleMask(Mask)) {
10722     // There are no generalized cross-lane shuffle operations available on i8
10723     // element types.
10724     if (is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
10725       return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2,
10726                                                      Mask, DAG);
10727
10728     SDValue PSHUFBMask[32];
10729     for (int i = 0; i < 32; ++i)
10730       PSHUFBMask[i] =
10731           Mask[i] < 0
10732               ? DAG.getUNDEF(MVT::i8)
10733               : DAG.getConstant(Mask[i] < 16 ? Mask[i] : Mask[i] - 16, MVT::i8);
10734
10735     return DAG.getNode(
10736         X86ISD::PSHUFB, DL, MVT::v32i8, V1,
10737         DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask));
10738   }
10739
10740   // Try to simplify this by merging 128-bit lanes to enable a lane-based
10741   // shuffle.
10742   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
10743           DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
10744     return Result;
10745
10746   // Otherwise fall back on generic lowering.
10747   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG);
10748 }
10749
10750 /// \brief High-level routine to lower various 256-bit x86 vector shuffles.
10751 ///
10752 /// This routine either breaks down the specific type of a 256-bit x86 vector
10753 /// shuffle or splits it into two 128-bit shuffles and fuses the results back
10754 /// together based on the available instructions.
10755 static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10756                                         MVT VT, const X86Subtarget *Subtarget,
10757                                         SelectionDAG &DAG) {
10758   SDLoc DL(Op);
10759   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10760   ArrayRef<int> Mask = SVOp->getMask();
10761
10762   // There is a really nice hard cut-over between AVX1 and AVX2 that means we can
10763   // check for those subtargets here and avoid much of the subtarget querying in
10764   // the per-vector-type lowering routines. With AVX1 we have essentially *zero*
10765   // ability to manipulate a 256-bit vector with integer types. Since we'll use
10766   // floating point types there eventually, just immediately cast everything to
10767   // a float and operate entirely in that domain.
10768   if (VT.isInteger() && !Subtarget->hasAVX2()) {
10769     int ElementBits = VT.getScalarSizeInBits();
10770     if (ElementBits < 32)
10771       // No floating point type available, decompose into 128-bit vectors.
10772       return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
10773
10774     MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
10775                                 VT.getVectorNumElements());
10776     V1 = DAG.getNode(ISD::BITCAST, DL, FpVT, V1);
10777     V2 = DAG.getNode(ISD::BITCAST, DL, FpVT, V2);
10778     return DAG.getNode(ISD::BITCAST, DL, VT,
10779                        DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
10780   }
10781
10782   switch (VT.SimpleTy) {
10783   case MVT::v4f64:
10784     return lowerV4F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
10785   case MVT::v4i64:
10786     return lowerV4I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
10787   case MVT::v8f32:
10788     return lowerV8F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
10789   case MVT::v8i32:
10790     return lowerV8I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
10791   case MVT::v16i16:
10792     return lowerV16I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
10793   case MVT::v32i8:
10794     return lowerV32I8VectorShuffle(Op, V1, V2, Subtarget, DAG);
10795
10796   default:
10797     llvm_unreachable("Not a valid 256-bit x86 vector type!");
10798   }
10799 }
10800
10801 /// \brief Handle lowering of 8-lane 64-bit floating point shuffles.
10802 static SDValue lowerV8F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10803                                        const X86Subtarget *Subtarget,
10804                                        SelectionDAG &DAG) {
10805   SDLoc DL(Op);
10806   assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
10807   assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
10808   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10809   ArrayRef<int> Mask = SVOp->getMask();
10810   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
10811
10812   // X86 has dedicated unpack instructions that can handle specific blend
10813   // operations: UNPCKH and UNPCKL.
10814   if (isShuffleEquivalent(Mask, 0, 8, 2, 10, 4, 12, 6, 14))
10815     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f64, V1, V2);
10816   if (isShuffleEquivalent(Mask, 1, 9, 3, 11, 5, 13, 7, 15))
10817     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f64, V1, V2);
10818
10819   // FIXME: Implement direct support for this type!
10820   return splitAndLowerVectorShuffle(DL, MVT::v8f64, V1, V2, Mask, DAG);
10821 }
10822
10823 /// \brief Handle lowering of 16-lane 32-bit floating point shuffles.
10824 static SDValue lowerV16F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10825                                        const X86Subtarget *Subtarget,
10826                                        SelectionDAG &DAG) {
10827   SDLoc DL(Op);
10828   assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
10829   assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
10830   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10831   ArrayRef<int> Mask = SVOp->getMask();
10832   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
10833
10834   // Use dedicated unpack instructions for masks that match their pattern.
10835   if (isShuffleEquivalent(Mask,
10836                           0, 16, 1, 17, 4, 20, 5, 21,
10837                           8, 24, 9, 25, 12, 28, 13, 29))
10838     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16f32, V1, V2);
10839   if (isShuffleEquivalent(Mask,
10840                           2, 18, 3, 19, 6, 22, 7, 23,
10841                           10, 26, 11, 27, 14, 30, 15, 31))
10842     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16f32, V1, V2);
10843
10844   // FIXME: Implement direct support for this type!
10845   return splitAndLowerVectorShuffle(DL, MVT::v16f32, V1, V2, Mask, DAG);
10846 }
10847
10848 /// \brief Handle lowering of 8-lane 64-bit integer shuffles.
10849 static SDValue lowerV8I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10850                                        const X86Subtarget *Subtarget,
10851                                        SelectionDAG &DAG) {
10852   SDLoc DL(Op);
10853   assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
10854   assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
10855   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10856   ArrayRef<int> Mask = SVOp->getMask();
10857   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
10858
10859   // X86 has dedicated unpack instructions that can handle specific blend
10860   // operations: UNPCKH and UNPCKL.
10861   if (isShuffleEquivalent(Mask, 0, 8, 2, 10, 4, 12, 6, 14))
10862     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i64, V1, V2);
10863   if (isShuffleEquivalent(Mask, 1, 9, 3, 11, 5, 13, 7, 15))
10864     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i64, V1, V2);
10865
10866   // FIXME: Implement direct support for this type!
10867   return splitAndLowerVectorShuffle(DL, MVT::v8i64, V1, V2, Mask, DAG);
10868 }
10869
10870 /// \brief Handle lowering of 16-lane 32-bit integer shuffles.
10871 static SDValue lowerV16I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10872                                        const X86Subtarget *Subtarget,
10873                                        SelectionDAG &DAG) {
10874   SDLoc DL(Op);
10875   assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
10876   assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
10877   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10878   ArrayRef<int> Mask = SVOp->getMask();
10879   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
10880
10881   // Use dedicated unpack instructions for masks that match their pattern.
10882   if (isShuffleEquivalent(Mask,
10883                           0, 16, 1, 17, 4, 20, 5, 21,
10884                           8, 24, 9, 25, 12, 28, 13, 29))
10885     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i32, V1, V2);
10886   if (isShuffleEquivalent(Mask,
10887                           2, 18, 3, 19, 6, 22, 7, 23,
10888                           10, 26, 11, 27, 14, 30, 15, 31))
10889     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i32, V1, V2);
10890
10891   // FIXME: Implement direct support for this type!
10892   return splitAndLowerVectorShuffle(DL, MVT::v16i32, V1, V2, Mask, DAG);
10893 }
10894
10895 /// \brief Handle lowering of 32-lane 16-bit integer shuffles.
10896 static SDValue lowerV32I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10897                                         const X86Subtarget *Subtarget,
10898                                         SelectionDAG &DAG) {
10899   SDLoc DL(Op);
10900   assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
10901   assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
10902   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10903   ArrayRef<int> Mask = SVOp->getMask();
10904   assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
10905   assert(Subtarget->hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
10906
10907   // FIXME: Implement direct support for this type!
10908   return splitAndLowerVectorShuffle(DL, MVT::v32i16, V1, V2, Mask, DAG);
10909 }
10910
10911 /// \brief Handle lowering of 64-lane 8-bit integer shuffles.
10912 static SDValue lowerV64I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10913                                        const X86Subtarget *Subtarget,
10914                                        SelectionDAG &DAG) {
10915   SDLoc DL(Op);
10916   assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
10917   assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
10918   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10919   ArrayRef<int> Mask = SVOp->getMask();
10920   assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
10921   assert(Subtarget->hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
10922
10923   // FIXME: Implement direct support for this type!
10924   return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
10925 }
10926
10927 /// \brief High-level routine to lower various 512-bit x86 vector shuffles.
10928 ///
10929 /// This routine either breaks down the specific type of a 512-bit x86 vector
10930 /// shuffle or splits it into two 256-bit shuffles and fuses the results back
10931 /// together based on the available instructions.
10932 static SDValue lower512BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10933                                         MVT VT, const X86Subtarget *Subtarget,
10934                                         SelectionDAG &DAG) {
10935   SDLoc DL(Op);
10936   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10937   ArrayRef<int> Mask = SVOp->getMask();
10938   assert(Subtarget->hasAVX512() &&
10939          "Cannot lower 512-bit vectors w/ basic ISA!");
10940
10941   // Check for being able to broadcast a single element.
10942   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(VT.SimpleTy, DL, V1,
10943                                                         Mask, Subtarget, DAG))
10944     return Broadcast;
10945
10946   // Dispatch to each element type for lowering. If we don't have supprot for
10947   // specific element type shuffles at 512 bits, immediately split them and
10948   // lower them. Each lowering routine of a given type is allowed to assume that
10949   // the requisite ISA extensions for that element type are available.
10950   switch (VT.SimpleTy) {
10951   case MVT::v8f64:
10952     return lowerV8F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
10953   case MVT::v16f32:
10954     return lowerV16F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
10955   case MVT::v8i64:
10956     return lowerV8I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
10957   case MVT::v16i32:
10958     return lowerV16I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
10959   case MVT::v32i16:
10960     if (Subtarget->hasBWI())
10961       return lowerV32I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
10962     break;
10963   case MVT::v64i8:
10964     if (Subtarget->hasBWI())
10965       return lowerV64I8VectorShuffle(Op, V1, V2, Subtarget, DAG);
10966     break;
10967
10968   default:
10969     llvm_unreachable("Not a valid 512-bit x86 vector type!");
10970   }
10971
10972   // Otherwise fall back on splitting.
10973   return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
10974 }
10975
10976 /// \brief Top-level lowering for x86 vector shuffles.
10977 ///
10978 /// This handles decomposition, canonicalization, and lowering of all x86
10979 /// vector shuffles. Most of the specific lowering strategies are encapsulated
10980 /// above in helper routines. The canonicalization attempts to widen shuffles
10981 /// to involve fewer lanes of wider elements, consolidate symmetric patterns
10982 /// s.t. only one of the two inputs needs to be tested, etc.
10983 static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
10984                                   SelectionDAG &DAG) {
10985   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10986   ArrayRef<int> Mask = SVOp->getMask();
10987   SDValue V1 = Op.getOperand(0);
10988   SDValue V2 = Op.getOperand(1);
10989   MVT VT = Op.getSimpleValueType();
10990   int NumElements = VT.getVectorNumElements();
10991   SDLoc dl(Op);
10992
10993   assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles");
10994
10995   bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
10996   bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
10997   if (V1IsUndef && V2IsUndef)
10998     return DAG.getUNDEF(VT);
10999
11000   // When we create a shuffle node we put the UNDEF node to second operand,
11001   // but in some cases the first operand may be transformed to UNDEF.
11002   // In this case we should just commute the node.
11003   if (V1IsUndef)
11004     return DAG.getCommutedVectorShuffle(*SVOp);
11005
11006   // Check for non-undef masks pointing at an undef vector and make the masks
11007   // undef as well. This makes it easier to match the shuffle based solely on
11008   // the mask.
11009   if (V2IsUndef)
11010     for (int M : Mask)
11011       if (M >= NumElements) {
11012         SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
11013         for (int &M : NewMask)
11014           if (M >= NumElements)
11015             M = -1;
11016         return DAG.getVectorShuffle(VT, dl, V1, V2, NewMask);
11017       }
11018
11019   // Try to collapse shuffles into using a vector type with fewer elements but
11020   // wider element types. We cap this to not form integers or floating point
11021   // elements wider than 64 bits, but it might be interesting to form i128
11022   // integers to handle flipping the low and high halves of AVX 256-bit vectors.
11023   SmallVector<int, 16> WidenedMask;
11024   if (VT.getScalarSizeInBits() < 64 &&
11025       canWidenShuffleElements(Mask, WidenedMask)) {
11026     MVT NewEltVT = VT.isFloatingPoint()
11027                        ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
11028                        : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
11029     MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
11030     // Make sure that the new vector type is legal. For example, v2f64 isn't
11031     // legal on SSE1.
11032     if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
11033       V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, V1);
11034       V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, V2);
11035       return DAG.getNode(ISD::BITCAST, dl, VT,
11036                          DAG.getVectorShuffle(NewVT, dl, V1, V2, WidenedMask));
11037     }
11038   }
11039
11040   int NumV1Elements = 0, NumUndefElements = 0, NumV2Elements = 0;
11041   for (int M : SVOp->getMask())
11042     if (M < 0)
11043       ++NumUndefElements;
11044     else if (M < NumElements)
11045       ++NumV1Elements;
11046     else
11047       ++NumV2Elements;
11048
11049   // Commute the shuffle as needed such that more elements come from V1 than
11050   // V2. This allows us to match the shuffle pattern strictly on how many
11051   // elements come from V1 without handling the symmetric cases.
11052   if (NumV2Elements > NumV1Elements)
11053     return DAG.getCommutedVectorShuffle(*SVOp);
11054
11055   // When the number of V1 and V2 elements are the same, try to minimize the
11056   // number of uses of V2 in the low half of the vector. When that is tied,
11057   // ensure that the sum of indices for V1 is equal to or lower than the sum
11058   // indices for V2. When those are equal, try to ensure that the number of odd
11059   // indices for V1 is lower than the number of odd indices for V2.
11060   if (NumV1Elements == NumV2Elements) {
11061     int LowV1Elements = 0, LowV2Elements = 0;
11062     for (int M : SVOp->getMask().slice(0, NumElements / 2))
11063       if (M >= NumElements)
11064         ++LowV2Elements;
11065       else if (M >= 0)
11066         ++LowV1Elements;
11067     if (LowV2Elements > LowV1Elements) {
11068       return DAG.getCommutedVectorShuffle(*SVOp);
11069     } else if (LowV2Elements == LowV1Elements) {
11070       int SumV1Indices = 0, SumV2Indices = 0;
11071       for (int i = 0, Size = SVOp->getMask().size(); i < Size; ++i)
11072         if (SVOp->getMask()[i] >= NumElements)
11073           SumV2Indices += i;
11074         else if (SVOp->getMask()[i] >= 0)
11075           SumV1Indices += i;
11076       if (SumV2Indices < SumV1Indices) {
11077         return DAG.getCommutedVectorShuffle(*SVOp);
11078       } else if (SumV2Indices == SumV1Indices) {
11079         int NumV1OddIndices = 0, NumV2OddIndices = 0;
11080         for (int i = 0, Size = SVOp->getMask().size(); i < Size; ++i)
11081           if (SVOp->getMask()[i] >= NumElements)
11082             NumV2OddIndices += i % 2;
11083           else if (SVOp->getMask()[i] >= 0)
11084             NumV1OddIndices += i % 2;
11085         if (NumV2OddIndices < NumV1OddIndices)
11086           return DAG.getCommutedVectorShuffle(*SVOp);
11087       }
11088     }
11089   }
11090
11091   // For each vector width, delegate to a specialized lowering routine.
11092   if (VT.getSizeInBits() == 128)
11093     return lower128BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
11094
11095   if (VT.getSizeInBits() == 256)
11096     return lower256BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
11097
11098   // Force AVX-512 vectors to be scalarized for now.
11099   // FIXME: Implement AVX-512 support!
11100   if (VT.getSizeInBits() == 512)
11101     return lower512BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
11102
11103   llvm_unreachable("Unimplemented!");
11104 }
11105
11106
11107 //===----------------------------------------------------------------------===//
11108 // Legacy vector shuffle lowering
11109 //
11110 // This code is the legacy code handling vector shuffles until the above
11111 // replaces its functionality and performance.
11112 //===----------------------------------------------------------------------===//
11113
11114 static bool isBlendMask(ArrayRef<int> MaskVals, MVT VT, bool hasSSE41,
11115                         bool hasInt256, unsigned *MaskOut = nullptr) {
11116   MVT EltVT = VT.getVectorElementType();
11117
11118   // There is no blend with immediate in AVX-512.
11119   if (VT.is512BitVector())
11120     return false;
11121
11122   if (!hasSSE41 || EltVT == MVT::i8)
11123     return false;
11124   if (!hasInt256 && VT == MVT::v16i16)
11125     return false;
11126
11127   unsigned MaskValue = 0;
11128   unsigned NumElems = VT.getVectorNumElements();
11129   // There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
11130   unsigned NumLanes = (NumElems - 1) / 8 + 1;
11131   unsigned NumElemsInLane = NumElems / NumLanes;
11132
11133   // Blend for v16i16 should be symetric for the both lanes.
11134   for (unsigned i = 0; i < NumElemsInLane; ++i) {
11135
11136     int SndLaneEltIdx = (NumLanes == 2) ? MaskVals[i + NumElemsInLane] : -1;
11137     int EltIdx = MaskVals[i];
11138
11139     if ((EltIdx < 0 || EltIdx == (int)i) &&
11140         (SndLaneEltIdx < 0 || SndLaneEltIdx == (int)(i + NumElemsInLane)))
11141       continue;
11142
11143     if (((unsigned)EltIdx == (i + NumElems)) &&
11144         (SndLaneEltIdx < 0 ||
11145          (unsigned)SndLaneEltIdx == i + NumElems + NumElemsInLane))
11146       MaskValue |= (1 << i);
11147     else
11148       return false;
11149   }
11150
11151   if (MaskOut)
11152     *MaskOut = MaskValue;
11153   return true;
11154 }
11155
11156 // Try to lower a shuffle node into a simple blend instruction.
11157 // This function assumes isBlendMask returns true for this
11158 // SuffleVectorSDNode
11159 static SDValue LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp,
11160                                           unsigned MaskValue,
11161                                           const X86Subtarget *Subtarget,
11162                                           SelectionDAG &DAG) {
11163   MVT VT = SVOp->getSimpleValueType(0);
11164   MVT EltVT = VT.getVectorElementType();
11165   assert(isBlendMask(SVOp->getMask(), VT, Subtarget->hasSSE41(),
11166                      Subtarget->hasInt256() && "Trying to lower a "
11167                                                "VECTOR_SHUFFLE to a Blend but "
11168                                                "with the wrong mask"));
11169   SDValue V1 = SVOp->getOperand(0);
11170   SDValue V2 = SVOp->getOperand(1);
11171   SDLoc dl(SVOp);
11172   unsigned NumElems = VT.getVectorNumElements();
11173
11174   // Convert i32 vectors to floating point if it is not AVX2.
11175   // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors.
11176   MVT BlendVT = VT;
11177   if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) {
11178     BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()),
11179                                NumElems);
11180     V1 = DAG.getNode(ISD::BITCAST, dl, VT, V1);
11181     V2 = DAG.getNode(ISD::BITCAST, dl, VT, V2);
11182   }
11183
11184   SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, V1, V2,
11185                             DAG.getConstant(MaskValue, MVT::i32));
11186   return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
11187 }
11188
11189 /// In vector type \p VT, return true if the element at index \p InputIdx
11190 /// falls on a different 128-bit lane than \p OutputIdx.
11191 static bool ShuffleCrosses128bitLane(MVT VT, unsigned InputIdx,
11192                                      unsigned OutputIdx) {
11193   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
11194   return InputIdx * EltSize / 128 != OutputIdx * EltSize / 128;
11195 }
11196
11197 /// Generate a PSHUFB if possible.  Selects elements from \p V1 according to
11198 /// \p MaskVals.  MaskVals[OutputIdx] = InputIdx specifies that we want to
11199 /// shuffle the element at InputIdx in V1 to OutputIdx in the result.  If \p
11200 /// MaskVals refers to elements outside of \p V1 or is undef (-1), insert a
11201 /// zero.
11202 static SDValue getPSHUFB(ArrayRef<int> MaskVals, SDValue V1, SDLoc &dl,
11203                          SelectionDAG &DAG) {
11204   MVT VT = V1.getSimpleValueType();
11205   assert(VT.is128BitVector() || VT.is256BitVector());
11206
11207   MVT EltVT = VT.getVectorElementType();
11208   unsigned EltSizeInBytes = EltVT.getSizeInBits() / 8;
11209   unsigned NumElts = VT.getVectorNumElements();
11210
11211   SmallVector<SDValue, 32> PshufbMask;
11212   for (unsigned OutputIdx = 0; OutputIdx < NumElts; ++OutputIdx) {
11213     int InputIdx = MaskVals[OutputIdx];
11214     unsigned InputByteIdx;
11215
11216     if (InputIdx < 0 || NumElts <= (unsigned)InputIdx)
11217       InputByteIdx = 0x80;
11218     else {
11219       // Cross lane is not allowed.
11220       if (ShuffleCrosses128bitLane(VT, InputIdx, OutputIdx))
11221         return SDValue();
11222       InputByteIdx = InputIdx * EltSizeInBytes;
11223       // Index is an byte offset within the 128-bit lane.
11224       InputByteIdx &= 0xf;
11225     }
11226
11227     for (unsigned j = 0; j < EltSizeInBytes; ++j) {
11228       PshufbMask.push_back(DAG.getConstant(InputByteIdx, MVT::i8));
11229       if (InputByteIdx != 0x80)
11230         ++InputByteIdx;
11231     }
11232   }
11233
11234   MVT ShufVT = MVT::getVectorVT(MVT::i8, PshufbMask.size());
11235   if (ShufVT != VT)
11236     V1 = DAG.getNode(ISD::BITCAST, dl, ShufVT, V1);
11237   return DAG.getNode(X86ISD::PSHUFB, dl, ShufVT, V1,
11238                      DAG.getNode(ISD::BUILD_VECTOR, dl, ShufVT, PshufbMask));
11239 }
11240
11241 // v8i16 shuffles - Prefer shuffles in the following order:
11242 // 1. [all]   pshuflw, pshufhw, optional move
11243 // 2. [ssse3] 1 x pshufb
11244 // 3. [ssse3] 2 x pshufb + 1 x por
11245 // 4. [all]   mov + pshuflw + pshufhw + N x (pextrw + pinsrw)
11246 static SDValue
11247 LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget,
11248                          SelectionDAG &DAG) {
11249   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
11250   SDValue V1 = SVOp->getOperand(0);
11251   SDValue V2 = SVOp->getOperand(1);
11252   SDLoc dl(SVOp);
11253   SmallVector<int, 8> MaskVals;
11254
11255   // Determine if more than 1 of the words in each of the low and high quadwords
11256   // of the result come from the same quadword of one of the two inputs.  Undef
11257   // mask values count as coming from any quadword, for better codegen.
11258   //
11259   // Lo/HiQuad[i] = j indicates how many words from the ith quad of the input
11260   // feeds this quad.  For i, 0 and 1 refer to V1, 2 and 3 refer to V2.
11261   unsigned LoQuad[] = { 0, 0, 0, 0 };
11262   unsigned HiQuad[] = { 0, 0, 0, 0 };
11263   // Indices of quads used.
11264   std::bitset<4> InputQuads;
11265   for (unsigned i = 0; i < 8; ++i) {
11266     unsigned *Quad = i < 4 ? LoQuad : HiQuad;
11267     int EltIdx = SVOp->getMaskElt(i);
11268     MaskVals.push_back(EltIdx);
11269     if (EltIdx < 0) {
11270       ++Quad[0];
11271       ++Quad[1];
11272       ++Quad[2];
11273       ++Quad[3];
11274       continue;
11275     }
11276     ++Quad[EltIdx / 4];
11277     InputQuads.set(EltIdx / 4);
11278   }
11279
11280   int BestLoQuad = -1;
11281   unsigned MaxQuad = 1;
11282   for (unsigned i = 0; i < 4; ++i) {
11283     if (LoQuad[i] > MaxQuad) {
11284       BestLoQuad = i;
11285       MaxQuad = LoQuad[i];
11286     }
11287   }
11288
11289   int BestHiQuad = -1;
11290   MaxQuad = 1;
11291   for (unsigned i = 0; i < 4; ++i) {
11292     if (HiQuad[i] > MaxQuad) {
11293       BestHiQuad = i;
11294       MaxQuad = HiQuad[i];
11295     }
11296   }
11297
11298   // For SSSE3, If all 8 words of the result come from only 1 quadword of each
11299   // of the two input vectors, shuffle them into one input vector so only a
11300   // single pshufb instruction is necessary. If there are more than 2 input
11301   // quads, disable the next transformation since it does not help SSSE3.
11302   bool V1Used = InputQuads[0] || InputQuads[1];
11303   bool V2Used = InputQuads[2] || InputQuads[3];
11304   if (Subtarget->hasSSSE3()) {
11305     if (InputQuads.count() == 2 && V1Used && V2Used) {
11306       BestLoQuad = InputQuads[0] ? 0 : 1;
11307       BestHiQuad = InputQuads[2] ? 2 : 3;
11308     }
11309     if (InputQuads.count() > 2) {
11310       BestLoQuad = -1;
11311       BestHiQuad = -1;
11312     }
11313   }
11314
11315   // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update
11316   // the shuffle mask.  If a quad is scored as -1, that means that it contains
11317   // words from all 4 input quadwords.
11318   SDValue NewV;
11319   if (BestLoQuad >= 0 || BestHiQuad >= 0) {
11320     int MaskV[] = {
11321       BestLoQuad < 0 ? 0 : BestLoQuad,
11322       BestHiQuad < 0 ? 1 : BestHiQuad
11323     };
11324     NewV = DAG.getVectorShuffle(MVT::v2i64, dl,
11325                   DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1),
11326                   DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2), &MaskV[0]);
11327     NewV = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, NewV);
11328
11329     // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the
11330     // source words for the shuffle, to aid later transformations.
11331     bool AllWordsInNewV = true;
11332     bool InOrder[2] = { true, true };
11333     for (unsigned i = 0; i != 8; ++i) {
11334       int idx = MaskVals[i];
11335       if (idx != (int)i)
11336         InOrder[i/4] = false;
11337       if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad)
11338         continue;
11339       AllWordsInNewV = false;
11340       break;
11341     }
11342
11343     bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV;
11344     if (AllWordsInNewV) {
11345       for (int i = 0; i != 8; ++i) {
11346         int idx = MaskVals[i];
11347         if (idx < 0)
11348           continue;
11349         idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4;
11350         if ((idx != i) && idx < 4)
11351           pshufhw = false;
11352         if ((idx != i) && idx > 3)
11353           pshuflw = false;
11354       }
11355       V1 = NewV;
11356       V2Used = false;
11357       BestLoQuad = 0;
11358       BestHiQuad = 1;
11359     }
11360
11361     // If we've eliminated the use of V2, and the new mask is a pshuflw or
11362     // pshufhw, that's as cheap as it gets.  Return the new shuffle.
11363     if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) {
11364       unsigned Opc = pshufhw ? X86ISD::PSHUFHW : X86ISD::PSHUFLW;
11365       unsigned TargetMask = 0;
11366       NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV,
11367                                   DAG.getUNDEF(MVT::v8i16), &MaskVals[0]);
11368       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
11369       TargetMask = pshufhw ? getShufflePSHUFHWImmediate(SVOp):
11370                              getShufflePSHUFLWImmediate(SVOp);
11371       V1 = NewV.getOperand(0);
11372       return getTargetShuffleNode(Opc, dl, MVT::v8i16, V1, TargetMask, DAG);
11373     }
11374   }
11375
11376   // Promote splats to a larger type which usually leads to more efficient code.
11377   // FIXME: Is this true if pshufb is available?
11378   if (SVOp->isSplat())
11379     return PromoteSplat(SVOp, DAG);
11380
11381   // If we have SSSE3, and all words of the result are from 1 input vector,
11382   // case 2 is generated, otherwise case 3 is generated.  If no SSSE3
11383   // is present, fall back to case 4.
11384   if (Subtarget->hasSSSE3()) {
11385     SmallVector<SDValue,16> pshufbMask;
11386
11387     // If we have elements from both input vectors, set the high bit of the
11388     // shuffle mask element to zero out elements that come from V2 in the V1
11389     // mask, and elements that come from V1 in the V2 mask, so that the two
11390     // results can be OR'd together.
11391     bool TwoInputs = V1Used && V2Used;
11392     V1 = getPSHUFB(MaskVals, V1, dl, DAG);
11393     if (!TwoInputs)
11394       return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
11395
11396     // Calculate the shuffle mask for the second input, shuffle it, and
11397     // OR it with the first shuffled input.
11398     CommuteVectorShuffleMask(MaskVals, 8);
11399     V2 = getPSHUFB(MaskVals, V2, dl, DAG);
11400     V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
11401     return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
11402   }
11403
11404   // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order,
11405   // and update MaskVals with new element order.
11406   std::bitset<8> InOrder;
11407   if (BestLoQuad >= 0) {
11408     int MaskV[] = { -1, -1, -1, -1, 4, 5, 6, 7 };
11409     for (int i = 0; i != 4; ++i) {
11410       int idx = MaskVals[i];
11411       if (idx < 0) {
11412         InOrder.set(i);
11413       } else if ((idx / 4) == BestLoQuad) {
11414         MaskV[i] = idx & 3;
11415         InOrder.set(i);
11416       }
11417     }
11418     NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
11419                                 &MaskV[0]);
11420
11421     if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSE2()) {
11422       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
11423       NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16,
11424                                   NewV.getOperand(0),
11425                                   getShufflePSHUFLWImmediate(SVOp), DAG);
11426     }
11427   }
11428
11429   // If BestHi >= 0, generate a pshufhw to put the high elements in order,
11430   // and update MaskVals with the new element order.
11431   if (BestHiQuad >= 0) {
11432     int MaskV[] = { 0, 1, 2, 3, -1, -1, -1, -1 };
11433     for (unsigned i = 4; i != 8; ++i) {
11434       int idx = MaskVals[i];
11435       if (idx < 0) {
11436         InOrder.set(i);
11437       } else if ((idx / 4) == BestHiQuad) {
11438         MaskV[i] = (idx & 3) + 4;
11439         InOrder.set(i);
11440       }
11441     }
11442     NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
11443                                 &MaskV[0]);
11444
11445     if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSE2()) {
11446       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
11447       NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16,
11448                                   NewV.getOperand(0),
11449                                   getShufflePSHUFHWImmediate(SVOp), DAG);
11450     }
11451   }
11452
11453   // In case BestHi & BestLo were both -1, which means each quadword has a word
11454   // from each of the four input quadwords, calculate the InOrder bitvector now
11455   // before falling through to the insert/extract cleanup.
11456   if (BestLoQuad == -1 && BestHiQuad == -1) {
11457     NewV = V1;
11458     for (int i = 0; i != 8; ++i)
11459       if (MaskVals[i] < 0 || MaskVals[i] == i)
11460         InOrder.set(i);
11461   }
11462
11463   // The other elements are put in the right place using pextrw and pinsrw.
11464   for (unsigned i = 0; i != 8; ++i) {
11465     if (InOrder[i])
11466       continue;
11467     int EltIdx = MaskVals[i];
11468     if (EltIdx < 0)
11469       continue;
11470     SDValue ExtOp = (EltIdx < 8) ?
11471       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1,
11472                   DAG.getIntPtrConstant(EltIdx)) :
11473       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2,
11474                   DAG.getIntPtrConstant(EltIdx - 8));
11475     NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp,
11476                        DAG.getIntPtrConstant(i));
11477   }
11478   return NewV;
11479 }
11480
11481 /// \brief v16i16 shuffles
11482 ///
11483 /// FIXME: We only support generation of a single pshufb currently.  We can
11484 /// generalize the other applicable cases from LowerVECTOR_SHUFFLEv8i16 as
11485 /// well (e.g 2 x pshufb + 1 x por).
11486 static SDValue
11487 LowerVECTOR_SHUFFLEv16i16(SDValue Op, SelectionDAG &DAG) {
11488   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
11489   SDValue V1 = SVOp->getOperand(0);
11490   SDValue V2 = SVOp->getOperand(1);
11491   SDLoc dl(SVOp);
11492
11493   if (V2.getOpcode() != ISD::UNDEF)
11494     return SDValue();
11495
11496   SmallVector<int, 16> MaskVals(SVOp->getMask().begin(), SVOp->getMask().end());
11497   return getPSHUFB(MaskVals, V1, dl, DAG);
11498 }
11499
11500 // v16i8 shuffles - Prefer shuffles in the following order:
11501 // 1. [ssse3] 1 x pshufb
11502 // 2. [ssse3] 2 x pshufb + 1 x por
11503 // 3. [all]   v8i16 shuffle + N x pextrw + rotate + pinsrw
11504 static SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
11505                                         const X86Subtarget* Subtarget,
11506                                         SelectionDAG &DAG) {
11507   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
11508   SDValue V1 = SVOp->getOperand(0);
11509   SDValue V2 = SVOp->getOperand(1);
11510   SDLoc dl(SVOp);
11511   ArrayRef<int> MaskVals = SVOp->getMask();
11512
11513   // Promote splats to a larger type which usually leads to more efficient code.
11514   // FIXME: Is this true if pshufb is available?
11515   if (SVOp->isSplat())
11516     return PromoteSplat(SVOp, DAG);
11517
11518   // If we have SSSE3, case 1 is generated when all result bytes come from
11519   // one of  the inputs.  Otherwise, case 2 is generated.  If no SSSE3 is
11520   // present, fall back to case 3.
11521
11522   // If SSSE3, use 1 pshufb instruction per vector with elements in the result.
11523   if (Subtarget->hasSSSE3()) {
11524     SmallVector<SDValue,16> pshufbMask;
11525
11526     // If all result elements are from one input vector, then only translate
11527     // undef mask values to 0x80 (zero out result) in the pshufb mask.
11528     //
11529     // Otherwise, we have elements from both input vectors, and must zero out
11530     // elements that come from V2 in the first mask, and V1 in the second mask
11531     // so that we can OR them together.
11532     for (unsigned i = 0; i != 16; ++i) {
11533       int EltIdx = MaskVals[i];
11534       if (EltIdx < 0 || EltIdx >= 16)
11535         EltIdx = 0x80;
11536       pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
11537     }
11538     V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
11539                      DAG.getNode(ISD::BUILD_VECTOR, dl,
11540                                  MVT::v16i8, pshufbMask));
11541
11542     // As PSHUFB will zero elements with negative indices, it's safe to ignore
11543     // the 2nd operand if it's undefined or zero.
11544     if (V2.getOpcode() == ISD::UNDEF ||
11545         ISD::isBuildVectorAllZeros(V2.getNode()))
11546       return V1;
11547
11548     // Calculate the shuffle mask for the second input, shuffle it, and
11549     // OR it with the first shuffled input.
11550     pshufbMask.clear();
11551     for (unsigned i = 0; i != 16; ++i) {
11552       int EltIdx = MaskVals[i];
11553       EltIdx = (EltIdx < 16) ? 0x80 : EltIdx - 16;
11554       pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
11555     }
11556     V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
11557                      DAG.getNode(ISD::BUILD_VECTOR, dl,
11558                                  MVT::v16i8, pshufbMask));
11559     return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
11560   }
11561
11562   // No SSSE3 - Calculate in place words and then fix all out of place words
11563   // With 0-16 extracts & inserts.  Worst case is 16 bytes out of order from
11564   // the 16 different words that comprise the two doublequadword input vectors.
11565   V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
11566   V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2);
11567   SDValue NewV = V1;
11568   for (int i = 0; i != 8; ++i) {
11569     int Elt0 = MaskVals[i*2];
11570     int Elt1 = MaskVals[i*2+1];
11571
11572     // This word of the result is all undef, skip it.
11573     if (Elt0 < 0 && Elt1 < 0)
11574       continue;
11575
11576     // This word of the result is already in the correct place, skip it.
11577     if ((Elt0 == i*2) && (Elt1 == i*2+1))
11578       continue;
11579
11580     SDValue Elt0Src = Elt0 < 16 ? V1 : V2;
11581     SDValue Elt1Src = Elt1 < 16 ? V1 : V2;
11582     SDValue InsElt;
11583
11584     // If Elt0 and Elt1 are defined, are consecutive, and can be load
11585     // using a single extract together, load it and store it.
11586     if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) {
11587       InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
11588                            DAG.getIntPtrConstant(Elt1 / 2));
11589       NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
11590                         DAG.getIntPtrConstant(i));
11591       continue;
11592     }
11593
11594     // If Elt1 is defined, extract it from the appropriate source.  If the
11595     // source byte is not also odd, shift the extracted word left 8 bits
11596     // otherwise clear the bottom 8 bits if we need to do an or.
11597     if (Elt1 >= 0) {
11598       InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
11599                            DAG.getIntPtrConstant(Elt1 / 2));
11600       if ((Elt1 & 1) == 0)
11601         InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt,
11602                              DAG.getConstant(8,
11603                                   TLI.getShiftAmountTy(InsElt.getValueType())));
11604       else if (Elt0 >= 0)
11605         InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt,
11606                              DAG.getConstant(0xFF00, MVT::i16));
11607     }
11608     // If Elt0 is defined, extract it from the appropriate source.  If the
11609     // source byte is not also even, shift the extracted word right 8 bits. If
11610     // Elt1 was also defined, OR the extracted values together before
11611     // inserting them in the result.
11612     if (Elt0 >= 0) {
11613       SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
11614                                     Elt0Src, DAG.getIntPtrConstant(Elt0 / 2));
11615       if ((Elt0 & 1) != 0)
11616         InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0,
11617                               DAG.getConstant(8,
11618                                  TLI.getShiftAmountTy(InsElt0.getValueType())));
11619       else if (Elt1 >= 0)
11620         InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0,
11621                              DAG.getConstant(0x00FF, MVT::i16));
11622       InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0)
11623                          : InsElt0;
11624     }
11625     NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
11626                        DAG.getIntPtrConstant(i));
11627   }
11628   return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, NewV);
11629 }
11630
11631 // v32i8 shuffles - Translate to VPSHUFB if possible.
11632 static
11633 SDValue LowerVECTOR_SHUFFLEv32i8(ShuffleVectorSDNode *SVOp,
11634                                  const X86Subtarget *Subtarget,
11635                                  SelectionDAG &DAG) {
11636   MVT VT = SVOp->getSimpleValueType(0);
11637   SDValue V1 = SVOp->getOperand(0);
11638   SDValue V2 = SVOp->getOperand(1);
11639   SDLoc dl(SVOp);
11640   SmallVector<int, 32> MaskVals(SVOp->getMask().begin(), SVOp->getMask().end());
11641
11642   bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
11643   bool V1IsAllZero = ISD::isBuildVectorAllZeros(V1.getNode());
11644   bool V2IsAllZero = ISD::isBuildVectorAllZeros(V2.getNode());
11645
11646   // VPSHUFB may be generated if
11647   // (1) one of input vector is undefined or zeroinitializer.
11648   // The mask value 0x80 puts 0 in the corresponding slot of the vector.
11649   // And (2) the mask indexes don't cross the 128-bit lane.
11650   if (VT != MVT::v32i8 || !Subtarget->hasInt256() ||
11651       (!V2IsUndef && !V2IsAllZero && !V1IsAllZero))
11652     return SDValue();
11653
11654   if (V1IsAllZero && !V2IsAllZero) {
11655     CommuteVectorShuffleMask(MaskVals, 32);
11656     V1 = V2;
11657   }
11658   return getPSHUFB(MaskVals, V1, dl, DAG);
11659 }
11660
11661 /// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide
11662 /// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be
11663 /// done when every pair / quad of shuffle mask elements point to elements in
11664 /// the right sequence. e.g.
11665 /// vector_shuffle X, Y, <2, 3, | 10, 11, | 0, 1, | 14, 15>
11666 static
11667 SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
11668                                  SelectionDAG &DAG) {
11669   MVT VT = SVOp->getSimpleValueType(0);
11670   SDLoc dl(SVOp);
11671   unsigned NumElems = VT.getVectorNumElements();
11672   MVT NewVT;
11673   unsigned Scale;
11674   switch (VT.SimpleTy) {
11675   default: llvm_unreachable("Unexpected!");
11676   case MVT::v2i64:
11677   case MVT::v2f64:
11678            return SDValue(SVOp, 0);
11679   case MVT::v4f32:  NewVT = MVT::v2f64; Scale = 2; break;
11680   case MVT::v4i32:  NewVT = MVT::v2i64; Scale = 2; break;
11681   case MVT::v8i16:  NewVT = MVT::v4i32; Scale = 2; break;
11682   case MVT::v16i8:  NewVT = MVT::v4i32; Scale = 4; break;
11683   case MVT::v16i16: NewVT = MVT::v8i32; Scale = 2; break;
11684   case MVT::v32i8:  NewVT = MVT::v8i32; Scale = 4; break;
11685   }
11686
11687   SmallVector<int, 8> MaskVec;
11688   for (unsigned i = 0; i != NumElems; i += Scale) {
11689     int StartIdx = -1;
11690     for (unsigned j = 0; j != Scale; ++j) {
11691       int EltIdx = SVOp->getMaskElt(i+j);
11692       if (EltIdx < 0)
11693         continue;
11694       if (StartIdx < 0)
11695         StartIdx = (EltIdx / Scale);
11696       if (EltIdx != (int)(StartIdx*Scale + j))
11697         return SDValue();
11698     }
11699     MaskVec.push_back(StartIdx);
11700   }
11701
11702   SDValue V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(0));
11703   SDValue V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(1));
11704   return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]);
11705 }
11706
11707 /// getVZextMovL - Return a zero-extending vector move low node.
11708 ///
11709 static SDValue getVZextMovL(MVT VT, MVT OpVT,
11710                             SDValue SrcOp, SelectionDAG &DAG,
11711                             const X86Subtarget *Subtarget, SDLoc dl) {
11712   if (VT == MVT::v2f64 || VT == MVT::v4f32) {
11713     LoadSDNode *LD = nullptr;
11714     if (!isScalarLoadToVector(SrcOp.getNode(), &LD))
11715       LD = dyn_cast<LoadSDNode>(SrcOp);
11716     if (!LD) {
11717       // movssrr and movsdrr do not clear top bits. Try to use movd, movq
11718       // instead.
11719       MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32;
11720       if ((ExtVT != MVT::i64 || Subtarget->is64Bit()) &&
11721           SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR &&
11722           SrcOp.getOperand(0).getOpcode() == ISD::BITCAST &&
11723           SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) {
11724         // PR2108
11725         OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32;
11726         return DAG.getNode(ISD::BITCAST, dl, VT,
11727                            DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
11728                                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
11729                                                    OpVT,
11730                                                    SrcOp.getOperand(0)
11731                                                           .getOperand(0))));
11732       }
11733     }
11734   }
11735
11736   return DAG.getNode(ISD::BITCAST, dl, VT,
11737                      DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
11738                                  DAG.getNode(ISD::BITCAST, dl,
11739                                              OpVT, SrcOp)));
11740 }
11741
11742 /// LowerVECTOR_SHUFFLE_256 - Handle all 256-bit wide vectors shuffles
11743 /// which could not be matched by any known target speficic shuffle
11744 static SDValue
11745 LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
11746
11747   SDValue NewOp = Compact8x32ShuffleNode(SVOp, DAG);
11748   if (NewOp.getNode())
11749     return NewOp;
11750
11751   MVT VT = SVOp->getSimpleValueType(0);
11752
11753   unsigned NumElems = VT.getVectorNumElements();
11754   unsigned NumLaneElems = NumElems / 2;
11755
11756   SDLoc dl(SVOp);
11757   MVT EltVT = VT.getVectorElementType();
11758   MVT NVT = MVT::getVectorVT(EltVT, NumLaneElems);
11759   SDValue Output[2];
11760
11761   SmallVector<int, 16> Mask;
11762   for (unsigned l = 0; l < 2; ++l) {
11763     // Build a shuffle mask for the output, discovering on the fly which
11764     // input vectors to use as shuffle operands (recorded in InputUsed).
11765     // If building a suitable shuffle vector proves too hard, then bail
11766     // out with UseBuildVector set.
11767     bool UseBuildVector = false;
11768     int InputUsed[2] = { -1, -1 }; // Not yet discovered.
11769     unsigned LaneStart = l * NumLaneElems;
11770     for (unsigned i = 0; i != NumLaneElems; ++i) {
11771       // The mask element.  This indexes into the input.
11772       int Idx = SVOp->getMaskElt(i+LaneStart);
11773       if (Idx < 0) {
11774         // the mask element does not index into any input vector.
11775         Mask.push_back(-1);
11776         continue;
11777       }
11778
11779       // The input vector this mask element indexes into.
11780       int Input = Idx / NumLaneElems;
11781
11782       // Turn the index into an offset from the start of the input vector.
11783       Idx -= Input * NumLaneElems;
11784
11785       // Find or create a shuffle vector operand to hold this input.
11786       unsigned OpNo;
11787       for (OpNo = 0; OpNo < array_lengthof(InputUsed); ++OpNo) {
11788         if (InputUsed[OpNo] == Input)
11789           // This input vector is already an operand.
11790           break;
11791         if (InputUsed[OpNo] < 0) {
11792           // Create a new operand for this input vector.
11793           InputUsed[OpNo] = Input;
11794           break;
11795         }
11796       }
11797
11798       if (OpNo >= array_lengthof(InputUsed)) {
11799         // More than two input vectors used!  Give up on trying to create a
11800         // shuffle vector.  Insert all elements into a BUILD_VECTOR instead.
11801         UseBuildVector = true;
11802         break;
11803       }
11804
11805       // Add the mask index for the new shuffle vector.
11806       Mask.push_back(Idx + OpNo * NumLaneElems);
11807     }
11808
11809     if (UseBuildVector) {
11810       SmallVector<SDValue, 16> SVOps;
11811       for (unsigned i = 0; i != NumLaneElems; ++i) {
11812         // The mask element.  This indexes into the input.
11813         int Idx = SVOp->getMaskElt(i+LaneStart);
11814         if (Idx < 0) {
11815           SVOps.push_back(DAG.getUNDEF(EltVT));
11816           continue;
11817         }
11818
11819         // The input vector this mask element indexes into.
11820         int Input = Idx / NumElems;
11821
11822         // Turn the index into an offset from the start of the input vector.
11823         Idx -= Input * NumElems;
11824
11825         // Extract the vector element by hand.
11826         SVOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
11827                                     SVOp->getOperand(Input),
11828                                     DAG.getIntPtrConstant(Idx)));
11829       }
11830
11831       // Construct the output using a BUILD_VECTOR.
11832       Output[l] = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, SVOps);
11833     } else if (InputUsed[0] < 0) {
11834       // No input vectors were used! The result is undefined.
11835       Output[l] = DAG.getUNDEF(NVT);
11836     } else {
11837       SDValue Op0 = Extract128BitVector(SVOp->getOperand(InputUsed[0] / 2),
11838                                         (InputUsed[0] % 2) * NumLaneElems,
11839                                         DAG, dl);
11840       // If only one input was used, use an undefined vector for the other.
11841       SDValue Op1 = (InputUsed[1] < 0) ? DAG.getUNDEF(NVT) :
11842         Extract128BitVector(SVOp->getOperand(InputUsed[1] / 2),
11843                             (InputUsed[1] % 2) * NumLaneElems, DAG, dl);
11844       // At least one input vector was used. Create a new shuffle vector.
11845       Output[l] = DAG.getVectorShuffle(NVT, dl, Op0, Op1, &Mask[0]);
11846     }
11847
11848     Mask.clear();
11849   }
11850
11851   // Concatenate the result back
11852   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Output[0], Output[1]);
11853 }
11854
11855 /// LowerVECTOR_SHUFFLE_128v4 - Handle all 128-bit wide vectors with
11856 /// 4 elements, and match them with several different shuffle types.
11857 static SDValue
11858 LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
11859   SDValue V1 = SVOp->getOperand(0);
11860   SDValue V2 = SVOp->getOperand(1);
11861   SDLoc dl(SVOp);
11862   MVT VT = SVOp->getSimpleValueType(0);
11863
11864   assert(VT.is128BitVector() && "Unsupported vector size");
11865
11866   std::pair<int, int> Locs[4];
11867   int Mask1[] = { -1, -1, -1, -1 };
11868   SmallVector<int, 8> PermMask(SVOp->getMask().begin(), SVOp->getMask().end());
11869
11870   unsigned NumHi = 0;
11871   unsigned NumLo = 0;
11872   for (unsigned i = 0; i != 4; ++i) {
11873     int Idx = PermMask[i];
11874     if (Idx < 0) {
11875       Locs[i] = std::make_pair(-1, -1);
11876     } else {
11877       assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!");
11878       if (Idx < 4) {
11879         Locs[i] = std::make_pair(0, NumLo);
11880         Mask1[NumLo] = Idx;
11881         NumLo++;
11882       } else {
11883         Locs[i] = std::make_pair(1, NumHi);
11884         if (2+NumHi < 4)
11885           Mask1[2+NumHi] = Idx;
11886         NumHi++;
11887       }
11888     }
11889   }
11890
11891   if (NumLo <= 2 && NumHi <= 2) {
11892     // If no more than two elements come from either vector. This can be
11893     // implemented with two shuffles. First shuffle gather the elements.
11894     // The second shuffle, which takes the first shuffle as both of its
11895     // vector operands, put the elements into the right order.
11896     V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
11897
11898     int Mask2[] = { -1, -1, -1, -1 };
11899
11900     for (unsigned i = 0; i != 4; ++i)
11901       if (Locs[i].first != -1) {
11902         unsigned Idx = (i < 2) ? 0 : 4;
11903         Idx += Locs[i].first * 2 + Locs[i].second;
11904         Mask2[i] = Idx;
11905       }
11906
11907     return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]);
11908   }
11909
11910   if (NumLo == 3 || NumHi == 3) {
11911     // Otherwise, we must have three elements from one vector, call it X, and
11912     // one element from the other, call it Y.  First, use a shufps to build an
11913     // intermediate vector with the one element from Y and the element from X
11914     // that will be in the same half in the final destination (the indexes don't
11915     // matter). Then, use a shufps to build the final vector, taking the half
11916     // containing the element from Y from the intermediate, and the other half
11917     // from X.
11918     if (NumHi == 3) {
11919       // Normalize it so the 3 elements come from V1.
11920       CommuteVectorShuffleMask(PermMask, 4);
11921       std::swap(V1, V2);
11922     }
11923
11924     // Find the element from V2.
11925     unsigned HiIndex;
11926     for (HiIndex = 0; HiIndex < 3; ++HiIndex) {
11927       int Val = PermMask[HiIndex];
11928       if (Val < 0)
11929         continue;
11930       if (Val >= 4)
11931         break;
11932     }
11933
11934     Mask1[0] = PermMask[HiIndex];
11935     Mask1[1] = -1;
11936     Mask1[2] = PermMask[HiIndex^1];
11937     Mask1[3] = -1;
11938     V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
11939
11940     if (HiIndex >= 2) {
11941       Mask1[0] = PermMask[0];
11942       Mask1[1] = PermMask[1];
11943       Mask1[2] = HiIndex & 1 ? 6 : 4;
11944       Mask1[3] = HiIndex & 1 ? 4 : 6;
11945       return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
11946     }
11947
11948     Mask1[0] = HiIndex & 1 ? 2 : 0;
11949     Mask1[1] = HiIndex & 1 ? 0 : 2;
11950     Mask1[2] = PermMask[2];
11951     Mask1[3] = PermMask[3];
11952     if (Mask1[2] >= 0)
11953       Mask1[2] += 4;
11954     if (Mask1[3] >= 0)
11955       Mask1[3] += 4;
11956     return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]);
11957   }
11958
11959   // Break it into (shuffle shuffle_hi, shuffle_lo).
11960   int LoMask[] = { -1, -1, -1, -1 };
11961   int HiMask[] = { -1, -1, -1, -1 };
11962
11963   int *MaskPtr = LoMask;
11964   unsigned MaskIdx = 0;
11965   unsigned LoIdx = 0;
11966   unsigned HiIdx = 2;
11967   for (unsigned i = 0; i != 4; ++i) {
11968     if (i == 2) {
11969       MaskPtr = HiMask;
11970       MaskIdx = 1;
11971       LoIdx = 0;
11972       HiIdx = 2;
11973     }
11974     int Idx = PermMask[i];
11975     if (Idx < 0) {
11976       Locs[i] = std::make_pair(-1, -1);
11977     } else if (Idx < 4) {
11978       Locs[i] = std::make_pair(MaskIdx, LoIdx);
11979       MaskPtr[LoIdx] = Idx;
11980       LoIdx++;
11981     } else {
11982       Locs[i] = std::make_pair(MaskIdx, HiIdx);
11983       MaskPtr[HiIdx] = Idx;
11984       HiIdx++;
11985     }
11986   }
11987
11988   SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]);
11989   SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]);
11990   int MaskOps[] = { -1, -1, -1, -1 };
11991   for (unsigned i = 0; i != 4; ++i)
11992     if (Locs[i].first != -1)
11993       MaskOps[i] = Locs[i].first * 4 + Locs[i].second;
11994   return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]);
11995 }
11996
11997 static bool MayFoldVectorLoad(SDValue V) {
11998   while (V.hasOneUse() && V.getOpcode() == ISD::BITCAST)
11999     V = V.getOperand(0);
12000
12001   if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR)
12002     V = V.getOperand(0);
12003   if (V.hasOneUse() && V.getOpcode() == ISD::BUILD_VECTOR &&
12004       V.getNumOperands() == 2 && V.getOperand(1).getOpcode() == ISD::UNDEF)
12005     // BUILD_VECTOR (load), undef
12006     V = V.getOperand(0);
12007
12008   return MayFoldLoad(V);
12009 }
12010
12011 static
12012 SDValue getMOVDDup(SDValue &Op, SDLoc &dl, SDValue V1, SelectionDAG &DAG) {
12013   MVT VT = Op.getSimpleValueType();
12014
12015   // Canonizalize to v2f64.
12016   V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1);
12017   return DAG.getNode(ISD::BITCAST, dl, VT,
12018                      getTargetShuffleNode(X86ISD::MOVDDUP, dl, MVT::v2f64,
12019                                           V1, DAG));
12020 }
12021
12022 static
12023 SDValue getMOVLowToHigh(SDValue &Op, SDLoc &dl, SelectionDAG &DAG,
12024                         bool HasSSE2) {
12025   SDValue V1 = Op.getOperand(0);
12026   SDValue V2 = Op.getOperand(1);
12027   MVT VT = Op.getSimpleValueType();
12028
12029   assert(VT != MVT::v2i64 && "unsupported shuffle type");
12030
12031   if (HasSSE2 && VT == MVT::v2f64)
12032     return getTargetShuffleNode(X86ISD::MOVLHPD, dl, VT, V1, V2, DAG);
12033
12034   // v4f32 or v4i32: canonizalized to v4f32 (which is legal for SSE1)
12035   return DAG.getNode(ISD::BITCAST, dl, VT,
12036                      getTargetShuffleNode(X86ISD::MOVLHPS, dl, MVT::v4f32,
12037                            DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V1),
12038                            DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V2), DAG));
12039 }
12040
12041 static
12042 SDValue getMOVHighToLow(SDValue &Op, SDLoc &dl, SelectionDAG &DAG) {
12043   SDValue V1 = Op.getOperand(0);
12044   SDValue V2 = Op.getOperand(1);
12045   MVT VT = Op.getSimpleValueType();
12046
12047   assert((VT == MVT::v4i32 || VT == MVT::v4f32) &&
12048          "unsupported shuffle type");
12049
12050   if (V2.getOpcode() == ISD::UNDEF)
12051     V2 = V1;
12052
12053   // v4i32 or v4f32
12054   return getTargetShuffleNode(X86ISD::MOVHLPS, dl, VT, V1, V2, DAG);
12055 }
12056
12057 static
12058 SDValue getMOVLP(SDValue &Op, SDLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
12059   SDValue V1 = Op.getOperand(0);
12060   SDValue V2 = Op.getOperand(1);
12061   MVT VT = Op.getSimpleValueType();
12062   unsigned NumElems = VT.getVectorNumElements();
12063
12064   // Use MOVLPS and MOVLPD in case V1 or V2 are loads. During isel, the second
12065   // operand of these instructions is only memory, so check if there's a
12066   // potencial load folding here, otherwise use SHUFPS or MOVSD to match the
12067   // same masks.
12068   bool CanFoldLoad = false;
12069
12070   // Trivial case, when V2 comes from a load.
12071   if (MayFoldVectorLoad(V2))
12072     CanFoldLoad = true;
12073
12074   // When V1 is a load, it can be folded later into a store in isel, example:
12075   //  (store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), addr:$src1)
12076   //    turns into:
12077   //  (MOVLPSmr addr:$src1, VR128:$src2)
12078   // So, recognize this potential and also use MOVLPS or MOVLPD
12079   else if (MayFoldVectorLoad(V1) && MayFoldIntoStore(Op))
12080     CanFoldLoad = true;
12081
12082   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
12083   if (CanFoldLoad) {
12084     if (HasSSE2 && NumElems == 2)
12085       return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG);
12086
12087     if (NumElems == 4)
12088       // If we don't care about the second element, proceed to use movss.
12089       if (SVOp->getMaskElt(1) != -1)
12090         return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG);
12091   }
12092
12093   // movl and movlp will both match v2i64, but v2i64 is never matched by
12094   // movl earlier because we make it strict to avoid messing with the movlp load
12095   // folding logic (see the code above getMOVLP call). Match it here then,
12096   // this is horrible, but will stay like this until we move all shuffle
12097   // matching to x86 specific nodes. Note that for the 1st condition all
12098   // types are matched with movsd.
12099   if (HasSSE2) {
12100     // FIXME: isMOVLMask should be checked and matched before getMOVLP,
12101     // as to remove this logic from here, as much as possible
12102     if (NumElems == 2 || !isMOVLMask(SVOp->getMask(), VT))
12103       return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
12104     return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
12105   }
12106
12107   assert(VT != MVT::v4i32 && "unsupported shuffle type");
12108
12109   // Invert the operand order and use SHUFPS to match it.
12110   return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V2, V1,
12111                               getShuffleSHUFImmediate(SVOp), DAG);
12112 }
12113
12114 static SDValue NarrowVectorLoadToElement(LoadSDNode *Load, unsigned Index,
12115                                          SelectionDAG &DAG) {
12116   SDLoc dl(Load);
12117   MVT VT = Load->getSimpleValueType(0);
12118   MVT EVT = VT.getVectorElementType();
12119   SDValue Addr = Load->getOperand(1);
12120   SDValue NewAddr = DAG.getNode(
12121       ISD::ADD, dl, Addr.getSimpleValueType(), Addr,
12122       DAG.getConstant(Index * EVT.getStoreSize(), Addr.getSimpleValueType()));
12123
12124   SDValue NewLoad =
12125       DAG.getLoad(EVT, dl, Load->getChain(), NewAddr,
12126                   DAG.getMachineFunction().getMachineMemOperand(
12127                       Load->getMemOperand(), 0, EVT.getStoreSize()));
12128   return NewLoad;
12129 }
12130
12131 // It is only safe to call this function if isINSERTPSMask is true for
12132 // this shufflevector mask.
12133 static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl,
12134                            SelectionDAG &DAG) {
12135   // Generate an insertps instruction when inserting an f32 from memory onto a
12136   // v4f32 or when copying a member from one v4f32 to another.
12137   // We also use it for transferring i32 from one register to another,
12138   // since it simply copies the same bits.
12139   // If we're transferring an i32 from memory to a specific element in a
12140   // register, we output a generic DAG that will match the PINSRD
12141   // instruction.
12142   MVT VT = SVOp->getSimpleValueType(0);
12143   MVT EVT = VT.getVectorElementType();
12144   SDValue V1 = SVOp->getOperand(0);
12145   SDValue V2 = SVOp->getOperand(1);
12146   auto Mask = SVOp->getMask();
12147   assert((VT == MVT::v4f32 || VT == MVT::v4i32) &&
12148          "unsupported vector type for insertps/pinsrd");
12149
12150   auto FromV1Predicate = [](const int &i) { return i < 4 && i > -1; };
12151   auto FromV2Predicate = [](const int &i) { return i >= 4; };
12152   int FromV1 = std::count_if(Mask.begin(), Mask.end(), FromV1Predicate);
12153
12154   SDValue From;
12155   SDValue To;
12156   unsigned DestIndex;
12157   if (FromV1 == 1) {
12158     From = V1;
12159     To = V2;
12160     DestIndex = std::find_if(Mask.begin(), Mask.end(), FromV1Predicate) -
12161                 Mask.begin();
12162
12163     // If we have 1 element from each vector, we have to check if we're
12164     // changing V1's element's place. If so, we're done. Otherwise, we
12165     // should assume we're changing V2's element's place and behave
12166     // accordingly.
12167     int FromV2 = std::count_if(Mask.begin(), Mask.end(), FromV2Predicate);
12168     assert(DestIndex <= INT32_MAX && "truncated destination index");
12169     if (FromV1 == FromV2 &&
12170         static_cast<int>(DestIndex) == Mask[DestIndex] % 4) {
12171       From = V2;
12172       To = V1;
12173       DestIndex =
12174           std::find_if(Mask.begin(), Mask.end(), FromV2Predicate) - Mask.begin();
12175     }
12176   } else {
12177     assert(std::count_if(Mask.begin(), Mask.end(), FromV2Predicate) == 1 &&
12178            "More than one element from V1 and from V2, or no elements from one "
12179            "of the vectors. This case should not have returned true from "
12180            "isINSERTPSMask");
12181     From = V2;
12182     To = V1;
12183     DestIndex =
12184         std::find_if(Mask.begin(), Mask.end(), FromV2Predicate) - Mask.begin();
12185   }
12186
12187   // Get an index into the source vector in the range [0,4) (the mask is
12188   // in the range [0,8) because it can address V1 and V2)
12189   unsigned SrcIndex = Mask[DestIndex] % 4;
12190   if (MayFoldLoad(From)) {
12191     // Trivial case, when From comes from a load and is only used by the
12192     // shuffle. Make it use insertps from the vector that we need from that
12193     // load.
12194     SDValue NewLoad =
12195         NarrowVectorLoadToElement(cast<LoadSDNode>(From), SrcIndex, DAG);
12196     if (!NewLoad.getNode())
12197       return SDValue();
12198
12199     if (EVT == MVT::f32) {
12200       // Create this as a scalar to vector to match the instruction pattern.
12201       SDValue LoadScalarToVector =
12202           DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, NewLoad);
12203       SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4);
12204       return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, LoadScalarToVector,
12205                          InsertpsMask);
12206     } else { // EVT == MVT::i32
12207       // If we're getting an i32 from memory, use an INSERT_VECTOR_ELT
12208       // instruction, to match the PINSRD instruction, which loads an i32 to a
12209       // certain vector element.
12210       return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, To, NewLoad,
12211                          DAG.getConstant(DestIndex, MVT::i32));
12212     }
12213   }
12214
12215   // Vector-element-to-vector
12216   SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4 | SrcIndex << 6);
12217   return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, From, InsertpsMask);
12218 }
12219
12220 // Reduce a vector shuffle to zext.
12221 static SDValue LowerVectorIntExtend(SDValue Op, const X86Subtarget *Subtarget,
12222                                     SelectionDAG &DAG) {
12223   // PMOVZX is only available from SSE41.
12224   if (!Subtarget->hasSSE41())
12225     return SDValue();
12226
12227   MVT VT = Op.getSimpleValueType();
12228
12229   // Only AVX2 support 256-bit vector integer extending.
12230   if (!Subtarget->hasInt256() && VT.is256BitVector())
12231     return SDValue();
12232
12233   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
12234   SDLoc DL(Op);
12235   SDValue V1 = Op.getOperand(0);
12236   SDValue V2 = Op.getOperand(1);
12237   unsigned NumElems = VT.getVectorNumElements();
12238
12239   // Extending is an unary operation and the element type of the source vector
12240   // won't be equal to or larger than i64.
12241   if (V2.getOpcode() != ISD::UNDEF || !VT.isInteger() ||
12242       VT.getVectorElementType() == MVT::i64)
12243     return SDValue();
12244
12245   // Find the expansion ratio, e.g. expanding from i8 to i32 has a ratio of 4.
12246   unsigned Shift = 1; // Start from 2, i.e. 1 << 1.
12247   while ((1U << Shift) < NumElems) {
12248     if (SVOp->getMaskElt(1U << Shift) == 1)
12249       break;
12250     Shift += 1;
12251     // The maximal ratio is 8, i.e. from i8 to i64.
12252     if (Shift > 3)
12253       return SDValue();
12254   }
12255
12256   // Check the shuffle mask.
12257   unsigned Mask = (1U << Shift) - 1;
12258   for (unsigned i = 0; i != NumElems; ++i) {
12259     int EltIdx = SVOp->getMaskElt(i);
12260     if ((i & Mask) != 0 && EltIdx != -1)
12261       return SDValue();
12262     if ((i & Mask) == 0 && (unsigned)EltIdx != (i >> Shift))
12263       return SDValue();
12264   }
12265
12266   unsigned NBits = VT.getVectorElementType().getSizeInBits() << Shift;
12267   MVT NeVT = MVT::getIntegerVT(NBits);
12268   MVT NVT = MVT::getVectorVT(NeVT, NumElems >> Shift);
12269
12270   if (!DAG.getTargetLoweringInfo().isTypeLegal(NVT))
12271     return SDValue();
12272
12273   return DAG.getNode(ISD::BITCAST, DL, VT,
12274                      DAG.getNode(X86ISD::VZEXT, DL, NVT, V1));
12275 }
12276
12277 static SDValue NormalizeVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
12278                                       SelectionDAG &DAG) {
12279   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
12280   MVT VT = Op.getSimpleValueType();
12281   SDLoc dl(Op);
12282   SDValue V1 = Op.getOperand(0);
12283   SDValue V2 = Op.getOperand(1);
12284
12285   if (isZeroShuffle(SVOp))
12286     return getZeroVector(VT, Subtarget, DAG, dl);
12287
12288   // Handle splat operations
12289   if (SVOp->isSplat()) {
12290     // Use vbroadcast whenever the splat comes from a foldable load
12291     SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG);
12292     if (Broadcast.getNode())
12293       return Broadcast;
12294   }
12295
12296   // Check integer expanding shuffles.
12297   SDValue NewOp = LowerVectorIntExtend(Op, Subtarget, DAG);
12298   if (NewOp.getNode())
12299     return NewOp;
12300
12301   // If the shuffle can be profitably rewritten as a narrower shuffle, then
12302   // do it!
12303   if (VT == MVT::v8i16 || VT == MVT::v16i8 || VT == MVT::v16i16 ||
12304       VT == MVT::v32i8) {
12305     SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
12306     if (NewOp.getNode())
12307       return DAG.getNode(ISD::BITCAST, dl, VT, NewOp);
12308   } else if (VT.is128BitVector() && Subtarget->hasSSE2()) {
12309     // FIXME: Figure out a cleaner way to do this.
12310     if (ISD::isBuildVectorAllZeros(V2.getNode())) {
12311       SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
12312       if (NewOp.getNode()) {
12313         MVT NewVT = NewOp.getSimpleValueType();
12314         if (isCommutedMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(),
12315                                NewVT, true, false))
12316           return getVZextMovL(VT, NewVT, NewOp.getOperand(0), DAG, Subtarget,
12317                               dl);
12318       }
12319     } else if (ISD::isBuildVectorAllZeros(V1.getNode())) {
12320       SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
12321       if (NewOp.getNode()) {
12322         MVT NewVT = NewOp.getSimpleValueType();
12323         if (isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), NewVT))
12324           return getVZextMovL(VT, NewVT, NewOp.getOperand(1), DAG, Subtarget,
12325                               dl);
12326       }
12327     }
12328   }
12329   return SDValue();
12330 }
12331
12332 SDValue
12333 X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
12334   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
12335   SDValue V1 = Op.getOperand(0);
12336   SDValue V2 = Op.getOperand(1);
12337   MVT VT = Op.getSimpleValueType();
12338   SDLoc dl(Op);
12339   unsigned NumElems = VT.getVectorNumElements();
12340   bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
12341   bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
12342   bool V1IsSplat = false;
12343   bool V2IsSplat = false;
12344   bool HasSSE2 = Subtarget->hasSSE2();
12345   bool HasFp256    = Subtarget->hasFp256();
12346   bool HasInt256   = Subtarget->hasInt256();
12347   MachineFunction &MF = DAG.getMachineFunction();
12348   bool OptForSize = MF.getFunction()->getAttributes().
12349     hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
12350
12351   // Check if we should use the experimental vector shuffle lowering. If so,
12352   // delegate completely to that code path.
12353   if (ExperimentalVectorShuffleLowering)
12354     return lowerVectorShuffle(Op, Subtarget, DAG);
12355
12356   assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles");
12357
12358   if (V1IsUndef && V2IsUndef)
12359     return DAG.getUNDEF(VT);
12360
12361   // When we create a shuffle node we put the UNDEF node to second operand,
12362   // but in some cases the first operand may be transformed to UNDEF.
12363   // In this case we should just commute the node.
12364   if (V1IsUndef)
12365     return DAG.getCommutedVectorShuffle(*SVOp);
12366
12367   // Vector shuffle lowering takes 3 steps:
12368   //
12369   // 1) Normalize the input vectors. Here splats, zeroed vectors, profitable
12370   //    narrowing and commutation of operands should be handled.
12371   // 2) Matching of shuffles with known shuffle masks to x86 target specific
12372   //    shuffle nodes.
12373   // 3) Rewriting of unmatched masks into new generic shuffle operations,
12374   //    so the shuffle can be broken into other shuffles and the legalizer can
12375   //    try the lowering again.
12376   //
12377   // The general idea is that no vector_shuffle operation should be left to
12378   // be matched during isel, all of them must be converted to a target specific
12379   // node here.
12380
12381   // Normalize the input vectors. Here splats, zeroed vectors, profitable
12382   // narrowing and commutation of operands should be handled. The actual code
12383   // doesn't include all of those, work in progress...
12384   SDValue NewOp = NormalizeVectorShuffle(Op, Subtarget, DAG);
12385   if (NewOp.getNode())
12386     return NewOp;
12387
12388   SmallVector<int, 8> M(SVOp->getMask().begin(), SVOp->getMask().end());
12389
12390   // NOTE: isPSHUFDMask can also match both masks below (unpckl_undef and
12391   // unpckh_undef). Only use pshufd if speed is more important than size.
12392   if (OptForSize && isUNPCKL_v_undef_Mask(M, VT, HasInt256))
12393     return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
12394   if (OptForSize && isUNPCKH_v_undef_Mask(M, VT, HasInt256))
12395     return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
12396
12397   if (isMOVDDUPMask(M, VT) && Subtarget->hasSSE3() &&
12398       V2IsUndef && MayFoldVectorLoad(V1))
12399     return getMOVDDup(Op, dl, V1, DAG);
12400
12401   if (isMOVHLPS_v_undef_Mask(M, VT))
12402     return getMOVHighToLow(Op, dl, DAG);
12403
12404   // Use to match splats
12405   if (HasSSE2 && isUNPCKHMask(M, VT, HasInt256) && V2IsUndef &&
12406       (VT == MVT::v2f64 || VT == MVT::v2i64))
12407     return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
12408
12409   if (isPSHUFDMask(M, VT)) {
12410     // The actual implementation will match the mask in the if above and then
12411     // during isel it can match several different instructions, not only pshufd
12412     // as its name says, sad but true, emulate the behavior for now...
12413     if (isMOVDDUPMask(M, VT) && ((VT == MVT::v4f32 || VT == MVT::v2i64)))
12414       return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V1, DAG);
12415
12416     unsigned TargetMask = getShuffleSHUFImmediate(SVOp);
12417
12418     if (HasSSE2 && (VT == MVT::v4f32 || VT == MVT::v4i32))
12419       return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG);
12420
12421     if (HasFp256 && (VT == MVT::v4f32 || VT == MVT::v2f64))
12422       return getTargetShuffleNode(X86ISD::VPERMILPI, dl, VT, V1, TargetMask,
12423                                   DAG);
12424
12425     return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V1,
12426                                 TargetMask, DAG);
12427   }
12428
12429   if (isPALIGNRMask(M, VT, Subtarget))
12430     return getTargetShuffleNode(X86ISD::PALIGNR, dl, VT, V1, V2,
12431                                 getShufflePALIGNRImmediate(SVOp),
12432                                 DAG);
12433
12434   if (isVALIGNMask(M, VT, Subtarget))
12435     return getTargetShuffleNode(X86ISD::VALIGN, dl, VT, V1, V2,
12436                                 getShuffleVALIGNImmediate(SVOp),
12437                                 DAG);
12438
12439   // Check if this can be converted into a logical shift.
12440   bool isLeft = false;
12441   unsigned ShAmt = 0;
12442   SDValue ShVal;
12443   bool isShift = HasSSE2 && isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt);
12444   if (isShift && ShVal.hasOneUse()) {
12445     // If the shifted value has multiple uses, it may be cheaper to use
12446     // v_set0 + movlhps or movhlps, etc.
12447     MVT EltVT = VT.getVectorElementType();
12448     ShAmt *= EltVT.getSizeInBits();
12449     return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
12450   }
12451
12452   if (isMOVLMask(M, VT)) {
12453     if (ISD::isBuildVectorAllZeros(V1.getNode()))
12454       return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl);
12455     if (!isMOVLPMask(M, VT)) {
12456       if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64))
12457         return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
12458
12459       if (VT == MVT::v4i32 || VT == MVT::v4f32)
12460         return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
12461     }
12462   }
12463
12464   // FIXME: fold these into legal mask.
12465   if (isMOVLHPSMask(M, VT) && !isUNPCKLMask(M, VT, HasInt256))
12466     return getMOVLowToHigh(Op, dl, DAG, HasSSE2);
12467
12468   if (isMOVHLPSMask(M, VT))
12469     return getMOVHighToLow(Op, dl, DAG);
12470
12471   if (V2IsUndef && isMOVSHDUPMask(M, VT, Subtarget))
12472     return getTargetShuffleNode(X86ISD::MOVSHDUP, dl, VT, V1, DAG);
12473
12474   if (V2IsUndef && isMOVSLDUPMask(M, VT, Subtarget))
12475     return getTargetShuffleNode(X86ISD::MOVSLDUP, dl, VT, V1, DAG);
12476
12477   if (isMOVLPMask(M, VT))
12478     return getMOVLP(Op, dl, DAG, HasSSE2);
12479
12480   if (ShouldXformToMOVHLPS(M, VT) ||
12481       ShouldXformToMOVLP(V1.getNode(), V2.getNode(), M, VT))
12482     return DAG.getCommutedVectorShuffle(*SVOp);
12483
12484   if (isShift) {
12485     // No better options. Use a vshldq / vsrldq.
12486     MVT EltVT = VT.getVectorElementType();
12487     ShAmt *= EltVT.getSizeInBits();
12488     return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
12489   }
12490
12491   bool Commuted = false;
12492   // FIXME: This should also accept a bitcast of a splat?  Be careful, not
12493   // 1,1,1,1 -> v8i16 though.
12494   BitVector UndefElements;
12495   if (auto *BVOp = dyn_cast<BuildVectorSDNode>(V1.getNode()))
12496     if (BVOp->getConstantSplatNode(&UndefElements) && UndefElements.none())
12497       V1IsSplat = true;
12498   if (auto *BVOp = dyn_cast<BuildVectorSDNode>(V2.getNode()))
12499     if (BVOp->getConstantSplatNode(&UndefElements) && UndefElements.none())
12500       V2IsSplat = true;
12501
12502   // Canonicalize the splat or undef, if present, to be on the RHS.
12503   if (!V2IsUndef && V1IsSplat && !V2IsSplat) {
12504     CommuteVectorShuffleMask(M, NumElems);
12505     std::swap(V1, V2);
12506     std::swap(V1IsSplat, V2IsSplat);
12507     Commuted = true;
12508   }
12509
12510   if (isCommutedMOVLMask(M, VT, V2IsSplat, V2IsUndef)) {
12511     // Shuffling low element of v1 into undef, just return v1.
12512     if (V2IsUndef)
12513       return V1;
12514     // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which
12515     // the instruction selector will not match, so get a canonical MOVL with
12516     // swapped operands to undo the commute.
12517     return getMOVL(DAG, dl, VT, V2, V1);
12518   }
12519
12520   if (isUNPCKLMask(M, VT, HasInt256))
12521     return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
12522
12523   if (isUNPCKHMask(M, VT, HasInt256))
12524     return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
12525
12526   if (V2IsSplat) {
12527     // Normalize mask so all entries that point to V2 points to its first
12528     // element then try to match unpck{h|l} again. If match, return a
12529     // new vector_shuffle with the corrected mask.p
12530     SmallVector<int, 8> NewMask(M.begin(), M.end());
12531     NormalizeMask(NewMask, NumElems);
12532     if (isUNPCKLMask(NewMask, VT, HasInt256, true))
12533       return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
12534     if (isUNPCKHMask(NewMask, VT, HasInt256, true))
12535       return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
12536   }
12537
12538   if (Commuted) {
12539     // Commute is back and try unpck* again.
12540     // FIXME: this seems wrong.
12541     CommuteVectorShuffleMask(M, NumElems);
12542     std::swap(V1, V2);
12543     std::swap(V1IsSplat, V2IsSplat);
12544
12545     if (isUNPCKLMask(M, VT, HasInt256))
12546       return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
12547
12548     if (isUNPCKHMask(M, VT, HasInt256))
12549       return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
12550   }
12551
12552   // Normalize the node to match x86 shuffle ops if needed
12553   if (!V2IsUndef && (isSHUFPMask(M, VT, /* Commuted */ true)))
12554     return DAG.getCommutedVectorShuffle(*SVOp);
12555
12556   // The checks below are all present in isShuffleMaskLegal, but they are
12557   // inlined here right now to enable us to directly emit target specific
12558   // nodes, and remove one by one until they don't return Op anymore.
12559
12560   if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) &&
12561       SVOp->getSplatIndex() == 0 && V2IsUndef) {
12562     if (VT == MVT::v2f64 || VT == MVT::v2i64)
12563       return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
12564   }
12565
12566   if (isPSHUFHWMask(M, VT, HasInt256))
12567     return getTargetShuffleNode(X86ISD::PSHUFHW, dl, VT, V1,
12568                                 getShufflePSHUFHWImmediate(SVOp),
12569                                 DAG);
12570
12571   if (isPSHUFLWMask(M, VT, HasInt256))
12572     return getTargetShuffleNode(X86ISD::PSHUFLW, dl, VT, V1,
12573                                 getShufflePSHUFLWImmediate(SVOp),
12574                                 DAG);
12575
12576   unsigned MaskValue;
12577   if (isBlendMask(M, VT, Subtarget->hasSSE41(), Subtarget->hasInt256(),
12578                   &MaskValue))
12579     return LowerVECTOR_SHUFFLEtoBlend(SVOp, MaskValue, Subtarget, DAG);
12580
12581   if (isSHUFPMask(M, VT))
12582     return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V2,
12583                                 getShuffleSHUFImmediate(SVOp), DAG);
12584
12585   if (isUNPCKL_v_undef_Mask(M, VT, HasInt256))
12586     return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
12587   if (isUNPCKH_v_undef_Mask(M, VT, HasInt256))
12588     return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
12589
12590   //===--------------------------------------------------------------------===//
12591   // Generate target specific nodes for 128 or 256-bit shuffles only
12592   // supported in the AVX instruction set.
12593   //
12594
12595   // Handle VMOVDDUPY permutations
12596   if (V2IsUndef && isMOVDDUPYMask(M, VT, HasFp256))
12597     return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG);
12598
12599   // Handle VPERMILPS/D* permutations
12600   if (isVPERMILPMask(M, VT)) {
12601     if ((HasInt256 && VT == MVT::v8i32) || VT == MVT::v16i32)
12602       return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1,
12603                                   getShuffleSHUFImmediate(SVOp), DAG);
12604     return getTargetShuffleNode(X86ISD::VPERMILPI, dl, VT, V1,
12605                                 getShuffleSHUFImmediate(SVOp), DAG);
12606   }
12607
12608   unsigned Idx;
12609   if (VT.is512BitVector() && isINSERT64x4Mask(M, VT, &Idx))
12610     return Insert256BitVector(V1, Extract256BitVector(V2, 0, DAG, dl),
12611                               Idx*(NumElems/2), DAG, dl);
12612
12613   // Handle VPERM2F128/VPERM2I128 permutations
12614   if (isVPERM2X128Mask(M, VT, HasFp256))
12615     return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1,
12616                                 V2, getShuffleVPERM2X128Immediate(SVOp), DAG);
12617
12618   if (Subtarget->hasSSE41() && isINSERTPSMask(M, VT))
12619     return getINSERTPS(SVOp, dl, DAG);
12620
12621   unsigned Imm8;
12622   if (V2IsUndef && HasInt256 && isPermImmMask(M, VT, Imm8))
12623     return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1, Imm8, DAG);
12624
12625   if ((V2IsUndef && HasInt256 && VT.is256BitVector() && NumElems == 8) ||
12626       VT.is512BitVector()) {
12627     MVT MaskEltVT = MVT::getIntegerVT(VT.getVectorElementType().getSizeInBits());
12628     MVT MaskVectorVT = MVT::getVectorVT(MaskEltVT, NumElems);
12629     SmallVector<SDValue, 16> permclMask;
12630     for (unsigned i = 0; i != NumElems; ++i) {
12631       permclMask.push_back(DAG.getConstant((M[i]>=0) ? M[i] : 0, MaskEltVT));
12632     }
12633
12634     SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVectorVT, permclMask);
12635     if (V2IsUndef)
12636       // Bitcast is for VPERMPS since mask is v8i32 but node takes v8f32
12637       return DAG.getNode(X86ISD::VPERMV, dl, VT,
12638                           DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1);
12639     return DAG.getNode(X86ISD::VPERMV3, dl, VT, V1,
12640                        DAG.getNode(ISD::BITCAST, dl, VT, Mask), V2);
12641   }
12642
12643   //===--------------------------------------------------------------------===//
12644   // Since no target specific shuffle was selected for this generic one,
12645   // lower it into other known shuffles. FIXME: this isn't true yet, but
12646   // this is the plan.
12647   //
12648
12649   // Handle v8i16 specifically since SSE can do byte extraction and insertion.
12650   if (VT == MVT::v8i16) {
12651     SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(Op, Subtarget, DAG);
12652     if (NewOp.getNode())
12653       return NewOp;
12654   }
12655
12656   if (VT == MVT::v16i16 && Subtarget->hasInt256()) {
12657     SDValue NewOp = LowerVECTOR_SHUFFLEv16i16(Op, DAG);
12658     if (NewOp.getNode())
12659       return NewOp;
12660   }
12661
12662   if (VT == MVT::v16i8) {
12663     SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, Subtarget, DAG);
12664     if (NewOp.getNode())
12665       return NewOp;
12666   }
12667
12668   if (VT == MVT::v32i8) {
12669     SDValue NewOp = LowerVECTOR_SHUFFLEv32i8(SVOp, Subtarget, DAG);
12670     if (NewOp.getNode())
12671       return NewOp;
12672   }
12673
12674   // Handle all 128-bit wide vectors with 4 elements, and match them with
12675   // several different shuffle types.
12676   if (NumElems == 4 && VT.is128BitVector())
12677     return LowerVECTOR_SHUFFLE_128v4(SVOp, DAG);
12678
12679   // Handle general 256-bit shuffles
12680   if (VT.is256BitVector())
12681     return LowerVECTOR_SHUFFLE_256(SVOp, DAG);
12682
12683   return SDValue();
12684 }
12685
12686 // This function assumes its argument is a BUILD_VECTOR of constants or
12687 // undef SDNodes. i.e: ISD::isBuildVectorOfConstantSDNodes(BuildVector) is
12688 // true.
12689 static bool BUILD_VECTORtoBlendMask(BuildVectorSDNode *BuildVector,
12690                                     unsigned &MaskValue) {
12691   MaskValue = 0;
12692   unsigned NumElems = BuildVector->getNumOperands();
12693   // There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
12694   unsigned NumLanes = (NumElems - 1) / 8 + 1;
12695   unsigned NumElemsInLane = NumElems / NumLanes;
12696
12697   // Blend for v16i16 should be symetric for the both lanes.
12698   for (unsigned i = 0; i < NumElemsInLane; ++i) {
12699     SDValue EltCond = BuildVector->getOperand(i);
12700     SDValue SndLaneEltCond =
12701         (NumLanes == 2) ? BuildVector->getOperand(i + NumElemsInLane) : EltCond;
12702
12703     int Lane1Cond = -1, Lane2Cond = -1;
12704     if (isa<ConstantSDNode>(EltCond))
12705       Lane1Cond = !isZero(EltCond);
12706     if (isa<ConstantSDNode>(SndLaneEltCond))
12707       Lane2Cond = !isZero(SndLaneEltCond);
12708
12709     if (Lane1Cond == Lane2Cond || Lane2Cond < 0)
12710       // Lane1Cond != 0, means we want the first argument.
12711       // Lane1Cond == 0, means we want the second argument.
12712       // The encoding of this argument is 0 for the first argument, 1
12713       // for the second. Therefore, invert the condition.
12714       MaskValue |= !Lane1Cond << i;
12715     else if (Lane1Cond < 0)
12716       MaskValue |= !Lane2Cond << i;
12717     else
12718       return false;
12719   }
12720   return true;
12721 }
12722
12723 /// \brief Try to lower a VSELECT instruction to an immediate-controlled blend
12724 /// instruction.
12725 static SDValue lowerVSELECTtoBLENDI(SDValue Op, const X86Subtarget *Subtarget,
12726                                     SelectionDAG &DAG) {
12727   SDValue Cond = Op.getOperand(0);
12728   SDValue LHS = Op.getOperand(1);
12729   SDValue RHS = Op.getOperand(2);
12730   SDLoc dl(Op);
12731   MVT VT = Op.getSimpleValueType();
12732   MVT EltVT = VT.getVectorElementType();
12733   unsigned NumElems = VT.getVectorNumElements();
12734
12735   // There is no blend with immediate in AVX-512.
12736   if (VT.is512BitVector())
12737     return SDValue();
12738
12739   if (!Subtarget->hasSSE41() || EltVT == MVT::i8)
12740     return SDValue();
12741   if (!Subtarget->hasInt256() && VT == MVT::v16i16)
12742     return SDValue();
12743
12744   if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
12745     return SDValue();
12746
12747   // Check the mask for BLEND and build the value.
12748   unsigned MaskValue = 0;
12749   if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue))
12750     return SDValue();
12751
12752   // Convert i32 vectors to floating point if it is not AVX2.
12753   // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors.
12754   MVT BlendVT = VT;
12755   if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) {
12756     BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()),
12757                                NumElems);
12758     LHS = DAG.getNode(ISD::BITCAST, dl, VT, LHS);
12759     RHS = DAG.getNode(ISD::BITCAST, dl, VT, RHS);
12760   }
12761
12762   SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, LHS, RHS,
12763                             DAG.getConstant(MaskValue, MVT::i32));
12764   return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
12765 }
12766
12767 SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
12768   // A vselect where all conditions and data are constants can be optimized into
12769   // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
12770   if (ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(0).getNode()) &&
12771       ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(1).getNode()) &&
12772       ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode()))
12773     return SDValue();
12774
12775   SDValue BlendOp = lowerVSELECTtoBLENDI(Op, Subtarget, DAG);
12776   if (BlendOp.getNode())
12777     return BlendOp;
12778
12779   // Some types for vselect were previously set to Expand, not Legal or
12780   // Custom. Return an empty SDValue so we fall-through to Expand, after
12781   // the Custom lowering phase.
12782   MVT VT = Op.getSimpleValueType();
12783   switch (VT.SimpleTy) {
12784   default:
12785     break;
12786   case MVT::v8i16:
12787   case MVT::v16i16:
12788     if (Subtarget->hasBWI() && Subtarget->hasVLX())
12789       break;
12790     return SDValue();
12791   }
12792
12793   // We couldn't create a "Blend with immediate" node.
12794   // This node should still be legal, but we'll have to emit a blendv*
12795   // instruction.
12796   return Op;
12797 }
12798
12799 static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
12800   MVT VT = Op.getSimpleValueType();
12801   SDLoc dl(Op);
12802
12803   if (!Op.getOperand(0).getSimpleValueType().is128BitVector())
12804     return SDValue();
12805
12806   if (VT.getSizeInBits() == 8) {
12807     SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
12808                                   Op.getOperand(0), Op.getOperand(1));
12809     SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
12810                                   DAG.getValueType(VT));
12811     return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
12812   }
12813
12814   if (VT.getSizeInBits() == 16) {
12815     unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
12816     // If Idx is 0, it's cheaper to do a move instead of a pextrw.
12817     if (Idx == 0)
12818       return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
12819                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
12820                                      DAG.getNode(ISD::BITCAST, dl,
12821                                                  MVT::v4i32,
12822                                                  Op.getOperand(0)),
12823                                      Op.getOperand(1)));
12824     SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
12825                                   Op.getOperand(0), Op.getOperand(1));
12826     SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
12827                                   DAG.getValueType(VT));
12828     return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
12829   }
12830
12831   if (VT == MVT::f32) {
12832     // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
12833     // the result back to FR32 register. It's only worth matching if the
12834     // result has a single use which is a store or a bitcast to i32.  And in
12835     // the case of a store, it's not worth it if the index is a constant 0,
12836     // because a MOVSSmr can be used instead, which is smaller and faster.
12837     if (!Op.hasOneUse())
12838       return SDValue();
12839     SDNode *User = *Op.getNode()->use_begin();
12840     if ((User->getOpcode() != ISD::STORE ||
12841          (isa<ConstantSDNode>(Op.getOperand(1)) &&
12842           cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) &&
12843         (User->getOpcode() != ISD::BITCAST ||
12844          User->getValueType(0) != MVT::i32))
12845       return SDValue();
12846     SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
12847                                   DAG.getNode(ISD::BITCAST, dl, MVT::v4i32,
12848                                               Op.getOperand(0)),
12849                                               Op.getOperand(1));
12850     return DAG.getNode(ISD::BITCAST, dl, MVT::f32, Extract);
12851   }
12852
12853   if (VT == MVT::i32 || VT == MVT::i64) {
12854     // ExtractPS/pextrq works with constant index.
12855     if (isa<ConstantSDNode>(Op.getOperand(1)))
12856       return Op;
12857   }
12858   return SDValue();
12859 }
12860
12861 /// Extract one bit from mask vector, like v16i1 or v8i1.
12862 /// AVX-512 feature.
12863 SDValue
12864 X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const {
12865   SDValue Vec = Op.getOperand(0);
12866   SDLoc dl(Vec);
12867   MVT VecVT = Vec.getSimpleValueType();
12868   SDValue Idx = Op.getOperand(1);
12869   MVT EltVT = Op.getSimpleValueType();
12870
12871   assert((EltVT == MVT::i1) && "Unexpected operands in ExtractBitFromMaskVector");
12872
12873   // variable index can't be handled in mask registers,
12874   // extend vector to VR512
12875   if (!isa<ConstantSDNode>(Idx)) {
12876     MVT ExtVT = (VecVT == MVT::v8i1 ?  MVT::v8i64 : MVT::v16i32);
12877     SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Vec);
12878     SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
12879                               ExtVT.getVectorElementType(), Ext, Idx);
12880     return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
12881   }
12882
12883   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
12884   const TargetRegisterClass* rc = getRegClassFor(VecVT);
12885   unsigned MaxSift = rc->getSize()*8 - 1;
12886   Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
12887                     DAG.getConstant(MaxSift - IdxVal, MVT::i8));
12888   Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec,
12889                     DAG.getConstant(MaxSift, MVT::i8));
12890   return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i1, Vec,
12891                        DAG.getIntPtrConstant(0));
12892 }
12893
12894 SDValue
12895 X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
12896                                            SelectionDAG &DAG) const {
12897   SDLoc dl(Op);
12898   SDValue Vec = Op.getOperand(0);
12899   MVT VecVT = Vec.getSimpleValueType();
12900   SDValue Idx = Op.getOperand(1);
12901
12902   if (Op.getSimpleValueType() == MVT::i1)
12903     return ExtractBitFromMaskVector(Op, DAG);
12904
12905   if (!isa<ConstantSDNode>(Idx)) {
12906     if (VecVT.is512BitVector() ||
12907         (VecVT.is256BitVector() && Subtarget->hasInt256() &&
12908          VecVT.getVectorElementType().getSizeInBits() == 32)) {
12909
12910       MVT MaskEltVT =
12911         MVT::getIntegerVT(VecVT.getVectorElementType().getSizeInBits());
12912       MVT MaskVT = MVT::getVectorVT(MaskEltVT, VecVT.getSizeInBits() /
12913                                     MaskEltVT.getSizeInBits());
12914
12915       Idx = DAG.getZExtOrTrunc(Idx, dl, MaskEltVT);
12916       SDValue Mask = DAG.getNode(X86ISD::VINSERT, dl, MaskVT,
12917                                 getZeroVector(MaskVT, Subtarget, DAG, dl),
12918                                 Idx, DAG.getConstant(0, getPointerTy()));
12919       SDValue Perm = DAG.getNode(X86ISD::VPERMV, dl, VecVT, Mask, Vec);
12920       return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(),
12921                         Perm, DAG.getConstant(0, getPointerTy()));
12922     }
12923     return SDValue();
12924   }
12925
12926   // If this is a 256-bit vector result, first extract the 128-bit vector and
12927   // then extract the element from the 128-bit vector.
12928   if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
12929
12930     unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
12931     // Get the 128-bit vector.
12932     Vec = Extract128BitVector(Vec, IdxVal, DAG, dl);
12933     MVT EltVT = VecVT.getVectorElementType();
12934
12935     unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
12936
12937     //if (IdxVal >= NumElems/2)
12938     //  IdxVal -= NumElems/2;
12939     IdxVal -= (IdxVal/ElemsPerChunk)*ElemsPerChunk;
12940     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
12941                        DAG.getConstant(IdxVal, MVT::i32));
12942   }
12943
12944   assert(VecVT.is128BitVector() && "Unexpected vector length");
12945
12946   if (Subtarget->hasSSE41()) {
12947     SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG);
12948     if (Res.getNode())
12949       return Res;
12950   }
12951
12952   MVT VT = Op.getSimpleValueType();
12953   // TODO: handle v16i8.
12954   if (VT.getSizeInBits() == 16) {
12955     SDValue Vec = Op.getOperand(0);
12956     unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
12957     if (Idx == 0)
12958       return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
12959                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
12960                                      DAG.getNode(ISD::BITCAST, dl,
12961                                                  MVT::v4i32, Vec),
12962                                      Op.getOperand(1)));
12963     // Transform it so it match pextrw which produces a 32-bit result.
12964     MVT EltVT = MVT::i32;
12965     SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT,
12966                                   Op.getOperand(0), Op.getOperand(1));
12967     SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract,
12968                                   DAG.getValueType(VT));
12969     return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
12970   }
12971
12972   if (VT.getSizeInBits() == 32) {
12973     unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
12974     if (Idx == 0)
12975       return Op;
12976
12977     // SHUFPS the element to the lowest double word, then movss.
12978     int Mask[4] = { static_cast<int>(Idx), -1, -1, -1 };
12979     MVT VVT = Op.getOperand(0).getSimpleValueType();
12980     SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
12981                                        DAG.getUNDEF(VVT), Mask);
12982     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
12983                        DAG.getIntPtrConstant(0));
12984   }
12985
12986   if (VT.getSizeInBits() == 64) {
12987     // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
12988     // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
12989     //        to match extract_elt for f64.
12990     unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
12991     if (Idx == 0)
12992       return Op;
12993
12994     // UNPCKHPD the element to the lowest double word, then movsd.
12995     // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
12996     // to a f64mem, the whole operation is folded into a single MOVHPDmr.
12997     int Mask[2] = { 1, -1 };
12998     MVT VVT = Op.getOperand(0).getSimpleValueType();
12999     SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
13000                                        DAG.getUNDEF(VVT), Mask);
13001     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
13002                        DAG.getIntPtrConstant(0));
13003   }
13004
13005   return SDValue();
13006 }
13007
13008 /// Insert one bit to mask vector, like v16i1 or v8i1.
13009 /// AVX-512 feature.
13010 SDValue
13011 X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
13012   SDLoc dl(Op);
13013   SDValue Vec = Op.getOperand(0);
13014   SDValue Elt = Op.getOperand(1);
13015   SDValue Idx = Op.getOperand(2);
13016   MVT VecVT = Vec.getSimpleValueType();
13017
13018   if (!isa<ConstantSDNode>(Idx)) {
13019     // Non constant index. Extend source and destination,
13020     // insert element and then truncate the result.
13021     MVT ExtVecVT = (VecVT == MVT::v8i1 ?  MVT::v8i64 : MVT::v16i32);
13022     MVT ExtEltVT = (VecVT == MVT::v8i1 ?  MVT::i64 : MVT::i32);
13023     SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
13024       DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec),
13025       DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx);
13026     return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
13027   }
13028
13029   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
13030   SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt);
13031   if (Vec.getOpcode() == ISD::UNDEF)
13032     return DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
13033                        DAG.getConstant(IdxVal, MVT::i8));
13034   const TargetRegisterClass* rc = getRegClassFor(VecVT);
13035   unsigned MaxSift = rc->getSize()*8 - 1;
13036   EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
13037                     DAG.getConstant(MaxSift, MVT::i8));
13038   EltInVec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, EltInVec,
13039                     DAG.getConstant(MaxSift - IdxVal, MVT::i8));
13040   return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
13041 }
13042
13043 SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
13044                                                   SelectionDAG &DAG) const {
13045   MVT VT = Op.getSimpleValueType();
13046   MVT EltVT = VT.getVectorElementType();
13047
13048   if (EltVT == MVT::i1)
13049     return InsertBitToMaskVector(Op, DAG);
13050
13051   SDLoc dl(Op);
13052   SDValue N0 = Op.getOperand(0);
13053   SDValue N1 = Op.getOperand(1);
13054   SDValue N2 = Op.getOperand(2);
13055   if (!isa<ConstantSDNode>(N2))
13056     return SDValue();
13057   auto *N2C = cast<ConstantSDNode>(N2);
13058   unsigned IdxVal = N2C->getZExtValue();
13059
13060   // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
13061   // into that, and then insert the subvector back into the result.
13062   if (VT.is256BitVector() || VT.is512BitVector()) {
13063     // Get the desired 128-bit vector half.
13064     SDValue V = Extract128BitVector(N0, IdxVal, DAG, dl);
13065
13066     // Insert the element into the desired half.
13067     unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
13068     unsigned IdxIn128 = IdxVal - (IdxVal / NumEltsIn128) * NumEltsIn128;
13069
13070     V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
13071                     DAG.getConstant(IdxIn128, MVT::i32));
13072
13073     // Insert the changed part back to the 256-bit vector
13074     return Insert128BitVector(N0, V, IdxVal, DAG, dl);
13075   }
13076   assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
13077
13078   if (Subtarget->hasSSE41()) {
13079     if (EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) {
13080       unsigned Opc;
13081       if (VT == MVT::v8i16) {
13082         Opc = X86ISD::PINSRW;
13083       } else {
13084         assert(VT == MVT::v16i8);
13085         Opc = X86ISD::PINSRB;
13086       }
13087
13088       // Transform it so it match pinsr{b,w} which expects a GR32 as its second
13089       // argument.
13090       if (N1.getValueType() != MVT::i32)
13091         N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
13092       if (N2.getValueType() != MVT::i32)
13093         N2 = DAG.getIntPtrConstant(IdxVal);
13094       return DAG.getNode(Opc, dl, VT, N0, N1, N2);
13095     }
13096
13097     if (EltVT == MVT::f32) {
13098       // Bits [7:6] of the constant are the source select.  This will always be
13099       //  zero here.  The DAG Combiner may combine an extract_elt index into
13100       //  these
13101       //  bits.  For example (insert (extract, 3), 2) could be matched by
13102       //  putting
13103       //  the '3' into bits [7:6] of X86ISD::INSERTPS.
13104       // Bits [5:4] of the constant are the destination select.  This is the
13105       //  value of the incoming immediate.
13106       // Bits [3:0] of the constant are the zero mask.  The DAG Combiner may
13107       //   combine either bitwise AND or insert of float 0.0 to set these bits.
13108       N2 = DAG.getIntPtrConstant(IdxVal << 4);
13109       // Create this as a scalar to vector..
13110       N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
13111       return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
13112     }
13113
13114     if (EltVT == MVT::i32 || EltVT == MVT::i64) {
13115       // PINSR* works with constant index.
13116       return Op;
13117     }
13118   }
13119
13120   if (EltVT == MVT::i8)
13121     return SDValue();
13122
13123   if (EltVT.getSizeInBits() == 16) {
13124     // Transform it so it match pinsrw which expects a 16-bit value in a GR32
13125     // as its second argument.
13126     if (N1.getValueType() != MVT::i32)
13127       N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
13128     if (N2.getValueType() != MVT::i32)
13129       N2 = DAG.getIntPtrConstant(IdxVal);
13130     return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2);
13131   }
13132   return SDValue();
13133 }
13134
13135 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
13136   SDLoc dl(Op);
13137   MVT OpVT = Op.getSimpleValueType();
13138
13139   // If this is a 256-bit vector result, first insert into a 128-bit
13140   // vector and then insert into the 256-bit vector.
13141   if (!OpVT.is128BitVector()) {
13142     // Insert into a 128-bit vector.
13143     unsigned SizeFactor = OpVT.getSizeInBits()/128;
13144     MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
13145                                  OpVT.getVectorNumElements() / SizeFactor);
13146
13147     Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
13148
13149     // Insert the 128-bit vector.
13150     return Insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
13151   }
13152
13153   if (OpVT == MVT::v1i64 &&
13154       Op.getOperand(0).getValueType() == MVT::i64)
13155     return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0));
13156
13157   SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
13158   assert(OpVT.is128BitVector() && "Expected an SSE type!");
13159   return DAG.getNode(ISD::BITCAST, dl, OpVT,
13160                      DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,AnyExt));
13161 }
13162
13163 // Lower a node with an EXTRACT_SUBVECTOR opcode.  This may result in
13164 // a simple subregister reference or explicit instructions to grab
13165 // upper bits of a vector.
13166 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
13167                                       SelectionDAG &DAG) {
13168   SDLoc dl(Op);
13169   SDValue In =  Op.getOperand(0);
13170   SDValue Idx = Op.getOperand(1);
13171   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
13172   MVT ResVT   = Op.getSimpleValueType();
13173   MVT InVT    = In.getSimpleValueType();
13174
13175   if (Subtarget->hasFp256()) {
13176     if (ResVT.is128BitVector() &&
13177         (InVT.is256BitVector() || InVT.is512BitVector()) &&
13178         isa<ConstantSDNode>(Idx)) {
13179       return Extract128BitVector(In, IdxVal, DAG, dl);
13180     }
13181     if (ResVT.is256BitVector() && InVT.is512BitVector() &&
13182         isa<ConstantSDNode>(Idx)) {
13183       return Extract256BitVector(In, IdxVal, DAG, dl);
13184     }
13185   }
13186   return SDValue();
13187 }
13188
13189 // Lower a node with an INSERT_SUBVECTOR opcode.  This may result in a
13190 // simple superregister reference or explicit instructions to insert
13191 // the upper bits of a vector.
13192 static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
13193                                      SelectionDAG &DAG) {
13194   if (Subtarget->hasFp256()) {
13195     SDLoc dl(Op.getNode());
13196     SDValue Vec = Op.getNode()->getOperand(0);
13197     SDValue SubVec = Op.getNode()->getOperand(1);
13198     SDValue Idx = Op.getNode()->getOperand(2);
13199
13200     if ((Op.getNode()->getSimpleValueType(0).is256BitVector() ||
13201          Op.getNode()->getSimpleValueType(0).is512BitVector()) &&
13202         SubVec.getNode()->getSimpleValueType(0).is128BitVector() &&
13203         isa<ConstantSDNode>(Idx)) {
13204       unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
13205       return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
13206     }
13207
13208     if (Op.getNode()->getSimpleValueType(0).is512BitVector() &&
13209         SubVec.getNode()->getSimpleValueType(0).is256BitVector() &&
13210         isa<ConstantSDNode>(Idx)) {
13211       unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
13212       return Insert256BitVector(Vec, SubVec, IdxVal, DAG, dl);
13213     }
13214   }
13215   return SDValue();
13216 }
13217
13218 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
13219 // their target countpart wrapped in the X86ISD::Wrapper node. Suppose N is
13220 // one of the above mentioned nodes. It has to be wrapped because otherwise
13221 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
13222 // be used to form addressing mode. These wrapped nodes will be selected
13223 // into MOV32ri.
13224 SDValue
13225 X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
13226   ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
13227
13228   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
13229   // global base reg.
13230   unsigned char OpFlag = 0;
13231   unsigned WrapperKind = X86ISD::Wrapper;
13232   CodeModel::Model M = DAG.getTarget().getCodeModel();
13233
13234   if (Subtarget->isPICStyleRIPRel() &&
13235       (M == CodeModel::Small || M == CodeModel::Kernel))
13236     WrapperKind = X86ISD::WrapperRIP;
13237   else if (Subtarget->isPICStyleGOT())
13238     OpFlag = X86II::MO_GOTOFF;
13239   else if (Subtarget->isPICStyleStubPIC())
13240     OpFlag = X86II::MO_PIC_BASE_OFFSET;
13241
13242   SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(),
13243                                              CP->getAlignment(),
13244                                              CP->getOffset(), OpFlag);
13245   SDLoc DL(CP);
13246   Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
13247   // With PIC, the address is actually $g + Offset.
13248   if (OpFlag) {
13249     Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
13250                          DAG.getNode(X86ISD::GlobalBaseReg,
13251                                      SDLoc(), getPointerTy()),
13252                          Result);
13253   }
13254
13255   return Result;
13256 }
13257
13258 SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
13259   JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
13260
13261   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
13262   // global base reg.
13263   unsigned char OpFlag = 0;
13264   unsigned WrapperKind = X86ISD::Wrapper;
13265   CodeModel::Model M = DAG.getTarget().getCodeModel();
13266
13267   if (Subtarget->isPICStyleRIPRel() &&
13268       (M == CodeModel::Small || M == CodeModel::Kernel))
13269     WrapperKind = X86ISD::WrapperRIP;
13270   else if (Subtarget->isPICStyleGOT())
13271     OpFlag = X86II::MO_GOTOFF;
13272   else if (Subtarget->isPICStyleStubPIC())
13273     OpFlag = X86II::MO_PIC_BASE_OFFSET;
13274
13275   SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(),
13276                                           OpFlag);
13277   SDLoc DL(JT);
13278   Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
13279
13280   // With PIC, the address is actually $g + Offset.
13281   if (OpFlag)
13282     Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
13283                          DAG.getNode(X86ISD::GlobalBaseReg,
13284                                      SDLoc(), getPointerTy()),
13285                          Result);
13286
13287   return Result;
13288 }
13289
13290 SDValue
13291 X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
13292   const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
13293
13294   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
13295   // global base reg.
13296   unsigned char OpFlag = 0;
13297   unsigned WrapperKind = X86ISD::Wrapper;
13298   CodeModel::Model M = DAG.getTarget().getCodeModel();
13299
13300   if (Subtarget->isPICStyleRIPRel() &&
13301       (M == CodeModel::Small || M == CodeModel::Kernel)) {
13302     if (Subtarget->isTargetDarwin() || Subtarget->isTargetELF())
13303       OpFlag = X86II::MO_GOTPCREL;
13304     WrapperKind = X86ISD::WrapperRIP;
13305   } else if (Subtarget->isPICStyleGOT()) {
13306     OpFlag = X86II::MO_GOT;
13307   } else if (Subtarget->isPICStyleStubPIC()) {
13308     OpFlag = X86II::MO_DARWIN_NONLAZY_PIC_BASE;
13309   } else if (Subtarget->isPICStyleStubNoDynamic()) {
13310     OpFlag = X86II::MO_DARWIN_NONLAZY;
13311   }
13312
13313   SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag);
13314
13315   SDLoc DL(Op);
13316   Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
13317
13318   // With PIC, the address is actually $g + Offset.
13319   if (DAG.getTarget().getRelocationModel() == Reloc::PIC_ &&
13320       !Subtarget->is64Bit()) {
13321     Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
13322                          DAG.getNode(X86ISD::GlobalBaseReg,
13323                                      SDLoc(), getPointerTy()),
13324                          Result);
13325   }
13326
13327   // For symbols that require a load from a stub to get the address, emit the
13328   // load.
13329   if (isGlobalStubReference(OpFlag))
13330     Result = DAG.getLoad(getPointerTy(), DL, DAG.getEntryNode(), Result,
13331                          MachinePointerInfo::getGOT(), false, false, false, 0);
13332
13333   return Result;
13334 }
13335
13336 SDValue
13337 X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
13338   // Create the TargetBlockAddressAddress node.
13339   unsigned char OpFlags =
13340     Subtarget->ClassifyBlockAddressReference();
13341   CodeModel::Model M = DAG.getTarget().getCodeModel();
13342   const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
13343   int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
13344   SDLoc dl(Op);
13345   SDValue Result = DAG.getTargetBlockAddress(BA, getPointerTy(), Offset,
13346                                              OpFlags);
13347
13348   if (Subtarget->isPICStyleRIPRel() &&
13349       (M == CodeModel::Small || M == CodeModel::Kernel))
13350     Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
13351   else
13352     Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
13353
13354   // With PIC, the address is actually $g + Offset.
13355   if (isGlobalRelativeToPICBase(OpFlags)) {
13356     Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
13357                          DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
13358                          Result);
13359   }
13360
13361   return Result;
13362 }
13363
13364 SDValue
13365 X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, SDLoc dl,
13366                                       int64_t Offset, SelectionDAG &DAG) const {
13367   // Create the TargetGlobalAddress node, folding in the constant
13368   // offset if it is legal.
13369   unsigned char OpFlags =
13370       Subtarget->ClassifyGlobalReference(GV, DAG.getTarget());
13371   CodeModel::Model M = DAG.getTarget().getCodeModel();
13372   SDValue Result;
13373   if (OpFlags == X86II::MO_NO_FLAG &&
13374       X86::isOffsetSuitableForCodeModel(Offset, M)) {
13375     // A direct static reference to a global.
13376     Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset);
13377     Offset = 0;
13378   } else {
13379     Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags);
13380   }
13381
13382   if (Subtarget->isPICStyleRIPRel() &&
13383       (M == CodeModel::Small || M == CodeModel::Kernel))
13384     Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
13385   else
13386     Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
13387
13388   // With PIC, the address is actually $g + Offset.
13389   if (isGlobalRelativeToPICBase(OpFlags)) {
13390     Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
13391                          DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
13392                          Result);
13393   }
13394
13395   // For globals that require a load from a stub to get the address, emit the
13396   // load.
13397   if (isGlobalStubReference(OpFlags))
13398     Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result,
13399                          MachinePointerInfo::getGOT(), false, false, false, 0);
13400
13401   // If there was a non-zero offset that we didn't fold, create an explicit
13402   // addition for it.
13403   if (Offset != 0)
13404     Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result,
13405                          DAG.getConstant(Offset, getPointerTy()));
13406
13407   return Result;
13408 }
13409
13410 SDValue
13411 X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
13412   const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
13413   int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
13414   return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG);
13415 }
13416
13417 static SDValue
13418 GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
13419            SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
13420            unsigned char OperandFlags, bool LocalDynamic = false) {
13421   MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
13422   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
13423   SDLoc dl(GA);
13424   SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
13425                                            GA->getValueType(0),
13426                                            GA->getOffset(),
13427                                            OperandFlags);
13428
13429   X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
13430                                            : X86ISD::TLSADDR;
13431
13432   if (InFlag) {
13433     SDValue Ops[] = { Chain,  TGA, *InFlag };
13434     Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
13435   } else {
13436     SDValue Ops[]  = { Chain, TGA };
13437     Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
13438   }
13439
13440   // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
13441   MFI->setAdjustsStack(true);
13442   MFI->setHasCalls(true);
13443
13444   SDValue Flag = Chain.getValue(1);
13445   return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
13446 }
13447
13448 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
13449 static SDValue
13450 LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
13451                                 const EVT PtrVT) {
13452   SDValue InFlag;
13453   SDLoc dl(GA);  // ? function entry point might be better
13454   SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
13455                                    DAG.getNode(X86ISD::GlobalBaseReg,
13456                                                SDLoc(), PtrVT), InFlag);
13457   InFlag = Chain.getValue(1);
13458
13459   return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
13460 }
13461
13462 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
13463 static SDValue
13464 LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
13465                                 const EVT PtrVT) {
13466   return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
13467                     X86::RAX, X86II::MO_TLSGD);
13468 }
13469
13470 static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
13471                                            SelectionDAG &DAG,
13472                                            const EVT PtrVT,
13473                                            bool is64Bit) {
13474   SDLoc dl(GA);
13475
13476   // Get the start address of the TLS block for this module.
13477   X86MachineFunctionInfo* MFI = DAG.getMachineFunction()
13478       .getInfo<X86MachineFunctionInfo>();
13479   MFI->incNumLocalDynamicTLSAccesses();
13480
13481   SDValue Base;
13482   if (is64Bit) {
13483     Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
13484                       X86II::MO_TLSLD, /*LocalDynamic=*/true);
13485   } else {
13486     SDValue InFlag;
13487     SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
13488         DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
13489     InFlag = Chain.getValue(1);
13490     Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
13491                       X86II::MO_TLSLDM, /*LocalDynamic=*/true);
13492   }
13493
13494   // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
13495   // of Base.
13496
13497   // Build x@dtpoff.
13498   unsigned char OperandFlags = X86II::MO_DTPOFF;
13499   unsigned WrapperKind = X86ISD::Wrapper;
13500   SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
13501                                            GA->getValueType(0),
13502                                            GA->getOffset(), OperandFlags);
13503   SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
13504
13505   // Add x@dtpoff with the base.
13506   return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
13507 }
13508
13509 // Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
13510 static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
13511                                    const EVT PtrVT, TLSModel::Model model,
13512                                    bool is64Bit, bool isPIC) {
13513   SDLoc dl(GA);
13514
13515   // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
13516   Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
13517                                                          is64Bit ? 257 : 256));
13518
13519   SDValue ThreadPointer =
13520       DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0),
13521                   MachinePointerInfo(Ptr), false, false, false, 0);
13522
13523   unsigned char OperandFlags = 0;
13524   // Most TLS accesses are not RIP relative, even on x86-64.  One exception is
13525   // initialexec.
13526   unsigned WrapperKind = X86ISD::Wrapper;
13527   if (model == TLSModel::LocalExec) {
13528     OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
13529   } else if (model == TLSModel::InitialExec) {
13530     if (is64Bit) {
13531       OperandFlags = X86II::MO_GOTTPOFF;
13532       WrapperKind = X86ISD::WrapperRIP;
13533     } else {
13534       OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
13535     }
13536   } else {
13537     llvm_unreachable("Unexpected model");
13538   }
13539
13540   // emit "addl x@ntpoff,%eax" (local exec)
13541   // or "addl x@indntpoff,%eax" (initial exec)
13542   // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
13543   SDValue TGA =
13544       DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
13545                                  GA->getOffset(), OperandFlags);
13546   SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
13547
13548   if (model == TLSModel::InitialExec) {
13549     if (isPIC && !is64Bit) {
13550       Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
13551                            DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
13552                            Offset);
13553     }
13554
13555     Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
13556                          MachinePointerInfo::getGOT(), false, false, false, 0);
13557   }
13558
13559   // The address of the thread local variable is the add of the thread
13560   // pointer with the offset of the variable.
13561   return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
13562 }
13563
13564 SDValue
13565 X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
13566
13567   GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
13568   const GlobalValue *GV = GA->getGlobal();
13569
13570   if (Subtarget->isTargetELF()) {
13571     TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
13572
13573     switch (model) {
13574       case TLSModel::GeneralDynamic:
13575         if (Subtarget->is64Bit())
13576           return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy());
13577         return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy());
13578       case TLSModel::LocalDynamic:
13579         return LowerToTLSLocalDynamicModel(GA, DAG, getPointerTy(),
13580                                            Subtarget->is64Bit());
13581       case TLSModel::InitialExec:
13582       case TLSModel::LocalExec:
13583         return LowerToTLSExecModel(
13584             GA, DAG, getPointerTy(), model, Subtarget->is64Bit(),
13585             DAG.getTarget().getRelocationModel() == Reloc::PIC_);
13586     }
13587     llvm_unreachable("Unknown TLS model.");
13588   }
13589
13590   if (Subtarget->isTargetDarwin()) {
13591     // Darwin only has one model of TLS.  Lower to that.
13592     unsigned char OpFlag = 0;
13593     unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ?
13594                            X86ISD::WrapperRIP : X86ISD::Wrapper;
13595
13596     // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
13597     // global base reg.
13598     bool PIC32 = (DAG.getTarget().getRelocationModel() == Reloc::PIC_) &&
13599                  !Subtarget->is64Bit();
13600     if (PIC32)
13601       OpFlag = X86II::MO_TLVP_PIC_BASE;
13602     else
13603       OpFlag = X86II::MO_TLVP;
13604     SDLoc DL(Op);
13605     SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
13606                                                 GA->getValueType(0),
13607                                                 GA->getOffset(), OpFlag);
13608     SDValue Offset = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
13609
13610     // With PIC32, the address is actually $g + Offset.
13611     if (PIC32)
13612       Offset = DAG.getNode(ISD::ADD, DL, getPointerTy(),
13613                            DAG.getNode(X86ISD::GlobalBaseReg,
13614                                        SDLoc(), getPointerTy()),
13615                            Offset);
13616
13617     // Lowering the machine isd will make sure everything is in the right
13618     // location.
13619     SDValue Chain = DAG.getEntryNode();
13620     SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
13621     SDValue Args[] = { Chain, Offset };
13622     Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
13623
13624     // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
13625     MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
13626     MFI->setAdjustsStack(true);
13627
13628     // And our return value (tls address) is in the standard call return value
13629     // location.
13630     unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
13631     return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy(),
13632                               Chain.getValue(1));
13633   }
13634
13635   if (Subtarget->isTargetKnownWindowsMSVC() ||
13636       Subtarget->isTargetWindowsGNU()) {
13637     // Just use the implicit TLS architecture
13638     // Need to generate someting similar to:
13639     //   mov     rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
13640     //                                  ; from TEB
13641     //   mov     ecx, dword [rel _tls_index]: Load index (from C runtime)
13642     //   mov     rcx, qword [rdx+rcx*8]
13643     //   mov     eax, .tls$:tlsvar
13644     //   [rax+rcx] contains the address
13645     // Windows 64bit: gs:0x58
13646     // Windows 32bit: fs:__tls_array
13647
13648     SDLoc dl(GA);
13649     SDValue Chain = DAG.getEntryNode();
13650
13651     // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
13652     // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
13653     // use its literal value of 0x2C.
13654     Value *Ptr = Constant::getNullValue(Subtarget->is64Bit()
13655                                         ? Type::getInt8PtrTy(*DAG.getContext(),
13656                                                              256)
13657                                         : Type::getInt32PtrTy(*DAG.getContext(),
13658                                                               257));
13659
13660     SDValue TlsArray =
13661         Subtarget->is64Bit()
13662             ? DAG.getIntPtrConstant(0x58)
13663             : (Subtarget->isTargetWindowsGNU()
13664                    ? DAG.getIntPtrConstant(0x2C)
13665                    : DAG.getExternalSymbol("_tls_array", getPointerTy()));
13666
13667     SDValue ThreadPointer =
13668         DAG.getLoad(getPointerTy(), dl, Chain, TlsArray,
13669                     MachinePointerInfo(Ptr), false, false, false, 0);
13670
13671     // Load the _tls_index variable
13672     SDValue IDX = DAG.getExternalSymbol("_tls_index", getPointerTy());
13673     if (Subtarget->is64Bit())
13674       IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, getPointerTy(), Chain,
13675                            IDX, MachinePointerInfo(), MVT::i32,
13676                            false, false, false, 0);
13677     else
13678       IDX = DAG.getLoad(getPointerTy(), dl, Chain, IDX, MachinePointerInfo(),
13679                         false, false, false, 0);
13680
13681     SDValue Scale = DAG.getConstant(Log2_64_Ceil(TD->getPointerSize()),
13682                                     getPointerTy());
13683     IDX = DAG.getNode(ISD::SHL, dl, getPointerTy(), IDX, Scale);
13684
13685     SDValue res = DAG.getNode(ISD::ADD, dl, getPointerTy(), ThreadPointer, IDX);
13686     res = DAG.getLoad(getPointerTy(), dl, Chain, res, MachinePointerInfo(),
13687                       false, false, false, 0);
13688
13689     // Get the offset of start of .tls section
13690     SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
13691                                              GA->getValueType(0),
13692                                              GA->getOffset(), X86II::MO_SECREL);
13693     SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), TGA);
13694
13695     // The address of the thread local variable is the add of the thread
13696     // pointer with the offset of the variable.
13697     return DAG.getNode(ISD::ADD, dl, getPointerTy(), res, Offset);
13698   }
13699
13700   llvm_unreachable("TLS not implemented for this target.");
13701 }
13702
13703 /// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values
13704 /// and take a 2 x i32 value to shift plus a shift amount.
13705 static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
13706   assert(Op.getNumOperands() == 3 && "Not a double-shift!");
13707   MVT VT = Op.getSimpleValueType();
13708   unsigned VTBits = VT.getSizeInBits();
13709   SDLoc dl(Op);
13710   bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
13711   SDValue ShOpLo = Op.getOperand(0);
13712   SDValue ShOpHi = Op.getOperand(1);
13713   SDValue ShAmt  = Op.getOperand(2);
13714   // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the
13715   // generic ISD nodes haven't. Insert an AND to be safe, it's optimized away
13716   // during isel.
13717   SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
13718                                   DAG.getConstant(VTBits - 1, MVT::i8));
13719   SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
13720                                      DAG.getConstant(VTBits - 1, MVT::i8))
13721                        : DAG.getConstant(0, VT);
13722
13723   SDValue Tmp2, Tmp3;
13724   if (Op.getOpcode() == ISD::SHL_PARTS) {
13725     Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
13726     Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);
13727   } else {
13728     Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
13729     Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
13730   }
13731
13732   // If the shift amount is larger or equal than the width of a part we can't
13733   // rely on the results of shld/shrd. Insert a test and select the appropriate
13734   // values for large shift amounts.
13735   SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
13736                                 DAG.getConstant(VTBits, MVT::i8));
13737   SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
13738                              AndNode, DAG.getConstant(0, MVT::i8));
13739
13740   SDValue Hi, Lo;
13741   SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8);
13742   SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
13743   SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };
13744
13745   if (Op.getOpcode() == ISD::SHL_PARTS) {
13746     Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
13747     Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
13748   } else {
13749     Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
13750     Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
13751   }
13752
13753   SDValue Ops[2] = { Lo, Hi };
13754   return DAG.getMergeValues(Ops, dl);
13755 }
13756
13757 SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
13758                                            SelectionDAG &DAG) const {
13759   MVT SrcVT = Op.getOperand(0).getSimpleValueType();
13760   SDLoc dl(Op);
13761
13762   if (SrcVT.isVector()) {
13763     if (SrcVT.getVectorElementType() == MVT::i1) {
13764       MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
13765       return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
13766                          DAG.getNode(ISD::SIGN_EXTEND, dl, IntegerVT,
13767                                      Op.getOperand(0)));
13768     }
13769     return SDValue();
13770   }
13771
13772   assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
13773          "Unknown SINT_TO_FP to lower!");
13774
13775   // These are really Legal; return the operand so the caller accepts it as
13776   // Legal.
13777   if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
13778     return Op;
13779   if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
13780       Subtarget->is64Bit()) {
13781     return Op;
13782   }
13783
13784   unsigned Size = SrcVT.getSizeInBits()/8;
13785   MachineFunction &MF = DAG.getMachineFunction();
13786   int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false);
13787   SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
13788   SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
13789                                StackSlot,
13790                                MachinePointerInfo::getFixedStack(SSFI),
13791                                false, false, 0);
13792   return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
13793 }
13794
13795 SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
13796                                      SDValue StackSlot,
13797                                      SelectionDAG &DAG) const {
13798   // Build the FILD
13799   SDLoc DL(Op);
13800   SDVTList Tys;
13801   bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
13802   if (useSSE)
13803     Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
13804   else
13805     Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
13806
13807   unsigned ByteSize = SrcVT.getSizeInBits()/8;
13808
13809   FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
13810   MachineMemOperand *MMO;
13811   if (FI) {
13812     int SSFI = FI->getIndex();
13813     MMO =
13814       DAG.getMachineFunction()
13815       .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
13816                             MachineMemOperand::MOLoad, ByteSize, ByteSize);
13817   } else {
13818     MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
13819     StackSlot = StackSlot.getOperand(1);
13820   }
13821   SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
13822   SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
13823                                            X86ISD::FILD, DL,
13824                                            Tys, Ops, SrcVT, MMO);
13825
13826   if (useSSE) {
13827     Chain = Result.getValue(1);
13828     SDValue InFlag = Result.getValue(2);
13829
13830     // FIXME: Currently the FST is flagged to the FILD_FLAG. This
13831     // shouldn't be necessary except that RFP cannot be live across
13832     // multiple blocks. When stackifier is fixed, they can be uncoupled.
13833     MachineFunction &MF = DAG.getMachineFunction();
13834     unsigned SSFISize = Op.getValueType().getSizeInBits()/8;
13835     int SSFI = MF.getFrameInfo()->CreateStackObject(SSFISize, SSFISize, false);
13836     SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
13837     Tys = DAG.getVTList(MVT::Other);
13838     SDValue Ops[] = {
13839       Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
13840     };
13841     MachineMemOperand *MMO =
13842       DAG.getMachineFunction()
13843       .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
13844                             MachineMemOperand::MOStore, SSFISize, SSFISize);
13845
13846     Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
13847                                     Ops, Op.getValueType(), MMO);
13848     Result = DAG.getLoad(Op.getValueType(), DL, Chain, StackSlot,
13849                          MachinePointerInfo::getFixedStack(SSFI),
13850                          false, false, false, 0);
13851   }
13852
13853   return Result;
13854 }
13855
13856 // LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion.
13857 SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
13858                                                SelectionDAG &DAG) const {
13859   // This algorithm is not obvious. Here it is what we're trying to output:
13860   /*
13861      movq       %rax,  %xmm0
13862      punpckldq  (c0),  %xmm0  // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
13863      subpd      (c1),  %xmm0  // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
13864      #ifdef __SSE3__
13865        haddpd   %xmm0, %xmm0
13866      #else
13867        pshufd   $0x4e, %xmm0, %xmm1
13868        addpd    %xmm1, %xmm0
13869      #endif
13870   */
13871
13872   SDLoc dl(Op);
13873   LLVMContext *Context = DAG.getContext();
13874
13875   // Build some magic constants.
13876   static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
13877   Constant *C0 = ConstantDataVector::get(*Context, CV0);
13878   SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16);
13879
13880   SmallVector<Constant*,2> CV1;
13881   CV1.push_back(
13882     ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
13883                                       APInt(64, 0x4330000000000000ULL))));
13884   CV1.push_back(
13885     ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
13886                                       APInt(64, 0x4530000000000000ULL))));
13887   Constant *C1 = ConstantVector::get(CV1);
13888   SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16);
13889
13890   // Load the 64-bit value into an XMM register.
13891   SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
13892                             Op.getOperand(0));
13893   SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
13894                               MachinePointerInfo::getConstantPool(),
13895                               false, false, false, 16);
13896   SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32,
13897                               DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, XR1),
13898                               CLod0);
13899
13900   SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
13901                               MachinePointerInfo::getConstantPool(),
13902                               false, false, false, 16);
13903   SDValue XR2F = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Unpck1);
13904   SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
13905   SDValue Result;
13906
13907   if (Subtarget->hasSSE3()) {
13908     // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
13909     Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
13910   } else {
13911     SDValue S2F = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Sub);
13912     SDValue Shuffle = getTargetShuffleNode(X86ISD::PSHUFD, dl, MVT::v4i32,
13913                                            S2F, 0x4E, DAG);
13914     Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
13915                          DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Shuffle),
13916                          Sub);
13917   }
13918
13919   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
13920                      DAG.getIntPtrConstant(0));
13921 }
13922
13923 // LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion.
13924 SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
13925                                                SelectionDAG &DAG) const {
13926   SDLoc dl(Op);
13927   // FP constant to bias correct the final result.
13928   SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
13929                                    MVT::f64);
13930
13931   // Load the 32-bit value into an XMM register.
13932   SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
13933                              Op.getOperand(0));
13934
13935   // Zero out the upper parts of the register.
13936   Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
13937
13938   Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
13939                      DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Load),
13940                      DAG.getIntPtrConstant(0));
13941
13942   // Or the load with the bias.
13943   SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64,
13944                            DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
13945                                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
13946                                                    MVT::v2f64, Load)),
13947                            DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
13948                                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
13949                                                    MVT::v2f64, Bias)));
13950   Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
13951                    DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or),
13952                    DAG.getIntPtrConstant(0));
13953
13954   // Subtract the bias.
13955   SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
13956
13957   // Handle final rounding.
13958   EVT DestVT = Op.getValueType();
13959
13960   if (DestVT.bitsLT(MVT::f64))
13961     return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
13962                        DAG.getIntPtrConstant(0));
13963   if (DestVT.bitsGT(MVT::f64))
13964     return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
13965
13966   // Handle final rounding.
13967   return Sub;
13968 }
13969
13970 static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
13971                                      const X86Subtarget &Subtarget) {
13972   // The algorithm is the following:
13973   // #ifdef __SSE4_1__
13974   //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
13975   //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
13976   //                                 (uint4) 0x53000000, 0xaa);
13977   // #else
13978   //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
13979   //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
13980   // #endif
13981   //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
13982   //     return (float4) lo + fhi;
13983
13984   SDLoc DL(Op);
13985   SDValue V = Op->getOperand(0);
13986   EVT VecIntVT = V.getValueType();
13987   bool Is128 = VecIntVT == MVT::v4i32;
13988   EVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
13989   // If we convert to something else than the supported type, e.g., to v4f64,
13990   // abort early.
13991   if (VecFloatVT != Op->getValueType(0))
13992     return SDValue();
13993
13994   unsigned NumElts = VecIntVT.getVectorNumElements();
13995   assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
13996          "Unsupported custom type");
13997   assert(NumElts <= 8 && "The size of the constant array must be fixed");
13998
13999   // In the #idef/#else code, we have in common:
14000   // - The vector of constants:
14001   // -- 0x4b000000
14002   // -- 0x53000000
14003   // - A shift:
14004   // -- v >> 16
14005
14006   // Create the splat vector for 0x4b000000.
14007   SDValue CstLow = DAG.getConstant(0x4b000000, MVT::i32);
14008   SDValue CstLowArray[] = {CstLow, CstLow, CstLow, CstLow,
14009                            CstLow, CstLow, CstLow, CstLow};
14010   SDValue VecCstLow = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT,
14011                                   makeArrayRef(&CstLowArray[0], NumElts));
14012   // Create the splat vector for 0x53000000.
14013   SDValue CstHigh = DAG.getConstant(0x53000000, MVT::i32);
14014   SDValue CstHighArray[] = {CstHigh, CstHigh, CstHigh, CstHigh,
14015                             CstHigh, CstHigh, CstHigh, CstHigh};
14016   SDValue VecCstHigh = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT,
14017                                    makeArrayRef(&CstHighArray[0], NumElts));
14018
14019   // Create the right shift.
14020   SDValue CstShift = DAG.getConstant(16, MVT::i32);
14021   SDValue CstShiftArray[] = {CstShift, CstShift, CstShift, CstShift,
14022                              CstShift, CstShift, CstShift, CstShift};
14023   SDValue VecCstShift = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT,
14024                                     makeArrayRef(&CstShiftArray[0], NumElts));
14025   SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
14026
14027   SDValue Low, High;
14028   if (Subtarget.hasSSE41()) {
14029     EVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
14030     //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
14031     SDValue VecCstLowBitcast =
14032         DAG.getNode(ISD::BITCAST, DL, VecI16VT, VecCstLow);
14033     SDValue VecBitcast = DAG.getNode(ISD::BITCAST, DL, VecI16VT, V);
14034     // Low will be bitcasted right away, so do not bother bitcasting back to its
14035     // original type.
14036     Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
14037                       VecCstLowBitcast, DAG.getConstant(0xaa, MVT::i32));
14038     //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
14039     //                                 (uint4) 0x53000000, 0xaa);
14040     SDValue VecCstHighBitcast =
14041         DAG.getNode(ISD::BITCAST, DL, VecI16VT, VecCstHigh);
14042     SDValue VecShiftBitcast =
14043         DAG.getNode(ISD::BITCAST, DL, VecI16VT, HighShift);
14044     // High will be bitcasted right away, so do not bother bitcasting back to
14045     // its original type.
14046     High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
14047                        VecCstHighBitcast, DAG.getConstant(0xaa, MVT::i32));
14048   } else {
14049     SDValue CstMask = DAG.getConstant(0xffff, MVT::i32);
14050     SDValue VecCstMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT, CstMask,
14051                                      CstMask, CstMask, CstMask);
14052     //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
14053     SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
14054     Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
14055
14056     //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
14057     High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
14058   }
14059
14060   // Create the vector constant for -(0x1.0p39f + 0x1.0p23f).
14061   SDValue CstFAdd = DAG.getConstantFP(
14062       APFloat(APFloat::IEEEsingle, APInt(32, 0xD3000080)), MVT::f32);
14063   SDValue CstFAddArray[] = {CstFAdd, CstFAdd, CstFAdd, CstFAdd,
14064                             CstFAdd, CstFAdd, CstFAdd, CstFAdd};
14065   SDValue VecCstFAdd = DAG.getNode(ISD::BUILD_VECTOR, DL, VecFloatVT,
14066                                    makeArrayRef(&CstFAddArray[0], NumElts));
14067
14068   //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
14069   SDValue HighBitcast = DAG.getNode(ISD::BITCAST, DL, VecFloatVT, High);
14070   SDValue FHigh =
14071       DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd);
14072   //     return (float4) lo + fhi;
14073   SDValue LowBitcast = DAG.getNode(ISD::BITCAST, DL, VecFloatVT, Low);
14074   return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
14075 }
14076
14077 SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
14078                                                SelectionDAG &DAG) const {
14079   SDValue N0 = Op.getOperand(0);
14080   MVT SVT = N0.getSimpleValueType();
14081   SDLoc dl(Op);
14082
14083   switch (SVT.SimpleTy) {
14084   default:
14085     llvm_unreachable("Custom UINT_TO_FP is not supported!");
14086   case MVT::v4i8:
14087   case MVT::v4i16:
14088   case MVT::v8i8:
14089   case MVT::v8i16: {
14090     MVT NVT = MVT::getVectorVT(MVT::i32, SVT.getVectorNumElements());
14091     return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
14092                        DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0));
14093   }
14094   case MVT::v4i32:
14095   case MVT::v8i32:
14096     return lowerUINT_TO_FP_vXi32(Op, DAG, *Subtarget);
14097   }
14098   llvm_unreachable(nullptr);
14099 }
14100
14101 SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
14102                                            SelectionDAG &DAG) const {
14103   SDValue N0 = Op.getOperand(0);
14104   SDLoc dl(Op);
14105
14106   if (Op.getValueType().isVector())
14107     return lowerUINT_TO_FP_vec(Op, DAG);
14108
14109   // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
14110   // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
14111   // the optimization here.
14112   if (DAG.SignBitIsZero(N0))
14113     return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);
14114
14115   MVT SrcVT = N0.getSimpleValueType();
14116   MVT DstVT = Op.getSimpleValueType();
14117   if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
14118     return LowerUINT_TO_FP_i64(Op, DAG);
14119   if (SrcVT == MVT::i32 && X86ScalarSSEf64)
14120     return LowerUINT_TO_FP_i32(Op, DAG);
14121   if (Subtarget->is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
14122     return SDValue();
14123
14124   // Make a 64-bit buffer, and use it to build an FILD.
14125   SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
14126   if (SrcVT == MVT::i32) {
14127     SDValue WordOff = DAG.getConstant(4, getPointerTy());
14128     SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl,
14129                                      getPointerTy(), StackSlot, WordOff);
14130     SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
14131                                   StackSlot, MachinePointerInfo(),
14132                                   false, false, 0);
14133     SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32),
14134                                   OffsetSlot, MachinePointerInfo(),
14135                                   false, false, 0);
14136     SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
14137     return Fild;
14138   }
14139
14140   assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
14141   SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
14142                                StackSlot, MachinePointerInfo(),
14143                                false, false, 0);
14144   // For i64 source, we need to add the appropriate power of 2 if the input
14145   // was negative.  This is the same as the optimization in
14146   // DAGTypeLegalizer::ExpandIntOp_UNIT_TO_FP, and for it to be safe here,
14147   // we must be careful to do the computation in x87 extended precision, not
14148   // in SSE. (The generic code can't know it's OK to do this, or how to.)
14149   int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
14150   MachineMemOperand *MMO =
14151     DAG.getMachineFunction()
14152     .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
14153                           MachineMemOperand::MOLoad, 8, 8);
14154
14155   SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
14156   SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
14157   SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
14158                                          MVT::i64, MMO);
14159
14160   APInt FF(32, 0x5F800000ULL);
14161
14162   // Check whether the sign bit is set.
14163   SDValue SignSet = DAG.getSetCC(dl,
14164                                  getSetCCResultType(*DAG.getContext(), MVT::i64),
14165                                  Op.getOperand(0), DAG.getConstant(0, MVT::i64),
14166                                  ISD::SETLT);
14167
14168   // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
14169   SDValue FudgePtr = DAG.getConstantPool(
14170                              ConstantInt::get(*DAG.getContext(), FF.zext(64)),
14171                                          getPointerTy());
14172
14173   // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
14174   SDValue Zero = DAG.getIntPtrConstant(0);
14175   SDValue Four = DAG.getIntPtrConstant(4);
14176   SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet,
14177                                Zero, Four);
14178   FudgePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(), FudgePtr, Offset);
14179
14180   // Load the value out, extending it from f32 to f80.
14181   // FIXME: Avoid the extend by constructing the right constant pool?
14182   SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(),
14183                                  FudgePtr, MachinePointerInfo::getConstantPool(),
14184                                  MVT::f32, false, false, false, 4);
14185   // Extend everything to 80 bits to force it to be done on x87.
14186   SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
14187   return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0));
14188 }
14189
14190 std::pair<SDValue,SDValue>
14191 X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
14192                                     bool IsSigned, bool IsReplace) const {
14193   SDLoc DL(Op);
14194
14195   EVT DstTy = Op.getValueType();
14196
14197   if (!IsSigned && !isIntegerTypeFTOL(DstTy)) {
14198     assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
14199     DstTy = MVT::i64;
14200   }
14201
14202   assert(DstTy.getSimpleVT() <= MVT::i64 &&
14203          DstTy.getSimpleVT() >= MVT::i16 &&
14204          "Unknown FP_TO_INT to lower!");
14205
14206   // These are really Legal.
14207   if (DstTy == MVT::i32 &&
14208       isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
14209     return std::make_pair(SDValue(), SDValue());
14210   if (Subtarget->is64Bit() &&
14211       DstTy == MVT::i64 &&
14212       isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
14213     return std::make_pair(SDValue(), SDValue());
14214
14215   // We lower FP->int64 either into FISTP64 followed by a load from a temporary
14216   // stack slot, or into the FTOL runtime function.
14217   MachineFunction &MF = DAG.getMachineFunction();
14218   unsigned MemSize = DstTy.getSizeInBits()/8;
14219   int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
14220   SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
14221
14222   unsigned Opc;
14223   if (!IsSigned && isIntegerTypeFTOL(DstTy))
14224     Opc = X86ISD::WIN_FTOL;
14225   else
14226     switch (DstTy.getSimpleVT().SimpleTy) {
14227     default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
14228     case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
14229     case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
14230     case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
14231     }
14232
14233   SDValue Chain = DAG.getEntryNode();
14234   SDValue Value = Op.getOperand(0);
14235   EVT TheVT = Op.getOperand(0).getValueType();
14236   // FIXME This causes a redundant load/store if the SSE-class value is already
14237   // in memory, such as if it is on the callstack.
14238   if (isScalarFPTypeInSSEReg(TheVT)) {
14239     assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
14240     Chain = DAG.getStore(Chain, DL, Value, StackSlot,
14241                          MachinePointerInfo::getFixedStack(SSFI),
14242                          false, false, 0);
14243     SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
14244     SDValue Ops[] = {
14245       Chain, StackSlot, DAG.getValueType(TheVT)
14246     };
14247
14248     MachineMemOperand *MMO =
14249       MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
14250                               MachineMemOperand::MOLoad, MemSize, MemSize);
14251     Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO);
14252     Chain = Value.getValue(1);
14253     SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
14254     StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
14255   }
14256
14257   MachineMemOperand *MMO =
14258     MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
14259                             MachineMemOperand::MOStore, MemSize, MemSize);
14260
14261   if (Opc != X86ISD::WIN_FTOL) {
14262     // Build the FP_TO_INT*_IN_MEM
14263     SDValue Ops[] = { Chain, Value, StackSlot };
14264     SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
14265                                            Ops, DstTy, MMO);
14266     return std::make_pair(FIST, StackSlot);
14267   } else {
14268     SDValue ftol = DAG.getNode(X86ISD::WIN_FTOL, DL,
14269       DAG.getVTList(MVT::Other, MVT::Glue),
14270       Chain, Value);
14271     SDValue eax = DAG.getCopyFromReg(ftol, DL, X86::EAX,
14272       MVT::i32, ftol.getValue(1));
14273     SDValue edx = DAG.getCopyFromReg(eax.getValue(1), DL, X86::EDX,
14274       MVT::i32, eax.getValue(2));
14275     SDValue Ops[] = { eax, edx };
14276     SDValue pair = IsReplace
14277       ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops)
14278       : DAG.getMergeValues(Ops, DL);
14279     return std::make_pair(pair, SDValue());
14280   }
14281 }
14282
14283 static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
14284                               const X86Subtarget *Subtarget) {
14285   MVT VT = Op->getSimpleValueType(0);
14286   SDValue In = Op->getOperand(0);
14287   MVT InVT = In.getSimpleValueType();
14288   SDLoc dl(Op);
14289
14290   // Optimize vectors in AVX mode:
14291   //
14292   //   v8i16 -> v8i32
14293   //   Use vpunpcklwd for 4 lower elements  v8i16 -> v4i32.
14294   //   Use vpunpckhwd for 4 upper elements  v8i16 -> v4i32.
14295   //   Concat upper and lower parts.
14296   //
14297   //   v4i32 -> v4i64
14298   //   Use vpunpckldq for 4 lower elements  v4i32 -> v2i64.
14299   //   Use vpunpckhdq for 4 upper elements  v4i32 -> v2i64.
14300   //   Concat upper and lower parts.
14301   //
14302
14303   if (((VT != MVT::v16i16) || (InVT != MVT::v16i8)) &&
14304       ((VT != MVT::v8i32) || (InVT != MVT::v8i16)) &&
14305       ((VT != MVT::v4i64) || (InVT != MVT::v4i32)))
14306     return SDValue();
14307
14308   if (Subtarget->hasInt256())
14309     return DAG.getNode(X86ISD::VZEXT, dl, VT, In);
14310
14311   SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl);
14312   SDValue Undef = DAG.getUNDEF(InVT);
14313   bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND;
14314   SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
14315   SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
14316
14317   MVT HVT = MVT::getVectorVT(VT.getVectorElementType(),
14318                              VT.getVectorNumElements()/2);
14319
14320   OpLo = DAG.getNode(ISD::BITCAST, dl, HVT, OpLo);
14321   OpHi = DAG.getNode(ISD::BITCAST, dl, HVT, OpHi);
14322
14323   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
14324 }
14325
14326 static  SDValue LowerZERO_EXTEND_AVX512(SDValue Op,
14327                                         SelectionDAG &DAG) {
14328   MVT VT = Op->getSimpleValueType(0);
14329   SDValue In = Op->getOperand(0);
14330   MVT InVT = In.getSimpleValueType();
14331   SDLoc DL(Op);
14332   unsigned int NumElts = VT.getVectorNumElements();
14333   if (NumElts != 8 && NumElts != 16)
14334     return SDValue();
14335
14336   if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1)
14337     return DAG.getNode(X86ISD::VZEXT, DL, VT, In);
14338
14339   EVT ExtVT = (NumElts == 8)? MVT::v8i64 : MVT::v16i32;
14340   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14341   // Now we have only mask extension
14342   assert(InVT.getVectorElementType() == MVT::i1);
14343   SDValue Cst = DAG.getTargetConstant(1, ExtVT.getScalarType());
14344   const Constant *C = (dyn_cast<ConstantSDNode>(Cst))->getConstantIntValue();
14345   SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
14346   unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
14347   SDValue Ld = DAG.getLoad(Cst.getValueType(), DL, DAG.getEntryNode(), CP,
14348                            MachinePointerInfo::getConstantPool(),
14349                            false, false, false, Alignment);
14350
14351   SDValue Brcst = DAG.getNode(X86ISD::VBROADCASTM, DL, ExtVT, In, Ld);
14352   if (VT.is512BitVector())
14353     return Brcst;
14354   return DAG.getNode(X86ISD::VTRUNC, DL, VT, Brcst);
14355 }
14356
14357 static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
14358                                SelectionDAG &DAG) {
14359   if (Subtarget->hasFp256()) {
14360     SDValue Res = LowerAVXExtend(Op, DAG, Subtarget);
14361     if (Res.getNode())
14362       return Res;
14363   }
14364
14365   return SDValue();
14366 }
14367
14368 static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
14369                                 SelectionDAG &DAG) {
14370   SDLoc DL(Op);
14371   MVT VT = Op.getSimpleValueType();
14372   SDValue In = Op.getOperand(0);
14373   MVT SVT = In.getSimpleValueType();
14374
14375   if (VT.is512BitVector() || SVT.getVectorElementType() == MVT::i1)
14376     return LowerZERO_EXTEND_AVX512(Op, DAG);
14377
14378   if (Subtarget->hasFp256()) {
14379     SDValue Res = LowerAVXExtend(Op, DAG, Subtarget);
14380     if (Res.getNode())
14381       return Res;
14382   }
14383
14384   assert(!VT.is256BitVector() || !SVT.is128BitVector() ||
14385          VT.getVectorNumElements() != SVT.getVectorNumElements());
14386   return SDValue();
14387 }
14388
14389 SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
14390   SDLoc DL(Op);
14391   MVT VT = Op.getSimpleValueType();
14392   SDValue In = Op.getOperand(0);
14393   MVT InVT = In.getSimpleValueType();
14394
14395   if (VT == MVT::i1) {
14396     assert((InVT.isInteger() && (InVT.getSizeInBits() <= 64)) &&
14397            "Invalid scalar TRUNCATE operation");
14398     if (InVT.getSizeInBits() >= 32)
14399       return SDValue();
14400     In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In);
14401     return DAG.getNode(ISD::TRUNCATE, DL, VT, In);
14402   }
14403   assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
14404          "Invalid TRUNCATE operation");
14405
14406   if (InVT.is512BitVector() || VT.getVectorElementType() == MVT::i1) {
14407     if (VT.getVectorElementType().getSizeInBits() >=8)
14408       return DAG.getNode(X86ISD::VTRUNC, DL, VT, In);
14409
14410     assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type");
14411     unsigned NumElts = InVT.getVectorNumElements();
14412     assert ((NumElts == 8 || NumElts == 16) && "Unexpected vector type");
14413     if (InVT.getSizeInBits() < 512) {
14414       MVT ExtVT = (NumElts == 16)? MVT::v16i32 : MVT::v8i64;
14415       In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
14416       InVT = ExtVT;
14417     }
14418
14419     SDValue Cst = DAG.getTargetConstant(1, InVT.getVectorElementType());
14420     const Constant *C = (dyn_cast<ConstantSDNode>(Cst))->getConstantIntValue();
14421     SDValue CP = DAG.getConstantPool(C, getPointerTy());
14422     unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
14423     SDValue Ld = DAG.getLoad(Cst.getValueType(), DL, DAG.getEntryNode(), CP,
14424                            MachinePointerInfo::getConstantPool(),
14425                            false, false, false, Alignment);
14426     SDValue OneV = DAG.getNode(X86ISD::VBROADCAST, DL, InVT, Ld);
14427     SDValue And = DAG.getNode(ISD::AND, DL, InVT, OneV, In);
14428     return DAG.getNode(X86ISD::TESTM, DL, VT, And, And);
14429   }
14430
14431   if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
14432     // On AVX2, v4i64 -> v4i32 becomes VPERMD.
14433     if (Subtarget->hasInt256()) {
14434       static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
14435       In = DAG.getNode(ISD::BITCAST, DL, MVT::v8i32, In);
14436       In = DAG.getVectorShuffle(MVT::v8i32, DL, In, DAG.getUNDEF(MVT::v8i32),
14437                                 ShufMask);
14438       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
14439                          DAG.getIntPtrConstant(0));
14440     }
14441
14442     SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
14443                                DAG.getIntPtrConstant(0));
14444     SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
14445                                DAG.getIntPtrConstant(2));
14446     OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpLo);
14447     OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpHi);
14448     static const int ShufMask[] = {0, 2, 4, 6};
14449     return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
14450   }
14451
14452   if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
14453     // On AVX2, v8i32 -> v8i16 becomed PSHUFB.
14454     if (Subtarget->hasInt256()) {
14455       In = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, In);
14456
14457       SmallVector<SDValue,32> pshufbMask;
14458       for (unsigned i = 0; i < 2; ++i) {
14459         pshufbMask.push_back(DAG.getConstant(0x0, MVT::i8));
14460         pshufbMask.push_back(DAG.getConstant(0x1, MVT::i8));
14461         pshufbMask.push_back(DAG.getConstant(0x4, MVT::i8));
14462         pshufbMask.push_back(DAG.getConstant(0x5, MVT::i8));
14463         pshufbMask.push_back(DAG.getConstant(0x8, MVT::i8));
14464         pshufbMask.push_back(DAG.getConstant(0x9, MVT::i8));
14465         pshufbMask.push_back(DAG.getConstant(0xc, MVT::i8));
14466         pshufbMask.push_back(DAG.getConstant(0xd, MVT::i8));
14467         for (unsigned j = 0; j < 8; ++j)
14468           pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
14469       }
14470       SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, pshufbMask);
14471       In = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, In, BV);
14472       In = DAG.getNode(ISD::BITCAST, DL, MVT::v4i64, In);
14473
14474       static const int ShufMask[] = {0,  2,  -1,  -1};
14475       In = DAG.getVectorShuffle(MVT::v4i64, DL,  In, DAG.getUNDEF(MVT::v4i64),
14476                                 &ShufMask[0]);
14477       In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
14478                        DAG.getIntPtrConstant(0));
14479       return DAG.getNode(ISD::BITCAST, DL, VT, In);
14480     }
14481
14482     SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
14483                                DAG.getIntPtrConstant(0));
14484
14485     SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
14486                                DAG.getIntPtrConstant(4));
14487
14488     OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, OpLo);
14489     OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, OpHi);
14490
14491     // The PSHUFB mask:
14492     static const int ShufMask1[] = {0,  1,  4,  5,  8,  9, 12, 13,
14493                                    -1, -1, -1, -1, -1, -1, -1, -1};
14494
14495     SDValue Undef = DAG.getUNDEF(MVT::v16i8);
14496     OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, Undef, ShufMask1);
14497     OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, Undef, ShufMask1);
14498
14499     OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpLo);
14500     OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpHi);
14501
14502     // The MOVLHPS Mask:
14503     static const int ShufMask2[] = {0, 1, 4, 5};
14504     SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
14505     return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, res);
14506   }
14507
14508   // Handle truncation of V256 to V128 using shuffles.
14509   if (!VT.is128BitVector() || !InVT.is256BitVector())
14510     return SDValue();
14511
14512   assert(Subtarget->hasFp256() && "256-bit vector without AVX!");
14513
14514   unsigned NumElems = VT.getVectorNumElements();
14515   MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2);
14516
14517   SmallVector<int, 16> MaskVec(NumElems * 2, -1);
14518   // Prepare truncation shuffle mask
14519   for (unsigned i = 0; i != NumElems; ++i)
14520     MaskVec[i] = i * 2;
14521   SDValue V = DAG.getVectorShuffle(NVT, DL,
14522                                    DAG.getNode(ISD::BITCAST, DL, NVT, In),
14523                                    DAG.getUNDEF(NVT), &MaskVec[0]);
14524   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
14525                      DAG.getIntPtrConstant(0));
14526 }
14527
14528 SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op,
14529                                            SelectionDAG &DAG) const {
14530   assert(!Op.getSimpleValueType().isVector());
14531
14532   std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
14533     /*IsSigned=*/ true, /*IsReplace=*/ false);
14534   SDValue FIST = Vals.first, StackSlot = Vals.second;
14535   // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
14536   if (!FIST.getNode()) return Op;
14537
14538   if (StackSlot.getNode())
14539     // Load the result.
14540     return DAG.getLoad(Op.getValueType(), SDLoc(Op),
14541                        FIST, StackSlot, MachinePointerInfo(),
14542                        false, false, false, 0);
14543
14544   // The node is the result.
14545   return FIST;
14546 }
14547
14548 SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op,
14549                                            SelectionDAG &DAG) const {
14550   std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
14551     /*IsSigned=*/ false, /*IsReplace=*/ false);
14552   SDValue FIST = Vals.first, StackSlot = Vals.second;
14553   assert(FIST.getNode() && "Unexpected failure");
14554
14555   if (StackSlot.getNode())
14556     // Load the result.
14557     return DAG.getLoad(Op.getValueType(), SDLoc(Op),
14558                        FIST, StackSlot, MachinePointerInfo(),
14559                        false, false, false, 0);
14560
14561   // The node is the result.
14562   return FIST;
14563 }
14564
14565 static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
14566   SDLoc DL(Op);
14567   MVT VT = Op.getSimpleValueType();
14568   SDValue In = Op.getOperand(0);
14569   MVT SVT = In.getSimpleValueType();
14570
14571   assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
14572
14573   return DAG.getNode(X86ISD::VFPEXT, DL, VT,
14574                      DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
14575                                  In, DAG.getUNDEF(SVT)));
14576 }
14577
14578 /// The only differences between FABS and FNEG are the mask and the logic op.
14579 /// FNEG also has a folding opportunity for FNEG(FABS(x)).
14580 static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
14581   assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
14582          "Wrong opcode for lowering FABS or FNEG.");
14583
14584   bool IsFABS = (Op.getOpcode() == ISD::FABS);
14585
14586   // If this is a FABS and it has an FNEG user, bail out to fold the combination
14587   // into an FNABS. We'll lower the FABS after that if it is still in use.
14588   if (IsFABS)
14589     for (SDNode *User : Op->uses())
14590       if (User->getOpcode() == ISD::FNEG)
14591         return Op;
14592
14593   SDValue Op0 = Op.getOperand(0);
14594   bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
14595
14596   SDLoc dl(Op);
14597   MVT VT = Op.getSimpleValueType();
14598   // Assume scalar op for initialization; update for vector if needed.
14599   // Note that there are no scalar bitwise logical SSE/AVX instructions, so we
14600   // generate a 16-byte vector constant and logic op even for the scalar case.
14601   // Using a 16-byte mask allows folding the load of the mask with
14602   // the logic op, so it can save (~4 bytes) on code size.
14603   MVT EltVT = VT;
14604   unsigned NumElts = VT == MVT::f64 ? 2 : 4;
14605   // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
14606   // decide if we should generate a 16-byte constant mask when we only need 4 or
14607   // 8 bytes for the scalar case.
14608   if (VT.isVector()) {
14609     EltVT = VT.getVectorElementType();
14610     NumElts = VT.getVectorNumElements();
14611   }
14612
14613   unsigned EltBits = EltVT.getSizeInBits();
14614   LLVMContext *Context = DAG.getContext();
14615   // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
14616   APInt MaskElt =
14617     IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignBit(EltBits);
14618   Constant *C = ConstantInt::get(*Context, MaskElt);
14619   C = ConstantVector::getSplat(NumElts, C);
14620   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14621   SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy());
14622   unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
14623   SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
14624                              MachinePointerInfo::getConstantPool(),
14625                              false, false, false, Alignment);
14626
14627   if (VT.isVector()) {
14628     // For a vector, cast operands to a vector type, perform the logic op,
14629     // and cast the result back to the original value type.
14630     MVT VecVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
14631     SDValue MaskCasted = DAG.getNode(ISD::BITCAST, dl, VecVT, Mask);
14632     SDValue Operand = IsFNABS ?
14633       DAG.getNode(ISD::BITCAST, dl, VecVT, Op0.getOperand(0)) :
14634       DAG.getNode(ISD::BITCAST, dl, VecVT, Op0);
14635     unsigned BitOp = IsFABS ? ISD::AND : IsFNABS ? ISD::OR : ISD::XOR;
14636     return DAG.getNode(ISD::BITCAST, dl, VT,
14637                        DAG.getNode(BitOp, dl, VecVT, Operand, MaskCasted));
14638   }
14639
14640   // If not vector, then scalar.
14641   unsigned BitOp = IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
14642   SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
14643   return DAG.getNode(BitOp, dl, VT, Operand, Mask);
14644 }
14645
14646 static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
14647   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14648   LLVMContext *Context = DAG.getContext();
14649   SDValue Op0 = Op.getOperand(0);
14650   SDValue Op1 = Op.getOperand(1);
14651   SDLoc dl(Op);
14652   MVT VT = Op.getSimpleValueType();
14653   MVT SrcVT = Op1.getSimpleValueType();
14654
14655   // If second operand is smaller, extend it first.
14656   if (SrcVT.bitsLT(VT)) {
14657     Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1);
14658     SrcVT = VT;
14659   }
14660   // And if it is bigger, shrink it first.
14661   if (SrcVT.bitsGT(VT)) {
14662     Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1));
14663     SrcVT = VT;
14664   }
14665
14666   // At this point the operands and the result should have the same
14667   // type, and that won't be f80 since that is not custom lowered.
14668
14669   const fltSemantics &Sem =
14670       VT == MVT::f64 ? APFloat::IEEEdouble : APFloat::IEEEsingle;
14671   const unsigned SizeInBits = VT.getSizeInBits();
14672
14673   SmallVector<Constant *, 4> CV(
14674       VT == MVT::f64 ? 2 : 4,
14675       ConstantFP::get(*Context, APFloat(Sem, APInt(SizeInBits, 0))));
14676
14677   // First, clear all bits but the sign bit from the second operand (sign).
14678   CV[0] = ConstantFP::get(*Context,
14679                           APFloat(Sem, APInt::getHighBitsSet(SizeInBits, 1)));
14680   Constant *C = ConstantVector::get(CV);
14681   SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(), 16);
14682   SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx,
14683                               MachinePointerInfo::getConstantPool(),
14684                               false, false, false, 16);
14685   SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1);
14686
14687   // Next, clear the sign bit from the first operand (magnitude).
14688   // If it's a constant, we can clear it here.
14689   if (ConstantFPSDNode *Op0CN = dyn_cast<ConstantFPSDNode>(Op0)) {
14690     APFloat APF = Op0CN->getValueAPF();
14691     // If the magnitude is a positive zero, the sign bit alone is enough.
14692     if (APF.isPosZero())
14693       return SignBit;
14694     APF.clearSign();
14695     CV[0] = ConstantFP::get(*Context, APF);
14696   } else {
14697     CV[0] = ConstantFP::get(
14698         *Context,
14699         APFloat(Sem, APInt::getLowBitsSet(SizeInBits, SizeInBits - 1)));
14700   }
14701   C = ConstantVector::get(CV);
14702   CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(), 16);
14703   SDValue Val = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
14704                             MachinePointerInfo::getConstantPool(),
14705                             false, false, false, 16);
14706   // If the magnitude operand wasn't a constant, we need to AND out the sign.
14707   if (!isa<ConstantFPSDNode>(Op0))
14708     Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Val);
14709
14710   // OR the magnitude value with the sign bit.
14711   return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit);
14712 }
14713
14714 static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
14715   SDValue N0 = Op.getOperand(0);
14716   SDLoc dl(Op);
14717   MVT VT = Op.getSimpleValueType();
14718
14719   // Lower ISD::FGETSIGN to (AND (X86ISD::FGETSIGNx86 ...) 1).
14720   SDValue xFGETSIGN = DAG.getNode(X86ISD::FGETSIGNx86, dl, VT, N0,
14721                                   DAG.getConstant(1, VT));
14722   return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, VT));
14723 }
14724
14725 // Check whether an OR'd tree is PTEST-able.
14726 static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget *Subtarget,
14727                                       SelectionDAG &DAG) {
14728   assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
14729
14730   if (!Subtarget->hasSSE41())
14731     return SDValue();
14732
14733   if (!Op->hasOneUse())
14734     return SDValue();
14735
14736   SDNode *N = Op.getNode();
14737   SDLoc DL(N);
14738
14739   SmallVector<SDValue, 8> Opnds;
14740   DenseMap<SDValue, unsigned> VecInMap;
14741   SmallVector<SDValue, 8> VecIns;
14742   EVT VT = MVT::Other;
14743
14744   // Recognize a special case where a vector is casted into wide integer to
14745   // test all 0s.
14746   Opnds.push_back(N->getOperand(0));
14747   Opnds.push_back(N->getOperand(1));
14748
14749   for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
14750     SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
14751     // BFS traverse all OR'd operands.
14752     if (I->getOpcode() == ISD::OR) {
14753       Opnds.push_back(I->getOperand(0));
14754       Opnds.push_back(I->getOperand(1));
14755       // Re-evaluate the number of nodes to be traversed.
14756       e += 2; // 2 more nodes (LHS and RHS) are pushed.
14757       continue;
14758     }
14759
14760     // Quit if a non-EXTRACT_VECTOR_ELT
14761     if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
14762       return SDValue();
14763
14764     // Quit if without a constant index.
14765     SDValue Idx = I->getOperand(1);
14766     if (!isa<ConstantSDNode>(Idx))
14767       return SDValue();
14768
14769     SDValue ExtractedFromVec = I->getOperand(0);
14770     DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);
14771     if (M == VecInMap.end()) {
14772       VT = ExtractedFromVec.getValueType();
14773       // Quit if not 128/256-bit vector.
14774       if (!VT.is128BitVector() && !VT.is256BitVector())
14775         return SDValue();
14776       // Quit if not the same type.
14777       if (VecInMap.begin() != VecInMap.end() &&
14778           VT != VecInMap.begin()->first.getValueType())
14779         return SDValue();
14780       M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
14781       VecIns.push_back(ExtractedFromVec);
14782     }
14783     M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
14784   }
14785
14786   assert((VT.is128BitVector() || VT.is256BitVector()) &&
14787          "Not extracted from 128-/256-bit vector.");
14788
14789   unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;
14790
14791   for (DenseMap<SDValue, unsigned>::const_iterator
14792         I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
14793     // Quit if not all elements are used.
14794     if (I->second != FullMask)
14795       return SDValue();
14796   }
14797
14798   EVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
14799
14800   // Cast all vectors into TestVT for PTEST.
14801   for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
14802     VecIns[i] = DAG.getNode(ISD::BITCAST, DL, TestVT, VecIns[i]);
14803
14804   // If more than one full vectors are evaluated, OR them first before PTEST.
14805   for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
14806     // Each iteration will OR 2 nodes and append the result until there is only
14807     // 1 node left, i.e. the final OR'd value of all vectors.
14808     SDValue LHS = VecIns[Slot];
14809     SDValue RHS = VecIns[Slot + 1];
14810     VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
14811   }
14812
14813   return DAG.getNode(X86ISD::PTEST, DL, MVT::i32,
14814                      VecIns.back(), VecIns.back());
14815 }
14816
14817 /// \brief return true if \c Op has a use that doesn't just read flags.
14818 static bool hasNonFlagsUse(SDValue Op) {
14819   for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
14820        ++UI) {
14821     SDNode *User = *UI;
14822     unsigned UOpNo = UI.getOperandNo();
14823     if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
14824       // Look pass truncate.
14825       UOpNo = User->use_begin().getOperandNo();
14826       User = *User->use_begin();
14827     }
14828
14829     if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
14830         !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
14831       return true;
14832   }
14833   return false;
14834 }
14835
/// Emit nodes that will be selected as "test Op0,Op0", or something
/// equivalent.
///
/// \param Op     The value to be compared against zero.
/// \param X86CC  The X86 condition code the resulting flags are consumed
///               under; it determines whether CF and/or OF must be valid.
/// \param dl     Debug location for the nodes created here.
/// \param DAG    The DAG in which nodes are created (and, in some paths,
///               in which uses of \p Op are replaced).
/// \returns An i32 flags value: an X86ISD::CMP of the operand against zero,
///          result #1 of a flag-producing X86ISD arithmetic node, or the
///          PTEST produced by LowerVectorAllZeroTest.
SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl,
                                    SelectionDAG &DAG) const {
  if (Op.getValueType() == MVT::i1)
    // KORTEST instruction should be selected
    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
                       DAG.getConstant(0, Op.getValueType()));

  // CF and OF aren't always set the way we want. Determine which
  // of these we need.
  bool NeedCF = false;
  bool NeedOF = false;
  switch (X86CC) {
  default: break;
  case X86::COND_A: case X86::COND_AE:
  case X86::COND_B: case X86::COND_BE:
    NeedCF = true;
    break;
  case X86::COND_G: case X86::COND_GE:
  case X86::COND_L: case X86::COND_LE:
  case X86::COND_O: case X86::COND_NO: {
    // Check if we really need to set the
    // Overflow flag. If NoSignedWrap is present
    // that is not actually needed.
    switch (Op->getOpcode()) {
    case ISD::ADD:
    case ISD::SUB:
    case ISD::MUL:
    case ISD::SHL: {
      const BinaryWithFlagsSDNode *BinNode =
          cast<BinaryWithFlagsSDNode>(Op.getNode());
      if (BinNode->hasNoSignedWrap())
        break;
      // Without NoSignedWrap, intentionally fall through to the default
      // label below so that NeedOF is set.
    }
    default:
      NeedOF = true;
      break;
    }
    break;
  }
  }
  // See if we can use the EFLAGS value from the operand instead of
  // doing a separate TEST. TEST always sets OF and CF to 0, so unless
  // we prove that the arithmetic won't overflow, we can't use OF or CF.
  if (Op.getResNo() != 0 || NeedOF || NeedCF) {
    // Emit a CMP with 0, which is the TEST pattern.
    //if (Op.getValueType() == MVT::i1)
    //  return DAG.getNode(X86ISD::CMP, dl, MVT::i1, Op,
    //                     DAG.getConstant(0, MVT::i1));
    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
                       DAG.getConstant(0, Op.getValueType()));
  }
  // Opcode stays 0 when no flag-setting arithmetic node is chosen; in that
  // case a plain CMP against zero is emitted at the end. NumOperands is the
  // number of operands of Op forwarded to the new flag-setting node.
  unsigned Opcode = 0;
  unsigned NumOperands = 0;

  // Truncate operations may prevent the merge of the SETCC instruction
  // and the arithmetic instruction before it. Attempt to truncate the operands
  // of the arithmetic instruction and use a reduced bit-width instruction.
  bool NeedTruncation = false;
  SDValue ArithOp = Op;
  if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
    SDValue Arith = Op->getOperand(0);
    // Both the trunc and the arithmetic op need to have one user each.
    if (Arith->hasOneUse())
      switch (Arith.getOpcode()) {
        default: break;
        case ISD::ADD:
        case ISD::SUB:
        case ISD::AND:
        case ISD::OR:
        case ISD::XOR: {
          NeedTruncation = true;
          ArithOp = Arith;
        }
      }
  }

  // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
  // which may be the result of a CAST.  We use the variable 'Op', which is the
  // non-casted variable when we check for possible users.
  switch (ArithOp.getOpcode()) {
  case ISD::ADD:
    // Due to an isel shortcoming, be conservative if this add is likely to be
    // selected as part of a load-modify-store instruction. When the root node
    // in a match is a store, isel doesn't know how to remap non-chain non-flag
    // uses of other nodes in the match, such as the ADD in this case. This
    // leads to the ADD being left around and reselected, with the result being
    // two adds in the output.  Alas, even if none of our users are stores, that
    // doesn't prove we're O.K.  Ergo, if we have any parents that aren't
    // CopyToReg or SETCC, eschew INC/DEC.  A better fix seems to require
    // climbing the DAG back to the root, and it doesn't seem to be worth the
    // effort.
    // Note that the check below also lets STORE users through, in addition to
    // the CopyToReg and SETCC users mentioned above.
    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
         UE = Op.getNode()->use_end(); UI != UE; ++UI)
      if (UI->getOpcode() != ISD::CopyToReg &&
          UI->getOpcode() != ISD::SETCC &&
          UI->getOpcode() != ISD::STORE)
        goto default_case;

    if (ConstantSDNode *C =
        dyn_cast<ConstantSDNode>(ArithOp.getNode()->getOperand(1))) {
      // An add of one will be selected as an INC.
      if (C->getAPIntValue() == 1 && !Subtarget->slowIncDec()) {
        Opcode = X86ISD::INC;
        NumOperands = 1;
        break;
      }

      // An add of negative one (subtract of one) will be selected as a DEC.
      if (C->getAPIntValue().isAllOnesValue() && !Subtarget->slowIncDec()) {
        Opcode = X86ISD::DEC;
        NumOperands = 1;
        break;
      }
    }

    // Otherwise use a regular EFLAGS-setting add.
    Opcode = X86ISD::ADD;
    NumOperands = 2;
    break;
  case ISD::SHL:
  case ISD::SRL:
    // If we have a constant logical shift that's only used in a comparison
    // against zero turn it into an equivalent AND. This allows turning it into
    // a TEST instruction later.
    if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) && Op->hasOneUse() &&
        isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) {
      EVT VT = Op.getValueType();
      unsigned BitWidth = VT.getSizeInBits();
      unsigned ShAmt = Op->getConstantOperandVal(1);
      if (ShAmt >= BitWidth) // Avoid undefined shifts.
        break;
      // The mask keeps exactly the source bits that survive the shift: the
      // high bits for SRL, the low bits for SHL.
      APInt Mask = ArithOp.getOpcode() == ISD::SRL
                       ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt)
                       : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);
      if (!Mask.isSignedIntN(32)) // Avoid large immediates.
        break;
      SDValue New = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0),
                                DAG.getConstant(Mask, VT));
      DAG.ReplaceAllUsesWith(Op, New);
      Op = New;
    }
    // Opcode is still 0 here, so the (possibly rewritten) Op is compared
    // against zero at the end of the function.
    break;

  case ISD::AND:
    // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
    // because a TEST instruction will be better.
    if (!hasNonFlagsUse(Op))
      break;
    // FALL THROUGH
  case ISD::SUB:
  case ISD::OR:
  case ISD::XOR:
    // Due to the ISEL shortcoming noted above, be conservative if this op is
    // likely to be selected as part of a load-modify-store instruction.
    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
           UE = Op.getNode()->use_end(); UI != UE; ++UI)
      if (UI->getOpcode() == ISD::STORE)
        goto default_case;

    // Otherwise use a regular EFLAGS-setting instruction.
    switch (ArithOp.getOpcode()) {
    default: llvm_unreachable("unexpected operator!");
    case ISD::SUB: Opcode = X86ISD::SUB; break;
    case ISD::XOR: Opcode = X86ISD::XOR; break;
    case ISD::AND: Opcode = X86ISD::AND; break;
    case ISD::OR: {
      // For an equality test of an untruncated OR, first try to express the
      // whole OR tree as a PTEST.
      if (!NeedTruncation && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
        SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG);
        if (EFLAGS.getNode())
          return EFLAGS;
      }
      Opcode = X86ISD::OR;
      break;
    }
    }

    NumOperands = 2;
    break;
  case X86ISD::ADD:
  case X86ISD::SUB:
  case X86ISD::INC:
  case X86ISD::DEC:
  case X86ISD::OR:
  case X86ISD::XOR:
  case X86ISD::AND:
    // Already one of the X86ISD arithmetic nodes: hand back its result #1.
    return SDValue(Op.getNode(), 1);
  default:
  default_case:
    // Reached directly or via the gotos above; Opcode remains 0.
    break;
  }

  // If we found that truncation is beneficial, perform the truncation and
  // update 'Op'.
  if (NeedTruncation) {
    EVT VT = Op.getValueType();
    SDValue WideVal = Op->getOperand(0);
    EVT WideVT = WideVal.getValueType();
    unsigned ConvertedOp = 0;
    // Use a target machine opcode to prevent further DAGCombine
    // optimizations that may separate the arithmetic operations
    // from the setcc node.
    switch (WideVal.getOpcode()) {
      default: break;
      case ISD::ADD: ConvertedOp = X86ISD::ADD; break;
      case ISD::SUB: ConvertedOp = X86ISD::SUB; break;
      case ISD::AND: ConvertedOp = X86ISD::AND; break;
      case ISD::OR:  ConvertedOp = X86ISD::OR;  break;
      case ISD::XOR: ConvertedOp = X86ISD::XOR; break;
    }

    if (ConvertedOp) {
      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
      if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
        // Truncate both wide operands and redo the operation at the narrow
        // type; 'Op' now names this narrow node instead of the TRUNCATE.
        SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
        SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
        Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1);
      }
    }
  }

  if (Opcode == 0)
    // Emit a CMP with 0, which is the TEST pattern.
    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
                       DAG.getConstant(0, Op.getValueType()));

  // Build the flag-setting node with two results (the value and i32 flags),
  // forwarding the first NumOperands operands of Op.
  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
  SmallVector<SDValue, 4> Ops;
  for (unsigned i = 0; i != NumOperands; ++i)
    Ops.push_back(Op.getOperand(i));

  SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
  // Redirect users of the old node to the new one and return the new node's
  // flags result (#1).
  DAG.ReplaceAllUsesWith(Op, New);
  return SDValue(New.getNode(), 1);
}
15072
15073 /// Emit nodes that will be selected as "cmp Op0,Op1", or something
15074 /// equivalent.
15075 SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
15076                                    SDLoc dl, SelectionDAG &DAG) const {
15077   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1)) {
15078     if (C->getAPIntValue() == 0)
15079       return EmitTest(Op0, X86CC, dl, DAG);
15080
15081      if (Op0.getValueType() == MVT::i1)
15082        llvm_unreachable("Unexpected comparison operation for MVT::i1 operands");
15083   }
15084
15085   if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
15086        Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
15087     // Do the comparison at i32 if it's smaller, besides the Atom case.
15088     // This avoids subregister aliasing issues. Keep the smaller reference
15089     // if we're optimizing for size, however, as that'll allow better folding
15090     // of memory operations.
15091     if (Op0.getValueType() != MVT::i32 && Op0.getValueType() != MVT::i64 &&
15092         !DAG.getMachineFunction().getFunction()->getAttributes().hasAttribute(
15093              AttributeSet::FunctionIndex, Attribute::MinSize) &&
15094         !Subtarget->isAtom()) {
15095       unsigned ExtendOp =
15096           isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
15097       Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0);
15098       Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1);
15099     }
15100     // Use SUB instead of CMP to enable CSE between SUB and CMP.
15101     SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
15102     SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs,
15103                               Op0, Op1);
15104     return SDValue(Sub.getNode(), 1);
15105   }
15106   return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
15107 }
15108
15109 /// Convert a comparison if required by the subtarget.
15110 SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
15111                                                  SelectionDAG &DAG) const {
15112   // If the subtarget does not support the FUCOMI instruction, floating-point
15113   // comparisons have to be converted.
15114   if (Subtarget->hasCMov() ||
15115       Cmp.getOpcode() != X86ISD::CMP ||
15116       !Cmp.getOperand(0).getValueType().isFloatingPoint() ||
15117       !Cmp.getOperand(1).getValueType().isFloatingPoint())
15118     return Cmp;
15119
15120   // The instruction selector will select an FUCOM instruction instead of
15121   // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
15122   // build an SDNode sequence that transfers the result from FPSW into EFLAGS:
15123   // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
15124   SDLoc dl(Cmp);
15125   SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
15126   SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
15127   SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
15128                             DAG.getConstant(8, MVT::i8));
15129   SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
15130   return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
15131 }
15132
15133 /// The minimum architected relative accuracy is 2^-12. We need one
15134 /// Newton-Raphson step to have a good float result (24 bits of precision).
15135 SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op,
15136                                             DAGCombinerInfo &DCI,
15137                                             unsigned &RefinementSteps,
15138                                             bool &UseOneConstNR) const {
15139   // FIXME: We should use instruction latency models to calculate the cost of
15140   // each potential sequence, but this is very hard to do reliably because
15141   // at least Intel's Core* chips have variable timing based on the number of
15142   // significant digits in the divisor and/or sqrt operand.
15143   if (!Subtarget->useSqrtEst())
15144     return SDValue();
15145
15146   EVT VT = Op.getValueType();
15147
15148   // SSE1 has rsqrtss and rsqrtps.
15149   // TODO: Add support for AVX512 (v16f32).
15150   // It is likely not profitable to do this for f64 because a double-precision
15151   // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
15152   // instructions: convert to single, rsqrtss, convert back to double, refine
15153   // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
15154   // along with FMA, this could be a throughput win.
15155   if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
15156       (Subtarget->hasAVX() && VT == MVT::v8f32)) {
15157     RefinementSteps = 1;
15158     UseOneConstNR = false;
15159     return DCI.DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
15160   }
15161   return SDValue();
15162 }
15163
15164 /// The minimum architected relative accuracy is 2^-12. We need one
15165 /// Newton-Raphson step to have a good float result (24 bits of precision).
15166 SDValue X86TargetLowering::getRecipEstimate(SDValue Op,
15167                                             DAGCombinerInfo &DCI,
15168                                             unsigned &RefinementSteps) const {
15169   // FIXME: We should use instruction latency models to calculate the cost of
15170   // each potential sequence, but this is very hard to do reliably because
15171   // at least Intel's Core* chips have variable timing based on the number of
15172   // significant digits in the divisor.
15173   if (!Subtarget->useReciprocalEst())
15174     return SDValue();
15175
15176   EVT VT = Op.getValueType();
15177
15178   // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
15179   // TODO: Add support for AVX512 (v16f32).
15180   // It is likely not profitable to do this for f64 because a double-precision
15181   // reciprocal estimate with refinement on x86 prior to FMA requires
15182   // 15 instructions: convert to single, rcpss, convert back to double, refine
15183   // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
15184   // along with FMA, this could be a throughput win.
15185   if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
15186       (Subtarget->hasAVX() && VT == MVT::v8f32)) {
15187     RefinementSteps = ReciprocalEstimateRefinementSteps;
15188     return DCI.DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
15189   }
15190   return SDValue();
15191 }
15192
15193 static bool isAllOnes(SDValue V) {
15194   ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
15195   return C && C->isAllOnesValue();
15196 }
15197
15198 /// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node
15199 /// if it's possible.
15200 SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
15201                                      SDLoc dl, SelectionDAG &DAG) const {
15202   SDValue Op0 = And.getOperand(0);
15203   SDValue Op1 = And.getOperand(1);
15204   if (Op0.getOpcode() == ISD::TRUNCATE)
15205     Op0 = Op0.getOperand(0);
15206   if (Op1.getOpcode() == ISD::TRUNCATE)
15207     Op1 = Op1.getOperand(0);
15208
15209   SDValue LHS, RHS;
15210   if (Op1.getOpcode() == ISD::SHL)
15211     std::swap(Op0, Op1);
15212   if (Op0.getOpcode() == ISD::SHL) {
15213     if (ConstantSDNode *And00C = dyn_cast<ConstantSDNode>(Op0.getOperand(0)))
15214       if (And00C->getZExtValue() == 1) {
15215         // If we looked past a truncate, check that it's only truncating away
15216         // known zeros.
15217         unsigned BitWidth = Op0.getValueSizeInBits();
15218         unsigned AndBitWidth = And.getValueSizeInBits();
15219         if (BitWidth > AndBitWidth) {
15220           APInt Zeros, Ones;
15221           DAG.computeKnownBits(Op0, Zeros, Ones);
15222           if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth)
15223             return SDValue();
15224         }
15225         LHS = Op1;
15226         RHS = Op0.getOperand(1);
15227       }
15228   } else if (Op1.getOpcode() == ISD::Constant) {
15229     ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
15230     uint64_t AndRHSVal = AndRHS->getZExtValue();
15231     SDValue AndLHS = Op0;
15232
15233     if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
15234       LHS = AndLHS.getOperand(0);
15235       RHS = AndLHS.getOperand(1);
15236     }
15237
15238     // Use BT if the immediate can't be encoded in a TEST instruction.
15239     if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) {
15240       LHS = AndLHS;
15241       RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), LHS.getValueType());
15242     }
15243   }
15244
15245   if (LHS.getNode()) {
15246     // If LHS is i8, promote it to i32 with any_extend.  There is no i8 BT
15247     // instruction.  Since the shift amount is in-range-or-undefined, we know
15248     // that doing a bittest on the i32 value is ok.  We extend to i32 because
15249     // the encoding for the i16 version is larger than the i32 version.
15250     // Also promote i16 to i32 for performance / code size reason.
15251     if (LHS.getValueType() == MVT::i8 ||
15252         LHS.getValueType() == MVT::i16)
15253       LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);
15254
15255     // If the operand types disagree, extend the shift amount to match.  Since
15256     // BT ignores high bits (like shifts) we can use anyextend.
15257     if (LHS.getValueType() != RHS.getValueType())
15258       RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS);
15259
15260     SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS);
15261     X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
15262     return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
15263                        DAG.getConstant(Cond, MVT::i8), BT);
15264   }
15265
15266   return SDValue();
15267 }
15268
15269 /// \brief - Turns an ISD::CondCode into a value suitable for SSE floating point
15270 /// mask CMPs.
15271 static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
15272                               SDValue &Op1) {
15273   unsigned SSECC;
15274   bool Swap = false;
15275
15276   // SSE Condition code mapping:
15277   //  0 - EQ
15278   //  1 - LT
15279   //  2 - LE
15280   //  3 - UNORD
15281   //  4 - NEQ
15282   //  5 - NLT
15283   //  6 - NLE
15284   //  7 - ORD
15285   switch (SetCCOpcode) {
15286   default: llvm_unreachable("Unexpected SETCC condition");
15287   case ISD::SETOEQ:
15288   case ISD::SETEQ:  SSECC = 0; break;
15289   case ISD::SETOGT:
15290   case ISD::SETGT:  Swap = true; // Fallthrough
15291   case ISD::SETLT:
15292   case ISD::SETOLT: SSECC = 1; break;
15293   case ISD::SETOGE:
15294   case ISD::SETGE:  Swap = true; // Fallthrough
15295   case ISD::SETLE:
15296   case ISD::SETOLE: SSECC = 2; break;
15297   case ISD::SETUO:  SSECC = 3; break;
15298   case ISD::SETUNE:
15299   case ISD::SETNE:  SSECC = 4; break;
15300   case ISD::SETULE: Swap = true; // Fallthrough
15301   case ISD::SETUGE: SSECC = 5; break;
15302   case ISD::SETULT: Swap = true; // Fallthrough
15303   case ISD::SETUGT: SSECC = 6; break;
15304   case ISD::SETO:   SSECC = 7; break;
15305   case ISD::SETUEQ:
15306   case ISD::SETONE: SSECC = 8; break;
15307   }
15308   if (Swap)
15309     std::swap(Op0, Op1);
15310
15311   return SSECC;
15312 }
15313
15314 // Lower256IntVSETCC - Break a VSETCC 256-bit integer VSETCC into two new 128
15315 // ones, and then concatenate the result back.
15316 static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
15317   MVT VT = Op.getSimpleValueType();
15318
15319   assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
15320          "Unsupported value type for operation");
15321
15322   unsigned NumElems = VT.getVectorNumElements();
15323   SDLoc dl(Op);
15324   SDValue CC = Op.getOperand(2);
15325
15326   // Extract the LHS vectors
15327   SDValue LHS = Op.getOperand(0);
15328   SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
15329   SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
15330
15331   // Extract the RHS vectors
15332   SDValue RHS = Op.getOperand(1);
15333   SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl);
15334   SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl);
15335
15336   // Issue the operation on the smaller types and concatenate the result back
15337   MVT EltVT = VT.getVectorElementType();
15338   MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
15339   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
15340                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
15341                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
15342 }
15343
15344 static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG,
15345                                      const X86Subtarget *Subtarget) {
15346   SDValue Op0 = Op.getOperand(0);
15347   SDValue Op1 = Op.getOperand(1);
15348   SDValue CC = Op.getOperand(2);
15349   MVT VT = Op.getSimpleValueType();
15350   SDLoc dl(Op);
15351
15352   assert(Op0.getValueType().getVectorElementType().getSizeInBits() >= 8 &&
15353          Op.getValueType().getScalarType() == MVT::i1 &&
15354          "Cannot set masked compare for this operation");
15355
15356   ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
15357   unsigned  Opc = 0;
15358   bool Unsigned = false;
15359   bool Swap = false;
15360   unsigned SSECC;
15361   switch (SetCCOpcode) {
15362   default: llvm_unreachable("Unexpected SETCC condition");
15363   case ISD::SETNE:  SSECC = 4; break;
15364   case ISD::SETEQ:  Opc = X86ISD::PCMPEQM; break;
15365   case ISD::SETUGT: SSECC = 6; Unsigned = true; break;
15366   case ISD::SETLT:  Swap = true; //fall-through
15367   case ISD::SETGT:  Opc = X86ISD::PCMPGTM; break;
15368   case ISD::SETULT: SSECC = 1; Unsigned = true; break;
15369   case ISD::SETUGE: SSECC = 5; Unsigned = true; break; //NLT
15370   case ISD::SETGE:  Swap = true; SSECC = 2; break; // LE + swap
15371   case ISD::SETULE: Unsigned = true; //fall-through
15372   case ISD::SETLE:  SSECC = 2; break;
15373   }
15374
15375   if (Swap)
15376     std::swap(Op0, Op1);
15377   if (Opc)
15378     return DAG.getNode(Opc, dl, VT, Op0, Op1);
15379   Opc = Unsigned ? X86ISD::CMPMU: X86ISD::CMPM;
15380   return DAG.getNode(Opc, dl, VT, Op0, Op1,
15381                      DAG.getConstant(SSECC, MVT::i8));
15382 }
15383
15384 /// \brief Try to turn a VSETULT into a VSETULE by modifying its second
15385 /// operand \p Op1.  If non-trivial (for example because it's not constant)
15386 /// return an empty value.
15387 static SDValue ChangeVSETULTtoVSETULE(SDLoc dl, SDValue Op1, SelectionDAG &DAG)
15388 {
15389   BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1.getNode());
15390   if (!BV)
15391     return SDValue();
15392
15393   MVT VT = Op1.getSimpleValueType();
15394   MVT EVT = VT.getVectorElementType();
15395   unsigned n = VT.getVectorNumElements();
15396   SmallVector<SDValue, 8> ULTOp1;
15397
15398   for (unsigned i = 0; i < n; ++i) {
15399     ConstantSDNode *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
15400     if (!Elt || Elt->isOpaque() || Elt->getValueType(0) != EVT)
15401       return SDValue();
15402
15403     // Avoid underflow.
15404     APInt Val = Elt->getAPIntValue();
15405     if (Val == 0)
15406       return SDValue();
15407
15408     ULTOp1.push_back(DAG.getConstant(Val - 1, EVT));
15409   }
15410
15411   return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, ULTOp1);
15412 }
15413
15414 static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
15415                            SelectionDAG &DAG) {
        // Lower a vector SETCC: operand 0 and operand 1 are compared under the
        // condition code held in operand 2.
        //  - FP compares become CMPP (or CMPM for an i1 result with AVX-512)
        //    carrying the immediate computed by translateX86FSETCC.
        //  - 256-bit integer compares without hasInt256() are split in two.
        //  - With AVX-512, integer compares are either handed to
        //    LowerIntVSETCC_AVX512 or done in the operand type and truncated.
        //  - All other integer compares are built from PCMPEQ/PCMPGT (with
        //    operand swaps, sign-bit flips and result inversion as needed),
        //    from UMIN/UMAX, or from SUBUS.
15416   SDValue Op0 = Op.getOperand(0);
15417   SDValue Op1 = Op.getOperand(1);
15418   SDValue CC = Op.getOperand(2);
15419   MVT VT = Op.getSimpleValueType();
15420   ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
15421   bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
15422   SDLoc dl(Op);
15423
15424   if (isFP) {
15425 #ifndef NDEBUG
15426     MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
15427     assert(EltVT == MVT::f32 || EltVT == MVT::f64);
15428 #endif
15429
          // Op0/Op1 are passed by name and may come back swapped.
15430     unsigned SSECC = translateX86FSETCC(SetCCOpcode, Op0, Op1);
15431     unsigned Opc = X86ISD::CMPP;
15432     if (Subtarget->hasAVX512() && VT.getVectorElementType() == MVT::i1) {
15433       assert(VT.getVectorNumElements() <= 16);
15434       Opc = X86ISD::CMPM;
15435     }
15436     // In the two special cases we can't handle, emit two comparisons.
          // translateX86FSETCC reports SETUEQ and SETONE as 8.  The immediates
          // used below are the ones it returns for SETUO (3), SETO (7) and
          // SETNE (4); 0 is presumably the equality predicate.
15437     if (SSECC == 8) {
15438       unsigned CC0, CC1;
15439       unsigned CombineOpc;
15440       if (SetCCOpcode == ISD::SETUEQ) {
15441         CC0 = 3; CC1 = 0; CombineOpc = ISD::OR;
15442       } else {
15443         assert(SetCCOpcode == ISD::SETONE);
15444         CC0 = 7; CC1 = 4; CombineOpc = ISD::AND;
15445       }
15446
15447       SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
15448                                  DAG.getConstant(CC0, MVT::i8));
15449       SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
15450                                  DAG.getConstant(CC1, MVT::i8));
15451       return DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
15452     }
15453     // Handle all other FP comparisons here.
15454     return DAG.getNode(Opc, dl, VT, Op0, Op1,
15455                        DAG.getConstant(SSECC, MVT::i8));
15456   }
15457
15458   // Break 256-bit integer vector compare into smaller ones.
15459   if (VT.is256BitVector() && !Subtarget->hasInt256())
15460     return Lower256IntVSETCC(Op, DAG);
15461
        // MaskResult: the compare produces a vector of i1.
15462   bool MaskResult = (VT.getVectorElementType() == MVT::i1);
15463   EVT OpVT = Op1.getValueType();
15464   if (Subtarget->hasAVX512()) {
15465     if (Op1.getValueType().is512BitVector() ||
15466         (Subtarget->hasBWI() && Subtarget->hasVLX()) ||
15467         (MaskResult && OpVT.getVectorElementType().getSizeInBits() >= 32))
15468       return LowerIntVSETCC_AVX512(Op, DAG, Subtarget);
15469
15470     // In AVX-512 architecture setcc returns mask with i1 elements,
15471     // But there is no compare instruction for i8 and i16 elements in KNL.
15472     // We are not talking about 512-bit operands in this case, these
15473     // types are illegal.
          // So compare in the operand type and truncate the result to the mask.
15474     if (MaskResult &&
15475         (OpVT.getVectorElementType().getSizeInBits() < 32 &&
15476          OpVT.getVectorElementType().getSizeInBits() >= 8))
15477       return DAG.getNode(ISD::TRUNCATE, dl, VT,
15478                          DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC));
15479   }
15480
15481   // We are handling one of the integer comparisons here.  Since SSE only has
15482   // GT and EQ comparisons for integer, swapping operands and multiple
15483   // operations may be required for some comparisons.
15484   unsigned Opc;
15485   bool Swap = false, Invert = false, FlipSigns = false, MinMax = false;
15486   bool Subus = false;
15487
        // Each condition is expressed as PCMPEQ or PCMPGT plus the flags above.
        // Cases that only set a flag deliberately fall through to the next one.
15488   switch (SetCCOpcode) {
15489   default: llvm_unreachable("Unexpected SETCC condition");
15490   case ISD::SETNE:  Invert = true; // Fallthrough
15491   case ISD::SETEQ:  Opc = X86ISD::PCMPEQ; break;
15492   case ISD::SETLT:  Swap = true; // Fallthrough
15493   case ISD::SETGT:  Opc = X86ISD::PCMPGT; break;
15494   case ISD::SETGE:  Swap = true; // Fallthrough
15495   case ISD::SETLE:  Opc = X86ISD::PCMPGT;
15496                     Invert = true; break;
15497   case ISD::SETULT: Swap = true; // Fallthrough
15498   case ISD::SETUGT: Opc = X86ISD::PCMPGT;
15499                     FlipSigns = true; break;
15500   case ISD::SETUGE: Swap = true; // Fallthrough
15501   case ISD::SETULE: Opc = X86ISD::PCMPGT;
15502                     FlipSigns = true; Invert = true; break;
15503   }
15504
15505   // Special case: Use min/max operations for SETULE/SETUGE
15506   MVT VET = VT.getVectorElementType();
15507   bool hasMinMax =
15508        (Subtarget->hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32))
15509     || (Subtarget->hasSSE2()  && (VET == MVT::i8));
15510
15511   if (hasMinMax) {
15512     switch (SetCCOpcode) {
15513     default: break;
15514     case ISD::SETULE: Opc = X86ISD::UMIN; MinMax = true; break;
15515     case ISD::SETUGE: Opc = X86ISD::UMAX; MinMax = true; break;
15516     }
15517
          // The min/max form uses the operands as given; the equality test that
          // completes it is added at the end of the function.
15518     if (MinMax) { Swap = false; Invert = false; FlipSigns = false; }
15519   }
15520
15521   bool hasSubus = Subtarget->hasSSE2() && (VET == MVT::i8 || VET == MVT::i16);
15522   if (!MinMax && hasSubus) {
15523     // As another special case, use PSUBUS[BW] when it's profitable. E.g. for
15524     // Op0 u<= Op1:
15525     //   t = psubus Op0, Op1
15526     //   pcmpeq t, <0..0>
15527     switch (SetCCOpcode) {
15528     default: break;
15529     case ISD::SETULT: {
15530       // If the comparison is against a constant we can turn this into a
15531       // setule.  With psubus, setule does not require a swap.  This is
15532       // beneficial because the constant in the register is no longer
15533       // destructed as the destination so it can be hoisted out of a loop.
15534       // Only do this pre-AVX since vpcmp* is no longer destructive.
15535       if (Subtarget->hasAVX())
15536         break;
15537       SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG);
15538       if (ULEOp1.getNode()) {
15539         Op1 = ULEOp1;
15540         Subus = true; Invert = false; Swap = false;
15541       }
15542       break;
15543     }
15544     // Psubus is better than flip-sign because it requires no inversion.
15545     case ISD::SETUGE: Subus = true; Invert = false; Swap = true;  break;
15546     case ISD::SETULE: Subus = true; Invert = false; Swap = false; break;
15547     }
15548
15549     if (Subus) {
15550       Opc = X86ISD::SUBUS;
15551       FlipSigns = false;
15552     }
15553   }
15554
15555   if (Swap)
15556     std::swap(Op0, Op1);
15557
15558   // Check that the operation in question is available (most are plain SSE2,
15559   // but PCMPGTQ and PCMPEQQ have different requirements).
15560   if (VT == MVT::v2i64) {
15561     if (Opc == X86ISD::PCMPGT && !Subtarget->hasSSE42()) {
15562       assert(Subtarget->hasSSE2() && "Don't know how to lower!");
15563
15564       // First cast everything to the right type.
15565       Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0);
15566       Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1);
15567
15568       // Since SSE has no unsigned integer comparisons, we need to flip the sign
15569       // bits of the inputs before performing those operations. The lower
15570       // compare is always unsigned.
            // Hence: flip the sign bit of every 32-bit lane for an unsigned
            // compare, and only of lanes 0 and 2 otherwise.
15571       SDValue SB;
15572       if (FlipSigns) {
15573         SB = DAG.getConstant(0x80000000U, MVT::v4i32);
15574       } else {
15575         SDValue Sign = DAG.getConstant(0x80000000U, MVT::i32);
15576         SDValue Zero = DAG.getConstant(0x00000000U, MVT::i32);
15577         SB = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
15578                          Sign, Zero, Sign, Zero);
15579       }
15580       Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB);
15581       Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB);
15582
15583       // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
15584       SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
15585       SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
15586
15587       // Create masks for only the low parts/high parts of the 64 bit integers.
15588       static const int MaskHi[] = { 1, 1, 3, 3 };
15589       static const int MaskLo[] = { 0, 0, 2, 2 };
15590       SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
15591       SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
15592       SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
15593
15594       SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
15595       Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
15596
15597       if (Invert)
15598         Result = DAG.getNOT(dl, Result, MVT::v4i32);
15599
15600       return DAG.getNode(ISD::BITCAST, dl, VT, Result);
15601     }
15602
15603     if (Opc == X86ISD::PCMPEQ && !Subtarget->hasSSE41()) {
15604       // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
15605       // pcmpeqd + pshufd + pand.
15606       assert(Subtarget->hasSSE2() && !FlipSigns && "Don't know how to lower!");
15607
15608       // First cast everything to the right type.
15609       Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0);
15610       Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1);
15611
15612       // Do the compare.
15613       SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
15614
15615       // Make sure the lower and upper halves are both all-ones.
            // The shuffle swaps the two 32-bit lanes inside each 64-bit element.
15616       static const int Mask[] = { 1, 0, 3, 2 };
15617       SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
15618       Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
15619
15620       if (Invert)
15621         Result = DAG.getNOT(dl, Result, MVT::v4i32);
15622
15623       return DAG.getNode(ISD::BITCAST, dl, VT, Result);
15624     }
15625   }
15626
15627   // Since SSE has no unsigned integer comparisons, we need to flip the sign
15628   // bits of the inputs before performing those operations.
15629   if (FlipSigns) {
15630     EVT EltVT = VT.getVectorElementType();
15631     SDValue SB = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), VT);
15632     Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SB);
15633     Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SB);
15634   }
15635
15636   SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
15637
15638   // If the logical-not of the result is required, perform that now.
15639   if (Invert)
15640     Result = DAG.getNOT(dl, Result, VT);
15641
        // Min/max form: the condition holds in the lanes where the UMIN/UMAX
        // result equals Op0.
15642   if (MinMax)
15643     Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
15644
        // Subus form: the condition holds in the lanes where the SUBUS result
        // is zero.
15645   if (Subus)
15646     Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
15647                          getZeroVector(VT, Subtarget, DAG, dl));
15648
15649   return Result;
15650 }
15651
15652 SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
      // Lower a SETCC node.  Vector results are delegated to LowerVSETCC.  For
      // scalars the following are tried in order:
      //   1. (and ...) ==/!= 0 is turned into a bit test when LowerToBT succeeds;
      //   2. an existing X86ISD::SETCC compared ==/!= against 0 or 1 is reused
      //      as is or rebuilt with the opposite condition;
      //   3. an i1 value compared ==/!= against 1 is rewritten as the inverse
      //      comparison against 0;
      //   4. otherwise an X86ISD::SETCC is built on the flags from EmitCmp.
      // Newly built i8 results are truncated when the requested type is i1.
      // Returns an empty SDValue if TranslateX86CC reports X86::COND_INVALID.
15653
15654   MVT VT = Op.getSimpleValueType();
15655
15656   if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
15657
15658   assert(((!Subtarget->hasAVX512() && VT == MVT::i8) || (VT == MVT::i1))
15659          && "SetCC type must be 8-bit or 1-bit integer");
15660   SDValue Op0 = Op.getOperand(0);
15661   SDValue Op1 = Op.getOperand(1);
15662   SDLoc dl(Op);
15663   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
15664
15665   // Optimize to BT if possible.
15666   // Lower (X & (1 << N)) == 0 to BT(X, N).
15667   // Lower ((X >>u N) & 1) != 0 to BT(X, N).
15668   // Lower ((X >>s N) & 1) != 0 to BT(X, N).
15669   if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() &&
15670       Op1.getOpcode() == ISD::Constant &&
15671       cast<ConstantSDNode>(Op1)->isNullValue() &&
15672       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
15673     SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG);
          // An empty value means no bit-test pattern matched; keep going.
15674     if (NewSetCC.getNode()) {
15675       if (VT == MVT::i1)
15676         return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewSetCC);
15677       return NewSetCC;
15678     }
15679   }
15680
15681   // Look for X == 0, X == 1, X != 0, or X != 1.  We can simplify some forms of
15682   // these.
15683   if (Op1.getOpcode() == ISD::Constant &&
15684       (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 ||
15685        cast<ConstantSDNode>(Op1)->isNullValue()) &&
15686       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
15687
15688     // If the input is a setcc, then reuse the input setcc or use a new one with
15689     // the inverted condition.
15690     if (Op0.getOpcode() == X86ISD::SETCC) {
15691       X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
            // Inversion is needed for (setcc == 0) and (setcc != 1).
15692       bool Invert = (CC == ISD::SETNE) ^
15693         cast<ConstantSDNode>(Op1)->isNullValue();
15694       if (!Invert)
15695         return Op0;
15696
15697       CCode = X86::GetOppositeBranchCondition(CCode);
15698       SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
15699                                   DAG.getConstant(CCode, MVT::i8),
15700                                   Op0.getOperand(1));
15701       if (VT == MVT::i1)
15702         return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
15703       return SetCC;
15704     }
15705   }
        // An i1 value compared against 1: compare against 0 with the inverse
        // condition code instead.
15706   if ((Op0.getValueType() == MVT::i1) && (Op1.getOpcode() == ISD::Constant) &&
15707       (cast<ConstantSDNode>(Op1)->getZExtValue() == 1) &&
15708       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
15709
15710     ISD::CondCode NewCC = ISD::getSetCCInverse(CC, true);
15711     return DAG.getSetCC(dl, VT, Op0, DAG.getConstant(0, MVT::i1), NewCC);
15712   }
15713
        // General case.  Op0/Op1 are passed by name to TranslateX86CC.
15714   bool isFP = Op1.getSimpleValueType().isFloatingPoint();
15715   unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG);
15716   if (X86CC == X86::COND_INVALID)
15717     return SDValue();
15718
15719   SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG);
15720   EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
15721   SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
15722                               DAG.getConstant(X86CC, MVT::i8), EFLAGS);
15723   if (VT == MVT::i1)
15724     return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
15725   return SetCC;
15726 }
15727
15728 // isX86LogicalCmp - Return true if opcode is a X86 logical comparison.
15729 static bool isX86LogicalCmp(SDValue Op) {
15730   unsigned Opc = Op.getNode()->getOpcode();
15731   if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
15732       Opc == X86ISD::SAHF)
15733     return true;
15734   if (Op.getResNo() == 1 &&
15735       (Opc == X86ISD::ADD ||
15736        Opc == X86ISD::SUB ||
15737        Opc == X86ISD::ADC ||
15738        Opc == X86ISD::SBB ||
15739        Opc == X86ISD::SMUL ||
15740        Opc == X86ISD::UMUL ||
15741        Opc == X86ISD::INC ||
15742        Opc == X86ISD::DEC ||
15743        Opc == X86ISD::OR ||
15744        Opc == X86ISD::XOR ||
15745        Opc == X86ISD::AND))
15746     return true;
15747
15748   if (Op.getResNo() == 2 && Opc == X86ISD::UMUL)
15749     return true;
15750
15751   return false;
15752 }
15753
15754 static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
15755   // True if V truncates a value whose discarded high bits are known zero.
15756   if (V.getOpcode() != ISD::TRUNCATE)
15757     return false;
15758   SDValue Src = V.getOperand(0);
15759   const unsigned SrcBits = Src.getValueSizeInBits();
15760   const unsigned Dropped = SrcBits - V.getValueSizeInBits();
15761   return DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(SrcBits, Dropped));
15762 }
15763
15764 SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
        // Lower a SELECT node: operand 0 is the condition, operand 1 the value
        // chosen when it is true and operand 2 the value chosen otherwise.
        // The result is usually an X86ISD::CMOV; several special forms are
        // recognised first (FP selects via FSETCC, and selects between 0, -1 and
        // a value via SETCC_CARRY).  addTest stays true until a flag-producing
        // node has been found that can feed the CMOV directly.
15765   bool addTest = true;
15766   SDValue Cond  = Op.getOperand(0);
15767   SDValue Op1 = Op.getOperand(1);
15768   SDValue Op2 = Op.getOperand(2);
15769   SDLoc DL(Op);
15770   EVT VT = Op1.getValueType();
15771   SDValue CC;
15772
15773   // Lower fp selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
15774   // are available. Otherwise fp cmovs get lowered into a less efficient branch
15775   // sequence later on.
15776   if (Cond.getOpcode() == ISD::SETCC &&
15777       ((Subtarget->hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) ||
15778        (Subtarget->hasSSE1() && VT == MVT::f32)) &&
15779       VT == Cond.getOperand(0).getValueType() && Cond->hasOneUse()) {
15780     SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
15781     int SSECC = translateX86FSETCC(
15782         cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);
15783
          // 8 is what translateX86FSETCC returns for SETUEQ and SETONE; those
          // are not handled here.
15784     if (SSECC != 8) {
15785       if (Subtarget->hasAVX512()) {
15786         SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CondOp0, CondOp1,
15787                                   DAG.getConstant(SSECC, MVT::i8));
15788         return DAG.getNode(X86ISD::SELECT, DL, VT, Cmp, Op1, Op2);
15789       }
15790       SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
15791                                 DAG.getConstant(SSECC, MVT::i8));
15792       SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
15793       SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
15794       return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
15795     }
15796   }
15797
        // Lower the condition first so the X86ISD patterns below can match it.
        // LowerSETCC may return an empty value, in which case Cond is kept.
15798   if (Cond.getOpcode() == ISD::SETCC) {
15799     SDValue NewCond = LowerSETCC(Cond, DAG);
15800     if (NewCond.getNode())
15801       Cond = NewCond;
15802   }
15803
15804   // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
15805   // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
15806   // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
15807   // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
15808   if (Cond.getOpcode() == X86ISD::SETCC &&
15809       Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
15810       isZero(Cond.getOperand(1).getOperand(1))) {
15811     SDValue Cmp = Cond.getOperand(1);
15812
15813     unsigned CondCode =cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
15814
15815     if ((isAllOnes(Op1) || isAllOnes(Op2)) &&
15816         (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
            // Y is the select operand that is not the all-ones constant.
15817       SDValue Y = isAllOnes(Op2) ? Op1 : Op2;
15818
15819       SDValue CmpOp0 = Cmp.getOperand(0);
15820       // Apply further optimizations for special cases
15821       // (select (x != 0), -1, 0) -> neg & sbb
15822       // (select (x == 0), 0, -1) -> neg & sbb
15823       if (ConstantSDNode *YC = dyn_cast<ConstantSDNode>(Y))
15824         if (YC->isNullValue() &&
15825             (isAllOnes(Op1) == (CondCode == X86::COND_NE))) {
15826           SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
15827           SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs,
15828                                     DAG.getConstant(0, CmpOp0.getValueType()),
15829                                     CmpOp0);
15830           SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
15831                                     DAG.getConstant(X86::COND_B, MVT::i8),
15832                                     SDValue(Neg.getNode(), 1));
15833           return Res;
15834         }
15835
            // General form: compare x against 1 and materialise the outcome
            // with SETCC_CARRY on COND_B.
15836       Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
15837                         CmpOp0, DAG.getConstant(1, CmpOp0.getValueType()));
15838       Cmp = ConvertCmpIfNecessary(Cmp, DAG);
15839
15840       SDValue Res =   // Res = 0 or -1.
15841         DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
15842                     DAG.getConstant(X86::COND_B, MVT::i8), Cmp);
15843
15844       if (isAllOnes(Op1) != (CondCode == X86::COND_E))
15845         Res = DAG.getNOT(DL, Res, Res.getValueType());
15846
            // The OR with Y is skipped only when Op2 is the constant zero.
15847       ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2);
15848       if (!N2C || !N2C->isNullValue())
15849         Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
15850       return Res;
15851     }
15852   }
15853
15854   // Look past (and (setcc_carry (cmp ...)), 1).
15855   if (Cond.getOpcode() == ISD::AND &&
15856       Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
15857     ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
15858     if (C && C->getAPIntValue() == 1)
15859       Cond = Cond.getOperand(0);
15860   }
15861
15862   // If condition flag is set by a X86ISD::CMP, then use it as the condition
15863   // setting operand in place of the X86ISD::SETCC.
15864   unsigned CondOpcode = Cond.getOpcode();
15865   if (CondOpcode == X86ISD::SETCC ||
15866       CondOpcode == X86ISD::SETCC_CARRY) {
15867     CC = Cond.getOperand(0);
15868
15869     SDValue Cmp = Cond.getOperand(1);
15870     unsigned Opc = Cmp.getOpcode();
15871     MVT VT = Op.getSimpleValueType();
15872
          // For a scalar FP type that is not kept in an SSE register, the
          // condition code must be one that hasFPCMov() accepts.
15873     bool IllegalFPCMov = false;
15874     if (VT.isFloatingPoint() && !VT.isVector() &&
15875         !isScalarFPTypeInSSEReg(VT))  // FPStack?
15876       IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
15877
15878     if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
15879         Opc == X86ISD::BT) { // FIXME
15880       Cond = Cmp;
15881       addTest = false;
15882     }
15883   } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
15884              CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
15885              ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
15886               Cond.getOperand(0).getValueType() != MVT::i8)) {
          // The condition is an overflow-checking arithmetic node: emit the
          // matching X86 arithmetic node and select on the condition code
          // chosen in the switch below.
15887     SDValue LHS = Cond.getOperand(0);
15888     SDValue RHS = Cond.getOperand(1);
15889     unsigned X86Opcode;
15890     unsigned X86Cond;
15891     SDVTList VTs;
15892     switch (CondOpcode) {
15893     case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
15894     case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
15895     case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
15896     case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
15897     case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
15898     case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
15899     default: llvm_unreachable("unexpected overflowing operator");
15900     }
          // The UMUL node is created with two value results, so its i32 result
          // is at index 2 rather than 1.
15901     if (CondOpcode == ISD::UMULO)
15902       VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
15903                           MVT::i32);
15904     else
15905       VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
15906
15907     SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS);
15908
15909     if (CondOpcode == ISD::UMULO)
15910       Cond = X86Op.getValue(2);
15911     else
15912       Cond = X86Op.getValue(1);
15913
15914     CC = DAG.getConstant(X86Cond, MVT::i8);
15915     addTest = false;
15916   }
15917
15918   if (addTest) {
15919     // Look pass the truncate if the high bits are known zero.
15920     if (isTruncWithZeroHighBitsInput(Cond, DAG))
15921         Cond = Cond.getOperand(0);
15922
15923     // We know the result of AND is compared against zero. Try to match
15924     // it to BT.
15925     if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
15926       SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG);
15927       if (NewSetCC.getNode()) {
15928         CC = NewSetCC.getOperand(0);
15929         Cond = NewSetCC.getOperand(1);
15930         addTest = false;
15931       }
15932     }
15933   }
15934
        // Nothing above produced a usable condition: test Cond itself and
        // select on COND_NE.
15935   if (addTest) {
15936     CC = DAG.getConstant(X86::COND_NE, MVT::i8);
15937     Cond = EmitTest(Cond, X86::COND_NE, DL, DAG);
15938   }
15939
15940   // a <  b ? -1 :  0 -> RES = ~setcc_carry
15941   // a <  b ?  0 : -1 -> RES = setcc_carry
15942   // a >= b ? -1 :  0 -> RES = setcc_carry
15943   // a >= b ?  0 : -1 -> RES = ~setcc_carry
15944   if (Cond.getOpcode() == X86ISD::SUB) {
15945     Cond = ConvertCmpIfNecessary(Cond, DAG);
15946     unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
15947
15948     if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
15949         (isAllOnes(Op1) || isAllOnes(Op2)) && (isZero(Op1) || isZero(Op2))) {
15950       SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
15951                                 DAG.getConstant(X86::COND_B, MVT::i8), Cond);
15952       if (isAllOnes(Op1) != (CondCode == X86::COND_B))
15953         return DAG.getNOT(DL, Res, Res.getValueType());
15954       return Res;
15955     }
15956   }
15957
15958   // X86 doesn't have an i8 cmov. If both operands are the result of a truncate
15959   // widen the cmov and push the truncate through. This avoids introducing a new
15960   // branch during isel and doesn't add any extensions.
15961   if (Op.getValueType() == MVT::i8 &&
15962       Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
15963     SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
15964     if (T1.getValueType() == T2.getValueType() &&
15965         // Blacklist CopyFromReg to avoid partial register stalls.
15966         T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
15967       SDVTList VTs = DAG.getVTList(T1.getValueType(), MVT::Glue);
15968       SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VTs, T2, T1, CC, Cond);
15969       return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
15970     }
15971   }
15972
15973   // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
15974   // condition is true.
        // Hence the false value (Op2) is passed first and the true value (Op1)
        // second.
15975   SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
15976   SDValue Ops[] = { Op2, Op1, CC, Cond };
15977   return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops);
15978 }
15979
15980 static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, const X86Subtarget *Subtarget,
15981                                        SelectionDAG &DAG) {
15982   MVT VT = Op->getSimpleValueType(0);
15983   SDValue In = Op->getOperand(0);
15984   MVT InVT = In.getSimpleValueType();
15985   MVT VTElt = VT.getVectorElementType();
15986   MVT InVTElt = InVT.getVectorElementType();
15987   SDLoc dl(Op);
15988
15989   // SKX processor
15990   if ((InVTElt == MVT::i1) &&
15991       (((Subtarget->hasBWI() && Subtarget->hasVLX() &&
15992         VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() <= 16)) ||
15993
15994        ((Subtarget->hasBWI() && VT.is512BitVector() &&
15995         VTElt.getSizeInBits() <= 16)) ||
15996
15997        ((Subtarget->hasDQI() && Subtarget->hasVLX() &&
15998         VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() >= 32)) ||
15999
16000        ((Subtarget->hasDQI() && VT.is512BitVector() &&
16001         VTElt.getSizeInBits() >= 32))))
16002     return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
16003
16004   unsigned int NumElts = VT.getVectorNumElements();
16005
16006   if (NumElts != 8 && NumElts != 16)
16007     return SDValue();
16008
16009   if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1) {
16010     if (In.getOpcode() == X86ISD::VSEXT || In.getOpcode() == X86ISD::VZEXT)
16011       return DAG.getNode(In.getOpcode(), dl, VT, In.getOperand(0));
16012     return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
16013   }
16014
16015   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16016   assert (InVT.getVectorElementType() == MVT::i1 && "Unexpected vector type");
16017
16018   MVT ExtVT = (NumElts == 8) ? MVT::v8i64 : MVT::v16i32;
16019   Constant *C = ConstantInt::get(*DAG.getContext(),
16020     APInt::getAllOnesValue(ExtVT.getScalarType().getSizeInBits()));
16021
16022   SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
16023   unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
16024   SDValue Ld = DAG.getLoad(ExtVT.getScalarType(), dl, DAG.getEntryNode(), CP,
16025                           MachinePointerInfo::getConstantPool(),
16026                           false, false, false, Alignment);
16027   SDValue Brcst = DAG.getNode(X86ISD::VBROADCASTM, dl, ExtVT, In, Ld);
16028   if (VT.is512BitVector())
16029     return Brcst;
16030   return DAG.getNode(X86ISD::VTRUNC, dl, VT, Brcst);
16031 }
16032
16033 static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
16034                                 SelectionDAG &DAG) {
16035   MVT VT = Op->getSimpleValueType(0);
16036   SDValue In = Op->getOperand(0);
16037   MVT InVT = In.getSimpleValueType();
16038   SDLoc dl(Op);
16039
16040   if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
16041     return LowerSIGN_EXTEND_AVX512(Op, Subtarget, DAG);
16042
16043   if ((VT != MVT::v4i64 || InVT != MVT::v4i32) &&
16044       (VT != MVT::v8i32 || InVT != MVT::v8i16) &&
16045       (VT != MVT::v16i16 || InVT != MVT::v16i8))
16046     return SDValue();
16047
16048   if (Subtarget->hasInt256())
16049     return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
16050
16051   // Optimize vectors in AVX mode
16052   // Sign extend  v8i16 to v8i32 and
16053   //              v4i32 to v4i64
16054   //
16055   // Divide input vector into two parts
16056   // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1}
16057   // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
16058   // concat the vectors to original VT
16059
16060   unsigned NumElems = InVT.getVectorNumElements();
16061   SDValue Undef = DAG.getUNDEF(InVT);
16062
16063   SmallVector<int,8> ShufMask1(NumElems, -1);
16064   for (unsigned i = 0; i != NumElems/2; ++i)
16065     ShufMask1[i] = i;
16066
16067   SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask1[0]);
16068
16069   SmallVector<int,8> ShufMask2(NumElems, -1);
16070   for (unsigned i = 0; i != NumElems/2; ++i)
16071     ShufMask2[i] = i + NumElems/2;
16072
16073   SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask2[0]);
16074
16075   MVT HalfVT = MVT::getVectorVT(VT.getScalarType(),
16076                                 VT.getVectorNumElements()/2);
16077
16078   OpLo = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpLo);
16079   OpHi = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpHi);
16080
16081   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
16082 }
16083
16084 // Lower vector extended loads using a shuffle. If SSSE3 is not available we
16085 // may emit an illegal shuffle but the expansion is still better than scalar
16086 // code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise
16087 // we'll emit a shuffle and an arithmetic shift.
16088 // TODO: It is possible to support ZExt by zeroing the undef values during
16089 // the shuffle phase or after the shuffle.
      //
      // Op must be an integer-vector extending load (EXTLOAD or SEXTLOAD) whose
      // register type is wider than its memory type; these preconditions are
      // asserted below. The returned value is the extended vector. On every
      // return path the users of the original load's chain result are redirected
      // to the chain of the newly created load(s).
16090 static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget,
16091                                  SelectionDAG &DAG) {
16092   MVT RegVT = Op.getSimpleValueType();
16093   assert(RegVT.isVector() && "We only custom lower vector sext loads.");
16094   assert(RegVT.isInteger() &&
16095          "We only custom lower integer vector sext loads.");
16096
16097   // Nothing useful we can do without SSE2 shuffles.
16098   assert(Subtarget->hasSSE2() && "We only custom lower sext loads with SSE2.");
16099
16100   LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
16101   SDLoc dl(Ld);
16102   EVT MemVT = Ld->getMemoryVT();
16103   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16104   unsigned RegSz = RegVT.getSizeInBits();
16105
16106   ISD::LoadExtType Ext = Ld->getExtensionType();
16107
16108   assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)
16109          && "Only anyext and sext are currently implemented.");
16110   assert(MemVT != RegVT && "Cannot extend to the same type");
16111   assert(MemVT.isVector() && "Must load a vector from memory");
16112
16113   unsigned NumElems = RegVT.getVectorNumElements();
16114   unsigned MemSz = MemVT.getSizeInBits();
16115   assert(RegSz > MemSz && "Register size must be greater than the mem size");
16116
16117   if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget->hasInt256()) {
16118     // The only way in which we have a legal 256-bit vector result but not the
16119     // integer 256-bit operations needed to directly lower a sextload is if we
16120     // have AVX1 but not AVX2. In that case, we can always emit a sextload to
16121     // a 128-bit vector and a normal sign_extend to 256-bits that should get
16122     // correctly legalized. We do this late to allow the canonical form of
16123     // sextload to persist throughout the rest of the DAG combiner -- it wants
16124     // to fold together any extensions it can, and so will fuse a sign_extend
16125     // of an sextload into a sextload targeting a wider value.
16126     SDValue Load;
16127     if (MemSz == 128) {
16128       // Just switch this to a normal load.
16129       assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, "
16130                                        "it must be a legal 128-bit vector "
16131                                        "type!");
16132       Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(),
16133                   Ld->getPointerInfo(), Ld->isVolatile(), Ld->isNonTemporal(),
16134                   Ld->isInvariant(), Ld->getAlignment());
16135     } else {
16136       assert(MemSz < 128 &&
16137              "Can't extend a type wider than 128 bits to a 256 bit vector!");
16138       // Do an sext load to a 128-bit vector type. We want to use the same
16139       // number of elements, but elements half as wide. This will end up being
16140       // recursively lowered by this routine, but will succeed as we definitely
16141       // have all the necessary features if we're using AVX1.
16142       EVT HalfEltVT =
16143           EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2);
16144       EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems);
16145       Load =
16146           DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(),
16147                          Ld->getPointerInfo(), MemVT, Ld->isVolatile(),
16148                          Ld->isNonTemporal(), Ld->isInvariant(),
16149                          Ld->getAlignment());
16150     }
16151
16152     // Replace chain users with the new chain.
16153     assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
16154     DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
16155
16156     // Finally, do a normal sign-extend to the desired register.
16157     return DAG.getSExtOrTrunc(Load, dl, RegVT);
16158   }
16159
16160   // All sizes must be a power of two.
16161   assert(isPowerOf2_32(RegSz * MemSz * NumElems) &&
16162          "Non-power-of-two elements are not custom lowered!");
16163
16164   // Attempt to load the original value using scalar loads.
16165   // Find the largest scalar type that divides the total loaded size.
        // The loop keeps the last legal integer type that divides MemSz, so it
        // relies on integer_valuetypes() enumerating types by increasing size
        // (presumably the case -- confirm against MVT).
16166   MVT SclrLoadTy = MVT::i8;
16167   for (MVT Tp : MVT::integer_valuetypes()) {
16168     if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
16169       SclrLoadTy = Tp;
16170     }
16171   }
16172
16173   // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
16174   if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
16175       (64 <= MemSz))
16176     SclrLoadTy = MVT::f64;
16177
16178   // Calculate the number of scalar loads that we need to perform
16179   // in order to load our vector from memory.
16180   unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
16181
16182   assert((Ext != ISD::SEXTLOAD || NumLoads == 1) &&
16183          "Can only lower sext loads with a single scalar load!");
16184
        // Size in bits of the vector the scalar loads are assembled into. A 256-bit
        // sext load that reaches this point (the !hasInt256 case returned above)
        // uses a vector half as wide as the result.
16185   unsigned loadRegZize = RegSz;
16186   if (Ext == ISD::SEXTLOAD && RegSz == 256)
16187     loadRegZize /= 2;
16188
16189   // Represent our vector as a sequence of elements which are the
16190   // largest scalar that we can load.
16191   EVT LoadUnitVecVT = EVT::getVectorVT(
16192       *DAG.getContext(), SclrLoadTy, loadRegZize / SclrLoadTy.getSizeInBits());
16193
16194   // Represent the data using the same element type that is stored in
16195   // memory. In practice, we ''widen'' MemVT.
16196   EVT WideVecVT =
16197       EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
16198                        loadRegZize / MemVT.getScalarType().getSizeInBits());
16199
16200   assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
16201          "Invalid vector type");
16202
16203   // We can't shuffle using an illegal type.
16204   assert(TLI.isTypeLegal(WideVecVT) &&
16205          "We only lower types that form legal widened vector types");
16206
16207   SmallVector<SDValue, 8> Chains;
16208   SDValue Ptr = Ld->getBasePtr();
16209   SDValue Increment =
16210       DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, TLI.getPointerTy());
16211   SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
16212
        // NOTE(review): every scalar load below is given the original load's
        // pointer info and alignment unchanged, even though Ptr advances by
        // Increment on each iteration -- confirm this is intended when
        // NumLoads > 1.
16213   for (unsigned i = 0; i < NumLoads; ++i) {
16214     // Perform a single load.
16215     SDValue ScalarLoad =
16216         DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
16217                     Ld->isVolatile(), Ld->isNonTemporal(), Ld->isInvariant(),
16218                     Ld->getAlignment());
16219     Chains.push_back(ScalarLoad.getValue(1));
16220     // Create the first element type using SCALAR_TO_VECTOR in order to avoid
16221     // another round of DAGCombining.
16222     if (i == 0)
16223       Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
16224     else
16225       Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
16226                         ScalarLoad, DAG.getIntPtrConstant(i));
16227
16228     Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
16229   }
16230
        // Join the chains of all scalar loads; TF replaces the original load's
        // chain result on each of the return paths below.
16231   SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
16232
16233   // Bitcast the loaded value to a vector of the original element type, in
16234   // the size of the target vector type.
16235   SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Res);
16236   unsigned SizeRatio = RegSz / MemSz;
16237
16238   if (Ext == ISD::SEXTLOAD) {
16239     // If we have SSE4.1, we can directly emit a VSEXT node.
16240     if (Subtarget->hasSSE41()) {
16241       SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec);
16242       DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
16243       return Sext;
16244     }
16245
16246     // Otherwise we'll shuffle the small elements in the high bits of the
16247     // larger type and perform an arithmetic shift. If the shift is not legal
16248     // it's better to scalarize.
16249     assert(TLI.isOperationLegalOrCustom(ISD::SRA, RegVT) &&
16250            "We can't implement a sext load without an arithmetic right shift!");
16251
16252     // Redistribute the loaded elements into the different locations.
          // Element i goes to the last of the SizeRatio narrow slots that make up
          // wide element i; every other slot stays undefined (-1).
16253     SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
16254     for (unsigned i = 0; i != NumElems; ++i)
16255       ShuffleVec[i * SizeRatio + SizeRatio - 1] = i;
16256
16257     SDValue Shuff = DAG.getVectorShuffle(
16258         WideVecVT, dl, SlicedVec, DAG.getUNDEF(WideVecVT), &ShuffleVec[0]);
16259
16260     Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
16261
16262     // Build the arithmetic shift.
          // The shift amount is the difference between the register and the memory
          // element widths.
16263     unsigned Amt = RegVT.getVectorElementType().getSizeInBits() -
16264                    MemVT.getVectorElementType().getSizeInBits();
16265     Shuff =
16266         DAG.getNode(ISD::SRA, dl, RegVT, Shuff, DAG.getConstant(Amt, RegVT));
16267
16268     DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
16269     return Shuff;
16270   }
16271
16272   // Redistribute the loaded elements into the different locations.
        // Any-extend case: element i goes to the first of the SizeRatio narrow
        // slots of wide element i; the remaining slots stay undefined (-1).
16273   SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
16274   for (unsigned i = 0; i != NumElems; ++i)
16275     ShuffleVec[i * SizeRatio] = i;
16276
16277   SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
16278                                        DAG.getUNDEF(WideVecVT), &ShuffleVec[0]);
16279
16280   // Bitcast to the requested type.
16281   Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
16282   DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
16283   return Shuff;
16284 }
16285
16286 // isAndOrOfSetCCs - Store Op's opcode in Opc and return true when Op is an
16287 // ISD::AND or ISD::OR whose two operands are both X86ISD::SETCC nodes with a
16288 // single use each. Opc is written even when the result is false.
16289 static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
16290   auto IsOneUseSetCC = [](SDValue V) {
16291     return V.getOpcode() == X86ISD::SETCC && V.hasOneUse();
16292   };
16293   Opc = Op.getOpcode();
16294   if (Opc != ISD::AND && Opc != ISD::OR)
16295     return false;
16296   return IsOneUseSetCC(Op.getOperand(0)) && IsOneUseSetCC(Op.getOperand(1));
16297 }
16298
16299 // isXor1OfSetCC - Return true if Op is an ISD::XOR whose second operand is
16300 // the constant 1 and whose first operand is a single-use X86ISD::SETCC.
16301 static bool isXor1OfSetCC(SDValue Op) {
16302   if (Op.getOpcode() != ISD::XOR)
16303     return false;
16304   // The constant is only looked for in operand 1.
16305   ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(Op.getOperand(1));
16306   if (!RHSC || RHSC->getAPIntValue() != 1)
16307     return false;
16308   SDValue LHS = Op.getOperand(0);
16309   return LHS.getOpcode() == X86ISD::SETCC && LHS.hasOneUse();
16310 }
16311
16312 SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
        // Lower ISD::BRCOND (operands: chain, condition, destination) to
        // X86ISD::BRCOND(chain, dest, cc, cond). The code below looks for a node
        // that already sets the condition so it can be branched on directly; when
        // none is found (addTest is still true at the end) an explicit test of the
        // condition is emitted. Some floating-point patterns are lowered to two
        // conditional branches, which rewrites the unconditional ISD::BR that uses
        // this node in place.
16313   bool addTest = true;
16314   SDValue Chain = Op.getOperand(0);
16315   SDValue Cond  = Op.getOperand(1);
16316   SDValue Dest  = Op.getOperand(2);
16317   SDLoc dl(Op);
16318   SDValue CC;
        // Set when the condition has the form "overflow result == 0"; the X86
        // condition code chosen later is then reversed.
16319   bool Inverted = false;
16320
16321   if (Cond.getOpcode() == ISD::SETCC) {
16322     // Check for setcc([su]{add,sub,mul}o == 0).
16323     if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
16324         isa<ConstantSDNode>(Cond.getOperand(1)) &&
16325         cast<ConstantSDNode>(Cond.getOperand(1))->isNullValue() &&
16326         Cond.getOperand(0).getResNo() == 1 &&
16327         (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
16328          Cond.getOperand(0).getOpcode() == ISD::UADDO ||
16329          Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
16330          Cond.getOperand(0).getOpcode() == ISD::USUBO ||
16331          Cond.getOperand(0).getOpcode() == ISD::SMULO ||
16332          Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
16333       Inverted = true;
16334       Cond = Cond.getOperand(0);
16335     } else {
            // Any other setcc goes through LowerSETCC; the result is only used if
            // that lowering produced a node.
16336       SDValue NewCond = LowerSETCC(Cond, DAG);
16337       if (NewCond.getNode())
16338         Cond = NewCond;
16339     }
16340   }
16341 #if 0
16342   // FIXME: LowerXALUO doesn't handle these!!
16343   else if (Cond.getOpcode() == X86ISD::ADD  ||
16344            Cond.getOpcode() == X86ISD::SUB  ||
16345            Cond.getOpcode() == X86ISD::SMUL ||
16346            Cond.getOpcode() == X86ISD::UMUL)
16347     Cond = LowerXALUO(Cond, DAG);
16348 #endif
16349
16350   // Look past (and (setcc_carry (cmp ...)), 1).
16351   if (Cond.getOpcode() == ISD::AND &&
16352       Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
16353     ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
16354     if (C && C->getAPIntValue() == 1)
16355       Cond = Cond.getOperand(0);
16356   }
16357
16358   // If condition flag is set by a X86ISD::CMP, then use it as the condition
16359   // setting operand in place of the X86ISD::SETCC.
16360   unsigned CondOpcode = Cond.getOpcode();
16361   if (CondOpcode == X86ISD::SETCC ||
16362       CondOpcode == X86ISD::SETCC_CARRY) {
          // Operand 0 of the SETCC is the condition code, operand 1 the node that
          // sets the condition.
16363     CC = Cond.getOperand(0);
16364
16365     SDValue Cmp = Cond.getOperand(1);
16366     unsigned Opc = Cmp.getOpcode();
16367     // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
16368     if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
16369       Cond = Cmp;
16370       addTest = false;
16371     } else {
16372       switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
16373       default: break;
16374       case X86::COND_O:
16375       case X86::COND_B:
16376         // These can only come from an arithmetic instruction with overflow,
16377         // e.g. SADDO, UADDO.
16378         Cond = Cond.getNode()->getOperand(1);
16379         addTest = false;
16380         break;
16381       }
16382     }
16383   }
        // Cond may have been replaced above, so look at its opcode again.
16384   CondOpcode = Cond.getOpcode();
        // Arithmetic-with-overflow condition: emit the matching X86 arithmetic
        // node and branch on its last result. Multiplies on i8 are excluded here.
16385   if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
16386       CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
16387       ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
16388        Cond.getOperand(0).getValueType() != MVT::i8)) {
16389     SDValue LHS = Cond.getOperand(0);
16390     SDValue RHS = Cond.getOperand(1);
16391     unsigned X86Opcode;
16392     unsigned X86Cond;
16393     SDVTList VTs;
16394     // Keep this in sync with LowerXALUO, otherwise we might create redundant
16395     // instructions that can't be removed afterwards (i.e. X86ISD::ADD and
16396     // X86ISD::INC).
16397     switch (CondOpcode) {
16398     case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
16399     case ISD::SADDO:
16400       if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
16401         if (C->isOne()) {
16402           X86Opcode = X86ISD::INC; X86Cond = X86::COND_O;
16403           break;
16404         }
16405       X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
16406     case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
16407     case ISD::SSUBO:
16408       if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
16409         if (C->isOne()) {
16410           X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O;
16411           break;
16412         }
16413       X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
16414     case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
16415     case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
16416     default: llvm_unreachable("unexpected overflowing operator");
16417     }
16418     if (Inverted)
16419       X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond);
          // UMUL is given two results of the operand type followed by the i32
          // result; all other opcodes get one result of the operand type followed
          // by the i32 result.
16420     if (CondOpcode == ISD::UMULO)
16421       VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
16422                           MVT::i32);
16423     else
16424       VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
16425
16426     SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS);
16427
          // Branch on the i32 result, which is the last value of the node.
16428     if (CondOpcode == ISD::UMULO)
16429       Cond = X86Op.getValue(2);
16430     else
16431       Cond = X86Op.getValue(1);
16432
16433     CC = DAG.getConstant(X86Cond, MVT::i8);
16434     addTest = false;
16435   } else {
16436     unsigned CondOpc;
16437     if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
            // Cmp is the condition-setting operand of the first SETCC; the two
            // patterns below additionally require the second SETCC to use it too.
16438       SDValue Cmp = Cond.getOperand(0).getOperand(1);
16439       if (CondOpc == ISD::OR) {
16440         // Also, recognize the pattern generated by an FCMP_UNE. We can emit
16441         // two branches instead of an explicit OR instruction with a
16442         // separate test.
16443         if (Cmp == Cond.getOperand(1).getOperand(1) &&
16444             isX86LogicalCmp(Cmp)) {
                // First branch: condition code of the first SETCC. The branch
                // returned at the end of the function uses the second one.
16445           CC = Cond.getOperand(0).getOperand(0);
16446           Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16447                               Chain, Dest, CC, Cmp);
16448           CC = Cond.getOperand(1).getOperand(0);
16449           Cond = Cmp;
16450           addTest = false;
16451         }
16452       } else { // ISD::AND
16453         // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
16454         // two branches instead of an explicit AND instruction with a
16455         // separate test. However, we only do this if this block doesn't
16456         // have a fall-through edge, because this requires an explicit
16457         // jmp when the condition is false.
16458         if (Cmp == Cond.getOperand(1).getOperand(1) &&
16459             isX86LogicalCmp(Cmp) &&
16460             Op.getNode()->hasOneUse()) {
16461           X86::CondCode CCode =
16462             (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
16463           CCode = X86::GetOppositeBranchCondition(CCode);
16464           CC = DAG.getConstant(CCode, MVT::i8);
16465           SDNode *User = *Op.getNode()->use_begin();
16466           // Look for an unconditional branch following this conditional branch.
16467           // We need this because we need to reverse the successors in order
16468           // to implement FCMP_OEQ.
16469           if (User->getOpcode() == ISD::BR) {
                  // Swap the successors: the unconditional branch now targets the
                  // original destination and both conditional branches, each with
                  // a reversed condition, target the old false block.
16470             SDValue FalseBB = User->getOperand(1);
16471             SDNode *NewBR =
16472               DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
16473             assert(NewBR == User);
16474             (void)NewBR;
16475             Dest = FalseBB;
16476
16477             Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16478                                 Chain, Dest, CC, Cmp);
16479             X86::CondCode CCode =
16480               (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
16481             CCode = X86::GetOppositeBranchCondition(CCode);
16482             CC = DAG.getConstant(CCode, MVT::i8);
16483             Cond = Cmp;
16484             addTest = false;
16485           }
16486         }
16487       }
16488     } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
16489       // Recognize xorb (setcc), 1 patterns. The xor inverts the condition.
16490       // It should be transformed by the dag combiner except when the condition
16491       // is set by an arithmetic-with-overflow node.
16492       X86::CondCode CCode =
16493         (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
16494       CCode = X86::GetOppositeBranchCondition(CCode);
16495       CC = DAG.getConstant(CCode, MVT::i8);
16496       Cond = Cond.getOperand(0).getOperand(1);
16497       addTest = false;
16498     } else if (Cond.getOpcode() == ISD::SETCC &&
16499                cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
16500       // For FCMP_OEQ, we can emit
16501       // two branches instead of an explicit AND instruction with a
16502       // separate test. However, we only do this if this block doesn't
16503       // have a fall-through edge, because this requires an explicit
16504       // jmp when the condition is false.
16505       if (Op.getNode()->hasOneUse()) {
16506         SDNode *User = *Op.getNode()->use_begin();
16507         // Look for an unconditional branch following this conditional branch.
16508         // We need this because we need to reverse the successors in order
16509         // to implement FCMP_OEQ.
16510         if (User->getOpcode() == ISD::BR) {
16511           SDValue FalseBB = User->getOperand(1);
16512           SDNode *NewBR =
16513             DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
16514           assert(NewBR == User);
16515           (void)NewBR;
16516           Dest = FalseBB;
16517
                // Branch to the old false block on COND_NE here and on COND_P in
                // the branch returned at the end of the function.
16518           SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
16519                                     Cond.getOperand(0), Cond.getOperand(1));
16520           Cmp = ConvertCmpIfNecessary(Cmp, DAG);
16521           CC = DAG.getConstant(X86::COND_NE, MVT::i8);
16522           Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16523                               Chain, Dest, CC, Cmp);
16524           CC = DAG.getConstant(X86::COND_P, MVT::i8);
16525           Cond = Cmp;
16526           addTest = false;
16527         }
16528       }
16529     } else if (Cond.getOpcode() == ISD::SETCC &&
16530                cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
16531       // For FCMP_UNE, we can emit
16532       // two branches instead of an explicit OR instruction with a
16533       // separate test. However, we only do this if this block doesn't
16534       // have a fall-through edge, because this requires an explicit
16535       // jmp when the condition is false.
16536       if (Op.getNode()->hasOneUse()) {
16537         SDNode *User = *Op.getNode()->use_begin();
16538         // Look for an unconditional branch following this conditional branch.
16539         // We need this because we need to reverse the successors in order
16540         // to implement FCMP_UNE.
16541         if (User->getOpcode() == ISD::BR) {
16542           SDValue FalseBB = User->getOperand(1);
16543           SDNode *NewBR =
16544             DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
16545           assert(NewBR == User);
16546           (void)NewBR;
16547
                // The COND_NE branch still targets the original destination; Dest
                // is switched to the old false block only afterwards, so the
                // COND_NP branch returned at the end of the function goes there.
16548           SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
16549                                     Cond.getOperand(0), Cond.getOperand(1));
16550           Cmp = ConvertCmpIfNecessary(Cmp, DAG);
16551           CC = DAG.getConstant(X86::COND_NE, MVT::i8);
16552           Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16553                               Chain, Dest, CC, Cmp);
16554           CC = DAG.getConstant(X86::COND_NP, MVT::i8);
16555           Cond = Cmp;
16556           addTest = false;
16557           Dest = FalseBB;
16558         }
16559       }
16560     }
16561   }
16562
16563   if (addTest) {
16564     // Look past the truncate if the high bits are known zero.
16565     if (isTruncWithZeroHighBitsInput(Cond, DAG))
16566         Cond = Cond.getOperand(0);
16567
16568     // We know the result of AND is compared against zero. Try to match
16569     // it to BT.
16570     if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
16571       SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG);
16572       if (NewSetCC.getNode()) {
16573         CC = NewSetCC.getOperand(0);
16574         Cond = NewSetCC.getOperand(1);
16575         addTest = false;
16576       }
16577     }
16578   }
16579
        // Nothing above produced a usable condition: test Cond itself and branch
        // on "not equal" (or on "equal" when the condition was inverted).
16580   if (addTest) {
16581     X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
16582     CC = DAG.getConstant(X86Cond, MVT::i8);
16583     Cond = EmitTest(Cond, X86Cond, dl, DAG);
16584   }
16585   Cond = ConvertCmpIfNecessary(Cond, DAG);
16586   return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16587                      Chain, Dest, CC, Cond);
16588 }
16589
16590 // Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
16591 // Calls to _alloca are needed to probe the stack when allocating more than 4k
16592 // bytes in one go. Touching the stack at 4K increments is necessary to ensure
16593 // that the guard pages used by the OS virtual memory manager are allocated in
16594 // correct sequence.
      //
      // Three strategies are implemented below:
      //  - target is neither a non-MachO Windows target nor using split stacks:
      //    adjust the stack pointer inline between CALLSEQ_START / CALLSEQ_END;
      //  - split stacks: emit X86ISD::SEG_ALLOCA;
      //  - otherwise: emit X86ISD::WIN_ALLOCA and read the stack pointer back.
      // In every case the returned node merges the resulting pointer value with
      // the output chain.
16595 SDValue
16596 X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
16597                                            SelectionDAG &DAG) const {
16598   MachineFunction &MF = DAG.getMachineFunction();
16599   bool SplitStack = MF.shouldSplitStack();
16600   bool Lower = (Subtarget->isOSWindows() && !Subtarget->isTargetMachO()) ||
16601                SplitStack;
16602   SDLoc dl(Op);
16603
16604   if (!Lower) {
16605     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16606     SDNode* Node = Op.getNode();
16607
16608     unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
16609     assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
16610         " not tell us which reg is the stack pointer!");
16611     EVT VT = Node->getValueType(0);
          // Tmp1 and Tmp2 start out as the node's two results and are reused below
          // for the new value and the new chain; Tmp3 is operand 2, the requested
          // alignment.
16612     SDValue Tmp1 = SDValue(Node, 0);
16613     SDValue Tmp2 = SDValue(Node, 1);
16614     SDValue Tmp3 = Node->getOperand(2);
16615     SDValue Chain = Tmp1.getOperand(0);
16616
16617     // Chain the dynamic stack allocation so that it doesn't modify the stack
16618     // pointer when other instructions are using the stack.
16619     Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, true),
16620         SDLoc(Node));
16621
          // Operand 1 of the node is the allocation size.
16622     SDValue Size = Tmp2.getOperand(1);
16623     SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
16624     Chain = SP.getValue(1);
16625     unsigned Align = cast<ConstantSDNode>(Tmp3)->getZExtValue();
16626     const TargetFrameLowering &TFI = *DAG.getSubtarget().getFrameLowering();
16627     unsigned StackAlign = TFI.getStackAlignment();
16628     Tmp1 = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
          // Only mask the new stack pointer when more than the stack alignment is
          // requested. The mask -Align clears the low bits (this presumes Align is
          // a power of two).
16629     if (Align > StackAlign)
16630       Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1,
16631           DAG.getConstant(-(uint64_t)Align, VT));
16632     Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain
16633
16634     Tmp2 = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, true),
16635         DAG.getIntPtrConstant(0, true), SDValue(),
16636         SDLoc(Node));
16637
16638     SDValue Ops[2] = { Tmp1, Tmp2 };
16639     return DAG.getMergeValues(Ops, dl);
16640   }
16641
16642   // Get the inputs.
16643   SDValue Chain = Op.getOperand(0);
16644   SDValue Size  = Op.getOperand(1);
16645   unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
16646   EVT VT = Op.getNode()->getValueType(0);
16647
16648   bool Is64Bit = Subtarget->is64Bit();
16649   EVT SPTy = getPointerTy();
16650
16651   if (SplitStack) {
16652     MachineRegisterInfo &MRI = MF.getRegInfo();
16653
16654     if (Is64Bit) {
16655       // The 64 bit implementation of segmented stacks needs to clobber both r10
16656       // r11. This makes it impossible to use it along with nested parameters.
16657       const Function *F = MF.getFunction();
16658
16659       for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
16660            I != E; ++I)
16661         if (I->hasNestAttr())
16662           report_fatal_error("Cannot use segmented stacks with functions that "
16663                              "have nested arguments.");
16664     }
16665
          // The size is copied into a fresh pointer-class virtual register, which
          // is the operand of the SEG_ALLOCA node. Align is not used on this path.
16666     const TargetRegisterClass *AddrRegClass =
16667       getRegClassFor(getPointerTy());
16668     unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
16669     Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
16670     SDValue Value = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
16671                                 DAG.getRegister(Vreg, SPTy));
16672     SDValue Ops1[2] = { Value, Chain };
16673     return DAG.getMergeValues(Ops1, dl);
16674   } else {
16675     SDValue Flag;
          // The size is copied into RAX (LP64) or EAX, and that copy is glued to
          // the WIN_ALLOCA node.
16676     const unsigned Reg = (Subtarget->isTarget64BitLP64() ? X86::RAX : X86::EAX);
16677
16678     Chain = DAG.getCopyToReg(Chain, dl, Reg, Size, Flag);
16679     Flag = Chain.getValue(1);
16680     SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
16681
16682     Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag);
16683
          // The result of the allocation is the stack pointer read after the
          // WIN_ALLOCA node.
16684     const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
16685         DAG.getSubtarget().getRegisterInfo());
16686     unsigned SPReg = RegInfo->getStackRegister();
16687     SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
16688     Chain = SP.getValue(1);
16689
          // Any non-zero alignment masks the stack pointer with -Align and writes
          // it back; unlike the inline path above there is no comparison against
          // the stack alignment here.
16690     if (Align) {
16691       SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
16692                        DAG.getConstant(-(uint64_t)Align, VT));
16693       Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
16694     }
16695
16696     SDValue Ops1[2] = { SP, Chain };
16697     return DAG.getMergeValues(Ops1, dl);
16698   }
16699 }
16700
16701 SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
16702   MachineFunction &MF = DAG.getMachineFunction();
16703   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
16704
16705   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
16706   SDLoc DL(Op);
16707
16708   if (!Subtarget->is64Bit() || Subtarget->isTargetWin64()) {
16709     // vastart just stores the address of the VarArgsFrameIndex slot into the
16710     // memory location argument.
16711     SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
16712                                    getPointerTy());
16713     return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
16714                         MachinePointerInfo(SV), false, false, 0);
16715   }
16716
16717   // __va_list_tag:
16718   //   gp_offset         (0 - 6 * 8)
16719   //   fp_offset         (48 - 48 + 8 * 16)
16720   //   overflow_arg_area (point to parameters coming in memory).
16721   //   reg_save_area
16722   SmallVector<SDValue, 8> MemOps;
16723   SDValue FIN = Op.getOperand(1);
16724   // Store gp_offset
16725   SDValue Store = DAG.getStore(Op.getOperand(0), DL,
16726                                DAG.getConstant(FuncInfo->getVarArgsGPOffset(),
16727                                                MVT::i32),
16728                                FIN, MachinePointerInfo(SV), false, false, 0);
16729   MemOps.push_back(Store);
16730
16731   // Store fp_offset
16732   FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
16733                     FIN, DAG.getIntPtrConstant(4));
16734   Store = DAG.getStore(Op.getOperand(0), DL,
16735                        DAG.getConstant(FuncInfo->getVarArgsFPOffset(),
16736                                        MVT::i32),
16737                        FIN, MachinePointerInfo(SV, 4), false, false, 0);
16738   MemOps.push_back(Store);
16739
16740   // Store ptr to overflow_arg_area
16741   FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
16742                     FIN, DAG.getIntPtrConstant(4));
16743   SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
16744                                     getPointerTy());
16745   Store = DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN,
16746                        MachinePointerInfo(SV, 8),
16747                        false, false, 0);
16748   MemOps.push_back(Store);
16749
16750   // Store ptr to reg_save_area.
16751   FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
16752                     FIN, DAG.getIntPtrConstant(8));
16753   SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
16754                                     getPointerTy());
16755   Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN,
16756                        MachinePointerInfo(SV, 16), false, false, 0);
16757   MemOps.push_back(Store);
16758   return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
16759 }
16760
16761 SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
16762   assert(Subtarget->is64Bit() &&
16763          "LowerVAARG only handles 64-bit va_arg!");
16764   assert((Subtarget->isTargetLinux() ||
16765           Subtarget->isTargetDarwin()) &&
16766           "Unhandled target in LowerVAARG");
16767   assert(Op.getNode()->getNumOperands() == 4);
16768   SDValue Chain = Op.getOperand(0);
16769   SDValue SrcPtr = Op.getOperand(1);
16770   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
16771   unsigned Align = Op.getConstantOperandVal(3);
16772   SDLoc dl(Op);
16773
16774   EVT ArgVT = Op.getNode()->getValueType(0);
16775   Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
16776   uint32_t ArgSize = getDataLayout()->getTypeAllocSize(ArgTy);
16777   uint8_t ArgMode;
16778
16779   // Decide which area this value should be read from.
16780   // TODO: Implement the AMD64 ABI in its entirety. This simple
16781   // selection mechanism works only for the basic types.
16782   if (ArgVT == MVT::f80) {
16783     llvm_unreachable("va_arg for f80 not yet implemented");
16784   } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
16785     ArgMode = 2;  // Argument passed in XMM register. Use fp_offset.
16786   } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
16787     ArgMode = 1;  // Argument passed in GPR64 register(s). Use gp_offset.
16788   } else {
16789     llvm_unreachable("Unhandled argument type in LowerVAARG");
16790   }
16791
16792   if (ArgMode == 2) {
16793     // Sanity Check: Make sure using fp_offset makes sense.
16794     assert(!DAG.getTarget().Options.UseSoftFloat &&
16795            !(DAG.getMachineFunction()
16796                 .getFunction()->getAttributes()
16797                 .hasAttribute(AttributeSet::FunctionIndex,
16798                               Attribute::NoImplicitFloat)) &&
16799            Subtarget->hasSSE1());
16800   }
16801
16802   // Insert VAARG_64 node into the DAG
16803   // VAARG_64 returns two values: Variable Argument Address, Chain
16804   SmallVector<SDValue, 11> InstOps;
16805   InstOps.push_back(Chain);
16806   InstOps.push_back(SrcPtr);
16807   InstOps.push_back(DAG.getConstant(ArgSize, MVT::i32));
16808   InstOps.push_back(DAG.getConstant(ArgMode, MVT::i8));
16809   InstOps.push_back(DAG.getConstant(Align, MVT::i32));
16810   SDVTList VTs = DAG.getVTList(getPointerTy(), MVT::Other);
16811   SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl,
16812                                           VTs, InstOps, MVT::i64,
16813                                           MachinePointerInfo(SV),
16814                                           /*Align=*/0,
16815                                           /*Volatile=*/false,
16816                                           /*ReadMem=*/true,
16817                                           /*WriteMem=*/true);
16818   Chain = VAARG.getValue(1);
16819
16820   // Load the next argument and return it
16821   return DAG.getLoad(ArgVT, dl,
16822                      Chain,
16823                      VAARG,
16824                      MachinePointerInfo(),
16825                      false, false, false, 0);
16826 }
16827
16828 static SDValue LowerVACOPY(SDValue Op, const X86Subtarget *Subtarget,
16829                            SelectionDAG &DAG) {
16830   // X86-64 va_list is a struct { i32, i32, i8*, i8* }.
16831   assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!");
16832   SDValue Chain = Op.getOperand(0);
16833   SDValue DstPtr = Op.getOperand(1);
16834   SDValue SrcPtr = Op.getOperand(2);
16835   const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
16836   const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
16837   SDLoc DL(Op);
16838
16839   return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
16840                        DAG.getIntPtrConstant(24), 8, /*isVolatile*/false,
16841                        false,
16842                        MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
16843 }
16844
16845 // getTargetVShiftByConstNode - Handle vector element shifts where the shift
16846 // amount is a constant. Takes immediate version of shift as input.
16847 static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, MVT VT,
16848                                           SDValue SrcOp, uint64_t ShiftAmt,
16849                                           SelectionDAG &DAG) {
16850   MVT ElementType = VT.getVectorElementType();
16851
16852   // Fold this packed shift into its first operand if ShiftAmt is 0.
16853   if (ShiftAmt == 0)
16854     return SrcOp;
16855
16856   // Check for ShiftAmt >= element width
16857   if (ShiftAmt >= ElementType.getSizeInBits()) {
16858     if (Opc == X86ISD::VSRAI)
16859       ShiftAmt = ElementType.getSizeInBits() - 1;
16860     else
16861       return DAG.getConstant(0, VT);
16862   }
16863
16864   assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
16865          && "Unknown target vector shift-by-constant node");
16866
16867   // Fold this packed vector shift into a build vector if SrcOp is a
16868   // vector of Constants or UNDEFs, and SrcOp valuetype is the same as VT.
16869   if (VT == SrcOp.getSimpleValueType() &&
16870       ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
16871     SmallVector<SDValue, 8> Elts;
16872     unsigned NumElts = SrcOp->getNumOperands();
16873     ConstantSDNode *ND;
16874
16875     switch(Opc) {
16876     default: llvm_unreachable(nullptr);
16877     case X86ISD::VSHLI:
16878       for (unsigned i=0; i!=NumElts; ++i) {
16879         SDValue CurrentOp = SrcOp->getOperand(i);
16880         if (CurrentOp->getOpcode() == ISD::UNDEF) {
16881           Elts.push_back(CurrentOp);
16882           continue;
16883         }
16884         ND = cast<ConstantSDNode>(CurrentOp);
16885         const APInt &C = ND->getAPIntValue();
16886         Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), ElementType));
16887       }
16888       break;
16889     case X86ISD::VSRLI:
16890       for (unsigned i=0; i!=NumElts; ++i) {
16891         SDValue CurrentOp = SrcOp->getOperand(i);
16892         if (CurrentOp->getOpcode() == ISD::UNDEF) {
16893           Elts.push_back(CurrentOp);
16894           continue;
16895         }
16896         ND = cast<ConstantSDNode>(CurrentOp);
16897         const APInt &C = ND->getAPIntValue();
16898         Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), ElementType));
16899       }
16900       break;
16901     case X86ISD::VSRAI:
16902       for (unsigned i=0; i!=NumElts; ++i) {
16903         SDValue CurrentOp = SrcOp->getOperand(i);
16904         if (CurrentOp->getOpcode() == ISD::UNDEF) {
16905           Elts.push_back(CurrentOp);
16906           continue;
16907         }
16908         ND = cast<ConstantSDNode>(CurrentOp);
16909         const APInt &C = ND->getAPIntValue();
16910         Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), ElementType));
16911       }
16912       break;
16913     }
16914
16915     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Elts);
16916   }
16917
16918   return DAG.getNode(Opc, dl, VT, SrcOp, DAG.getConstant(ShiftAmt, MVT::i8));
16919 }
16920
16921 // getTargetVShiftNode - Handle vector element shifts where the shift amount
16922 // may or may not be a constant. Takes immediate version of shift as input.
16923 static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT,
16924                                    SDValue SrcOp, SDValue ShAmt,
16925                                    SelectionDAG &DAG) {
16926   MVT SVT = ShAmt.getSimpleValueType();
16927   assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");
16928
16929   // Catch shift-by-constant.
16930   if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
16931     return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
16932                                       CShAmt->getZExtValue(), DAG);
16933
16934   // Change opcode to non-immediate version
16935   switch (Opc) {
16936     default: llvm_unreachable("Unknown target vector shift node");
16937     case X86ISD::VSHLI: Opc = X86ISD::VSHL; break;
16938     case X86ISD::VSRLI: Opc = X86ISD::VSRL; break;
16939     case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;
16940   }
16941
16942   const X86Subtarget &Subtarget =
16943       DAG.getTarget().getSubtarget<X86Subtarget>();
16944   if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
16945       ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) {
16946     // Let the shuffle legalizer expand this shift amount node.
16947     SDValue Op0 = ShAmt.getOperand(0);
16948     Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(Op0), MVT::v8i16, Op0);
16949     ShAmt = getShuffleVectorZeroOrUndef(Op0, 0, true, &Subtarget, DAG);
16950   } else {
16951     // Need to build a vector containing shift amount.
16952     // SSE/AVX packed shifts only use the lower 64-bit of the shift count.
16953     SmallVector<SDValue, 4> ShOps;
16954     ShOps.push_back(ShAmt);
16955     if (SVT == MVT::i32) {
16956       ShOps.push_back(DAG.getConstant(0, SVT));
16957       ShOps.push_back(DAG.getUNDEF(SVT));
16958     }
16959     ShOps.push_back(DAG.getUNDEF(SVT));
16960
16961     MVT BVT = SVT == MVT::i32 ? MVT::v4i32 : MVT::v2i64;
16962     ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, BVT, ShOps);
16963   }
16964
16965   // The return type has to be a 128-bit type with the same element
16966   // type as the input type.
16967   MVT EltVT = VT.getVectorElementType();
16968   EVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());
16969
16970   ShAmt = DAG.getNode(ISD::BITCAST, dl, ShVT, ShAmt);
16971   return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
16972 }
16973
16974 /// \brief Return (and \p Op, \p Mask) for compare instructions or
16975 /// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
16976 /// necessary casting for \p Mask when lowering masking intrinsics.
16977 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
16978                                     SDValue PreservedSrc,
16979                                     const X86Subtarget *Subtarget,
16980                                     SelectionDAG &DAG) {
16981     EVT VT = Op.getValueType();
16982     EVT MaskVT = EVT::getVectorVT(*DAG.getContext(),
16983                                   MVT::i1, VT.getVectorNumElements());
16984     EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
16985                                      Mask.getValueType().getSizeInBits());
16986     SDLoc dl(Op);
16987
16988     assert(MaskVT.isSimple() && "invalid mask type");
16989
16990     if (isAllOnes(Mask))
16991       return Op;
16992
16993     // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements
16994     // are extracted by EXTRACT_SUBVECTOR.
16995     SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
16996                               DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
16997                               DAG.getIntPtrConstant(0));
16998
16999     switch (Op.getOpcode()) {
17000       default: break;
17001       case X86ISD::PCMPEQM:
17002       case X86ISD::PCMPGTM:
17003       case X86ISD::CMPM:
17004       case X86ISD::CMPMU:
17005         return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
17006     }
17007     if (PreservedSrc.getOpcode() == ISD::UNDEF)
17008       PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
17009     return DAG.getNode(ISD::VSELECT, dl, VT, VMask, Op, PreservedSrc);
17010 }
17011
17012 /// \brief Creates an SDNode for a predicated scalar operation.
17013 /// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
17014 /// The mask is comming as MVT::i8 and it should be truncated
17015 /// to MVT::i1 while lowering masking intrinsics.
17016 /// The main difference between ScalarMaskingNode and VectorMaskingNode is using
17017 /// "X86select" instead of "vselect". We just can't create the "vselect" node for
17018 /// a scalar instruction.
17019 static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
17020                                     SDValue PreservedSrc,
17021                                     const X86Subtarget *Subtarget,
17022                                     SelectionDAG &DAG) {
17023     if (isAllOnes(Mask))
17024       return Op;
17025
17026     EVT VT = Op.getValueType();
17027     SDLoc dl(Op);
17028     // The mask should be of type MVT::i1
17029     SDValue IMask = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Mask);
17030
17031     if (PreservedSrc.getOpcode() == ISD::UNDEF)
17032       PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
17033     return DAG.getNode(X86ISD::SELECT, dl, VT, IMask, Op, PreservedSrc);
17034 }
17035
17036 static unsigned getOpcodeForFMAIntrinsic(unsigned IntNo) {
17037     switch (IntNo) {
17038     default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
17039     case Intrinsic::x86_fma_vfmadd_ps:
17040     case Intrinsic::x86_fma_vfmadd_pd:
17041     case Intrinsic::x86_fma_vfmadd_ps_256:
17042     case Intrinsic::x86_fma_vfmadd_pd_256:
17043     case Intrinsic::x86_fma_mask_vfmadd_ps_512:
17044     case Intrinsic::x86_fma_mask_vfmadd_pd_512:
17045       return X86ISD::FMADD;
17046     case Intrinsic::x86_fma_vfmsub_ps:
17047     case Intrinsic::x86_fma_vfmsub_pd:
17048     case Intrinsic::x86_fma_vfmsub_ps_256:
17049     case Intrinsic::x86_fma_vfmsub_pd_256:
17050     case Intrinsic::x86_fma_mask_vfmsub_ps_512:
17051     case Intrinsic::x86_fma_mask_vfmsub_pd_512:
17052       return X86ISD::FMSUB;
17053     case Intrinsic::x86_fma_vfnmadd_ps:
17054     case Intrinsic::x86_fma_vfnmadd_pd:
17055     case Intrinsic::x86_fma_vfnmadd_ps_256:
17056     case Intrinsic::x86_fma_vfnmadd_pd_256:
17057     case Intrinsic::x86_fma_mask_vfnmadd_ps_512:
17058     case Intrinsic::x86_fma_mask_vfnmadd_pd_512:
17059       return X86ISD::FNMADD;
17060     case Intrinsic::x86_fma_vfnmsub_ps:
17061     case Intrinsic::x86_fma_vfnmsub_pd:
17062     case Intrinsic::x86_fma_vfnmsub_ps_256:
17063     case Intrinsic::x86_fma_vfnmsub_pd_256:
17064     case Intrinsic::x86_fma_mask_vfnmsub_ps_512:
17065     case Intrinsic::x86_fma_mask_vfnmsub_pd_512:
17066       return X86ISD::FNMSUB;
17067     case Intrinsic::x86_fma_vfmaddsub_ps:
17068     case Intrinsic::x86_fma_vfmaddsub_pd:
17069     case Intrinsic::x86_fma_vfmaddsub_ps_256:
17070     case Intrinsic::x86_fma_vfmaddsub_pd_256:
17071     case Intrinsic::x86_fma_mask_vfmaddsub_ps_512:
17072     case Intrinsic::x86_fma_mask_vfmaddsub_pd_512:
17073       return X86ISD::FMADDSUB;
17074     case Intrinsic::x86_fma_vfmsubadd_ps:
17075     case Intrinsic::x86_fma_vfmsubadd_pd:
17076     case Intrinsic::x86_fma_vfmsubadd_ps_256:
17077     case Intrinsic::x86_fma_vfmsubadd_pd_256:
17078     case Intrinsic::x86_fma_mask_vfmsubadd_ps_512:
17079     case Intrinsic::x86_fma_mask_vfmsubadd_pd_512:
17080       return X86ISD::FMSUBADD;
17081     }
17082 }
17083
17084 static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
17085                                        SelectionDAG &DAG) {
17086   SDLoc dl(Op);
17087   unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
17088   EVT VT = Op.getValueType();
17089   const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
17090   if (IntrData) {
17091     switch(IntrData->Type) {
17092     case INTR_TYPE_1OP:
17093       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1));
17094     case INTR_TYPE_2OP:
17095       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
17096         Op.getOperand(2));
17097     case INTR_TYPE_3OP:
17098       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
17099         Op.getOperand(2), Op.getOperand(3));
17100     case INTR_TYPE_1OP_MASK_RM: {
17101       SDValue Src = Op.getOperand(1);
17102       SDValue Src0 = Op.getOperand(2);
17103       SDValue Mask = Op.getOperand(3);
17104       SDValue RoundingMode = Op.getOperand(4);
17105       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
17106                                               RoundingMode),
17107                                   Mask, Src0, Subtarget, DAG);
17108     }
17109     case INTR_TYPE_SCALAR_MASK_RM: {
17110       SDValue Src1 = Op.getOperand(1);
17111       SDValue Src2 = Op.getOperand(2);
17112       SDValue Src0 = Op.getOperand(3);
17113       SDValue Mask = Op.getOperand(4);
17114       SDValue RoundingMode = Op.getOperand(5);
17115       return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
17116                                               RoundingMode),
17117                                   Mask, Src0, Subtarget, DAG);
17118     }
17119     case INTR_TYPE_2OP_MASK: {
17120       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Op.getOperand(1),
17121                                               Op.getOperand(2)),
17122                                   Op.getOperand(4), Op.getOperand(3), Subtarget, DAG);
17123     }
17124     case CMP_MASK:
17125     case CMP_MASK_CC: {
17126       // Comparison intrinsics with masks.
17127       // Example of transformation:
17128       // (i8 (int_x86_avx512_mask_pcmpeq_q_128
17129       //             (v2i64 %a), (v2i64 %b), (i8 %mask))) ->
17130       // (i8 (bitcast
17131       //   (v8i1 (insert_subvector undef,
17132       //           (v2i1 (and (PCMPEQM %a, %b),
17133       //                      (extract_subvector
17134       //                         (v8i1 (bitcast %mask)), 0))), 0))))
17135       EVT VT = Op.getOperand(1).getValueType();
17136       EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17137                                     VT.getVectorNumElements());
17138       SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3);
17139       EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17140                                        Mask.getValueType().getSizeInBits());
17141       SDValue Cmp;
17142       if (IntrData->Type == CMP_MASK_CC) {
17143         Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
17144                     Op.getOperand(2), Op.getOperand(3));
17145       } else {
17146         assert(IntrData->Type == CMP_MASK && "Unexpected intrinsic type!");
17147         Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
17148                     Op.getOperand(2));
17149       }
17150       SDValue CmpMask = getVectorMaskingNode(Cmp, Mask,
17151                                              DAG.getTargetConstant(0, MaskVT),
17152                                              Subtarget, DAG);
17153       SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
17154                                 DAG.getUNDEF(BitcastVT), CmpMask,
17155                                 DAG.getIntPtrConstant(0));
17156       return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
17157     }
17158     case COMI: { // Comparison intrinsics
17159       ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
17160       SDValue LHS = Op.getOperand(1);
17161       SDValue RHS = Op.getOperand(2);
17162       unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG);
17163       assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!");
17164       SDValue Cond = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
17165       SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17166                                   DAG.getConstant(X86CC, MVT::i8), Cond);
17167       return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
17168     }
17169     case VSHIFT:
17170       return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
17171                                  Op.getOperand(1), Op.getOperand(2), DAG);
17172     case VSHIFT_MASK:
17173       return getVectorMaskingNode(getTargetVShiftNode(IntrData->Opc0, dl,
17174                                                       Op.getSimpleValueType(),
17175                                                       Op.getOperand(1),
17176                                                       Op.getOperand(2), DAG),
17177                                   Op.getOperand(4), Op.getOperand(3), Subtarget,
17178                                   DAG);
17179     case COMPRESS_EXPAND_IN_REG: {
17180       SDValue Mask = Op.getOperand(3);
17181       SDValue DataToCompress = Op.getOperand(1);
17182       SDValue PassThru = Op.getOperand(2);
17183       if (isAllOnes(Mask)) // return data as is
17184         return Op.getOperand(1);
17185       EVT VT = Op.getValueType();
17186       EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17187                                     VT.getVectorNumElements());
17188       EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17189                                        Mask.getValueType().getSizeInBits());
17190       SDLoc dl(Op);
17191       SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
17192                                   DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
17193                                   DAG.getIntPtrConstant(0));
17194
17195       return DAG.getNode(IntrData->Opc0, dl, VT, VMask, DataToCompress,
17196                          PassThru);
17197     }
17198     case BLEND: {
17199       SDValue Mask = Op.getOperand(3);
17200       EVT VT = Op.getValueType();
17201       EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17202                                     VT.getVectorNumElements());
17203       EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17204                                        Mask.getValueType().getSizeInBits());
17205       SDLoc dl(Op);
17206       SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
17207                                   DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
17208                                   DAG.getIntPtrConstant(0));
17209       return DAG.getNode(IntrData->Opc0, dl, VT, VMask, Op.getOperand(1),
17210                          Op.getOperand(2));
17211     }
17212     case FMA_OP_MASK:
17213     {
17214         return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
17215             dl, Op.getValueType(),
17216             Op.getOperand(1),
17217             Op.getOperand(2),
17218             Op.getOperand(3)),
17219             Op.getOperand(4), Op.getOperand(1),
17220             Subtarget, DAG);
17221     }
17222     default:
17223       break;
17224     }
17225   }
17226
17227   switch (IntNo) {
17228   default: return SDValue();    // Don't custom lower most intrinsics.
17229
17230   case Intrinsic::x86_avx512_mask_valign_q_512:
17231   case Intrinsic::x86_avx512_mask_valign_d_512:
17232     // Vector source operands are swapped.
17233     return getVectorMaskingNode(DAG.getNode(X86ISD::VALIGN, dl,
17234                                             Op.getValueType(), Op.getOperand(2),
17235                                             Op.getOperand(1),
17236                                             Op.getOperand(3)),
17237                                 Op.getOperand(5), Op.getOperand(4),
17238                                 Subtarget, DAG);
17239
17240   // ptest and testp intrinsics. The intrinsic these come from are designed to
17241   // return an integer value, not just an instruction so lower it to the ptest
17242   // or testp pattern and a setcc for the result.
17243   case Intrinsic::x86_sse41_ptestz:
17244   case Intrinsic::x86_sse41_ptestc:
17245   case Intrinsic::x86_sse41_ptestnzc:
17246   case Intrinsic::x86_avx_ptestz_256:
17247   case Intrinsic::x86_avx_ptestc_256:
17248   case Intrinsic::x86_avx_ptestnzc_256:
17249   case Intrinsic::x86_avx_vtestz_ps:
17250   case Intrinsic::x86_avx_vtestc_ps:
17251   case Intrinsic::x86_avx_vtestnzc_ps:
17252   case Intrinsic::x86_avx_vtestz_pd:
17253   case Intrinsic::x86_avx_vtestc_pd:
17254   case Intrinsic::x86_avx_vtestnzc_pd:
17255   case Intrinsic::x86_avx_vtestz_ps_256:
17256   case Intrinsic::x86_avx_vtestc_ps_256:
17257   case Intrinsic::x86_avx_vtestnzc_ps_256:
17258   case Intrinsic::x86_avx_vtestz_pd_256:
17259   case Intrinsic::x86_avx_vtestc_pd_256:
17260   case Intrinsic::x86_avx_vtestnzc_pd_256: {
17261     bool IsTestPacked = false;
17262     unsigned X86CC;
17263     switch (IntNo) {
17264     default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
17265     case Intrinsic::x86_avx_vtestz_ps:
17266     case Intrinsic::x86_avx_vtestz_pd:
17267     case Intrinsic::x86_avx_vtestz_ps_256:
17268     case Intrinsic::x86_avx_vtestz_pd_256:
17269       IsTestPacked = true; // Fallthrough
17270     case Intrinsic::x86_sse41_ptestz:
17271     case Intrinsic::x86_avx_ptestz_256:
17272       // ZF = 1
17273       X86CC = X86::COND_E;
17274       break;
17275     case Intrinsic::x86_avx_vtestc_ps:
17276     case Intrinsic::x86_avx_vtestc_pd:
17277     case Intrinsic::x86_avx_vtestc_ps_256:
17278     case Intrinsic::x86_avx_vtestc_pd_256:
17279       IsTestPacked = true; // Fallthrough
17280     case Intrinsic::x86_sse41_ptestc:
17281     case Intrinsic::x86_avx_ptestc_256:
17282       // CF = 1
17283       X86CC = X86::COND_B;
17284       break;
17285     case Intrinsic::x86_avx_vtestnzc_ps:
17286     case Intrinsic::x86_avx_vtestnzc_pd:
17287     case Intrinsic::x86_avx_vtestnzc_ps_256:
17288     case Intrinsic::x86_avx_vtestnzc_pd_256:
17289       IsTestPacked = true; // Fallthrough
17290     case Intrinsic::x86_sse41_ptestnzc:
17291     case Intrinsic::x86_avx_ptestnzc_256:
17292       // ZF and CF = 0
17293       X86CC = X86::COND_A;
17294       break;
17295     }
17296
17297     SDValue LHS = Op.getOperand(1);
17298     SDValue RHS = Op.getOperand(2);
17299     unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST;
17300     SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
17301     SDValue CC = DAG.getConstant(X86CC, MVT::i8);
17302     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test);
17303     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
17304   }
17305   case Intrinsic::x86_avx512_kortestz_w:
17306   case Intrinsic::x86_avx512_kortestc_w: {
17307     unsigned X86CC = (IntNo == Intrinsic::x86_avx512_kortestz_w)? X86::COND_E: X86::COND_B;
17308     SDValue LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, Op.getOperand(1));
17309     SDValue RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, Op.getOperand(2));
17310     SDValue CC = DAG.getConstant(X86CC, MVT::i8);
17311     SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
17312     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i1, CC, Test);
17313     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
17314   }
17315
17316   case Intrinsic::x86_sse42_pcmpistria128:
17317   case Intrinsic::x86_sse42_pcmpestria128:
17318   case Intrinsic::x86_sse42_pcmpistric128:
17319   case Intrinsic::x86_sse42_pcmpestric128:
17320   case Intrinsic::x86_sse42_pcmpistrio128:
17321   case Intrinsic::x86_sse42_pcmpestrio128:
17322   case Intrinsic::x86_sse42_pcmpistris128:
17323   case Intrinsic::x86_sse42_pcmpestris128:
17324   case Intrinsic::x86_sse42_pcmpistriz128:
17325   case Intrinsic::x86_sse42_pcmpestriz128: {
17326     unsigned Opcode;
17327     unsigned X86CC;
17328     switch (IntNo) {
17329     default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
17330     case Intrinsic::x86_sse42_pcmpistria128:
17331       Opcode = X86ISD::PCMPISTRI;
17332       X86CC = X86::COND_A;
17333       break;
17334     case Intrinsic::x86_sse42_pcmpestria128:
17335       Opcode = X86ISD::PCMPESTRI;
17336       X86CC = X86::COND_A;
17337       break;
17338     case Intrinsic::x86_sse42_pcmpistric128:
17339       Opcode = X86ISD::PCMPISTRI;
17340       X86CC = X86::COND_B;
17341       break;
17342     case Intrinsic::x86_sse42_pcmpestric128:
17343       Opcode = X86ISD::PCMPESTRI;
17344       X86CC = X86::COND_B;
17345       break;
17346     case Intrinsic::x86_sse42_pcmpistrio128:
17347       Opcode = X86ISD::PCMPISTRI;
17348       X86CC = X86::COND_O;
17349       break;
17350     case Intrinsic::x86_sse42_pcmpestrio128:
17351       Opcode = X86ISD::PCMPESTRI;
17352       X86CC = X86::COND_O;
17353       break;
17354     case Intrinsic::x86_sse42_pcmpistris128:
17355       Opcode = X86ISD::PCMPISTRI;
17356       X86CC = X86::COND_S;
17357       break;
17358     case Intrinsic::x86_sse42_pcmpestris128:
17359       Opcode = X86ISD::PCMPESTRI;
17360       X86CC = X86::COND_S;
17361       break;
17362     case Intrinsic::x86_sse42_pcmpistriz128:
17363       Opcode = X86ISD::PCMPISTRI;
17364       X86CC = X86::COND_E;
17365       break;
17366     case Intrinsic::x86_sse42_pcmpestriz128:
17367       Opcode = X86ISD::PCMPESTRI;
17368       X86CC = X86::COND_E;
17369       break;
17370     }
17371     SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
17372     SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
17373     SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps);
17374     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17375                                 DAG.getConstant(X86CC, MVT::i8),
17376                                 SDValue(PCMP.getNode(), 1));
17377     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
17378   }
17379
17380   case Intrinsic::x86_sse42_pcmpistri128:
17381   case Intrinsic::x86_sse42_pcmpestri128: {
17382     unsigned Opcode;
17383     if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
17384       Opcode = X86ISD::PCMPISTRI;
17385     else
17386       Opcode = X86ISD::PCMPESTRI;
17387
17388     SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
17389     SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
17390     return DAG.getNode(Opcode, dl, VTs, NewOps);
17391   }
17392
17393   case Intrinsic::x86_fma_mask_vfmadd_ps_512:
17394   case Intrinsic::x86_fma_mask_vfmadd_pd_512:
17395   case Intrinsic::x86_fma_mask_vfmsub_ps_512:
17396   case Intrinsic::x86_fma_mask_vfmsub_pd_512:
17397   case Intrinsic::x86_fma_mask_vfnmadd_ps_512:
17398   case Intrinsic::x86_fma_mask_vfnmadd_pd_512:
17399   case Intrinsic::x86_fma_mask_vfnmsub_ps_512:
17400   case Intrinsic::x86_fma_mask_vfnmsub_pd_512:
17401   case Intrinsic::x86_fma_mask_vfmaddsub_ps_512:
17402   case Intrinsic::x86_fma_mask_vfmaddsub_pd_512:
17403   case Intrinsic::x86_fma_mask_vfmsubadd_ps_512:
17404   case Intrinsic::x86_fma_mask_vfmsubadd_pd_512: {
17405     auto *SAE = cast<ConstantSDNode>(Op.getOperand(5));
17406     if (SAE->getZExtValue() == X86::STATIC_ROUNDING::CUR_DIRECTION)
17407       return getVectorMaskingNode(DAG.getNode(getOpcodeForFMAIntrinsic(IntNo),
17408                                               dl, Op.getValueType(),
17409                                               Op.getOperand(1),
17410                                               Op.getOperand(2),
17411                                               Op.getOperand(3)),
17412                                   Op.getOperand(4), Op.getOperand(1),
17413                                   Subtarget, DAG);
17414     else
17415       return SDValue();
17416   }
17417
17418   case Intrinsic::x86_fma_vfmadd_ps:
17419   case Intrinsic::x86_fma_vfmadd_pd:
17420   case Intrinsic::x86_fma_vfmsub_ps:
17421   case Intrinsic::x86_fma_vfmsub_pd:
17422   case Intrinsic::x86_fma_vfnmadd_ps:
17423   case Intrinsic::x86_fma_vfnmadd_pd:
17424   case Intrinsic::x86_fma_vfnmsub_ps:
17425   case Intrinsic::x86_fma_vfnmsub_pd:
17426   case Intrinsic::x86_fma_vfmaddsub_ps:
17427   case Intrinsic::x86_fma_vfmaddsub_pd:
17428   case Intrinsic::x86_fma_vfmsubadd_ps:
17429   case Intrinsic::x86_fma_vfmsubadd_pd:
17430   case Intrinsic::x86_fma_vfmadd_ps_256:
17431   case Intrinsic::x86_fma_vfmadd_pd_256:
17432   case Intrinsic::x86_fma_vfmsub_ps_256:
17433   case Intrinsic::x86_fma_vfmsub_pd_256:
17434   case Intrinsic::x86_fma_vfnmadd_ps_256:
17435   case Intrinsic::x86_fma_vfnmadd_pd_256:
17436   case Intrinsic::x86_fma_vfnmsub_ps_256:
17437   case Intrinsic::x86_fma_vfnmsub_pd_256:
17438   case Intrinsic::x86_fma_vfmaddsub_ps_256:
17439   case Intrinsic::x86_fma_vfmaddsub_pd_256:
17440   case Intrinsic::x86_fma_vfmsubadd_ps_256:
17441   case Intrinsic::x86_fma_vfmsubadd_pd_256:
17442     return DAG.getNode(getOpcodeForFMAIntrinsic(IntNo), dl, Op.getValueType(),
17443                        Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
17444   }
17445 }
17446
17447 static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
17448                               SDValue Src, SDValue Mask, SDValue Base,
17449                               SDValue Index, SDValue ScaleOp, SDValue Chain,
17450                               const X86Subtarget * Subtarget) {
17451   SDLoc dl(Op);
17452   ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
17453   assert(C && "Invalid scale type");
17454   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
17455   EVT MaskVT = MVT::getVectorVT(MVT::i1,
17456                              Index.getSimpleValueType().getVectorNumElements());
17457   SDValue MaskInReg;
17458   ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
17459   if (MaskC)
17460     MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT);
17461   else
17462     MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
17463   SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
17464   SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
17465   SDValue Segment = DAG.getRegister(0, MVT::i32);
17466   if (Src.getOpcode() == ISD::UNDEF)
17467     Src = getZeroVector(Op.getValueType(), Subtarget, DAG, dl);
17468   SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain};
17469   SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
17470   SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
17471   return DAG.getMergeValues(RetOps, dl);
17472 }
17473
17474 static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
17475                                SDValue Src, SDValue Mask, SDValue Base,
17476                                SDValue Index, SDValue ScaleOp, SDValue Chain) {
17477   SDLoc dl(Op);
17478   ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
17479   assert(C && "Invalid scale type");
17480   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
17481   SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
17482   SDValue Segment = DAG.getRegister(0, MVT::i32);
17483   EVT MaskVT = MVT::getVectorVT(MVT::i1,
17484                              Index.getSimpleValueType().getVectorNumElements());
17485   SDValue MaskInReg;
17486   ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
17487   if (MaskC)
17488     MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT);
17489   else
17490     MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
17491   SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
17492   SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain};
17493   SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
17494   return SDValue(Res, 1);
17495 }
17496
17497 static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
17498                                SDValue Mask, SDValue Base, SDValue Index,
17499                                SDValue ScaleOp, SDValue Chain) {
17500   SDLoc dl(Op);
17501   ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
17502   assert(C && "Invalid scale type");
17503   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
17504   SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
17505   SDValue Segment = DAG.getRegister(0, MVT::i32);
17506   EVT MaskVT =
17507     MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
17508   SDValue MaskInReg;
17509   ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
17510   if (MaskC)
17511     MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT);
17512   else
17513     MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
17514   //SDVTList VTs = DAG.getVTList(MVT::Other);
17515   SDValue Ops[] = {MaskInReg, Base, Scale, Index, Disp, Segment, Chain};
17516   SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
17517   return SDValue(Res, 0);
17518 }
17519
17520 // getReadPerformanceCounter - Handles the lowering of builtin intrinsics that
17521 // read performance monitor counters (x86_rdpmc).
17522 static void getReadPerformanceCounter(SDNode *N, SDLoc DL,
17523                               SelectionDAG &DAG, const X86Subtarget *Subtarget,
17524                               SmallVectorImpl<SDValue> &Results) {
17525   assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
17526   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
17527   SDValue LO, HI;
17528
17529   // The ECX register is used to select the index of the performance counter
17530   // to read.
17531   SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX,
17532                                    N->getOperand(2));
17533   SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain);
17534
17535   // Reads the content of a 64-bit performance counter and returns it in the
17536   // registers EDX:EAX.
17537   if (Subtarget->is64Bit()) {
17538     LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
17539     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
17540                             LO.getValue(2));
17541   } else {
17542     LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
17543     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
17544                             LO.getValue(2));
17545   }
17546   Chain = HI.getValue(1);
17547
17548   if (Subtarget->is64Bit()) {
17549     // The EAX register is loaded with the low-order 32 bits. The EDX register
17550     // is loaded with the supported high-order bits of the counter.
17551     SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
17552                               DAG.getConstant(32, MVT::i8));
17553     Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
17554     Results.push_back(Chain);
17555     return;
17556   }
17557
17558   // Use a buildpair to merge the two 32-bit values into a 64-bit one.
17559   SDValue Ops[] = { LO, HI };
17560   SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
17561   Results.push_back(Pair);
17562   Results.push_back(Chain);
17563 }
17564
17565 // getReadTimeStampCounter - Handles the lowering of builtin intrinsics that
17566 // read the time stamp counter (x86_rdtsc and x86_rdtscp). This function is
17567 // also used to custom lower READCYCLECOUNTER nodes.
17568 static void getReadTimeStampCounter(SDNode *N, SDLoc DL, unsigned Opcode,
17569                               SelectionDAG &DAG, const X86Subtarget *Subtarget,
17570                               SmallVectorImpl<SDValue> &Results) {
17571   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
17572   SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0));
17573   SDValue LO, HI;
17574
17575   // The processor's time-stamp counter (a 64-bit MSR) is stored into the
17576   // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
17577   // and the EAX register is loaded with the low-order 32 bits.
17578   if (Subtarget->is64Bit()) {
17579     LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
17580     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
17581                             LO.getValue(2));
17582   } else {
17583     LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
17584     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
17585                             LO.getValue(2));
17586   }
17587   SDValue Chain = HI.getValue(1);
17588
17589   if (Opcode == X86ISD::RDTSCP_DAG) {
17590     assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
17591
17592     // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
17593     // the ECX register. Add 'ecx' explicitly to the chain.
17594     SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32,
17595                                      HI.getValue(2));
17596     // Explicitly store the content of ECX at the location passed in input
17597     // to the 'rdtscp' intrinsic.
17598     Chain = DAG.getStore(ecx.getValue(1), DL, ecx, N->getOperand(2),
17599                          MachinePointerInfo(), false, false, 0);
17600   }
17601
17602   if (Subtarget->is64Bit()) {
17603     // The EDX register is loaded with the high-order 32 bits of the MSR, and
17604     // the EAX register is loaded with the low-order 32 bits.
17605     SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
17606                               DAG.getConstant(32, MVT::i8));
17607     Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
17608     Results.push_back(Chain);
17609     return;
17610   }
17611
17612   // Use a buildpair to merge the two 32-bit values into a 64-bit one.
17613   SDValue Ops[] = { LO, HI };
17614   SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
17615   Results.push_back(Pair);
17616   Results.push_back(Chain);
17617 }
17618
17619 static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget,
17620                                      SelectionDAG &DAG) {
17621   SmallVector<SDValue, 2> Results;
17622   SDLoc DL(Op);
17623   getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget,
17624                           Results);
17625   return DAG.getMergeValues(Results, DL);
17626 }
17627
17628
// Custom-lowers an intrinsic node that carries a chain.
//
// Operand 0 of Op is the incoming chain and operand 1 is the intrinsic ID;
// the layout of the remaining operands depends on the intrinsic type found in
// the table entry returned by getIntrinsicWithChain. Returns an empty SDValue
// when the intrinsic has no table entry.
static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
                                      SelectionDAG &DAG) {
  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();

  const IntrinsicData* IntrData = getIntrinsicWithChain(IntNo);
  if (!IntrData)
    return SDValue();

  SDLoc dl(Op);
  switch(IntrData->Type) {
  default:
    llvm_unreachable("Unknown Intrinsic Type");
    break;
  case RDSEED:
  case RDRAND: {
    // Emit the node with the right value type.
    // Results are: the random value, a glue result, and the chain.
    SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other);
    SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));

    // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
    // Otherwise return the value from Rand, which is always 0, casted to i32.
    SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
                      DAG.getConstant(1, Op->getValueType(1)),
                      DAG.getConstant(X86::COND_B, MVT::i32),
                      SDValue(Result.getNode(), 1) };
    SDValue isValid = DAG.getNode(X86ISD::CMOV, dl,
                                  DAG.getVTList(Op->getValueType(1), MVT::Glue),
                                  Ops);

    // Return { result, isValid, chain }.
    return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
                       SDValue(Result.getNode(), 2));
  }
  case GATHER: {
  // Node operands as read below: (chain, id, v1, base, index, mask, scale).
    SDValue Chain = Op.getOperand(0);
    SDValue Src   = Op.getOperand(2);
    SDValue Base  = Op.getOperand(3);
    SDValue Index = Op.getOperand(4);
    SDValue Mask  = Op.getOperand(5);
    SDValue Scale = Op.getOperand(6);
    return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain,
                          Subtarget);
  }
  case SCATTER: {
  // Node operands as read below: (chain, id, base, mask, index, v1, scale).
    SDValue Chain = Op.getOperand(0);
    SDValue Base  = Op.getOperand(2);
    SDValue Mask  = Op.getOperand(3);
    SDValue Index = Op.getOperand(4);
    SDValue Src   = Op.getOperand(5);
    SDValue Scale = Op.getOperand(6);
    return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain);
  }
  case PREFETCH: {
    // Node operands as read below:
    // (chain, id, mask, index, base, scale, hint).
    // The hint must be a constant 0 or 1; it selects Opc0 (0) or Opc1 (1).
    SDValue Hint = Op.getOperand(6);
    unsigned HintVal;
    if (dyn_cast<ConstantSDNode> (Hint) == nullptr ||
        (HintVal = dyn_cast<ConstantSDNode> (Hint)->getZExtValue()) > 1)
      llvm_unreachable("Wrong prefetch hint in intrinsic: should be 0 or 1");
    unsigned Opcode = (HintVal ? IntrData->Opc1 : IntrData->Opc0);
    SDValue Chain = Op.getOperand(0);
    SDValue Mask  = Op.getOperand(2);
    SDValue Index = Op.getOperand(3);
    SDValue Base  = Op.getOperand(4);
    SDValue Scale = Op.getOperand(5);
    return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain);
  }
  // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
  case RDTSC: {
    SmallVector<SDValue, 2> Results;
    getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget, Results);
    return DAG.getMergeValues(Results, dl);
  }
  // Read Performance Monitoring Counters.
  case RDPMC: {
    SmallVector<SDValue, 2> Results;
    getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results);
    return DAG.getMergeValues(Results, dl);
  }
  // XTEST intrinsics.
  case XTEST: {
    // The Opc0 node's first result feeds a SETCC on COND_NE, which is then
    // zero-extended to the intrinsic's result type; its second result is
    // returned as the chain.
    SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
    SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
    SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
                                DAG.getConstant(X86::COND_NE, MVT::i8),
                                InTrans);
    SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
    return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
                       Ret, SDValue(InTrans.getNode(), 1));
  }
  // ADC/ADCX/SBB
  case ADX: {
    // Node operands as read below: operand 2 is added to -1 (i8) by an
    // X86ISD::ADD whose second result is passed to the Opc0 node; operands 3
    // and 4 are the Opc0 node's two inputs and operand 5 is the address its
    // first result is stored to.
    // NOTE(review): presumably the ADD with -1 re-creates the carry flag from
    // the carry-in byte - confirm.
    SmallVector<SDValue, 2> Results;
    SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
    SDVTList VTs = DAG.getVTList(Op.getOperand(3)->getValueType(0), MVT::Other);
    SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2),
                                DAG.getConstant(-1, MVT::i8));
    SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3),
                              Op.getOperand(4), GenCF.getValue(1));
    SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0),
                                 Op.getOperand(5), MachinePointerInfo(),
                                 false, false, 0);
    // The returned value is a SETCC on COND_B of the Opc0 node's second
    // result; the store acts as the outgoing chain.
    SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
                                DAG.getConstant(X86::COND_B, MVT::i8),
                                Res.getValue(1));
    Results.push_back(SetCC);
    Results.push_back(Store);
    return DAG.getMergeValues(Results, dl);
  }
  case COMPRESS_TO_MEM: {
    // Node operands as read below: (chain, id, addr, data, mask).
    // Note: this 'dl' shadows the one declared before the switch.
    SDLoc dl(Op);
    SDValue Mask = Op.getOperand(4);
    SDValue DataToCompress = Op.getOperand(3);
    SDValue Addr = Op.getOperand(2);
    SDValue Chain = Op.getOperand(0);

    if (isAllOnes(Mask)) // return just a store
      return DAG.getStore(Chain, dl, DataToCompress, Addr,
                          MachinePointerInfo(), false, false, 0);

    // Bitcast the mask to a vector of i1 with one element per mask bit, then
    // take the leading elements, one per element of the data vector.
    EVT VT = DataToCompress.getValueType();
    EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
                                  VT.getVectorNumElements());
    EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
                                     Mask.getValueType().getSizeInBits());
    SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
                                DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
                                DAG.getIntPtrConstant(0));

    // Compress in registers (undef pass-through), then store the result.
    SDValue Compressed =  DAG.getNode(IntrData->Opc0, dl, VT, VMask,
                                      DataToCompress, DAG.getUNDEF(VT));
    return DAG.getStore(Chain, dl, Compressed, Addr,
                        MachinePointerInfo(), false, false, 0);
  }
  case EXPAND_FROM_MEM: {
    // Node operands as read below: (chain, id, addr, pass-through, mask).
    // Note: this 'dl' shadows the one declared before the switch.
    SDLoc dl(Op);
    SDValue Mask = Op.getOperand(4);
    SDValue PathThru = Op.getOperand(3);
    SDValue Addr = Op.getOperand(2);
    SDValue Chain = Op.getOperand(0);
    EVT VT = Op.getValueType();

    if (isAllOnes(Mask)) // return just a load
      return DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(), false, false,
                         false, 0);
    // Bitcast the mask to a vector of i1 with one element per mask bit, then
    // take the leading elements, one per element of the result vector.
    EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
                                  VT.getVectorNumElements());
    EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
                                     Mask.getValueType().getSizeInBits());
    SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
                                DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
                                DAG.getIntPtrConstant(0));

    // Load the full vector, then expand it in registers.
    SDValue DataToExpand = DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(),
                                   false, false, false, 0);

    // NOTE(review): the chain returned here is the incoming Chain, not the
    // chain result of the load above - confirm this is intended.
    SmallVector<SDValue, 2> Results;
    Results.push_back(DAG.getNode(IntrData->Opc0, dl, VT, VMask, DataToExpand,
                                  PathThru));
    Results.push_back(Chain);
    return DAG.getMergeValues(Results, dl);
  }
  }
}
17794
17795 SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
17796                                            SelectionDAG &DAG) const {
17797   MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
17798   MFI->setReturnAddressIsTaken(true);
17799
17800   if (verifyReturnAddressArgumentIsConstant(Op, DAG))
17801     return SDValue();
17802
17803   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
17804   SDLoc dl(Op);
17805   EVT PtrVT = getPointerTy();
17806
17807   if (Depth > 0) {
17808     SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
17809     const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
17810         DAG.getSubtarget().getRegisterInfo());
17811     SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), PtrVT);
17812     return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
17813                        DAG.getNode(ISD::ADD, dl, PtrVT,
17814                                    FrameAddr, Offset),
17815                        MachinePointerInfo(), false, false, false, 0);
17816   }
17817
17818   // Just load the return address.
17819   SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
17820   return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
17821                      RetAddrFI, MachinePointerInfo(), false, false, false, 0);
17822 }
17823
17824 SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
17825   MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
17826   MFI->setFrameAddressIsTaken(true);
17827
17828   EVT VT = Op.getValueType();
17829   SDLoc dl(Op);  // FIXME probably not meaningful
17830   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
17831   const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
17832       DAG.getSubtarget().getRegisterInfo());
17833   unsigned FrameReg = RegInfo->getPtrSizedFrameRegister(
17834       DAG.getMachineFunction());
17835   assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
17836           (FrameReg == X86::EBP && VT == MVT::i32)) &&
17837          "Invalid Frame Register!");
17838   SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
17839   while (Depth--)
17840     FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
17841                             MachinePointerInfo(),
17842                             false, false, false, 0);
17843   return FrameAddr;
17844 }
17845
17846 // FIXME? Maybe this could be a TableGen attribute on some registers and
17847 // this table could be generated automatically from RegInfo.
17848 unsigned X86TargetLowering::getRegisterByName(const char* RegName,
17849                                               EVT VT) const {
17850   unsigned Reg = StringSwitch<unsigned>(RegName)
17851                        .Case("esp", X86::ESP)
17852                        .Case("rsp", X86::RSP)
17853                        .Default(0);
17854   if (Reg)
17855     return Reg;
17856   report_fatal_error("Invalid register name global variable");
17857 }
17858
17859 SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
17860                                                      SelectionDAG &DAG) const {
17861   const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
17862       DAG.getSubtarget().getRegisterInfo());
17863   return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize());
17864 }
17865
17866 SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
17867   SDValue Chain     = Op.getOperand(0);
17868   SDValue Offset    = Op.getOperand(1);
17869   SDValue Handler   = Op.getOperand(2);
17870   SDLoc dl      (Op);
17871
17872   EVT PtrVT = getPointerTy();
17873   const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
17874       DAG.getSubtarget().getRegisterInfo());
17875   unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
17876   assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
17877           (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
17878          "Invalid Frame Register!");
17879   SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
17880   unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
17881
17882   SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
17883                                  DAG.getIntPtrConstant(RegInfo->getSlotSize()));
17884   StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
17885   Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(),
17886                        false, false, 0);
17887   Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
17888
17889   return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
17890                      DAG.getRegister(StoreAddrReg, PtrVT));
17891 }
17892
17893 SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
17894                                                SelectionDAG &DAG) const {
17895   SDLoc DL(Op);
17896   return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
17897                      DAG.getVTList(MVT::i32, MVT::Other),
17898                      Op.getOperand(0), Op.getOperand(1));
17899 }
17900
17901 SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
17902                                                 SelectionDAG &DAG) const {
17903   SDLoc DL(Op);
17904   return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
17905                      Op.getOperand(0), Op.getOperand(1));
17906 }
17907
17908 static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
17909   return Op.getOperand(0);
17910 }
17911
// Lowers INIT_TRAMPOLINE by emitting the stores that write the trampoline's
// machine code into memory.
//
// Operands as read below: 0 = chain, 1 = trampoline address, 2 = nested
// function pointer, 3 = 'nest' parameter value, 4 = source value of the
// trampoline address, and (32-bit path only) 5 = source value of the nested
// function. Every store is chained to the incoming chain and the stores are
// joined by a TokenFactor, which is the returned value.
//
// Byte layout written on 64-bit targets (offsets from the trampoline address):
//    0: REX.WB prefix + opcode of 'movabsq $imm, %r11'
//    2: nested function pointer (the immediate)
//   10: REX.WB prefix + opcode of 'movabsq $imm, %r10'
//   12: 'nest' value (the immediate)
//   20: REX.WB prefix + opcode 0xFF of 'jmpq *%r11'
//   22: ModRM byte selecting r11
//
// Byte layout written on 32-bit targets:
//    0: opcode of 'movl $imm, %reg' with the nest register encoded in it
//    1: 'nest' value (the immediate)
//    5: opcode 0xE9 of 'jmp rel32'
//    6: displacement to the nested function, relative to offset 10
SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
                                                SelectionDAG &DAG) const {
  SDValue Root = Op.getOperand(0);
  SDValue Trmp = Op.getOperand(1); // trampoline
  SDValue FPtr = Op.getOperand(2); // nested function
  SDValue Nest = Op.getOperand(3); // 'nest' parameter value
  SDLoc dl (Op);

  const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
  const TargetRegisterInfo *TRI = DAG.getSubtarget().getRegisterInfo();

  if (Subtarget->is64Bit()) {
    SDValue OutChains[6];

    // Large code-model.
    const unsigned char JMP64r  = 0xFF; // 64-bit jmp through register opcode.
    const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.

    // Only the low three bits of each register encoding are used here; the
    // REX prefix below has its B bit set.
    const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
    const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;

    const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix

    // Load the pointer to the nested function into R11.
    // The i16 is stored little-endian, so the REX prefix (low byte) lands in
    // memory before the opcode byte (high byte).
    unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
    SDValue Addr = Trmp;
    OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
                                Addr, MachinePointerInfo(TrmpAddr),
                                false, false, 0);

    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(2, MVT::i64));
    OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
                                MachinePointerInfo(TrmpAddr, 2),
                                false, false, 2);

    // Load the 'nest' parameter value into R10.
    // R10 is specified in X86CallingConv.td
    OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(10, MVT::i64));
    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
                                Addr, MachinePointerInfo(TrmpAddr, 10),
                                false, false, 0);

    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(12, MVT::i64));
    OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
                                MachinePointerInfo(TrmpAddr, 12),
                                false, false, 2);

    // Jump to the nested function.
    OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(20, MVT::i64));
    OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
                                Addr, MachinePointerInfo(TrmpAddr, 20),
                                false, false, 0);

    // ModRM: mod = 3 (register operand), reg field = 4, rm = low bits of r11.
    unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(22, MVT::i64));
    OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr,
                                MachinePointerInfo(TrmpAddr, 22),
                                false, false, 0);

    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
  } else {
    // 32-bit: the register that receives 'nest' depends on the calling
    // convention of the nested function.
    const Function *Func =
      cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
    CallingConv::ID CC = Func->getCallingConv();
    unsigned NestReg;

    switch (CC) {
    default:
      llvm_unreachable("Unsupported calling convention");
    case CallingConv::C:
    case CallingConv::X86_StdCall: {
      // Pass 'nest' parameter in ECX.
      // Must be kept in sync with X86CallingConv.td
      NestReg = X86::ECX;

      // Check that ECX wasn't needed by an 'inreg' parameter.
      FunctionType *FTy = Func->getFunctionType();
      const AttributeSet &Attrs = Func->getAttributes();

      if (!Attrs.isEmpty() && !Func->isVarArg()) {
        unsigned InRegCount = 0;
        // Attribute indices for parameters start at 1.
        unsigned Idx = 1;

        // Count how many 32-bit units the 'inreg' parameters occupy.
        for (FunctionType::param_iterator I = FTy->param_begin(),
             E = FTy->param_end(); I != E; ++I, ++Idx)
          if (Attrs.hasAttribute(Idx, Attribute::InReg))
            // FIXME: should only count parameters that are lowered to integers.
            InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32;

        // NOTE(review): presumably a third inreg unit would be assigned to
        // ECX - confirm against X86CallingConv.td.
        if (InRegCount > 2) {
          report_fatal_error("Nest register in use - reduce number of inreg"
                             " parameters!");
        }
      }
      break;
    }
    case CallingConv::X86_FastCall:
    case CallingConv::X86_ThisCall:
    case CallingConv::Fast:
      // Pass 'nest' parameter in EAX.
      // Must be kept in sync with X86CallingConv.td
      NestReg = X86::EAX;
      break;
    }

    SDValue OutChains[4];
    SDValue Addr, Disp;

    // The jump displacement is taken relative to trampoline address + 10,
    // i.e. the first byte after the last store below.
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(10, MVT::i32));
    Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);

    // This is storing the opcode for MOV32ri.
    const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
    const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
    OutChains[0] = DAG.getStore(Root, dl,
                                DAG.getConstant(MOV32ri|N86Reg, MVT::i8),
                                Trmp, MachinePointerInfo(TrmpAddr),
                                false, false, 0);

    // Immediate of the move: the 'nest' value.
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(1, MVT::i32));
    OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
                                MachinePointerInfo(TrmpAddr, 1),
                                false, false, 1);

    const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(5, MVT::i32));
    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr,
                                MachinePointerInfo(TrmpAddr, 5),
                                false, false, 1);

    // Operand of the jump: the displacement computed above.
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(6, MVT::i32));
    OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
                                MachinePointerInfo(TrmpAddr, 6),
                                false, false, 1);

    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
  }
}
18061
18062 SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
18063                                             SelectionDAG &DAG) const {
18064   /*
18065    The rounding mode is in bits 11:10 of FPSR, and has the following
18066    settings:
18067      00 Round to nearest
18068      01 Round to -inf
18069      10 Round to +inf
18070      11 Round to 0
18071
18072   FLT_ROUNDS, on the other hand, expects the following:
18073     -1 Undefined
18074      0 Round to 0
18075      1 Round to nearest
18076      2 Round to +inf
18077      3 Round to -inf
18078
18079   To perform the conversion, we do:
18080     (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3)
18081   */
18082
18083   MachineFunction &MF = DAG.getMachineFunction();
18084   const TargetMachine &TM = MF.getTarget();
18085   const TargetFrameLowering &TFI = *TM.getSubtargetImpl()->getFrameLowering();
18086   unsigned StackAlignment = TFI.getStackAlignment();
18087   MVT VT = Op.getSimpleValueType();
18088   SDLoc DL(Op);
18089
18090   // Save FP Control Word to stack slot
18091   int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false);
18092   SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
18093
18094   MachineMemOperand *MMO =
18095    MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
18096                            MachineMemOperand::MOStore, 2, 2);
18097
18098   SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
18099   SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
18100                                           DAG.getVTList(MVT::Other),
18101                                           Ops, MVT::i16, MMO);
18102
18103   // Load FP Control Word from stack slot
18104   SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot,
18105                             MachinePointerInfo(), false, false, false, 0);
18106
18107   // Transform as necessary
18108   SDValue CWD1 =
18109     DAG.getNode(ISD::SRL, DL, MVT::i16,
18110                 DAG.getNode(ISD::AND, DL, MVT::i16,
18111                             CWD, DAG.getConstant(0x800, MVT::i16)),
18112                 DAG.getConstant(11, MVT::i8));
18113   SDValue CWD2 =
18114     DAG.getNode(ISD::SRL, DL, MVT::i16,
18115                 DAG.getNode(ISD::AND, DL, MVT::i16,
18116                             CWD, DAG.getConstant(0x400, MVT::i16)),
18117                 DAG.getConstant(9, MVT::i8));
18118
18119   SDValue RetVal =
18120     DAG.getNode(ISD::AND, DL, MVT::i16,
18121                 DAG.getNode(ISD::ADD, DL, MVT::i16,
18122                             DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),
18123                             DAG.getConstant(1, MVT::i16)),
18124                 DAG.getConstant(3, MVT::i16));
18125
18126   return DAG.getNode((VT.getSizeInBits() < 16 ?
18127                       ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
18128 }
18129
18130 static SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) {
18131   MVT VT = Op.getSimpleValueType();
18132   EVT OpVT = VT;
18133   unsigned NumBits = VT.getSizeInBits();
18134   SDLoc dl(Op);
18135
18136   Op = Op.getOperand(0);
18137   if (VT == MVT::i8) {
18138     // Zero extend to i32 since there is not an i8 bsr.
18139     OpVT = MVT::i32;
18140     Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
18141   }
18142
18143   // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
18144   SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
18145   Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
18146
18147   // If src is zero (i.e. bsr sets ZF), returns NumBits.
18148   SDValue Ops[] = {
18149     Op,
18150     DAG.getConstant(NumBits+NumBits-1, OpVT),
18151     DAG.getConstant(X86::COND_E, MVT::i8),
18152     Op.getValue(1)
18153   };
18154   Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
18155
18156   // Finally xor with NumBits-1.
18157   Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));
18158
18159   if (VT == MVT::i8)
18160     Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
18161   return Op;
18162 }
18163
18164 static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) {
18165   MVT VT = Op.getSimpleValueType();
18166   EVT OpVT = VT;
18167   unsigned NumBits = VT.getSizeInBits();
18168   SDLoc dl(Op);
18169
18170   Op = Op.getOperand(0);
18171   if (VT == MVT::i8) {
18172     // Zero extend to i32 since there is not an i8 bsr.
18173     OpVT = MVT::i32;
18174     Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
18175   }
18176
18177   // Issue a bsr (scan bits in reverse).
18178   SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
18179   Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
18180
18181   // And xor with NumBits-1.
18182   Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));
18183
18184   if (VT == MVT::i8)
18185     Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
18186   return Op;
18187 }
18188
18189 static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
18190   MVT VT = Op.getSimpleValueType();
18191   unsigned NumBits = VT.getSizeInBits();
18192   SDLoc dl(Op);
18193   Op = Op.getOperand(0);
18194
18195   // Issue a bsf (scan bits forward) which also sets EFLAGS.
18196   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
18197   Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op);
18198
18199   // If src is zero (i.e. bsf sets ZF), returns NumBits.
18200   SDValue Ops[] = {
18201     Op,
18202     DAG.getConstant(NumBits, VT),
18203     DAG.getConstant(X86::COND_E, MVT::i8),
18204     Op.getValue(1)
18205   };
18206   return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
18207 }
18208
18209 // Lower256IntArith - Break a 256-bit integer operation into two new 128-bit
18210 // ones, and then concatenate the result back.
18211 static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
18212   MVT VT = Op.getSimpleValueType();
18213
18214   assert(VT.is256BitVector() && VT.isInteger() &&
18215          "Unsupported value type for operation");
18216
18217   unsigned NumElems = VT.getVectorNumElements();
18218   SDLoc dl(Op);
18219
18220   // Extract the LHS vectors
18221   SDValue LHS = Op.getOperand(0);
18222   SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
18223   SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
18224
18225   // Extract the RHS vectors
18226   SDValue RHS = Op.getOperand(1);
18227   SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl);
18228   SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl);
18229
18230   MVT EltVT = VT.getVectorElementType();
18231   MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
18232
18233   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
18234                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
18235                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
18236 }
18237
18238 static SDValue LowerADD(SDValue Op, SelectionDAG &DAG) {
18239   assert(Op.getSimpleValueType().is256BitVector() &&
18240          Op.getSimpleValueType().isInteger() &&
18241          "Only handle AVX 256-bit vector integer operation");
18242   return Lower256IntArith(Op, DAG);
18243 }
18244
18245 static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) {
18246   assert(Op.getSimpleValueType().is256BitVector() &&
18247          Op.getSimpleValueType().isInteger() &&
18248          "Only handle AVX 256-bit vector integer operation");
18249   return Lower256IntArith(Op, DAG);
18250 }
18251
18252 static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
18253                         SelectionDAG &DAG) {
18254   SDLoc dl(Op);
18255   MVT VT = Op.getSimpleValueType();
18256
18257   // Decompose 256-bit ops into smaller 128-bit ops.
18258   if (VT.is256BitVector() && !Subtarget->hasInt256())
18259     return Lower256IntArith(Op, DAG);
18260
18261   SDValue A = Op.getOperand(0);
18262   SDValue B = Op.getOperand(1);
18263
18264   // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
18265   if (VT == MVT::v4i32) {
18266     assert(Subtarget->hasSSE2() && !Subtarget->hasSSE41() &&
18267            "Should not custom lower when pmuldq is available!");
18268
18269     // Extract the odd parts.
18270     static const int UnpackMask[] = { 1, -1, 3, -1 };
18271     SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
18272     SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
18273
18274     // Multiply the even parts.
18275     SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B);
18276     // Now multiply odd parts.
18277     SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds);
18278
18279     Evens = DAG.getNode(ISD::BITCAST, dl, VT, Evens);
18280     Odds = DAG.getNode(ISD::BITCAST, dl, VT, Odds);
18281
18282     // Merge the two vectors back together with a shuffle. This expands into 2
18283     // shuffles.
18284     static const int ShufMask[] = { 0, 4, 2, 6 };
18285     return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
18286   }
18287
18288   assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
18289          "Only know how to lower V2I64/V4I64/V8I64 multiply");
18290
18291   //  Ahi = psrlqi(a, 32);
18292   //  Bhi = psrlqi(b, 32);
18293   //
18294   //  AloBlo = pmuludq(a, b);
18295   //  AloBhi = pmuludq(a, Bhi);
18296   //  AhiBlo = pmuludq(Ahi, b);
18297
18298   //  AloBhi = psllqi(AloBhi, 32);
18299   //  AhiBlo = psllqi(AhiBlo, 32);
18300   //  return AloBlo + AloBhi + AhiBlo;
18301
18302   SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
18303   SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
18304
18305   // Bit cast to 32-bit vectors for MULUDQ
18306   EVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 :
18307                                   (VT == MVT::v4i64) ? MVT::v8i32 : MVT::v16i32;
18308   A = DAG.getNode(ISD::BITCAST, dl, MulVT, A);
18309   B = DAG.getNode(ISD::BITCAST, dl, MulVT, B);
18310   Ahi = DAG.getNode(ISD::BITCAST, dl, MulVT, Ahi);
18311   Bhi = DAG.getNode(ISD::BITCAST, dl, MulVT, Bhi);
18312
18313   SDValue AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
18314   SDValue AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
18315   SDValue AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
18316
18317   AloBhi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AloBhi, 32, DAG);
18318   AhiBlo = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AhiBlo, 32, DAG);
18319
18320   SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi);
18321   return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
18322 }
18323
18324 SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
18325   assert(Subtarget->isTargetWin64() && "Unexpected target");
18326   EVT VT = Op.getValueType();
18327   assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
18328          "Unexpected return type for lowering");
18329
18330   RTLIB::Libcall LC;
18331   bool isSigned;
18332   switch (Op->getOpcode()) {
18333   default: llvm_unreachable("Unexpected request for libcall!");
18334   case ISD::SDIV:      isSigned = true;  LC = RTLIB::SDIV_I128;    break;
18335   case ISD::UDIV:      isSigned = false; LC = RTLIB::UDIV_I128;    break;
18336   case ISD::SREM:      isSigned = true;  LC = RTLIB::SREM_I128;    break;
18337   case ISD::UREM:      isSigned = false; LC = RTLIB::UREM_I128;    break;
18338   case ISD::SDIVREM:   isSigned = true;  LC = RTLIB::SDIVREM_I128; break;
18339   case ISD::UDIVREM:   isSigned = false; LC = RTLIB::UDIVREM_I128; break;
18340   }
18341
18342   SDLoc dl(Op);
18343   SDValue InChain = DAG.getEntryNode();
18344
18345   TargetLowering::ArgListTy Args;
18346   TargetLowering::ArgListEntry Entry;
18347   for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
18348     EVT ArgVT = Op->getOperand(i).getValueType();
18349     assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
18350            "Unexpected argument type for lowering");
18351     SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
18352     Entry.Node = StackPtr;
18353     InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MachinePointerInfo(),
18354                            false, false, 16);
18355     Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
18356     Entry.Ty = PointerType::get(ArgTy,0);
18357     Entry.isSExt = false;
18358     Entry.isZExt = false;
18359     Args.push_back(Entry);
18360   }
18361
18362   SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
18363                                          getPointerTy());
18364
18365   TargetLowering::CallLoweringInfo CLI(DAG);
18366   CLI.setDebugLoc(dl).setChain(InChain)
18367     .setCallee(getLibcallCallingConv(LC),
18368                static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()),
18369                Callee, std::move(Args), 0)
18370     .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);
18371
18372   std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
18373   return DAG.getNode(ISD::BITCAST, dl, VT, CallInfo.first);
18374 }
18375
18376 static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget,
18377                              SelectionDAG &DAG) {
18378   SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
18379   EVT VT = Op0.getValueType();
18380   SDLoc dl(Op);
18381
18382   assert((VT == MVT::v4i32 && Subtarget->hasSSE2()) ||
18383          (VT == MVT::v8i32 && Subtarget->hasInt256()));
18384
18385   // PMULxD operations multiply each even value (starting at 0) of LHS with
18386   // the related value of RHS and produce a widen result.
18387   // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
18388   // => <2 x i64> <ae|cg>
18389   //
18390   // In other word, to have all the results, we need to perform two PMULxD:
18391   // 1. one with the even values.
18392   // 2. one with the odd values.
18393   // To achieve #2, with need to place the odd values at an even position.
18394   //
18395   // Place the odd value at an even position (basically, shift all values 1
18396   // step to the left):
18397   const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1};
18398   // <a|b|c|d> => <b|undef|d|undef>
18399   SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0, Mask);
18400   // <e|f|g|h> => <f|undef|h|undef>
18401   SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1, Mask);
18402
18403   // Emit two multiplies, one for the lower 2 ints and one for the higher 2
18404   // ints.
18405   MVT MulVT = VT == MVT::v4i32 ? MVT::v2i64 : MVT::v4i64;
18406   bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
18407   unsigned Opcode =
18408       (!IsSigned || !Subtarget->hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
18409   // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
18410   // => <2 x i64> <ae|cg>
18411   SDValue Mul1 = DAG.getNode(ISD::BITCAST, dl, VT,
18412                              DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
18413   // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
18414   // => <2 x i64> <bf|dh>
18415   SDValue Mul2 = DAG.getNode(ISD::BITCAST, dl, VT,
18416                              DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1));
18417
18418   // Shuffle it back into the right order.
18419   SDValue Highs, Lows;
18420   if (VT == MVT::v8i32) {
18421     const int HighMask[] = {1, 9, 3, 11, 5, 13, 7, 15};
18422     Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
18423     const int LowMask[] = {0, 8, 2, 10, 4, 12, 6, 14};
18424     Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
18425   } else {
18426     const int HighMask[] = {1, 5, 3, 7};
18427     Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
18428     const int LowMask[] = {0, 4, 2, 6};
18429     Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
18430   }
18431
18432   // If we have a signed multiply but no PMULDQ fix up the high parts of a
18433   // unsigned multiply.
18434   if (IsSigned && !Subtarget->hasSSE41()) {
18435     SDValue ShAmt =
18436         DAG.getConstant(31, DAG.getTargetLoweringInfo().getShiftAmountTy(VT));
18437     SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
18438                              DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
18439     SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
18440                              DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0);
18441
18442     SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
18443     Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
18444   }
18445
18446   // The first result of MUL_LOHI is actually the low value, followed by the
18447   // high value.
18448   SDValue Ops[] = {Lows, Highs};
18449   return DAG.getMergeValues(Ops, dl);
18450 }
18451
18452 static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
18453                                          const X86Subtarget *Subtarget) {
18454   MVT VT = Op.getSimpleValueType();
18455   SDLoc dl(Op);
18456   SDValue R = Op.getOperand(0);
18457   SDValue Amt = Op.getOperand(1);
18458
18459   // Optimize shl/srl/sra with constant shift amount.
18460   if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
18461     if (auto *ShiftConst = BVAmt->getConstantSplatNode()) {
18462       uint64_t ShiftAmt = ShiftConst->getZExtValue();
18463
18464       if (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 ||
18465           (Subtarget->hasInt256() &&
18466            (VT == MVT::v4i64 || VT == MVT::v8i32 || VT == MVT::v16i16)) ||
18467           (Subtarget->hasAVX512() &&
18468            (VT == MVT::v8i64 || VT == MVT::v16i32))) {
18469         if (Op.getOpcode() == ISD::SHL)
18470           return getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, R, ShiftAmt,
18471                                             DAG);
18472         if (Op.getOpcode() == ISD::SRL)
18473           return getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt,
18474                                             DAG);
18475         if (Op.getOpcode() == ISD::SRA && VT != MVT::v2i64 && VT != MVT::v4i64)
18476           return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, ShiftAmt,
18477                                             DAG);
18478       }
18479
18480       if (VT == MVT::v16i8) {
18481         if (Op.getOpcode() == ISD::SHL) {
18482           // Make a large shift.
18483           SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl,
18484                                                    MVT::v8i16, R, ShiftAmt,
18485                                                    DAG);
18486           SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL);
18487           // Zero out the rightmost bits.
18488           SmallVector<SDValue, 16> V(16,
18489                                      DAG.getConstant(uint8_t(-1U << ShiftAmt),
18490                                                      MVT::i8));
18491           return DAG.getNode(ISD::AND, dl, VT, SHL,
18492                              DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
18493         }
18494         if (Op.getOpcode() == ISD::SRL) {
18495           // Make a large shift.
18496           SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl,
18497                                                    MVT::v8i16, R, ShiftAmt,
18498                                                    DAG);
18499           SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL);
18500           // Zero out the leftmost bits.
18501           SmallVector<SDValue, 16> V(16,
18502                                      DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
18503                                                      MVT::i8));
18504           return DAG.getNode(ISD::AND, dl, VT, SRL,
18505                              DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
18506         }
18507         if (Op.getOpcode() == ISD::SRA) {
18508           if (ShiftAmt == 7) {
18509             // R s>> 7  ===  R s< 0
18510             SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
18511             return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
18512           }
18513
18514           // R s>> a === ((R u>> a) ^ m) - m
18515           SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
18516           SmallVector<SDValue, 16> V(16, DAG.getConstant(128 >> ShiftAmt,
18517                                                          MVT::i8));
18518           SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V);
18519           Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
18520           Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
18521           return Res;
18522         }
18523         llvm_unreachable("Unknown shift opcode.");
18524       }
18525
18526       if (Subtarget->hasInt256() && VT == MVT::v32i8) {
18527         if (Op.getOpcode() == ISD::SHL) {
18528           // Make a large shift.
18529           SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl,
18530                                                    MVT::v16i16, R, ShiftAmt,
18531                                                    DAG);
18532           SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL);
18533           // Zero out the rightmost bits.
18534           SmallVector<SDValue, 32> V(32,
18535                                      DAG.getConstant(uint8_t(-1U << ShiftAmt),
18536                                                      MVT::i8));
18537           return DAG.getNode(ISD::AND, dl, VT, SHL,
18538                              DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
18539         }
18540         if (Op.getOpcode() == ISD::SRL) {
18541           // Make a large shift.
18542           SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl,
18543                                                    MVT::v16i16, R, ShiftAmt,
18544                                                    DAG);
18545           SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL);
18546           // Zero out the leftmost bits.
18547           SmallVector<SDValue, 32> V(32,
18548                                      DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
18549                                                      MVT::i8));
18550           return DAG.getNode(ISD::AND, dl, VT, SRL,
18551                              DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
18552         }
18553         if (Op.getOpcode() == ISD::SRA) {
18554           if (ShiftAmt == 7) {
18555             // R s>> 7  ===  R s< 0
18556             SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
18557             return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
18558           }
18559
18560           // R s>> a === ((R u>> a) ^ m) - m
18561           SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
18562           SmallVector<SDValue, 32> V(32, DAG.getConstant(128 >> ShiftAmt,
18563                                                          MVT::i8));
18564           SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V);
18565           Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
18566           Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
18567           return Res;
18568         }
18569         llvm_unreachable("Unknown shift opcode.");
18570       }
18571     }
18572   }
18573
18574   // Special case in 32-bit mode, where i64 is expanded into high and low parts.
18575   if (!Subtarget->is64Bit() &&
18576       (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64)) &&
18577       Amt.getOpcode() == ISD::BITCAST &&
18578       Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
18579     Amt = Amt.getOperand(0);
18580     unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
18581                      VT.getVectorNumElements();
18582     unsigned RatioInLog2 = Log2_32_Ceil(Ratio);
18583     uint64_t ShiftAmt = 0;
18584     for (unsigned i = 0; i != Ratio; ++i) {
18585       ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i));
18586       if (!C)
18587         return SDValue();
18588       // 6 == Log2(64)
18589       ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2)));
18590     }
18591     // Check remaining shift amounts.
18592     for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
18593       uint64_t ShAmt = 0;
18594       for (unsigned j = 0; j != Ratio; ++j) {
18595         ConstantSDNode *C =
18596           dyn_cast<ConstantSDNode>(Amt.getOperand(i + j));
18597         if (!C)
18598           return SDValue();
18599         // 6 == Log2(64)
18600         ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2)));
18601       }
18602       if (ShAmt != ShiftAmt)
18603         return SDValue();
18604     }
18605     switch (Op.getOpcode()) {
18606     default:
18607       llvm_unreachable("Unknown shift opcode!");
18608     case ISD::SHL:
18609       return getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, R, ShiftAmt,
18610                                         DAG);
18611     case ISD::SRL:
18612       return getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt,
18613                                         DAG);
18614     case ISD::SRA:
18615       return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, ShiftAmt,
18616                                         DAG);
18617     }
18618   }
18619
18620   return SDValue();
18621 }
18622
18623 static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
18624                                         const X86Subtarget* Subtarget) {
18625   MVT VT = Op.getSimpleValueType();
18626   SDLoc dl(Op);
18627   SDValue R = Op.getOperand(0);
18628   SDValue Amt = Op.getOperand(1);
18629
18630   if ((VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) ||
18631       VT == MVT::v4i32 || VT == MVT::v8i16 ||
18632       (Subtarget->hasInt256() &&
18633        ((VT == MVT::v4i64 && Op.getOpcode() != ISD::SRA) ||
18634         VT == MVT::v8i32 || VT == MVT::v16i16)) ||
18635        (Subtarget->hasAVX512() && (VT == MVT::v8i64 || VT == MVT::v16i32))) {
18636     SDValue BaseShAmt;
18637     EVT EltVT = VT.getVectorElementType();
18638
18639     if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Amt)) {
18640       // Check if this build_vector node is doing a splat.
18641       // If so, then set BaseShAmt equal to the splat value.
18642       BaseShAmt = BV->getSplatValue();
18643       if (BaseShAmt && BaseShAmt.getOpcode() == ISD::UNDEF)
18644         BaseShAmt = SDValue();
18645     } else {
18646       if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR)
18647         Amt = Amt.getOperand(0);
18648
18649       ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt);
18650       if (SVN && SVN->isSplat()) {
18651         unsigned SplatIdx = (unsigned)SVN->getSplatIndex();
18652         SDValue InVec = Amt.getOperand(0);
18653         if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
18654           assert((SplatIdx < InVec.getValueType().getVectorNumElements()) &&
18655                  "Unexpected shuffle index found!");
18656           BaseShAmt = InVec.getOperand(SplatIdx);
18657         } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
18658            if (ConstantSDNode *C =
18659                dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
18660              if (C->getZExtValue() == SplatIdx)
18661                BaseShAmt = InVec.getOperand(1);
18662            }
18663         }
18664
18665         if (!BaseShAmt)
18666           // Avoid introducing an extract element from a shuffle.
18667           BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InVec,
18668                                     DAG.getIntPtrConstant(SplatIdx));
18669       }
18670     }
18671
18672     if (BaseShAmt.getNode()) {
18673       assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
18674       if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
18675         BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
18676       else if (EltVT.bitsLT(MVT::i32))
18677         BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
18678
18679       switch (Op.getOpcode()) {
18680       default:
18681         llvm_unreachable("Unknown shift opcode!");
18682       case ISD::SHL:
18683         switch (VT.SimpleTy) {
18684         default: return SDValue();
18685         case MVT::v2i64:
18686         case MVT::v4i32:
18687         case MVT::v8i16:
18688         case MVT::v4i64:
18689         case MVT::v8i32:
18690         case MVT::v16i16:
18691         case MVT::v16i32:
18692         case MVT::v8i64:
18693           return getTargetVShiftNode(X86ISD::VSHLI, dl, VT, R, BaseShAmt, DAG);
18694         }
18695       case ISD::SRA:
18696         switch (VT.SimpleTy) {
18697         default: return SDValue();
18698         case MVT::v4i32:
18699         case MVT::v8i16:
18700         case MVT::v8i32:
18701         case MVT::v16i16:
18702         case MVT::v16i32:
18703         case MVT::v8i64:
18704           return getTargetVShiftNode(X86ISD::VSRAI, dl, VT, R, BaseShAmt, DAG);
18705         }
18706       case ISD::SRL:
18707         switch (VT.SimpleTy) {
18708         default: return SDValue();
18709         case MVT::v2i64:
18710         case MVT::v4i32:
18711         case MVT::v8i16:
18712         case MVT::v4i64:
18713         case MVT::v8i32:
18714         case MVT::v16i16:
18715         case MVT::v16i32:
18716         case MVT::v8i64:
18717           return getTargetVShiftNode(X86ISD::VSRLI, dl, VT, R, BaseShAmt, DAG);
18718         }
18719       }
18720     }
18721   }
18722
18723   // Special case in 32-bit mode, where i64 is expanded into high and low parts.
18724   if (!Subtarget->is64Bit() &&
18725       (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64) ||
18726       (Subtarget->hasAVX512() && VT == MVT::v8i64)) &&
18727       Amt.getOpcode() == ISD::BITCAST &&
18728       Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
18729     Amt = Amt.getOperand(0);
18730     unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
18731                      VT.getVectorNumElements();
18732     std::vector<SDValue> Vals(Ratio);
18733     for (unsigned i = 0; i != Ratio; ++i)
18734       Vals[i] = Amt.getOperand(i);
18735     for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
18736       for (unsigned j = 0; j != Ratio; ++j)
18737         if (Vals[j] != Amt.getOperand(i + j))
18738           return SDValue();
18739     }
18740     switch (Op.getOpcode()) {
18741     default:
18742       llvm_unreachable("Unknown shift opcode!");
18743     case ISD::SHL:
18744       return DAG.getNode(X86ISD::VSHL, dl, VT, R, Op.getOperand(1));
18745     case ISD::SRL:
18746       return DAG.getNode(X86ISD::VSRL, dl, VT, R, Op.getOperand(1));
18747     case ISD::SRA:
18748       return DAG.getNode(X86ISD::VSRA, dl, VT, R, Op.getOperand(1));
18749     }
18750   }
18751
18752   return SDValue();
18753 }
18754
18755 static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
18756                           SelectionDAG &DAG) {
18757   MVT VT = Op.getSimpleValueType();
18758   SDLoc dl(Op);
18759   SDValue R = Op.getOperand(0);
18760   SDValue Amt = Op.getOperand(1);
18761   SDValue V;
18762
18763   assert(VT.isVector() && "Custom lowering only for vector shifts!");
18764   assert(Subtarget->hasSSE2() && "Only custom lower when we have SSE2!");
18765
18766   V = LowerScalarImmediateShift(Op, DAG, Subtarget);
18767   if (V.getNode())
18768     return V;
18769
18770   V = LowerScalarVariableShift(Op, DAG, Subtarget);
18771   if (V.getNode())
18772       return V;
18773
18774   if (Subtarget->hasAVX512() && (VT == MVT::v16i32 || VT == MVT::v8i64))
18775     return Op;
18776   // AVX2 has VPSLLV/VPSRAV/VPSRLV.
18777   if (Subtarget->hasInt256()) {
18778     if (Op.getOpcode() == ISD::SRL &&
18779         (VT == MVT::v2i64 || VT == MVT::v4i32 ||
18780          VT == MVT::v4i64 || VT == MVT::v8i32))
18781       return Op;
18782     if (Op.getOpcode() == ISD::SHL &&
18783         (VT == MVT::v2i64 || VT == MVT::v4i32 ||
18784          VT == MVT::v4i64 || VT == MVT::v8i32))
18785       return Op;
18786     if (Op.getOpcode() == ISD::SRA && (VT == MVT::v4i32 || VT == MVT::v8i32))
18787       return Op;
18788   }
18789
18790   // If possible, lower this packed shift into a vector multiply instead of
18791   // expanding it into a sequence of scalar shifts.
18792   // Do this only if the vector shift count is a constant build_vector.
18793   if (Op.getOpcode() == ISD::SHL &&
18794       (VT == MVT::v8i16 || VT == MVT::v4i32 ||
18795        (Subtarget->hasInt256() && VT == MVT::v16i16)) &&
18796       ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
18797     SmallVector<SDValue, 8> Elts;
18798     EVT SVT = VT.getScalarType();
18799     unsigned SVTBits = SVT.getSizeInBits();
18800     const APInt &One = APInt(SVTBits, 1);
18801     unsigned NumElems = VT.getVectorNumElements();
18802
18803     for (unsigned i=0; i !=NumElems; ++i) {
18804       SDValue Op = Amt->getOperand(i);
18805       if (Op->getOpcode() == ISD::UNDEF) {
18806         Elts.push_back(Op);
18807         continue;
18808       }
18809
18810       ConstantSDNode *ND = cast<ConstantSDNode>(Op);
18811       const APInt &C = APInt(SVTBits, ND->getAPIntValue().getZExtValue());
18812       uint64_t ShAmt = C.getZExtValue();
18813       if (ShAmt >= SVTBits) {
18814         Elts.push_back(DAG.getUNDEF(SVT));
18815         continue;
18816       }
18817       Elts.push_back(DAG.getConstant(One.shl(ShAmt), SVT));
18818     }
18819     SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Elts);
18820     return DAG.getNode(ISD::MUL, dl, VT, R, BV);
18821   }
18822
18823   // Lower SHL with variable shift amount.
18824   if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
18825     Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, VT));
18826
18827     Op = DAG.getNode(ISD::ADD, dl, VT, Op, DAG.getConstant(0x3f800000U, VT));
18828     Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, Op);
18829     Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
18830     return DAG.getNode(ISD::MUL, dl, VT, Op, R);
18831   }
18832
18833   // If possible, lower this shift as a sequence of two shifts by
18834   // constant plus a MOVSS/MOVSD instead of scalarizing it.
18835   // Example:
18836   //   (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
18837   //
18838   // Could be rewritten as:
18839   //   (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
18840   //
18841   // The advantage is that the two shifts from the example would be
18842   // lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing
18843   // the vector shift into four scalar shifts plus four pairs of vector
18844   // insert/extract.
18845   if ((VT == MVT::v8i16 || VT == MVT::v4i32) &&
18846       ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
18847     unsigned TargetOpcode = X86ISD::MOVSS;
18848     bool CanBeSimplified;
18849     // The splat value for the first packed shift (the 'X' from the example).
18850     SDValue Amt1 = Amt->getOperand(0);
18851     // The splat value for the second packed shift (the 'Y' from the example).
18852     SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) :
18853                                         Amt->getOperand(2);
18854
18855     // See if it is possible to replace this node with a sequence of
18856     // two shifts followed by a MOVSS/MOVSD
18857     if (VT == MVT::v4i32) {
18858       // Check if it is legal to use a MOVSS.
18859       CanBeSimplified = Amt2 == Amt->getOperand(2) &&
18860                         Amt2 == Amt->getOperand(3);
18861       if (!CanBeSimplified) {
18862         // Otherwise, check if we can still simplify this node using a MOVSD.
18863         CanBeSimplified = Amt1 == Amt->getOperand(1) &&
18864                           Amt->getOperand(2) == Amt->getOperand(3);
18865         TargetOpcode = X86ISD::MOVSD;
18866         Amt2 = Amt->getOperand(2);
18867       }
18868     } else {
18869       // Do similar checks for the case where the machine value type
18870       // is MVT::v8i16.
18871       CanBeSimplified = Amt1 == Amt->getOperand(1);
18872       for (unsigned i=3; i != 8 && CanBeSimplified; ++i)
18873         CanBeSimplified = Amt2 == Amt->getOperand(i);
18874
18875       if (!CanBeSimplified) {
18876         TargetOpcode = X86ISD::MOVSD;
18877         CanBeSimplified = true;
18878         Amt2 = Amt->getOperand(4);
18879         for (unsigned i=0; i != 4 && CanBeSimplified; ++i)
18880           CanBeSimplified = Amt1 == Amt->getOperand(i);
18881         for (unsigned j=4; j != 8 && CanBeSimplified; ++j)
18882           CanBeSimplified = Amt2 == Amt->getOperand(j);
18883       }
18884     }
18885
18886     if (CanBeSimplified && isa<ConstantSDNode>(Amt1) &&
18887         isa<ConstantSDNode>(Amt2)) {
18888       // Replace this node with two shifts followed by a MOVSS/MOVSD.
18889       EVT CastVT = MVT::v4i32;
18890       SDValue Splat1 =
18891         DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), VT);
18892       SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
18893       SDValue Splat2 =
18894         DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), VT);
18895       SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
18896       if (TargetOpcode == X86ISD::MOVSD)
18897         CastVT = MVT::v2i64;
18898       SDValue BitCast1 = DAG.getNode(ISD::BITCAST, dl, CastVT, Shift1);
18899       SDValue BitCast2 = DAG.getNode(ISD::BITCAST, dl, CastVT, Shift2);
18900       SDValue Result = getTargetShuffleNode(TargetOpcode, dl, CastVT, BitCast2,
18901                                             BitCast1, DAG);
18902       return DAG.getNode(ISD::BITCAST, dl, VT, Result);
18903     }
18904   }
18905
18906   if (VT == MVT::v16i8 && Op->getOpcode() == ISD::SHL) {
18907     assert(Subtarget->hasSSE2() && "Need SSE2 for pslli/pcmpeq.");
18908
18909     // a = a << 5;
18910     Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(5, VT));
18911     Op = DAG.getNode(ISD::BITCAST, dl, VT, Op);
18912
18913     // Turn 'a' into a mask suitable for VSELECT
18914     SDValue VSelM = DAG.getConstant(0x80, VT);
18915     SDValue OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
18916     OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
18917
18918     SDValue CM1 = DAG.getConstant(0x0f, VT);
18919     SDValue CM2 = DAG.getConstant(0x3f, VT);
18920
18921     // r = VSELECT(r, psllw(r & (char16)15, 4), a);
18922     SDValue M = DAG.getNode(ISD::AND, dl, VT, R, CM1);
18923     M = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, MVT::v8i16, M, 4, DAG);
18924     M = DAG.getNode(ISD::BITCAST, dl, VT, M);
18925     R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R);
18926
18927     // a += a
18928     Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);
18929     OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
18930     OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
18931
18932     // r = VSELECT(r, psllw(r & (char16)63, 2), a);
18933     M = DAG.getNode(ISD::AND, dl, VT, R, CM2);
18934     M = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, MVT::v8i16, M, 2, DAG);
18935     M = DAG.getNode(ISD::BITCAST, dl, VT, M);
18936     R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R);
18937
18938     // a += a
18939     Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);
18940     OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
18941     OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
18942
18943     // return VSELECT(r, r+r, a);
18944     R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel,
18945                     DAG.getNode(ISD::ADD, dl, VT, R, R), R);
18946     return R;
18947   }
18948
18949   // It's worth extending once and using the v8i32 shifts for 16-bit types, but
18950   // the extra overheads to get from v16i8 to v8i32 make the existing SSE
18951   // solution better.
18952   if (Subtarget->hasInt256() && VT == MVT::v8i16) {
18953     MVT NewVT = VT == MVT::v8i16 ? MVT::v8i32 : MVT::v16i16;
18954     unsigned ExtOpc =
18955         Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
18956     R = DAG.getNode(ExtOpc, dl, NewVT, R);
18957     Amt = DAG.getNode(ISD::ANY_EXTEND, dl, NewVT, Amt);
18958     return DAG.getNode(ISD::TRUNCATE, dl, VT,
18959                        DAG.getNode(Op.getOpcode(), dl, NewVT, R, Amt));
18960     }
18961
18962   // Decompose 256-bit shifts into smaller 128-bit shifts.
18963   if (VT.is256BitVector()) {
18964     unsigned NumElems = VT.getVectorNumElements();
18965     MVT EltVT = VT.getVectorElementType();
18966     EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
18967
18968     // Extract the two vectors
18969     SDValue V1 = Extract128BitVector(R, 0, DAG, dl);
18970     SDValue V2 = Extract128BitVector(R, NumElems/2, DAG, dl);
18971
18972     // Recreate the shift amount vectors
18973     SDValue Amt1, Amt2;
18974     if (Amt.getOpcode() == ISD::BUILD_VECTOR) {
18975       // Constant shift amount
18976       SmallVector<SDValue, 4> Amt1Csts;
18977       SmallVector<SDValue, 4> Amt2Csts;
18978       for (unsigned i = 0; i != NumElems/2; ++i)
18979         Amt1Csts.push_back(Amt->getOperand(i));
18980       for (unsigned i = NumElems/2; i != NumElems; ++i)
18981         Amt2Csts.push_back(Amt->getOperand(i));
18982
18983       Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt1Csts);
18984       Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt2Csts);
18985     } else {
18986       // Variable shift amount
18987       Amt1 = Extract128BitVector(Amt, 0, DAG, dl);
18988       Amt2 = Extract128BitVector(Amt, NumElems/2, DAG, dl);
18989     }
18990
18991     // Issue new vector shifts for the smaller types
18992     V1 = DAG.getNode(Op.getOpcode(), dl, NewVT, V1, Amt1);
18993     V2 = DAG.getNode(Op.getOpcode(), dl, NewVT, V2, Amt2);
18994
18995     // Concatenate the result back
18996     return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, V1, V2);
18997   }
18998
18999   return SDValue();
19000 }
19001
19002 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
19003   // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
19004   // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
19005   // looks for this combo and may remove the "setcc" instruction if the "setcc"
19006   // has only one use.
19007   SDNode *N = Op.getNode();
19008   SDValue LHS = N->getOperand(0);
19009   SDValue RHS = N->getOperand(1);
19010   unsigned BaseOp = 0;
19011   unsigned Cond = 0;
19012   SDLoc DL(Op);
19013   switch (Op.getOpcode()) {
19014   default: llvm_unreachable("Unknown ovf instruction!");
19015   case ISD::SADDO:
19016     // A subtract of one will be selected as a INC. Note that INC doesn't
19017     // set CF, so we can't do this for UADDO.
19018     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
19019       if (C->isOne()) {
19020         BaseOp = X86ISD::INC;
19021         Cond = X86::COND_O;
19022         break;
19023       }
19024     BaseOp = X86ISD::ADD;
19025     Cond = X86::COND_O;
19026     break;
19027   case ISD::UADDO:
19028     BaseOp = X86ISD::ADD;
19029     Cond = X86::COND_B;
19030     break;
19031   case ISD::SSUBO:
19032     // A subtract of one will be selected as a DEC. Note that DEC doesn't
19033     // set CF, so we can't do this for USUBO.
19034     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
19035       if (C->isOne()) {
19036         BaseOp = X86ISD::DEC;
19037         Cond = X86::COND_O;
19038         break;
19039       }
19040     BaseOp = X86ISD::SUB;
19041     Cond = X86::COND_O;
19042     break;
19043   case ISD::USUBO:
19044     BaseOp = X86ISD::SUB;
19045     Cond = X86::COND_B;
19046     break;
19047   case ISD::SMULO:
19048     BaseOp = N->getValueType(0) == MVT::i8 ? X86ISD::SMUL8 : X86ISD::SMUL;
19049     Cond = X86::COND_O;
19050     break;
19051   case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
19052     if (N->getValueType(0) == MVT::i8) {
19053       BaseOp = X86ISD::UMUL8;
19054       Cond = X86::COND_O;
19055       break;
19056     }
19057     SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0),
19058                                  MVT::i32);
19059     SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS);
19060
19061     SDValue SetCC =
19062       DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
19063                   DAG.getConstant(X86::COND_O, MVT::i32),
19064                   SDValue(Sum.getNode(), 2));
19065
19066     return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
19067   }
19068   }
19069
19070   // Also sets EFLAGS.
19071   SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
19072   SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
19073
19074   SDValue SetCC =
19075     DAG.getNode(X86ISD::SETCC, DL, N->getValueType(1),
19076                 DAG.getConstant(Cond, MVT::i32),
19077                 SDValue(Sum.getNode(), 1));
19078
19079   return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
19080 }
19081
19082 // Sign extension of the low part of vector elements. This may be used either
19083 // when sign extend instructions are not available or if the vector element
19084 // sizes already match the sign-extended size. If the vector elements are in
19085 // their pre-extended size and sign extend instructions are available, that will
19086 // be handled by LowerSIGN_EXTEND.
19087 SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
19088                                                   SelectionDAG &DAG) const {
19089   SDLoc dl(Op);
19090   EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
19091   MVT VT = Op.getSimpleValueType();
19092
19093   if (!Subtarget->hasSSE2() || !VT.isVector())
19094     return SDValue();
19095
19096   unsigned BitsDiff = VT.getScalarType().getSizeInBits() -
19097                       ExtraVT.getScalarType().getSizeInBits();
19098
19099   switch (VT.SimpleTy) {
19100     default: return SDValue();
19101     case MVT::v8i32:
19102     case MVT::v16i16:
19103       if (!Subtarget->hasFp256())
19104         return SDValue();
19105       if (!Subtarget->hasInt256()) {
19106         // needs to be split
19107         unsigned NumElems = VT.getVectorNumElements();
19108
19109         // Extract the LHS vectors
19110         SDValue LHS = Op.getOperand(0);
19111         SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
19112         SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
19113
19114         MVT EltVT = VT.getVectorElementType();
19115         EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
19116
19117         EVT ExtraEltVT = ExtraVT.getVectorElementType();
19118         unsigned ExtraNumElems = ExtraVT.getVectorNumElements();
19119         ExtraVT = EVT::getVectorVT(*DAG.getContext(), ExtraEltVT,
19120                                    ExtraNumElems/2);
19121         SDValue Extra = DAG.getValueType(ExtraVT);
19122
19123         LHS1 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, Extra);
19124         LHS2 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, Extra);
19125
19126         return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LHS1, LHS2);
19127       }
19128       // fall through
19129     case MVT::v4i32:
19130     case MVT::v8i16: {
19131       SDValue Op0 = Op.getOperand(0);
19132
19133       // This is a sign extension of some low part of vector elements without
19134       // changing the size of the vector elements themselves:
19135       // Shift-Left + Shift-Right-Algebraic.
19136       SDValue Shl = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Op0,
19137                                                BitsDiff, DAG);
19138       return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Shl, BitsDiff,
19139                                         DAG);
19140     }
19141   }
19142 }
19143
19144 /// Returns true if the operand type is exactly twice the native width, and
19145 /// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
19146 /// Used to know whether to use cmpxchg8/16b when expanding atomic operations
19147 /// (otherwise we leave them alone to become __sync_fetch_and_... calls).
19148 bool X86TargetLowering::needsCmpXchgNb(const Type *MemType) const {
19149   const X86Subtarget &Subtarget =
19150       getTargetMachine().getSubtarget<X86Subtarget>();
19151   unsigned OpWidth = MemType->getPrimitiveSizeInBits();
19152
19153   if (OpWidth == 64)
19154     return !Subtarget.is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b
19155   else if (OpWidth == 128)
19156     return Subtarget.hasCmpxchg16b();
19157   else
19158     return false;
19159 }
19160
19161 bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
19162   return needsCmpXchgNb(SI->getValueOperand()->getType());
19163 }
19164
19165 // Note: this turns large loads into lock cmpxchg8b/16b.
19166 // FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b.
19167 bool X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
19168   auto PTy = cast<PointerType>(LI->getPointerOperand()->getType());
19169   return needsCmpXchgNb(PTy->getElementType());
19170 }
19171
19172 bool X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
19173   const X86Subtarget &Subtarget =
19174       getTargetMachine().getSubtarget<X86Subtarget>();
19175   unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
19176   const Type *MemType = AI->getType();
19177
19178   // If the operand is too big, we must see if cmpxchg8/16b is available
19179   // and default to library calls otherwise.
19180   if (MemType->getPrimitiveSizeInBits() > NativeWidth)
19181     return needsCmpXchgNb(MemType);
19182
19183   AtomicRMWInst::BinOp Op = AI->getOperation();
19184   switch (Op) {
19185   default:
19186     llvm_unreachable("Unknown atomic operation");
19187   case AtomicRMWInst::Xchg:
19188   case AtomicRMWInst::Add:
19189   case AtomicRMWInst::Sub:
19190     // It's better to use xadd, xsub or xchg for these in all cases.
19191     return false;
19192   case AtomicRMWInst::Or:
19193   case AtomicRMWInst::And:
19194   case AtomicRMWInst::Xor:
19195     // If the atomicrmw's result isn't actually used, we can just add a "lock"
19196     // prefix to a normal instruction for these operations.
19197     return !AI->use_empty();
19198   case AtomicRMWInst::Nand:
19199   case AtomicRMWInst::Max:
19200   case AtomicRMWInst::Min:
19201   case AtomicRMWInst::UMax:
19202   case AtomicRMWInst::UMin:
19203     // These always require a non-trivial set of data operations on x86. We must
19204     // use a cmpxchg loop.
19205     return true;
19206   }
19207 }
19208
19209 static bool hasMFENCE(const X86Subtarget& Subtarget) {
19210   // Use mfence if we have SSE2 or we're on x86-64 (even if we asked for
19211   // no-sse2). There isn't any reason to disable it if the target processor
19212   // supports it.
19213   return Subtarget.hasSSE2() || Subtarget.is64Bit();
19214 }
19215
19216 LoadInst *
19217 X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
19218   const X86Subtarget &Subtarget =
19219       getTargetMachine().getSubtarget<X86Subtarget>();
19220   unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
19221   const Type *MemType = AI->getType();
19222   // Accesses larger than the native width are turned into cmpxchg/libcalls, so
19223   // there is no benefit in turning such RMWs into loads, and it is actually
19224   // harmful as it introduces a mfence.
19225   if (MemType->getPrimitiveSizeInBits() > NativeWidth)
19226     return nullptr;
19227
19228   auto Builder = IRBuilder<>(AI);
19229   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
19230   auto SynchScope = AI->getSynchScope();
19231   // We must restrict the ordering to avoid generating loads with Release or
19232   // ReleaseAcquire orderings.
19233   auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
19234   auto Ptr = AI->getPointerOperand();
19235
19236   // Before the load we need a fence. Here is an example lifted from
19237   // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
19238   // is required:
19239   // Thread 0:
19240   //   x.store(1, relaxed);
19241   //   r1 = y.fetch_add(0, release);
19242   // Thread 1:
19243   //   y.fetch_add(42, acquire);
19244   //   r2 = x.load(relaxed);
19245   // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
19246   // lowered to just a load without a fence. A mfence flushes the store buffer,
19247   // making the optimization clearly correct.
19248   // FIXME: it is required if isAtLeastRelease(Order) but it is not clear
19249   // otherwise, we might be able to be more agressive on relaxed idempotent
19250   // rmw. In practice, they do not look useful, so we don't try to be
19251   // especially clever.
19252   if (SynchScope == SingleThread) {
19253     // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
19254     // the IR level, so we must wrap it in an intrinsic.
19255     return nullptr;
19256   } else if (hasMFENCE(Subtarget)) {
19257     Function *MFence = llvm::Intrinsic::getDeclaration(M,
19258             Intrinsic::x86_sse2_mfence);
19259     Builder.CreateCall(MFence);
19260   } else {
19261     // FIXME: it might make sense to use a locked operation here but on a
19262     // different cache-line to prevent cache-line bouncing. In practice it
19263     // is probably a small win, and x86 processors without mfence are rare
19264     // enough that we do not bother.
19265     return nullptr;
19266   }
19267
19268   // Finally we can emit the atomic load.
19269   LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr,
19270           AI->getType()->getPrimitiveSizeInBits());
19271   Loaded->setAtomic(Order, SynchScope);
19272   AI->replaceAllUsesWith(Loaded);
19273   AI->eraseFromParent();
19274   return Loaded;
19275 }
19276
19277 static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget,
19278                                  SelectionDAG &DAG) {
19279   SDLoc dl(Op);
19280   AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
19281     cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
19282   SynchronizationScope FenceScope = static_cast<SynchronizationScope>(
19283     cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
19284
19285   // The only fence that needs an instruction is a sequentially-consistent
19286   // cross-thread fence.
19287   if (FenceOrdering == SequentiallyConsistent && FenceScope == CrossThread) {
19288     if (hasMFENCE(*Subtarget))
19289       return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
19290
19291     SDValue Chain = Op.getOperand(0);
19292     SDValue Zero = DAG.getConstant(0, MVT::i32);
19293     SDValue Ops[] = {
19294       DAG.getRegister(X86::ESP, MVT::i32), // Base
19295       DAG.getTargetConstant(1, MVT::i8),   // Scale
19296       DAG.getRegister(0, MVT::i32),        // Index
19297       DAG.getTargetConstant(0, MVT::i32),  // Disp
19298       DAG.getRegister(0, MVT::i32),        // Segment.
19299       Zero,
19300       Chain
19301     };
19302     SDNode *Res = DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops);
19303     return SDValue(Res, 0);
19304   }
19305
19306   // MEMBARRIER is a compiler barrier; it codegens to a no-op.
19307   return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
19308 }
19309
19310 static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget,
19311                              SelectionDAG &DAG) {
19312   MVT T = Op.getSimpleValueType();
19313   SDLoc DL(Op);
19314   unsigned Reg = 0;
19315   unsigned size = 0;
19316   switch(T.SimpleTy) {
19317   default: llvm_unreachable("Invalid value type!");
19318   case MVT::i8:  Reg = X86::AL;  size = 1; break;
19319   case MVT::i16: Reg = X86::AX;  size = 2; break;
19320   case MVT::i32: Reg = X86::EAX; size = 4; break;
19321   case MVT::i64:
19322     assert(Subtarget->is64Bit() && "Node not type legal!");
19323     Reg = X86::RAX; size = 8;
19324     break;
19325   }
19326   SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
19327                                   Op.getOperand(2), SDValue());
19328   SDValue Ops[] = { cpIn.getValue(0),
19329                     Op.getOperand(1),
19330                     Op.getOperand(3),
19331                     DAG.getTargetConstant(size, MVT::i8),
19332                     cpIn.getValue(1) };
19333   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
19334   MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
19335   SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
19336                                            Ops, T, MMO);
19337
19338   SDValue cpOut =
19339     DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
19340   SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
19341                                       MVT::i32, cpOut.getValue(2));
19342   SDValue Success = DAG.getNode(X86ISD::SETCC, DL, Op->getValueType(1),
19343                                 DAG.getConstant(X86::COND_E, MVT::i8), EFLAGS);
19344
19345   DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut);
19346   DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
19347   DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1));
19348   return SDValue();
19349 }
19350
19351 static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget,
19352                             SelectionDAG &DAG) {
19353   MVT SrcVT = Op.getOperand(0).getSimpleValueType();
19354   MVT DstVT = Op.getSimpleValueType();
19355
19356   if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) {
19357     assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
19358     if (DstVT != MVT::f64)
19359       // This conversion needs to be expanded.
19360       return SDValue();
19361
19362     SDValue InVec = Op->getOperand(0);
19363     SDLoc dl(Op);
19364     unsigned NumElts = SrcVT.getVectorNumElements();
19365     EVT SVT = SrcVT.getVectorElementType();
19366
19367     // Widen the vector in input in the case of MVT::v2i32.
19368     // Example: from MVT::v2i32 to MVT::v4i32.
19369     SmallVector<SDValue, 16> Elts;
19370     for (unsigned i = 0, e = NumElts; i != e; ++i)
19371       Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, InVec,
19372                                  DAG.getIntPtrConstant(i)));
19373
19374     // Explicitly mark the extra elements as Undef.
19375     SDValue Undef = DAG.getUNDEF(SVT);
19376     for (unsigned i = NumElts, e = NumElts * 2; i != e; ++i)
19377       Elts.push_back(Undef);
19378
19379     EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
19380     SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Elts);
19381     SDValue ToV2F64 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, BV);
19382     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64,
19383                        DAG.getIntPtrConstant(0));
19384   }
19385
19386   assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() &&
19387          Subtarget->hasMMX() && "Unexpected custom BITCAST");
19388   assert((DstVT == MVT::i64 ||
19389           (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
19390          "Unexpected custom BITCAST");
19391   // i64 <=> MMX conversions are Legal.
19392   if (SrcVT==MVT::i64 && DstVT.isVector())
19393     return Op;
19394   if (DstVT==MVT::i64 && SrcVT.isVector())
19395     return Op;
19396   // MMX <=> MMX conversions are Legal.
19397   if (SrcVT.isVector() && DstVT.isVector())
19398     return Op;
19399   // All other conversions need to be expanded.
19400   return SDValue();
19401 }
19402
19403 static SDValue LowerCTPOP(SDValue Op, const X86Subtarget *Subtarget,
19404                           SelectionDAG &DAG) {
19405   SDNode *Node = Op.getNode();
19406   SDLoc dl(Node);
19407
19408   Op = Op.getOperand(0);
19409   EVT VT = Op.getValueType();
19410   assert((VT.is128BitVector() || VT.is256BitVector()) &&
19411          "CTPOP lowering only implemented for 128/256-bit wide vector types");
19412
19413   unsigned NumElts = VT.getVectorNumElements();
19414   EVT EltVT = VT.getVectorElementType();
19415   unsigned Len = EltVT.getSizeInBits();
19416
19417   // This is the vectorized version of the "best" algorithm from
19418   // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
19419   // with a minor tweak to use a series of adds + shifts instead of vector
19420   // multiplications. Implemented for the v2i64, v4i64, v4i32, v8i32 types:
19421   //
19422   //  v2i64, v4i64, v4i32 => Only profitable w/ popcnt disabled
19423   //  v8i32 => Always profitable
19424   //
19425   // FIXME: There a couple of possible improvements:
19426   //
19427   // 1) Support for i8 and i16 vectors (needs measurements if popcnt enabled).
19428   // 2) Use strategies from http://wm.ite.pl/articles/sse-popcount.html
19429   //
19430   assert(EltVT.isInteger() && (Len == 32 || Len == 64) && Len % 8 == 0 &&
19431          "CTPOP not implemented for this vector element type.");
19432
19433   // X86 canonicalize ANDs to vXi64, generate the appropriate bitcasts to avoid
19434   // extra legalization.
19435   bool NeedsBitcast = EltVT == MVT::i32;
19436   MVT BitcastVT = VT.is256BitVector() ? MVT::v4i64 : MVT::v2i64;
19437
19438   SDValue Cst55 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x55)), EltVT);
19439   SDValue Cst33 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x33)), EltVT);
19440   SDValue Cst0F = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x0F)), EltVT);
19441
19442   // v = v - ((v >> 1) & 0x55555555...)
19443   SmallVector<SDValue, 8> Ones(NumElts, DAG.getConstant(1, EltVT));
19444   SDValue OnesV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ones);
19445   SDValue Srl = DAG.getNode(ISD::SRL, dl, VT, Op, OnesV);
19446   if (NeedsBitcast)
19447     Srl = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Srl);
19448
19449   SmallVector<SDValue, 8> Mask55(NumElts, Cst55);
19450   SDValue M55 = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask55);
19451   if (NeedsBitcast)
19452     M55 = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M55);
19453
19454   SDValue And = DAG.getNode(ISD::AND, dl, Srl.getValueType(), Srl, M55);
19455   if (VT != And.getValueType())
19456     And = DAG.getNode(ISD::BITCAST, dl, VT, And);
19457   SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, Op, And);
19458
19459   // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
19460   SmallVector<SDValue, 8> Mask33(NumElts, Cst33);
19461   SDValue M33 = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask33);
19462   SmallVector<SDValue, 8> Twos(NumElts, DAG.getConstant(2, EltVT));
19463   SDValue TwosV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Twos);
19464
19465   Srl = DAG.getNode(ISD::SRL, dl, VT, Sub, TwosV);
19466   if (NeedsBitcast) {
19467     Srl = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Srl);
19468     M33 = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M33);
19469     Sub = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Sub);
19470   }
19471
19472   SDValue AndRHS = DAG.getNode(ISD::AND, dl, M33.getValueType(), Srl, M33);
19473   SDValue AndLHS = DAG.getNode(ISD::AND, dl, M33.getValueType(), Sub, M33);
19474   if (VT != AndRHS.getValueType()) {
19475     AndRHS = DAG.getNode(ISD::BITCAST, dl, VT, AndRHS);
19476     AndLHS = DAG.getNode(ISD::BITCAST, dl, VT, AndLHS);
19477   }
19478   SDValue Add = DAG.getNode(ISD::ADD, dl, VT, AndLHS, AndRHS);
19479
19480   // v = (v + (v >> 4)) & 0x0F0F0F0F...
19481   SmallVector<SDValue, 8> Fours(NumElts, DAG.getConstant(4, EltVT));
19482   SDValue FoursV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Fours);
19483   Srl = DAG.getNode(ISD::SRL, dl, VT, Add, FoursV);
19484   Add = DAG.getNode(ISD::ADD, dl, VT, Add, Srl);
19485
19486   SmallVector<SDValue, 8> Mask0F(NumElts, Cst0F);
19487   SDValue M0F = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask0F);
19488   if (NeedsBitcast) {
19489     Add = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Add);
19490     M0F = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M0F);
19491   }
19492   And = DAG.getNode(ISD::AND, dl, M0F.getValueType(), Add, M0F);
19493   if (VT != And.getValueType())
19494     And = DAG.getNode(ISD::BITCAST, dl, VT, And);
19495
19496   // The algorithm mentioned above uses:
19497   //    v = (v * 0x01010101...) >> (Len - 8)
19498   //
19499   // Change it to use vector adds + vector shifts which yield faster results on
19500   // Haswell than using vector integer multiplication.
19501   //
19502   // For i32 elements:
19503   //    v = v + (v >> 8)
19504   //    v = v + (v >> 16)
19505   //
19506   // For i64 elements:
19507   //    v = v + (v >> 8)
19508   //    v = v + (v >> 16)
19509   //    v = v + (v >> 32)
19510   //
19511   Add = And;
19512   SmallVector<SDValue, 8> Csts;
19513   for (unsigned i = 8; i <= Len/2; i *= 2) {
19514     Csts.assign(NumElts, DAG.getConstant(i, EltVT));
19515     SDValue CstsV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Csts);
19516     Srl = DAG.getNode(ISD::SRL, dl, VT, Add, CstsV);
19517     Add = DAG.getNode(ISD::ADD, dl, VT, Add, Srl);
19518     Csts.clear();
19519   }
19520
19521   // The result is on the least significant 6-bits on i32 and 7-bits on i64.
19522   SDValue Cst3F = DAG.getConstant(APInt(Len, Len == 32 ? 0x3F : 0x7F), EltVT);
19523   SmallVector<SDValue, 8> Cst3FV(NumElts, Cst3F);
19524   SDValue M3F = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Cst3FV);
19525   if (NeedsBitcast) {
19526     Add = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Add);
19527     M3F = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M3F);
19528   }
19529   And = DAG.getNode(ISD::AND, dl, M3F.getValueType(), Add, M3F);
19530   if (VT != And.getValueType())
19531     And = DAG.getNode(ISD::BITCAST, dl, VT, And);
19532
19533   return And;
19534 }
19535
19536 static SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) {
19537   SDNode *Node = Op.getNode();
19538   SDLoc dl(Node);
19539   EVT T = Node->getValueType(0);
19540   SDValue negOp = DAG.getNode(ISD::SUB, dl, T,
19541                               DAG.getConstant(0, T), Node->getOperand(2));
19542   return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl,
19543                        cast<AtomicSDNode>(Node)->getMemoryVT(),
19544                        Node->getOperand(0),
19545                        Node->getOperand(1), negOp,
19546                        cast<AtomicSDNode>(Node)->getMemOperand(),
19547                        cast<AtomicSDNode>(Node)->getOrdering(),
19548                        cast<AtomicSDNode>(Node)->getSynchScope());
19549 }
19550
19551 static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
19552   SDNode *Node = Op.getNode();
19553   SDLoc dl(Node);
19554   EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
19555
19556   // Convert seq_cst store -> xchg
19557   // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
19558   // FIXME: On 32-bit, store -> fist or movq would be more efficient
19559   //        (The only way to get a 16-byte store is cmpxchg16b)
19560   // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
19561   if (cast<AtomicSDNode>(Node)->getOrdering() == SequentiallyConsistent ||
19562       !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
19563     SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
19564                                  cast<AtomicSDNode>(Node)->getMemoryVT(),
19565                                  Node->getOperand(0),
19566                                  Node->getOperand(1), Node->getOperand(2),
19567                                  cast<AtomicSDNode>(Node)->getMemOperand(),
19568                                  cast<AtomicSDNode>(Node)->getOrdering(),
19569                                  cast<AtomicSDNode>(Node)->getSynchScope());
19570     return Swap.getValue(1);
19571   }
19572   // Other atomic stores have a simple pattern.
19573   return Op;
19574 }
19575
19576 static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
19577   EVT VT = Op.getNode()->getSimpleValueType(0);
19578
19579   // Let legalize expand this if it isn't a legal type yet.
19580   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
19581     return SDValue();
19582
19583   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
19584
19585   unsigned Opc;
19586   bool ExtraOp = false;
19587   switch (Op.getOpcode()) {
19588   default: llvm_unreachable("Invalid code");
19589   case ISD::ADDC: Opc = X86ISD::ADD; break;
19590   case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break;
19591   case ISD::SUBC: Opc = X86ISD::SUB; break;
19592   case ISD::SUBE: Opc = X86ISD::SBB; ExtraOp = true; break;
19593   }
19594
19595   if (!ExtraOp)
19596     return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
19597                        Op.getOperand(1));
19598   return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
19599                      Op.getOperand(1), Op.getOperand(2));
19600 }
19601
19602 static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget *Subtarget,
19603                             SelectionDAG &DAG) {
19604   assert(Subtarget->isTargetDarwin() && Subtarget->is64Bit());
19605
19606   // For MacOSX, we want to call an alternative entry point: __sincos_stret,
19607   // which returns the values as { float, float } (in XMM0) or
19608   // { double, double } (which is returned in XMM0, XMM1).
19609   SDLoc dl(Op);
19610   SDValue Arg = Op.getOperand(0);
19611   EVT ArgVT = Arg.getValueType();
19612   Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
19613
19614   TargetLowering::ArgListTy Args;
19615   TargetLowering::ArgListEntry Entry;
19616
19617   Entry.Node = Arg;
19618   Entry.Ty = ArgTy;
19619   Entry.isSExt = false;
19620   Entry.isZExt = false;
19621   Args.push_back(Entry);
19622
19623   bool isF64 = ArgVT == MVT::f64;
19624   // Only optimize x86_64 for now. i386 is a bit messy. For f32,
19625   // the small struct {f32, f32} is returned in (eax, edx). For f64,
19626   // the results are returned via SRet in memory.
19627   const char *LibcallName =  isF64 ? "__sincos_stret" : "__sincosf_stret";
19628   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19629   SDValue Callee = DAG.getExternalSymbol(LibcallName, TLI.getPointerTy());
19630
19631   Type *RetTy = isF64
19632     ? (Type*)StructType::get(ArgTy, ArgTy, nullptr)
19633     : (Type*)VectorType::get(ArgTy, 4);
19634
19635   TargetLowering::CallLoweringInfo CLI(DAG);
19636   CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
19637     .setCallee(CallingConv::C, RetTy, Callee, std::move(Args), 0);
19638
19639   std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
19640
19641   if (isF64)
19642     // Returned in xmm0 and xmm1.
19643     return CallResult.first;
19644
19645   // Returned in bits 0:31 and 32:64 xmm0.
19646   SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
19647                                CallResult.first, DAG.getIntPtrConstant(0));
19648   SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
19649                                CallResult.first, DAG.getIntPtrConstant(1));
19650   SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
19651   return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
19652 }
19653
19654 /// LowerOperation - Provide custom lowering hooks for some operations.
19655 ///
19656 SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
19657   switch (Op.getOpcode()) {
19658   default: llvm_unreachable("Should not custom lower this!");
19659   case ISD::SIGN_EXTEND_INREG:  return LowerSIGN_EXTEND_INREG(Op,DAG);
19660   case ISD::ATOMIC_FENCE:       return LowerATOMIC_FENCE(Op, Subtarget, DAG);
19661   case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
19662     return LowerCMP_SWAP(Op, Subtarget, DAG);
19663   case ISD::CTPOP:              return LowerCTPOP(Op, Subtarget, DAG);
19664   case ISD::ATOMIC_LOAD_SUB:    return LowerLOAD_SUB(Op,DAG);
19665   case ISD::ATOMIC_STORE:       return LowerATOMIC_STORE(Op,DAG);
19666   case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
19667   case ISD::CONCAT_VECTORS:     return LowerCONCAT_VECTORS(Op, DAG);
19668   case ISD::VECTOR_SHUFFLE:     return LowerVECTOR_SHUFFLE(Op, DAG);
19669   case ISD::VSELECT:            return LowerVSELECT(Op, DAG);
19670   case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
19671   case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
19672   case ISD::EXTRACT_SUBVECTOR:  return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
19673   case ISD::INSERT_SUBVECTOR:   return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
19674   case ISD::SCALAR_TO_VECTOR:   return LowerSCALAR_TO_VECTOR(Op, DAG);
19675   case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
19676   case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
19677   case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
19678   case ISD::ExternalSymbol:     return LowerExternalSymbol(Op, DAG);
19679   case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
19680   case ISD::SHL_PARTS:
19681   case ISD::SRA_PARTS:
19682   case ISD::SRL_PARTS:          return LowerShiftParts(Op, DAG);
19683   case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG);
19684   case ISD::UINT_TO_FP:         return LowerUINT_TO_FP(Op, DAG);
19685   case ISD::TRUNCATE:           return LowerTRUNCATE(Op, DAG);
19686   case ISD::ZERO_EXTEND:        return LowerZERO_EXTEND(Op, Subtarget, DAG);
19687   case ISD::SIGN_EXTEND:        return LowerSIGN_EXTEND(Op, Subtarget, DAG);
19688   case ISD::ANY_EXTEND:         return LowerANY_EXTEND(Op, Subtarget, DAG);
19689   case ISD::FP_TO_SINT:         return LowerFP_TO_SINT(Op, DAG);
19690   case ISD::FP_TO_UINT:         return LowerFP_TO_UINT(Op, DAG);
19691   case ISD::FP_EXTEND:          return LowerFP_EXTEND(Op, DAG);
19692   case ISD::LOAD:               return LowerExtendedLoad(Op, Subtarget, DAG);
19693   case ISD::FABS:
19694   case ISD::FNEG:               return LowerFABSorFNEG(Op, DAG);
19695   case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
19696   case ISD::FGETSIGN:           return LowerFGETSIGN(Op, DAG);
19697   case ISD::SETCC:              return LowerSETCC(Op, DAG);
19698   case ISD::SELECT:             return LowerSELECT(Op, DAG);
19699   case ISD::BRCOND:             return LowerBRCOND(Op, DAG);
19700   case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
19701   case ISD::VASTART:            return LowerVASTART(Op, DAG);
19702   case ISD::VAARG:              return LowerVAARG(Op, DAG);
19703   case ISD::VACOPY:             return LowerVACOPY(Op, Subtarget, DAG);
19704   case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, Subtarget, DAG);
19705   case ISD::INTRINSIC_VOID:
19706   case ISD::INTRINSIC_W_CHAIN:  return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
19707   case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
19708   case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
19709   case ISD::FRAME_TO_ARGS_OFFSET:
19710                                 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
19711   case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
19712   case ISD::EH_RETURN:          return LowerEH_RETURN(Op, DAG);
19713   case ISD::EH_SJLJ_SETJMP:     return lowerEH_SJLJ_SETJMP(Op, DAG);
19714   case ISD::EH_SJLJ_LONGJMP:    return lowerEH_SJLJ_LONGJMP(Op, DAG);
19715   case ISD::INIT_TRAMPOLINE:    return LowerINIT_TRAMPOLINE(Op, DAG);
19716   case ISD::ADJUST_TRAMPOLINE:  return LowerADJUST_TRAMPOLINE(Op, DAG);
19717   case ISD::FLT_ROUNDS_:        return LowerFLT_ROUNDS_(Op, DAG);
19718   case ISD::CTLZ:               return LowerCTLZ(Op, DAG);
19719   case ISD::CTLZ_ZERO_UNDEF:    return LowerCTLZ_ZERO_UNDEF(Op, DAG);
19720   case ISD::CTTZ:               return LowerCTTZ(Op, DAG);
19721   case ISD::MUL:                return LowerMUL(Op, Subtarget, DAG);
19722   case ISD::UMUL_LOHI:
19723   case ISD::SMUL_LOHI:          return LowerMUL_LOHI(Op, Subtarget, DAG);
19724   case ISD::SRA:
19725   case ISD::SRL:
19726   case ISD::SHL:                return LowerShift(Op, Subtarget, DAG);
19727   case ISD::SADDO:
19728   case ISD::UADDO:
19729   case ISD::SSUBO:
19730   case ISD::USUBO:
19731   case ISD::SMULO:
19732   case ISD::UMULO:              return LowerXALUO(Op, DAG);
19733   case ISD::READCYCLECOUNTER:   return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
19734   case ISD::BITCAST:            return LowerBITCAST(Op, Subtarget, DAG);
19735   case ISD::ADDC:
19736   case ISD::ADDE:
19737   case ISD::SUBC:
19738   case ISD::SUBE:               return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
19739   case ISD::ADD:                return LowerADD(Op, DAG);
19740   case ISD::SUB:                return LowerSUB(Op, DAG);
19741   case ISD::FSINCOS:            return LowerFSINCOS(Op, Subtarget, DAG);
19742   }
19743 }
19744
19745 /// ReplaceNodeResults - Replace a node with an illegal result type
19746 /// with a new node built out of custom code.
19747 void X86TargetLowering::ReplaceNodeResults(SDNode *N,
19748                                            SmallVectorImpl<SDValue>&Results,
19749                                            SelectionDAG &DAG) const {
19750   SDLoc dl(N);
19751   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19752   switch (N->getOpcode()) {
19753   default:
19754     llvm_unreachable("Do not know how to custom type legalize this operation!");
19755   // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
19756   case X86ISD::FMINC:
19757   case X86ISD::FMIN:
19758   case X86ISD::FMAXC:
19759   case X86ISD::FMAX: {
19760     EVT VT = N->getValueType(0);
19761     if (VT != MVT::v2f32)
19762       llvm_unreachable("Unexpected type (!= v2f32) on FMIN/FMAX.");
19763     SDValue UNDEF = DAG.getUNDEF(VT);
19764     SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
19765                               N->getOperand(0), UNDEF);
19766     SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
19767                               N->getOperand(1), UNDEF);
19768     Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
19769     return;
19770   }
19771   case ISD::SIGN_EXTEND_INREG:
19772   case ISD::ADDC:
19773   case ISD::ADDE:
19774   case ISD::SUBC:
19775   case ISD::SUBE:
19776     // We don't want to expand or promote these.
19777     return;
19778   case ISD::SDIV:
19779   case ISD::UDIV:
19780   case ISD::SREM:
19781   case ISD::UREM:
19782   case ISD::SDIVREM:
19783   case ISD::UDIVREM: {
19784     SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
19785     Results.push_back(V);
19786     return;
19787   }
19788   case ISD::FP_TO_SINT:
19789   case ISD::FP_TO_UINT: {
19790     bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
19791
19792     if (!IsSigned && !isIntegerTypeFTOL(SDValue(N, 0).getValueType()))
19793       return;
19794
19795     std::pair<SDValue,SDValue> Vals =
19796         FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
19797     SDValue FIST = Vals.first, StackSlot = Vals.second;
19798     if (FIST.getNode()) {
19799       EVT VT = N->getValueType(0);
19800       // Return a load from the stack slot.
19801       if (StackSlot.getNode())
19802         Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot,
19803                                       MachinePointerInfo(),
19804                                       false, false, false, 0));
19805       else
19806         Results.push_back(FIST);
19807     }
19808     return;
19809   }
19810   case ISD::UINT_TO_FP: {
19811     assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
19812     if (N->getOperand(0).getValueType() != MVT::v2i32 ||
19813         N->getValueType(0) != MVT::v2f32)
19814       return;
19815     SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64,
19816                                  N->getOperand(0));
19817     SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
19818                                      MVT::f64);
19819     SDValue VBias = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2f64, Bias, Bias);
19820     SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
19821                              DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, VBias));
19822     Or = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or);
19823     SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
19824     Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
19825     return;
19826   }
19827   case ISD::FP_ROUND: {
19828     if (!TLI.isTypeLegal(N->getOperand(0).getValueType()))
19829         return;
19830     SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
19831     Results.push_back(V);
19832     return;
19833   }
19834   case ISD::INTRINSIC_W_CHAIN: {
19835     unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
19836     switch (IntNo) {
19837     default : llvm_unreachable("Do not know how to custom type "
19838                                "legalize this intrinsic operation!");
19839     case Intrinsic::x86_rdtsc:
19840       return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
19841                                      Results);
19842     case Intrinsic::x86_rdtscp:
19843       return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget,
19844                                      Results);
19845     case Intrinsic::x86_rdpmc:
19846       return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results);
19847     }
19848   }
19849   case ISD::READCYCLECOUNTER: {
19850     return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
19851                                    Results);
19852   }
19853   case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
19854     EVT T = N->getValueType(0);
19855     assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
19856     bool Regs64bit = T == MVT::i128;
19857     EVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
19858     SDValue cpInL, cpInH;
19859     cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
19860                         DAG.getConstant(0, HalfT));
19861     cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
19862                         DAG.getConstant(1, HalfT));
19863     cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
19864                              Regs64bit ? X86::RAX : X86::EAX,
19865                              cpInL, SDValue());
19866     cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
19867                              Regs64bit ? X86::RDX : X86::EDX,
19868                              cpInH, cpInL.getValue(1));
19869     SDValue swapInL, swapInH;
19870     swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
19871                           DAG.getConstant(0, HalfT));
19872     swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
19873                           DAG.getConstant(1, HalfT));
19874     swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl,
19875                                Regs64bit ? X86::RBX : X86::EBX,
19876                                swapInL, cpInH.getValue(1));
19877     swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl,
19878                                Regs64bit ? X86::RCX : X86::ECX,
19879                                swapInH, swapInL.getValue(1));
19880     SDValue Ops[] = { swapInH.getValue(0),
19881                       N->getOperand(1),
19882                       swapInH.getValue(1) };
19883     SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
19884     MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
19885     unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_DAG :
19886                                   X86ISD::LCMPXCHG8_DAG;
19887     SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
19888     SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
19889                                         Regs64bit ? X86::RAX : X86::EAX,
19890                                         HalfT, Result.getValue(1));
19891     SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
19892                                         Regs64bit ? X86::RDX : X86::EDX,
19893                                         HalfT, cpOutL.getValue(2));
19894     SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
19895
19896     SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
19897                                         MVT::i32, cpOutH.getValue(2));
19898     SDValue Success =
19899         DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
19900                     DAG.getConstant(X86::COND_E, MVT::i8), EFLAGS);
19901     Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
19902
19903     Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
19904     Results.push_back(Success);
19905     Results.push_back(EFLAGS.getValue(1));
19906     return;
19907   }
19908   case ISD::ATOMIC_SWAP:
19909   case ISD::ATOMIC_LOAD_ADD:
19910   case ISD::ATOMIC_LOAD_SUB:
19911   case ISD::ATOMIC_LOAD_AND:
19912   case ISD::ATOMIC_LOAD_OR:
19913   case ISD::ATOMIC_LOAD_XOR:
19914   case ISD::ATOMIC_LOAD_NAND:
19915   case ISD::ATOMIC_LOAD_MIN:
19916   case ISD::ATOMIC_LOAD_MAX:
19917   case ISD::ATOMIC_LOAD_UMIN:
19918   case ISD::ATOMIC_LOAD_UMAX:
19919   case ISD::ATOMIC_LOAD: {
19920     // Delegate to generic TypeLegalization. Situations we can really handle
19921     // should have already been dealt with by AtomicExpandPass.cpp.
19922     break;
19923   }
19924   case ISD::BITCAST: {
19925     assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
19926     EVT DstVT = N->getValueType(0);
19927     EVT SrcVT = N->getOperand(0)->getValueType(0);
19928
19929     if (SrcVT != MVT::f64 ||
19930         (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8))
19931       return;
19932
19933     unsigned NumElts = DstVT.getVectorNumElements();
19934     EVT SVT = DstVT.getVectorElementType();
19935     EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
19936     SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
19937                                    MVT::v2f64, N->getOperand(0));
19938     SDValue ToVecInt = DAG.getNode(ISD::BITCAST, dl, WiderVT, Expanded);
19939
19940     if (ExperimentalVectorWideningLegalization) {
19941       // If we are legalizing vectors by widening, we already have the desired
19942       // legal vector type, just return it.
19943       Results.push_back(ToVecInt);
19944       return;
19945     }
19946
19947     SmallVector<SDValue, 8> Elts;
19948     for (unsigned i = 0, e = NumElts; i != e; ++i)
19949       Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT,
19950                                    ToVecInt, DAG.getIntPtrConstant(i)));
19951
19952     Results.push_back(DAG.getNode(ISD::BUILD_VECTOR, dl, DstVT, Elts));
19953   }
19954   }
19955 }
19956
19957 const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
19958   switch (Opcode) {
19959   default: return nullptr;
19960   case X86ISD::BSF:                return "X86ISD::BSF";
19961   case X86ISD::BSR:                return "X86ISD::BSR";
19962   case X86ISD::SHLD:               return "X86ISD::SHLD";
19963   case X86ISD::SHRD:               return "X86ISD::SHRD";
19964   case X86ISD::FAND:               return "X86ISD::FAND";
19965   case X86ISD::FANDN:              return "X86ISD::FANDN";
19966   case X86ISD::FOR:                return "X86ISD::FOR";
19967   case X86ISD::FXOR:               return "X86ISD::FXOR";
19968   case X86ISD::FSRL:               return "X86ISD::FSRL";
19969   case X86ISD::FILD:               return "X86ISD::FILD";
19970   case X86ISD::FILD_FLAG:          return "X86ISD::FILD_FLAG";
19971   case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
19972   case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
19973   case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
19974   case X86ISD::FLD:                return "X86ISD::FLD";
19975   case X86ISD::FST:                return "X86ISD::FST";
19976   case X86ISD::CALL:               return "X86ISD::CALL";
19977   case X86ISD::RDTSC_DAG:          return "X86ISD::RDTSC_DAG";
19978   case X86ISD::RDTSCP_DAG:         return "X86ISD::RDTSCP_DAG";
19979   case X86ISD::RDPMC_DAG:          return "X86ISD::RDPMC_DAG";
19980   case X86ISD::BT:                 return "X86ISD::BT";
19981   case X86ISD::CMP:                return "X86ISD::CMP";
19982   case X86ISD::COMI:               return "X86ISD::COMI";
19983   case X86ISD::UCOMI:              return "X86ISD::UCOMI";
19984   case X86ISD::CMPM:               return "X86ISD::CMPM";
19985   case X86ISD::CMPMU:              return "X86ISD::CMPMU";
19986   case X86ISD::SETCC:              return "X86ISD::SETCC";
19987   case X86ISD::SETCC_CARRY:        return "X86ISD::SETCC_CARRY";
19988   case X86ISD::FSETCC:             return "X86ISD::FSETCC";
19989   case X86ISD::CMOV:               return "X86ISD::CMOV";
19990   case X86ISD::BRCOND:             return "X86ISD::BRCOND";
19991   case X86ISD::RET_FLAG:           return "X86ISD::RET_FLAG";
19992   case X86ISD::REP_STOS:           return "X86ISD::REP_STOS";
19993   case X86ISD::REP_MOVS:           return "X86ISD::REP_MOVS";
19994   case X86ISD::GlobalBaseReg:      return "X86ISD::GlobalBaseReg";
19995   case X86ISD::Wrapper:            return "X86ISD::Wrapper";
19996   case X86ISD::WrapperRIP:         return "X86ISD::WrapperRIP";
19997   case X86ISD::PEXTRB:             return "X86ISD::PEXTRB";
19998   case X86ISD::PEXTRW:             return "X86ISD::PEXTRW";
19999   case X86ISD::INSERTPS:           return "X86ISD::INSERTPS";
20000   case X86ISD::PINSRB:             return "X86ISD::PINSRB";
20001   case X86ISD::PINSRW:             return "X86ISD::PINSRW";
20002   case X86ISD::PSHUFB:             return "X86ISD::PSHUFB";
20003   case X86ISD::ANDNP:              return "X86ISD::ANDNP";
20004   case X86ISD::PSIGN:              return "X86ISD::PSIGN";
20005   case X86ISD::BLENDI:             return "X86ISD::BLENDI";
20006   case X86ISD::SHRUNKBLEND:        return "X86ISD::SHRUNKBLEND";
20007   case X86ISD::SUBUS:              return "X86ISD::SUBUS";
20008   case X86ISD::HADD:               return "X86ISD::HADD";
20009   case X86ISD::HSUB:               return "X86ISD::HSUB";
20010   case X86ISD::FHADD:              return "X86ISD::FHADD";
20011   case X86ISD::FHSUB:              return "X86ISD::FHSUB";
20012   case X86ISD::UMAX:               return "X86ISD::UMAX";
20013   case X86ISD::UMIN:               return "X86ISD::UMIN";
20014   case X86ISD::SMAX:               return "X86ISD::SMAX";
20015   case X86ISD::SMIN:               return "X86ISD::SMIN";
20016   case X86ISD::FMAX:               return "X86ISD::FMAX";
20017   case X86ISD::FMIN:               return "X86ISD::FMIN";
20018   case X86ISD::FMAXC:              return "X86ISD::FMAXC";
20019   case X86ISD::FMINC:              return "X86ISD::FMINC";
20020   case X86ISD::FRSQRT:             return "X86ISD::FRSQRT";
20021   case X86ISD::FRCP:               return "X86ISD::FRCP";
20022   case X86ISD::TLSADDR:            return "X86ISD::TLSADDR";
20023   case X86ISD::TLSBASEADDR:        return "X86ISD::TLSBASEADDR";
20024   case X86ISD::TLSCALL:            return "X86ISD::TLSCALL";
20025   case X86ISD::EH_SJLJ_SETJMP:     return "X86ISD::EH_SJLJ_SETJMP";
20026   case X86ISD::EH_SJLJ_LONGJMP:    return "X86ISD::EH_SJLJ_LONGJMP";
20027   case X86ISD::EH_RETURN:          return "X86ISD::EH_RETURN";
20028   case X86ISD::TC_RETURN:          return "X86ISD::TC_RETURN";
20029   case X86ISD::FNSTCW16m:          return "X86ISD::FNSTCW16m";
20030   case X86ISD::FNSTSW16r:          return "X86ISD::FNSTSW16r";
20031   case X86ISD::LCMPXCHG_DAG:       return "X86ISD::LCMPXCHG_DAG";
20032   case X86ISD::LCMPXCHG8_DAG:      return "X86ISD::LCMPXCHG8_DAG";
20033   case X86ISD::LCMPXCHG16_DAG:     return "X86ISD::LCMPXCHG16_DAG";
20034   case X86ISD::VZEXT_MOVL:         return "X86ISD::VZEXT_MOVL";
20035   case X86ISD::VZEXT_LOAD:         return "X86ISD::VZEXT_LOAD";
20036   case X86ISD::VZEXT:              return "X86ISD::VZEXT";
20037   case X86ISD::VSEXT:              return "X86ISD::VSEXT";
20038   case X86ISD::VTRUNC:             return "X86ISD::VTRUNC";
20039   case X86ISD::VTRUNCM:            return "X86ISD::VTRUNCM";
20040   case X86ISD::VINSERT:            return "X86ISD::VINSERT";
20041   case X86ISD::VFPEXT:             return "X86ISD::VFPEXT";
20042   case X86ISD::VFPROUND:           return "X86ISD::VFPROUND";
20043   case X86ISD::VSHLDQ:             return "X86ISD::VSHLDQ";
20044   case X86ISD::VSRLDQ:             return "X86ISD::VSRLDQ";
20045   case X86ISD::VSHL:               return "X86ISD::VSHL";
20046   case X86ISD::VSRL:               return "X86ISD::VSRL";
20047   case X86ISD::VSRA:               return "X86ISD::VSRA";
20048   case X86ISD::VSHLI:              return "X86ISD::VSHLI";
20049   case X86ISD::VSRLI:              return "X86ISD::VSRLI";
20050   case X86ISD::VSRAI:              return "X86ISD::VSRAI";
20051   case X86ISD::CMPP:               return "X86ISD::CMPP";
20052   case X86ISD::PCMPEQ:             return "X86ISD::PCMPEQ";
20053   case X86ISD::PCMPGT:             return "X86ISD::PCMPGT";
20054   case X86ISD::PCMPEQM:            return "X86ISD::PCMPEQM";
20055   case X86ISD::PCMPGTM:            return "X86ISD::PCMPGTM";
20056   case X86ISD::ADD:                return "X86ISD::ADD";
20057   case X86ISD::SUB:                return "X86ISD::SUB";
20058   case X86ISD::ADC:                return "X86ISD::ADC";
20059   case X86ISD::SBB:                return "X86ISD::SBB";
20060   case X86ISD::SMUL:               return "X86ISD::SMUL";
20061   case X86ISD::UMUL:               return "X86ISD::UMUL";
20062   case X86ISD::SMUL8:              return "X86ISD::SMUL8";
20063   case X86ISD::UMUL8:              return "X86ISD::UMUL8";
20064   case X86ISD::SDIVREM8_SEXT_HREG: return "X86ISD::SDIVREM8_SEXT_HREG";
20065   case X86ISD::UDIVREM8_ZEXT_HREG: return "X86ISD::UDIVREM8_ZEXT_HREG";
20066   case X86ISD::INC:                return "X86ISD::INC";
20067   case X86ISD::DEC:                return "X86ISD::DEC";
20068   case X86ISD::OR:                 return "X86ISD::OR";
20069   case X86ISD::XOR:                return "X86ISD::XOR";
20070   case X86ISD::AND:                return "X86ISD::AND";
20071   case X86ISD::BEXTR:              return "X86ISD::BEXTR";
20072   case X86ISD::MUL_IMM:            return "X86ISD::MUL_IMM";
20073   case X86ISD::PTEST:              return "X86ISD::PTEST";
20074   case X86ISD::TESTP:              return "X86ISD::TESTP";
20075   case X86ISD::TESTM:              return "X86ISD::TESTM";
20076   case X86ISD::TESTNM:             return "X86ISD::TESTNM";
20077   case X86ISD::KORTEST:            return "X86ISD::KORTEST";
20078   case X86ISD::PACKSS:             return "X86ISD::PACKSS";
20079   case X86ISD::PACKUS:             return "X86ISD::PACKUS";
20080   case X86ISD::PALIGNR:            return "X86ISD::PALIGNR";
20081   case X86ISD::VALIGN:             return "X86ISD::VALIGN";
20082   case X86ISD::PSHUFD:             return "X86ISD::PSHUFD";
20083   case X86ISD::PSHUFHW:            return "X86ISD::PSHUFHW";
20084   case X86ISD::PSHUFLW:            return "X86ISD::PSHUFLW";
20085   case X86ISD::SHUFP:              return "X86ISD::SHUFP";
20086   case X86ISD::MOVLHPS:            return "X86ISD::MOVLHPS";
20087   case X86ISD::MOVLHPD:            return "X86ISD::MOVLHPD";
20088   case X86ISD::MOVHLPS:            return "X86ISD::MOVHLPS";
20089   case X86ISD::MOVLPS:             return "X86ISD::MOVLPS";
20090   case X86ISD::MOVLPD:             return "X86ISD::MOVLPD";
20091   case X86ISD::MOVDDUP:            return "X86ISD::MOVDDUP";
20092   case X86ISD::MOVSHDUP:           return "X86ISD::MOVSHDUP";
20093   case X86ISD::MOVSLDUP:           return "X86ISD::MOVSLDUP";
20094   case X86ISD::MOVSD:              return "X86ISD::MOVSD";
20095   case X86ISD::MOVSS:              return "X86ISD::MOVSS";
20096   case X86ISD::UNPCKL:             return "X86ISD::UNPCKL";
20097   case X86ISD::UNPCKH:             return "X86ISD::UNPCKH";
20098   case X86ISD::VBROADCAST:         return "X86ISD::VBROADCAST";
20099   case X86ISD::VBROADCASTM:        return "X86ISD::VBROADCASTM";
20100   case X86ISD::VEXTRACT:           return "X86ISD::VEXTRACT";
20101   case X86ISD::VPERMILPI:          return "X86ISD::VPERMILPI";
20102   case X86ISD::VPERM2X128:         return "X86ISD::VPERM2X128";
20103   case X86ISD::VPERMV:             return "X86ISD::VPERMV";
20104   case X86ISD::VPERMV3:            return "X86ISD::VPERMV3";
20105   case X86ISD::VPERMIV3:           return "X86ISD::VPERMIV3";
20106   case X86ISD::VPERMI:             return "X86ISD::VPERMI";
20107   case X86ISD::PMULUDQ:            return "X86ISD::PMULUDQ";
20108   case X86ISD::PMULDQ:             return "X86ISD::PMULDQ";
20109   case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
20110   case X86ISD::VAARG_64:           return "X86ISD::VAARG_64";
20111   case X86ISD::WIN_ALLOCA:         return "X86ISD::WIN_ALLOCA";
20112   case X86ISD::MEMBARRIER:         return "X86ISD::MEMBARRIER";
20113   case X86ISD::SEG_ALLOCA:         return "X86ISD::SEG_ALLOCA";
20114   case X86ISD::WIN_FTOL:           return "X86ISD::WIN_FTOL";
20115   case X86ISD::SAHF:               return "X86ISD::SAHF";
20116   case X86ISD::RDRAND:             return "X86ISD::RDRAND";
20117   case X86ISD::RDSEED:             return "X86ISD::RDSEED";
20118   case X86ISD::FMADD:              return "X86ISD::FMADD";
20119   case X86ISD::FMSUB:              return "X86ISD::FMSUB";
20120   case X86ISD::FNMADD:             return "X86ISD::FNMADD";
20121   case X86ISD::FNMSUB:             return "X86ISD::FNMSUB";
20122   case X86ISD::FMADDSUB:           return "X86ISD::FMADDSUB";
20123   case X86ISD::FMSUBADD:           return "X86ISD::FMSUBADD";
20124   case X86ISD::PCMPESTRI:          return "X86ISD::PCMPESTRI";
20125   case X86ISD::PCMPISTRI:          return "X86ISD::PCMPISTRI";
20126   case X86ISD::XTEST:              return "X86ISD::XTEST";
20127   case X86ISD::COMPRESS:           return "X86ISD::COMPRESS";
20128   case X86ISD::EXPAND:             return "X86ISD::EXPAND";
20129   case X86ISD::SELECT:             return "X86ISD::SELECT";
20130   }
20131 }
20132
20133 // isLegalAddressingMode - Return true if the addressing mode represented
20134 // by AM is legal for this target, for a load/store of the specified type.
20135 bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM,
20136                                               Type *Ty) const {
20137   // X86 supports extremely general addressing modes.
20138   CodeModel::Model M = getTargetMachine().getCodeModel();
20139   Reloc::Model R = getTargetMachine().getRelocationModel();
20140
20141   // X86 allows a sign-extended 32-bit immediate field as a displacement.
20142   if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
20143     return false;
20144
20145   if (AM.BaseGV) {
20146     unsigned GVFlags =
20147       Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine());
20148
20149     // If a reference to this global requires an extra load, we can't fold it.
20150     if (isGlobalStubReference(GVFlags))
20151       return false;
20152
20153     // If BaseGV requires a register for the PIC base, we cannot also have a
20154     // BaseReg specified.
20155     if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
20156       return false;
20157
20158     // If lower 4G is not available, then we must use rip-relative addressing.
20159     if ((M != CodeModel::Small || R != Reloc::Static) &&
20160         Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1))
20161       return false;
20162   }
20163
20164   switch (AM.Scale) {
20165   case 0:
20166   case 1:
20167   case 2:
20168   case 4:
20169   case 8:
20170     // These scales always work.
20171     break;
20172   case 3:
20173   case 5:
20174   case 9:
20175     // These scales are formed with basereg+scalereg.  Only accept if there is
20176     // no basereg yet.
20177     if (AM.HasBaseReg)
20178       return false;
20179     break;
20180   default:  // Other stuff never works.
20181     return false;
20182   }
20183
20184   return true;
20185 }
20186
20187 bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
20188   unsigned Bits = Ty->getScalarSizeInBits();
20189
20190   // 8-bit shifts are always expensive, but versions with a scalar amount aren't
20191   // particularly cheaper than those without.
20192   if (Bits == 8)
20193     return false;
20194
20195   // On AVX2 there are new vpsllv[dq] instructions (and other shifts), that make
20196   // variable shifts just as cheap as scalar ones.
20197   if (Subtarget->hasInt256() && (Bits == 32 || Bits == 64))
20198     return false;
20199
20200   // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
20201   // fully general vector.
20202   return true;
20203 }
20204
20205 bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
20206   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
20207     return false;
20208   unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
20209   unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
20210   return NumBits1 > NumBits2;
20211 }
20212
20213 bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
20214   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
20215     return false;
20216
20217   if (!isTypeLegal(EVT::getEVT(Ty1)))
20218     return false;
20219
20220   assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
20221
20222   // Assuming the caller doesn't have a zeroext or signext return parameter,
20223   // truncation all the way down to i1 is valid.
20224   return true;
20225 }
20226
20227 bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
20228   return isInt<32>(Imm);
20229 }
20230
20231 bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
20232   // Can also use sub to handle negated immediates.
20233   return isInt<32>(Imm);
20234 }
20235
20236 bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
20237   if (!VT1.isInteger() || !VT2.isInteger())
20238     return false;
20239   unsigned NumBits1 = VT1.getSizeInBits();
20240   unsigned NumBits2 = VT2.getSizeInBits();
20241   return NumBits1 > NumBits2;
20242 }
20243
20244 bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
20245   // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
20246   return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit();
20247 }
20248
20249 bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
20250   // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
20251   return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit();
20252 }
20253
20254 bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
20255   EVT VT1 = Val.getValueType();
20256   if (isZExtFree(VT1, VT2))
20257     return true;
20258
20259   if (Val.getOpcode() != ISD::LOAD)
20260     return false;
20261
20262   if (!VT1.isSimple() || !VT1.isInteger() ||
20263       !VT2.isSimple() || !VT2.isInteger())
20264     return false;
20265
20266   switch (VT1.getSimpleVT().SimpleTy) {
20267   default: break;
20268   case MVT::i8:
20269   case MVT::i16:
20270   case MVT::i32:
20271     // X86 has 8, 16, and 32-bit zero-extending loads.
20272     return true;
20273   }
20274
20275   return false;
20276 }
20277
20278 bool
20279 X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
20280   if (!(Subtarget->hasFMA() || Subtarget->hasFMA4()))
20281     return false;
20282
20283   VT = VT.getScalarType();
20284
20285   if (!VT.isSimple())
20286     return false;
20287
20288   switch (VT.getSimpleVT().SimpleTy) {
20289   case MVT::f32:
20290   case MVT::f64:
20291     return true;
20292   default:
20293     break;
20294   }
20295
20296   return false;
20297 }
20298
20299 bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
20300   // i16 instructions are longer (0x66 prefix) and potentially slower.
20301   return !(VT1 == MVT::i32 && VT2 == MVT::i16);
20302 }
20303
20304 /// isShuffleMaskLegal - Targets can use this to indicate that they only
20305 /// support *some* VECTOR_SHUFFLE operations, those with specific masks.
20306 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
20307 /// are assumed to be legal.
20308 bool
20309 X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
20310                                       EVT VT) const {
20311   if (!VT.isSimple())
20312     return false;
20313
20314   MVT SVT = VT.getSimpleVT();
20315
20316   // Very little shuffling can be done for 64-bit vectors right now.
20317   if (VT.getSizeInBits() == 64)
20318     return false;
20319
20320   // This is an experimental legality test that is tailored to match the
20321   // legality test of the experimental lowering more closely. They are gated
20322   // separately to ease testing of performance differences.
20323   if (ExperimentalVectorShuffleLegality)
20324     // We only care that the types being shuffled are legal. The lowering can
20325     // handle any possible shuffle mask that results.
20326     return isTypeLegal(SVT);
20327
20328   // If this is a single-input shuffle with no 128 bit lane crossings we can
20329   // lower it into pshufb.
20330   if ((SVT.is128BitVector() && Subtarget->hasSSSE3()) ||
20331       (SVT.is256BitVector() && Subtarget->hasInt256())) {
20332     bool isLegal = true;
20333     for (unsigned I = 0, E = M.size(); I != E; ++I) {
20334       if (M[I] >= (int)SVT.getVectorNumElements() ||
20335           ShuffleCrosses128bitLane(SVT, I, M[I])) {
20336         isLegal = false;
20337         break;
20338       }
20339     }
20340     if (isLegal)
20341       return true;
20342   }
20343
20344   // FIXME: blends, shifts.
20345   return (SVT.getVectorNumElements() == 2 ||
20346           ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
20347           isMOVLMask(M, SVT) ||
20348           isCommutedMOVLMask(M, SVT) ||
20349           isMOVHLPSMask(M, SVT) ||
20350           isSHUFPMask(M, SVT) ||
20351           isSHUFPMask(M, SVT, /* Commuted */ true) ||
20352           isPSHUFDMask(M, SVT) ||
20353           isPSHUFDMask(M, SVT, /* SecondOperand */ true) ||
20354           isPSHUFHWMask(M, SVT, Subtarget->hasInt256()) ||
20355           isPSHUFLWMask(M, SVT, Subtarget->hasInt256()) ||
20356           isPALIGNRMask(M, SVT, Subtarget) ||
20357           isUNPCKLMask(M, SVT, Subtarget->hasInt256()) ||
20358           isUNPCKHMask(M, SVT, Subtarget->hasInt256()) ||
20359           isUNPCKL_v_undef_Mask(M, SVT, Subtarget->hasInt256()) ||
20360           isUNPCKH_v_undef_Mask(M, SVT, Subtarget->hasInt256()) ||
20361           isBlendMask(M, SVT, Subtarget->hasSSE41(), Subtarget->hasInt256()) ||
20362           (Subtarget->hasSSE41() && isINSERTPSMask(M, SVT)));
20363 }
20364
20365 bool
20366 X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
20367                                           EVT VT) const {
20368   if (!VT.isSimple())
20369     return false;
20370
20371   MVT SVT = VT.getSimpleVT();
20372
20373   // This is an experimental legality test that is tailored to match the
20374   // legality test of the experimental lowering more closely. They are gated
20375   // separately to ease testing of performance differences.
20376   if (ExperimentalVectorShuffleLegality)
20377     // The new vector shuffle lowering is very good at managing zero-inputs.
20378     return isShuffleMaskLegal(Mask, VT);
20379
20380   unsigned NumElts = SVT.getVectorNumElements();
20381   // FIXME: This collection of masks seems suspect.
20382   if (NumElts == 2)
20383     return true;
20384   if (NumElts == 4 && SVT.is128BitVector()) {
20385     return (isMOVLMask(Mask, SVT)  ||
20386             isCommutedMOVLMask(Mask, SVT, true) ||
20387             isSHUFPMask(Mask, SVT) ||
20388             isSHUFPMask(Mask, SVT, /* Commuted */ true) ||
20389             isBlendMask(Mask, SVT, Subtarget->hasSSE41(),
20390                         Subtarget->hasInt256()));
20391   }
20392   return false;
20393 }
20394
20395 //===----------------------------------------------------------------------===//
20396 //                           X86 Scheduler Hooks
20397 //===----------------------------------------------------------------------===//
20398
20399 /// Utility function to emit xbegin specifying the start of an RTM region.
20400 static MachineBasicBlock *EmitXBegin(MachineInstr *MI, MachineBasicBlock *MBB,
20401                                      const TargetInstrInfo *TII) {
20402   DebugLoc DL = MI->getDebugLoc();
20403
20404   const BasicBlock *BB = MBB->getBasicBlock();
20405   MachineFunction::iterator I = MBB;
20406   ++I;
20407
20408   // For the v = xbegin(), we generate
20409   //
20410   // thisMBB:
20411   //  xbegin sinkMBB
20412   //
20413   // mainMBB:
20414   //  eax = -1
20415   //
20416   // sinkMBB:
20417   //  v = eax
20418
20419   MachineBasicBlock *thisMBB = MBB;
20420   MachineFunction *MF = MBB->getParent();
20421   MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
20422   MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
20423   MF->insert(I, mainMBB);
20424   MF->insert(I, sinkMBB);
20425
20426   // Transfer the remainder of BB and its successor edges to sinkMBB.
20427   sinkMBB->splice(sinkMBB->begin(), MBB,
20428                   std::next(MachineBasicBlock::iterator(MI)), MBB->end());
20429   sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
20430
20431   // thisMBB:
20432   //  xbegin sinkMBB
20433   //  # fallthrough to mainMBB
20434   //  # abortion to sinkMBB
20435   BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(sinkMBB);
20436   thisMBB->addSuccessor(mainMBB);
20437   thisMBB->addSuccessor(sinkMBB);
20438
20439   // mainMBB:
20440   //  EAX = -1
20441   BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), X86::EAX).addImm(-1);
20442   mainMBB->addSuccessor(sinkMBB);
20443
20444   // sinkMBB:
20445   // EAX is live into the sinkMBB
20446   sinkMBB->addLiveIn(X86::EAX);
20447   BuildMI(*sinkMBB, sinkMBB->begin(), DL,
20448           TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
20449     .addReg(X86::EAX);
20450
20451   MI->eraseFromParent();
20452   return sinkMBB;
20453 }
20454
20455 // FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
20456 // or XMM0_V32I8 in AVX all of this code can be replaced with that
20457 // in the .td file.
20458 static MachineBasicBlock *EmitPCMPSTRM(MachineInstr *MI, MachineBasicBlock *BB,
20459                                        const TargetInstrInfo *TII) {
20460   unsigned Opc;
20461   switch (MI->getOpcode()) {
20462   default: llvm_unreachable("illegal opcode!");
20463   case X86::PCMPISTRM128REG:  Opc = X86::PCMPISTRM128rr;  break;
20464   case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break;
20465   case X86::PCMPISTRM128MEM:  Opc = X86::PCMPISTRM128rm;  break;
20466   case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break;
20467   case X86::PCMPESTRM128REG:  Opc = X86::PCMPESTRM128rr;  break;
20468   case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break;
20469   case X86::PCMPESTRM128MEM:  Opc = X86::PCMPESTRM128rm;  break;
20470   case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break;
20471   }
20472
20473   DebugLoc dl = MI->getDebugLoc();
20474   MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
20475
20476   unsigned NumArgs = MI->getNumOperands();
20477   for (unsigned i = 1; i < NumArgs; ++i) {
20478     MachineOperand &Op = MI->getOperand(i);
20479     if (!(Op.isReg() && Op.isImplicit()))
20480       MIB.addOperand(Op);
20481   }
20482   if (MI->hasOneMemOperand())
20483     MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
20484
20485   BuildMI(*BB, MI, dl,
20486     TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
20487     .addReg(X86::XMM0);
20488
20489   MI->eraseFromParent();
20490   return BB;
20491 }
20492
20493 // FIXME: Custom handling because TableGen doesn't support multiple implicit
20494 // defs in an instruction pattern
20495 static MachineBasicBlock *EmitPCMPSTRI(MachineInstr *MI, MachineBasicBlock *BB,
20496                                        const TargetInstrInfo *TII) {
20497   unsigned Opc;
20498   switch (MI->getOpcode()) {
20499   default: llvm_unreachable("illegal opcode!");
20500   case X86::PCMPISTRIREG:  Opc = X86::PCMPISTRIrr;  break;
20501   case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break;
20502   case X86::PCMPISTRIMEM:  Opc = X86::PCMPISTRIrm;  break;
20503   case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break;
20504   case X86::PCMPESTRIREG:  Opc = X86::PCMPESTRIrr;  break;
20505   case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break;
20506   case X86::PCMPESTRIMEM:  Opc = X86::PCMPESTRIrm;  break;
20507   case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break;
20508   }
20509
20510   DebugLoc dl = MI->getDebugLoc();
20511   MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
20512
20513   unsigned NumArgs = MI->getNumOperands(); // remove the results
20514   for (unsigned i = 1; i < NumArgs; ++i) {
20515     MachineOperand &Op = MI->getOperand(i);
20516     if (!(Op.isReg() && Op.isImplicit()))
20517       MIB.addOperand(Op);
20518   }
20519   if (MI->hasOneMemOperand())
20520     MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
20521
20522   BuildMI(*BB, MI, dl,
20523     TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
20524     .addReg(X86::ECX);
20525
20526   MI->eraseFromParent();
20527   return BB;
20528 }
20529
20530 static MachineBasicBlock * EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB,
20531                                        const TargetInstrInfo *TII,
20532                                        const X86Subtarget* Subtarget) {
20533   DebugLoc dl = MI->getDebugLoc();
20534
20535   // Address into RAX/EAX, other two args into ECX, EDX.
20536   unsigned MemOpc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r;
20537   unsigned MemReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
20538   MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
20539   for (int i = 0; i < X86::AddrNumOperands; ++i)
20540     MIB.addOperand(MI->getOperand(i));
20541
20542   unsigned ValOps = X86::AddrNumOperands;
20543   BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
20544     .addReg(MI->getOperand(ValOps).getReg());
20545   BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
20546     .addReg(MI->getOperand(ValOps+1).getReg());
20547
20548   // The instruction doesn't actually take any operands though.
20549   BuildMI(*BB, MI, dl, TII->get(X86::MONITORrrr));
20550
20551   MI->eraseFromParent(); // The pseudo is gone now.
20552   return BB;
20553 }
20554
20555 MachineBasicBlock *
20556 X86TargetLowering::EmitVAARG64WithCustomInserter(
20557                    MachineInstr *MI,
20558                    MachineBasicBlock *MBB) const {
20559   // Emit va_arg instruction on X86-64.
20560
20561   // Operands to this pseudo-instruction:
20562   // 0  ) Output        : destination address (reg)
20563   // 1-5) Input         : va_list address (addr, i64mem)
20564   // 6  ) ArgSize       : Size (in bytes) of vararg type
20565   // 7  ) ArgMode       : 0=overflow only, 1=use gp_offset, 2=use fp_offset
20566   // 8  ) Align         : Alignment of type
20567   // 9  ) EFLAGS (implicit-def)
20568
20569   assert(MI->getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
20570   assert(X86::AddrNumOperands == 5 && "VAARG_64 assumes 5 address operands");
20571
20572   unsigned DestReg = MI->getOperand(0).getReg();
20573   MachineOperand &Base = MI->getOperand(1);
20574   MachineOperand &Scale = MI->getOperand(2);
20575   MachineOperand &Index = MI->getOperand(3);
20576   MachineOperand &Disp = MI->getOperand(4);
20577   MachineOperand &Segment = MI->getOperand(5);
20578   unsigned ArgSize = MI->getOperand(6).getImm();
20579   unsigned ArgMode = MI->getOperand(7).getImm();
20580   unsigned Align = MI->getOperand(8).getImm();
20581
20582   // Memory Reference
20583   assert(MI->hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
20584   MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
20585   MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
20586
20587   // Machine Information
20588   const TargetInstrInfo *TII = MBB->getParent()->getSubtarget().getInstrInfo();
20589   MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
20590   const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
20591   const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
20592   DebugLoc DL = MI->getDebugLoc();
20593
20594   // struct va_list {
20595   //   i32   gp_offset
20596   //   i32   fp_offset
20597   //   i64   overflow_area (address)
20598   //   i64   reg_save_area (address)
20599   // }
20600   // sizeof(va_list) = 24
20601   // alignment(va_list) = 8
20602
20603   unsigned TotalNumIntRegs = 6;
20604   unsigned TotalNumXMMRegs = 8;
20605   bool UseGPOffset = (ArgMode == 1);
20606   bool UseFPOffset = (ArgMode == 2);
20607   unsigned MaxOffset = TotalNumIntRegs * 8 +
20608                        (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
20609
20610   /* Align ArgSize to a multiple of 8 */
20611   unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
20612   bool NeedsAlign = (Align > 8);
20613
20614   MachineBasicBlock *thisMBB = MBB;
20615   MachineBasicBlock *overflowMBB;
20616   MachineBasicBlock *offsetMBB;
20617   MachineBasicBlock *endMBB;
20618
20619   unsigned OffsetDestReg = 0;    // Argument address computed by offsetMBB
20620   unsigned OverflowDestReg = 0;  // Argument address computed by overflowMBB
20621   unsigned OffsetReg = 0;
20622
20623   if (!UseGPOffset && !UseFPOffset) {
20624     // If we only pull from the overflow region, we don't create a branch.
20625     // We don't need to alter control flow.
20626     OffsetDestReg = 0; // unused
20627     OverflowDestReg = DestReg;
20628
20629     offsetMBB = nullptr;
20630     overflowMBB = thisMBB;
20631     endMBB = thisMBB;
20632   } else {
20633     // First emit code to check if gp_offset (or fp_offset) is below the bound.
20634     // If so, pull the argument from reg_save_area. (branch to offsetMBB)
20635     // If not, pull from overflow_area. (branch to overflowMBB)
20636     //
20637     //       thisMBB
20638     //         |     .
20639     //         |        .
20640     //     offsetMBB   overflowMBB
20641     //         |        .
20642     //         |     .
20643     //        endMBB
20644
20645     // Registers for the PHI in endMBB
20646     OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
20647     OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
20648
20649     const BasicBlock *LLVM_BB = MBB->getBasicBlock();
20650     MachineFunction *MF = MBB->getParent();
20651     overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
20652     offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
20653     endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
20654
20655     MachineFunction::iterator MBBIter = MBB;
20656     ++MBBIter;
20657
20658     // Insert the new basic blocks
20659     MF->insert(MBBIter, offsetMBB);
20660     MF->insert(MBBIter, overflowMBB);
20661     MF->insert(MBBIter, endMBB);
20662
20663     // Transfer the remainder of MBB and its successor edges to endMBB.
20664     endMBB->splice(endMBB->begin(), thisMBB,
20665                    std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
20666     endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
20667
20668     // Make offsetMBB and overflowMBB successors of thisMBB
20669     thisMBB->addSuccessor(offsetMBB);
20670     thisMBB->addSuccessor(overflowMBB);
20671
20672     // endMBB is a successor of both offsetMBB and overflowMBB
20673     offsetMBB->addSuccessor(endMBB);
20674     overflowMBB->addSuccessor(endMBB);
20675
20676     // Load the offset value into a register
20677     OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
20678     BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
20679       .addOperand(Base)
20680       .addOperand(Scale)
20681       .addOperand(Index)
20682       .addDisp(Disp, UseFPOffset ? 4 : 0)
20683       .addOperand(Segment)
20684       .setMemRefs(MMOBegin, MMOEnd);
20685
20686     // Check if there is enough room left to pull this argument.
20687     BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
20688       .addReg(OffsetReg)
20689       .addImm(MaxOffset + 8 - ArgSizeA8);
20690
20691     // Branch to "overflowMBB" if offset >= max
20692     // Fall through to "offsetMBB" otherwise
20693     BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
20694       .addMBB(overflowMBB);
20695   }
20696
20697   // In offsetMBB, emit code to use the reg_save_area.
20698   if (offsetMBB) {
20699     assert(OffsetReg != 0);
20700
20701     // Read the reg_save_area address.
20702     unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
20703     BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
20704       .addOperand(Base)
20705       .addOperand(Scale)
20706       .addOperand(Index)
20707       .addDisp(Disp, 16)
20708       .addOperand(Segment)
20709       .setMemRefs(MMOBegin, MMOEnd);
20710
20711     // Zero-extend the offset
20712     unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
20713       BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
20714         .addImm(0)
20715         .addReg(OffsetReg)
20716         .addImm(X86::sub_32bit);
20717
20718     // Add the offset to the reg_save_area to get the final address.
20719     BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
20720       .addReg(OffsetReg64)
20721       .addReg(RegSaveReg);
20722
20723     // Compute the offset for the next argument
20724     unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
20725     BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
20726       .addReg(OffsetReg)
20727       .addImm(UseFPOffset ? 16 : 8);
20728
20729     // Store it back into the va_list.
20730     BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
20731       .addOperand(Base)
20732       .addOperand(Scale)
20733       .addOperand(Index)
20734       .addDisp(Disp, UseFPOffset ? 4 : 0)
20735       .addOperand(Segment)
20736       .addReg(NextOffsetReg)
20737       .setMemRefs(MMOBegin, MMOEnd);
20738
20739     // Jump to endMBB
20740     BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
20741       .addMBB(endMBB);
20742   }
20743
20744   //
20745   // Emit code to use overflow area
20746   //
20747
20748   // Load the overflow_area address into a register.
20749   unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
20750   BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
20751     .addOperand(Base)
20752     .addOperand(Scale)
20753     .addOperand(Index)
20754     .addDisp(Disp, 8)
20755     .addOperand(Segment)
20756     .setMemRefs(MMOBegin, MMOEnd);
20757
20758   // If we need to align it, do so. Otherwise, just copy the address
20759   // to OverflowDestReg.
20760   if (NeedsAlign) {
20761     // Align the overflow address
20762     assert((Align & (Align-1)) == 0 && "Alignment must be a power of 2");
20763     unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);
20764
20765     // aligned_addr = (addr + (align-1)) & ~(align-1)
20766     BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
20767       .addReg(OverflowAddrReg)
20768       .addImm(Align-1);
20769
20770     BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
20771       .addReg(TmpReg)
20772       .addImm(~(uint64_t)(Align-1));
20773   } else {
20774     BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
20775       .addReg(OverflowAddrReg);
20776   }
20777
20778   // Compute the next overflow address after this argument.
20779   // (the overflow address should be kept 8-byte aligned)
20780   unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
20781   BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
20782     .addReg(OverflowDestReg)
20783     .addImm(ArgSizeA8);
20784
20785   // Store the new overflow address.
20786   BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
20787     .addOperand(Base)
20788     .addOperand(Scale)
20789     .addOperand(Index)
20790     .addDisp(Disp, 8)
20791     .addOperand(Segment)
20792     .addReg(NextAddrReg)
20793     .setMemRefs(MMOBegin, MMOEnd);
20794
20795   // If we branched, emit the PHI to the front of endMBB.
20796   if (offsetMBB) {
20797     BuildMI(*endMBB, endMBB->begin(), DL,
20798             TII->get(X86::PHI), DestReg)
20799       .addReg(OffsetDestReg).addMBB(offsetMBB)
20800       .addReg(OverflowDestReg).addMBB(overflowMBB);
20801   }
20802
20803   // Erase the pseudo instruction
20804   MI->eraseFromParent();
20805
20806   return endMBB;
20807 }
20808
20809 MachineBasicBlock *
20810 X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
20811                                                  MachineInstr *MI,
20812                                                  MachineBasicBlock *MBB) const {
20813   // Emit code to save XMM registers to the stack. The ABI says that the
20814   // number of registers to save is given in %al, so it's theoretically
20815   // possible to do an indirect jump trick to avoid saving all of them,
20816   // however this code takes a simpler approach and just executes all
20817   // of the stores if %al is non-zero. It's less code, and it's probably
20818   // easier on the hardware branch predictor, and stores aren't all that
20819   // expensive anyway.
20820
20821   // Create the new basic blocks. One block contains all the XMM stores,
20822   // and one block is the final destination regardless of whether any
20823   // stores were performed.
20824   const BasicBlock *LLVM_BB = MBB->getBasicBlock();
20825   MachineFunction *F = MBB->getParent();
20826   MachineFunction::iterator MBBIter = MBB;
20827   ++MBBIter;
20828   MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
20829   MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
20830   F->insert(MBBIter, XMMSaveMBB);
20831   F->insert(MBBIter, EndMBB);
20832
20833   // Transfer the remainder of MBB and its successor edges to EndMBB.
20834   EndMBB->splice(EndMBB->begin(), MBB,
20835                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
20836   EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
20837
20838   // The original block will now fall through to the XMM save block.
20839   MBB->addSuccessor(XMMSaveMBB);
20840   // The XMMSaveMBB will fall through to the end block.
20841   XMMSaveMBB->addSuccessor(EndMBB);
20842
20843   // Now add the instructions.
20844   const TargetInstrInfo *TII = MBB->getParent()->getSubtarget().getInstrInfo();
20845   DebugLoc DL = MI->getDebugLoc();
20846
20847   unsigned CountReg = MI->getOperand(0).getReg();
20848   int64_t RegSaveFrameIndex = MI->getOperand(1).getImm();
20849   int64_t VarArgsFPOffset = MI->getOperand(2).getImm();
20850
20851   if (!Subtarget->isTargetWin64()) {
20852     // If %al is 0, branch around the XMM save block.
20853     BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
20854     BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB);
20855     MBB->addSuccessor(EndMBB);
20856   }
20857
20858   // Make sure the last operand is EFLAGS, which gets clobbered by the branch
20859   // that was just emitted, but clearly shouldn't be "saved".
20860   assert((MI->getNumOperands() <= 3 ||
20861           !MI->getOperand(MI->getNumOperands() - 1).isReg() ||
20862           MI->getOperand(MI->getNumOperands() - 1).getReg() == X86::EFLAGS)
20863          && "Expected last argument to be EFLAGS");
20864   unsigned MOVOpc = Subtarget->hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr;
20865   // In the XMM save block, save all the XMM argument registers.
20866   for (int i = 3, e = MI->getNumOperands() - 1; i != e; ++i) {
20867     int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
20868     MachineMemOperand *MMO =
20869       F->getMachineMemOperand(
20870           MachinePointerInfo::getFixedStack(RegSaveFrameIndex, Offset),
20871         MachineMemOperand::MOStore,
20872         /*Size=*/16, /*Align=*/16);
20873     BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
20874       .addFrameIndex(RegSaveFrameIndex)
20875       .addImm(/*Scale=*/1)
20876       .addReg(/*IndexReg=*/0)
20877       .addImm(/*Disp=*/Offset)
20878       .addReg(/*Segment=*/0)
20879       .addReg(MI->getOperand(i).getReg())
20880       .addMemOperand(MMO);
20881   }
20882
20883   MI->eraseFromParent();   // The pseudo instruction is gone now.
20884
20885   return EndMBB;
20886 }
20887
20888 // The EFLAGS operand of SelectItr might be missing a kill marker
20889 // because there were multiple uses of EFLAGS, and ISel didn't know
20890 // which to mark. Figure out whether SelectItr should have had a
20891 // kill marker, and set it if it should. Returns the correct kill
20892 // marker value.
      //
      // SelectItr - the instruction whose EFLAGS kill state is being decided.
      // BB        - the block that is scanned from the instruction following
      //             SelectItr up to BB->end(), and whose successors' live-in
      //             lists are consulted.
      // TRI       - register info forwarded to addRegisterKilled().
      //
      // Returns false, leaving SelectItr untouched, when a later instruction in
      // BB reads EFLAGS or when a successor of BB lists EFLAGS as live-in.
      // Otherwise marks EFLAGS as killed on SelectItr and returns true.
20893 static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
20894                                      MachineBasicBlock* BB,
20895                                      const TargetRegisterInfo* TRI) {
20896   // Scan forward through BB for a use/def of EFLAGS.
        // The read check comes first, so an instruction that both reads and
        // defines EFLAGS is treated as a use.
20897   MachineBasicBlock::iterator miI(std::next(SelectItr));
20898   for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
20899     const MachineInstr& mi = *miI;
20900     if (mi.readsRegister(X86::EFLAGS))
20901       return false;
20902     if (mi.definesRegister(X86::EFLAGS))
20903       break; // Should have kill-flag - update below.
20904   }
20905
20906   // If we hit the end of the block, check whether EFLAGS is live into a
20907   // successor.
        // miI only equals BB->end() when the scan above found neither a use nor
        // a def; after an early break on a def this check is skipped.
20908   if (miI == BB->end()) {
20909     for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
20910                                           sEnd = BB->succ_end();
20911          sItr != sEnd; ++sItr) {
20912       MachineBasicBlock* succ = *sItr;
20913       if (succ->isLiveIn(X86::EFLAGS))
20914         return false;
20915     }
20916   }
20917
20918   // We found a def, or hit the end of the basic block and EFLAGS wasn't live
20919   // out. SelectMI should have a kill flag on EFLAGS.
20920   SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
20921   return true;
20922 }
20923
20924 MachineBasicBlock *
20925 X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
20926                                      MachineBasicBlock *BB) const {
        // Expands the select pseudo MI into explicit control flow: BB ends in a
        // conditional branch, a new fallthrough block (copy0MBB) is created, and
        // a new join block (sinkMBB) receives a PHI plus everything that followed
        // MI in BB.
        //
        // Operands of MI read here: 0 = result register, 1 = value that reaches
        // the PHI through copy0MBB, 2 = value that reaches the PHI directly from
        // BB, 3 = X86 condition code immediate selecting the branch opcode.
        //
        // MI is erased. Returns sinkMBB, the block that now holds the remainder
        // of the original BB.
20927   const TargetInstrInfo *TII = BB->getParent()->getSubtarget().getInstrInfo();
20928   DebugLoc DL = MI->getDebugLoc();
20929
20930   // To "insert" a SELECT_CC instruction, we actually have to insert the
20931   // diamond control-flow pattern.  The incoming instruction knows the
20932   // destination vreg to set, the condition code register to branch on, the
20933   // true/false values to select between, and a branch opcode to use.
20934   const BasicBlock *LLVM_BB = BB->getBasicBlock();
        // It names the block that follows BB in function layout; both new blocks
        // are inserted in front of it, i.e. directly after BB.
20935   MachineFunction::iterator It = BB;
20936   ++It;
20937
20938   //  thisMBB:
20939   //  ...
20940   //   TrueVal = ...
20941   //   cmpTY ccX, r1, r2
20942   //   bCC copy1MBB
20943   //   fallthrough --> copy0MBB
20944   MachineBasicBlock *thisMBB = BB;
20945   MachineFunction *F = BB->getParent();
20946   MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
20947   MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
20948   F->insert(It, copy0MBB);
20949   F->insert(It, sinkMBB);
20950
20951   // If the EFLAGS register isn't dead in the terminator, then claim that it's
20952   // live into the sink and copy blocks.
        // This query has to run before the splice below: checkAndUpdateEFLAGSKill
        // inspects the instructions after MI in BB and BB's successors, both of
        // which are moved to sinkMBB by the splice/transfer.
20953   const TargetRegisterInfo *TRI =
20954       BB->getParent()->getSubtarget().getRegisterInfo();
20955   if (!MI->killsRegister(X86::EFLAGS) &&
20956       !checkAndUpdateEFLAGSKill(MI, BB, TRI)) {
20957     copy0MBB->addLiveIn(X86::EFLAGS);
20958     sinkMBB->addLiveIn(X86::EFLAGS);
20959   }
20960
20961   // Transfer the remainder of BB and its successor edges to sinkMBB.
20962   sinkMBB->splice(sinkMBB->begin(), BB,
20963                   std::next(MachineBasicBlock::iterator(MI)), BB->end());
20964   sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
20965
20966   // Add the true and fallthrough blocks as its successors.
20967   BB->addSuccessor(copy0MBB);
20968   BB->addSuccessor(sinkMBB);
20969
20970   // Create the conditional branch instruction.
        // The branch targets sinkMBB; when it is not taken, control falls
        // through into copy0MBB.
20971   unsigned Opc =
20972     X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm());
20973   BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);
20974
20975   //  copy0MBB:
20976   //   %FalseValue = ...
20977   //   # fallthrough to sinkMBB
        // No instructions are emitted into copy0MBB here; it only provides a
        // distinct predecessor edge for the PHI.
20978   copy0MBB->addSuccessor(sinkMBB);
20979
20980   //  sinkMBB:
20981   //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
20982   //  ...
20983   BuildMI(*sinkMBB, sinkMBB->begin(), DL,
20984           TII->get(X86::PHI), MI->getOperand(0).getReg())
20985     .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
20986     .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
20987
20988   MI->eraseFromParent();   // The pseudo instruction is gone now.
20989   return sinkMBB;
20990 }
20991
20992 MachineBasicBlock *
20993 X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI,
20994                                         MachineBasicBlock *BB) const {
        // Expands the segmented-stack alloca pseudo MI. The requested size
        // (operand 1) is subtracted from the stack pointer and the result is
        // compared against a thread-local word; depending on the outcome the
        // allocation is done by adjusting the stack pointer (bumpMBB) or by
        // calling __morestack_allocate_stack_space (mallocMBB). Both paths meet
        // in continueMBB, where a PHI defines MI's result register (operand 0).
        //
        // Only valid when the function uses split stacks (asserted below).
        // MI is erased. Returns continueMBB, which holds the rest of the
        // original BB.
20995   MachineFunction *MF = BB->getParent();
20996   const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
20997   DebugLoc DL = MI->getDebugLoc();
20998   const BasicBlock *LLVM_BB = BB->getBasicBlock();
20999
21000   assert(MF->shouldSplitStack());
21001
21002   const bool Is64Bit = Subtarget->is64Bit();
21003   const bool IsLP64 = Subtarget->isTarget64BitLP64();
21004
        // Segment register and displacement of the thread-local word that the
        // prospective stack pointer is compared against.
        // NOTE(review): presumably this is the stacklet limit slot defined by the
        // split-stack runtime ABI - confirm the offsets against the runtime.
21005   const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
21006   const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
21007
21008   // BB:
21009   //  ... [Till the alloca]
21010   // If stacklet is not large enough, jump to mallocMBB
21011   //
21012   // bumpMBB:
21013   //  Allocate by subtracting from RSP
21014   //  Jump to continueMBB
21015   //
21016   // mallocMBB:
21017   //  Allocate by call to runtime
21018   //
21019   // continueMBB:
21020   //  ...
21021   //  [rest of original BB]
21022   //
21023
21024   MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
21025   MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
21026   MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
21027
21028   MachineRegisterInfo &MRI = MF->getRegInfo();
21029   const TargetRegisterClass *AddrRegClass =
21030     getRegClassFor(getPointerTy());
21031
        // mallocPtrVReg / bumpSPPtrVReg carry the allocation result on the two
        // paths and feed the PHI; tmpSPVReg is a copy of the current stack
        // pointer and SPLimitVReg is that copy minus the requested size.
21032   unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
21033     bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
21034     tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
21035     SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
21036     sizeVReg = MI->getOperand(1).getReg(),
21037     physSPReg = IsLP64 || Subtarget->isTargetNaCl64() ? X86::RSP : X86::ESP;
21038
21039   MachineFunction::iterator MBBIter = BB;
21040   ++MBBIter;
21041
        // Layout after BB is bumpMBB, mallocMBB, continueMBB, so the not-taken
        // path of the conditional branch emitted below falls through into
        // bumpMBB.
21042   MF->insert(MBBIter, bumpMBB);
21043   MF->insert(MBBIter, mallocMBB);
21044   MF->insert(MBBIter, continueMBB);
21045
        // Move everything after MI, together with BB's successor edges, into
        // continueMBB.
21046   continueMBB->splice(continueMBB->begin(), BB,
21047                       std::next(MachineBasicBlock::iterator(MI)), BB->end());
21048   continueMBB->transferSuccessorsAndUpdatePHIs(BB);
21049
21050   // Add code to the main basic block to check if the stack limit has been hit,
21051   // and if so, jump to mallocMBB otherwise to bumpMBB.
        // The compare has the thread-local word as its memory operand and
        // SPLimitVReg as its register operand; the JG is taken when the memory
        // value compares greater than the prospective stack pointer.
21052   BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
21053   BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
21054     .addReg(tmpSPVReg).addReg(sizeVReg);
21055   BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
21056     .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
21057     .addReg(SPLimitVReg);
21058   BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB);
21059
21060   // bumpMBB simply decreases the stack pointer, since we know the current
21061   // stacklet has enough space.
        // The new stack pointer value doubles as the allocation's address.
21062   BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
21063     .addReg(SPLimitVReg);
21064   BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
21065     .addReg(SPLimitVReg);
21066   BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
21067
21068   // Calls into a routine in libgcc to allocate more space from the heap.
        // The call is annotated with the call-preserved register mask of the C
        // calling convention.
21069   const uint32_t *RegMask = MF->getTarget()
21070                                 .getSubtargetImpl()
21071                                 ->getRegisterInfo()
21072                                 ->getCallPreservedMask(CallingConv::C);
21073   if (IsLP64) {
          // LP64: size is passed in RDI, result is defined in RAX.
21074     BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
21075       .addReg(sizeVReg);
21076     BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
21077       .addExternalSymbol("__morestack_allocate_stack_space")
21078       .addRegMask(RegMask)
21079       .addReg(X86::RDI, RegState::Implicit)
21080       .addReg(X86::RAX, RegState::ImplicitDefine);
21081   } else if (Is64Bit) {
          // 64-bit but not LP64: size is passed in EDI, result is defined in
          // EAX.
21082     BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
21083       .addReg(sizeVReg);
21084     BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
21085       .addExternalSymbol("__morestack_allocate_stack_space")
21086       .addRegMask(RegMask)
21087       .addReg(X86::EDI, RegState::Implicit)
21088       .addReg(X86::EAX, RegState::ImplicitDefine);
21089   } else {
          // 32-bit: size is pushed on the stack after reserving 12 extra bytes;
          // the matching 16-byte release is emitted after this if/else chain.
          // NOTE(review): presumably the 12-byte pad keeps the call site
          // 16-byte aligned - confirm.
21090     BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
21091       .addImm(12);
21092     BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
21093     BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
21094       .addExternalSymbol("__morestack_allocate_stack_space")
21095       .addRegMask(RegMask)
21096       .addReg(X86::EAX, RegState::ImplicitDefine);
21097   }
21098
        // 32-bit only: undo the 12-byte reservation plus the 4-byte push made
        // before the call.
21099   if (!Is64Bit)
21100     BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
21101       .addImm(16);
21102
        // Capture the call's return register as this path's result.
21103   BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
21104     .addReg(IsLP64 ? X86::RAX : X86::EAX);
21105   BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
21106
21107   // Set up the CFG correctly.
21108   BB->addSuccessor(bumpMBB);
21109   BB->addSuccessor(mallocMBB);
21110   mallocMBB->addSuccessor(continueMBB);
21111   bumpMBB->addSuccessor(continueMBB);
21112
21113   // Take care of the PHI nodes.
        // MI's result register is defined by merging the two per-path pointers.
21114   BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
21115           MI->getOperand(0).getReg())
21116     .addReg(mallocPtrVReg).addMBB(mallocMBB)
21117     .addReg(bumpSPPtrVReg).addMBB(bumpMBB);
21118
21119   // Delete the original pseudo instruction.
21120   MI->eraseFromParent();
21121
21122   // And we're done.
21123   return continueMBB;
21124 }
21125
21126 MachineBasicBlock *
21127 X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI,
21128                                         MachineBasicBlock *BB) const {
        // Replaces the Windows alloca pseudo MI with a call to the platform's
        // stack probe routine, inserted in front of MI. Which symbol is called,
        // and whether an explicit stack pointer adjustment follows, depends on
        // the subtarget (see the three cases below).
        //
        // No new blocks are created. MI is erased. Returns BB.
21129   const TargetInstrInfo *TII = BB->getParent()->getSubtarget().getInstrInfo();
21130   DebugLoc DL = MI->getDebugLoc();
21131
21132   assert(!Subtarget->isTargetMachO());
21133
21134   // The lowering is pretty easy: we're just emitting the call to _alloca.  The
21135   // non-trivial part is impdef of ESP.
21136
21137   if (Subtarget->isTargetWin64()) {
21138     if (Subtarget->isTargetCygMing()) {
21139       // ___chkstk(Mingw64):
21140       // Clobbers R10, R11, RAX and EFLAGS.
21141       // Updates RSP.
            // Modeled here as implicit uses of RAX and RSP and implicit defs of
            // RAX, RSP and EFLAGS.
            // NOTE(review): R10/R11 are not listed as operands here; presumably
            // the W64ALLOCA instruction definition covers them - confirm.
21142       BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA))
21143         .addExternalSymbol("___chkstk")
21144         .addReg(X86::RAX, RegState::Implicit)
21145         .addReg(X86::RSP, RegState::Implicit)
21146         .addReg(X86::RAX, RegState::Define | RegState::Implicit)
21147         .addReg(X86::RSP, RegState::Define | RegState::Implicit)
21148         .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
21149     } else {
21150       // __chkstk(MSVCRT): does not update stack pointer.
21151       // Clobbers R10, R11 and EFLAGS.
            // Only RAX (implicit use) and EFLAGS (implicit def) are attached to
            // the call; RSP is adjusted by the explicit SUB that follows.
21152       BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA))
21153         .addExternalSymbol("__chkstk")
21154         .addReg(X86::RAX, RegState::Implicit)
21155         .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
21156       // RAX has the offset to be subtracted from RSP.
21157       BuildMI(*BB, MI, DL, TII->get(X86::SUB64rr), X86::RSP)
21158         .addReg(X86::RSP)
21159         .addReg(X86::RAX);
21160     }
21161   } else {
          // Not Win64: call "_chkstk" for MSVC / Windows Itanium environments
          // and "_alloca" otherwise, with implicit uses of EAX and ESP and
          // implicit defs of EAX, ESP and EFLAGS.
21162     const char *StackProbeSymbol = (Subtarget->isTargetKnownWindowsMSVC() ||
21163                                     Subtarget->isTargetWindowsItanium())
21164                                        ? "_chkstk"
21165                                        : "_alloca";
21166
21167     BuildMI(*BB, MI, DL, TII->get(X86::CALLpcrel32))
21168       .addExternalSymbol(StackProbeSymbol)
21169       .addReg(X86::EAX, RegState::Implicit)
21170       .addReg(X86::ESP, RegState::Implicit)
21171       .addReg(X86::EAX, RegState::Define | RegState::Implicit)
21172       .addReg(X86::ESP, RegState::Define | RegState::Implicit)
21173       .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
21174   }
21175
21176   MI->eraseFromParent();   // The pseudo instruction is gone now.
21177   return BB;
21178 }
21179
21180 MachineBasicBlock *
21181 X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI,
21182                                       MachineBasicBlock *BB) const {
21183   // This is pretty easy.  We're taking the value that we received from
21184   // our load from the relocation, sticking it in either RDI (x86-64)
21185   // or EAX and doing an indirect call.  The return value will then
21186   // be in the normal return register.
        //
        // Operand 3 of MI must be the global being accessed (asserted below);
        // its target flags are carried over onto the emitted load. Both emitted
        // instructions are inserted in front of MI. MI is erased and BB is
        // returned; no new blocks are created.
21187   MachineFunction *F = BB->getParent();
21188   const X86InstrInfo *TII =
21189       static_cast<const X86InstrInfo *>(F->getSubtarget().getInstrInfo());
21190   DebugLoc DL = MI->getDebugLoc();
21191
21192   assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?");
21193   assert(MI->getOperand(3).isGlobal() && "This should be a global");
21194
21195   // Get a register mask for the lowered call.
21196   // FIXME: The 32-bit calls have non-standard calling conventions. Use a
21197   // proper register mask.
21198   const uint32_t *RegMask = F->getTarget()
21199                                 .getSubtargetImpl()
21200                                 ->getRegisterInfo()
21201                                 ->getCallPreservedMask(CallingConv::C);
        // In each branch MIB first names the load and is then reassigned to the
        // call, so the trailing addReg/addRegMask apply to the call only.
        // NOTE(review): addDirectMem is a helper declared outside this chunk;
        // presumably it appends a memory reference through the given register -
        // confirm in X86InstrBuilder.h.
21202   if (Subtarget->is64Bit()) {
          // 64-bit: RIP-relative load into RDI, call through RDI, result
          // defined in RAX.
21203     MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
21204                                       TII->get(X86::MOV64rm), X86::RDI)
21205     .addReg(X86::RIP)
21206     .addImm(0).addReg(0)
21207     .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
21208                       MI->getOperand(3).getTargetFlags())
21209     .addReg(0);
21210     MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
21211     addDirectMem(MIB, X86::RDI);
21212     MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
21213   } else if (F->getTarget().getRelocationModel() != Reloc::PIC_) {
          // 32-bit, non-PIC: load with no base register into EAX, call through
          // EAX, result defined in EAX.
21214     MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
21215                                       TII->get(X86::MOV32rm), X86::EAX)
21216     .addReg(0)
21217     .addImm(0).addReg(0)
21218     .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
21219                       MI->getOperand(3).getTargetFlags())
21220     .addReg(0);
21221     MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
21222     addDirectMem(MIB, X86::EAX);
21223     MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
21224   } else {
          // 32-bit, PIC: same as above except the load is based on the
          // function's global base register.
21225     MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
21226                                       TII->get(X86::MOV32rm), X86::EAX)
21227     .addReg(TII->getGlobalBaseReg(F))
21228     .addImm(0).addReg(0)
21229     .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
21230                       MI->getOperand(3).getTargetFlags())
21231     .addReg(0);
21232     MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
21233     addDirectMem(MIB, X86::EAX);
21234     MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
21235   }
21236
21237   MI->eraseFromParent(); // The pseudo instruction is gone now.
21238   return BB;
21239 }
21240
21241 MachineBasicBlock *
21242 X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
21243                                     MachineBasicBlock *MBB) const {
        // Expands the SjLj setjmp pseudo MI. Operand 0 is the i32 result
        // register; the operands that follow form the address of the jump
        // buffer. The address of a new block (restoreMBB) is stored one
        // pointer-size into the buffer, an EH_SjLj_Setup marker is emitted, and
        // the result becomes a PHI of 0 (mainMBB path) and 1 (restoreMBB path).
        //
        // MI is erased. Returns sinkMBB, which holds the PHI and the rest of the
        // original MBB.
21244   DebugLoc DL = MI->getDebugLoc();
21245   MachineFunction *MF = MBB->getParent();
21246   const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
21247   MachineRegisterInfo &MRI = MF->getRegInfo();
21248
21249   const BasicBlock *BB = MBB->getBasicBlock();
21250   MachineFunction::iterator I = MBB;
21251   ++I;
21252
21253   // Memory Reference
        // MI's memory operands are reused on the store emitted below.
21254   MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
21255   MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
21256
21257   unsigned DstReg;
21258   unsigned MemOpndSlot = 0;
21259
21260   unsigned CurOp = 0;
21261
21262   DstReg = MI->getOperand(CurOp++).getReg();
21263   const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
21264   assert(RC->hasType(MVT::i32) && "Invalid destination!");
        // One result register per incoming path of the PHI in sinkMBB.
21265   unsigned mainDstReg = MRI.createVirtualRegister(RC);
21266   unsigned restoreDstReg = MRI.createVirtualRegister(RC);
21267
        // The buffer address operands start right after the destination.
21268   MemOpndSlot = CurOp;
21269
21270   MVT PVT = getPointerTy();
21271   assert((PVT == MVT::i64 || PVT == MVT::i32) &&
21272          "Invalid Pointer Size!");
21273
21274   // For v = setjmp(buf), we generate
21275   //
21276   // thisMBB:
21277   //  buf[LabelOffset] = restoreMBB
21278   //  SjLjSetup restoreMBB
21279   //
21280   // mainMBB:
21281   //  v_main = 0
21282   //
21283   // sinkMBB:
21284   //  v = phi(main, restore)
21285   //
21286   // restoreMBB:
21287   //  if base pointer being used, load it from frame
21288   //  v_restore = 1
21289
        // mainMBB and sinkMBB are placed directly after MBB; restoreMBB is
        // appended at the end of the function.
21290   MachineBasicBlock *thisMBB = MBB;
21291   MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
21292   MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
21293   MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
21294   MF->insert(I, mainMBB);
21295   MF->insert(I, sinkMBB);
21296   MF->push_back(restoreMBB);
21297
21298   MachineInstrBuilder MIB;
21299
21300   // Transfer the remainder of BB and its successor edges to sinkMBB.
21301   sinkMBB->splice(sinkMBB->begin(), MBB,
21302                   std::next(MachineBasicBlock::iterator(MI)), MBB->end());
21303   sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
21304
21305   // thisMBB:
21306   unsigned PtrStoreOpc = 0;
21307   unsigned LabelReg = 0;
        // The resume address lives one pointer-size into the buffer.
21308   const int64_t LabelOffset = 1 * PVT.getStoreSize();
        // The block address is stored as an immediate only for the small code
        // model with static or dynamic-no-pic relocation; otherwise it is first
        // materialized into a register.
21309   Reloc::Model RM = MF->getTarget().getRelocationModel();
21310   bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
21311                      (RM == Reloc::Static || RM == Reloc::DynamicNoPIC);
21312
21313   // Prepare IP either in reg or imm.
21314   if (!UseImmLabel) {
21315     PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
21316     const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
21317     LabelReg = MRI.createVirtualRegister(PtrRC);
21318     if (Subtarget->is64Bit()) {
            // 64-bit: RIP-relative LEA of restoreMBB.
21319       MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
21320               .addReg(X86::RIP)
21321               .addImm(0)
21322               .addReg(0)
21323               .addMBB(restoreMBB)
21324               .addReg(0);
21325     } else {
            // 32-bit: LEA of restoreMBB relative to the global base register,
            // using the subtarget's block-address reference classification.
21326       const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
21327       MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
21328               .addReg(XII->getGlobalBaseReg(MF))
21329               .addImm(0)
21330               .addReg(0)
21331               .addMBB(restoreMBB, Subtarget->ClassifyBlockAddressReference())
21332               .addReg(0);
21333     }
21334   } else
21335     PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
21336   // Store IP
        // The address operands are copied from MI unchanged, except the
        // displacement, which is advanced by LabelOffset.
21337   MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
21338   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
21339     if (i == X86::AddrDisp)
21340       MIB.addDisp(MI->getOperand(MemOpndSlot + i), LabelOffset);
21341     else
21342       MIB.addOperand(MI->getOperand(MemOpndSlot + i));
21343   }
21344   if (!UseImmLabel)
21345     MIB.addReg(LabelReg);
21346   else
21347     MIB.addMBB(restoreMBB);
21348   MIB.setMemRefs(MMOBegin, MMOEnd);
21349   // Setup
21350   MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
21351           .addMBB(restoreMBB);
21352
        // The setup instruction carries the "no registers preserved" mask.
21353   const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
21354       MF->getSubtarget().getRegisterInfo());
21355   MIB.addRegMask(RegInfo->getNoPreservedMask());
21356   thisMBB->addSuccessor(mainMBB);
21357   thisMBB->addSuccessor(restoreMBB);
21358
21359   // mainMBB:
21360   //  EAX = 0
21361   BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
21362   mainMBB->addSuccessor(sinkMBB);
21363
21364   // sinkMBB:
21365   BuildMI(*sinkMBB, sinkMBB->begin(), DL,
21366           TII->get(X86::PHI), DstReg)
21367     .addReg(mainDstReg).addMBB(mainMBB)
21368     .addReg(restoreDstReg).addMBB(restoreMBB);
21369
21370   // restoreMBB:
        // When the function uses a base pointer, record that it must be
        // restorable and reload it from the frame-pointer-relative slot reported
        // by the function info before producing the result.
21371   if (RegInfo->hasBasePointer(*MF)) {
21372     const X86Subtarget &STI = MF->getTarget().getSubtarget<X86Subtarget>();
21373     const bool Uses64BitFramePtr = STI.isTarget64BitLP64() || STI.isTargetNaCl64();
21374     X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
21375     X86FI->setRestoreBasePointer(MF);
21376     unsigned FramePtr = RegInfo->getFrameRegister(*MF);
21377     unsigned BasePtr = RegInfo->getBaseRegister();
21378     unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
21379     addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
21380                  FramePtr, true, X86FI->getRestoreBasePointerOffset())
21381       .setMIFlag(MachineInstr::FrameSetup);
21382   }
        // v_restore = 1, then rejoin the main path.
21383   BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
21384   BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
21385   restoreMBB->addSuccessor(sinkMBB);
21386
21387   MI->eraseFromParent();
21388   return sinkMBB;
21389 }
21390
21391 MachineBasicBlock *
21392 X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
21393                                      MachineBasicBlock *MBB) const {
        // Expands the SjLj longjmp pseudo MI, whose first X86::AddrNumOperands
        // operands form the address of the jump buffer. Three pointer-sized
        // loads are emitted in front of MI - frame pointer from offset 0, resume
        // address from one pointer-size in, stack pointer from two pointer-sizes
        // in - followed by an indirect jump to the loaded resume address.
        //
        // No new blocks are created. MI is erased. Returns MBB.
21394   DebugLoc DL = MI->getDebugLoc();
21395   MachineFunction *MF = MBB->getParent();
21396   const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
21397   MachineRegisterInfo &MRI = MF->getRegInfo();
21398
21399   // Memory Reference
        // MI's memory operands are attached to each of the three loads.
21400   MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
21401   MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
21402
21403   MVT PVT = getPointerTy();
21404   assert((PVT == MVT::i64 || PVT == MVT::i32) &&
21405          "Invalid Pointer Size!");
21406
        // Tmp receives the resume address and is the target of the final jump.
21407   const TargetRegisterClass *RC =
21408     (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
21409   unsigned Tmp = MRI.createVirtualRegister(RC);
21410   // Since FP is only updated here but NOT referenced, it's treated as GPR.
21411   const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
21412       MF->getSubtarget().getRegisterInfo());
21413   unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
21414   unsigned SP = RegInfo->getStackRegister();
21415
21416   MachineInstrBuilder MIB;
21417
        // Buffer layout used here: [0] = FP, [1] = resume address, [2] = SP,
        // each slot one pointer store size wide.
21418   const int64_t LabelOffset = 1 * PVT.getStoreSize();
21419   const int64_t SPOffset = 2 * PVT.getStoreSize();
21420
21421   unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
21422   unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
21423
21424   // Reload FP
        // Address operands are copied from MI unchanged (offset 0).
21425   MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP);
21426   for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
21427     MIB.addOperand(MI->getOperand(i));
21428   MIB.setMemRefs(MMOBegin, MMOEnd);
21429   // Reload IP
        // Same address with the displacement advanced by LabelOffset.
21430   MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
21431   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
21432     if (i == X86::AddrDisp)
21433       MIB.addDisp(MI->getOperand(i), LabelOffset);
21434     else
21435       MIB.addOperand(MI->getOperand(i));
21436   }
21437   MIB.setMemRefs(MMOBegin, MMOEnd);
21438   // Reload SP
        // Same address with the displacement advanced by SPOffset.
        // NOTE(review): presumably SP is reloaded last so that a buffer address
        // expressed relative to the stack pointer stays valid for the earlier
        // loads - confirm.
21439   MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP);
21440   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
21441     if (i == X86::AddrDisp)
21442       MIB.addDisp(MI->getOperand(i), SPOffset);
21443     else
21444       MIB.addOperand(MI->getOperand(i));
21445   }
21446   MIB.setMemRefs(MMOBegin, MMOEnd);
21447   // Jump
21448   BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
21449
21450   MI->eraseFromParent();
21451   return MBB;
21452 }
21453
21454 // Replace 213-type (isel default) FMA3 instructions with 231-type for
21455 // accumulator loops. Writing back to the accumulator allows the coalescer
21456 // to remove extra copies in the loop.
      //
      // MI  - a register-form 213 FMA3 instruction; operand 3 is its addend.
      // MBB - the block containing MI.
      //
      // The rewrite only happens when the addend is a register defined by a PHI
      // and one of that PHI's incoming values is defined by MI itself. In that
      // case a 231-form instruction with operands 1 and 3 swapped is inserted in
      // front of MI and MI is erased; in every other case MI is left untouched.
      // Always returns MBB.
21457 MachineBasicBlock *
21458 X86TargetLowering::emitFMA3Instr(MachineInstr *MI,
21459                                  MachineBasicBlock *MBB) const {
21460   MachineOperand &AddendOp = MI->getOperand(3);
21461
21462   // Bail out early if the addend isn't a register - we can't switch these.
21463   if (!AddendOp.isReg())
21464     return MBB;
21465
21466   MachineFunction &MF = *MBB->getParent();
21467   MachineRegisterInfo &MRI = MF.getRegInfo();
21468
21469   // Check whether the addend is defined by a PHI:
21470   assert(MRI.hasOneDef(AddendOp.getReg()) && "Multiple defs in SSA?");
21471   MachineInstr &AddendDef = *MRI.def_instr_begin(AddendOp.getReg());
21472   if (!AddendDef.isPHI())
21473     return MBB;
21474
21475   // Look for the following pattern:
21476   // loop:
21477   //   %addend = phi [%entry, 0], [%loop, %result]
21478   //   ...
21479   //   %result<tied1> = FMA213 %m2<tied0>, %m1, %addend
21480
21481   // Replace with:
21482   //   loop:
21483   //   %addend = phi [%entry, 0], [%loop, %result]
21484   //   ...
21485   //   %result<tied1> = FMA231 %addend<tied0>, %m1, %m2
21486
        // Walk the PHI's incoming values: operand 0 is the PHI's own def and the
        // rest come in pairs, so the odd indices are the registers (asserted
        // inside the loop).
21487   for (unsigned i = 1, e = AddendDef.getNumOperands(); i < e; i += 2) {
21488     assert(AddendDef.getOperand(i).isReg());
21489     MachineOperand PHISrcOp = AddendDef.getOperand(i);
21490     MachineInstr &PHISrcInst = *MRI.def_instr_begin(PHISrcOp.getReg());
21491     if (&PHISrcInst == MI) {
21492       // Found a matching instruction.
            // Map each supported 213 opcode to its 231 counterpart; any other
            // opcode reaching this point is a programming error.
21493       unsigned NewFMAOpc = 0;
21494       switch (MI->getOpcode()) {
21495         case X86::VFMADDPDr213r: NewFMAOpc = X86::VFMADDPDr231r; break;
21496         case X86::VFMADDPSr213r: NewFMAOpc = X86::VFMADDPSr231r; break;
21497         case X86::VFMADDSDr213r: NewFMAOpc = X86::VFMADDSDr231r; break;
21498         case X86::VFMADDSSr213r: NewFMAOpc = X86::VFMADDSSr231r; break;
21499         case X86::VFMSUBPDr213r: NewFMAOpc = X86::VFMSUBPDr231r; break;
21500         case X86::VFMSUBPSr213r: NewFMAOpc = X86::VFMSUBPSr231r; break;
21501         case X86::VFMSUBSDr213r: NewFMAOpc = X86::VFMSUBSDr231r; break;
21502         case X86::VFMSUBSSr213r: NewFMAOpc = X86::VFMSUBSSr231r; break;
21503         case X86::VFNMADDPDr213r: NewFMAOpc = X86::VFNMADDPDr231r; break;
21504         case X86::VFNMADDPSr213r: NewFMAOpc = X86::VFNMADDPSr231r; break;
21505         case X86::VFNMADDSDr213r: NewFMAOpc = X86::VFNMADDSDr231r; break;
21506         case X86::VFNMADDSSr213r: NewFMAOpc = X86::VFNMADDSSr231r; break;
21507         case X86::VFNMSUBPDr213r: NewFMAOpc = X86::VFNMSUBPDr231r; break;
21508         case X86::VFNMSUBPSr213r: NewFMAOpc = X86::VFNMSUBPSr231r; break;
21509         case X86::VFNMSUBSDr213r: NewFMAOpc = X86::VFNMSUBSDr231r; break;
21510         case X86::VFNMSUBSSr213r: NewFMAOpc = X86::VFNMSUBSSr231r; break;
21511         case X86::VFMADDSUBPDr213r: NewFMAOpc = X86::VFMADDSUBPDr231r; break;
21512         case X86::VFMADDSUBPSr213r: NewFMAOpc = X86::VFMADDSUBPSr231r; break;
21513         case X86::VFMSUBADDPDr213r: NewFMAOpc = X86::VFMSUBADDPDr231r; break;
21514         case X86::VFMSUBADDPSr213r: NewFMAOpc = X86::VFMSUBADDPSr231r; break;
21515
21516         case X86::VFMADDPDr213rY: NewFMAOpc = X86::VFMADDPDr231rY; break;
21517         case X86::VFMADDPSr213rY: NewFMAOpc = X86::VFMADDPSr231rY; break;
21518         case X86::VFMSUBPDr213rY: NewFMAOpc = X86::VFMSUBPDr231rY; break;
21519         case X86::VFMSUBPSr213rY: NewFMAOpc = X86::VFMSUBPSr231rY; break;
21520         case X86::VFNMADDPDr213rY: NewFMAOpc = X86::VFNMADDPDr231rY; break;
21521         case X86::VFNMADDPSr213rY: NewFMAOpc = X86::VFNMADDPSr231rY; break;
21522         case X86::VFNMSUBPDr213rY: NewFMAOpc = X86::VFNMSUBPDr231rY; break;
21523         case X86::VFNMSUBPSr213rY: NewFMAOpc = X86::VFNMSUBPSr231rY; break;
21524         case X86::VFMADDSUBPDr213rY: NewFMAOpc = X86::VFMADDSUBPDr231rY; break;
21525         case X86::VFMADDSUBPSr213rY: NewFMAOpc = X86::VFMADDSUBPSr231rY; break;
21526         case X86::VFMSUBADDPDr213rY: NewFMAOpc = X86::VFMSUBADDPDr231rY; break;
21527         case X86::VFMSUBADDPSr213rY: NewFMAOpc = X86::VFMSUBADDPSr231rY; break;
21528         default: llvm_unreachable("Unrecognized FMA variant.");
21529       }
21530
            // Build the replacement with the same result operand (0) and with
            // operands 1 and 3 exchanged, so the addend takes the first source
            // slot as in the "Replace with" pattern above.
21531       const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
21532       MachineInstrBuilder MIB =
21533         BuildMI(MF, MI->getDebugLoc(), TII.get(NewFMAOpc))
21534         .addOperand(MI->getOperand(0))
21535         .addOperand(MI->getOperand(3))
21536         .addOperand(MI->getOperand(2))
21537         .addOperand(MI->getOperand(1));
21538       MBB->insert(MachineBasicBlock::iterator(MI), MIB);
            // NOTE(review): the loop does not exit after this erase, so MI is a
            // stale pointer for any remaining iterations. It is only compared by
            // address unless another incoming value also resolves to it -
            // confirm that cannot happen, or consider leaving the loop here.
21539       MI->eraseFromParent();
21540     }
21541   }
21542
21543   return MBB;
21544 }
21545
21546 MachineBasicBlock *
21547 X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
21548                                                MachineBasicBlock *BB) const {
21549   switch (MI->getOpcode()) {
21550   default: llvm_unreachable("Unexpected instr type to insert");
21551   case X86::TAILJMPd64:
21552   case X86::TAILJMPr64:
21553   case X86::TAILJMPm64:
21554     llvm_unreachable("TAILJMP64 would not be touched here.");
21555   case X86::TCRETURNdi64:
21556   case X86::TCRETURNri64:
21557   case X86::TCRETURNmi64:
21558     return BB;
21559   case X86::WIN_ALLOCA:
21560     return EmitLoweredWinAlloca(MI, BB);
21561   case X86::SEG_ALLOCA_32:
21562   case X86::SEG_ALLOCA_64:
21563     return EmitLoweredSegAlloca(MI, BB);
21564   case X86::TLSCall_32:
21565   case X86::TLSCall_64:
21566     return EmitLoweredTLSCall(MI, BB);
21567   case X86::CMOV_GR8:
21568   case X86::CMOV_FR32:
21569   case X86::CMOV_FR64:
21570   case X86::CMOV_V4F32:
21571   case X86::CMOV_V2F64:
21572   case X86::CMOV_V2I64:
21573   case X86::CMOV_V8F32:
21574   case X86::CMOV_V4F64:
21575   case X86::CMOV_V4I64:
21576   case X86::CMOV_V16F32:
21577   case X86::CMOV_V8F64:
21578   case X86::CMOV_V8I64:
21579   case X86::CMOV_GR16:
21580   case X86::CMOV_GR32:
21581   case X86::CMOV_RFP32:
21582   case X86::CMOV_RFP64:
21583   case X86::CMOV_RFP80:
21584     return EmitLoweredSelect(MI, BB);
21585
21586   case X86::FP32_TO_INT16_IN_MEM:
21587   case X86::FP32_TO_INT32_IN_MEM:
21588   case X86::FP32_TO_INT64_IN_MEM:
21589   case X86::FP64_TO_INT16_IN_MEM:
21590   case X86::FP64_TO_INT32_IN_MEM:
21591   case X86::FP64_TO_INT64_IN_MEM:
21592   case X86::FP80_TO_INT16_IN_MEM:
21593   case X86::FP80_TO_INT32_IN_MEM:
21594   case X86::FP80_TO_INT64_IN_MEM: {
21595     MachineFunction *F = BB->getParent();
21596     const TargetInstrInfo *TII = F->getSubtarget().getInstrInfo();
21597     DebugLoc DL = MI->getDebugLoc();
21598
21599     // Change the floating point control register to use "round towards zero"
21600     // mode when truncating to an integer value.
21601     int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false);
21602     addFrameReference(BuildMI(*BB, MI, DL,
21603                               TII->get(X86::FNSTCW16m)), CWFrameIdx);
21604
21605     // Load the old value of the high byte of the control word...
21606     unsigned OldCW =
21607       F->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
21608     addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
21609                       CWFrameIdx);
21610
21611     // Set the high part to be round to zero...
21612     addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
21613       .addImm(0xC7F);
21614
21615     // Reload the modified control word now...
21616     addFrameReference(BuildMI(*BB, MI, DL,
21617                               TII->get(X86::FLDCW16m)), CWFrameIdx);
21618
21619     // Restore the memory image of control word to original value
21620     addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
21621       .addReg(OldCW);
21622
21623     // Get the X86 opcode to use.
21624     unsigned Opc;
21625     switch (MI->getOpcode()) {
21626     default: llvm_unreachable("illegal opcode!");
21627     case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
21628     case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
21629     case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
21630     case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
21631     case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
21632     case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
21633     case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
21634     case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
21635     case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
21636     }
21637
21638     X86AddressMode AM;
21639     MachineOperand &Op = MI->getOperand(0);
21640     if (Op.isReg()) {
21641       AM.BaseType = X86AddressMode::RegBase;
21642       AM.Base.Reg = Op.getReg();
21643     } else {
21644       AM.BaseType = X86AddressMode::FrameIndexBase;
21645       AM.Base.FrameIndex = Op.getIndex();
21646     }
21647     Op = MI->getOperand(1);
21648     if (Op.isImm())
21649       AM.Scale = Op.getImm();
21650     Op = MI->getOperand(2);
21651     if (Op.isImm())
21652       AM.IndexReg = Op.getImm();
21653     Op = MI->getOperand(3);
21654     if (Op.isGlobal()) {
21655       AM.GV = Op.getGlobal();
21656     } else {
21657       AM.Disp = Op.getImm();
21658     }
21659     addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
21660                       .addReg(MI->getOperand(X86::AddrNumOperands).getReg());
21661
21662     // Reload the original control word now.
21663     addFrameReference(BuildMI(*BB, MI, DL,
21664                               TII->get(X86::FLDCW16m)), CWFrameIdx);
21665
21666     MI->eraseFromParent();   // The pseudo instruction is gone now.
21667     return BB;
21668   }
21669     // String/text processing lowering.
21670   case X86::PCMPISTRM128REG:
21671   case X86::VPCMPISTRM128REG:
21672   case X86::PCMPISTRM128MEM:
21673   case X86::VPCMPISTRM128MEM:
21674   case X86::PCMPESTRM128REG:
21675   case X86::VPCMPESTRM128REG:
21676   case X86::PCMPESTRM128MEM:
21677   case X86::VPCMPESTRM128MEM:
21678     assert(Subtarget->hasSSE42() &&
21679            "Target must have SSE4.2 or AVX features enabled");
21680     return EmitPCMPSTRM(MI, BB, BB->getParent()->getSubtarget().getInstrInfo());
21681
21682   // String/text processing lowering.
21683   case X86::PCMPISTRIREG:
21684   case X86::VPCMPISTRIREG:
21685   case X86::PCMPISTRIMEM:
21686   case X86::VPCMPISTRIMEM:
21687   case X86::PCMPESTRIREG:
21688   case X86::VPCMPESTRIREG:
21689   case X86::PCMPESTRIMEM:
21690   case X86::VPCMPESTRIMEM:
21691     assert(Subtarget->hasSSE42() &&
21692            "Target must have SSE4.2 or AVX features enabled");
21693     return EmitPCMPSTRI(MI, BB, BB->getParent()->getSubtarget().getInstrInfo());
21694
21695   // Thread synchronization.
21696   case X86::MONITOR:
21697     return EmitMonitor(MI, BB, BB->getParent()->getSubtarget().getInstrInfo(),
21698                        Subtarget);
21699
21700   // xbegin
21701   case X86::XBEGIN:
21702     return EmitXBegin(MI, BB, BB->getParent()->getSubtarget().getInstrInfo());
21703
21704   case X86::VASTART_SAVE_XMM_REGS:
21705     return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
21706
21707   case X86::VAARG_64:
21708     return EmitVAARG64WithCustomInserter(MI, BB);
21709
21710   case X86::EH_SjLj_SetJmp32:
21711   case X86::EH_SjLj_SetJmp64:
21712     return emitEHSjLjSetJmp(MI, BB);
21713
21714   case X86::EH_SjLj_LongJmp32:
21715   case X86::EH_SjLj_LongJmp64:
21716     return emitEHSjLjLongJmp(MI, BB);
21717
21718   case TargetOpcode::STATEPOINT:
21719     // As an implementation detail, STATEPOINT shares the STACKMAP format at
21720     // this point in the process.  We diverge later.
21721     return emitPatchPoint(MI, BB);
21722
21723   case TargetOpcode::STACKMAP:
21724   case TargetOpcode::PATCHPOINT:
21725     return emitPatchPoint(MI, BB);
21726
21727   case X86::VFMADDPDr213r:
21728   case X86::VFMADDPSr213r:
21729   case X86::VFMADDSDr213r:
21730   case X86::VFMADDSSr213r:
21731   case X86::VFMSUBPDr213r:
21732   case X86::VFMSUBPSr213r:
21733   case X86::VFMSUBSDr213r:
21734   case X86::VFMSUBSSr213r:
21735   case X86::VFNMADDPDr213r:
21736   case X86::VFNMADDPSr213r:
21737   case X86::VFNMADDSDr213r:
21738   case X86::VFNMADDSSr213r:
21739   case X86::VFNMSUBPDr213r:
21740   case X86::VFNMSUBPSr213r:
21741   case X86::VFNMSUBSDr213r:
21742   case X86::VFNMSUBSSr213r:
21743   case X86::VFMADDSUBPDr213r:
21744   case X86::VFMADDSUBPSr213r:
21745   case X86::VFMSUBADDPDr213r:
21746   case X86::VFMSUBADDPSr213r:
21747   case X86::VFMADDPDr213rY:
21748   case X86::VFMADDPSr213rY:
21749   case X86::VFMSUBPDr213rY:
21750   case X86::VFMSUBPSr213rY:
21751   case X86::VFNMADDPDr213rY:
21752   case X86::VFNMADDPSr213rY:
21753   case X86::VFNMSUBPDr213rY:
21754   case X86::VFNMSUBPSr213rY:
21755   case X86::VFMADDSUBPDr213rY:
21756   case X86::VFMADDSUBPSr213rY:
21757   case X86::VFMSUBADDPDr213rY:
21758   case X86::VFMSUBADDPSr213rY:
21759     return emitFMA3Instr(MI, BB);
21760   }
21761 }
21762
21763 //===----------------------------------------------------------------------===//
21764 //                           X86 Optimization Hooks
21765 //===----------------------------------------------------------------------===//
21766
21767 void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
21768                                                       APInt &KnownZero,
21769                                                       APInt &KnownOne,
21770                                                       const SelectionDAG &DAG,
21771                                                       unsigned Depth) const {
21772   unsigned BitWidth = KnownZero.getBitWidth();
21773   unsigned Opc = Op.getOpcode();
21774   assert((Opc >= ISD::BUILTIN_OP_END ||
21775           Opc == ISD::INTRINSIC_WO_CHAIN ||
21776           Opc == ISD::INTRINSIC_W_CHAIN ||
21777           Opc == ISD::INTRINSIC_VOID) &&
21778          "Should use MaskedValueIsZero if you don't know whether Op"
21779          " is a target node!");
21780
21781   KnownZero = KnownOne = APInt(BitWidth, 0);   // Don't know anything.
21782   switch (Opc) {
21783   default: break;
21784   case X86ISD::ADD:
21785   case X86ISD::SUB:
21786   case X86ISD::ADC:
21787   case X86ISD::SBB:
21788   case X86ISD::SMUL:
21789   case X86ISD::UMUL:
21790   case X86ISD::INC:
21791   case X86ISD::DEC:
21792   case X86ISD::OR:
21793   case X86ISD::XOR:
21794   case X86ISD::AND:
21795     // These nodes' second result is a boolean.
21796     if (Op.getResNo() == 0)
21797       break;
21798     // Fallthrough
21799   case X86ISD::SETCC:
21800     KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
21801     break;
21802   case ISD::INTRINSIC_WO_CHAIN: {
21803     unsigned IntId = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
21804     unsigned NumLoBits = 0;
21805     switch (IntId) {
21806     default: break;
21807     case Intrinsic::x86_sse_movmsk_ps:
21808     case Intrinsic::x86_avx_movmsk_ps_256:
21809     case Intrinsic::x86_sse2_movmsk_pd:
21810     case Intrinsic::x86_avx_movmsk_pd_256:
21811     case Intrinsic::x86_mmx_pmovmskb:
21812     case Intrinsic::x86_sse2_pmovmskb_128:
21813     case Intrinsic::x86_avx2_pmovmskb: {
21814       // High bits of movmskp{s|d}, pmovmskb are known zero.
21815       switch (IntId) {
21816         default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
21817         case Intrinsic::x86_sse_movmsk_ps:      NumLoBits = 4; break;
21818         case Intrinsic::x86_avx_movmsk_ps_256:  NumLoBits = 8; break;
21819         case Intrinsic::x86_sse2_movmsk_pd:     NumLoBits = 2; break;
21820         case Intrinsic::x86_avx_movmsk_pd_256:  NumLoBits = 4; break;
21821         case Intrinsic::x86_mmx_pmovmskb:       NumLoBits = 8; break;
21822         case Intrinsic::x86_sse2_pmovmskb_128:  NumLoBits = 16; break;
21823         case Intrinsic::x86_avx2_pmovmskb:      NumLoBits = 32; break;
21824       }
21825       KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits);
21826       break;
21827     }
21828     }
21829     break;
21830   }
21831   }
21832 }
21833
21834 unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
21835   SDValue Op,
21836   const SelectionDAG &,
21837   unsigned Depth) const {
21838   // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
21839   if (Op.getOpcode() == X86ISD::SETCC_CARRY)
21840     return Op.getValueType().getScalarType().getSizeInBits();
21841
21842   // Fallback case.
21843   return 1;
21844 }
21845
21846 /// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the
21847 /// node is a GlobalAddress + offset.
21848 bool X86TargetLowering::isGAPlusOffset(SDNode *N,
21849                                        const GlobalValue* &GA,
21850                                        int64_t &Offset) const {
21851   if (N->getOpcode() == X86ISD::Wrapper) {
21852     if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
21853       GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
21854       Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
21855       return true;
21856     }
21857   }
21858   return TargetLowering::isGAPlusOffset(N, GA, Offset);
21859 }
21860
21861 /// isShuffleHigh128VectorInsertLow - Checks whether the shuffle node is the
21862 /// same as extracting the high 128-bit part of 256-bit vector and then
21863 /// inserting the result into the low part of a new 256-bit vector
21864 static bool isShuffleHigh128VectorInsertLow(ShuffleVectorSDNode *SVOp) {
21865   EVT VT = SVOp->getValueType(0);
21866   unsigned NumElems = VT.getVectorNumElements();
21867
21868   // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
21869   for (unsigned i = 0, j = NumElems/2; i != NumElems/2; ++i, ++j)
21870     if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
21871         SVOp->getMaskElt(j) >= 0)
21872       return false;
21873
21874   return true;
21875 }
21876
21877 /// isShuffleLow128VectorInsertHigh - Checks whether the shuffle node is the
21878 /// same as extracting the low 128-bit part of 256-bit vector and then
21879 /// inserting the result into the high part of a new 256-bit vector
21880 static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) {
21881   EVT VT = SVOp->getValueType(0);
21882   unsigned NumElems = VT.getVectorNumElements();
21883
21884   // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
21885   for (unsigned i = NumElems/2, j = 0; i != NumElems; ++i, ++j)
21886     if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
21887         SVOp->getMaskElt(j) >= 0)
21888       return false;
21889
21890   return true;
21891 }
21892
21893 /// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors.
21894 static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
21895                                         TargetLowering::DAGCombinerInfo &DCI,
21896                                         const X86Subtarget* Subtarget) {
21897   SDLoc dl(N);
21898   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
21899   SDValue V1 = SVOp->getOperand(0);
21900   SDValue V2 = SVOp->getOperand(1);
21901   EVT VT = SVOp->getValueType(0);
21902   unsigned NumElems = VT.getVectorNumElements();
21903
21904   if (V1.getOpcode() == ISD::CONCAT_VECTORS &&
21905       V2.getOpcode() == ISD::CONCAT_VECTORS) {
21906     //
21907     //                   0,0,0,...
21908     //                      |
21909     //    V      UNDEF    BUILD_VECTOR    UNDEF
21910     //     \      /           \           /
21911     //  CONCAT_VECTOR         CONCAT_VECTOR
21912     //         \                  /
21913     //          \                /
21914     //          RESULT: V + zero extended
21915     //
21916     if (V2.getOperand(0).getOpcode() != ISD::BUILD_VECTOR ||
21917         V2.getOperand(1).getOpcode() != ISD::UNDEF ||
21918         V1.getOperand(1).getOpcode() != ISD::UNDEF)
21919       return SDValue();
21920
21921     if (!ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode()))
21922       return SDValue();
21923
21924     // To match the shuffle mask, the first half of the mask should
21925     // be exactly the first vector, and all the rest a splat with the
21926     // first element of the second one.
21927     for (unsigned i = 0; i != NumElems/2; ++i)
21928       if (!isUndefOrEqual(SVOp->getMaskElt(i), i) ||
21929           !isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems))
21930         return SDValue();
21931
21932     // If V1 is coming from a vector load then just fold to a VZEXT_LOAD.
21933     if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(V1.getOperand(0))) {
21934       if (Ld->hasNUsesOfValue(1, 0)) {
21935         SDVTList Tys = DAG.getVTList(MVT::v4i64, MVT::Other);
21936         SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() };
21937         SDValue ResNode =
21938           DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
21939                                   Ld->getMemoryVT(),
21940                                   Ld->getPointerInfo(),
21941                                   Ld->getAlignment(),
21942                                   false/*isVolatile*/, true/*ReadMem*/,
21943                                   false/*WriteMem*/);
21944
21945         // Make sure the newly-created LOAD is in the same position as Ld in
21946         // terms of dependency. We create a TokenFactor for Ld and ResNode,
21947         // and update uses of Ld's output chain to use the TokenFactor.
21948         if (Ld->hasAnyUseOfValue(1)) {
21949           SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
21950                              SDValue(Ld, 1), SDValue(ResNode.getNode(), 1));
21951           DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
21952           DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(Ld, 1),
21953                                  SDValue(ResNode.getNode(), 1));
21954         }
21955
21956         return DAG.getNode(ISD::BITCAST, dl, VT, ResNode);
21957       }
21958     }
21959
21960     // Emit a zeroed vector and insert the desired subvector on its
21961     // first half.
21962     SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
21963     SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0), 0, DAG, dl);
21964     return DCI.CombineTo(N, InsV);
21965   }
21966
21967   //===--------------------------------------------------------------------===//
21968   // Combine some shuffles into subvector extracts and inserts:
21969   //
21970
21971   // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
21972   if (isShuffleHigh128VectorInsertLow(SVOp)) {
21973     SDValue V = Extract128BitVector(V1, NumElems/2, DAG, dl);
21974     SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, 0, DAG, dl);
21975     return DCI.CombineTo(N, InsV);
21976   }
21977
21978   // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
21979   if (isShuffleLow128VectorInsertHigh(SVOp)) {
21980     SDValue V = Extract128BitVector(V1, 0, DAG, dl);
21981     SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, NumElems/2, DAG, dl);
21982     return DCI.CombineTo(N, InsV);
21983   }
21984
21985   return SDValue();
21986 }
21987
21988 /// \brief Combine an arbitrary chain of shuffles into a single instruction if
21989 /// possible.
21990 ///
21991 /// This is the leaf of the recursive combinine below. When we have found some
21992 /// chain of single-use x86 shuffle instructions and accumulated the combined
21993 /// shuffle mask represented by them, this will try to pattern match that mask
21994 /// into either a single instruction if there is a special purpose instruction
21995 /// for this operation, or into a PSHUFB instruction which is a fully general
21996 /// instruction but should only be used to replace chains over a certain depth.
21997 static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
21998                                    int Depth, bool HasPSHUFB, SelectionDAG &DAG,
21999                                    TargetLowering::DAGCombinerInfo &DCI,
22000                                    const X86Subtarget *Subtarget) {
22001   assert(!Mask.empty() && "Cannot combine an empty shuffle mask!");
22002
22003   // Find the operand that enters the chain. Note that multiple uses are OK
22004   // here, we're not going to remove the operand we find.
22005   SDValue Input = Op.getOperand(0);
22006   while (Input.getOpcode() == ISD::BITCAST)
22007     Input = Input.getOperand(0);
22008
22009   MVT VT = Input.getSimpleValueType();
22010   MVT RootVT = Root.getSimpleValueType();
22011   SDLoc DL(Root);
22012
22013   // Just remove no-op shuffle masks.
22014   if (Mask.size() == 1) {
22015     DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Input),
22016                   /*AddTo*/ true);
22017     return true;
22018   }
22019
22020   // Use the float domain if the operand type is a floating point type.
22021   bool FloatDomain = VT.isFloatingPoint();
22022
22023   // For floating point shuffles, we don't have free copies in the shuffle
22024   // instructions or the ability to load as part of the instruction, so
22025   // canonicalize their shuffles to UNPCK or MOV variants.
22026   //
22027   // Note that even with AVX we prefer the PSHUFD form of shuffle for integer
22028   // vectors because it can have a load folded into it that UNPCK cannot. This
22029   // doesn't preclude something switching to the shorter encoding post-RA.
22030   if (FloatDomain) {
22031     if (Mask.equals(0, 0) || Mask.equals(1, 1)) {
22032       bool Lo = Mask.equals(0, 0);
22033       unsigned Shuffle;
22034       MVT ShuffleVT;
22035       // Check if we have SSE3 which will let us use MOVDDUP. That instruction
22036       // is no slower than UNPCKLPD but has the option to fold the input operand
22037       // into even an unaligned memory load.
22038       if (Lo && Subtarget->hasSSE3()) {
22039         Shuffle = X86ISD::MOVDDUP;
22040         ShuffleVT = MVT::v2f64;
22041       } else {
22042         // We have MOVLHPS and MOVHLPS throughout SSE and they encode smaller
22043         // than the UNPCK variants.
22044         Shuffle = Lo ? X86ISD::MOVLHPS : X86ISD::MOVHLPS;
22045         ShuffleVT = MVT::v4f32;
22046       }
22047       if (Depth == 1 && Root->getOpcode() == Shuffle)
22048         return false; // Nothing to do!
22049       Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
22050       DCI.AddToWorklist(Op.getNode());
22051       if (Shuffle == X86ISD::MOVDDUP)
22052         Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op);
22053       else
22054         Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
22055       DCI.AddToWorklist(Op.getNode());
22056       DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
22057                     /*AddTo*/ true);
22058       return true;
22059     }
22060     if (Subtarget->hasSSE3() &&
22061         (Mask.equals(0, 0, 2, 2) || Mask.equals(1, 1, 3, 3))) {
22062       bool Lo = Mask.equals(0, 0, 2, 2);
22063       unsigned Shuffle = Lo ? X86ISD::MOVSLDUP : X86ISD::MOVSHDUP;
22064       MVT ShuffleVT = MVT::v4f32;
22065       if (Depth == 1 && Root->getOpcode() == Shuffle)
22066         return false; // Nothing to do!
22067       Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
22068       DCI.AddToWorklist(Op.getNode());
22069       Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op);
22070       DCI.AddToWorklist(Op.getNode());
22071       DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
22072                     /*AddTo*/ true);
22073       return true;
22074     }
22075     if (Mask.equals(0, 0, 1, 1) || Mask.equals(2, 2, 3, 3)) {
22076       bool Lo = Mask.equals(0, 0, 1, 1);
22077       unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
22078       MVT ShuffleVT = MVT::v4f32;
22079       if (Depth == 1 && Root->getOpcode() == Shuffle)
22080         return false; // Nothing to do!
22081       Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
22082       DCI.AddToWorklist(Op.getNode());
22083       Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
22084       DCI.AddToWorklist(Op.getNode());
22085       DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
22086                     /*AddTo*/ true);
22087       return true;
22088     }
22089   }
22090
22091   // We always canonicalize the 8 x i16 and 16 x i8 shuffles into their UNPCK
22092   // variants as none of these have single-instruction variants that are
22093   // superior to the UNPCK formulation.
22094   if (!FloatDomain &&
22095       (Mask.equals(0, 0, 1, 1, 2, 2, 3, 3) ||
22096        Mask.equals(4, 4, 5, 5, 6, 6, 7, 7) ||
22097        Mask.equals(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7) ||
22098        Mask.equals(8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15,
22099                    15))) {
22100     bool Lo = Mask[0] == 0;
22101     unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
22102     if (Depth == 1 && Root->getOpcode() == Shuffle)
22103       return false; // Nothing to do!
22104     MVT ShuffleVT;
22105     switch (Mask.size()) {
22106     case 8:
22107       ShuffleVT = MVT::v8i16;
22108       break;
22109     case 16:
22110       ShuffleVT = MVT::v16i8;
22111       break;
22112     default:
22113       llvm_unreachable("Impossible mask size!");
22114     };
22115     Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
22116     DCI.AddToWorklist(Op.getNode());
22117     Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
22118     DCI.AddToWorklist(Op.getNode());
22119     DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
22120                   /*AddTo*/ true);
22121     return true;
22122   }
22123
22124   // Don't try to re-form single instruction chains under any circumstances now
22125   // that we've done encoding canonicalization for them.
22126   if (Depth < 2)
22127     return false;
22128
22129   // If we have 3 or more shuffle instructions or a chain involving PSHUFB, we
22130   // can replace them with a single PSHUFB instruction profitably. Intel's
22131   // manuals suggest only using PSHUFB if doing so replacing 5 instructions, but
22132   // in practice PSHUFB tends to be *very* fast so we're more aggressive.
22133   if ((Depth >= 3 || HasPSHUFB) && Subtarget->hasSSSE3()) {
22134     SmallVector<SDValue, 16> PSHUFBMask;
22135     assert(Mask.size() <= 16 && "Can't shuffle elements smaller than bytes!");
22136     int Ratio = 16 / Mask.size();
22137     for (unsigned i = 0; i < 16; ++i) {
22138       if (Mask[i / Ratio] == SM_SentinelUndef) {
22139         PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
22140         continue;
22141       }
22142       int M = Mask[i / Ratio] != SM_SentinelZero
22143                   ? Ratio * Mask[i / Ratio] + i % Ratio
22144                   : 255;
22145       PSHUFBMask.push_back(DAG.getConstant(M, MVT::i8));
22146     }
22147     Op = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Input);
22148     DCI.AddToWorklist(Op.getNode());
22149     SDValue PSHUFBMaskOp =
22150         DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, PSHUFBMask);
22151     DCI.AddToWorklist(PSHUFBMaskOp.getNode());
22152     Op = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, Op, PSHUFBMaskOp);
22153     DCI.AddToWorklist(Op.getNode());
22154     DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
22155                   /*AddTo*/ true);
22156     return true;
22157   }
22158
22159   // Failed to find any combines.
22160   return false;
22161 }
22162
22163 /// \brief Fully generic combining of x86 shuffle instructions.
22164 ///
22165 /// This should be the last combine run over the x86 shuffle instructions. Once
22166 /// they have been fully optimized, this will recursively consider all chains
22167 /// of single-use shuffle instructions, build a generic model of the cumulative
22168 /// shuffle operation, and check for simpler instructions which implement this
22169 /// operation. We use this primarily for two purposes:
22170 ///
22171 /// 1) Collapse generic shuffles to specialized single instructions when
22172 ///    equivalent. In most cases, this is just an encoding size win, but
22173 ///    sometimes we will collapse multiple generic shuffles into a single
22174 ///    special-purpose shuffle.
22175 /// 2) Look for sequences of shuffle instructions with 3 or more total
22176 ///    instructions, and replace them with the slightly more expensive SSSE3
22177 ///    PSHUFB instruction if available. We do this as the last combining step
22178 ///    to ensure we avoid using PSHUFB if we can implement the shuffle with
22179 ///    a suitable short sequence of other instructions. The PHUFB will either
22180 ///    use a register or have to read from memory and so is slightly (but only
22181 ///    slightly) more expensive than the other shuffle instructions.
22182 ///
22183 /// Because this is inherently a quadratic operation (for each shuffle in
22184 /// a chain, we recurse up the chain), the depth is limited to 8 instructions.
22185 /// This should never be an issue in practice as the shuffle lowering doesn't
22186 /// produce sequences of more than 8 instructions.
22187 ///
22188 /// FIXME: We will currently miss some cases where the redundant shuffling
22189 /// would simplify under the threshold for PSHUFB formation because of
22190 /// combine-ordering. To fix this, we should do the redundant instruction
22191 /// combining in this recursive walk.
22192 static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,
22193                                           ArrayRef<int> RootMask,
22194                                           int Depth, bool HasPSHUFB,
22195                                           SelectionDAG &DAG,
22196                                           TargetLowering::DAGCombinerInfo &DCI,
22197                                           const X86Subtarget *Subtarget) {
22198   // Bound the depth of our recursive combine because this is ultimately
22199   // quadratic in nature.
22200   if (Depth > 8)
22201     return false;
22202
22203   // Directly rip through bitcasts to find the underlying operand.
22204   while (Op.getOpcode() == ISD::BITCAST && Op.getOperand(0).hasOneUse())
22205     Op = Op.getOperand(0);
22206
22207   MVT VT = Op.getSimpleValueType();
22208   if (!VT.isVector())
22209     return false; // Bail if we hit a non-vector.
22210   // FIXME: This routine should be taught about 256-bit shuffles, or a 256-bit
22211   // version should be added.
22212   if (VT.getSizeInBits() != 128)
22213     return false;
22214
22215   assert(Root.getSimpleValueType().isVector() &&
22216          "Shuffles operate on vector types!");
22217   assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() &&
22218          "Can only combine shuffles of the same vector register size.");
22219
22220   if (!isTargetShuffle(Op.getOpcode()))
22221     return false;
22222   SmallVector<int, 16> OpMask;
22223   bool IsUnary;
22224   bool HaveMask = getTargetShuffleMask(Op.getNode(), VT, OpMask, IsUnary);
22225   // We only can combine unary shuffles which we can decode the mask for.
22226   if (!HaveMask || !IsUnary)
22227     return false;
22228
22229   assert(VT.getVectorNumElements() == OpMask.size() &&
22230          "Different mask size from vector size!");
22231   assert(((RootMask.size() > OpMask.size() &&
22232            RootMask.size() % OpMask.size() == 0) ||
22233           (OpMask.size() > RootMask.size() &&
22234            OpMask.size() % RootMask.size() == 0) ||
22235           OpMask.size() == RootMask.size()) &&
22236          "The smaller number of elements must divide the larger.");
22237   int RootRatio = std::max<int>(1, OpMask.size() / RootMask.size());
22238   int OpRatio = std::max<int>(1, RootMask.size() / OpMask.size());
22239   assert(((RootRatio == 1 && OpRatio == 1) ||
22240           (RootRatio == 1) != (OpRatio == 1)) &&
22241          "Must not have a ratio for both incoming and op masks!");
22242
22243   SmallVector<int, 16> Mask;
22244   Mask.reserve(std::max(OpMask.size(), RootMask.size()));
22245
22246   // Merge this shuffle operation's mask into our accumulated mask. Note that
22247   // this shuffle's mask will be the first applied to the input, followed by the
22248   // root mask to get us all the way to the root value arrangement. The reason
22249   // for this order is that we are recursing up the operation chain.
22250   for (int i = 0, e = std::max(OpMask.size(), RootMask.size()); i < e; ++i) {
22251     int RootIdx = i / RootRatio;
22252     if (RootMask[RootIdx] < 0) {
22253       // This is a zero or undef lane, we're done.
22254       Mask.push_back(RootMask[RootIdx]);
22255       continue;
22256     }
22257
22258     int RootMaskedIdx = RootMask[RootIdx] * RootRatio + i % RootRatio;
22259     int OpIdx = RootMaskedIdx / OpRatio;
22260     if (OpMask[OpIdx] < 0) {
22261       // The incoming lanes are zero or undef, it doesn't matter which ones we
22262       // are using.
22263       Mask.push_back(OpMask[OpIdx]);
22264       continue;
22265     }
22266
22267     // Ok, we have non-zero lanes, map them through.
22268     Mask.push_back(OpMask[OpIdx] * OpRatio +
22269                    RootMaskedIdx % OpRatio);
22270   }
22271
22272   // See if we can recurse into the operand to combine more things.
22273   switch (Op.getOpcode()) {
22274     case X86ISD::PSHUFB:
22275       HasPSHUFB = true;
22276     case X86ISD::PSHUFD:
22277     case X86ISD::PSHUFHW:
22278     case X86ISD::PSHUFLW:
22279       if (Op.getOperand(0).hasOneUse() &&
22280           combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1,
22281                                         HasPSHUFB, DAG, DCI, Subtarget))
22282         return true;
22283       break;
22284
22285     case X86ISD::UNPCKL:
22286     case X86ISD::UNPCKH:
22287       assert(Op.getOperand(0) == Op.getOperand(1) && "We only combine unary shuffles!");
22288       // We can't check for single use, we have to check that this shuffle is the only user.
22289       if (Op->isOnlyUserOf(Op.getOperand(0).getNode()) &&
22290           combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1,
22291                                         HasPSHUFB, DAG, DCI, Subtarget))
22292           return true;
22293       break;
22294   }
22295
22296   // Minor canonicalization of the accumulated shuffle mask to make it easier
22297   // to match below. All this does is detect masks with squential pairs of
22298   // elements, and shrink them to the half-width mask. It does this in a loop
22299   // so it will reduce the size of the mask to the minimal width mask which
22300   // performs an equivalent shuffle.
22301   SmallVector<int, 16> WidenedMask;
22302   while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
22303     Mask = std::move(WidenedMask);
22304     WidenedMask.clear();
22305   }
22306
22307   return combineX86ShuffleChain(Op, Root, Mask, Depth, HasPSHUFB, DAG, DCI,
22308                                 Subtarget);
22309 }
22310
22311 /// \brief Get the PSHUF-style mask from PSHUF node.
22312 ///
22313 /// This is a very minor wrapper around getTargetShuffleMask to easy forming v4
22314 /// PSHUF-style masks that can be reused with such instructions.
22315 static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
22316   SmallVector<int, 4> Mask;
22317   bool IsUnary;
22318   bool HaveMask = getTargetShuffleMask(N.getNode(), N.getSimpleValueType(), Mask, IsUnary);
22319   (void)HaveMask;
22320   assert(HaveMask);
22321
22322   switch (N.getOpcode()) {
22323   case X86ISD::PSHUFD:
22324     return Mask;
22325   case X86ISD::PSHUFLW:
22326     Mask.resize(4);
22327     return Mask;
22328   case X86ISD::PSHUFHW:
22329     Mask.erase(Mask.begin(), Mask.begin() + 4);
22330     for (int &M : Mask)
22331       M -= 4;
22332     return Mask;
22333   default:
22334     llvm_unreachable("No valid shuffle instruction found!");
22335   }
22336 }
22337
22338 /// \brief Search for a combinable shuffle across a chain ending in pshufd.
22339 ///
22340 /// We walk up the chain and look for a combinable shuffle, skipping over
22341 /// shuffles that we could hoist this shuffle's transformation past without
22342 /// altering anything.
22343 static SDValue
22344 combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
22345                              SelectionDAG &DAG,
22346                              TargetLowering::DAGCombinerInfo &DCI) {
22347   assert(N.getOpcode() == X86ISD::PSHUFD &&
22348          "Called with something other than an x86 128-bit half shuffle!");
22349   SDLoc DL(N);
22350
22351   // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
22352   // of the shuffles in the chain so that we can form a fresh chain to replace
22353   // this one.
22354   SmallVector<SDValue, 8> Chain;
22355   SDValue V = N.getOperand(0);
22356   for (; V.hasOneUse(); V = V.getOperand(0)) {
22357     switch (V.getOpcode()) {
22358     default:
22359       return SDValue(); // Nothing combined!
22360
22361     case ISD::BITCAST:
22362       // Skip bitcasts as we always know the type for the target specific
22363       // instructions.
22364       continue;
22365
22366     case X86ISD::PSHUFD:
22367       // Found another dword shuffle.
22368       break;
22369
22370     case X86ISD::PSHUFLW:
22371       // Check that the low words (being shuffled) are the identity in the
22372       // dword shuffle, and the high words are self-contained.
22373       if (Mask[0] != 0 || Mask[1] != 1 ||
22374           !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
22375         return SDValue();
22376
22377       Chain.push_back(V);
22378       continue;
22379
22380     case X86ISD::PSHUFHW:
22381       // Check that the high words (being shuffled) are the identity in the
22382       // dword shuffle, and the low words are self-contained.
22383       if (Mask[2] != 2 || Mask[3] != 3 ||
22384           !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
22385         return SDValue();
22386
22387       Chain.push_back(V);
22388       continue;
22389
22390     case X86ISD::UNPCKL:
22391     case X86ISD::UNPCKH:
22392       // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
22393       // shuffle into a preceding word shuffle.
22394       if (V.getValueType() != MVT::v16i8 && V.getValueType() != MVT::v8i16)
22395         return SDValue();
22396
22397       // Search for a half-shuffle which we can combine with.
22398       unsigned CombineOp =
22399           V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
22400       if (V.getOperand(0) != V.getOperand(1) ||
22401           !V->isOnlyUserOf(V.getOperand(0).getNode()))
22402         return SDValue();
22403       Chain.push_back(V);
22404       V = V.getOperand(0);
22405       do {
22406         switch (V.getOpcode()) {
22407         default:
22408           return SDValue(); // Nothing to combine.
22409
22410         case X86ISD::PSHUFLW:
22411         case X86ISD::PSHUFHW:
22412           if (V.getOpcode() == CombineOp)
22413             break;
22414
22415           Chain.push_back(V);
22416
22417           // Fallthrough!
22418         case ISD::BITCAST:
22419           V = V.getOperand(0);
22420           continue;
22421         }
22422         break;
22423       } while (V.hasOneUse());
22424       break;
22425     }
22426     // Break out of the loop if we break out of the switch.
22427     break;
22428   }
22429
22430   if (!V.hasOneUse())
22431     // We fell out of the loop without finding a viable combining instruction.
22432     return SDValue();
22433
22434   // Merge this node's mask and our incoming mask.
22435   SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
22436   for (int &M : Mask)
22437     M = VMask[M];
22438   V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
22439                   getV4X86ShuffleImm8ForMask(Mask, DAG));
22440
22441   // Rebuild the chain around this new shuffle.
22442   while (!Chain.empty()) {
22443     SDValue W = Chain.pop_back_val();
22444
22445     if (V.getValueType() != W.getOperand(0).getValueType())
22446       V = DAG.getNode(ISD::BITCAST, DL, W.getOperand(0).getValueType(), V);
22447
22448     switch (W.getOpcode()) {
22449     default:
22450       llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
22451
22452     case X86ISD::UNPCKL:
22453     case X86ISD::UNPCKH:
22454       V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
22455       break;
22456
22457     case X86ISD::PSHUFD:
22458     case X86ISD::PSHUFLW:
22459     case X86ISD::PSHUFHW:
22460       V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
22461       break;
22462     }
22463   }
22464   if (V.getValueType() != N.getValueType())
22465     V = DAG.getNode(ISD::BITCAST, DL, N.getValueType(), V);
22466
22467   // Return the new chain to replace N.
22468   return V;
22469 }
22470
22471 /// \brief Search for a combinable shuffle across a chain ending in pshuflw or pshufhw.
22472 ///
22473 /// We walk up the chain, skipping shuffles of the other half and looking
22474 /// through shuffles which switch halves trying to find a shuffle of the same
22475 /// pair of dwords.
22476 static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask,
22477                                         SelectionDAG &DAG,
22478                                         TargetLowering::DAGCombinerInfo &DCI) {
22479   assert(
22480       (N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) &&
22481       "Called with something other than an x86 128-bit half shuffle!");
22482   SDLoc DL(N);
22483   unsigned CombineOpcode = N.getOpcode();
22484
22485   // Walk up a single-use chain looking for a combinable shuffle.
22486   SDValue V = N.getOperand(0);
22487   for (; V.hasOneUse(); V = V.getOperand(0)) {
22488     switch (V.getOpcode()) {
22489     default:
22490       return false; // Nothing combined!
22491
22492     case ISD::BITCAST:
22493       // Skip bitcasts as we always know the type for the target specific
22494       // instructions.
22495       continue;
22496
22497     case X86ISD::PSHUFLW:
22498     case X86ISD::PSHUFHW:
22499       if (V.getOpcode() == CombineOpcode)
22500         break;
22501
22502       // Other-half shuffles are no-ops.
22503       continue;
22504     }
22505     // Break out of the loop if we break out of the switch.
22506     break;
22507   }
22508
22509   if (!V.hasOneUse())
22510     // We fell out of the loop without finding a viable combining instruction.
22511     return false;
22512
22513   // Combine away the bottom node as its shuffle will be accumulated into
22514   // a preceding shuffle.
22515   DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
22516
22517   // Record the old value.
22518   SDValue Old = V;
22519
22520   // Merge this node's mask and our incoming mask (adjusted to account for all
22521   // the pshufd instructions encountered).
22522   SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
22523   for (int &M : Mask)
22524     M = VMask[M];
22525   V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0),
22526                   getV4X86ShuffleImm8ForMask(Mask, DAG));
22527
22528   // Check that the shuffles didn't cancel each other out. If not, we need to
22529   // combine to the new one.
22530   if (Old != V)
22531     // Replace the combinable shuffle with the combined one, updating all users
22532     // so that we re-evaluate the chain here.
22533     DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true);
22534
22535   return true;
22536 }
22537
22538 /// \brief Try to combine x86 target specific shuffles.
22539 static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,
22540                                            TargetLowering::DAGCombinerInfo &DCI,
22541                                            const X86Subtarget *Subtarget) {
22542   SDLoc DL(N);
22543   MVT VT = N.getSimpleValueType();
22544   SmallVector<int, 4> Mask;
22545
22546   switch (N.getOpcode()) {
22547   case X86ISD::PSHUFD:
22548   case X86ISD::PSHUFLW:
22549   case X86ISD::PSHUFHW:
22550     Mask = getPSHUFShuffleMask(N);
22551     assert(Mask.size() == 4);
22552     break;
22553   default:
22554     return SDValue();
22555   }
22556
22557   // Nuke no-op shuffles that show up after combining.
22558   if (isNoopShuffleMask(Mask))
22559     return DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
22560
22561   // Look for simplifications involving one or two shuffle instructions.
22562   SDValue V = N.getOperand(0);
22563   switch (N.getOpcode()) {
22564   default:
22565     break;
22566   case X86ISD::PSHUFLW:
22567   case X86ISD::PSHUFHW:
22568     assert(VT == MVT::v8i16);
22569     (void)VT;
22570
22571     if (combineRedundantHalfShuffle(N, Mask, DAG, DCI))
22572       return SDValue(); // We combined away this shuffle, so we're done.
22573
22574     // See if this reduces to a PSHUFD which is no more expensive and can
22575     // combine with more operations. Note that it has to at least flip the
22576     // dwords as otherwise it would have been removed as a no-op.
22577     if (Mask[0] == 2 && Mask[1] == 3 && Mask[2] == 0 && Mask[3] == 1) {
22578       int DMask[] = {0, 1, 2, 3};
22579       int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
22580       DMask[DOffset + 0] = DOffset + 1;
22581       DMask[DOffset + 1] = DOffset + 0;
22582       V = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V);
22583       DCI.AddToWorklist(V.getNode());
22584       V = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V,
22585                       getV4X86ShuffleImm8ForMask(DMask, DAG));
22586       DCI.AddToWorklist(V.getNode());
22587       return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V);
22588     }
22589
22590     // Look for shuffle patterns which can be implemented as a single unpack.
22591     // FIXME: This doesn't handle the location of the PSHUFD generically, and
22592     // only works when we have a PSHUFD followed by two half-shuffles.
22593     if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
22594         (V.getOpcode() == X86ISD::PSHUFLW ||
22595          V.getOpcode() == X86ISD::PSHUFHW) &&
22596         V.getOpcode() != N.getOpcode() &&
22597         V.hasOneUse()) {
22598       SDValue D = V.getOperand(0);
22599       while (D.getOpcode() == ISD::BITCAST && D.hasOneUse())
22600         D = D.getOperand(0);
22601       if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {
22602         SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
22603         SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
22604         int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
22605         int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
22606         int WordMask[8];
22607         for (int i = 0; i < 4; ++i) {
22608           WordMask[i + NOffset] = Mask[i] + NOffset;
22609           WordMask[i + VOffset] = VMask[i] + VOffset;
22610         }
22611         // Map the word mask through the DWord mask.
22612         int MappedMask[8];
22613         for (int i = 0; i < 8; ++i)
22614           MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
22615         const int UnpackLoMask[] = {0, 0, 1, 1, 2, 2, 3, 3};
22616         const int UnpackHiMask[] = {4, 4, 5, 5, 6, 6, 7, 7};
22617         if (std::equal(std::begin(MappedMask), std::end(MappedMask),
22618                        std::begin(UnpackLoMask)) ||
22619             std::equal(std::begin(MappedMask), std::end(MappedMask),
22620                        std::begin(UnpackHiMask))) {
22621           // We can replace all three shuffles with an unpack.
22622           V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, D.getOperand(0));
22623           DCI.AddToWorklist(V.getNode());
22624           return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
22625                                                 : X86ISD::UNPCKH,
22626                              DL, MVT::v8i16, V, V);
22627         }
22628       }
22629     }
22630
22631     break;
22632
22633   case X86ISD::PSHUFD:
22634     if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG, DCI))
22635       return NewN;
22636
22637     break;
22638   }
22639
22640   return SDValue();
22641 }
22642
22643 /// \brief Try to combine a shuffle into a target-specific add-sub node.
22644 ///
22645 /// We combine this directly on the abstract vector shuffle nodes so it is
22646 /// easier to generically match. We also insert dummy vector shuffle nodes for
22647 /// the operands which explicitly discard the lanes which are unused by this
22648 /// operation to try to flow through the rest of the combiner the fact that
22649 /// they're unused.
22650 static SDValue combineShuffleToAddSub(SDNode *N, SelectionDAG &DAG) {
22651   SDLoc DL(N);
22652   EVT VT = N->getValueType(0);
22653
22654   // We only handle target-independent shuffles.
22655   // FIXME: It would be easy and harmless to use the target shuffle mask
22656   // extraction tool to support more.
22657   if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
22658     return SDValue();
22659
22660   auto *SVN = cast<ShuffleVectorSDNode>(N);
22661   ArrayRef<int> Mask = SVN->getMask();
22662   SDValue V1 = N->getOperand(0);
22663   SDValue V2 = N->getOperand(1);
22664
22665   // We require the first shuffle operand to be the SUB node, and the second to
22666   // be the ADD node.
22667   // FIXME: We should support the commuted patterns.
22668   if (V1->getOpcode() != ISD::FSUB || V2->getOpcode() != ISD::FADD)
22669     return SDValue();
22670
22671   // If there are other uses of these operations we can't fold them.
22672   if (!V1->hasOneUse() || !V2->hasOneUse())
22673     return SDValue();
22674
22675   // Ensure that both operations have the same operands. Note that we can
22676   // commute the FADD operands.
22677   SDValue LHS = V1->getOperand(0), RHS = V1->getOperand(1);
22678   if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
22679       (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
22680     return SDValue();
22681
22682   // We're looking for blends between FADD and FSUB nodes. We insist on these
22683   // nodes being lined up in a specific expected pattern.
22684   if (!(isShuffleEquivalent(Mask, 0, 3) ||
22685         isShuffleEquivalent(Mask, 0, 5, 2, 7) ||
22686         isShuffleEquivalent(Mask, 0, 9, 2, 11, 4, 13, 6, 15)))
22687     return SDValue();
22688
22689   // Only specific types are legal at this point, assert so we notice if and
22690   // when these change.
22691   assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v8f32 ||
22692           VT == MVT::v4f64) &&
22693          "Unknown vector type encountered!");
22694
22695   return DAG.getNode(X86ISD::ADDSUB, DL, VT, LHS, RHS);
22696 }
22697
22698 /// PerformShuffleCombine - Performs several different shuffle combines.
22699 static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
22700                                      TargetLowering::DAGCombinerInfo &DCI,
22701                                      const X86Subtarget *Subtarget) {
22702   SDLoc dl(N);
22703   SDValue N0 = N->getOperand(0);
22704   SDValue N1 = N->getOperand(1);
22705   EVT VT = N->getValueType(0);
22706
22707   // Don't create instructions with illegal types after legalize types has run.
22708   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
22709   if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType()))
22710     return SDValue();
22711
22712   // If we have legalized the vector types, look for blends of FADD and FSUB
22713   // nodes that we can fuse into an ADDSUB node.
22714   if (TLI.isTypeLegal(VT) && Subtarget->hasSSE3())
22715     if (SDValue AddSub = combineShuffleToAddSub(N, DAG))
22716       return AddSub;
22717
22718   // Combine 256-bit vector shuffles. This is only profitable when in AVX mode
22719   if (Subtarget->hasFp256() && VT.is256BitVector() &&
22720       N->getOpcode() == ISD::VECTOR_SHUFFLE)
22721     return PerformShuffleCombine256(N, DAG, DCI, Subtarget);
22722
22723   // During Type Legalization, when promoting illegal vector types,
22724   // the backend might introduce new shuffle dag nodes and bitcasts.
22725   //
22726   // This code performs the following transformation:
22727   // fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) ->
22728   //       (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>)
22729   //
22730   // We do this only if both the bitcast and the BINOP dag nodes have
22731   // one use. Also, perform this transformation only if the new binary
22732   // operation is legal. This is to avoid introducing dag nodes that
22733   // potentially need to be further expanded (or custom lowered) into a
22734   // less optimal sequence of dag nodes.
22735   if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() &&
22736       N1.getOpcode() == ISD::UNDEF && N0.hasOneUse() &&
22737       N0.getOpcode() == ISD::BITCAST) {
22738     SDValue BC0 = N0.getOperand(0);
22739     EVT SVT = BC0.getValueType();
22740     unsigned Opcode = BC0.getOpcode();
22741     unsigned NumElts = VT.getVectorNumElements();
22742
22743     if (BC0.hasOneUse() && SVT.isVector() &&
22744         SVT.getVectorNumElements() * 2 == NumElts &&
22745         TLI.isOperationLegal(Opcode, VT)) {
22746       bool CanFold = false;
22747       switch (Opcode) {
22748       default : break;
22749       case ISD::ADD :
22750       case ISD::FADD :
22751       case ISD::SUB :
22752       case ISD::FSUB :
22753       case ISD::MUL :
22754       case ISD::FMUL :
22755         CanFold = true;
22756       }
22757
22758       unsigned SVTNumElts = SVT.getVectorNumElements();
22759       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
22760       for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i)
22761         CanFold = SVOp->getMaskElt(i) == (int)(i * 2);
22762       for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i)
22763         CanFold = SVOp->getMaskElt(i) < 0;
22764
22765       if (CanFold) {
22766         SDValue BC00 = DAG.getNode(ISD::BITCAST, dl, VT, BC0.getOperand(0));
22767         SDValue BC01 = DAG.getNode(ISD::BITCAST, dl, VT, BC0.getOperand(1));
22768         SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01);
22769         return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, &SVOp->getMask()[0]);
22770       }
22771     }
22772   }
22773
22774   // Only handle 128 wide vector from here on.
22775   if (!VT.is128BitVector())
22776     return SDValue();
22777
22778   // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
22779   // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
22780   // consecutive, non-overlapping, and in the right order.
22781   SmallVector<SDValue, 16> Elts;
22782   for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
22783     Elts.push_back(getShuffleScalarElt(N, i, DAG, 0));
22784
22785   SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true);
22786   if (LD.getNode())
22787     return LD;
22788
22789   if (isTargetShuffle(N->getOpcode())) {
22790     SDValue Shuffle =
22791         PerformTargetShuffleCombine(SDValue(N, 0), DAG, DCI, Subtarget);
22792     if (Shuffle.getNode())
22793       return Shuffle;
22794
22795     // Try recursively combining arbitrary sequences of x86 shuffle
22796     // instructions into higher-order shuffles. We do this after combining
22797     // specific PSHUF instruction sequences into their minimal form so that we
22798     // can evaluate how many specialized shuffle instructions are involved in
22799     // a particular chain.
22800     SmallVector<int, 1> NonceMask; // Just a placeholder.
22801     NonceMask.push_back(0);
22802     if (combineX86ShufflesRecursively(SDValue(N, 0), SDValue(N, 0), NonceMask,
22803                                       /*Depth*/ 1, /*HasPSHUFB*/ false, DAG,
22804                                       DCI, Subtarget))
22805       return SDValue(); // This routine will use CombineTo to replace N.
22806   }
22807
22808   return SDValue();
22809 }
22810
22811 /// PerformTruncateCombine - Converts truncate operation to
22812 /// a sequence of vector shuffle operations.
22813 /// It is possible when we truncate 256-bit vector to 128-bit vector
22814 static SDValue PerformTruncateCombine(SDNode *N, SelectionDAG &DAG,
22815                                       TargetLowering::DAGCombinerInfo &DCI,
22816                                       const X86Subtarget *Subtarget)  {
22817   return SDValue();
22818 }
22819
22820 /// XFormVExtractWithShuffleIntoLoad - Check if a vector extract from a target
22821 /// specific shuffle of a load can be folded into a single element load.
22822 /// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
22823 /// shuffles have been custom lowered so we need to handle those here.
22824 static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
22825                                          TargetLowering::DAGCombinerInfo &DCI) {
22826   if (DCI.isBeforeLegalizeOps())
22827     return SDValue();
22828
22829   SDValue InVec = N->getOperand(0);
22830   SDValue EltNo = N->getOperand(1);
22831
22832   if (!isa<ConstantSDNode>(EltNo))
22833     return SDValue();
22834
22835   EVT OriginalVT = InVec.getValueType();
22836
22837   if (InVec.getOpcode() == ISD::BITCAST) {
22838     // Don't duplicate a load with other uses.
22839     if (!InVec.hasOneUse())
22840       return SDValue();
22841     EVT BCVT = InVec.getOperand(0).getValueType();
22842     if (BCVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
22843       return SDValue();
22844     InVec = InVec.getOperand(0);
22845   }
22846
22847   EVT CurrentVT = InVec.getValueType();
22848
22849   if (!isTargetShuffle(InVec.getOpcode()))
22850     return SDValue();
22851
22852   // Don't duplicate a load with other uses.
22853   if (!InVec.hasOneUse())
22854     return SDValue();
22855
22856   SmallVector<int, 16> ShuffleMask;
22857   bool UnaryShuffle;
22858   if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(),
22859                             ShuffleMask, UnaryShuffle))
22860     return SDValue();
22861
22862   // Select the input vector, guarding against out of range extract vector.
22863   unsigned NumElems = CurrentVT.getVectorNumElements();
22864   int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
22865   int Idx = (Elt > (int)NumElems) ? -1 : ShuffleMask[Elt];
22866   SDValue LdNode = (Idx < (int)NumElems) ? InVec.getOperand(0)
22867                                          : InVec.getOperand(1);
22868
22869   // If inputs to shuffle are the same for both ops, then allow 2 uses
22870   unsigned AllowedUses = InVec.getNumOperands() > 1 &&
22871                          InVec.getOperand(0) == InVec.getOperand(1) ? 2 : 1;
22872
22873   if (LdNode.getOpcode() == ISD::BITCAST) {
22874     // Don't duplicate a load with other uses.
22875     if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
22876       return SDValue();
22877
22878     AllowedUses = 1; // only allow 1 load use if we have a bitcast
22879     LdNode = LdNode.getOperand(0);
22880   }
22881
22882   if (!ISD::isNormalLoad(LdNode.getNode()))
22883     return SDValue();
22884
22885   LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);
22886
22887   if (!LN0 ||!LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
22888     return SDValue();
22889
22890   EVT EltVT = N->getValueType(0);
22891   // If there's a bitcast before the shuffle, check if the load type and
22892   // alignment is valid.
22893   unsigned Align = LN0->getAlignment();
22894   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
22895   unsigned NewAlign = TLI.getDataLayout()->getABITypeAlignment(
22896       EltVT.getTypeForEVT(*DAG.getContext()));
22897
22898   if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT))
22899     return SDValue();
22900
22901   // All checks match so transform back to vector_shuffle so that DAG combiner
22902   // can finish the job
22903   SDLoc dl(N);
22904
22905   // Create shuffle node taking into account the case that its a unary shuffle
22906   SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT)
22907                                    : InVec.getOperand(1);
22908   Shuffle = DAG.getVectorShuffle(CurrentVT, dl,
22909                                  InVec.getOperand(0), Shuffle,
22910                                  &ShuffleMask[0]);
22911   Shuffle = DAG.getNode(ISD::BITCAST, dl, OriginalVT, Shuffle);
22912   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
22913                      EltNo);
22914 }
22915
22916 /// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index
22917 /// generation and convert it from being a bunch of shuffles and extracts
22918 /// into a somewhat faster sequence. For i686, the best sequence is apparently
22919 /// storing the value and loading scalars back, while for x64 we should
22920 /// use 64-bit extracts and shifts.
22921 static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
22922                                          TargetLowering::DAGCombinerInfo &DCI) {
22923   SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI);
22924   if (NewOp.getNode())
22925     return NewOp;
22926
22927   SDValue InputVector = N->getOperand(0);
22928
22929   // Detect whether we are trying to convert from mmx to i32 and the bitcast
22930   // from mmx to v2i32 has a single usage.
22931   if (InputVector.getNode()->getOpcode() == llvm::ISD::BITCAST &&
22932       InputVector.getNode()->getOperand(0).getValueType() == MVT::x86mmx &&
22933       InputVector.hasOneUse() && N->getValueType(0) == MVT::i32)
22934     return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector),
22935                        N->getValueType(0),
22936                        InputVector.getNode()->getOperand(0));
22937
22938   // Only operate on vectors of 4 elements, where the alternative shuffling
22939   // gets to be more expensive.
22940   if (InputVector.getValueType() != MVT::v4i32)
22941     return SDValue();
22942
22943   // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
22944   // single use which is a sign-extend or zero-extend, and all elements are
22945   // used.
22946   SmallVector<SDNode *, 4> Uses;
22947   unsigned ExtractedElements = 0;
22948   for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
22949        UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
22950     if (UI.getUse().getResNo() != InputVector.getResNo())
22951       return SDValue();
22952
22953     SDNode *Extract = *UI;
22954     if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
22955       return SDValue();
22956
22957     if (Extract->getValueType(0) != MVT::i32)
22958       return SDValue();
22959     if (!Extract->hasOneUse())
22960       return SDValue();
22961     if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
22962         Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
22963       return SDValue();
22964     if (!isa<ConstantSDNode>(Extract->getOperand(1)))
22965       return SDValue();
22966
22967     // Record which element was extracted.
22968     ExtractedElements |=
22969       1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue();
22970
22971     Uses.push_back(Extract);
22972   }
22973
22974   // If not all the elements were used, this may not be worthwhile.
22975   if (ExtractedElements != 15)
22976     return SDValue();
22977
22978   // Ok, we've now decided to do the transformation.
22979   // If 64-bit shifts are legal, use the extract-shift sequence,
22980   // otherwise bounce the vector off the cache.
22981   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
22982   SDValue Vals[4];
22983   SDLoc dl(InputVector);
22984
22985   if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) {
22986     SDValue Cst = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, InputVector);
22987     EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy();
22988     SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
22989       DAG.getConstant(0, VecIdxTy));
22990     SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
22991       DAG.getConstant(1, VecIdxTy));
22992
22993     SDValue ShAmt = DAG.getConstant(32,
22994       DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64));
22995     Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf);
22996     Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
22997       DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt));
22998     Vals[2] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, TopHalf);
22999     Vals[3] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
23000       DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt));
23001   } else {
23002     // Store the value to a temporary stack slot.
23003     SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
23004     SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
23005       MachinePointerInfo(), false, false, 0);
23006
23007     EVT ElementType = InputVector.getValueType().getVectorElementType();
23008     unsigned EltSize = ElementType.getSizeInBits() / 8;
23009
23010     // Replace each use (extract) with a load of the appropriate element.
23011     for (unsigned i = 0; i < 4; ++i) {
23012       uint64_t Offset = EltSize * i;
23013       SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy());
23014
23015       SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(),
23016                                        StackPtr, OffsetVal);
23017
23018       // Load the scalar.
23019       Vals[i] = DAG.getLoad(ElementType, dl, Ch,
23020                             ScalarAddr, MachinePointerInfo(),
23021                             false, false, false, 0);
23022
23023     }
23024   }
23025
23026   // Replace the extracts
23027   for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
23028     UE = Uses.end(); UI != UE; ++UI) {
23029     SDNode *Extract = *UI;
23030
23031     SDValue Idx = Extract->getOperand(1);
23032     uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
23033     DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]);
23034   }
23035
23036   // The replacement was made in place; don't return anything.
23037   return SDValue();
23038 }
23039
23040 /// \brief Matches a VSELECT onto min/max or returns 0 if the node doesn't match.
      ///
      /// \p Cond is read as a SETCC-style node: operands 0 and 1 are the compared
      /// values and operand 2 is cast to a CondCodeSDNode. \p LHS and \p RHS are
      /// the two arms of the select (LHS is the value chosen when the condition
      /// holds) and \p VT is the type the match is performed for.
      ///
      /// \returns a pair (Opc, NeedSplit). Opc is one of X86ISD::UMIN, UMAX, SMIN
      /// or SMAX when the select matches, and 0 otherwise. NeedSplit can only be
      /// true for v32i8/v16i16/v8i32 on a subtarget that has AVX but not AVX2.
23041 static std::pair<unsigned, bool>
23042 matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, SDValue RHS,
23043                    SelectionDAG &DAG, const X86Subtarget *Subtarget) {
        // Only vector types are matched here.
23044   if (!VT.isVector())
23045     return std::make_pair(0, false);
23046
23047   bool NeedSplit = false;
        // Filter by vector type: every group of types below is only accepted when
        // the subtarget feature tested in that group is available.
23048   switch (VT.getSimpleVT().SimpleTy) {
23049   default: return std::make_pair(0, false);
23050   case MVT::v4i64:
23051   case MVT::v2i64:
23052     if (!Subtarget->hasVLX())
23053       return std::make_pair(0, false);
23054     break;
23055   case MVT::v64i8:
23056   case MVT::v32i16:
23057     if (!Subtarget->hasBWI())
23058       return std::make_pair(0, false);
23059     break;
23060   case MVT::v16i32:
23061   case MVT::v8i64:
23062     if (!Subtarget->hasAVX512())
23063       return std::make_pair(0, false);
23064     break;
        // Without AVX2 the caller is asked to split the operation (NeedSplit);
        // without AVX there is no match at all.
23065   case MVT::v32i8:
23066   case MVT::v16i16:
23067   case MVT::v8i32:
23068     if (!Subtarget->hasAVX2())
23069       NeedSplit = true;
23070     if (!Subtarget->hasAVX())
23071       return std::make_pair(0, false);
23072     break;
23073   case MVT::v16i8:
23074   case MVT::v8i16:
23075   case MVT::v4i32:
23076     if (!Subtarget->hasSSE2())
23077       return std::make_pair(0, false);
          // Last case of the switch: control simply leaves the switch here.
23078   }
23079
23080   // SSE2 has only a small subset of the operations.
        // Without SSE4.1, unsigned min/max is only matched for v16i8 and signed
        // min/max only for v8i16.
23081   bool hasUnsigned = Subtarget->hasSSE41() ||
23082                      (Subtarget->hasSSE2() && VT == MVT::v16i8);
23083   bool hasSigned = Subtarget->hasSSE41() ||
23084                    (Subtarget->hasSSE2() && VT == MVT::v8i16);
23085
23086   ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
23087
        // Opc stays 0 (no match) unless one of the two operand arrangements below
        // matches and the required signed/unsigned support is present.
23088   unsigned Opc = 0;
23089   // Check for x CC y ? x : y.
23090   if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
23091       DAG.isEqualTo(RHS, Cond.getOperand(1))) {
23092     switch (CC) {
23093     default: break;
23094     case ISD::SETULT:
23095     case ISD::SETULE:
23096       Opc = hasUnsigned ? X86ISD::UMIN : 0; break;
23097     case ISD::SETUGT:
23098     case ISD::SETUGE:
23099       Opc = hasUnsigned ? X86ISD::UMAX : 0; break;
23100     case ISD::SETLT:
23101     case ISD::SETLE:
23102       Opc = hasSigned ? X86ISD::SMIN : 0; break;
23103     case ISD::SETGT:
23104     case ISD::SETGE:
23105       Opc = hasSigned ? X86ISD::SMAX : 0; break;
23106     }
23107   // Check for x CC y ? y : x -- a min/max with reversed arms.
        // The mapping is the mirror image of the one above: "less than" selects
        // the max opcode and "greater than" selects the min opcode.
23108   } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
23109              DAG.isEqualTo(RHS, Cond.getOperand(0))) {
23110     switch (CC) {
23111     default: break;
23112     case ISD::SETULT:
23113     case ISD::SETULE:
23114       Opc = hasUnsigned ? X86ISD::UMAX : 0; break;
23115     case ISD::SETUGT:
23116     case ISD::SETUGE:
23117       Opc = hasUnsigned ? X86ISD::UMIN : 0; break;
23118     case ISD::SETLT:
23119     case ISD::SETLE:
23120       Opc = hasSigned ? X86ISD::SMAX : 0; break;
23121     case ISD::SETGT:
23122     case ISD::SETGE:
23123       Opc = hasSigned ? X86ISD::SMIN : 0; break;
23124     }
23125   }
23126
23127   return std::make_pair(Opc, NeedSplit);
23128 }
23129
      /// Try to rewrite the select node \p N (operand 0 = condition, operand 1 =
      /// LHS, operand 2 = RHS) as a vector shuffle of LHS and RHS. This is only
      /// attempted when the condition is a BUILD_VECTOR of constants.
      ///
      /// \returns the shuffle created by DAG.getVectorShuffle, or an empty SDValue
      /// when the condition is not a constant build vector, when both data operands
      /// are constant build vectors, when BUILD_VECTORtoBlendMask fails, or when
      /// the target reports the resulting shuffle mask as not legal.
      ///
      /// \p Subtarget is not referenced in the body of this function.
23130 static SDValue
23131 transformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG,
23132                                       const X86Subtarget *Subtarget) {
23133   SDLoc dl(N);
23134   SDValue Cond = N->getOperand(0);
23135   SDValue LHS = N->getOperand(1);
23136   SDValue RHS = N->getOperand(2);
23137
        // Look through sign_extend(sign_extend_inreg(X)) and use X as the
        // condition. A sign_extend of anything else is left untouched.
23138   if (Cond.getOpcode() == ISD::SIGN_EXTEND) {
23139     SDValue CondSrc = Cond->getOperand(0);
23140     if (CondSrc->getOpcode() == ISD::SIGN_EXTEND_INREG)
23141       Cond = CondSrc->getOperand(0);
23142   }
23143
23144   if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
23145     return SDValue();
23146
23147   // A vselect where all conditions and data are constants can be optimized into
23148   // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
23149   if (ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
23150       ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
23151     return SDValue();
23152
        // Convert the constant condition into a bit mask; bail out if the helper
        // rejects it.
        // NOTE(review): BUILD_VECTORtoBlendMask is defined elsewhere in this file;
        // presumably it produces one mask bit per condition element -- confirm.
23153   unsigned MaskValue = 0;
23154   if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue))
23155     return SDValue();
23156
23157   MVT VT = N->getSimpleValueType(0);
23158   unsigned NumElems = VT.getVectorNumElements();
        // Start with every lane undefined (-1) and fill in the defined lanes.
23159   SmallVector<int, 8> ShuffleMask(NumElems, -1);
23160   for (unsigned i = 0; i < NumElems; ++i) {
23161     // Be sure we emit undef where we can.
23162     if (Cond.getOperand(i)->getOpcode() == ISD::UNDEF)
23163       ShuffleMask[i] = -1;
23164     else
            // Lane i gets index i when bit i of MaskValue is clear and index
            // i + NumElems when it is set.
23165       ShuffleMask[i] = i + NumElems * ((MaskValue >> i) & 1);
23166   }
23167
        // Only build the shuffle if the target accepts this mask for VT.
23168   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23169   if (!TLI.isShuffleMaskLegal(ShuffleMask, VT))
23170     return SDValue();
23171   return DAG.getVectorShuffle(VT, dl, LHS, RHS, &ShuffleMask[0]);
23172 }
23173
23174 /// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT
23175 /// nodes.
      ///
      /// \p N is the select node (operand 0 = condition, operand 1 = true value,
      /// operand 2 = false value). The following combines are tried in order:
      ///  - floating-point select of a SETCC -> X86ISD::FMIN / X86ISD::FMAX;
      ///  - with AVX-512 but without BWI+VLX, sign-extend a vXi1 condition of a
      ///    128/256-bit i8/i16 vector select to the operand type;
      ///  - select between two integer constants -> zext/shl/add/mul arithmetic
      ///    on the condition;
      ///  - scalar (x > y) ? x : y and (x < y) ? x : y -> the >= / <= form;
      ///  - VSELECT patterns -> X86ISD::SUBUS, integer min/max, AND/OR with the
      ///    condition, X86ISD::SHRUNKBLEND, or a vector shuffle.
      ///
      /// \returns the replacement value, or an empty SDValue when nothing was
      /// combined or when users were already rewritten in place through
      /// DAG.ReplaceAllUsesOfValueWith.
23176 static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
23177                                     TargetLowering::DAGCombinerInfo &DCI,
23178                                     const X86Subtarget *Subtarget) {
23179   SDLoc DL(N);
23180   SDValue Cond = N->getOperand(0);
23181   // Get the LHS/RHS of the select.
23182   SDValue LHS = N->getOperand(1);
23183   SDValue RHS = N->getOperand(2);
23184   EVT VT = LHS.getValueType();
23185   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23186
23187   // If we have SSE[12] support, try to form min/max nodes. SSE min/max
23188   // instructions match the semantics of the common C idiom x<y?x:y but not
23189   // x<=y?x:y, because of how they handle negative zero (which can be
23190   // ignored in unsafe-math mode).
23191   // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
23192   if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
23193       VT != MVT::f80 && (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
23194       (Subtarget->hasSSE2() ||
23195        (Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) {
23196     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
23197
          // Opcode stays 0 unless one of the cases below decides the select can be
          // expressed as FMIN/FMAX. Some cases swap LHS and RHS first; the swapped
          // values are the operands of the node built after the switches.
23198     unsigned Opcode = 0;
23199     // Check for x CC y ? x : y.
23200     if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
23201         DAG.isEqualTo(RHS, Cond.getOperand(1))) {
23202       switch (CC) {
23203       default: break;
23204       case ISD::SETULT:
23205         // Converting this to a min would handle NaNs incorrectly, and swapping
23206         // the operands would cause it to handle comparisons between positive
23207         // and negative zero incorrectly.
23208         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
23209           if (!DAG.getTarget().Options.UnsafeFPMath &&
23210               !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
23211             break;
23212           std::swap(LHS, RHS);
23213         }
23214         Opcode = X86ISD::FMIN;
23215         break;
23216       case ISD::SETOLE:
23217         // Converting this to a min would handle comparisons between positive
23218         // and negative zero incorrectly.
23219         if (!DAG.getTarget().Options.UnsafeFPMath &&
23220             !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
23221           break;
23222         Opcode = X86ISD::FMIN;
23223         break;
23224       case ISD::SETULE:
23225         // Converting this to a min would handle both negative zeros and NaNs
23226         // incorrectly, but we can swap the operands to fix both.
23227         std::swap(LHS, RHS);
              // Intentional fall-through into the FMIN cases below.
23228       case ISD::SETOLT:
23229       case ISD::SETLT:
23230       case ISD::SETLE:
23231         Opcode = X86ISD::FMIN;
23232         break;
23233
23234       case ISD::SETOGE:
23235         // Converting this to a max would handle comparisons between positive
23236         // and negative zero incorrectly.
23237         if (!DAG.getTarget().Options.UnsafeFPMath &&
23238             !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
23239           break;
23240         Opcode = X86ISD::FMAX;
23241         break;
23242       case ISD::SETUGT:
23243         // Converting this to a max would handle NaNs incorrectly, and swapping
23244         // the operands would cause it to handle comparisons between positive
23245         // and negative zero incorrectly.
23246         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
23247           if (!DAG.getTarget().Options.UnsafeFPMath &&
23248               !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
23249             break;
23250           std::swap(LHS, RHS);
23251         }
23252         Opcode = X86ISD::FMAX;
23253         break;
23254       case ISD::SETUGE:
23255         // Converting this to a max would handle both negative zeros and NaNs
23256         // incorrectly, but we can swap the operands to fix both.
23257         std::swap(LHS, RHS);
              // Intentional fall-through into the FMAX cases below.
23258       case ISD::SETOGT:
23259       case ISD::SETGT:
23260       case ISD::SETGE:
23261         Opcode = X86ISD::FMAX;
23262         break;
23263       }
23264     // Check for x CC y ? y : x -- a min/max with reversed arms.
23265     } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
23266                DAG.isEqualTo(RHS, Cond.getOperand(0))) {
23267       switch (CC) {
23268       default: break;
23269       case ISD::SETOGE:
23270         // Converting this to a min would handle comparisons between positive
23271         // and negative zero incorrectly, and swapping the operands would
23272         // cause it to handle NaNs incorrectly.
23273         if (!DAG.getTarget().Options.UnsafeFPMath &&
23274             !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
23275           if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
23276             break;
23277           std::swap(LHS, RHS);
23278         }
23279         Opcode = X86ISD::FMIN;
23280         break;
23281       case ISD::SETUGT:
23282         // Converting this to a min would handle NaNs incorrectly.
23283         if (!DAG.getTarget().Options.UnsafeFPMath &&
23284             (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
23285           break;
23286         Opcode = X86ISD::FMIN;
23287         break;
23288       case ISD::SETUGE:
23289         // Converting this to a min would handle both negative zeros and NaNs
23290         // incorrectly, but we can swap the operands to fix both.
23291         std::swap(LHS, RHS);
              // Intentional fall-through into the FMIN cases below.
23292       case ISD::SETOGT:
23293       case ISD::SETGT:
23294       case ISD::SETGE:
23295         Opcode = X86ISD::FMIN;
23296         break;
23297
23298       case ISD::SETULT:
23299         // Converting this to a max would handle NaNs incorrectly.
              // Note: unlike the SETUGT case above, this check is not relaxed by
              // UnsafeFPMath.
23300         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
23301           break;
23302         Opcode = X86ISD::FMAX;
23303         break;
23304       case ISD::SETOLE:
23305         // Converting this to a max would handle comparisons between positive
23306         // and negative zero incorrectly, and swapping the operands would
23307         // cause it to handle NaNs incorrectly.
23308         if (!DAG.getTarget().Options.UnsafeFPMath &&
23309             !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
23310           if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
23311             break;
23312           std::swap(LHS, RHS);
23313         }
23314         Opcode = X86ISD::FMAX;
23315         break;
23316       case ISD::SETULE:
23317         // Converting this to a max would handle both negative zeros and NaNs
23318         // incorrectly, but we can swap the operands to fix both.
23319         std::swap(LHS, RHS);
              // Intentional fall-through into the FMAX cases below.
23320       case ISD::SETOLT:
23321       case ISD::SETLT:
23322       case ISD::SETLE:
23323         Opcode = X86ISD::FMAX;
23324         break;
23325       }
23326     }
23327
23328     if (Opcode)
23329       return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
23330   }
23331
23332   EVT CondVT = Cond.getValueType();
23333   if (Subtarget->hasAVX512() && VT.isVector() && CondVT.isVector() &&
23334       CondVT.getVectorElementType() == MVT::i1) {
23335     // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
23336     // lowering on KNL. In this case we convert it to
23337     // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
23338     // The same situation for all 128 and 256-bit vectors of i8 and i16.
23339     // Since SKX these selects have a proper lowering.
23340     EVT OpVT = LHS.getValueType();
23341     if ((OpVT.is128BitVector() || OpVT.is256BitVector()) &&
23342         (OpVT.getVectorElementType() == MVT::i8 ||
23343          OpVT.getVectorElementType() == MVT::i16) &&
23344         !(Subtarget->hasBWI() && Subtarget->hasVLX())) {
23345       Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, OpVT, Cond);
23346       DCI.AddToWorklist(Cond.getNode());
23347       return DAG.getNode(N->getOpcode(), DL, OpVT, Cond, LHS, RHS);
23348     }
23349   }
23350   // If this is a select between two integer constants, try to do some
23351   // optimizations.
23352   if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) {
23353     if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS))
23354       // Don't do this for crazy integer types.
23355       if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) {
23356         // If this is efficiently invertible, canonicalize the LHSC/RHSC values
23357         // so that TrueC (the true value) is larger than FalseC.
23358         bool NeedsCondInvert = false;
23359
23360         if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
23361             // Efficiently invertible.
23362             (Cond.getOpcode() == ISD::SETCC ||  // setcc -> invertible.
23363              (Cond.getOpcode() == ISD::XOR &&   // xor(X, C) -> invertible.
23364               isa<ConstantSDNode>(Cond.getOperand(1))))) {
23365           NeedsCondInvert = true;
23366           std::swap(TrueC, FalseC);
23367         }
23368
23369         // Optimize C ? 8 : 0 -> zext(C) << 3.  Likewise for any pow2/0.
23370         if (FalseC->getAPIntValue() == 0 &&
23371             TrueC->getAPIntValue().isPowerOf2()) {
23372           if (NeedsCondInvert) // Invert the condition if needed.
23373             Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
23374                                DAG.getConstant(1, Cond.getValueType()));
23375
23376           // Zero extend the condition if needed.
23377           Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);
23378
23379           unsigned ShAmt = TrueC->getAPIntValue().logBase2();
23380           return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
23381                              DAG.getConstant(ShAmt, MVT::i8));
23382         }
23383
23384         // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst.
23385         if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
23386           if (NeedsCondInvert) // Invert the condition if needed.
23387             Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
23388                                DAG.getConstant(1, Cond.getValueType()));
23389
23390           // Zero extend the condition if needed.
23391           Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
23392                              FalseC->getValueType(0), Cond);
23393           return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
23394                              SDValue(FalseC, 0));
23395         }
23396
23397         // Optimize cases that will turn into an LEA instruction.  This requires
23398         // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
23399         if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
23400           uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
                // For i32, keep only the low 32 bits of the difference.
23401           if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
23402
23403           bool isFastMultiplier = false;
23404           if (Diff < 10) {
23405             switch ((unsigned char)Diff) {
23406               default: break;
23407               case 1:  // result = add base, cond
23408               case 2:  // result = lea base(    , cond*2)
23409               case 3:  // result = lea base(cond, cond*2)
23410               case 4:  // result = lea base(    , cond*4)
23411               case 5:  // result = lea base(cond, cond*4)
23412               case 8:  // result = lea base(    , cond*8)
23413               case 9:  // result = lea base(cond, cond*8)
23414                 isFastMultiplier = true;
23415                 break;
23416             }
23417           }
23418
23419           if (isFastMultiplier) {
                  // Note: this APInt Diff shadows the uint64_t Diff declared above.
23420             APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
23421             if (NeedsCondInvert) // Invert the condition if needed.
23422               Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
23423                                  DAG.getConstant(1, Cond.getValueType()));
23424
23425             // Zero extend the condition if needed.
23426             Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
23427                                Cond);
23428             // Scale the condition by the difference.
23429             if (Diff != 1)
23430               Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
23431                                  DAG.getConstant(Diff, Cond.getValueType()));
23432
23433             // Add the base if non-zero.
23434             if (FalseC->getAPIntValue() != 0)
23435               Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
23436                                  SDValue(FalseC, 0));
23437             return Cond;
23438           }
23439         }
23440       }
23441   }
23442
23443   // Canonicalize max and min:
23444   // (x > y) ? x : y -> (x >= y) ? x : y
23445   // (x < y) ? x : y -> (x <= y) ? x : y
23446   // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
23447   // the need for an extra compare
23448   // against zero. e.g.
23449   // (x - y) > 0 ? (x - y) : 0 -> (x - y) >= 0 ? (x - y) : 0
23450   // subl   %esi, %edi
23451   // testl  %edi, %edi
23452   // movl   $0, %eax
23453   // cmovgl %edi, %eax
23454   // =>
23455   // xorl   %eax, %eax
23456   // subl   %esi, %edi
23457   // cmovsl %eax, %edi
23458   if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
23459       DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
23460       DAG.isEqualTo(RHS, Cond.getOperand(1))) {
23461     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
23462     switch (CC) {
23463     default: break;
23464     case ISD::SETLT:
23465     case ISD::SETGT: {
23466       ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
23467       Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
23468                           Cond.getOperand(0), Cond.getOperand(1), NewCC);
23469       return DAG.getNode(ISD::SELECT, DL, VT, Cond, LHS, RHS);
23470     }
23471     }
23472   }
23473
23474   // Early exit check
        // Everything below this point is only attempted for legal types.
23475   if (!TLI.isTypeLegal(VT))
23476     return SDValue();
23477
23478   // Match VSELECTs into subs with unsigned saturation.
23479   if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
23480       // psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
23481       ((Subtarget->hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
23482        (Subtarget->hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
23483     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
23484
23485     // Check if one of the arms of the VSELECT is a zero vector. If it's on the
23486     // left side invert the predicate to simplify logic below.
          // Other is left empty when neither arm is an all-zeros build vector.
23487     SDValue Other;
23488     if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
23489       Other = RHS;
23490       CC = ISD::getSetCCInverse(CC, true);
23491     } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
23492       Other = LHS;
23493     }
23494
          // The non-zero arm must be a two-operand node whose first operand is the
          // value being compared by the condition.
23495     if (Other.getNode() && Other->getNumOperands() == 2 &&
23496         DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) {
23497       SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
23498       SDValue CondRHS = Cond->getOperand(1);
23499
23500       // Look for a general sub with unsigned saturation first.
23501       // x >= y ? x-y : 0 --> subus x, y
23502       // x >  y ? x-y : 0 --> subus x, y
23503       if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
23504           Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))
23505         return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
23506
23507       if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS))
23508         if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) {
23509           if (auto *CondRHSBV = dyn_cast<BuildVectorSDNode>(CondRHS))
23510             if (auto *CondRHSConst = CondRHSBV->getConstantSplatNode())
23511               // If the RHS is a constant we have to reverse the const
23512               // canonicalization.
23513               // x > C-1 ? x+-C : 0 --> subus x, C
23514               if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
23515                   CondRHSConst->getAPIntValue() ==
23516                       (-OpRHSConst->getAPIntValue() - 1))
23517                 return DAG.getNode(
23518                     X86ISD::SUBUS, DL, VT, OpLHS,
23519                     DAG.getConstant(-OpRHSConst->getAPIntValue(), VT));
23520
23521           // Another special case: If C was a sign bit, the sub has been
23522           // canonicalized into a xor.
23523           // FIXME: Would it be better to use computeKnownBits to determine
23524           //        whether it's safe to decanonicalize the xor?
23525           // x s< 0 ? x^C : 0 --> subus x, C
23526           if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
23527               ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
23528               OpRHSConst->getAPIntValue().isSignBit())
23529             // Note that we have to rebuild the RHS constant here to ensure we
23530             // don't rely on particular values of undef lanes.
23531             return DAG.getNode(
23532                 X86ISD::SUBUS, DL, VT, OpLHS,
23533                 DAG.getConstant(OpRHSConst->getAPIntValue(), VT));
23534         }
23535     }
23536   }
23537
23538   // Try to match a min/max vector operation.
23539   if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC) {
23540     std::pair<unsigned, bool> ret = matchIntegerMINMAX(Cond, VT, LHS, RHS, DAG, Subtarget);
23541     unsigned Opc = ret.first;
23542     bool NeedSplit = ret.second;
23543
          // When a split is requested, build the min/max on each 128-bit half and
          // concatenate the two results back into a VT-sized vector.
23544     if (Opc && NeedSplit) {
23545       unsigned NumElems = VT.getVectorNumElements();
23546       // Extract the LHS vectors
23547       SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, DL);
23548       SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, DL);
23549
23550       // Extract the RHS vectors
23551       SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, DL);
23552       SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, DL);
23553
23554       // Create min/max for each subvector
            // LHS and RHS are reused here to hold the low-half and high-half
            // results respectively.
23555       LHS = DAG.getNode(Opc, DL, LHS1.getValueType(), LHS1, RHS1);
23556       RHS = DAG.getNode(Opc, DL, LHS2.getValueType(), LHS2, RHS2);
23557
23558       // Merge the result
23559       return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LHS, RHS);
23560     } else if (Opc)
23561       return DAG.getNode(Opc, DL, VT, LHS, RHS);
23562   }
23563
23564   // Simplify vector selection if condition value type matches vselect
23565   // operand type
23566   if (N->getOpcode() == ISD::VSELECT && CondVT == VT) {
23567     assert(Cond.getValueType().isVector() &&
23568            "vector select expects a vector selector!");
23569
23570     bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
23571     bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
23572
23573     // Try to invert the condition if true value is not all 1s and false value
23574     // is not all 0s.
23575     if (!TValIsAllOnes && !FValIsAllZeros &&
23576         // Check if the selector will be produced by CMPP*/PCMP*
23577         Cond.getOpcode() == ISD::SETCC &&
23578         // Check if SETCC has already been promoted
23579         TLI.getSetCCResultType(*DAG.getContext(), VT) == CondVT) {
23580       bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
23581       bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
23582
23583       if (TValIsAllZeros || FValIsAllOnes) {
23584         SDValue CC = Cond.getOperand(2);
23585         ISD::CondCode NewCC =
23586           ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
23587                                Cond.getOperand(0).getValueType().isInteger());
23588         Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1), NewCC);
23589         std::swap(LHS, RHS);
              // The arms were swapped, so what was known about the old false arm
              // now describes the true arm, and vice versa.
23590         TValIsAllOnes = FValIsAllOnes;
23591         FValIsAllZeros = TValIsAllZeros;
23592       }
23593     }
23594
          // With an all-ones true arm and/or an all-zeros false arm the select
          // reduces to the condition itself, an OR with it, or an AND with it.
23595     if (TValIsAllOnes || FValIsAllZeros) {
23596       SDValue Ret;
23597
23598       if (TValIsAllOnes && FValIsAllZeros)
23599         Ret = Cond;
23600       else if (TValIsAllOnes)
23601         Ret = DAG.getNode(ISD::OR, DL, CondVT, Cond,
23602                           DAG.getNode(ISD::BITCAST, DL, CondVT, RHS));
23603       else if (FValIsAllZeros)
23604         Ret = DAG.getNode(ISD::AND, DL, CondVT, Cond,
23605                           DAG.getNode(ISD::BITCAST, DL, CondVT, LHS));
23606
23607       return DAG.getNode(ISD::BITCAST, DL, VT, Ret);
23608     }
23609   }
23610
23611   // If we know that this node is legal then we know that it is going to be
23612   // matched by one of the SSE/AVX BLEND instructions. These instructions only
23613   // depend on the highest bit in each word. Try to use SimplifyDemandedBits
23614   // to simplify previous instructions.
23615   if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
23616       !DCI.isBeforeLegalize() &&
23617       // We explicitly check against v8i16 and v16i16 because, although
23618       // they're marked as Custom, they might only be legal when Cond is a
23619       // build_vector of constants. This will be taken care of in a later
23620       // condition.
23621       (TLI.isOperationLegalOrCustom(ISD::VSELECT, VT) && VT != MVT::v16i16 &&
23622        VT != MVT::v8i16) &&
23623       // Don't optimize vector of constants. Those are handled by
23624       // the generic code and all the bits must be properly set for
23625       // the generic optimizer.
23626       !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
23627     unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits();
23628
23629     // Don't optimize vector selects that map to mask-registers.
23630     if (BitWidth == 1)
23631       return SDValue();
23632
23633     assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
          // Only the top bit of each condition element is demanded.
23634     APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1);
23635
23636     APInt KnownZero, KnownOne;
23637     TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
23638                                           DCI.isBeforeLegalizeOps());
23639     if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) ||
23640         TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne,
23641                                  TLO)) {
23642       // If we changed the computation somewhere in the DAG, this change
23643       // will affect all users of Cond.
23644       // Make sure it is fine and update all the nodes so that we do not
23645       // use the generic VSELECT anymore. Otherwise, we may perform
23646       // wrong optimizations as we messed up with the actual expectation
23647       // for the vector boolean values.
23648       if (Cond != TLO.Old) {
23649         // Check all uses of that condition operand to check whether it will be
23650         // consumed by non-BLEND instructions, which may depend on all bits
23651         // being set properly.
23652         for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
23653              I != E; ++I)
23654           if (I->getOpcode() != ISD::VSELECT)
23655             // TODO: Add other opcodes eventually lowered into BLEND.
23656             return SDValue();
23657
23658         // Update all the users of the condition, before committing the change,
23659         // so that the VSELECT optimizations that expect the correct vector
23660         // boolean value will not be triggered.
23661         for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
23662              I != E; ++I)
23663           DAG.ReplaceAllUsesOfValueWith(
23664               SDValue(*I, 0),
23665               DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(*I), I->getValueType(0),
23666                           Cond, I->getOperand(1), I->getOperand(2)));
23667         DCI.CommitTargetLoweringOpt(TLO);
              // The users were rewritten in place above; nothing to return.
23668         return SDValue();
23669       }
23670       // At this point, only Cond is changed. Change the condition
23671       // just for N to keep the opportunity to optimize all other
23672       // users their own way.
23673       DAG.ReplaceAllUsesOfValueWith(
23674           SDValue(N, 0),
23675           DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(N), N->getValueType(0),
23676                       TLO.New, N->getOperand(1), N->getOperand(2)));
            // N was replaced in place above; nothing to return.
23677       return SDValue();
23678     }
23679   }
23680
23681   // We should generate an X86ISD::BLENDI from a vselect if its argument
23682   // is a sign_extend_inreg of an any_extend of a BUILD_VECTOR of
23683   // constants. This specific pattern gets generated when we split a
23684   // selector for a 512 bit vector in a machine without AVX512 (but with
23685   // 256-bit vectors), during legalization:
23686   //
23687   // (vselect (sign_extend (any_extend (BUILD_VECTOR)) i1) LHS RHS)
23688   //
23689   // Iff we find this pattern and the build_vectors are built from
23690   // constants, we translate the vselect into a shuffle_vector that we
23691   // know will be matched by LowerVECTOR_SHUFFLEtoBlend.
23692   if ((N->getOpcode() == ISD::VSELECT ||
23693        N->getOpcode() == X86ISD::SHRUNKBLEND) &&
23694       !DCI.isBeforeLegalize()) {
23695     SDValue Shuffle = transformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget);
23696     if (Shuffle.getNode())
23697       return Shuffle;
23698   }
23699
        // No combine applied.
23700   return SDValue();
23701 }
23702
23703 // Check whether a boolean test is testing a boolean value generated by
23704 // X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
23705 // code.
23706 //
23707 // Simplify the following patterns:
23708 // (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
23709 // (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
23710 // to (Op EFLAGS Cond)
23711 //
23712 // (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
23713 // (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
23714 // to (Op EFLAGS !Cond)
23715 //
23716 // where Op could be BRCOND or CMOV.
23717 //
23718 static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
23719   // Quit if not CMP and SUB with its value result used.
23720   if (Cmp.getOpcode() != X86ISD::CMP &&
23721       (Cmp.getOpcode() != X86ISD::SUB || Cmp.getNode()->hasAnyUseOfValue(0)))
23722       return SDValue();
23723
23724   // Quit if not used as a boolean value.
23725   if (CC != X86::COND_E && CC != X86::COND_NE)
23726     return SDValue();
23727
23728   // Check CMP operands. One of them should be 0 or 1 and the other should be
23729   // an SetCC or extended from it.
23730   SDValue Op1 = Cmp.getOperand(0);
23731   SDValue Op2 = Cmp.getOperand(1);
23732
23733   SDValue SetCC;
23734   const ConstantSDNode* C = nullptr;
23735   bool needOppositeCond = (CC == X86::COND_E);
23736   bool checkAgainstTrue = false; // Is it a comparison against 1?
23737
23738   if ((C = dyn_cast<ConstantSDNode>(Op1)))
23739     SetCC = Op2;
23740   else if ((C = dyn_cast<ConstantSDNode>(Op2)))
23741     SetCC = Op1;
23742   else // Quit if all operands are not constants.
23743     return SDValue();
23744
23745   if (C->getZExtValue() == 1) {
23746     needOppositeCond = !needOppositeCond;
23747     checkAgainstTrue = true;
23748   } else if (C->getZExtValue() != 0)
23749     // Quit if the constant is neither 0 or 1.
23750     return SDValue();
23751
23752   bool truncatedToBoolWithAnd = false;
23753   // Skip (zext $x), (trunc $x), or (and $x, 1) node.
23754   while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
23755          SetCC.getOpcode() == ISD::TRUNCATE ||
23756          SetCC.getOpcode() == ISD::AND) {
23757     if (SetCC.getOpcode() == ISD::AND) {
23758       int OpIdx = -1;
23759       ConstantSDNode *CS;
23760       if ((CS = dyn_cast<ConstantSDNode>(SetCC.getOperand(0))) &&
23761           CS->getZExtValue() == 1)
23762         OpIdx = 1;
23763       if ((CS = dyn_cast<ConstantSDNode>(SetCC.getOperand(1))) &&
23764           CS->getZExtValue() == 1)
23765         OpIdx = 0;
23766       if (OpIdx == -1)
23767         break;
23768       SetCC = SetCC.getOperand(OpIdx);
23769       truncatedToBoolWithAnd = true;
23770     } else
23771       SetCC = SetCC.getOperand(0);
23772   }
23773
23774   switch (SetCC.getOpcode()) {
23775   case X86ISD::SETCC_CARRY:
23776     // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
23777     // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
23778     // i.e. it's a comparison against true but the result of SETCC_CARRY is not
23779     // truncated to i1 using 'and'.
23780     if (checkAgainstTrue && !truncatedToBoolWithAnd)
23781       break;
23782     assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
23783            "Invalid use of SETCC_CARRY!");
23784     // FALL THROUGH
23785   case X86ISD::SETCC:
23786     // Set the condition code or opposite one if necessary.
23787     CC = X86::CondCode(SetCC.getConstantOperandVal(0));
23788     if (needOppositeCond)
23789       CC = X86::GetOppositeBranchCondition(CC);
23790     return SetCC.getOperand(1);
23791   case X86ISD::CMOV: {
23792     // Check whether false/true value has canonical one, i.e. 0 or 1.
23793     ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
23794     ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
23795     // Quit if true value is not a constant.
23796     if (!TVal)
23797       return SDValue();
23798     // Quit if false value is not a constant.
23799     if (!FVal) {
23800       SDValue Op = SetCC.getOperand(0);
23801       // Skip 'zext' or 'trunc' node.
23802       if (Op.getOpcode() == ISD::ZERO_EXTEND ||
23803           Op.getOpcode() == ISD::TRUNCATE)
23804         Op = Op.getOperand(0);
23805       // A special case for rdrand/rdseed, where 0 is set if false cond is
23806       // found.
23807       if ((Op.getOpcode() != X86ISD::RDRAND &&
23808            Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
23809         return SDValue();
23810     }
23811     // Quit if false value is not the constant 0 or 1.
23812     bool FValIsFalse = true;
23813     if (FVal && FVal->getZExtValue() != 0) {
23814       if (FVal->getZExtValue() != 1)
23815         return SDValue();
23816       // If FVal is 1, opposite cond is needed.
23817       needOppositeCond = !needOppositeCond;
23818       FValIsFalse = false;
23819     }
23820     // Quit if TVal is not the constant opposite of FVal.
23821     if (FValIsFalse && TVal->getZExtValue() != 1)
23822       return SDValue();
23823     if (!FValIsFalse && TVal->getZExtValue() != 0)
23824       return SDValue();
23825     CC = X86::CondCode(SetCC.getConstantOperandVal(2));
23826     if (needOppositeCond)
23827       CC = X86::GetOppositeBranchCondition(CC);
23828     return SetCC.getOperand(3);
23829   }
23830   }
23831
23832   return SDValue();
23833 }
23834
23835 /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
23836 static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
23837                                   TargetLowering::DAGCombinerInfo &DCI,
23838                                   const X86Subtarget *Subtarget) {
23839   SDLoc DL(N);
23840
23841   // If the flag operand isn't dead, don't touch this CMOV.
23842   if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
23843     return SDValue();
23844
23845   SDValue FalseOp = N->getOperand(0);
23846   SDValue TrueOp = N->getOperand(1);
23847   X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
23848   SDValue Cond = N->getOperand(3);
23849
23850   if (CC == X86::COND_E || CC == X86::COND_NE) {
23851     switch (Cond.getOpcode()) {
23852     default: break;
23853     case X86ISD::BSR:
23854     case X86ISD::BSF:
23855       // If operand of BSR / BSF are proven never zero, then ZF cannot be set.
23856       if (DAG.isKnownNeverZero(Cond.getOperand(0)))
23857         return (CC == X86::COND_E) ? FalseOp : TrueOp;
23858     }
23859   }
23860
23861   SDValue Flags;
23862
23863   Flags = checkBoolTestSetCCCombine(Cond, CC);
23864   if (Flags.getNode() &&
23865       // Extra check as FCMOV only supports a subset of X86 cond.
23866       (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC))) {
23867     SDValue Ops[] = { FalseOp, TrueOp,
23868                       DAG.getConstant(CC, MVT::i8), Flags };
23869     return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
23870   }
23871
23872   // If this is a select between two integer constants, try to do some
23873   // optimizations.  Note that the operands are ordered the opposite of SELECT
23874   // operands.
23875   if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
23876     if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
23877       // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
23878       // larger than FalseC (the false value).
23879       if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
23880         CC = X86::GetOppositeBranchCondition(CC);
23881         std::swap(TrueC, FalseC);
23882         std::swap(TrueOp, FalseOp);
23883       }
23884
23885       // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3.  Likewise for any pow2/0.
23886       // This is efficient for any integer data type (including i8/i16) and
23887       // shift amount.
23888       if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
23889         Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
23890                            DAG.getConstant(CC, MVT::i8), Cond);
23891
23892         // Zero extend the condition if needed.
23893         Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
23894
23895         unsigned ShAmt = TrueC->getAPIntValue().logBase2();
23896         Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
23897                            DAG.getConstant(ShAmt, MVT::i8));
23898         if (N->getNumValues() == 2)  // Dead flag value?
23899           return DCI.CombineTo(N, Cond, SDValue());
23900         return Cond;
23901       }
23902
23903       // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst.  This is efficient
23904       // for any integer data type, including i8/i16.
23905       if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
23906         Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
23907                            DAG.getConstant(CC, MVT::i8), Cond);
23908
23909         // Zero extend the condition if needed.
23910         Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
23911                            FalseC->getValueType(0), Cond);
23912         Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
23913                            SDValue(FalseC, 0));
23914
23915         if (N->getNumValues() == 2)  // Dead flag value?
23916           return DCI.CombineTo(N, Cond, SDValue());
23917         return Cond;
23918       }
23919
23920       // Optimize cases that will turn into an LEA instruction.  This requires
23921       // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
23922       if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
23923         uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
23924         if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
23925
23926         bool isFastMultiplier = false;
23927         if (Diff < 10) {
23928           switch ((unsigned char)Diff) {
23929           default: break;
23930           case 1:  // result = add base, cond
23931           case 2:  // result = lea base(    , cond*2)
23932           case 3:  // result = lea base(cond, cond*2)
23933           case 4:  // result = lea base(    , cond*4)
23934           case 5:  // result = lea base(cond, cond*4)
23935           case 8:  // result = lea base(    , cond*8)
23936           case 9:  // result = lea base(cond, cond*8)
23937             isFastMultiplier = true;
23938             break;
23939           }
23940         }
23941
23942         if (isFastMultiplier) {
23943           APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
23944           Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
23945                              DAG.getConstant(CC, MVT::i8), Cond);
23946           // Zero extend the condition if needed.
23947           Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
23948                              Cond);
23949           // Scale the condition by the difference.
23950           if (Diff != 1)
23951             Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
23952                                DAG.getConstant(Diff, Cond.getValueType()));
23953
23954           // Add the base if non-zero.
23955           if (FalseC->getAPIntValue() != 0)
23956             Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
23957                                SDValue(FalseC, 0));
23958           if (N->getNumValues() == 2)  // Dead flag value?
23959             return DCI.CombineTo(N, Cond, SDValue());
23960           return Cond;
23961         }
23962       }
23963     }
23964   }
23965
23966   // Handle these cases:
23967   //   (select (x != c), e, c) -> select (x != c), e, x),
23968   //   (select (x == c), c, e) -> select (x == c), x, e)
23969   // where the c is an integer constant, and the "select" is the combination
23970   // of CMOV and CMP.
23971   //
23972   // The rationale for this change is that the conditional-move from a constant
23973   // needs two instructions, however, conditional-move from a register needs
23974   // only one instruction.
23975   //
23976   // CAVEAT: By replacing a constant with a symbolic value, it may obscure
23977   //  some instruction-combining opportunities. This opt needs to be
23978   //  postponed as late as possible.
23979   //
23980   if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
23981     // the DCI.xxxx conditions are provided to postpone the optimization as
23982     // late as possible.
23983
23984     ConstantSDNode *CmpAgainst = nullptr;
23985     if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
23986         (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
23987         !isa<ConstantSDNode>(Cond.getOperand(0))) {
23988
23989       if (CC == X86::COND_NE &&
23990           CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
23991         CC = X86::GetOppositeBranchCondition(CC);
23992         std::swap(TrueOp, FalseOp);
23993       }
23994
23995       if (CC == X86::COND_E &&
23996           CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
23997         SDValue Ops[] = { FalseOp, Cond.getOperand(0),
23998                           DAG.getConstant(CC, MVT::i8), Cond };
23999         return DAG.getNode(X86ISD::CMOV, DL, N->getVTList (), Ops);
24000       }
24001     }
24002   }
24003
24004   return SDValue();
24005 }
24006
24007 static SDValue PerformINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG,
24008                                                 const X86Subtarget *Subtarget) {
24009   unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
24010   switch (IntNo) {
24011   default: return SDValue();
24012   // SSE/AVX/AVX2 blend intrinsics.
24013   case Intrinsic::x86_avx2_pblendvb:
24014   case Intrinsic::x86_avx2_pblendw:
24015   case Intrinsic::x86_avx2_pblendd_128:
24016   case Intrinsic::x86_avx2_pblendd_256:
24017     // Don't try to simplify this intrinsic if we don't have AVX2.
24018     if (!Subtarget->hasAVX2())
24019       return SDValue();
24020     // FALL-THROUGH
24021   case Intrinsic::x86_avx_blend_pd_256:
24022   case Intrinsic::x86_avx_blend_ps_256:
24023   case Intrinsic::x86_avx_blendv_pd_256:
24024   case Intrinsic::x86_avx_blendv_ps_256:
24025     // Don't try to simplify this intrinsic if we don't have AVX.
24026     if (!Subtarget->hasAVX())
24027       return SDValue();
24028     // FALL-THROUGH
24029   case Intrinsic::x86_sse41_pblendw:
24030   case Intrinsic::x86_sse41_blendpd:
24031   case Intrinsic::x86_sse41_blendps:
24032   case Intrinsic::x86_sse41_blendvps:
24033   case Intrinsic::x86_sse41_blendvpd:
24034   case Intrinsic::x86_sse41_pblendvb: {
24035     SDValue Op0 = N->getOperand(1);
24036     SDValue Op1 = N->getOperand(2);
24037     SDValue Mask = N->getOperand(3);
24038
24039     // Don't try to simplify this intrinsic if we don't have SSE4.1.
24040     if (!Subtarget->hasSSE41())
24041       return SDValue();
24042
24043     // fold (blend A, A, Mask) -> A
24044     if (Op0 == Op1)
24045       return Op0;
24046     // fold (blend A, B, allZeros) -> A
24047     if (ISD::isBuildVectorAllZeros(Mask.getNode()))
24048       return Op0;
24049     // fold (blend A, B, allOnes) -> B
24050     if (ISD::isBuildVectorAllOnes(Mask.getNode()))
24051       return Op1;
24052
24053     // Simplify the case where the mask is a constant i32 value.
24054     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Mask)) {
24055       if (C->isNullValue())
24056         return Op0;
24057       if (C->isAllOnesValue())
24058         return Op1;
24059     }
24060
24061     return SDValue();
24062   }
24063
24064   // Packed SSE2/AVX2 arithmetic shift immediate intrinsics.
24065   case Intrinsic::x86_sse2_psrai_w:
24066   case Intrinsic::x86_sse2_psrai_d:
24067   case Intrinsic::x86_avx2_psrai_w:
24068   case Intrinsic::x86_avx2_psrai_d:
24069   case Intrinsic::x86_sse2_psra_w:
24070   case Intrinsic::x86_sse2_psra_d:
24071   case Intrinsic::x86_avx2_psra_w:
24072   case Intrinsic::x86_avx2_psra_d: {
24073     SDValue Op0 = N->getOperand(1);
24074     SDValue Op1 = N->getOperand(2);
24075     EVT VT = Op0.getValueType();
24076     assert(VT.isVector() && "Expected a vector type!");
24077
24078     if (isa<BuildVectorSDNode>(Op1))
24079       Op1 = Op1.getOperand(0);
24080
24081     if (!isa<ConstantSDNode>(Op1))
24082       return SDValue();
24083
24084     EVT SVT = VT.getVectorElementType();
24085     unsigned SVTBits = SVT.getSizeInBits();
24086
24087     ConstantSDNode *CND = cast<ConstantSDNode>(Op1);
24088     const APInt &C = APInt(SVTBits, CND->getAPIntValue().getZExtValue());
24089     uint64_t ShAmt = C.getZExtValue();
24090
24091     // Don't try to convert this shift into a ISD::SRA if the shift
24092     // count is bigger than or equal to the element size.
24093     if (ShAmt >= SVTBits)
24094       return SDValue();
24095
24096     // Trivial case: if the shift count is zero, then fold this
24097     // into the first operand.
24098     if (ShAmt == 0)
24099       return Op0;
24100
24101     // Replace this packed shift intrinsic with a target independent
24102     // shift dag node.
24103     SDValue Splat = DAG.getConstant(C, VT);
24104     return DAG.getNode(ISD::SRA, SDLoc(N), VT, Op0, Splat);
24105   }
24106   }
24107 }
24108
24109 /// PerformMulCombine - Optimize a single multiply with constant into two
24110 /// in order to implement it with two cheaper instructions, e.g.
24111 /// LEA + SHL, LEA + LEA.
24112 static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
24113                                  TargetLowering::DAGCombinerInfo &DCI) {
24114   if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
24115     return SDValue();
24116
24117   EVT VT = N->getValueType(0);
24118   if (VT != MVT::i64)
24119     return SDValue();
24120
24121   ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
24122   if (!C)
24123     return SDValue();
24124   uint64_t MulAmt = C->getZExtValue();
24125   if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
24126     return SDValue();
24127
24128   uint64_t MulAmt1 = 0;
24129   uint64_t MulAmt2 = 0;
24130   if ((MulAmt % 9) == 0) {
24131     MulAmt1 = 9;
24132     MulAmt2 = MulAmt / 9;
24133   } else if ((MulAmt % 5) == 0) {
24134     MulAmt1 = 5;
24135     MulAmt2 = MulAmt / 5;
24136   } else if ((MulAmt % 3) == 0) {
24137     MulAmt1 = 3;
24138     MulAmt2 = MulAmt / 3;
24139   }
24140   if (MulAmt2 &&
24141       (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
24142     SDLoc DL(N);
24143
24144     if (isPowerOf2_64(MulAmt2) &&
24145         !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
24146       // If second multiplifer is pow2, issue it first. We want the multiply by
24147       // 3, 5, or 9 to be folded into the addressing mode unless the lone use
24148       // is an add.
24149       std::swap(MulAmt1, MulAmt2);
24150
24151     SDValue NewMul;
24152     if (isPowerOf2_64(MulAmt1))
24153       NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
24154                            DAG.getConstant(Log2_64(MulAmt1), MVT::i8));
24155     else
24156       NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
24157                            DAG.getConstant(MulAmt1, VT));
24158
24159     if (isPowerOf2_64(MulAmt2))
24160       NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
24161                            DAG.getConstant(Log2_64(MulAmt2), MVT::i8));
24162     else
24163       NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
24164                            DAG.getConstant(MulAmt2, VT));
24165
24166     // Do not add new nodes to DAG combiner worklist.
24167     DCI.CombineTo(N, NewMul, false);
24168   }
24169   return SDValue();
24170 }
24171
24172 static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
24173   SDValue N0 = N->getOperand(0);
24174   SDValue N1 = N->getOperand(1);
24175   ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
24176   EVT VT = N0.getValueType();
24177
24178   // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
24179   // since the result of setcc_c is all zero's or all ones.
24180   if (VT.isInteger() && !VT.isVector() &&
24181       N1C && N0.getOpcode() == ISD::AND &&
24182       N0.getOperand(1).getOpcode() == ISD::Constant) {
24183     SDValue N00 = N0.getOperand(0);
24184     if (N00.getOpcode() == X86ISD::SETCC_CARRY ||
24185         ((N00.getOpcode() == ISD::ANY_EXTEND ||
24186           N00.getOpcode() == ISD::ZERO_EXTEND) &&
24187          N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) {
24188       APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
24189       APInt ShAmt = N1C->getAPIntValue();
24190       Mask = Mask.shl(ShAmt);
24191       if (Mask != 0)
24192         return DAG.getNode(ISD::AND, SDLoc(N), VT,
24193                            N00, DAG.getConstant(Mask, VT));
24194     }
24195   }
24196
24197   // Hardware support for vector shifts is sparse which makes us scalarize the
24198   // vector operations in many cases. Also, on sandybridge ADD is faster than
24199   // shl.
24200   // (shl V, 1) -> add V,V
24201   if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
24202     if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
24203       assert(N0.getValueType().isVector() && "Invalid vector shift type");
24204       // We shift all of the values by one. In many cases we do not have
24205       // hardware support for this operation. This is better expressed as an ADD
24206       // of two values.
24207       if (N1SplatC->getZExtValue() == 1)
24208         return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
24209     }
24210
24211   return SDValue();
24212 }
24213
24214 /// \brief Returns a vector of 0s if the node in input is a vector logical
24215 /// shift by a constant amount which is known to be bigger than or equal
24216 /// to the vector element size in bits.
24217 static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
24218                                       const X86Subtarget *Subtarget) {
24219   EVT VT = N->getValueType(0);
24220
24221   if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 &&
24222       (!Subtarget->hasInt256() ||
24223        (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16)))
24224     return SDValue();
24225
24226   SDValue Amt = N->getOperand(1);
24227   SDLoc DL(N);
24228   if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Amt))
24229     if (auto *AmtSplat = AmtBV->getConstantSplatNode()) {
24230       APInt ShiftAmt = AmtSplat->getAPIntValue();
24231       unsigned MaxAmount = VT.getVectorElementType().getSizeInBits();
24232
24233       // SSE2/AVX2 logical shifts always return a vector of 0s
24234       // if the shift amount is bigger than or equal to
24235       // the element size. The constant shift amount will be
24236       // encoded as a 8-bit immediate.
24237       if (ShiftAmt.trunc(8).uge(MaxAmount))
24238         return getZeroVector(VT, Subtarget, DAG, DL);
24239     }
24240
24241   return SDValue();
24242 }
24243
24244 /// PerformShiftCombine - Combine shifts.
24245 static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
24246                                    TargetLowering::DAGCombinerInfo &DCI,
24247                                    const X86Subtarget *Subtarget) {
24248   if (N->getOpcode() == ISD::SHL) {
24249     SDValue V = PerformSHLCombine(N, DAG);
24250     if (V.getNode()) return V;
24251   }
24252
24253   if (N->getOpcode() != ISD::SRA) {
24254     // Try to fold this logical shift into a zero vector.
24255     SDValue V = performShiftToAllZeros(N, DAG, Subtarget);
24256     if (V.getNode()) return V;
24257   }
24258
24259   return SDValue();
24260 }
24261
24262 // CMPEQCombine - Recognize the distinctive  (AND (setcc ...) (setcc ..))
24263 // where both setccs reference the same FP CMP, and rewrite for CMPEQSS
24264 // and friends.  Likewise for OR -> CMPNEQSS.
24265 static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG,
24266                             TargetLowering::DAGCombinerInfo &DCI,
24267                             const X86Subtarget *Subtarget) {
24268   unsigned opcode;
24269
24270   // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
24271   // we're requiring SSE2 for both.
24272   if (Subtarget->hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
24273     SDValue N0 = N->getOperand(0);
24274     SDValue N1 = N->getOperand(1);
24275     SDValue CMP0 = N0->getOperand(1);
24276     SDValue CMP1 = N1->getOperand(1);
24277     SDLoc DL(N);
24278
24279     // The SETCCs should both refer to the same CMP.
24280     if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
24281       return SDValue();
24282
24283     SDValue CMP00 = CMP0->getOperand(0);
24284     SDValue CMP01 = CMP0->getOperand(1);
24285     EVT     VT    = CMP00.getValueType();
24286
24287     if (VT == MVT::f32 || VT == MVT::f64) {
24288       bool ExpectingFlags = false;
24289       // Check for any users that want flags:
24290       for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
24291            !ExpectingFlags && UI != UE; ++UI)
24292         switch (UI->getOpcode()) {
24293         default:
24294         case ISD::BR_CC:
24295         case ISD::BRCOND:
24296         case ISD::SELECT:
24297           ExpectingFlags = true;
24298           break;
24299         case ISD::CopyToReg:
24300         case ISD::SIGN_EXTEND:
24301         case ISD::ZERO_EXTEND:
24302         case ISD::ANY_EXTEND:
24303           break;
24304         }
24305
24306       if (!ExpectingFlags) {
24307         enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
24308         enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
24309
24310         if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
24311           X86::CondCode tmp = cc0;
24312           cc0 = cc1;
24313           cc1 = tmp;
24314         }
24315
24316         if ((cc0 == X86::COND_E  && cc1 == X86::COND_NP) ||
24317             (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
24318           // FIXME: need symbolic constants for these magic numbers.
24319           // See X86ATTInstPrinter.cpp:printSSECC().
24320           unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
24321           if (Subtarget->hasAVX512()) {
24322             SDValue FSetCC = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CMP00,
24323                                          CMP01, DAG.getConstant(x86cc, MVT::i8));
24324             if (N->getValueType(0) != MVT::i1)
24325               return DAG.getNode(ISD::ZERO_EXTEND, DL, N->getValueType(0),
24326                                  FSetCC);
24327             return FSetCC;
24328           }
24329           SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
24330                                               CMP00.getValueType(), CMP00, CMP01,
24331                                               DAG.getConstant(x86cc, MVT::i8));
24332
24333           bool is64BitFP = (CMP00.getValueType() == MVT::f64);
24334           MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
24335
24336           if (is64BitFP && !Subtarget->is64Bit()) {
24337             // On a 32-bit target, we cannot bitcast the 64-bit float to a
24338             // 64-bit integer, since that's not a legal type. Since
24339             // OnesOrZeroesF is all ones of all zeroes, we don't need all the
24340             // bits, but can do this little dance to extract the lowest 32 bits
24341             // and work with those going forward.
24342             SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
24343                                            OnesOrZeroesF);
24344             SDValue Vector32 = DAG.getNode(ISD::BITCAST, DL, MVT::v4f32,
24345                                            Vector64);
24346             OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
24347                                         Vector32, DAG.getIntPtrConstant(0));
24348             IntVT = MVT::i32;
24349           }
24350
24351           SDValue OnesOrZeroesI = DAG.getNode(ISD::BITCAST, DL, IntVT, OnesOrZeroesF);
24352           SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
24353                                       DAG.getConstant(1, IntVT));
24354           SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ANDed);
24355           return OneBitOfTruth;
24356         }
24357       }
24358     }
24359   }
24360   return SDValue();
24361 }
24362
24363 /// CanFoldXORWithAllOnes - Test whether the XOR operand is a AllOnes vector
24364 /// so it can be folded inside ANDNP.
24365 static bool CanFoldXORWithAllOnes(const SDNode *N) {
24366   EVT VT = N->getValueType(0);
24367
24368   // Match direct AllOnes for 128 and 256-bit vectors
24369   if (ISD::isBuildVectorAllOnes(N))
24370     return true;
24371
24372   // Look through a bit convert.
24373   if (N->getOpcode() == ISD::BITCAST)
24374     N = N->getOperand(0).getNode();
24375
24376   // Sometimes the operand may come from a insert_subvector building a 256-bit
24377   // allones vector
24378   if (VT.is256BitVector() &&
24379       N->getOpcode() == ISD::INSERT_SUBVECTOR) {
24380     SDValue V1 = N->getOperand(0);
24381     SDValue V2 = N->getOperand(1);
24382
24383     if (V1.getOpcode() == ISD::INSERT_SUBVECTOR &&
24384         V1.getOperand(0).getOpcode() == ISD::UNDEF &&
24385         ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) &&
24386         ISD::isBuildVectorAllOnes(V2.getNode()))
24387       return true;
24388   }
24389
24390   return false;
24391 }
24392
24393 // On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
24394 // register. In most cases we actually compare or select YMM-sized registers
24395 // and mixing the two types creates horrible code. This method optimizes
24396 // some of the transition sequences.
24397 static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
24398                                  TargetLowering::DAGCombinerInfo &DCI,
24399                                  const X86Subtarget *Subtarget) {
24400   EVT VT = N->getValueType(0);
24401   if (!VT.is256BitVector())
24402     return SDValue();
24403
24404   assert((N->getOpcode() == ISD::ANY_EXTEND ||
24405           N->getOpcode() == ISD::ZERO_EXTEND ||
24406           N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
24407
24408   SDValue Narrow = N->getOperand(0);
24409   EVT NarrowVT = Narrow->getValueType(0);
24410   if (!NarrowVT.is128BitVector())
24411     return SDValue();
24412
24413   if (Narrow->getOpcode() != ISD::XOR &&
24414       Narrow->getOpcode() != ISD::AND &&
24415       Narrow->getOpcode() != ISD::OR)
24416     return SDValue();
24417
24418   SDValue N0  = Narrow->getOperand(0);
24419   SDValue N1  = Narrow->getOperand(1);
24420   SDLoc DL(Narrow);
24421
24422   // The Left side has to be a trunc.
24423   if (N0.getOpcode() != ISD::TRUNCATE)
24424     return SDValue();
24425
24426   // The type of the truncated inputs.
24427   EVT WideVT = N0->getOperand(0)->getValueType(0);
24428   if (WideVT != VT)
24429     return SDValue();
24430
24431   // The right side has to be a 'trunc' or a constant vector.
24432   bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE;
24433   ConstantSDNode *RHSConstSplat = nullptr;
24434   if (auto *RHSBV = dyn_cast<BuildVectorSDNode>(N1))
24435     RHSConstSplat = RHSBV->getConstantSplatNode();
24436   if (!RHSTrunc && !RHSConstSplat)
24437     return SDValue();
24438
24439   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24440
24441   if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), WideVT))
24442     return SDValue();
24443
24444   // Set N0 and N1 to hold the inputs to the new wide operation.
24445   N0 = N0->getOperand(0);
24446   if (RHSConstSplat) {
24447     N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getScalarType(),
24448                      SDValue(RHSConstSplat, 0));
24449     SmallVector<SDValue, 8> C(WideVT.getVectorNumElements(), N1);
24450     N1 = DAG.getNode(ISD::BUILD_VECTOR, DL, WideVT, C);
24451   } else if (RHSTrunc) {
24452     N1 = N1->getOperand(0);
24453   }
24454
24455   // Generate the wide operation.
24456   SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, WideVT, N0, N1);
24457   unsigned Opcode = N->getOpcode();
24458   switch (Opcode) {
24459   case ISD::ANY_EXTEND:
24460     return Op;
24461   case ISD::ZERO_EXTEND: {
24462     unsigned InBits = NarrowVT.getScalarType().getSizeInBits();
24463     APInt Mask = APInt::getAllOnesValue(InBits);
24464     Mask = Mask.zext(VT.getScalarType().getSizeInBits());
24465     return DAG.getNode(ISD::AND, DL, VT,
24466                        Op, DAG.getConstant(Mask, VT));
24467   }
24468   case ISD::SIGN_EXTEND:
24469     return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
24470                        Op, DAG.getValueType(NarrowVT));
24471   default:
24472     llvm_unreachable("Unexpected opcode");
24473   }
24474 }
24475
24476 static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
24477                                  TargetLowering::DAGCombinerInfo &DCI,
24478                                  const X86Subtarget *Subtarget) {
24479   EVT VT = N->getValueType(0);
24480   if (DCI.isBeforeLegalizeOps())
24481     return SDValue();
24482
24483   SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget);
24484   if (R.getNode())
24485     return R;
24486
24487   // Create BEXTR instructions
24488   // BEXTR is ((X >> imm) & (2**size-1))
24489   if (VT == MVT::i32 || VT == MVT::i64) {
24490     SDValue N0 = N->getOperand(0);
24491     SDValue N1 = N->getOperand(1);
24492     SDLoc DL(N);
24493
24494     // Check for BEXTR.
24495     if ((Subtarget->hasBMI() || Subtarget->hasTBM()) &&
24496         (N0.getOpcode() == ISD::SRA || N0.getOpcode() == ISD::SRL)) {
24497       ConstantSDNode *MaskNode = dyn_cast<ConstantSDNode>(N1);
24498       ConstantSDNode *ShiftNode = dyn_cast<ConstantSDNode>(N0.getOperand(1));
24499       if (MaskNode && ShiftNode) {
24500         uint64_t Mask = MaskNode->getZExtValue();
24501         uint64_t Shift = ShiftNode->getZExtValue();
24502         if (isMask_64(Mask)) {
24503           uint64_t MaskSize = CountPopulation_64(Mask);
24504           if (Shift + MaskSize <= VT.getSizeInBits())
24505             return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0),
24506                                DAG.getConstant(Shift | (MaskSize << 8), VT));
24507         }
24508       }
24509     } // BEXTR
24510
24511     return SDValue();
24512   }
24513
24514   // Want to form ANDNP nodes:
24515   // 1) In the hopes of then easily combining them with OR and AND nodes
24516   //    to form PBLEND/PSIGN.
24517   // 2) To match ANDN packed intrinsics
24518   if (VT != MVT::v2i64 && VT != MVT::v4i64)
24519     return SDValue();
24520
24521   SDValue N0 = N->getOperand(0);
24522   SDValue N1 = N->getOperand(1);
24523   SDLoc DL(N);
24524
24525   // Check LHS for vnot
24526   if (N0.getOpcode() == ISD::XOR &&
24527       //ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
24528       CanFoldXORWithAllOnes(N0.getOperand(1).getNode()))
24529     return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1);
24530
24531   // Check RHS for vnot
24532   if (N1.getOpcode() == ISD::XOR &&
24533       //ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
24534       CanFoldXORWithAllOnes(N1.getOperand(1).getNode()))
24535     return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0);
24536
24537   return SDValue();
24538 }
24539
24540 static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
24541                                 TargetLowering::DAGCombinerInfo &DCI,
24542                                 const X86Subtarget *Subtarget) {
24543   if (DCI.isBeforeLegalizeOps())
24544     return SDValue();
24545
24546   SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget);
24547   if (R.getNode())
24548     return R;
24549
24550   SDValue N0 = N->getOperand(0);
24551   SDValue N1 = N->getOperand(1);
24552   EVT VT = N->getValueType(0);
24553
24554   // look for psign/blend
24555   if (VT == MVT::v2i64 || VT == MVT::v4i64) {
24556     if (!Subtarget->hasSSSE3() ||
24557         (VT == MVT::v4i64 && !Subtarget->hasInt256()))
24558       return SDValue();
24559
24560     // Canonicalize pandn to RHS
24561     if (N0.getOpcode() == X86ISD::ANDNP)
24562       std::swap(N0, N1);
24563     // or (and (m, y), (pandn m, x))
24564     if (N0.getOpcode() == ISD::AND && N1.getOpcode() == X86ISD::ANDNP) {
24565       SDValue Mask = N1.getOperand(0);
24566       SDValue X    = N1.getOperand(1);
24567       SDValue Y;
24568       if (N0.getOperand(0) == Mask)
24569         Y = N0.getOperand(1);
24570       if (N0.getOperand(1) == Mask)
24571         Y = N0.getOperand(0);
24572
24573       // Check to see if the mask appeared in both the AND and ANDNP and
24574       if (!Y.getNode())
24575         return SDValue();
24576
24577       // Validate that X, Y, and Mask are BIT_CONVERTS, and see through them.
24578       // Look through mask bitcast.
24579       if (Mask.getOpcode() == ISD::BITCAST)
24580         Mask = Mask.getOperand(0);
24581       if (X.getOpcode() == ISD::BITCAST)
24582         X = X.getOperand(0);
24583       if (Y.getOpcode() == ISD::BITCAST)
24584         Y = Y.getOperand(0);
24585
24586       EVT MaskVT = Mask.getValueType();
24587
24588       // Validate that the Mask operand is a vector sra node.
24589       // FIXME: what to do for bytes, since there is a psignb/pblendvb, but
24590       // there is no psrai.b
24591       unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits();
24592       unsigned SraAmt = ~0;
24593       if (Mask.getOpcode() == ISD::SRA) {
24594         if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Mask.getOperand(1)))
24595           if (auto *AmtConst = AmtBV->getConstantSplatNode())
24596             SraAmt = AmtConst->getZExtValue();
24597       } else if (Mask.getOpcode() == X86ISD::VSRAI) {
24598         SDValue SraC = Mask.getOperand(1);
24599         SraAmt  = cast<ConstantSDNode>(SraC)->getZExtValue();
24600       }
24601       if ((SraAmt + 1) != EltBits)
24602         return SDValue();
24603
24604       SDLoc DL(N);
24605
24606       // Now we know we at least have a plendvb with the mask val.  See if
24607       // we can form a psignb/w/d.
24608       // psign = x.type == y.type == mask.type && y = sub(0, x);
24609       if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X &&
24610           ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) &&
24611           X.getValueType() == MaskVT && Y.getValueType() == MaskVT) {
24612         assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
24613                "Unsupported VT for PSIGN");
24614         Mask = DAG.getNode(X86ISD::PSIGN, DL, MaskVT, X, Mask.getOperand(0));
24615         return DAG.getNode(ISD::BITCAST, DL, VT, Mask);
24616       }
24617       // PBLENDVB only available on SSE 4.1
24618       if (!Subtarget->hasSSE41())
24619         return SDValue();
24620
24621       EVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;
24622
24623       X = DAG.getNode(ISD::BITCAST, DL, BlendVT, X);
24624       Y = DAG.getNode(ISD::BITCAST, DL, BlendVT, Y);
24625       Mask = DAG.getNode(ISD::BITCAST, DL, BlendVT, Mask);
24626       Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X);
24627       return DAG.getNode(ISD::BITCAST, DL, VT, Mask);
24628     }
24629   }
24630
24631   if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
24632     return SDValue();
24633
24634   // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
24635   MachineFunction &MF = DAG.getMachineFunction();
24636   bool OptForSize = MF.getFunction()->getAttributes().
24637     hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
24638
24639   // SHLD/SHRD instructions have lower register pressure, but on some
24640   // platforms they have higher latency than the equivalent
24641   // series of shifts/or that would otherwise be generated.
24642   // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
24643   // have higher latencies and we are not optimizing for size.
24644   if (!OptForSize && Subtarget->isSHLDSlow())
24645     return SDValue();
24646
24647   if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
24648     std::swap(N0, N1);
24649   if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
24650     return SDValue();
24651   if (!N0.hasOneUse() || !N1.hasOneUse())
24652     return SDValue();
24653
24654   SDValue ShAmt0 = N0.getOperand(1);
24655   if (ShAmt0.getValueType() != MVT::i8)
24656     return SDValue();
24657   SDValue ShAmt1 = N1.getOperand(1);
24658   if (ShAmt1.getValueType() != MVT::i8)
24659     return SDValue();
24660   if (ShAmt0.getOpcode() == ISD::TRUNCATE)
24661     ShAmt0 = ShAmt0.getOperand(0);
24662   if (ShAmt1.getOpcode() == ISD::TRUNCATE)
24663     ShAmt1 = ShAmt1.getOperand(0);
24664
24665   SDLoc DL(N);
24666   unsigned Opc = X86ISD::SHLD;
24667   SDValue Op0 = N0.getOperand(0);
24668   SDValue Op1 = N1.getOperand(0);
24669   if (ShAmt0.getOpcode() == ISD::SUB) {
24670     Opc = X86ISD::SHRD;
24671     std::swap(Op0, Op1);
24672     std::swap(ShAmt0, ShAmt1);
24673   }
24674
24675   unsigned Bits = VT.getSizeInBits();
24676   if (ShAmt1.getOpcode() == ISD::SUB) {
24677     SDValue Sum = ShAmt1.getOperand(0);
24678     if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
24679       SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
24680       if (ShAmt1Op1.getNode()->getOpcode() == ISD::TRUNCATE)
24681         ShAmt1Op1 = ShAmt1Op1.getOperand(0);
24682       if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
24683         return DAG.getNode(Opc, DL, VT,
24684                            Op0, Op1,
24685                            DAG.getNode(ISD::TRUNCATE, DL,
24686                                        MVT::i8, ShAmt0));
24687     }
24688   } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
24689     ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
24690     if (ShAmt0C &&
24691         ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == Bits)
24692       return DAG.getNode(Opc, DL, VT,
24693                          N0.getOperand(0), N1.getOperand(0),
24694                          DAG.getNode(ISD::TRUNCATE, DL,
24695                                        MVT::i8, ShAmt0));
24696   }
24697
24698   return SDValue();
24699 }
24700
24701 // Generate NEG and CMOV for integer abs.
24702 static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
24703   EVT VT = N->getValueType(0);
24704
24705   // Since X86 does not have CMOV for 8-bit integer, we don't convert
24706   // 8-bit integer abs to NEG and CMOV.
24707   if (VT.isInteger() && VT.getSizeInBits() == 8)
24708     return SDValue();
24709
24710   SDValue N0 = N->getOperand(0);
24711   SDValue N1 = N->getOperand(1);
24712   SDLoc DL(N);
24713
24714   // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
24715   // and change it to SUB and CMOV.
24716   if (VT.isInteger() && N->getOpcode() == ISD::XOR &&
24717       N0.getOpcode() == ISD::ADD &&
24718       N0.getOperand(1) == N1 &&
24719       N1.getOpcode() == ISD::SRA &&
24720       N1.getOperand(0) == N0.getOperand(0))
24721     if (ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1)))
24722       if (Y1C->getAPIntValue() == VT.getSizeInBits()-1) {
24723         // Generate SUB & CMOV.
24724         SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
24725                                   DAG.getConstant(0, VT), N0.getOperand(0));
24726
24727         SDValue Ops[] = { N0.getOperand(0), Neg,
24728                           DAG.getConstant(X86::COND_GE, MVT::i8),
24729                           SDValue(Neg.getNode(), 1) };
24730         return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), Ops);
24731       }
24732   return SDValue();
24733 }
24734
24735 // PerformXorCombine - Attempts to turn XOR nodes into BLSMSK nodes
24736 static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG,
24737                                  TargetLowering::DAGCombinerInfo &DCI,
24738                                  const X86Subtarget *Subtarget) {
24739   if (DCI.isBeforeLegalizeOps())
24740     return SDValue();
24741
24742   if (Subtarget->hasCMov()) {
24743     SDValue RV = performIntegerAbsCombine(N, DAG);
24744     if (RV.getNode())
24745       return RV;
24746   }
24747
24748   return SDValue();
24749 }
24750
24751 /// PerformLOADCombine - Do target-specific dag combines on LOAD nodes.
24752 static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
24753                                   TargetLowering::DAGCombinerInfo &DCI,
24754                                   const X86Subtarget *Subtarget) {
24755   LoadSDNode *Ld = cast<LoadSDNode>(N);
24756   EVT RegVT = Ld->getValueType(0);
24757   EVT MemVT = Ld->getMemoryVT();
24758   SDLoc dl(Ld);
24759   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24760
24761   // For chips with slow 32-byte unaligned loads, break the 32-byte operation
24762   // into two 16-byte operations.
24763   ISD::LoadExtType Ext = Ld->getExtensionType();
24764   unsigned Alignment = Ld->getAlignment();
24765   bool IsAligned = Alignment == 0 || Alignment >= MemVT.getSizeInBits()/8;
24766   if (RegVT.is256BitVector() && Subtarget->isUnalignedMem32Slow() &&
24767       !DCI.isBeforeLegalizeOps() && !IsAligned && Ext == ISD::NON_EXTLOAD) {
24768     unsigned NumElems = RegVT.getVectorNumElements();
24769     if (NumElems < 2)
24770       return SDValue();
24771
24772     SDValue Ptr = Ld->getBasePtr();
24773     SDValue Increment = DAG.getConstant(16, TLI.getPointerTy());
24774
24775     EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
24776                                   NumElems/2);
24777     SDValue Load1 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
24778                                 Ld->getPointerInfo(), Ld->isVolatile(),
24779                                 Ld->isNonTemporal(), Ld->isInvariant(),
24780                                 Alignment);
24781     Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
24782     SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
24783                                 Ld->getPointerInfo(), Ld->isVolatile(),
24784                                 Ld->isNonTemporal(), Ld->isInvariant(),
24785                                 std::min(16U, Alignment));
24786     SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
24787                              Load1.getValue(1),
24788                              Load2.getValue(1));
24789
24790     SDValue NewVec = DAG.getUNDEF(RegVT);
24791     NewVec = Insert128BitVector(NewVec, Load1, 0, DAG, dl);
24792     NewVec = Insert128BitVector(NewVec, Load2, NumElems/2, DAG, dl);
24793     return DCI.CombineTo(N, NewVec, TF, true);
24794   }
24795
24796   return SDValue();
24797 }
24798
24799 /// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
24800 static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
24801                                    const X86Subtarget *Subtarget) {
24802   StoreSDNode *St = cast<StoreSDNode>(N);
24803   EVT VT = St->getValue().getValueType();
24804   EVT StVT = St->getMemoryVT();
24805   SDLoc dl(St);
24806   SDValue StoredVal = St->getOperand(1);
24807   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24808
24809   // If we are saving a concatenation of two XMM registers and 32-byte stores
24810   // are slow, such as on Sandy Bridge, perform two 16-byte stores.
24811   unsigned Alignment = St->getAlignment();
24812   bool IsAligned = Alignment == 0 || Alignment >= VT.getSizeInBits()/8;
24813   if (VT.is256BitVector() && Subtarget->isUnalignedMem32Slow() &&
24814       StVT == VT && !IsAligned) {
24815     unsigned NumElems = VT.getVectorNumElements();
24816     if (NumElems < 2)
24817       return SDValue();
24818
24819     SDValue Value0 = Extract128BitVector(StoredVal, 0, DAG, dl);
24820     SDValue Value1 = Extract128BitVector(StoredVal, NumElems/2, DAG, dl);
24821
24822     SDValue Stride = DAG.getConstant(16, TLI.getPointerTy());
24823     SDValue Ptr0 = St->getBasePtr();
24824     SDValue Ptr1 = DAG.getNode(ISD::ADD, dl, Ptr0.getValueType(), Ptr0, Stride);
24825
24826     SDValue Ch0 = DAG.getStore(St->getChain(), dl, Value0, Ptr0,
24827                                 St->getPointerInfo(), St->isVolatile(),
24828                                 St->isNonTemporal(), Alignment);
24829     SDValue Ch1 = DAG.getStore(St->getChain(), dl, Value1, Ptr1,
24830                                 St->getPointerInfo(), St->isVolatile(),
24831                                 St->isNonTemporal(),
24832                                 std::min(16U, Alignment));
24833     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
24834   }
24835
24836   // Optimize trunc store (of multiple scalars) to shuffle and store.
24837   // First, pack all of the elements in one place. Next, store to memory
24838   // in fewer chunks.
24839   if (St->isTruncatingStore() && VT.isVector()) {
24840     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24841     unsigned NumElems = VT.getVectorNumElements();
24842     assert(StVT != VT && "Cannot truncate to the same type");
24843     unsigned FromSz = VT.getVectorElementType().getSizeInBits();
24844     unsigned ToSz = StVT.getVectorElementType().getSizeInBits();
24845
24846     // From, To sizes and ElemCount must be pow of two
24847     if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
24848     // We are going to use the original vector elt for storing.
24849     // Accumulated smaller vector elements must be a multiple of the store size.
24850     if (0 != (NumElems * FromSz) % ToSz) return SDValue();
24851
24852     unsigned SizeRatio  = FromSz / ToSz;
24853
24854     assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
24855
24856     // Create a type on which we perform the shuffle
24857     EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
24858             StVT.getScalarType(), NumElems*SizeRatio);
24859
24860     assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
24861
24862     SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, St->getValue());
24863     SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
24864     for (unsigned i = 0; i != NumElems; ++i)
24865       ShuffleVec[i] = i * SizeRatio;
24866
24867     // Can't shuffle using an illegal type.
24868     if (!TLI.isTypeLegal(WideVecVT))
24869       return SDValue();
24870
24871     SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
24872                                          DAG.getUNDEF(WideVecVT),
24873                                          &ShuffleVec[0]);
24874     // At this point all of the data is stored at the bottom of the
24875     // register. We now need to save it to mem.
24876
24877     // Find the largest store unit
24878     MVT StoreType = MVT::i8;
24879     for (MVT Tp : MVT::integer_valuetypes()) {
24880       if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
24881         StoreType = Tp;
24882     }
24883
24884     // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
24885     if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
24886         (64 <= NumElems * ToSz))
24887       StoreType = MVT::f64;
24888
24889     // Bitcast the original vector into a vector of store-size units
24890     EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
24891             StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
24892     assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
24893     SDValue ShuffWide = DAG.getNode(ISD::BITCAST, dl, StoreVecVT, Shuff);
24894     SmallVector<SDValue, 8> Chains;
24895     SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8,
24896                                         TLI.getPointerTy());
24897     SDValue Ptr = St->getBasePtr();
24898
24899     // Perform one or more big stores into memory.
24900     for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) {
24901       SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
24902                                    StoreType, ShuffWide,
24903                                    DAG.getIntPtrConstant(i));
24904       SDValue Ch = DAG.getStore(St->getChain(), dl, SubVec, Ptr,
24905                                 St->getPointerInfo(), St->isVolatile(),
24906                                 St->isNonTemporal(), St->getAlignment());
24907       Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
24908       Chains.push_back(Ch);
24909     }
24910
24911     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
24912   }
24913
24914   // Turn load->store of MMX types into GPR load/stores.  This avoids clobbering
24915   // the FP state in cases where an emms may be missing.
24916   // A preferable solution to the general problem is to figure out the right
24917   // places to insert EMMS.  This qualifies as a quick hack.
24918
24919   // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
24920   if (VT.getSizeInBits() != 64)
24921     return SDValue();
24922
24923   const Function *F = DAG.getMachineFunction().getFunction();
24924   bool NoImplicitFloatOps = F->getAttributes().
24925     hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat);
24926   bool F64IsLegal = !DAG.getTarget().Options.UseSoftFloat && !NoImplicitFloatOps
24927                      && Subtarget->hasSSE2();
24928   if ((VT.isVector() ||
24929        (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) &&
24930       isa<LoadSDNode>(St->getValue()) &&
24931       !cast<LoadSDNode>(St->getValue())->isVolatile() &&
24932       St->getChain().hasOneUse() && !St->isVolatile()) {
24933     SDNode* LdVal = St->getValue().getNode();
24934     LoadSDNode *Ld = nullptr;
24935     int TokenFactorIndex = -1;
24936     SmallVector<SDValue, 8> Ops;
24937     SDNode* ChainVal = St->getChain().getNode();
24938     // Must be a store of a load.  We currently handle two cases:  the load
24939     // is a direct child, and it's under an intervening TokenFactor.  It is
24940     // possible to dig deeper under nested TokenFactors.
24941     if (ChainVal == LdVal)
24942       Ld = cast<LoadSDNode>(St->getChain());
24943     else if (St->getValue().hasOneUse() &&
24944              ChainVal->getOpcode() == ISD::TokenFactor) {
24945       for (unsigned i = 0, e = ChainVal->getNumOperands(); i != e; ++i) {
24946         if (ChainVal->getOperand(i).getNode() == LdVal) {
24947           TokenFactorIndex = i;
24948           Ld = cast<LoadSDNode>(St->getValue());
24949         } else
24950           Ops.push_back(ChainVal->getOperand(i));
24951       }
24952     }
24953
24954     if (!Ld || !ISD::isNormalLoad(Ld))
24955       return SDValue();
24956
24957     // If this is not the MMX case, i.e. we are just turning i64 load/store
24958     // into f64 load/store, avoid the transformation if there are multiple
24959     // uses of the loaded value.
24960     if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
24961       return SDValue();
24962
24963     SDLoc LdDL(Ld);
24964     SDLoc StDL(N);
24965     // If we are a 64-bit capable x86, lower to a single movq load/store pair.
24966     // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
24967     // pair instead.
24968     if (Subtarget->is64Bit() || F64IsLegal) {
24969       EVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64;
24970       SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
24971                                   Ld->getPointerInfo(), Ld->isVolatile(),
24972                                   Ld->isNonTemporal(), Ld->isInvariant(),
24973                                   Ld->getAlignment());
24974       SDValue NewChain = NewLd.getValue(1);
24975       if (TokenFactorIndex != -1) {
24976         Ops.push_back(NewChain);
24977         NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
24978       }
24979       return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
24980                           St->getPointerInfo(),
24981                           St->isVolatile(), St->isNonTemporal(),
24982                           St->getAlignment());
24983     }
24984
24985     // Otherwise, lower to two pairs of 32-bit loads / stores.
24986     SDValue LoAddr = Ld->getBasePtr();
24987     SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr,
24988                                  DAG.getConstant(4, MVT::i32));
24989
24990     SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
24991                                Ld->getPointerInfo(),
24992                                Ld->isVolatile(), Ld->isNonTemporal(),
24993                                Ld->isInvariant(), Ld->getAlignment());
24994     SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
24995                                Ld->getPointerInfo().getWithOffset(4),
24996                                Ld->isVolatile(), Ld->isNonTemporal(),
24997                                Ld->isInvariant(),
24998                                MinAlign(Ld->getAlignment(), 4));
24999
25000     SDValue NewChain = LoLd.getValue(1);
25001     if (TokenFactorIndex != -1) {
25002       Ops.push_back(LoLd);
25003       Ops.push_back(HiLd);
25004       NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
25005     }
25006
25007     LoAddr = St->getBasePtr();
25008     HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr,
25009                          DAG.getConstant(4, MVT::i32));
25010
25011     SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr,
25012                                 St->getPointerInfo(),
25013                                 St->isVolatile(), St->isNonTemporal(),
25014                                 St->getAlignment());
25015     SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr,
25016                                 St->getPointerInfo().getWithOffset(4),
25017                                 St->isVolatile(),
25018                                 St->isNonTemporal(),
25019                                 MinAlign(St->getAlignment(), 4));
25020     return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
25021   }
25022   return SDValue();
25023 }
25024
25025 /// Return 'true' if this vector operation is "horizontal"
25026 /// and return the operands for the horizontal operation in LHS and RHS.  A
25027 /// horizontal operation performs the binary operation on successive elements
25028 /// of its first operand, then on successive elements of its second operand,
25029 /// returning the resulting values in a vector.  For example, if
25030 ///   A = < float a0, float a1, float a2, float a3 >
25031 /// and
25032 ///   B = < float b0, float b1, float b2, float b3 >
25033 /// then the result of doing a horizontal operation on A and B is
25034 ///   A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
25035 /// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
25036 /// A horizontal-op B, for some already available A and B, and if so then LHS is
25037 /// set to A, RHS to B, and the routine returns 'true'.
25038 /// Note that the binary operation should have the property that if one of the
25039 /// operands is UNDEF then the result is UNDEF.
25040 static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
25041   // Look for the following pattern: if
25042   //   A = < float a0, float a1, float a2, float a3 >
25043   //   B = < float b0, float b1, float b2, float b3 >
25044   // and
25045   //   LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
25046   //   RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
25047   // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
25048   // which is A horizontal-op B.
25049
25050   // At least one of the operands should be a vector shuffle.
25051   if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
25052       RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
25053     return false;
25054
25055   MVT VT = LHS.getSimpleValueType();
25056
25057   assert((VT.is128BitVector() || VT.is256BitVector()) &&
25058          "Unsupported vector type for horizontal add/sub");
25059
25060   // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to
25061   // operate independently on 128-bit lanes.
25062   unsigned NumElts = VT.getVectorNumElements();
25063   unsigned NumLanes = VT.getSizeInBits()/128;
25064   unsigned NumLaneElts = NumElts / NumLanes;
25065   assert((NumLaneElts % 2 == 0) &&
25066          "Vector type should have an even number of elements in each lane");
25067   unsigned HalfLaneElts = NumLaneElts/2;
25068
25069   // View LHS in the form
25070   //   LHS = VECTOR_SHUFFLE A, B, LMask
25071   // If LHS is not a shuffle then pretend it is the shuffle
25072   //   LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
25073   // NOTE: in what follows a default initialized SDValue represents an UNDEF of
25074   // type VT.
25075   SDValue A, B;
25076   SmallVector<int, 16> LMask(NumElts);
25077   if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
25078     if (LHS.getOperand(0).getOpcode() != ISD::UNDEF)
25079       A = LHS.getOperand(0);
25080     if (LHS.getOperand(1).getOpcode() != ISD::UNDEF)
25081       B = LHS.getOperand(1);
25082     ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
25083     std::copy(Mask.begin(), Mask.end(), LMask.begin());
25084   } else {
25085     if (LHS.getOpcode() != ISD::UNDEF)
25086       A = LHS;
25087     for (unsigned i = 0; i != NumElts; ++i)
25088       LMask[i] = i;
25089   }
25090
25091   // Likewise, view RHS in the form
25092   //   RHS = VECTOR_SHUFFLE C, D, RMask
25093   SDValue C, D;
25094   SmallVector<int, 16> RMask(NumElts);
25095   if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
25096     if (RHS.getOperand(0).getOpcode() != ISD::UNDEF)
25097       C = RHS.getOperand(0);
25098     if (RHS.getOperand(1).getOpcode() != ISD::UNDEF)
25099       D = RHS.getOperand(1);
25100     ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
25101     std::copy(Mask.begin(), Mask.end(), RMask.begin());
25102   } else {
25103     if (RHS.getOpcode() != ISD::UNDEF)
25104       C = RHS;
25105     for (unsigned i = 0; i != NumElts; ++i)
25106       RMask[i] = i;
25107   }
25108
25109   // Check that the shuffles are both shuffling the same vectors.
25110   if (!(A == C && B == D) && !(A == D && B == C))
25111     return false;
25112
25113   // If everything is UNDEF then bail out: it would be better to fold to UNDEF.
25114   if (!A.getNode() && !B.getNode())
25115     return false;
25116
25117   // If A and B occur in reverse order in RHS, then "swap" them (which means
25118   // rewriting the mask).
25119   if (A != C)
25120     CommuteVectorShuffleMask(RMask, NumElts);
25121
25122   // At this point LHS and RHS are equivalent to
25123   //   LHS = VECTOR_SHUFFLE A, B, LMask
25124   //   RHS = VECTOR_SHUFFLE A, B, RMask
25125   // Check that the masks correspond to performing a horizontal operation.
25126   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
25127     for (unsigned i = 0; i != NumLaneElts; ++i) {
25128       int LIdx = LMask[i+l], RIdx = RMask[i+l];
25129
25130       // Ignore any UNDEF components.
25131       if (LIdx < 0 || RIdx < 0 ||
25132           (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
25133           (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
25134         continue;
25135
25136       // Check that successive elements are being operated on.  If not, this is
25137       // not a horizontal operation.
25138       unsigned Src = (i/HalfLaneElts); // each lane is split between srcs
25139       int Index = 2*(i%HalfLaneElts) + NumElts*Src + l;
25140       if (!(LIdx == Index && RIdx == Index + 1) &&
25141           !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
25142         return false;
25143     }
25144   }
25145
25146   LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
25147   RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
25148   return true;
25149 }
25150
25151 /// Do target-specific dag combines on floating point adds.
25152 static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG,
25153                                   const X86Subtarget *Subtarget) {
25154   EVT VT = N->getValueType(0);
25155   SDValue LHS = N->getOperand(0);
25156   SDValue RHS = N->getOperand(1);
25157
25158   // Try to synthesize horizontal adds from adds of shuffles.
25159   if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
25160        (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
25161       isHorizontalBinOp(LHS, RHS, true))
25162     return DAG.getNode(X86ISD::FHADD, SDLoc(N), VT, LHS, RHS);
25163   return SDValue();
25164 }
25165
25166 /// Do target-specific dag combines on floating point subs.
25167 static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG,
25168                                   const X86Subtarget *Subtarget) {
25169   EVT VT = N->getValueType(0);
25170   SDValue LHS = N->getOperand(0);
25171   SDValue RHS = N->getOperand(1);
25172
25173   // Try to synthesize horizontal subs from subs of shuffles.
25174   if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
25175        (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
25176       isHorizontalBinOp(LHS, RHS, false))
25177     return DAG.getNode(X86ISD::FHSUB, SDLoc(N), VT, LHS, RHS);
25178   return SDValue();
25179 }
25180
25181 /// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
25182 static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) {
25183   assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
25184   // F[X]OR(0.0, x) -> x
25185   // F[X]OR(x, 0.0) -> x
25186   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
25187     if (C->getValueAPF().isPosZero())
25188       return N->getOperand(1);
25189   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
25190     if (C->getValueAPF().isPosZero())
25191       return N->getOperand(0);
25192   return SDValue();
25193 }
25194
25195 /// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
25196 static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) {
25197   assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
25198
25199   // Only perform optimizations if UnsafeMath is used.
25200   if (!DAG.getTarget().Options.UnsafeFPMath)
25201     return SDValue();
25202
25203   // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
25204   // into FMINC and FMAXC, which are Commutative operations.
25205   unsigned NewOp = 0;
25206   switch (N->getOpcode()) {
25207     default: llvm_unreachable("unknown opcode");
25208     case X86ISD::FMIN:  NewOp = X86ISD::FMINC; break;
25209     case X86ISD::FMAX:  NewOp = X86ISD::FMAXC; break;
25210   }
25211
25212   return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
25213                      N->getOperand(0), N->getOperand(1));
25214 }
25215
25216 /// Do target-specific dag combines on X86ISD::FAND nodes.
25217 static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
25218   // FAND(0.0, x) -> 0.0
25219   // FAND(x, 0.0) -> 0.0
25220   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
25221     if (C->getValueAPF().isPosZero())
25222       return N->getOperand(0);
25223   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
25224     if (C->getValueAPF().isPosZero())
25225       return N->getOperand(1);
25226   return SDValue();
25227 }
25228
25229 /// Do target-specific dag combines on X86ISD::FANDN nodes
25230 static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG) {
25231   // FANDN(x, 0.0) -> 0.0
25232   // FANDN(0.0, x) -> x
25233   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
25234     if (C->getValueAPF().isPosZero())
25235       return N->getOperand(1);
25236   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
25237     if (C->getValueAPF().isPosZero())
25238       return N->getOperand(1);
25239   return SDValue();
25240 }
25241
25242 static SDValue PerformBTCombine(SDNode *N,
25243                                 SelectionDAG &DAG,
25244                                 TargetLowering::DAGCombinerInfo &DCI) {
25245   // BT ignores high bits in the bit index operand.
25246   SDValue Op1 = N->getOperand(1);
25247   if (Op1.hasOneUse()) {
25248     unsigned BitWidth = Op1.getValueSizeInBits();
25249     APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
25250     APInt KnownZero, KnownOne;
25251     TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
25252                                           !DCI.isBeforeLegalizeOps());
25253     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25254     if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) ||
25255         TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO))
25256       DCI.CommitTargetLoweringOpt(TLO);
25257   }
25258   return SDValue();
25259 }
25260
25261 static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) {
25262   SDValue Op = N->getOperand(0);
25263   if (Op.getOpcode() == ISD::BITCAST)
25264     Op = Op.getOperand(0);
25265   EVT VT = N->getValueType(0), OpVT = Op.getValueType();
25266   if (Op.getOpcode() == X86ISD::VZEXT_LOAD &&
25267       VT.getVectorElementType().getSizeInBits() ==
25268       OpVT.getVectorElementType().getSizeInBits()) {
25269     return DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
25270   }
25271   return SDValue();
25272 }
25273
25274 static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG,
25275                                                const X86Subtarget *Subtarget) {
25276   EVT VT = N->getValueType(0);
25277   if (!VT.isVector())
25278     return SDValue();
25279
25280   SDValue N0 = N->getOperand(0);
25281   SDValue N1 = N->getOperand(1);
25282   EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
25283   SDLoc dl(N);
25284
25285   // The SIGN_EXTEND_INREG to v4i64 is expensive operation on the
25286   // both SSE and AVX2 since there is no sign-extended shift right
25287   // operation on a vector with 64-bit elements.
25288   //(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) ->
25289   // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT)))
25290   if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
25291       N0.getOpcode() == ISD::SIGN_EXTEND)) {
25292     SDValue N00 = N0.getOperand(0);
25293
25294     // EXTLOAD has a better solution on AVX2,
25295     // it may be replaced with X86ISD::VSEXT node.
25296     if (N00.getOpcode() == ISD::LOAD && Subtarget->hasInt256())
25297       if (!ISD::isNormalLoad(N00.getNode()))
25298         return SDValue();
25299
25300     if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
25301         SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
25302                                   N00, N1);
25303       return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
25304     }
25305   }
25306   return SDValue();
25307 }
25308
25309 static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
25310                                   TargetLowering::DAGCombinerInfo &DCI,
25311                                   const X86Subtarget *Subtarget) {
25312   SDValue N0 = N->getOperand(0);
25313   EVT VT = N->getValueType(0);
25314
25315   // (i8,i32 sext (sdivrem (i8 x, i8 y)) ->
25316   // (i8,i32 (sdivrem_sext_hreg (i8 x, i8 y)
25317   // This exposes the sext to the sdivrem lowering, so that it directly extends
25318   // from AH (which we otherwise need to do contortions to access).
25319   if (N0.getOpcode() == ISD::SDIVREM && N0.getResNo() == 1 &&
25320       N0.getValueType() == MVT::i8 && VT == MVT::i32) {
25321     SDLoc dl(N);
25322     SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
25323     SDValue R = DAG.getNode(X86ISD::SDIVREM8_SEXT_HREG, dl, NodeTys,
25324                             N0.getOperand(0), N0.getOperand(1));
25325     DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
25326     return R.getValue(1);
25327   }
25328
25329   if (!DCI.isBeforeLegalizeOps())
25330     return SDValue();
25331
25332   if (!Subtarget->hasFp256())
25333     return SDValue();
25334
25335   if (VT.isVector() && VT.getSizeInBits() == 256) {
25336     SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget);
25337     if (R.getNode())
25338       return R;
25339   }
25340
25341   return SDValue();
25342 }
25343
25344 static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG,
25345                                  const X86Subtarget* Subtarget) {
25346   SDLoc dl(N);
25347   EVT VT = N->getValueType(0);
25348
25349   // Let legalize expand this if it isn't a legal type yet.
25350   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
25351     return SDValue();
25352
25353   EVT ScalarVT = VT.getScalarType();
25354   if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
25355       (!Subtarget->hasFMA() && !Subtarget->hasFMA4()))
25356     return SDValue();
25357
25358   SDValue A = N->getOperand(0);
25359   SDValue B = N->getOperand(1);
25360   SDValue C = N->getOperand(2);
25361
25362   bool NegA = (A.getOpcode() == ISD::FNEG);
25363   bool NegB = (B.getOpcode() == ISD::FNEG);
25364   bool NegC = (C.getOpcode() == ISD::FNEG);
25365
25366   // Negative multiplication when NegA xor NegB
25367   bool NegMul = (NegA != NegB);
25368   if (NegA)
25369     A = A.getOperand(0);
25370   if (NegB)
25371     B = B.getOperand(0);
25372   if (NegC)
25373     C = C.getOperand(0);
25374
25375   unsigned Opcode;
25376   if (!NegMul)
25377     Opcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB;
25378   else
25379     Opcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;
25380
25381   return DAG.getNode(Opcode, dl, VT, A, B, C);
25382 }
25383
25384 static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG,
25385                                   TargetLowering::DAGCombinerInfo &DCI,
25386                                   const X86Subtarget *Subtarget) {
25387   // (i32 zext (and (i8  x86isd::setcc_carry), 1)) ->
25388   //           (and (i32 x86isd::setcc_carry), 1)
25389   // This eliminates the zext. This transformation is necessary because
25390   // ISD::SETCC is always legalized to i8.
25391   SDLoc dl(N);
25392   SDValue N0 = N->getOperand(0);
25393   EVT VT = N->getValueType(0);
25394
25395   if (N0.getOpcode() == ISD::AND &&
25396       N0.hasOneUse() &&
25397       N0.getOperand(0).hasOneUse()) {
25398     SDValue N00 = N0.getOperand(0);
25399     if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
25400       ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
25401       if (!C || C->getZExtValue() != 1)
25402         return SDValue();
25403       return DAG.getNode(ISD::AND, dl, VT,
25404                          DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
25405                                      N00.getOperand(0), N00.getOperand(1)),
25406                          DAG.getConstant(1, VT));
25407     }
25408   }
25409
25410   if (N0.getOpcode() == ISD::TRUNCATE &&
25411       N0.hasOneUse() &&
25412       N0.getOperand(0).hasOneUse()) {
25413     SDValue N00 = N0.getOperand(0);
25414     if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
25415       return DAG.getNode(ISD::AND, dl, VT,
25416                          DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
25417                                      N00.getOperand(0), N00.getOperand(1)),
25418                          DAG.getConstant(1, VT));
25419     }
25420   }
25421   if (VT.is256BitVector()) {
25422     SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget);
25423     if (R.getNode())
25424       return R;
25425   }
25426
25427   // (i8,i32 zext (udivrem (i8 x, i8 y)) ->
25428   // (i8,i32 (udivrem_zext_hreg (i8 x, i8 y)
25429   // This exposes the zext to the udivrem lowering, so that it directly extends
25430   // from AH (which we otherwise need to do contortions to access).
25431   if (N0.getOpcode() == ISD::UDIVREM &&
25432       N0.getResNo() == 1 && N0.getValueType() == MVT::i8 &&
25433       (VT == MVT::i32 || VT == MVT::i64)) {
25434     SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
25435     SDValue R = DAG.getNode(X86ISD::UDIVREM8_ZEXT_HREG, dl, NodeTys,
25436                             N0.getOperand(0), N0.getOperand(1));
25437     DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
25438     return R.getValue(1);
25439   }
25440
25441   return SDValue();
25442 }
25443
25444 // Optimize x == -y --> x+y == 0
25445 //          x != -y --> x+y != 0
25446 static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG,
25447                                       const X86Subtarget* Subtarget) {
25448   ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
25449   SDValue LHS = N->getOperand(0);
25450   SDValue RHS = N->getOperand(1);
25451   EVT VT = N->getValueType(0);
25452   SDLoc DL(N);
25453
25454   if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB)
25455     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(LHS.getOperand(0)))
25456       if (C->getAPIntValue() == 0 && LHS.hasOneUse()) {
25457         SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N),
25458                                    LHS.getValueType(), RHS, LHS.getOperand(1));
25459         return DAG.getSetCC(SDLoc(N), N->getValueType(0),
25460                             addV, DAG.getConstant(0, addV.getValueType()), CC);
25461       }
25462   if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB)
25463     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS.getOperand(0)))
25464       if (C->getAPIntValue() == 0 && RHS.hasOneUse()) {
25465         SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N),
25466                                    RHS.getValueType(), LHS, RHS.getOperand(1));
25467         return DAG.getSetCC(SDLoc(N), N->getValueType(0),
25468                             addV, DAG.getConstant(0, addV.getValueType()), CC);
25469       }
25470
25471   if (VT.getScalarType() == MVT::i1) {
25472     bool IsSEXT0 = (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
25473       (LHS.getOperand(0).getValueType().getScalarType() ==  MVT::i1);
25474     bool IsVZero0 = ISD::isBuildVectorAllZeros(LHS.getNode());
25475     if (!IsSEXT0 && !IsVZero0)
25476       return SDValue();
25477     bool IsSEXT1 = (RHS.getOpcode() == ISD::SIGN_EXTEND) &&
25478       (RHS.getOperand(0).getValueType().getScalarType() ==  MVT::i1);
25479     bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
25480
25481     if (!IsSEXT1 && !IsVZero1)
25482       return SDValue();
25483
25484     if (IsSEXT0 && IsVZero1) {
25485       assert(VT == LHS.getOperand(0).getValueType() && "Uexpected operand type");
25486       if (CC == ISD::SETEQ)
25487         return DAG.getNOT(DL, LHS.getOperand(0), VT);
25488       return LHS.getOperand(0);
25489     }
25490     if (IsSEXT1 && IsVZero0) {
25491       assert(VT == RHS.getOperand(0).getValueType() && "Uexpected operand type");
25492       if (CC == ISD::SETEQ)
25493         return DAG.getNOT(DL, RHS.getOperand(0), VT);
25494       return RHS.getOperand(0);
25495     }
25496   }
25497
25498   return SDValue();
25499 }
25500
25501 static SDValue PerformINSERTPSCombine(SDNode *N, SelectionDAG &DAG,
25502                                       const X86Subtarget *Subtarget) {
25503   SDLoc dl(N);
25504   MVT VT = N->getOperand(1)->getSimpleValueType(0);
25505   assert((VT == MVT::v4f32 || VT == MVT::v4i32) &&
25506          "X86insertps is only defined for v4x32");
25507
25508   SDValue Ld = N->getOperand(1);
25509   if (MayFoldLoad(Ld)) {
25510     // Extract the countS bits from the immediate so we can get the proper
25511     // address when narrowing the vector load to a specific element.
25512     // When the second source op is a memory address, interps doesn't use
25513     // countS and just gets an f32 from that address.
25514     unsigned DestIndex =
25515         cast<ConstantSDNode>(N->getOperand(2))->getZExtValue() >> 6;
25516     Ld = NarrowVectorLoadToElement(cast<LoadSDNode>(Ld), DestIndex, DAG);
25517   } else
25518     return SDValue();
25519
25520   // Create this as a scalar to vector to match the instruction pattern.
25521   SDValue LoadScalarToVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Ld);
25522   // countS bits are ignored when loading from memory on insertps, which
25523   // means we don't need to explicitly set them to 0.
25524   return DAG.getNode(X86ISD::INSERTPS, dl, VT, N->getOperand(0),
25525                      LoadScalarToVector, N->getOperand(2));
25526 }
25527
25528 // Helper function of PerformSETCCCombine. It is to materialize "setb reg"
25529 // as "sbb reg,reg", since it can be extended without zext and produces
25530 // an all-ones bit which is more useful than 0/1 in some cases.
25531 static SDValue MaterializeSETB(SDLoc DL, SDValue EFLAGS, SelectionDAG &DAG,
25532                                MVT VT) {
25533   if (VT == MVT::i8)
25534     return DAG.getNode(ISD::AND, DL, VT,
25535                        DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
25536                                    DAG.getConstant(X86::COND_B, MVT::i8), EFLAGS),
25537                        DAG.getConstant(1, VT));
25538   assert (VT == MVT::i1 && "Unexpected type for SECCC node");
25539   return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1,
25540                      DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
25541                                  DAG.getConstant(X86::COND_B, MVT::i8), EFLAGS));
25542 }
25543
25544 // Optimize  RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
25545 static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG,
25546                                    TargetLowering::DAGCombinerInfo &DCI,
25547                                    const X86Subtarget *Subtarget) {
25548   SDLoc DL(N);
25549   X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
25550   SDValue EFLAGS = N->getOperand(1);
25551
25552   if (CC == X86::COND_A) {
25553     // Try to convert COND_A into COND_B in an attempt to facilitate
25554     // materializing "setb reg".
25555     //
25556     // Do not flip "e > c", where "c" is a constant, because Cmp instruction
25557     // cannot take an immediate as its first operand.
25558     //
25559     if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
25560         EFLAGS.getValueType().isInteger() &&
25561         !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
25562       SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
25563                                    EFLAGS.getNode()->getVTList(),
25564                                    EFLAGS.getOperand(1), EFLAGS.getOperand(0));
25565       SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
25566       return MaterializeSETB(DL, NewEFLAGS, DAG, N->getSimpleValueType(0));
25567     }
25568   }
25569
25570   // Materialize "setb reg" as "sbb reg,reg", since it can be extended without
25571   // a zext and produces an all-ones bit which is more useful than 0/1 in some
25572   // cases.
25573   if (CC == X86::COND_B)
25574     return MaterializeSETB(DL, EFLAGS, DAG, N->getSimpleValueType(0));
25575
25576   SDValue Flags;
25577
25578   Flags = checkBoolTestSetCCCombine(EFLAGS, CC);
25579   if (Flags.getNode()) {
25580     SDValue Cond = DAG.getConstant(CC, MVT::i8);
25581     return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags);
25582   }
25583
25584   return SDValue();
25585 }
25586
25587 // Optimize branch condition evaluation.
25588 //
25589 static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG,
25590                                     TargetLowering::DAGCombinerInfo &DCI,
25591                                     const X86Subtarget *Subtarget) {
25592   SDLoc DL(N);
25593   SDValue Chain = N->getOperand(0);
25594   SDValue Dest = N->getOperand(1);
25595   SDValue EFLAGS = N->getOperand(3);
25596   X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
25597
25598   SDValue Flags;
25599
25600   Flags = checkBoolTestSetCCCombine(EFLAGS, CC);
25601   if (Flags.getNode()) {
25602     SDValue Cond = DAG.getConstant(CC, MVT::i8);
25603     return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), Chain, Dest, Cond,
25604                        Flags);
25605   }
25606
25607   return SDValue();
25608 }
25609
25610 static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
25611                                                          SelectionDAG &DAG) {
25612   // Take advantage of vector comparisons producing 0 or -1 in each lane to
25613   // optimize away operation when it's from a constant.
25614   //
25615   // The general transformation is:
25616   //    UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
25617   //       AND(VECTOR_CMP(x,y), constant2)
25618   //    constant2 = UNARYOP(constant)
25619
25620   // Early exit if this isn't a vector operation, the operand of the
25621   // unary operation isn't a bitwise AND, or if the sizes of the operations
25622   // aren't the same.
25623   EVT VT = N->getValueType(0);
25624   if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
25625       N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
25626       VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
25627     return SDValue();
25628
25629   // Now check that the other operand of the AND is a constant. We could
25630   // make the transformation for non-constant splats as well, but it's unclear
25631   // that would be a benefit as it would not eliminate any operations, just
25632   // perform one more step in scalar code before moving to the vector unit.
25633   if (BuildVectorSDNode *BV =
25634           dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
25635     // Bail out if the vector isn't a constant.
25636     if (!BV->isConstant())
25637       return SDValue();
25638
25639     // Everything checks out. Build up the new and improved node.
25640     SDLoc DL(N);
25641     EVT IntVT = BV->getValueType(0);
25642     // Create a new constant of the appropriate type for the transformed
25643     // DAG.
25644     SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
25645     // The AND node needs bitcasts to/from an integer vector type around it.
25646     SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
25647     SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
25648                                  N->getOperand(0)->getOperand(0), MaskConst);
25649     SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
25650     return Res;
25651   }
25652
25653   return SDValue();
25654 }
25655
25656 static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
25657                                         const X86TargetLowering *XTLI) {
25658   // First try to optimize away the conversion entirely when it's
25659   // conditionally from a constant. Vectors only.
25660   SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG);
25661   if (Res != SDValue())
25662     return Res;
25663
25664   // Now move on to more general possibilities.
25665   SDValue Op0 = N->getOperand(0);
25666   EVT InVT = Op0->getValueType(0);
25667
25668   // SINT_TO_FP(v4i8) -> SINT_TO_FP(SEXT(v4i8 to v4i32))
25669   if (InVT == MVT::v8i8 || InVT == MVT::v4i8) {
25670     SDLoc dl(N);
25671     MVT DstVT = InVT == MVT::v4i8 ? MVT::v4i32 : MVT::v8i32;
25672     SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
25673     return DAG.getNode(ISD::SINT_TO_FP, dl, N->getValueType(0), P);
25674   }
25675
25676   // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
25677   // a 32-bit target where SSE doesn't support i64->FP operations.
25678   if (Op0.getOpcode() == ISD::LOAD) {
25679     LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
25680     EVT VT = Ld->getValueType(0);
25681     if (!Ld->isVolatile() && !N->getValueType(0).isVector() &&
25682         ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
25683         !XTLI->getSubtarget()->is64Bit() &&
25684         VT == MVT::i64) {
25685       SDValue FILDChain = XTLI->BuildFILD(SDValue(N, 0), Ld->getValueType(0),
25686                                           Ld->getChain(), Op0, DAG);
25687       DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
25688       return FILDChain;
25689     }
25690   }
25691   return SDValue();
25692 }
25693
25694 // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
25695 static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG,
25696                                  X86TargetLowering::DAGCombinerInfo &DCI) {
25697   // If the LHS and RHS of the ADC node are zero, then it can't overflow and
25698   // the result is either zero or one (depending on the input carry bit).
25699   // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
25700   if (X86::isZeroNode(N->getOperand(0)) &&
25701       X86::isZeroNode(N->getOperand(1)) &&
25702       // We don't have a good way to replace an EFLAGS use, so only do this when
25703       // dead right now.
25704       SDValue(N, 1).use_empty()) {
25705     SDLoc DL(N);
25706     EVT VT = N->getValueType(0);
25707     SDValue CarryOut = DAG.getConstant(0, N->getValueType(1));
25708     SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
25709                                DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
25710                                            DAG.getConstant(X86::COND_B,MVT::i8),
25711                                            N->getOperand(2)),
25712                                DAG.getConstant(1, VT));
25713     return DCI.CombineTo(N, Res1, CarryOut);
25714   }
25715
25716   return SDValue();
25717 }
25718
25719 // fold (add Y, (sete  X, 0)) -> adc  0, Y
25720 //      (add Y, (setne X, 0)) -> sbb -1, Y
25721 //      (sub (sete  X, 0), Y) -> sbb  0, Y
25722 //      (sub (setne X, 0), Y) -> adc -1, Y
25723 static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) {
25724   SDLoc DL(N);
25725
25726   // Look through ZExts.
25727   SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 1 : 0);
25728   if (Ext.getOpcode() != ISD::ZERO_EXTEND || !Ext.hasOneUse())
25729     return SDValue();
25730
25731   SDValue SetCC = Ext.getOperand(0);
25732   if (SetCC.getOpcode() != X86ISD::SETCC || !SetCC.hasOneUse())
25733     return SDValue();
25734
25735   X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0);
25736   if (CC != X86::COND_E && CC != X86::COND_NE)
25737     return SDValue();
25738
25739   SDValue Cmp = SetCC.getOperand(1);
25740   if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
25741       !X86::isZeroNode(Cmp.getOperand(1)) ||
25742       !Cmp.getOperand(0).getValueType().isInteger())
25743     return SDValue();
25744
25745   SDValue CmpOp0 = Cmp.getOperand(0);
25746   SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0,
25747                                DAG.getConstant(1, CmpOp0.getValueType()));
25748
25749   SDValue OtherVal = N->getOperand(N->getOpcode() == ISD::SUB ? 0 : 1);
25750   if (CC == X86::COND_NE)
25751     return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::ADC : X86ISD::SBB,
25752                        DL, OtherVal.getValueType(), OtherVal,
25753                        DAG.getConstant(-1ULL, OtherVal.getValueType()), NewCmp);
25754   return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::SBB : X86ISD::ADC,
25755                      DL, OtherVal.getValueType(), OtherVal,
25756                      DAG.getConstant(0, OtherVal.getValueType()), NewCmp);
25757 }
25758
25759 /// PerformADDCombine - Do target-specific dag combines on integer adds.
25760 static SDValue PerformAddCombine(SDNode *N, SelectionDAG &DAG,
25761                                  const X86Subtarget *Subtarget) {
25762   EVT VT = N->getValueType(0);
25763   SDValue Op0 = N->getOperand(0);
25764   SDValue Op1 = N->getOperand(1);
25765
25766   // Try to synthesize horizontal adds from adds of shuffles.
25767   if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
25768        (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
25769       isHorizontalBinOp(Op0, Op1, true))
25770     return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);
25771
25772   return OptimizeConditionalInDecrement(N, DAG);
25773 }
25774
25775 static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG,
25776                                  const X86Subtarget *Subtarget) {
25777   SDValue Op0 = N->getOperand(0);
25778   SDValue Op1 = N->getOperand(1);
25779
25780   // X86 can't encode an immediate LHS of a sub. See if we can push the
25781   // negation into a preceding instruction.
25782   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
25783     // If the RHS of the sub is a XOR with one use and a constant, invert the
25784     // immediate. Then add one to the LHS of the sub so we can turn
25785     // X-Y -> X+~Y+1, saving one register.
25786     if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
25787         isa<ConstantSDNode>(Op1.getOperand(1))) {
25788       APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
25789       EVT VT = Op0.getValueType();
25790       SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
25791                                    Op1.getOperand(0),
25792                                    DAG.getConstant(~XorC, VT));
25793       return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
25794                          DAG.getConstant(C->getAPIntValue()+1, VT));
25795     }
25796   }
25797
25798   // Try to synthesize horizontal adds from adds of shuffles.
25799   EVT VT = N->getValueType(0);
25800   if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
25801        (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
25802       isHorizontalBinOp(Op0, Op1, true))
25803     return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);
25804
25805   return OptimizeConditionalInDecrement(N, DAG);
25806 }
25807
25808 /// performVZEXTCombine - Performs build vector combines
25809 static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG,
25810                                    TargetLowering::DAGCombinerInfo &DCI,
25811                                    const X86Subtarget *Subtarget) {
25812   SDLoc DL(N);
25813   MVT VT = N->getSimpleValueType(0);
25814   SDValue Op = N->getOperand(0);
25815   MVT OpVT = Op.getSimpleValueType();
25816   MVT OpEltVT = OpVT.getVectorElementType();
25817   unsigned InputBits = OpEltVT.getSizeInBits() * VT.getVectorNumElements();
25818
25819   // (vzext (bitcast (vzext (x)) -> (vzext x)
25820   SDValue V = Op;
25821   while (V.getOpcode() == ISD::BITCAST)
25822     V = V.getOperand(0);
25823
25824   if (V != Op && V.getOpcode() == X86ISD::VZEXT) {
25825     MVT InnerVT = V.getSimpleValueType();
25826     MVT InnerEltVT = InnerVT.getVectorElementType();
25827
25828     // If the element sizes match exactly, we can just do one larger vzext. This
25829     // is always an exact type match as vzext operates on integer types.
25830     if (OpEltVT == InnerEltVT) {
25831       assert(OpVT == InnerVT && "Types must match for vzext!");
25832       return DAG.getNode(X86ISD::VZEXT, DL, VT, V.getOperand(0));
25833     }
25834
25835     // The only other way we can combine them is if only a single element of the
25836     // inner vzext is used in the input to the outer vzext.
25837     if (InnerEltVT.getSizeInBits() < InputBits)
25838       return SDValue();
25839
25840     // In this case, the inner vzext is completely dead because we're going to
25841     // only look at bits inside of the low element. Just do the outer vzext on
25842     // a bitcast of the input to the inner.
25843     return DAG.getNode(X86ISD::VZEXT, DL, VT,
25844                        DAG.getNode(ISD::BITCAST, DL, OpVT, V));
25845   }
25846
25847   // Check if we can bypass extracting and re-inserting an element of an input
25848   // vector. Essentialy:
25849   // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
25850   if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
25851       V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
25852       V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) {
25853     SDValue ExtractedV = V.getOperand(0);
25854     SDValue OrigV = ExtractedV.getOperand(0);
25855     if (auto *ExtractIdx = dyn_cast<ConstantSDNode>(ExtractedV.getOperand(1)))
25856       if (ExtractIdx->getZExtValue() == 0) {
25857         MVT OrigVT = OrigV.getSimpleValueType();
25858         // Extract a subvector if necessary...
25859         if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) {
25860           int Ratio = OrigVT.getSizeInBits() / OpVT.getSizeInBits();
25861           OrigVT = MVT::getVectorVT(OrigVT.getVectorElementType(),
25862                                     OrigVT.getVectorNumElements() / Ratio);
25863           OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV,
25864                               DAG.getIntPtrConstant(0));
25865         }
25866         Op = DAG.getNode(ISD::BITCAST, DL, OpVT, OrigV);
25867         return DAG.getNode(X86ISD::VZEXT, DL, VT, Op);
25868       }
25869   }
25870
25871   return SDValue();
25872 }
25873
25874 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
25875                                              DAGCombinerInfo &DCI) const {
25876   SelectionDAG &DAG = DCI.DAG;
25877   switch (N->getOpcode()) {
25878   default: break;
25879   case ISD::EXTRACT_VECTOR_ELT:
25880     return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, DCI);
25881   case ISD::VSELECT:
25882   case ISD::SELECT:
25883   case X86ISD::SHRUNKBLEND:
25884     return PerformSELECTCombine(N, DAG, DCI, Subtarget);
25885   case X86ISD::CMOV:        return PerformCMOVCombine(N, DAG, DCI, Subtarget);
25886   case ISD::ADD:            return PerformAddCombine(N, DAG, Subtarget);
25887   case ISD::SUB:            return PerformSubCombine(N, DAG, Subtarget);
25888   case X86ISD::ADC:         return PerformADCCombine(N, DAG, DCI);
25889   case ISD::MUL:            return PerformMulCombine(N, DAG, DCI);
25890   case ISD::SHL:
25891   case ISD::SRA:
25892   case ISD::SRL:            return PerformShiftCombine(N, DAG, DCI, Subtarget);
25893   case ISD::AND:            return PerformAndCombine(N, DAG, DCI, Subtarget);
25894   case ISD::OR:             return PerformOrCombine(N, DAG, DCI, Subtarget);
25895   case ISD::XOR:            return PerformXorCombine(N, DAG, DCI, Subtarget);
25896   case ISD::LOAD:           return PerformLOADCombine(N, DAG, DCI, Subtarget);
25897   case ISD::STORE:          return PerformSTORECombine(N, DAG, Subtarget);
25898   case ISD::SINT_TO_FP:     return PerformSINT_TO_FPCombine(N, DAG, this);
25899   case ISD::FADD:           return PerformFADDCombine(N, DAG, Subtarget);
25900   case ISD::FSUB:           return PerformFSUBCombine(N, DAG, Subtarget);
25901   case X86ISD::FXOR:
25902   case X86ISD::FOR:         return PerformFORCombine(N, DAG);
25903   case X86ISD::FMIN:
25904   case X86ISD::FMAX:        return PerformFMinFMaxCombine(N, DAG);
25905   case X86ISD::FAND:        return PerformFANDCombine(N, DAG);
25906   case X86ISD::FANDN:       return PerformFANDNCombine(N, DAG);
25907   case X86ISD::BT:          return PerformBTCombine(N, DAG, DCI);
25908   case X86ISD::VZEXT_MOVL:  return PerformVZEXT_MOVLCombine(N, DAG);
25909   case ISD::ANY_EXTEND:
25910   case ISD::ZERO_EXTEND:    return PerformZExtCombine(N, DAG, DCI, Subtarget);
25911   case ISD::SIGN_EXTEND:    return PerformSExtCombine(N, DAG, DCI, Subtarget);
25912   case ISD::SIGN_EXTEND_INREG:
25913     return PerformSIGN_EXTEND_INREGCombine(N, DAG, Subtarget);
25914   case ISD::TRUNCATE:       return PerformTruncateCombine(N, DAG,DCI,Subtarget);
25915   case ISD::SETCC:          return PerformISDSETCCCombine(N, DAG, Subtarget);
25916   case X86ISD::SETCC:       return PerformSETCCCombine(N, DAG, DCI, Subtarget);
25917   case X86ISD::BRCOND:      return PerformBrCondCombine(N, DAG, DCI, Subtarget);
25918   case X86ISD::VZEXT:       return performVZEXTCombine(N, DAG, DCI, Subtarget);
25919   case X86ISD::SHUFP:       // Handle all target specific shuffles
25920   case X86ISD::PALIGNR:
25921   case X86ISD::UNPCKH:
25922   case X86ISD::UNPCKL:
25923   case X86ISD::MOVHLPS:
25924   case X86ISD::MOVLHPS:
25925   case X86ISD::PSHUFB:
25926   case X86ISD::PSHUFD:
25927   case X86ISD::PSHUFHW:
25928   case X86ISD::PSHUFLW:
25929   case X86ISD::MOVSS:
25930   case X86ISD::MOVSD:
25931   case X86ISD::VPERMILPI:
25932   case X86ISD::VPERM2X128:
25933   case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget);
25934   case ISD::FMA:            return PerformFMACombine(N, DAG, Subtarget);
25935   case ISD::INTRINSIC_WO_CHAIN:
25936     return PerformINTRINSIC_WO_CHAINCombine(N, DAG, Subtarget);
25937   case X86ISD::INSERTPS: {
25938     if (getTargetMachine().getOptLevel() > CodeGenOpt::None)
25939       return PerformINSERTPSCombine(N, DAG, Subtarget);
25940     break;
25941   }
25942   case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DAG, Subtarget);
25943   }
25944
25945   return SDValue();
25946 }
25947
25948 /// isTypeDesirableForOp - Return true if the target has native support for
25949 /// the specified value type and it is 'desirable' to use the type for the
25950 /// given node type. e.g. On x86 i16 is legal, but undesirable since i16
25951 /// instruction encodings are longer and some i16 instructions are slow.
25952 bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
25953   if (!isTypeLegal(VT))
25954     return false;
25955   if (VT != MVT::i16)
25956     return true;
25957
25958   switch (Opc) {
25959   default:
25960     return true;
25961   case ISD::LOAD:
25962   case ISD::SIGN_EXTEND:
25963   case ISD::ZERO_EXTEND:
25964   case ISD::ANY_EXTEND:
25965   case ISD::SHL:
25966   case ISD::SRL:
25967   case ISD::SUB:
25968   case ISD::ADD:
25969   case ISD::MUL:
25970   case ISD::AND:
25971   case ISD::OR:
25972   case ISD::XOR:
25973     return false;
25974   }
25975 }
25976
25977 /// IsDesirableToPromoteOp - This method query the target whether it is
25978 /// beneficial for dag combiner to promote the specified node. If true, it
25979 /// should return the desired promotion type by reference.
25980 bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
25981   EVT VT = Op.getValueType();
25982   if (VT != MVT::i16)
25983     return false;
25984
25985   bool Promote = false;
25986   bool Commute = false;
25987   switch (Op.getOpcode()) {
25988   default: break;
25989   case ISD::LOAD: {
25990     LoadSDNode *LD = cast<LoadSDNode>(Op);
25991     // If the non-extending load has a single use and it's not live out, then it
25992     // might be folded.
25993     if (LD->getExtensionType() == ISD::NON_EXTLOAD /*&&
25994                                                      Op.hasOneUse()*/) {
25995       for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
25996              UE = Op.getNode()->use_end(); UI != UE; ++UI) {
25997         // The only case where we'd want to promote LOAD (rather then it being
25998         // promoted as an operand is when it's only use is liveout.
25999         if (UI->getOpcode() != ISD::CopyToReg)
26000           return false;
26001       }
26002     }
26003     Promote = true;
26004     break;
26005   }
26006   case ISD::SIGN_EXTEND:
26007   case ISD::ZERO_EXTEND:
26008   case ISD::ANY_EXTEND:
26009     Promote = true;
26010     break;
26011   case ISD::SHL:
26012   case ISD::SRL: {
26013     SDValue N0 = Op.getOperand(0);
26014     // Look out for (store (shl (load), x)).
26015     if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
26016       return false;
26017     Promote = true;
26018     break;
26019   }
26020   case ISD::ADD:
26021   case ISD::MUL:
26022   case ISD::AND:
26023   case ISD::OR:
26024   case ISD::XOR:
26025     Commute = true;
26026     // fallthrough
26027   case ISD::SUB: {
26028     SDValue N0 = Op.getOperand(0);
26029     SDValue N1 = Op.getOperand(1);
26030     if (!Commute && MayFoldLoad(N1))
26031       return false;
26032     // Avoid disabling potential load folding opportunities.
26033     if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
26034       return false;
26035     if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
26036       return false;
26037     Promote = true;
26038   }
26039   }
26040
26041   PVT = MVT::i32;
26042   return Promote;
26043 }
26044
26045 //===----------------------------------------------------------------------===//
26046 //                           X86 Inline Assembly Support
26047 //===----------------------------------------------------------------------===//
26048
26049 namespace {
26050   // Helper to match a string separated by whitespace.
26051   bool matchAsmImpl(StringRef s, ArrayRef<const StringRef *> args) {
26052     s = s.substr(s.find_first_not_of(" \t")); // Skip leading whitespace.
26053
26054     for (unsigned i = 0, e = args.size(); i != e; ++i) {
26055       StringRef piece(*args[i]);
26056       if (!s.startswith(piece)) // Check if the piece matches.
26057         return false;
26058
26059       s = s.substr(piece.size());
26060       StringRef::size_type pos = s.find_first_not_of(" \t");
26061       if (pos == 0) // We matched a prefix.
26062         return false;
26063
26064       s = s.substr(pos);
26065     }
26066
26067     return s.empty();
26068   }
26069   const VariadicFunction1<bool, StringRef, StringRef, matchAsmImpl> matchAsm={};
26070 }
26071
26072 static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
26073
26074   if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
26075     if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
26076         std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
26077         std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {
26078
26079       if (AsmPieces.size() == 3)
26080         return true;
26081       else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
26082         return true;
26083     }
26084   }
26085   return false;
26086 }
26087
26088 bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
26089   InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
26090
26091   std::string AsmStr = IA->getAsmString();
26092
26093   IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
26094   if (!Ty || Ty->getBitWidth() % 16 != 0)
26095     return false;
26096
26097   // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
26098   SmallVector<StringRef, 4> AsmPieces;
26099   SplitString(AsmStr, AsmPieces, ";\n");
26100
26101   switch (AsmPieces.size()) {
26102   default: return false;
26103   case 1:
26104     // FIXME: this should verify that we are targeting a 486 or better.  If not,
26105     // we will turn this bswap into something that will be lowered to logical
26106     // ops instead of emitting the bswap asm.  For now, we don't support 486 or
26107     // lower so don't worry about this.
26108     // bswap $0
26109     if (matchAsm(AsmPieces[0], "bswap", "$0") ||
26110         matchAsm(AsmPieces[0], "bswapl", "$0") ||
26111         matchAsm(AsmPieces[0], "bswapq", "$0") ||
26112         matchAsm(AsmPieces[0], "bswap", "${0:q}") ||
26113         matchAsm(AsmPieces[0], "bswapl", "${0:q}") ||
26114         matchAsm(AsmPieces[0], "bswapq", "${0:q}")) {
26115       // No need to check constraints, nothing other than the equivalent of
26116       // "=r,0" would be valid here.
26117       return IntrinsicLowering::LowerToByteSwap(CI);
26118     }
26119
26120     // rorw $$8, ${0:w}  -->  llvm.bswap.i16
26121     if (CI->getType()->isIntegerTy(16) &&
26122         IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
26123         (matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") ||
26124          matchAsm(AsmPieces[0], "rolw", "$$8,", "${0:w}"))) {
26125       AsmPieces.clear();
26126       const std::string &ConstraintsStr = IA->getConstraintString();
26127       SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
26128       array_pod_sort(AsmPieces.begin(), AsmPieces.end());
26129       if (clobbersFlagRegisters(AsmPieces))
26130         return IntrinsicLowering::LowerToByteSwap(CI);
26131     }
26132     break;
26133   case 3:
26134     if (CI->getType()->isIntegerTy(32) &&
26135         IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
26136         matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") &&
26137         matchAsm(AsmPieces[1], "rorl", "$$16,", "$0") &&
26138         matchAsm(AsmPieces[2], "rorw", "$$8,", "${0:w}")) {
26139       AsmPieces.clear();
26140       const std::string &ConstraintsStr = IA->getConstraintString();
26141       SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
26142       array_pod_sort(AsmPieces.begin(), AsmPieces.end());
26143       if (clobbersFlagRegisters(AsmPieces))
26144         return IntrinsicLowering::LowerToByteSwap(CI);
26145     }
26146
26147     if (CI->getType()->isIntegerTy(64)) {
26148       InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
26149       if (Constraints.size() >= 2 &&
26150           Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
26151           Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
26152         // bswap %eax / bswap %edx / xchgl %eax, %edx  -> llvm.bswap.i64
26153         if (matchAsm(AsmPieces[0], "bswap", "%eax") &&
26154             matchAsm(AsmPieces[1], "bswap", "%edx") &&
26155             matchAsm(AsmPieces[2], "xchgl", "%eax,", "%edx"))
26156           return IntrinsicLowering::LowerToByteSwap(CI);
26157       }
26158     }
26159     break;
26160   }
26161   return false;
26162 }
26163
26164 /// getConstraintType - Given a constraint letter, return the type of
26165 /// constraint it is for this target.
26166 X86TargetLowering::ConstraintType
26167 X86TargetLowering::getConstraintType(const std::string &Constraint) const {
26168   if (Constraint.size() == 1) {
26169     switch (Constraint[0]) {
26170     case 'R':
26171     case 'q':
26172     case 'Q':
26173     case 'f':
26174     case 't':
26175     case 'u':
26176     case 'y':
26177     case 'x':
26178     case 'Y':
26179     case 'l':
26180       return C_RegisterClass;
26181     case 'a':
26182     case 'b':
26183     case 'c':
26184     case 'd':
26185     case 'S':
26186     case 'D':
26187     case 'A':
26188       return C_Register;
26189     case 'I':
26190     case 'J':
26191     case 'K':
26192     case 'L':
26193     case 'M':
26194     case 'N':
26195     case 'G':
26196     case 'C':
26197     case 'e':
26198     case 'Z':
26199       return C_Other;
26200     default:
26201       break;
26202     }
26203   }
26204   return TargetLowering::getConstraintType(Constraint);
26205 }
26206
26207 /// Examine constraint type and operand type and determine a weight value.
26208 /// This object must already have been set up with the operand type
26209 /// and the current alternative constraint selected.
26210 TargetLowering::ConstraintWeight
26211   X86TargetLowering::getSingleConstraintMatchWeight(
26212     AsmOperandInfo &info, const char *constraint) const {
26213   ConstraintWeight weight = CW_Invalid;
26214   Value *CallOperandVal = info.CallOperandVal;
26215     // If we don't have a value, we can't do a match,
26216     // but allow it at the lowest weight.
26217   if (!CallOperandVal)
26218     return CW_Default;
26219   Type *type = CallOperandVal->getType();
26220   // Look at the constraint type.
26221   switch (*constraint) {
26222   default:
26223     weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
26224   case 'R':
26225   case 'q':
26226   case 'Q':
26227   case 'a':
26228   case 'b':
26229   case 'c':
26230   case 'd':
26231   case 'S':
26232   case 'D':
26233   case 'A':
26234     if (CallOperandVal->getType()->isIntegerTy())
26235       weight = CW_SpecificReg;
26236     break;
26237   case 'f':
26238   case 't':
26239   case 'u':
26240     if (type->isFloatingPointTy())
26241       weight = CW_SpecificReg;
26242     break;
26243   case 'y':
26244     if (type->isX86_MMXTy() && Subtarget->hasMMX())
26245       weight = CW_SpecificReg;
26246     break;
26247   case 'x':
26248   case 'Y':
26249     if (((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasSSE1()) ||
26250         ((type->getPrimitiveSizeInBits() == 256) && Subtarget->hasFp256()))
26251       weight = CW_Register;
26252     break;
26253   case 'I':
26254     if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
26255       if (C->getZExtValue() <= 31)
26256         weight = CW_Constant;
26257     }
26258     break;
26259   case 'J':
26260     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26261       if (C->getZExtValue() <= 63)
26262         weight = CW_Constant;
26263     }
26264     break;
26265   case 'K':
26266     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26267       if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
26268         weight = CW_Constant;
26269     }
26270     break;
26271   case 'L':
26272     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26273       if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
26274         weight = CW_Constant;
26275     }
26276     break;
26277   case 'M':
26278     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26279       if (C->getZExtValue() <= 3)
26280         weight = CW_Constant;
26281     }
26282     break;
26283   case 'N':
26284     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26285       if (C->getZExtValue() <= 0xff)
26286         weight = CW_Constant;
26287     }
26288     break;
26289   case 'G':
26290   case 'C':
26291     if (dyn_cast<ConstantFP>(CallOperandVal)) {
26292       weight = CW_Constant;
26293     }
26294     break;
26295   case 'e':
26296     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26297       if ((C->getSExtValue() >= -0x80000000LL) &&
26298           (C->getSExtValue() <= 0x7fffffffLL))
26299         weight = CW_Constant;
26300     }
26301     break;
26302   case 'Z':
26303     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26304       if (C->getZExtValue() <= 0xffffffff)
26305         weight = CW_Constant;
26306     }
26307     break;
26308   }
26309   return weight;
26310 }
26311
26312 /// LowerXConstraint - try to replace an X constraint, which matches anything,
26313 /// with another that has more specific requirements based on the type of the
26314 /// corresponding operand.
26315 const char *X86TargetLowering::
26316 LowerXConstraint(EVT ConstraintVT) const {
26317   // FP X constraints get lowered to SSE1/2 registers if available, otherwise
26318   // 'f' like normal targets.
26319   if (ConstraintVT.isFloatingPoint()) {
26320     if (Subtarget->hasSSE2())
26321       return "Y";
26322     if (Subtarget->hasSSE1())
26323       return "x";
26324   }
26325
26326   return TargetLowering::LowerXConstraint(ConstraintVT);
26327 }
26328
26329 /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
26330 /// vector.  If it is invalid, don't add anything to Ops.
26331 void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
26332                                                      std::string &Constraint,
26333                                                      std::vector<SDValue>&Ops,
26334                                                      SelectionDAG &DAG) const {
26335   SDValue Result;
26336
26337   // Only support length 1 constraints for now.
26338   if (Constraint.length() > 1) return;
26339
26340   char ConstraintLetter = Constraint[0];
26341   switch (ConstraintLetter) {
26342   default: break;
26343   case 'I':
26344     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26345       if (C->getZExtValue() <= 31) {
26346         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26347         break;
26348       }
26349     }
26350     return;
26351   case 'J':
26352     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26353       if (C->getZExtValue() <= 63) {
26354         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26355         break;
26356       }
26357     }
26358     return;
26359   case 'K':
26360     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26361       if (isInt<8>(C->getSExtValue())) {
26362         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26363         break;
26364       }
26365     }
26366     return;
26367   case 'L':
26368     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26369       if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
26370           (Subtarget->is64Bit() && C->getZExtValue() == 0xffffffff)) {
26371         Result = DAG.getTargetConstant(C->getSExtValue(), Op.getValueType());
26372         break;
26373       }
26374     }
26375     return;
26376   case 'M':
26377     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26378       if (C->getZExtValue() <= 3) {
26379         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26380         break;
26381       }
26382     }
26383     return;
26384   case 'N':
26385     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26386       if (C->getZExtValue() <= 255) {
26387         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26388         break;
26389       }
26390     }
26391     return;
26392   case 'O':
26393     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26394       if (C->getZExtValue() <= 127) {
26395         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26396         break;
26397       }
26398     }
26399     return;
26400   case 'e': {
26401     // 32-bit signed value
26402     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26403       if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
26404                                            C->getSExtValue())) {
26405         // Widen to 64 bits here to get it sign extended.
26406         Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64);
26407         break;
26408       }
26409     // FIXME gcc accepts some relocatable values here too, but only in certain
26410     // memory models; it's complicated.
26411     }
26412     return;
26413   }
26414   case 'Z': {
26415     // 32-bit unsigned value
26416     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26417       if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
26418                                            C->getZExtValue())) {
26419         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26420         break;
26421       }
26422     }
26423     // FIXME gcc accepts some relocatable values here too, but only in certain
26424     // memory models; it's complicated.
26425     return;
26426   }
26427   case 'i': {
26428     // Literal immediates are always ok.
26429     if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
26430       // Widen to 64 bits here to get it sign extended.
26431       Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64);
26432       break;
26433     }
26434
26435     // In any sort of PIC mode addresses need to be computed at runtime by
26436     // adding in a register or some sort of table lookup.  These can't
26437     // be used as immediates.
26438     if (Subtarget->isPICStyleGOT() || Subtarget->isPICStyleStubPIC())
26439       return;
26440
26441     // If we are in non-pic codegen mode, we allow the address of a global (with
26442     // an optional displacement) to be used with 'i'.
26443     GlobalAddressSDNode *GA = nullptr;
26444     int64_t Offset = 0;
26445
26446     // Match either (GA), (GA+C), (GA+C1+C2), etc.
26447     while (1) {
26448       if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
26449         Offset += GA->getOffset();
26450         break;
26451       } else if (Op.getOpcode() == ISD::ADD) {
26452         if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
26453           Offset += C->getZExtValue();
26454           Op = Op.getOperand(0);
26455           continue;
26456         }
26457       } else if (Op.getOpcode() == ISD::SUB) {
26458         if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
26459           Offset += -C->getZExtValue();
26460           Op = Op.getOperand(0);
26461           continue;
26462         }
26463       }
26464
26465       // Otherwise, this isn't something we can handle, reject it.
26466       return;
26467     }
26468
26469     const GlobalValue *GV = GA->getGlobal();
26470     // If we require an extra load to get this address, as in PIC mode, we
26471     // can't accept it.
26472     if (isGlobalStubReference(
26473             Subtarget->ClassifyGlobalReference(GV, DAG.getTarget())))
26474       return;
26475
26476     Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
26477                                         GA->getValueType(0), Offset);
26478     break;
26479   }
26480   }
26481
26482   if (Result.getNode()) {
26483     Ops.push_back(Result);
26484     return;
26485   }
26486   return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
26487 }
26488
26489 std::pair<unsigned, const TargetRegisterClass*>
26490 X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
26491                                                 MVT VT) const {
26492   // First, see if this is a constraint that directly corresponds to an LLVM
26493   // register class.
26494   if (Constraint.size() == 1) {
26495     // GCC Constraint Letters
26496     switch (Constraint[0]) {
26497     default: break;
26498       // TODO: Slight differences here in allocation order and leaving
26499       // RIP in the class. Do they matter any more here than they do
26500       // in the normal allocation?
26501     case 'q':   // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
26502       if (Subtarget->is64Bit()) {
26503         if (VT == MVT::i32 || VT == MVT::f32)
26504           return std::make_pair(0U, &X86::GR32RegClass);
26505         if (VT == MVT::i16)
26506           return std::make_pair(0U, &X86::GR16RegClass);
26507         if (VT == MVT::i8 || VT == MVT::i1)
26508           return std::make_pair(0U, &X86::GR8RegClass);
26509         if (VT == MVT::i64 || VT == MVT::f64)
26510           return std::make_pair(0U, &X86::GR64RegClass);
26511         break;
26512       }
26513       // 32-bit fallthrough
26514     case 'Q':   // Q_REGS
26515       if (VT == MVT::i32 || VT == MVT::f32)
26516         return std::make_pair(0U, &X86::GR32_ABCDRegClass);
26517       if (VT == MVT::i16)
26518         return std::make_pair(0U, &X86::GR16_ABCDRegClass);
26519       if (VT == MVT::i8 || VT == MVT::i1)
26520         return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
26521       if (VT == MVT::i64)
26522         return std::make_pair(0U, &X86::GR64_ABCDRegClass);
26523       break;
26524     case 'r':   // GENERAL_REGS
26525     case 'l':   // INDEX_REGS
26526       if (VT == MVT::i8 || VT == MVT::i1)
26527         return std::make_pair(0U, &X86::GR8RegClass);
26528       if (VT == MVT::i16)
26529         return std::make_pair(0U, &X86::GR16RegClass);
26530       if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget->is64Bit())
26531         return std::make_pair(0U, &X86::GR32RegClass);
26532       return std::make_pair(0U, &X86::GR64RegClass);
26533     case 'R':   // LEGACY_REGS
26534       if (VT == MVT::i8 || VT == MVT::i1)
26535         return std::make_pair(0U, &X86::GR8_NOREXRegClass);
26536       if (VT == MVT::i16)
26537         return std::make_pair(0U, &X86::GR16_NOREXRegClass);
26538       if (VT == MVT::i32 || !Subtarget->is64Bit())
26539         return std::make_pair(0U, &X86::GR32_NOREXRegClass);
26540       return std::make_pair(0U, &X86::GR64_NOREXRegClass);
26541     case 'f':  // FP Stack registers.
26542       // If SSE is enabled for this VT, use f80 to ensure the isel moves the
26543       // value to the correct fpstack register class.
26544       if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
26545         return std::make_pair(0U, &X86::RFP32RegClass);
26546       if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
26547         return std::make_pair(0U, &X86::RFP64RegClass);
26548       return std::make_pair(0U, &X86::RFP80RegClass);
26549     case 'y':   // MMX_REGS if MMX allowed.
26550       if (!Subtarget->hasMMX()) break;
26551       return std::make_pair(0U, &X86::VR64RegClass);
26552     case 'Y':   // SSE_REGS if SSE2 allowed
26553       if (!Subtarget->hasSSE2()) break;
26554       // FALL THROUGH.
26555     case 'x':   // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
26556       if (!Subtarget->hasSSE1()) break;
26557
26558       switch (VT.SimpleTy) {
26559       default: break;
26560       // Scalar SSE types.
26561       case MVT::f32:
26562       case MVT::i32:
26563         return std::make_pair(0U, &X86::FR32RegClass);
26564       case MVT::f64:
26565       case MVT::i64:
26566         return std::make_pair(0U, &X86::FR64RegClass);
26567       // Vector types.
26568       case MVT::v16i8:
26569       case MVT::v8i16:
26570       case MVT::v4i32:
26571       case MVT::v2i64:
26572       case MVT::v4f32:
26573       case MVT::v2f64:
26574         return std::make_pair(0U, &X86::VR128RegClass);
26575       // AVX types.
26576       case MVT::v32i8:
26577       case MVT::v16i16:
26578       case MVT::v8i32:
26579       case MVT::v4i64:
26580       case MVT::v8f32:
26581       case MVT::v4f64:
26582         return std::make_pair(0U, &X86::VR256RegClass);
26583       case MVT::v8f64:
26584       case MVT::v16f32:
26585       case MVT::v16i32:
26586       case MVT::v8i64:
26587         return std::make_pair(0U, &X86::VR512RegClass);
26588       }
26589       break;
26590     }
26591   }
26592
26593   // Use the default implementation in TargetLowering to convert the register
26594   // constraint into a member of a register class.
26595   std::pair<unsigned, const TargetRegisterClass*> Res;
26596   Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
26597
26598   // Not found as a standard register?
26599   if (!Res.second) {
26600     // Map st(0) -> st(7) -> ST0
26601     if (Constraint.size() == 7 && Constraint[0] == '{' &&
26602         tolower(Constraint[1]) == 's' &&
26603         tolower(Constraint[2]) == 't' &&
26604         Constraint[3] == '(' &&
26605         (Constraint[4] >= '0' && Constraint[4] <= '7') &&
26606         Constraint[5] == ')' &&
26607         Constraint[6] == '}') {
26608
26609       Res.first = X86::FP0+Constraint[4]-'0';
26610       Res.second = &X86::RFP80RegClass;
26611       return Res;
26612     }
26613
26614     // GCC allows "st(0)" to be called just plain "st".
26615     if (StringRef("{st}").equals_lower(Constraint)) {
26616       Res.first = X86::FP0;
26617       Res.second = &X86::RFP80RegClass;
26618       return Res;
26619     }
26620
26621     // flags -> EFLAGS
26622     if (StringRef("{flags}").equals_lower(Constraint)) {
26623       Res.first = X86::EFLAGS;
26624       Res.second = &X86::CCRRegClass;
26625       return Res;
26626     }
26627
26628     // 'A' means EAX + EDX.
26629     if (Constraint == "A") {
26630       Res.first = X86::EAX;
26631       Res.second = &X86::GR32_ADRegClass;
26632       return Res;
26633     }
26634     return Res;
26635   }
26636
26637   // Otherwise, check to see if this is a register class of the wrong value
26638   // type.  For example, we want to map "{ax},i32" -> {eax}, we don't want it to
26639   // turn into {ax},{dx}.
26640   if (Res.second->hasType(VT))
26641     return Res;   // Correct type already, nothing to do.
26642
26643   // All of the single-register GCC register classes map their values onto
26644   // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp".  If we
26645   // really want an 8-bit or 32-bit register, map to the appropriate register
26646   // class and return the appropriate register.
26647   if (Res.second == &X86::GR16RegClass) {
26648     if (VT == MVT::i8 || VT == MVT::i1) {
26649       unsigned DestReg = 0;
26650       switch (Res.first) {
26651       default: break;
26652       case X86::AX: DestReg = X86::AL; break;
26653       case X86::DX: DestReg = X86::DL; break;
26654       case X86::CX: DestReg = X86::CL; break;
26655       case X86::BX: DestReg = X86::BL; break;
26656       }
26657       if (DestReg) {
26658         Res.first = DestReg;
26659         Res.second = &X86::GR8RegClass;
26660       }
26661     } else if (VT == MVT::i32 || VT == MVT::f32) {
26662       unsigned DestReg = 0;
26663       switch (Res.first) {
26664       default: break;
26665       case X86::AX: DestReg = X86::EAX; break;
26666       case X86::DX: DestReg = X86::EDX; break;
26667       case X86::CX: DestReg = X86::ECX; break;
26668       case X86::BX: DestReg = X86::EBX; break;
26669       case X86::SI: DestReg = X86::ESI; break;
26670       case X86::DI: DestReg = X86::EDI; break;
26671       case X86::BP: DestReg = X86::EBP; break;
26672       case X86::SP: DestReg = X86::ESP; break;
26673       }
26674       if (DestReg) {
26675         Res.first = DestReg;
26676         Res.second = &X86::GR32RegClass;
26677       }
26678     } else if (VT == MVT::i64 || VT == MVT::f64) {
26679       unsigned DestReg = 0;
26680       switch (Res.first) {
26681       default: break;
26682       case X86::AX: DestReg = X86::RAX; break;
26683       case X86::DX: DestReg = X86::RDX; break;
26684       case X86::CX: DestReg = X86::RCX; break;
26685       case X86::BX: DestReg = X86::RBX; break;
26686       case X86::SI: DestReg = X86::RSI; break;
26687       case X86::DI: DestReg = X86::RDI; break;
26688       case X86::BP: DestReg = X86::RBP; break;
26689       case X86::SP: DestReg = X86::RSP; break;
26690       }
26691       if (DestReg) {
26692         Res.first = DestReg;
26693         Res.second = &X86::GR64RegClass;
26694       }
26695     }
26696   } else if (Res.second == &X86::FR32RegClass ||
26697              Res.second == &X86::FR64RegClass ||
26698              Res.second == &X86::VR128RegClass ||
26699              Res.second == &X86::VR256RegClass ||
26700              Res.second == &X86::FR32XRegClass ||
26701              Res.second == &X86::FR64XRegClass ||
26702              Res.second == &X86::VR128XRegClass ||
26703              Res.second == &X86::VR256XRegClass ||
26704              Res.second == &X86::VR512RegClass) {
26705     // Handle references to XMM physical registers that got mapped into the
26706     // wrong class.  This can happen with constraints like {xmm0} where the
26707     // target independent register mapper will just pick the first match it can
26708     // find, ignoring the required type.
26709
26710     if (VT == MVT::f32 || VT == MVT::i32)
26711       Res.second = &X86::FR32RegClass;
26712     else if (VT == MVT::f64 || VT == MVT::i64)
26713       Res.second = &X86::FR64RegClass;
26714     else if (X86::VR128RegClass.hasType(VT))
26715       Res.second = &X86::VR128RegClass;
26716     else if (X86::VR256RegClass.hasType(VT))
26717       Res.second = &X86::VR256RegClass;
26718     else if (X86::VR512RegClass.hasType(VT))
26719       Res.second = &X86::VR512RegClass;
26720   }
26721
26722   return Res;
26723 }
26724
26725 int X86TargetLowering::getScalingFactorCost(const AddrMode &AM,
26726                                             Type *Ty) const {
26727   // Scaling factors are not free at all.
26728   // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
26729   // will take 2 allocations in the out of order engine instead of 1
26730   // for plain addressing mode, i.e. inst (reg1).
26731   // E.g.,
26732   // vaddps (%rsi,%drx), %ymm0, %ymm1
26733   // Requires two allocations (one for the load, one for the computation)
26734   // whereas:
26735   // vaddps (%rsi), %ymm0, %ymm1
26736   // Requires just 1 allocation, i.e., freeing allocations for other operations
26737   // and having less micro operations to execute.
26738   //
26739   // For some X86 architectures, this is even worse because for instance for
26740   // stores, the complex addressing mode forces the instruction to use the
26741   // "load" ports instead of the dedicated "store" port.
26742   // E.g., on Haswell:
26743   // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
26744   // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
26745   if (isLegalAddressingMode(AM, Ty))
26746     // Scale represents reg2 * scale, thus account for 1
26747     // as soon as we use a second register.
26748     return AM.Scale != 0;
26749   return -1;
26750 }
26751
26752 bool X86TargetLowering::isTargetFTOL() const {
26753   return Subtarget->isTargetKnownWindowsMSVC() && !Subtarget->is64Bit();
26754 }