lib/Target/X86/X86ISelLowering.cpp

   1 //===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 // This file defines the interfaces that X86 uses to lower LLVM code into a
  11 // selection DAG.
  12 //
  13 //===----------------------------------------------------------------------===//
  14
  15 #include "X86ISelLowering.h"
  16 #include "Utils/X86ShuffleDecode.h"
  17 #include "X86CallingConv.h"
  18 #include "X86FrameLowering.h"
  19 #include "X86InstrBuilder.h"
  20 #include "X86MachineFunctionInfo.h"
  21 #include "X86TargetMachine.h"
  22 #include "X86TargetObjectFile.h"
  23 #include "llvm/ADT/SmallBitVector.h"
  24 #include "llvm/ADT/SmallSet.h"
  25 #include "llvm/ADT/Statistic.h"
  26 #include "llvm/ADT/StringExtras.h"
  27 #include "llvm/ADT/StringSwitch.h"
  28 #include "llvm/CodeGen/IntrinsicLowering.h"
  29 #include "llvm/CodeGen/MachineFrameInfo.h"
  30 #include "llvm/CodeGen/MachineFunction.h"
  31 #include "llvm/CodeGen/MachineInstrBuilder.h"
  32 #include "llvm/CodeGen/MachineJumpTableInfo.h"
  33 #include "llvm/CodeGen/MachineModuleInfo.h"
  34 #include "llvm/CodeGen/MachineRegisterInfo.h"
  35 #include "llvm/CodeGen/WinEHFuncInfo.h"
  36 #include "llvm/IR/CallSite.h"
  37 #include "llvm/IR/CallingConv.h"
  38 #include "llvm/IR/Constants.h"
  39 #include "llvm/IR/DerivedTypes.h"
  40 #include "llvm/IR/Function.h"
  41 #include "llvm/IR/GlobalAlias.h"
  42 #include "llvm/IR/GlobalVariable.h"
  43 #include "llvm/IR/Instructions.h"
  44 #include "llvm/IR/Intrinsics.h"
  45 #include "llvm/MC/MCAsmInfo.h"
  46 #include "llvm/MC/MCContext.h"
  47 #include "llvm/MC/MCExpr.h"
  48 #include "llvm/MC/MCSymbol.h"
  49 #include "llvm/Support/CommandLine.h"
  50 #include "llvm/Support/Debug.h"
  51 #include "llvm/Support/ErrorHandling.h"
  52 #include "llvm/Support/MathExtras.h"
  53 #include "llvm/Target/TargetOptions.h"
  54 #include "X86IntrinsicsInfo.h"
  55 #include <bitset>
  56 #include <numeric>
  57 #include <cctype>
  58 using namespace llvm;
  59
  60 #define DEBUG_TYPE "x86-isel"
  61
  62 STATISTIC(NumTailCalls, "Number of tail calls");
  63 // Hidden, off-by-default switch for widening-based vector type legalization.
  64 static cl::opt<bool> ExperimentalVectorWideningLegalization(
  65     "x86-experimental-vector-widening-legalization", cl::init(false),
  66     cl::desc("Enable an experimental vector type legalization through widening "
  67              "rather than promotion."),
  68     cl::Hidden);
  69
  70 // Forward declarations.
  71 static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
  72                        SDValue V2);
  73
  74 X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
  75                                      const X86Subtarget &STI)
  76     : TargetLowering(TM), Subtarget(&STI) {
  77   X86ScalarSSEf64 = Subtarget->hasSSE2();
  78   X86ScalarSSEf32 = Subtarget->hasSSE1();
  79   TD = getDataLayout();
  80
  81   // Set up the TargetLowering object.
  82   static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };
  83
  84   // X86 is weird. It always uses i8 for shift amounts and setcc results.
  85   setBooleanContents(ZeroOrOneBooleanContent);
  86   // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
  87   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
  88
  89   // For 64-bit, since we have so many registers, use the ILP scheduler.
  90   // For 32-bit, use the register pressure specific scheduling.
  91   // For Atom, always use ILP scheduling.
  92   if (Subtarget->isAtom())
  93     setSchedulingPreference(Sched::ILP);
  94   else if (Subtarget->is64Bit())
  95     setSchedulingPreference(Sched::ILP);
  96   else
  97     setSchedulingPreference(Sched::RegPressure);
  98   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
  99   setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
 100
 101   // Bypass expensive divides on Atom when compiling with O2.
 102   if (TM.getOptLevel() >= CodeGenOpt::Default) {
 103     if (Subtarget->hasSlowDivide32())
 104       addBypassSlowDiv(32, 8);
 105     if (Subtarget->hasSlowDivide64() && Subtarget->is64Bit())
 106       addBypassSlowDiv(64, 16);
 107   }
 108
 109   if (Subtarget->isTargetKnownWindowsMSVC()) {
 110     // Setup Windows compiler runtime calls.
 111     setLibcallName(RTLIB::SDIV_I64, "_alldiv");
 112     setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
 113     setLibcallName(RTLIB::SREM_I64, "_allrem");
 114     setLibcallName(RTLIB::UREM_I64, "_aullrem");
 115     setLibcallName(RTLIB::MUL_I64, "_allmul");
 116     setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
 117     setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
 118     setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
 119     setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
 120     setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
 121
 122     // The _ftol2 runtime function has an unusual calling conv, which
 123     // is modeled by a special pseudo-instruction.
 124     setLibcallName(RTLIB::FPTOUINT_F64_I64, nullptr);
 125     setLibcallName(RTLIB::FPTOUINT_F32_I64, nullptr);
 126     setLibcallName(RTLIB::FPTOUINT_F64_I32, nullptr);
 127     setLibcallName(RTLIB::FPTOUINT_F32_I32, nullptr);
 128   }
 129
 130   if (Subtarget->isTargetDarwin()) {
 131     // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
 132     setUseUnderscoreSetJmp(false);
 133     setUseUnderscoreLongJmp(false);
 134   } else if (Subtarget->isTargetWindowsGNU()) {
 135     // MS runtime is weird: it exports _setjmp, but longjmp!
 136     setUseUnderscoreSetJmp(true);
 137     setUseUnderscoreLongJmp(false);
 138   } else {
 139     setUseUnderscoreSetJmp(true);
 140     setUseUnderscoreLongJmp(true);
 141   }
 142
 143   // Set up the register classes.
 144   addRegisterClass(MVT::i8, &X86::GR8RegClass);
 145   addRegisterClass(MVT::i16, &X86::GR16RegClass);
 146   addRegisterClass(MVT::i32, &X86::GR32RegClass);
 147   if (Subtarget->is64Bit())
 148     addRegisterClass(MVT::i64, &X86::GR64RegClass);
 149
 150   for (MVT VT : MVT::integer_valuetypes())
 151     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
 152
 153   // We don't accept any truncstore of integer registers.
 154   setTruncStoreAction(MVT::i64, MVT::i32, Expand);
 155   setTruncStoreAction(MVT::i64, MVT::i16, Expand);
 156   setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
 157   setTruncStoreAction(MVT::i32, MVT::i16, Expand);
 158   setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
 159   setTruncStoreAction(MVT::i16, MVT::i8,  Expand);
 160
 161   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
 162
 163   // SETOEQ and SETUNE require checking two conditions.
 164   setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
 165   setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
 166   setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
 167   setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
 168   setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
 169   setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
 170
 171   // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
 172   // operation.
 173   setOperationAction(ISD::UINT_TO_FP       , MVT::i1   , Promote);
 174   setOperationAction(ISD::UINT_TO_FP       , MVT::i8   , Promote);
 175   setOperationAction(ISD::UINT_TO_FP       , MVT::i16  , Promote);
 176
 177   if (Subtarget->is64Bit()) {
 178     setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Promote);
 179     setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
 180   } else if (!Subtarget->useSoftFloat()) {
 181     // We have an algorithm for SSE2->double, and we turn this into a
 182     // 64-bit FILD followed by conditional FADD for other targets.
 183     setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
 184     // We have an algorithm for SSE2, and we turn this into a 64-bit
 185     // FILD for other targets.
 186     setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Custom);
 187   }
 188
 189   // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
 190   // this operation.
 191   setOperationAction(ISD::SINT_TO_FP       , MVT::i1   , Promote);
 192   setOperationAction(ISD::SINT_TO_FP       , MVT::i8   , Promote);
 193
 194   if (!Subtarget->useSoftFloat()) {
 195     // SSE has no i16 to fp conversion, only i32
 196     if (X86ScalarSSEf32) {
 197       setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
 198       // f32 and f64 cases are Legal, f80 case is not
 199       setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
 200     } else {
 201       setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Custom);
 202       setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
 203     }
 204   } else {
 205     setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
 206     setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Promote);
 207   }
 208
 209   // In 32-bit mode these are custom lowered.  In 64-bit mode F32 and F64
 210   // are Legal, f80 is custom lowered.
 211   setOperationAction(ISD::FP_TO_SINT     , MVT::i64  , Custom);
 212   setOperationAction(ISD::SINT_TO_FP     , MVT::i64  , Custom);
 213
 214   // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINT's, as X86 doesn't have
 215   // this operation.
 216   setOperationAction(ISD::FP_TO_SINT       , MVT::i1   , Promote);
 217   setOperationAction(ISD::FP_TO_SINT       , MVT::i8   , Promote);
 218
 219   if (X86ScalarSSEf32) {
 220     setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
 221     // f32 and f64 cases are Legal, f80 case is not
 222     setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
 223   } else {
 224     setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Custom);
 225     setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
 226   }
 227
 228   // Handle FP_TO_UINT by promoting the destination to a larger signed
 229   // conversion.
 230   setOperationAction(ISD::FP_TO_UINT       , MVT::i1   , Promote);
 231   setOperationAction(ISD::FP_TO_UINT       , MVT::i8   , Promote);
 232   setOperationAction(ISD::FP_TO_UINT       , MVT::i16  , Promote);
 233
 234   if (Subtarget->is64Bit()) {
 235     setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Expand);
 236     setOperationAction(ISD::FP_TO_UINT     , MVT::i32  , Promote);
 237   } else if (!Subtarget->useSoftFloat()) {
 238     // Since AVX is a superset of SSE3, only check for SSE here.
 239     if (Subtarget->hasSSE1() && !Subtarget->hasSSE3())
 240       // Expand FP_TO_UINT into a select.
 241       // FIXME: We would like to use a Custom expander here eventually to do
 242       // the optimal thing for SSE vs. the default expansion in the legalizer.
 243       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Expand);
 244     else
 245       // With SSE3 we can use fisttpll to convert to a signed i64; without
 246       // SSE, we're stuck with a fistpll.
 247       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Custom);
 248   }
 249
 250   if (isTargetFTOL()) {
 251     // Use the _ftol2 runtime function, which has a pseudo-instruction
 252     // to handle its weird calling convention.
 253     setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Custom);
 254   }
 255
 256   // TODO: when we have SSE, these could be more efficient, by using movd/movq.
 257   if (!X86ScalarSSEf64) {
 258     setOperationAction(ISD::BITCAST        , MVT::f32  , Expand);
 259     setOperationAction(ISD::BITCAST        , MVT::i32  , Expand);
 260     if (Subtarget->is64Bit()) {
 261       setOperationAction(ISD::BITCAST      , MVT::f64  , Expand);
 262       // Without SSE, i64->f64 goes through memory.
 263       setOperationAction(ISD::BITCAST      , MVT::i64  , Expand);
 264     }
 265   }
 266
 267   // Scalar integer divide and remainder are lowered to use operations that
 268   // produce two results, to match the available instructions. This exposes
 269   // the two-result form to trivial CSE, which is able to combine x/y and x%y
 270   // into a single instruction.
 271   //
 272   // Scalar integer multiply-high is also lowered to use two-result
 273   // operations, to match the available instructions. However, plain multiply
 274   // (low) operations are left as Legal, as there are single-result
 275   // instructions for this in x86. Using the two-result multiply instructions
 276   // when both high and low results are needed must be arranged by dagcombine.
 277   for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
 278     MVT VT = IntVTs[i];
 279     setOperationAction(ISD::MULHS, VT, Expand);
 280     setOperationAction(ISD::MULHU, VT, Expand);
 281     setOperationAction(ISD::SDIV, VT, Expand);
 282     setOperationAction(ISD::UDIV, VT, Expand);
 283     setOperationAction(ISD::SREM, VT, Expand);
 284     setOperationAction(ISD::UREM, VT, Expand);
 285
 286     // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences.
 287     setOperationAction(ISD::ADDC, VT, Custom);
 288     setOperationAction(ISD::ADDE, VT, Custom);
 289     setOperationAction(ISD::SUBC, VT, Custom);
 290     setOperationAction(ISD::SUBE, VT, Custom);
 291   }
 292
 293   setOperationAction(ISD::BR_JT            , MVT::Other, Expand);
 294   setOperationAction(ISD::BRCOND           , MVT::Other, Custom);
 295   setOperationAction(ISD::BR_CC            , MVT::f32,   Expand);
 296   setOperationAction(ISD::BR_CC            , MVT::f64,   Expand);
 297   setOperationAction(ISD::BR_CC            , MVT::f80,   Expand);
 298   setOperationAction(ISD::BR_CC            , MVT::i8,    Expand);
 299   setOperationAction(ISD::BR_CC            , MVT::i16,   Expand);
 300   setOperationAction(ISD::BR_CC            , MVT::i32,   Expand);
 301   setOperationAction(ISD::BR_CC            , MVT::i64,   Expand);
 302   setOperationAction(ISD::SELECT_CC        , MVT::f32,   Expand);
 303   setOperationAction(ISD::SELECT_CC        , MVT::f64,   Expand);
 304   setOperationAction(ISD::SELECT_CC        , MVT::f80,   Expand);
 305   setOperationAction(ISD::SELECT_CC        , MVT::i8,    Expand);
 306   setOperationAction(ISD::SELECT_CC        , MVT::i16,   Expand);
 307   setOperationAction(ISD::SELECT_CC        , MVT::i32,   Expand);
 308   setOperationAction(ISD::SELECT_CC        , MVT::i64,   Expand);
 309   if (Subtarget->is64Bit())
 310     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
 311   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16  , Legal);
 312   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8   , Legal);
 313   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1   , Expand);
 314   setOperationAction(ISD::FP_ROUND_INREG   , MVT::f32  , Expand);
 315   setOperationAction(ISD::FREM             , MVT::f32  , Expand);
 316   setOperationAction(ISD::FREM             , MVT::f64  , Expand);
 317   setOperationAction(ISD::FREM             , MVT::f80  , Expand);
 318   setOperationAction(ISD::FLT_ROUNDS_      , MVT::i32  , Custom);
 319
 320   // Promote the i8 variants and force them on up to i32 which has a shorter
 321   // encoding.
 322   setOperationAction(ISD::CTTZ             , MVT::i8   , Promote);
 323   AddPromotedToType (ISD::CTTZ             , MVT::i8   , MVT::i32);
 324   setOperationAction(ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , Promote);
 325   AddPromotedToType (ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , MVT::i32);
 326   if (Subtarget->hasBMI()) {
 327     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16  , Expand);
 328     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32  , Expand);
 329     if (Subtarget->is64Bit())
 330       setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
 331   } else {
 332     setOperationAction(ISD::CTTZ           , MVT::i16  , Custom);
 333     setOperationAction(ISD::CTTZ           , MVT::i32  , Custom);
 334     if (Subtarget->is64Bit())
 335       setOperationAction(ISD::CTTZ         , MVT::i64  , Custom);
 336   }
 337
 338   if (Subtarget->hasLZCNT()) {
 339     // When promoting the i8 variants, force them to i32 for a shorter
 340     // encoding.
 341     setOperationAction(ISD::CTLZ           , MVT::i8   , Promote);
 342     AddPromotedToType (ISD::CTLZ           , MVT::i8   , MVT::i32);
 343     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Promote);
 344     AddPromotedToType (ISD::CTLZ_ZERO_UNDEF, MVT::i8   , MVT::i32);
 345     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Expand);
 346     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Expand);
 347     if (Subtarget->is64Bit())
 348       setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
 349   } else {
 350     setOperationAction(ISD::CTLZ           , MVT::i8   , Custom);
 351     setOperationAction(ISD::CTLZ           , MVT::i16  , Custom);
 352     setOperationAction(ISD::CTLZ           , MVT::i32  , Custom);
 353     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Custom);
 354     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Custom);
 355     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Custom);
 356     if (Subtarget->is64Bit()) {
 357       setOperationAction(ISD::CTLZ         , MVT::i64  , Custom);
 358       setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
 359     }
 360   }
 361
 362   // Special handling for half-precision floating point conversions.
 363   // If we don't have F16C support, then lower half float conversions
 364   // into library calls.
 365   if (Subtarget->useSoftFloat() || !Subtarget->hasF16C()) {
 366     setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
 367     setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
 368   }
 369
 370   // There's never any support for operations beyond MVT::f32.
 371   setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
 372   setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
 373   setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
 374   setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);
 375
 376   setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
 377   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
 378   setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
 379   setTruncStoreAction(MVT::f32, MVT::f16, Expand);
 380   setTruncStoreAction(MVT::f64, MVT::f16, Expand);
 381   setTruncStoreAction(MVT::f80, MVT::f16, Expand);
 382
 383   if (Subtarget->hasPOPCNT()) {
 384     setOperationAction(ISD::CTPOP          , MVT::i8   , Promote);
 385   } else {
 386     setOperationAction(ISD::CTPOP          , MVT::i8   , Expand);
 387     setOperationAction(ISD::CTPOP          , MVT::i16  , Expand);
 388     setOperationAction(ISD::CTPOP          , MVT::i32  , Expand);
 389     if (Subtarget->is64Bit())
 390       setOperationAction(ISD::CTPOP        , MVT::i64  , Expand);
 391   }
 392
 393   setOperationAction(ISD::READCYCLECOUNTER , MVT::i64  , Custom);
 394
 395   if (!Subtarget->hasMOVBE())
 396     setOperationAction(ISD::BSWAP          , MVT::i16  , Expand);
 397
 398   // These should be promoted to a larger select which is supported.
 399   setOperationAction(ISD::SELECT          , MVT::i1   , Promote);
 400   // X86 wants to expand cmov itself.
 401   setOperationAction(ISD::SELECT          , MVT::i8   , Custom);
 402   setOperationAction(ISD::SELECT          , MVT::i16  , Custom);
 403   setOperationAction(ISD::SELECT          , MVT::i32  , Custom);
 404   setOperationAction(ISD::SELECT          , MVT::f32  , Custom);
 405   setOperationAction(ISD::SELECT          , MVT::f64  , Custom);
 406   setOperationAction(ISD::SELECT          , MVT::f80  , Custom);
 407   setOperationAction(ISD::SETCC           , MVT::i8   , Custom);
 408   setOperationAction(ISD::SETCC           , MVT::i16  , Custom);
 409   setOperationAction(ISD::SETCC           , MVT::i32  , Custom);
 410   setOperationAction(ISD::SETCC           , MVT::f32  , Custom);
 411   setOperationAction(ISD::SETCC           , MVT::f64  , Custom);
 412   setOperationAction(ISD::SETCC           , MVT::f80  , Custom);
 413   if (Subtarget->is64Bit()) {
 414     setOperationAction(ISD::SELECT        , MVT::i64  , Custom);
 415     setOperationAction(ISD::SETCC         , MVT::i64  , Custom);
 416   }
 417   setOperationAction(ISD::EH_RETURN       , MVT::Other, Custom);
 418   // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support
 419   // SjLj exception handling but a light-weight setjmp/longjmp replacement to
 420   // support continuation, user-level threading, etc. As a result, no
 421   // other SjLj exception interfaces are implemented and please don't build
 422   // your own exception handling based on them.
 423   // LLVM/Clang supports zero-cost DWARF exception handling.
 424   setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
 425   setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
 426
 427   // Darwin ABI issue.
 428   setOperationAction(ISD::ConstantPool    , MVT::i32  , Custom);
 429   setOperationAction(ISD::JumpTable       , MVT::i32  , Custom);
 430   setOperationAction(ISD::GlobalAddress   , MVT::i32  , Custom);
 431   setOperationAction(ISD::GlobalTLSAddress, MVT::i32  , Custom);
 432   if (Subtarget->is64Bit())
 433     setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
 434   setOperationAction(ISD::ExternalSymbol  , MVT::i32  , Custom);
 435   setOperationAction(ISD::BlockAddress    , MVT::i32  , Custom);
 436   if (Subtarget->is64Bit()) {
 437     setOperationAction(ISD::ConstantPool  , MVT::i64  , Custom);
 438     setOperationAction(ISD::JumpTable     , MVT::i64  , Custom);
 439     setOperationAction(ISD::GlobalAddress , MVT::i64  , Custom);
 440     setOperationAction(ISD::ExternalSymbol, MVT::i64  , Custom);
 441     setOperationAction(ISD::BlockAddress  , MVT::i64  , Custom);
 442   }
 443   // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
 444   setOperationAction(ISD::SHL_PARTS       , MVT::i32  , Custom);
 445   setOperationAction(ISD::SRA_PARTS       , MVT::i32  , Custom);
 446   setOperationAction(ISD::SRL_PARTS       , MVT::i32  , Custom);
 447   if (Subtarget->is64Bit()) {
 448     setOperationAction(ISD::SHL_PARTS     , MVT::i64  , Custom);
 449     setOperationAction(ISD::SRA_PARTS     , MVT::i64  , Custom);
 450     setOperationAction(ISD::SRL_PARTS     , MVT::i64  , Custom);
 451   }
 452
 453   if (Subtarget->hasSSE1())
 454     setOperationAction(ISD::PREFETCH      , MVT::Other, Legal);
 455
 456   setOperationAction(ISD::ATOMIC_FENCE  , MVT::Other, Custom);
 457
 458   // Expand certain atomics
 459   for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
 460     MVT VT = IntVTs[i];
 461     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
 462     setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
 463     setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
 464   }
 465
 466   if (Subtarget->hasCmpxchg16b()) {
 467     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
 468   }
 469
 470   // FIXME - use subtarget debug flags
 471   if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetELF() &&
 472       !Subtarget->isTargetCygMing() && !Subtarget->isTargetWin64()) {
 473     setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
 474   }
 475
 476   if (Subtarget->is64Bit()) {
 477     setExceptionPointerRegister(X86::RAX);
 478     setExceptionSelectorRegister(X86::RDX);
 479   } else {
 480     setExceptionPointerRegister(X86::EAX);
 481     setExceptionSelectorRegister(X86::EDX);
 482   }
 483   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
 484   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
 485
 486   setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
 487   setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
 488
 489   setOperationAction(ISD::TRAP, MVT::Other, Legal);
 490   setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
 491
 492   // VASTART needs to be custom lowered to use the VarArgsFrameIndex
 493   setOperationAction(ISD::VASTART           , MVT::Other, Custom);
 494   setOperationAction(ISD::VAEND             , MVT::Other, Expand);
 495   if (Subtarget->is64Bit() && !Subtarget->isTargetWin64()) {
 496     // TargetInfo::X86_64ABIBuiltinVaList
 497     setOperationAction(ISD::VAARG           , MVT::Other, Custom);
 498     setOperationAction(ISD::VACOPY          , MVT::Other, Custom);
 499   } else {
 500     // TargetInfo::CharPtrBuiltinVaList
 501     setOperationAction(ISD::VAARG           , MVT::Other, Expand);
 502     setOperationAction(ISD::VACOPY          , MVT::Other, Expand);
 503   }
 504
 505   setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
 506   setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
 507
 508   setOperationAction(ISD::DYNAMIC_STACKALLOC, getPointerTy(), Custom);
 509
 510   // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
 511   setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
 512   setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
 513
 514   if (!Subtarget->useSoftFloat() && X86ScalarSSEf64) {
 515     // f32 and f64 use SSE.
 516     // Set up the FP register classes.
 517     addRegisterClass(MVT::f32, &X86::FR32RegClass);
 518     addRegisterClass(MVT::f64, &X86::FR64RegClass);
 519
 520     // Use ANDPD to simulate FABS.
 521     setOperationAction(ISD::FABS , MVT::f64, Custom);
 522     setOperationAction(ISD::FABS , MVT::f32, Custom);
 523
 524     // Use XORP to simulate FNEG.
 525     setOperationAction(ISD::FNEG , MVT::f64, Custom);
 526     setOperationAction(ISD::FNEG , MVT::f32, Custom);
 527
 528     // Use ANDPD and ORPD to simulate FCOPYSIGN.
 529     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
 530     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
 531
 532     // Lower this to FGETSIGNx86 plus an AND.
 533     setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
 534     setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
 535
 536     // We don't support sin/cos/fmod
 537     setOperationAction(ISD::FSIN   , MVT::f64, Expand);
 538     setOperationAction(ISD::FCOS   , MVT::f64, Expand);
 539     setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
 540     setOperationAction(ISD::FSIN   , MVT::f32, Expand);
 541     setOperationAction(ISD::FCOS   , MVT::f32, Expand);
 542     setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
 543
 544     // Expand FP immediates into loads from the stack, except for the special
 545     // cases we handle.
 546     addLegalFPImmediate(APFloat(+0.0)); // xorpd
 547     addLegalFPImmediate(APFloat(+0.0f)); // xorps
 548   } else if (!Subtarget->useSoftFloat() && X86ScalarSSEf32) {
 549     // Use SSE for f32, x87 for f64.
 550     // Set up the FP register classes.
 551     addRegisterClass(MVT::f32, &X86::FR32RegClass);
 552     addRegisterClass(MVT::f64, &X86::RFP64RegClass);
 553
 554     // Use ANDPS to simulate FABS.
 555     setOperationAction(ISD::FABS , MVT::f32, Custom);
 556
 557     // Use XORP to simulate FNEG.
 558     setOperationAction(ISD::FNEG , MVT::f32, Custom);
 559
 560     setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
 561
 562     // Use ANDPS and ORPS to simulate FCOPYSIGN.
 563     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
 564     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
 565
 566     // We don't support sin/cos/fmod
 567     setOperationAction(ISD::FSIN   , MVT::f32, Expand);
 568     setOperationAction(ISD::FCOS   , MVT::f32, Expand);
 569     setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
 570
 571     // Special cases we handle for FP constants.
 572     addLegalFPImmediate(APFloat(+0.0f)); // xorps
 573     addLegalFPImmediate(APFloat(+0.0)); // FLD0
 574     addLegalFPImmediate(APFloat(+1.0)); // FLD1
 575     addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
 576     addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
 577
 578     if (!TM.Options.UnsafeFPMath) {
 579       setOperationAction(ISD::FSIN   , MVT::f64, Expand);
 580       setOperationAction(ISD::FCOS   , MVT::f64, Expand);
 581       setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
 582     }
 583   } else if (!Subtarget->useSoftFloat()) {
 584     // f32 and f64 in x87.
 585     // Set up the FP register classes.
 586     addRegisterClass(MVT::f64, &X86::RFP64RegClass);
 587     addRegisterClass(MVT::f32, &X86::RFP32RegClass);
 588
 589     setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
 590     setOperationAction(ISD::UNDEF,     MVT::f32, Expand);
 591     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
 592     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
 593
 594     if (!TM.Options.UnsafeFPMath) {
 595       setOperationAction(ISD::FSIN   , MVT::f64, Expand);
 596       setOperationAction(ISD::FSIN   , MVT::f32, Expand);
 597       setOperationAction(ISD::FCOS   , MVT::f64, Expand);
 598       setOperationAction(ISD::FCOS   , MVT::f32, Expand);
 599       setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
 600       setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
 601     }
 602     addLegalFPImmediate(APFloat(+0.0)); // FLD0
 603     addLegalFPImmediate(APFloat(+1.0)); // FLD1
 604     addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
 605     addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
 606     addLegalFPImmediate(APFloat(+0.0f)); // FLD0
 607     addLegalFPImmediate(APFloat(+1.0f)); // FLD1
 608     addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
 609     addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
 610   }
 611
 612   // We don't support FMA.
 613   setOperationAction(ISD::FMA, MVT::f64, Expand);
 614   setOperationAction(ISD::FMA, MVT::f32, Expand);
 615
 616   // Long double always uses X87.
 617   if (!Subtarget->useSoftFloat()) {
 618     addRegisterClass(MVT::f80, &X86::RFP80RegClass);
 619     setOperationAction(ISD::UNDEF,     MVT::f80, Expand);
 620     setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
 621     {
 622       APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended);
 623       addLegalFPImmediate(TmpFlt);  // FLD0
 624       TmpFlt.changeSign();
 625       addLegalFPImmediate(TmpFlt);  // FLD0/FCHS
 626
 627       bool ignored;
 628       APFloat TmpFlt2(+1.0);
 629       TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
 630                       &ignored);
 631       addLegalFPImmediate(TmpFlt2);  // FLD1
 632       TmpFlt2.changeSign();
 633       addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
 634     }
 635
 636     if (!TM.Options.UnsafeFPMath) {
 637       setOperationAction(ISD::FSIN   , MVT::f80, Expand);
 638       setOperationAction(ISD::FCOS   , MVT::f80, Expand);
 639       setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
 640     }
 641
 642     setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
 643     setOperationAction(ISD::FCEIL,  MVT::f80, Expand);
 644     setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
 645     setOperationAction(ISD::FRINT,  MVT::f80, Expand);
 646     setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
 647     setOperationAction(ISD::FMA, MVT::f80, Expand);
 648   }
 649
 650   // Always use a library call for pow.
 651   setOperationAction(ISD::FPOW             , MVT::f32  , Expand);
 652   setOperationAction(ISD::FPOW             , MVT::f64  , Expand);
 653   setOperationAction(ISD::FPOW             , MVT::f80  , Expand);
 654
 655   setOperationAction(ISD::FLOG, MVT::f80, Expand);
 656   setOperationAction(ISD::FLOG2, MVT::f80, Expand);
 657   setOperationAction(ISD::FLOG10, MVT::f80, Expand);
 658   setOperationAction(ISD::FEXP, MVT::f80, Expand);
 659   setOperationAction(ISD::FEXP2, MVT::f80, Expand);
 660   setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
 661   setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
 662
 663   // First set operation action for all vector types to either promote
 664   // (for widening) or expand (for scalarization). Then we will selectively
 665   // turn on ones that can be effectively codegen'd.
 666   for (MVT VT : MVT::vector_valuetypes()) {
 667     setOperationAction(ISD::ADD , VT, Expand);
 668     setOperationAction(ISD::SUB , VT, Expand);
 669     setOperationAction(ISD::FADD, VT, Expand);
 670     setOperationAction(ISD::FNEG, VT, Expand);
 671     setOperationAction(ISD::FSUB, VT, Expand);
 672     setOperationAction(ISD::MUL , VT, Expand);
 673     setOperationAction(ISD::FMUL, VT, Expand);
 674     setOperationAction(ISD::SDIV, VT, Expand);
 675     setOperationAction(ISD::UDIV, VT, Expand);
 676     setOperationAction(ISD::FDIV, VT, Expand);
 677     setOperationAction(ISD::SREM, VT, Expand);
 678     setOperationAction(ISD::UREM, VT, Expand);
 679     setOperationAction(ISD::LOAD, VT, Expand);
 680     setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
 681     setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
 682     setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
 683     setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
 684     setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
 685     setOperationAction(ISD::FABS, VT, Expand);
 686     setOperationAction(ISD::FSIN, VT, Expand);
 687     setOperationAction(ISD::FSINCOS, VT, Expand);
 688     setOperationAction(ISD::FCOS, VT, Expand);
 689     setOperationAction(ISD::FSINCOS, VT, Expand);
 690     setOperationAction(ISD::FREM, VT, Expand);
 691     setOperationAction(ISD::FMA,  VT, Expand);
 692     setOperationAction(ISD::FPOWI, VT, Expand);
 693     setOperationAction(ISD::FSQRT, VT, Expand);
 694     setOperationAction(ISD::FCOPYSIGN, VT, Expand);
 695     setOperationAction(ISD::FFLOOR, VT, Expand);
 696     setOperationAction(ISD::FCEIL, VT, Expand);
 697     setOperationAction(ISD::FTRUNC, VT, Expand);
 698     setOperationAction(ISD::FRINT, VT, Expand);
 699     setOperationAction(ISD::FNEARBYINT, VT, Expand);
 700     setOperationAction(ISD::SMUL_LOHI, VT, Expand);
 701     setOperationAction(ISD::MULHS, VT, Expand);
 702     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
 703     setOperationAction(ISD::MULHU, VT, Expand);
 704     setOperationAction(ISD::SDIVREM, VT, Expand);
 705     setOperationAction(ISD::UDIVREM, VT, Expand);
 706     setOperationAction(ISD::FPOW, VT, Expand);
 707     setOperationAction(ISD::CTPOP, VT, Expand);
 708     setOperationAction(ISD::CTTZ, VT, Expand);
 709     setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
 710     setOperationAction(ISD::CTLZ, VT, Expand);
 711     setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
 712     setOperationAction(ISD::SHL, VT, Expand);
 713     setOperationAction(ISD::SRA, VT, Expand);
 714     setOperationAction(ISD::SRL, VT, Expand);
 715     setOperationAction(ISD::ROTL, VT, Expand);
 716     setOperationAction(ISD::ROTR, VT, Expand);
 717     setOperationAction(ISD::BSWAP, VT, Expand);
 718     setOperationAction(ISD::SETCC, VT, Expand);
 719     setOperationAction(ISD::FLOG, VT, Expand);
 720     setOperationAction(ISD::FLOG2, VT, Expand);
 721     setOperationAction(ISD::FLOG10, VT, Expand);
 722     setOperationAction(ISD::FEXP, VT, Expand);
 723     setOperationAction(ISD::FEXP2, VT, Expand);
 724     setOperationAction(ISD::FP_TO_UINT, VT, Expand);
 725     setOperationAction(ISD::FP_TO_SINT, VT, Expand);
 726     setOperationAction(ISD::UINT_TO_FP, VT, Expand);
 727     setOperationAction(ISD::SINT_TO_FP, VT, Expand);
 728     setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
 729     setOperationAction(ISD::TRUNCATE, VT, Expand);
 730     setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
 731     setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
 732     setOperationAction(ISD::ANY_EXTEND, VT, Expand);
 733     setOperationAction(ISD::VSELECT, VT, Expand);
 734     setOperationAction(ISD::SELECT_CC, VT, Expand);
 735     for (MVT InnerVT : MVT::vector_valuetypes()) {
 736       setTruncStoreAction(InnerVT, VT, Expand);
 737
 738       setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
 739       setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
 740
 741       // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
 742       // types, we have to deal with them whether we ask for Expansion or not.
 743       // Setting Expand causes its own optimisation problems though, so leave
 744       // them legal.
 745       if (VT.getVectorElementType() == MVT::i1)
 746         setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
 747
 748       // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
 749       // split/scalarized right now.
 750       if (VT.getVectorElementType() == MVT::f16)
 751         setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
 752     }
 753   }
 754
 755   // FIXME: In order to prevent SSE instructions being expanded to MMX ones
 756   // with -msoft-float, disable use of MMX as well.
 757   if (!Subtarget->useSoftFloat() && Subtarget->hasMMX()) {
 758     addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
 759     // No operations on x86mmx supported, everything uses intrinsics.
 760   }
 761
 762   // MMX-sized vectors (other than x86mmx) are expected to be expanded
 763   // into smaller operations.
 764   for (MVT MMXTy : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v1i64}) {
 765     setOperationAction(ISD::MULHS,              MMXTy,      Expand);
 766     setOperationAction(ISD::AND,                MMXTy,      Expand);
 767     setOperationAction(ISD::OR,                 MMXTy,      Expand);
 768     setOperationAction(ISD::XOR,                MMXTy,      Expand);
 769     setOperationAction(ISD::SCALAR_TO_VECTOR,   MMXTy,      Expand);
 770     setOperationAction(ISD::SELECT,             MMXTy,      Expand);
 771     setOperationAction(ISD::BITCAST,            MMXTy,      Expand);
 772   }
 773   setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v1i64, Expand);
 774
 775   if (!Subtarget->useSoftFloat() && Subtarget->hasSSE1()) {
 776     addRegisterClass(MVT::v4f32, &X86::VR128RegClass);
 777
 778     setOperationAction(ISD::FADD,               MVT::v4f32, Legal);
 779     setOperationAction(ISD::FSUB,               MVT::v4f32, Legal);
 780     setOperationAction(ISD::FMUL,               MVT::v4f32, Legal);
 781     setOperationAction(ISD::FDIV,               MVT::v4f32, Legal);
 782     setOperationAction(ISD::FSQRT,              MVT::v4f32, Legal);
 783     setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
 784     setOperationAction(ISD::FABS,               MVT::v4f32, Custom);
 785     setOperationAction(ISD::LOAD,               MVT::v4f32, Legal);
 786     setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
 787     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
 788     setOperationAction(ISD::VSELECT,            MVT::v4f32, Custom);
 789     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
 790     setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
 791     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Custom);
 792   }
 793
 794   if (!Subtarget->useSoftFloat() && Subtarget->hasSSE2()) {
 795     addRegisterClass(MVT::v2f64, &X86::VR128RegClass);
 796
 797     // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
 798     // registers cannot be used even for integer operations.
 799     addRegisterClass(MVT::v16i8, &X86::VR128RegClass);
 800     addRegisterClass(MVT::v8i16, &X86::VR128RegClass);
 801     addRegisterClass(MVT::v4i32, &X86::VR128RegClass);
 802     addRegisterClass(MVT::v2i64, &X86::VR128RegClass);
 803
 804     setOperationAction(ISD::ADD,                MVT::v16i8, Legal);
 805     setOperationAction(ISD::ADD,                MVT::v8i16, Legal);
 806     setOperationAction(ISD::ADD,                MVT::v4i32, Legal);
 807     setOperationAction(ISD::ADD,                MVT::v2i64, Legal);
 808     setOperationAction(ISD::MUL,                MVT::v16i8, Custom);
 809     setOperationAction(ISD::MUL,                MVT::v4i32, Custom);
 810     setOperationAction(ISD::MUL,                MVT::v2i64, Custom);
 811     setOperationAction(ISD::UMUL_LOHI,          MVT::v4i32, Custom);
 812     setOperationAction(ISD::SMUL_LOHI,          MVT::v4i32, Custom);
 813     setOperationAction(ISD::MULHU,              MVT::v8i16, Legal);
 814     setOperationAction(ISD::MULHS,              MVT::v8i16, Legal);
 815     setOperationAction(ISD::SUB,                MVT::v16i8, Legal);
 816     setOperationAction(ISD::SUB,                MVT::v8i16, Legal);
 817     setOperationAction(ISD::SUB,                MVT::v4i32, Legal);
 818     setOperationAction(ISD::SUB,                MVT::v2i64, Legal);
 819     setOperationAction(ISD::MUL,                MVT::v8i16, Legal);
 820     setOperationAction(ISD::FADD,               MVT::v2f64, Legal);
 821     setOperationAction(ISD::FSUB,               MVT::v2f64, Legal);
 822     setOperationAction(ISD::FMUL,               MVT::v2f64, Legal);
 823     setOperationAction(ISD::FDIV,               MVT::v2f64, Legal);
 824     setOperationAction(ISD::FSQRT,              MVT::v2f64, Legal);
 825     setOperationAction(ISD::FNEG,               MVT::v2f64, Custom);
 826     setOperationAction(ISD::FABS,               MVT::v2f64, Custom);
 827
 828     setOperationAction(ISD::SETCC,              MVT::v2i64, Custom);
 829     setOperationAction(ISD::SETCC,              MVT::v16i8, Custom);
 830     setOperationAction(ISD::SETCC,              MVT::v8i16, Custom);
 831     setOperationAction(ISD::SETCC,              MVT::v4i32, Custom);
 832
 833     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v16i8, Custom);
 834     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i16, Custom);
 835     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
 836     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
 837     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
 838
 839     setOperationAction(ISD::CTPOP,              MVT::v16i8, Custom);
 840     setOperationAction(ISD::CTPOP,              MVT::v8i16, Custom);
 841     setOperationAction(ISD::CTPOP,              MVT::v4i32, Custom);
 842     setOperationAction(ISD::CTPOP,              MVT::v2i64, Custom);
 843
 844     // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
 845     for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
 846       MVT VT = (MVT::SimpleValueType)i;
 847       // Do not attempt to custom lower non-power-of-2 vectors
 848       if (!isPowerOf2_32(VT.getVectorNumElements()))
 849         continue;
 850       // Do not attempt to custom lower non-128-bit vectors
 851       if (!VT.is128BitVector())
 852         continue;
 853       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
 854       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
 855       setOperationAction(ISD::VSELECT,            VT, Custom);
 856       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
 857     }
 858
 859     // We support custom legalizing of sext and anyext loads for specific
 860     // memory vector types which we can load as a scalar (or sequence of
 861     // scalars) and extend in-register to a legal 128-bit vector type. For sext
 862     // loads these must work with a single scalar load.
 863     for (MVT VT : MVT::integer_vector_valuetypes()) {
 864       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom);
 865       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom);
 866       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom);
 867       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom);
 868       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom);
 869       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom);
 870       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom);
 871       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom);
 872       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom);
 873     }
 874
 875     setOperationAction(ISD::BUILD_VECTOR,       MVT::v2f64, Custom);
 876     setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i64, Custom);
 877     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2f64, Custom);
 878     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i64, Custom);
 879     setOperationAction(ISD::VSELECT,            MVT::v2f64, Custom);
 880     setOperationAction(ISD::VSELECT,            MVT::v2i64, Custom);
 881     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2f64, Custom);
 882     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
 883
 884     if (Subtarget->is64Bit()) {
 885       setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
 886       setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
 887     }
 888
 889     // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
 890     for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
 891       MVT VT = (MVT::SimpleValueType)i;
 892
 893       // Do not attempt to promote non-128-bit vectors
 894       if (!VT.is128BitVector())
 895         continue;
 896
 897       setOperationAction(ISD::AND,    VT, Promote);
 898       AddPromotedToType (ISD::AND,    VT, MVT::v2i64);
 899       setOperationAction(ISD::OR,     VT, Promote);
 900       AddPromotedToType (ISD::OR,     VT, MVT::v2i64);
 901       setOperationAction(ISD::XOR,    VT, Promote);
 902       AddPromotedToType (ISD::XOR,    VT, MVT::v2i64);
 903       setOperationAction(ISD::LOAD,   VT, Promote);
 904       AddPromotedToType (ISD::LOAD,   VT, MVT::v2i64);
 905       setOperationAction(ISD::SELECT, VT, Promote);
 906       AddPromotedToType (ISD::SELECT, VT, MVT::v2i64);
 907     }
 908
 909     // v2f64 and v2i64 loads are legal; custom lower their selects.
 910     setOperationAction(ISD::LOAD,               MVT::v2f64, Legal);
 911     setOperationAction(ISD::LOAD,               MVT::v2i64, Legal);
 912     setOperationAction(ISD::SELECT,             MVT::v2f64, Custom);
 913     setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);
 914
 915     setOperationAction(ISD::FP_TO_SINT,         MVT::v4i32, Legal);
 916     setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Legal);
 917
 918     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i8,  Custom);
 919     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i16, Custom);
 920     // As there is no 64-bit GPR available, we need to build a special custom
 921     // sequence to convert from v2i32 to v2f32.
 922     if (!Subtarget->is64Bit())
 923       setOperationAction(ISD::UINT_TO_FP,       MVT::v2f32, Custom);
 924
 925     setOperationAction(ISD::FP_EXTEND,          MVT::v2f32, Custom);
 926     setOperationAction(ISD::FP_ROUND,           MVT::v2f32, Custom);
 927
 928     for (MVT VT : MVT::fp_vector_valuetypes())
 929       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);
 930
 931     setOperationAction(ISD::BITCAST,            MVT::v2i32, Custom);
 932     setOperationAction(ISD::BITCAST,            MVT::v4i16, Custom);
 933     setOperationAction(ISD::BITCAST,            MVT::v8i8,  Custom);
 934   }
 935
 936   if (!Subtarget->useSoftFloat() && Subtarget->hasSSE41()) {
 937     for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
 938       setOperationAction(ISD::FFLOOR,           RoundedTy,  Legal);
 939       setOperationAction(ISD::FCEIL,            RoundedTy,  Legal);
 940       setOperationAction(ISD::FTRUNC,           RoundedTy,  Legal);
 941       setOperationAction(ISD::FRINT,            RoundedTy,  Legal);
 942       setOperationAction(ISD::FNEARBYINT,       RoundedTy,  Legal);
 943     }
 944
 945     // FIXME: Do we need to handle scalar-to-vector here?
 946     setOperationAction(ISD::MUL,                MVT::v4i32, Legal);
 947
 948     // We directly match byte blends in the backend as they match the VSELECT
 949     // condition form.
 950     setOperationAction(ISD::VSELECT,            MVT::v16i8, Legal);
 951
 952     // SSE41 brings specific instructions for doing vector sign extend even in
 953     // cases where we don't have SRA.
 954     for (MVT VT : MVT::integer_vector_valuetypes()) {
 955       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
 956       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
 957       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
 958     }
 959
 960     // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
 961     setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, MVT::v8i8,  Legal);
 962     setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8,  Legal);
 963     setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i8,  Legal);
 964     setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
 965     setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
 966     setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
 967
 968     setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i16, MVT::v8i8,  Legal);
 969     setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8,  Legal);
 970     setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i8,  Legal);
 971     setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
 972     setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
 973     setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
 974
 975     // i8 and i16 vectors are custom because the source register and source
 976     // memory operand types are not the same width.  f32 vectors are
 977     // custom since the immediate controlling the insert encodes additional
 978     // information.
 979     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i8, Custom);
 980     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
 981     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
 982     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
 983
 984     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
 985     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
 986     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
 987     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
 988
 989     // FIXME: these should be Legal, but that's only for the case where
 990     // the index is constant.  For now custom expand to deal with that.
 991     if (Subtarget->is64Bit()) {
 992       setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
 993       setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
 994     }
 995   }
 996
 997   if (Subtarget->hasSSE2()) {
 998     setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
 999     setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
1000     setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);
1001
1002     setOperationAction(ISD::SRL,               MVT::v8i16, Custom);
1003     setOperationAction(ISD::SRL,               MVT::v16i8, Custom);
1004
1005     setOperationAction(ISD::SHL,               MVT::v8i16, Custom);
1006     setOperationAction(ISD::SHL,               MVT::v16i8, Custom);
1007
1008     setOperationAction(ISD::SRA,               MVT::v8i16, Custom);
1009     setOperationAction(ISD::SRA,               MVT::v16i8, Custom);
1010
1011     // In the customized shift lowering, the legal cases in AVX2 will be
1012     // recognized.
1013     setOperationAction(ISD::SRL,               MVT::v2i64, Custom);
1014     setOperationAction(ISD::SRL,               MVT::v4i32, Custom);
1015
1016     setOperationAction(ISD::SHL,               MVT::v2i64, Custom);
1017     setOperationAction(ISD::SHL,               MVT::v4i32, Custom);
1018
1019     setOperationAction(ISD::SRA,               MVT::v4i32, Custom);
1020   }
1021
1022   if (!Subtarget->useSoftFloat() && Subtarget->hasFp256()) {
1023     addRegisterClass(MVT::v32i8,  &X86::VR256RegClass);
1024     addRegisterClass(MVT::v16i16, &X86::VR256RegClass);
1025     addRegisterClass(MVT::v8i32,  &X86::VR256RegClass);
1026     addRegisterClass(MVT::v8f32,  &X86::VR256RegClass);
1027     addRegisterClass(MVT::v4i64,  &X86::VR256RegClass);
1028     addRegisterClass(MVT::v4f64,  &X86::VR256RegClass);
1029
1030     setOperationAction(ISD::LOAD,               MVT::v8f32, Legal);
1031     setOperationAction(ISD::LOAD,               MVT::v4f64, Legal);
1032     setOperationAction(ISD::LOAD,               MVT::v4i64, Legal);
1033
1034     setOperationAction(ISD::FADD,               MVT::v8f32, Legal);
1035     setOperationAction(ISD::FSUB,               MVT::v8f32, Legal);
1036     setOperationAction(ISD::FMUL,               MVT::v8f32, Legal);
1037     setOperationAction(ISD::FDIV,               MVT::v8f32, Legal);
1038     setOperationAction(ISD::FSQRT,              MVT::v8f32, Legal);
1039     setOperationAction(ISD::FFLOOR,             MVT::v8f32, Legal);
1040     setOperationAction(ISD::FCEIL,              MVT::v8f32, Legal);
1041     setOperationAction(ISD::FTRUNC,             MVT::v8f32, Legal);
1042     setOperationAction(ISD::FRINT,              MVT::v8f32, Legal);
1043     setOperationAction(ISD::FNEARBYINT,         MVT::v8f32, Legal);
1044     setOperationAction(ISD::FNEG,               MVT::v8f32, Custom);
1045     setOperationAction(ISD::FABS,               MVT::v8f32, Custom);
1046
1047     setOperationAction(ISD::FADD,               MVT::v4f64, Legal);
1048     setOperationAction(ISD::FSUB,               MVT::v4f64, Legal);
1049     setOperationAction(ISD::FMUL,               MVT::v4f64, Legal);
1050     setOperationAction(ISD::FDIV,               MVT::v4f64, Legal);
1051     setOperationAction(ISD::FSQRT,              MVT::v4f64, Legal);
1052     setOperationAction(ISD::FFLOOR,             MVT::v4f64, Legal);
1053     setOperationAction(ISD::FCEIL,              MVT::v4f64, Legal);
1054     setOperationAction(ISD::FTRUNC,             MVT::v4f64, Legal);
1055     setOperationAction(ISD::FRINT,              MVT::v4f64, Legal);
1056     setOperationAction(ISD::FNEARBYINT,         MVT::v4f64, Legal);
1057     setOperationAction(ISD::FNEG,               MVT::v4f64, Custom);
1058     setOperationAction(ISD::FABS,               MVT::v4f64, Custom);
1059
1060     // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1061     // even though v8i16 is a legal type.
1062     setOperationAction(ISD::FP_TO_SINT,         MVT::v8i16, Promote);
1063     setOperationAction(ISD::FP_TO_UINT,         MVT::v8i16, Promote);
1064     setOperationAction(ISD::FP_TO_SINT,         MVT::v8i32, Legal);
1065
1066     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i16, Promote);
1067     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i32, Legal);
1068     setOperationAction(ISD::FP_ROUND,           MVT::v4f32, Legal);
1069
1070     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i8,  Custom);
1071     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i16, Custom);
1072
1073     for (MVT VT : MVT::fp_vector_valuetypes())
1074       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);
1075
1076     setOperationAction(ISD::SRL,               MVT::v16i16, Custom);
1077     setOperationAction(ISD::SRL,               MVT::v32i8, Custom);
1078
1079     setOperationAction(ISD::SHL,               MVT::v16i16, Custom);
1080     setOperationAction(ISD::SHL,               MVT::v32i8, Custom);
1081
1082     setOperationAction(ISD::SRA,               MVT::v16i16, Custom);
1083     setOperationAction(ISD::SRA,               MVT::v32i8, Custom);
1084
1085     setOperationAction(ISD::SETCC,             MVT::v32i8, Custom);
1086     setOperationAction(ISD::SETCC,             MVT::v16i16, Custom);
1087     setOperationAction(ISD::SETCC,             MVT::v8i32, Custom);
1088     setOperationAction(ISD::SETCC,             MVT::v4i64, Custom);
1089
1090     setOperationAction(ISD::SELECT,            MVT::v4f64, Custom);
1091     setOperationAction(ISD::SELECT,            MVT::v4i64, Custom);
1092     setOperationAction(ISD::SELECT,            MVT::v8f32, Custom);
1093
1094     setOperationAction(ISD::SIGN_EXTEND,       MVT::v4i64, Custom);
1095     setOperationAction(ISD::SIGN_EXTEND,       MVT::v8i32, Custom);
1096     setOperationAction(ISD::SIGN_EXTEND,       MVT::v16i16, Custom);
1097     setOperationAction(ISD::ZERO_EXTEND,       MVT::v4i64, Custom);
1098     setOperationAction(ISD::ZERO_EXTEND,       MVT::v8i32, Custom);
1099     setOperationAction(ISD::ZERO_EXTEND,       MVT::v16i16, Custom);
1100     setOperationAction(ISD::ANY_EXTEND,        MVT::v4i64, Custom);
1101     setOperationAction(ISD::ANY_EXTEND,        MVT::v8i32, Custom);
1102     setOperationAction(ISD::ANY_EXTEND,        MVT::v16i16, Custom);
1103     setOperationAction(ISD::TRUNCATE,          MVT::v16i8, Custom);
1104     setOperationAction(ISD::TRUNCATE,          MVT::v8i16, Custom);
1105     setOperationAction(ISD::TRUNCATE,          MVT::v4i32, Custom);
1106
1107     setOperationAction(ISD::CTPOP,             MVT::v32i8, Custom);
1108     setOperationAction(ISD::CTPOP,             MVT::v16i16, Custom);
1109     setOperationAction(ISD::CTPOP,             MVT::v8i32, Custom);
1110     setOperationAction(ISD::CTPOP,             MVT::v4i64, Custom);
1111
1112     if (Subtarget->hasFMA() || Subtarget->hasFMA4()) {
1113       setOperationAction(ISD::FMA,             MVT::v8f32, Legal);
1114       setOperationAction(ISD::FMA,             MVT::v4f64, Legal);
1115       setOperationAction(ISD::FMA,             MVT::v4f32, Legal);
1116       setOperationAction(ISD::FMA,             MVT::v2f64, Legal);
1117       setOperationAction(ISD::FMA,             MVT::f32, Legal);
1118       setOperationAction(ISD::FMA,             MVT::f64, Legal);
1119     }
1120
1121     if (Subtarget->hasInt256()) {
1122       setOperationAction(ISD::ADD,             MVT::v4i64, Legal);
1123       setOperationAction(ISD::ADD,             MVT::v8i32, Legal);
1124       setOperationAction(ISD::ADD,             MVT::v16i16, Legal);
1125       setOperationAction(ISD::ADD,             MVT::v32i8, Legal);
1126
1127       setOperationAction(ISD::SUB,             MVT::v4i64, Legal);
1128       setOperationAction(ISD::SUB,             MVT::v8i32, Legal);
1129       setOperationAction(ISD::SUB,             MVT::v16i16, Legal);
1130       setOperationAction(ISD::SUB,             MVT::v32i8, Legal);
1131
1132       setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
1133       setOperationAction(ISD::MUL,             MVT::v8i32, Legal);
1134       setOperationAction(ISD::MUL,             MVT::v16i16, Legal);
1135       setOperationAction(ISD::MUL,             MVT::v32i8, Custom);
1136
1137       setOperationAction(ISD::UMUL_LOHI,       MVT::v8i32, Custom);
1138       setOperationAction(ISD::SMUL_LOHI,       MVT::v8i32, Custom);
1139       setOperationAction(ISD::MULHU,           MVT::v16i16, Legal);
1140       setOperationAction(ISD::MULHS,           MVT::v16i16, Legal);
1141
1142       // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1143       // when we have a 256bit-wide blend with immediate.
1144       setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
1145
1146       // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1147       setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
1148       setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32,  MVT::v8i8,  Legal);
1149       setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64,  MVT::v4i8,  Legal);
1150       setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32,  MVT::v8i16, Legal);
1151       setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64,  MVT::v4i16, Legal);
1152       setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64,  MVT::v4i32, Legal);
1153
1154       setLoadExtAction(ISD::ZEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
1155       setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32,  MVT::v8i8,  Legal);
1156       setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64,  MVT::v4i8,  Legal);
1157       setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32,  MVT::v8i16, Legal);
1158       setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64,  MVT::v4i16, Legal);
1159       setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64,  MVT::v4i32, Legal);
1160     } else {
1161       setOperationAction(ISD::ADD,             MVT::v4i64, Custom);
1162       setOperationAction(ISD::ADD,             MVT::v8i32, Custom);
1163       setOperationAction(ISD::ADD,             MVT::v16i16, Custom);
1164       setOperationAction(ISD::ADD,             MVT::v32i8, Custom);
1165
1166       setOperationAction(ISD::SUB,             MVT::v4i64, Custom);
1167       setOperationAction(ISD::SUB,             MVT::v8i32, Custom);
1168       setOperationAction(ISD::SUB,             MVT::v16i16, Custom);
1169       setOperationAction(ISD::SUB,             MVT::v32i8, Custom);
1170
1171       setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
1172       setOperationAction(ISD::MUL,             MVT::v8i32, Custom);
1173       setOperationAction(ISD::MUL,             MVT::v16i16, Custom);
1174       setOperationAction(ISD::MUL,             MVT::v32i8, Custom);
1175     }
1176
1177     // In the customized shift lowering, the legal cases in AVX2 will be
1178     // recognized.
1179     setOperationAction(ISD::SRL,               MVT::v4i64, Custom);
1180     setOperationAction(ISD::SRL,               MVT::v8i32, Custom);
1181
1182     setOperationAction(ISD::SHL,               MVT::v4i64, Custom);
1183     setOperationAction(ISD::SHL,               MVT::v8i32, Custom);
1184
1185     setOperationAction(ISD::SRA,               MVT::v8i32, Custom);
1186
1187     // Custom lower several nodes for 256-bit types.
1188     for (MVT VT : MVT::vector_valuetypes()) {
1189       if (VT.getScalarSizeInBits() >= 32) {
1190         setOperationAction(ISD::MLOAD,  VT, Legal);
1191         setOperationAction(ISD::MSTORE, VT, Legal);
1192       }
1193       // Extract subvector is special because the value type
1194       // (result) is 128-bit but the source is 256-bit wide.
1195       if (VT.is128BitVector()) {
1196         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1197       }
1198       // Do not attempt to custom lower other non-256-bit vectors
1199       if (!VT.is256BitVector())
1200         continue;
1201
1202       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
1203       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
1204       setOperationAction(ISD::VSELECT,            VT, Custom);
1205       setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
1206       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1207       setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
1208       setOperationAction(ISD::INSERT_SUBVECTOR,   VT, Custom);
1209       setOperationAction(ISD::CONCAT_VECTORS,     VT, Custom);
1210     }
1211
1212     if (Subtarget->hasInt256())
1213       setOperationAction(ISD::VSELECT,         MVT::v32i8, Legal);
1214
1215
1216     // Promote v32i8, v16i16, v8i32 load, select, and, or, xor to v4i64.
1217     for (int i = MVT::v32i8; i != MVT::v4i64; ++i) {
1218       MVT VT = (MVT::SimpleValueType)i;
1219
1220       // Do not attempt to promote non-256-bit vectors
1221       if (!VT.is256BitVector())
1222         continue;
1223
1224       setOperationAction(ISD::AND,    VT, Promote);
1225       AddPromotedToType (ISD::AND,    VT, MVT::v4i64);
1226       setOperationAction(ISD::OR,     VT, Promote);
1227       AddPromotedToType (ISD::OR,     VT, MVT::v4i64);
1228       setOperationAction(ISD::XOR,    VT, Promote);
1229       AddPromotedToType (ISD::XOR,    VT, MVT::v4i64);
1230       setOperationAction(ISD::LOAD,   VT, Promote);
1231       AddPromotedToType (ISD::LOAD,   VT, MVT::v4i64);
1232       setOperationAction(ISD::SELECT, VT, Promote);
1233       AddPromotedToType (ISD::SELECT, VT, MVT::v4i64);
1234     }
1235   }
1236
1237   if (!Subtarget->useSoftFloat() && Subtarget->hasAVX512()) {
1238     addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1239     addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1240     addRegisterClass(MVT::v8i64,  &X86::VR512RegClass);
1241     addRegisterClass(MVT::v8f64,  &X86::VR512RegClass);
1242
1243     addRegisterClass(MVT::i1,     &X86::VK1RegClass);
1244     addRegisterClass(MVT::v8i1,   &X86::VK8RegClass);
1245     addRegisterClass(MVT::v16i1,  &X86::VK16RegClass);
1246
1247     for (MVT VT : MVT::fp_vector_valuetypes())
1248       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);
1249
1250     setLoadExtAction(ISD::ZEXTLOAD, MVT::v16i32, MVT::v16i8, Legal);
1251     setLoadExtAction(ISD::SEXTLOAD, MVT::v16i32, MVT::v16i8, Legal);
1252     setLoadExtAction(ISD::ZEXTLOAD, MVT::v16i32, MVT::v16i16, Legal);
1253     setLoadExtAction(ISD::SEXTLOAD, MVT::v16i32, MVT::v16i16, Legal);
1254     setLoadExtAction(ISD::ZEXTLOAD, MVT::v32i16, MVT::v32i8, Legal);
1255     setLoadExtAction(ISD::SEXTLOAD, MVT::v32i16, MVT::v32i8, Legal);
1256     setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i64,  MVT::v8i8,  Legal);
1257     setLoadExtAction(ISD::SEXTLOAD, MVT::v8i64,  MVT::v8i8,  Legal);
1258     setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i64,  MVT::v8i16,  Legal);
1259     setLoadExtAction(ISD::SEXTLOAD, MVT::v8i64,  MVT::v8i16,  Legal);
1260     setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i64,  MVT::v8i32,  Legal);
1261     setLoadExtAction(ISD::SEXTLOAD, MVT::v8i64,  MVT::v8i32,  Legal);
1262
1263     setOperationAction(ISD::BR_CC,              MVT::i1,    Expand);
1264     setOperationAction(ISD::SETCC,              MVT::i1,    Custom);
1265     setOperationAction(ISD::XOR,                MVT::i1,    Legal);
1266     setOperationAction(ISD::OR,                 MVT::i1,    Legal);
1267     setOperationAction(ISD::AND,                MVT::i1,    Legal);
1268     setOperationAction(ISD::SUB,                MVT::i1,    Custom);
1269     setOperationAction(ISD::ADD,                MVT::i1,    Custom);
1270     setOperationAction(ISD::MUL,                MVT::i1,    Custom);
1271     setOperationAction(ISD::LOAD,               MVT::v16f32, Legal);
1272     setOperationAction(ISD::LOAD,               MVT::v8f64, Legal);
1273     setOperationAction(ISD::LOAD,               MVT::v8i64, Legal);
1274     setOperationAction(ISD::LOAD,               MVT::v16i32, Legal);
1275     setOperationAction(ISD::LOAD,               MVT::v16i1, Legal);
1276
1277     setOperationAction(ISD::FADD,               MVT::v16f32, Legal);
1278     setOperationAction(ISD::FSUB,               MVT::v16f32, Legal);
1279     setOperationAction(ISD::FMUL,               MVT::v16f32, Legal);
1280     setOperationAction(ISD::FDIV,               MVT::v16f32, Legal);
1281     setOperationAction(ISD::FSQRT,              MVT::v16f32, Legal);
1282     setOperationAction(ISD::FNEG,               MVT::v16f32, Custom);
1283
1284     setOperationAction(ISD::FADD,               MVT::v8f64, Legal);
1285     setOperationAction(ISD::FSUB,               MVT::v8f64, Legal);
1286     setOperationAction(ISD::FMUL,               MVT::v8f64, Legal);
1287     setOperationAction(ISD::FDIV,               MVT::v8f64, Legal);
1288     setOperationAction(ISD::FSQRT,              MVT::v8f64, Legal);
1289     setOperationAction(ISD::FNEG,               MVT::v8f64, Custom);
1290     setOperationAction(ISD::FMA,                MVT::v8f64, Legal);
1291     setOperationAction(ISD::FMA,                MVT::v16f32, Legal);
1292
1293     setOperationAction(ISD::FP_TO_SINT,         MVT::i32, Legal);
1294     setOperationAction(ISD::FP_TO_UINT,         MVT::i32, Legal);
1295     setOperationAction(ISD::SINT_TO_FP,         MVT::i32, Legal);
1296     setOperationAction(ISD::UINT_TO_FP,         MVT::i32, Legal);
1297     if (Subtarget->is64Bit()) {
1298       setOperationAction(ISD::FP_TO_UINT,       MVT::i64, Legal);
1299       setOperationAction(ISD::FP_TO_SINT,       MVT::i64, Legal);
1300       setOperationAction(ISD::SINT_TO_FP,       MVT::i64, Legal);
1301       setOperationAction(ISD::UINT_TO_FP,       MVT::i64, Legal);
1302     }
1303     setOperationAction(ISD::FP_TO_SINT,         MVT::v16i32, Legal);
1304     setOperationAction(ISD::FP_TO_UINT,         MVT::v16i32, Legal);
1305     setOperationAction(ISD::FP_TO_UINT,         MVT::v8i32, Legal);
1306     setOperationAction(ISD::FP_TO_UINT,         MVT::v4i32, Legal);
1307     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i32, Legal);
1308     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i1,   Custom);
1309     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i1,  Custom);
1310     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i8,  Promote);
1311     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i16, Promote);
1312     setOperationAction(ISD::UINT_TO_FP,         MVT::v16i32, Legal);
1313     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i32, Legal);
1314     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Legal);
1315     setOperationAction(ISD::UINT_TO_FP,         MVT::v16i8, Custom);
1316     setOperationAction(ISD::UINT_TO_FP,         MVT::v16i16, Custom);
1317     setOperationAction(ISD::FP_ROUND,           MVT::v8f32, Legal);
1318     setOperationAction(ISD::FP_EXTEND,          MVT::v8f32, Legal);
1319
1320     setOperationAction(ISD::TRUNCATE,           MVT::i1, Custom);
1321     setOperationAction(ISD::TRUNCATE,           MVT::v16i8, Custom);
1322     setOperationAction(ISD::TRUNCATE,           MVT::v8i32, Custom);
1323     if (Subtarget->hasDQI()) {
1324       setOperationAction(ISD::TRUNCATE,           MVT::v2i1, Custom);
1325       setOperationAction(ISD::TRUNCATE,           MVT::v4i1, Custom);
1326     }
1327     setOperationAction(ISD::TRUNCATE,           MVT::v8i1, Custom);
1328     setOperationAction(ISD::TRUNCATE,           MVT::v16i1, Custom);
1329     setOperationAction(ISD::TRUNCATE,           MVT::v16i16, Custom);
1330     setOperationAction(ISD::ZERO_EXTEND,        MVT::v16i32, Custom);
1331     setOperationAction(ISD::ZERO_EXTEND,        MVT::v8i64, Custom);
1332     setOperationAction(ISD::ANY_EXTEND,         MVT::v16i32, Custom);
1333     setOperationAction(ISD::ANY_EXTEND,         MVT::v8i64, Custom);
1334     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i32, Custom);
1335     setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i64, Custom);
1336     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i8, Custom);
1337     setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i16, Custom);
1338     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i16, Custom);
1339     if (Subtarget->hasDQI()) {
1340       setOperationAction(ISD::SIGN_EXTEND,        MVT::v4i32, Custom);
1341       setOperationAction(ISD::SIGN_EXTEND,        MVT::v2i64, Custom);
1342     }
1343     setOperationAction(ISD::FFLOOR,             MVT::v16f32, Legal);
1344     setOperationAction(ISD::FFLOOR,             MVT::v8f64, Legal);
1345     setOperationAction(ISD::FCEIL,              MVT::v16f32, Legal);
1346     setOperationAction(ISD::FCEIL,              MVT::v8f64, Legal);
1347     setOperationAction(ISD::FTRUNC,             MVT::v16f32, Legal);
1348     setOperationAction(ISD::FTRUNC,             MVT::v8f64, Legal);
1349     setOperationAction(ISD::FRINT,              MVT::v16f32, Legal);
1350     setOperationAction(ISD::FRINT,              MVT::v8f64, Legal);
1351     setOperationAction(ISD::FNEARBYINT,         MVT::v16f32, Legal);
1352     setOperationAction(ISD::FNEARBYINT,         MVT::v8f64, Legal);
1353
1354     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8f64,  Custom);
1355     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i64,  Custom);
1356     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16f32,  Custom);
1357     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i32,  Custom);
1358     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i1, Legal);
1359
1360     setOperationAction(ISD::SETCC,              MVT::v16i1, Custom);
1361     setOperationAction(ISD::SETCC,              MVT::v8i1, Custom);
1362
1363     setOperationAction(ISD::MUL,              MVT::v8i64, Custom);
1364
1365     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1,  Custom);
1366     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom);
1367     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i1, Custom);
1368     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i1, Custom);
1369     setOperationAction(ISD::BUILD_VECTOR,       MVT::v8i1, Custom);
1370     setOperationAction(ISD::BUILD_VECTOR,       MVT::v16i1, Custom);
1371     setOperationAction(ISD::SELECT,             MVT::v8f64, Custom);
1372     setOperationAction(ISD::SELECT,             MVT::v8i64, Custom);
1373     setOperationAction(ISD::SELECT,             MVT::v16f32, Custom);
1374     setOperationAction(ISD::SELECT,             MVT::v16i1, Custom);
1375     setOperationAction(ISD::SELECT,             MVT::v8i1,  Custom);
1376
1377     setOperationAction(ISD::ADD,                MVT::v8i64, Legal);
1378     setOperationAction(ISD::ADD,                MVT::v16i32, Legal);
1379
1380     setOperationAction(ISD::SUB,                MVT::v8i64, Legal);
1381     setOperationAction(ISD::SUB,                MVT::v16i32, Legal);
1382
1383     setOperationAction(ISD::MUL,                MVT::v16i32, Legal);
1384
1385     setOperationAction(ISD::SRL,                MVT::v8i64, Custom);
1386     setOperationAction(ISD::SRL,                MVT::v16i32, Custom);
1387
1388     setOperationAction(ISD::SHL,                MVT::v8i64, Custom);
1389     setOperationAction(ISD::SHL,                MVT::v16i32, Custom);
1390
1391     setOperationAction(ISD::SRA,                MVT::v8i64, Custom);
1392     setOperationAction(ISD::SRA,                MVT::v16i32, Custom);
1393
1394     setOperationAction(ISD::AND,                MVT::v8i64, Legal);
1395     setOperationAction(ISD::OR,                 MVT::v8i64, Legal);
1396     setOperationAction(ISD::XOR,                MVT::v8i64, Legal);
1397     setOperationAction(ISD::AND,                MVT::v16i32, Legal);
1398     setOperationAction(ISD::OR,                 MVT::v16i32, Legal);
1399     setOperationAction(ISD::XOR,                MVT::v16i32, Legal);
1400
1401     if (Subtarget->hasCDI()) {
1402       setOperationAction(ISD::CTLZ,             MVT::v8i64, Legal);
1403       setOperationAction(ISD::CTLZ,             MVT::v16i32, Legal);
1404     }
1405     if (Subtarget->hasDQI()) {
1406       setOperationAction(ISD::MUL,             MVT::v2i64, Legal);
1407       setOperationAction(ISD::MUL,             MVT::v4i64, Legal);
1408       setOperationAction(ISD::MUL,             MVT::v8i64, Legal);
1409     }
1410     // Custom lower several nodes.
1411     for (MVT VT : MVT::vector_valuetypes()) {
1412       unsigned EltSize = VT.getVectorElementType().getSizeInBits();
1413       if (EltSize == 1) {
1414         setOperationAction(ISD::AND, VT, Legal);
1415         setOperationAction(ISD::OR,  VT, Legal);
1416         setOperationAction(ISD::XOR,  VT, Legal);
1417       }
1418       if (EltSize >= 32 && VT.getSizeInBits() <= 512) {
1419         setOperationAction(ISD::MGATHER,  VT, Custom);
1420         setOperationAction(ISD::MSCATTER, VT, Custom);
1421       }
1422       // Extract subvector is special because the value type
1423       // (result) is 256/128-bit but the source is 512-bit wide.
1424       if (VT.is128BitVector() || VT.is256BitVector()) {
1425         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1426       }
1427       if (VT.getVectorElementType() == MVT::i1)
1428         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1429
1430       // Do not attempt to custom lower other non-512-bit vectors
1431       if (!VT.is512BitVector())
1432         continue;
1433
1434       if (EltSize >= 32) {
1435         setOperationAction(ISD::VECTOR_SHUFFLE,      VT, Custom);
1436         setOperationAction(ISD::INSERT_VECTOR_ELT,   VT, Custom);
1437         setOperationAction(ISD::BUILD_VECTOR,        VT, Custom);
1438         setOperationAction(ISD::VSELECT,             VT, Legal);
1439         setOperationAction(ISD::EXTRACT_VECTOR_ELT,  VT, Custom);
1440         setOperationAction(ISD::SCALAR_TO_VECTOR,    VT, Custom);
1441         setOperationAction(ISD::INSERT_SUBVECTOR,    VT, Custom);
1442         setOperationAction(ISD::MLOAD,               VT, Legal);
1443         setOperationAction(ISD::MSTORE,              VT, Legal);
1444       }
1445     }
1446     for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
1447       MVT VT = (MVT::SimpleValueType)i;
1448
1449       // Do not attempt to promote non-512-bit vectors.
1450       if (!VT.is512BitVector())
1451         continue;
1452
1453       setOperationAction(ISD::SELECT, VT, Promote);
1454       AddPromotedToType (ISD::SELECT, VT, MVT::v8i64);
1455     }
1456   }// has  AVX-512
1457
1458   if (!Subtarget->useSoftFloat() && Subtarget->hasBWI()) {
1459     addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1460     addRegisterClass(MVT::v64i8,  &X86::VR512RegClass);
1461
1462     addRegisterClass(MVT::v32i1,  &X86::VK32RegClass);
1463     addRegisterClass(MVT::v64i1,  &X86::VK64RegClass);
1464
1465     setOperationAction(ISD::LOAD,               MVT::v32i16, Legal);
1466     setOperationAction(ISD::LOAD,               MVT::v64i8, Legal);
1467     setOperationAction(ISD::SETCC,              MVT::v32i1, Custom);
1468     setOperationAction(ISD::SETCC,              MVT::v64i1, Custom);
1469     setOperationAction(ISD::ADD,                MVT::v32i16, Legal);
1470     setOperationAction(ISD::ADD,                MVT::v64i8, Legal);
1471     setOperationAction(ISD::SUB,                MVT::v32i16, Legal);
1472     setOperationAction(ISD::SUB,                MVT::v64i8, Legal);
1473     setOperationAction(ISD::MUL,                MVT::v32i16, Legal);
1474     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v32i1, Custom);
1475     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v64i1, Custom);
1476     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v32i1, Custom);
1477     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v64i1, Custom);
1478     setOperationAction(ISD::SELECT,             MVT::v32i1, Custom);
1479     setOperationAction(ISD::SELECT,             MVT::v64i1, Custom);
1480     setOperationAction(ISD::SIGN_EXTEND,        MVT::v32i8, Custom);
1481     setOperationAction(ISD::ZERO_EXTEND,        MVT::v32i8, Custom);
1482     setOperationAction(ISD::SIGN_EXTEND,        MVT::v32i16, Custom);
1483     setOperationAction(ISD::ZERO_EXTEND,        MVT::v32i16, Custom);
1484     setOperationAction(ISD::SIGN_EXTEND,        MVT::v64i8, Custom);
1485     setOperationAction(ISD::ZERO_EXTEND,        MVT::v64i8, Custom);
1486     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v32i1, Custom);
1487     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v64i1, Custom);
1488     setOperationAction(ISD::VSELECT,            MVT::v32i16, Legal);
1489     setOperationAction(ISD::VSELECT,            MVT::v64i8, Legal);
1490     setOperationAction(ISD::TRUNCATE,           MVT::v32i1, Custom);
1491     setOperationAction(ISD::TRUNCATE,           MVT::v64i1, Custom);
1492
1493     for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
1494       const MVT VT = (MVT::SimpleValueType)i;
1495
1496       const unsigned EltSize = VT.getVectorElementType().getSizeInBits();
1497
1498       // Do not attempt to promote non-512-bit vectors.
1499       if (!VT.is512BitVector())
1500         continue;
1501
1502       if (EltSize < 32) {
1503         setOperationAction(ISD::BUILD_VECTOR,        VT, Custom);
1504         setOperationAction(ISD::VSELECT,             VT, Legal);
1505       }
1506     }
1507   }
1508
1509   if (!Subtarget->useSoftFloat() && Subtarget->hasVLX()) {
1510     addRegisterClass(MVT::v4i1,   &X86::VK4RegClass);
1511     addRegisterClass(MVT::v2i1,   &X86::VK2RegClass);
1512
1513     setOperationAction(ISD::SETCC,              MVT::v4i1, Custom);
1514     setOperationAction(ISD::SETCC,              MVT::v2i1, Custom);
1515     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v4i1, Custom);
1516     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i1, Custom);
1517     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v8i1, Custom);
1518     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v4i1, Custom);
1519     setOperationAction(ISD::SELECT,             MVT::v4i1, Custom);
1520     setOperationAction(ISD::SELECT,             MVT::v2i1, Custom);
1521     setOperationAction(ISD::BUILD_VECTOR,       MVT::v4i1, Custom);
1522     setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i1, Custom);
1523
1524     setOperationAction(ISD::AND,                MVT::v8i32, Legal);
1525     setOperationAction(ISD::OR,                 MVT::v8i32, Legal);
1526     setOperationAction(ISD::XOR,                MVT::v8i32, Legal);
1527     setOperationAction(ISD::AND,                MVT::v4i32, Legal);
1528     setOperationAction(ISD::OR,                 MVT::v4i32, Legal);
1529     setOperationAction(ISD::XOR,                MVT::v4i32, Legal);
1530     setOperationAction(ISD::SRA,                MVT::v2i64, Custom);
1531     setOperationAction(ISD::SRA,                MVT::v4i64, Custom);
1532   }
1533
1534   // We want to custom lower some of our intrinsics.
1535   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
1536   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
1537   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
1538   if (!Subtarget->is64Bit())
1539     setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
1540
1541   // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
1542   // handle type legalization for these operations here.
1543   //
1544   // FIXME: We really should do custom legalization for addition and
1545   // subtraction on x86-32 once PR3203 is fixed.  We really can't do much better
1546   // than generic legalization for 64-bit multiplication-with-overflow, though.
1547   for (unsigned i = 0, e = 3+Subtarget->is64Bit(); i != e; ++i) {
1548     // Add/Sub/Mul with overflow operations are custom lowered.
1549     MVT VT = IntVTs[i];
1550     setOperationAction(ISD::SADDO, VT, Custom);
1551     setOperationAction(ISD::UADDO, VT, Custom);
1552     setOperationAction(ISD::SSUBO, VT, Custom);
1553     setOperationAction(ISD::USUBO, VT, Custom);
1554     setOperationAction(ISD::SMULO, VT, Custom);
1555     setOperationAction(ISD::UMULO, VT, Custom);
1556   }
1557
1558
1559   if (!Subtarget->is64Bit()) {
1560     // These libcalls are not available in 32-bit.
1561     setLibcallName(RTLIB::SHL_I128, nullptr);
1562     setLibcallName(RTLIB::SRL_I128, nullptr);
1563     setLibcallName(RTLIB::SRA_I128, nullptr);
1564   }
1565
1566   // Combine sin / cos into one node or libcall if possible.
1567   if (Subtarget->hasSinCos()) {
1568     setLibcallName(RTLIB::SINCOS_F32, "sincosf");
1569     setLibcallName(RTLIB::SINCOS_F64, "sincos");
1570     if (Subtarget->isTargetDarwin()) {
1571       // For MacOSX, we don't want the normal expansion of a libcall to sincos.
1572       // We want to issue a libcall to __sincos_stret to avoid memory traffic.
1573       setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
1574       setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
1575     }
1576   }
1577
1578   if (Subtarget->isTargetWin64()) {
1579     setOperationAction(ISD::SDIV, MVT::i128, Custom);
1580     setOperationAction(ISD::UDIV, MVT::i128, Custom);
1581     setOperationAction(ISD::SREM, MVT::i128, Custom);
1582     setOperationAction(ISD::UREM, MVT::i128, Custom);
1583     setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
1584     setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
1585   }
1586
1587   // We have target-specific dag combine patterns for the following nodes:
1588   setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
1589   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
1590   setTargetDAGCombine(ISD::BITCAST);
1591   setTargetDAGCombine(ISD::VSELECT);
1592   setTargetDAGCombine(ISD::SELECT);
1593   setTargetDAGCombine(ISD::SHL);
1594   setTargetDAGCombine(ISD::SRA);
1595   setTargetDAGCombine(ISD::SRL);
1596   setTargetDAGCombine(ISD::OR);
1597   setTargetDAGCombine(ISD::AND);
1598   setTargetDAGCombine(ISD::ADD);
1599   setTargetDAGCombine(ISD::FADD);
1600   setTargetDAGCombine(ISD::FSUB);
1601   setTargetDAGCombine(ISD::FMA);
1602   setTargetDAGCombine(ISD::SUB);
1603   setTargetDAGCombine(ISD::LOAD);
1604   setTargetDAGCombine(ISD::MLOAD);
1605   setTargetDAGCombine(ISD::STORE);
1606   setTargetDAGCombine(ISD::MSTORE);
1607   setTargetDAGCombine(ISD::ZERO_EXTEND);
1608   setTargetDAGCombine(ISD::ANY_EXTEND);
1609   setTargetDAGCombine(ISD::SIGN_EXTEND);
1610   setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
1611   setTargetDAGCombine(ISD::SINT_TO_FP);
1612   setTargetDAGCombine(ISD::SETCC);
1613   setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
1614   setTargetDAGCombine(ISD::BUILD_VECTOR);
1615   setTargetDAGCombine(ISD::MUL);
1616   setTargetDAGCombine(ISD::XOR);
1617
1618   computeRegisterProperties(Subtarget->getRegisterInfo());
1619
1620   // On Darwin, -Os means optimize for size without hurting performance,
1621   // do not reduce the limit.
1622   MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
1623   MaxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8;
1624   MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
1625   MaxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
1626   MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
1627   MaxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
1628   setPrefLoopAlignment(4); // 2^4 bytes.
1629
1630   // Predictable cmov don't hurt on atom because it's in-order.
1631   PredictableSelectIsExpensive = !Subtarget->isAtom();
1632   EnableExtLdPromotion = true;
1633   setPrefFunctionAlignment(4); // 2^4 bytes.
1634
1635   verifyIntrinsicTables();
1636 }
1637
1638 // This has so far only been implemented for 64-bit MachO.
1639 bool X86TargetLowering::useLoadStackGuardNode() const {
1640   return Subtarget->isTargetMachO() && Subtarget->is64Bit();
1641 }
1642
1643 TargetLoweringBase::LegalizeTypeAction
1644 X86TargetLowering::getPreferredVectorAction(EVT VT) const {
1645   if (ExperimentalVectorWideningLegalization &&
1646       VT.getVectorNumElements() != 1 &&
1647       VT.getVectorElementType().getSimpleVT() != MVT::i1)
1648     return TypeWidenVector;
1649
1650   return TargetLoweringBase::getPreferredVectorAction(VT);
1651 }
1652
1653 EVT X86TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
1654   if (!VT.isVector())
1655     return Subtarget->hasAVX512() ? MVT::i1: MVT::i8;
1656
1657   const unsigned NumElts = VT.getVectorNumElements();
1658   const EVT EltVT = VT.getVectorElementType();
1659   if (VT.is512BitVector()) {
1660     if (Subtarget->hasAVX512())
1661       if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
1662           EltVT == MVT::f32 || EltVT == MVT::f64)
1663         switch(NumElts) {
1664         case  8: return MVT::v8i1;
1665         case 16: return MVT::v16i1;
1666       }
1667     if (Subtarget->hasBWI())
1668       if (EltVT == MVT::i8 || EltVT == MVT::i16)
1669         switch(NumElts) {
1670         case 32: return MVT::v32i1;
1671         case 64: return MVT::v64i1;
1672       }
1673   }
1674
1675   if (VT.is256BitVector() || VT.is128BitVector()) {
1676     if (Subtarget->hasVLX())
1677       if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
1678           EltVT == MVT::f32 || EltVT == MVT::f64)
1679         switch(NumElts) {
1680         case 2: return MVT::v2i1;
1681         case 4: return MVT::v4i1;
1682         case 8: return MVT::v8i1;
1683       }
1684     if (Subtarget->hasBWI() && Subtarget->hasVLX())
1685       if (EltVT == MVT::i8 || EltVT == MVT::i16)
1686         switch(NumElts) {
1687         case  8: return MVT::v8i1;
1688         case 16: return MVT::v16i1;
1689         case 32: return MVT::v32i1;
1690       }
1691   }
1692
1693   return VT.changeVectorElementTypeToInteger();
1694 }
1695
1696 /// Helper for getByValTypeAlignment to determine
1697 /// the desired ByVal argument alignment.
1698 static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
1699   if (MaxAlign == 16)
1700     return;
1701   if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
1702     if (VTy->getBitWidth() == 128)
1703       MaxAlign = 16;
1704   } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
1705     unsigned EltAlign = 0;
1706     getMaxByValAlign(ATy->getElementType(), EltAlign);
1707     if (EltAlign > MaxAlign)
1708       MaxAlign = EltAlign;
1709   } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
1710     for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
1711       unsigned EltAlign = 0;
1712       getMaxByValAlign(STy->getElementType(i), EltAlign);
1713       if (EltAlign > MaxAlign)
1714         MaxAlign = EltAlign;
1715       if (MaxAlign == 16)
1716         break;
1717     }
1718   }
1719 }
1720
1721 /// Return the desired alignment for ByVal aggregate
1722 /// function arguments in the caller parameter area. For X86, aggregates
1723 /// that contain SSE vectors are placed at 16-byte boundaries while the rest
1724 /// are at 4-byte boundaries.
1725 unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const {
1726   if (Subtarget->is64Bit()) {
1727     // Max of 8 and alignment of type.
1728     unsigned TyAlign = TD->getABITypeAlignment(Ty);
1729     if (TyAlign > 8)
1730       return TyAlign;
1731     return 8;
1732   }
1733
1734   unsigned Align = 4;
1735   if (Subtarget->hasSSE1())
1736     getMaxByValAlign(Ty, Align);
1737   return Align;
1738 }
1739
1740 /// Returns the target specific optimal type for load
1741 /// and store operations as a result of memset, memcpy, and memmove
1742 /// lowering. If DstAlign is zero that means it's safe to destination
1743 /// alignment can satisfy any constraint. Similarly if SrcAlign is zero it
1744 /// means there isn't a need to check it against alignment requirement,
1745 /// probably because the source does not need to be loaded. If 'IsMemset' is
1746 /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
1747 /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
1748 /// source is constant so it does not need to be loaded.
1749 /// It returns EVT::Other if the type should be determined using generic
1750 /// target-independent logic.
1751 EVT
1752 X86TargetLowering::getOptimalMemOpType(uint64_t Size,
1753                                        unsigned DstAlign, unsigned SrcAlign,
1754                                        bool IsMemset, bool ZeroMemset,
1755                                        bool MemcpyStrSrc,
1756                                        MachineFunction &MF) const {
1757   const Function *F = MF.getFunction();
1758   if ((!IsMemset || ZeroMemset) &&
1759       !F->hasFnAttribute(Attribute::NoImplicitFloat)) {
1760     if (Size >= 16 &&
1761         (Subtarget->isUnalignedMemAccessFast() ||
1762          ((DstAlign == 0 || DstAlign >= 16) &&
1763           (SrcAlign == 0 || SrcAlign >= 16)))) {
1764       if (Size >= 32) {
1765         if (Subtarget->hasInt256())
1766           return MVT::v8i32;
1767         if (Subtarget->hasFp256())
1768           return MVT::v8f32;
1769       }
1770       if (Subtarget->hasSSE2())
1771         return MVT::v4i32;
1772       if (Subtarget->hasSSE1())
1773         return MVT::v4f32;
1774     } else if (!MemcpyStrSrc && Size >= 8 &&
1775                !Subtarget->is64Bit() &&
1776                Subtarget->hasSSE2()) {
1777       // Do not use f64 to lower memcpy if source is string constant. It's
1778       // better to use i32 to avoid the loads.
1779       return MVT::f64;
1780     }
1781   }
1782   if (Subtarget->is64Bit() && Size >= 8)
1783     return MVT::i64;
1784   return MVT::i32;
1785 }
1786
1787 bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
1788   if (VT == MVT::f32)
1789     return X86ScalarSSEf32;
1790   else if (VT == MVT::f64)
1791     return X86ScalarSSEf64;
1792   return true;
1793 }
1794
1795 bool
1796 X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
1797                                                   unsigned,
1798                                                   unsigned,
1799                                                   bool *Fast) const {
1800   if (Fast)
1801     *Fast = Subtarget->isUnalignedMemAccessFast();
1802   return true;
1803 }
1804
1805 /// Return the entry encoding for a jump table in the
1806 /// current function.  The returned value is a member of the
1807 /// MachineJumpTableInfo::JTEntryKind enum.
1808 unsigned X86TargetLowering::getJumpTableEncoding() const {
1809   // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
1810   // symbol.
1811   if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
1812       Subtarget->isPICStyleGOT())
1813     return MachineJumpTableInfo::EK_Custom32;
1814
1815   // Otherwise, use the normal jump table encoding heuristics.
1816   return TargetLowering::getJumpTableEncoding();
1817 }
1818
1819 bool X86TargetLowering::useSoftFloat() const {
1820   return Subtarget->useSoftFloat();
1821 }
1822
1823 const MCExpr *
1824 X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
1825                                              const MachineBasicBlock *MBB,
1826                                              unsigned uid,MCContext &Ctx) const{
1827   assert(MBB->getParent()->getTarget().getRelocationModel() == Reloc::PIC_ &&
1828          Subtarget->isPICStyleGOT());
1829   // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
1830   // entries.
1831   return MCSymbolRefExpr::create(MBB->getSymbol(),
1832                                  MCSymbolRefExpr::VK_GOTOFF, Ctx);
1833 }
1834
1835 /// Returns relocation base for the given PIC jumptable.
1836 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
1837                                                     SelectionDAG &DAG) const {
1838   if (!Subtarget->is64Bit())
1839     // This doesn't have SDLoc associated with it, but is not really the
1840     // same as a Register.
1841     return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy());
1842   return Table;
1843 }
1844
1845 /// This returns the relocation base for the given PIC jumptable,
1846 /// the same as getPICJumpTableRelocBase, but as an MCExpr.
1847 const MCExpr *X86TargetLowering::
1848 getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
1849                              MCContext &Ctx) const {
1850   // X86-64 uses RIP relative addressing based on the jump table label.
1851   if (Subtarget->isPICStyleRIPRel())
1852     return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
1853
1854   // Otherwise, the reference is relative to the PIC base.
1855   return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
1856 }
1857
1858 std::pair<const TargetRegisterClass *, uint8_t>
1859 X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
1860                                            MVT VT) const {
1861   const TargetRegisterClass *RRC = nullptr;
1862   uint8_t Cost = 1;
1863   switch (VT.SimpleTy) {
1864   default:
1865     return TargetLowering::findRepresentativeClass(TRI, VT);
1866   case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
1867     RRC = Subtarget->is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
1868     break;
1869   case MVT::x86mmx:
1870     RRC = &X86::VR64RegClass;
1871     break;
1872   case MVT::f32: case MVT::f64:
1873   case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
1874   case MVT::v4f32: case MVT::v2f64:
1875   case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32:
1876   case MVT::v4f64:
1877     RRC = &X86::VR128RegClass;
1878     break;
1879   }
1880   return std::make_pair(RRC, Cost);
1881 }
1882
1883 bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
1884                                                unsigned &Offset) const {
1885   if (!Subtarget->isTargetLinux())
1886     return false;
1887
1888   if (Subtarget->is64Bit()) {
1889     // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs:
1890     Offset = 0x28;
1891     if (getTargetMachine().getCodeModel() == CodeModel::Kernel)
1892       AddressSpace = 256;
1893     else
1894       AddressSpace = 257;
1895   } else {
1896     // %gs:0x14 on i386
1897     Offset = 0x14;
1898     AddressSpace = 256;
1899   }
1900   return true;
1901 }
1902
1903 bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
1904                                             unsigned DestAS) const {
1905   assert(SrcAS != DestAS && "Expected different address spaces!");
1906
1907   return SrcAS < 256 && DestAS < 256;
1908 }
1909
1910 //===----------------------------------------------------------------------===//
1911 //               Return Value Calling Convention Implementation
1912 //===----------------------------------------------------------------------===//
1913
1914 #include "X86GenCallingConv.inc"
1915
1916 bool
1917 X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv,
1918                                   MachineFunction &MF, bool isVarArg,
1919                         const SmallVectorImpl<ISD::OutputArg> &Outs,
1920                         LLVMContext &Context) const {
1921   SmallVector<CCValAssign, 16> RVLocs;
1922   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
1923   return CCInfo.CheckReturn(Outs, RetCC_X86);
1924 }
1925
1926 const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
1927   static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
1928   return ScratchRegs;
1929 }
1930
1931 SDValue
1932 X86TargetLowering::LowerReturn(SDValue Chain,
1933                                CallingConv::ID CallConv, bool isVarArg,
1934                                const SmallVectorImpl<ISD::OutputArg> &Outs,
1935                                const SmallVectorImpl<SDValue> &OutVals,
1936                                SDLoc dl, SelectionDAG &DAG) const {
1937   MachineFunction &MF = DAG.getMachineFunction();
1938   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
1939
1940   SmallVector<CCValAssign, 16> RVLocs;
1941   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
1942   CCInfo.AnalyzeReturn(Outs, RetCC_X86);
1943
1944   SDValue Flag;
1945   SmallVector<SDValue, 6> RetOps;
1946   RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
1947   // Operand #1 = Bytes To Pop
1948   RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
1949                    MVT::i16));
1950
1951   // Copy the result values into the output registers.
1952   for (unsigned i = 0; i != RVLocs.size(); ++i) {
1953     CCValAssign &VA = RVLocs[i];
1954     assert(VA.isRegLoc() && "Can only return in registers!");
1955     SDValue ValToCopy = OutVals[i];
1956     EVT ValVT = ValToCopy.getValueType();
1957
1958     // Promote values to the appropriate types.
1959     if (VA.getLocInfo() == CCValAssign::SExt)
1960       ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
1961     else if (VA.getLocInfo() == CCValAssign::ZExt)
1962       ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
1963     else if (VA.getLocInfo() == CCValAssign::AExt) {
1964       if (ValVT.isVector() && ValVT.getScalarType() == MVT::i1)
1965         ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
1966       else
1967         ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
1968     }
1969     else if (VA.getLocInfo() == CCValAssign::BCvt)
1970       ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
1971
1972     assert(VA.getLocInfo() != CCValAssign::FPExt &&
1973            "Unexpected FP-extend for return value.");
1974
1975     // If this is x86-64, and we disabled SSE, we can't return FP values,
1976     // or SSE or MMX vectors.
1977     if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
1978          VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
1979           (Subtarget->is64Bit() && !Subtarget->hasSSE1())) {
1980       report_fatal_error("SSE register return with SSE disabled");
1981     }
1982     // Likewise we can't return F64 values with SSE1 only.  gcc does so, but
1983     // llvm-gcc has never done it right and no one has noticed, so this
1984     // should be OK for now.
1985     if (ValVT == MVT::f64 &&
1986         (Subtarget->is64Bit() && !Subtarget->hasSSE2()))
1987       report_fatal_error("SSE2 register return with SSE2 disabled");
1988
1989     // Returns in ST0/ST1 are handled specially: these are pushed as operands to
1990     // the RET instruction and handled by the FP Stackifier.
1991     if (VA.getLocReg() == X86::FP0 ||
1992         VA.getLocReg() == X86::FP1) {
1993       // If this is a copy from an xmm register to ST(0), use an FPExtend to
1994       // change the value to the FP stack register class.
1995       if (isScalarFPTypeInSSEReg(VA.getValVT()))
1996         ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
1997       RetOps.push_back(ValToCopy);
1998       // Don't emit a copytoreg.
1999       continue;
2000     }
2001
2002     // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
2003     // which is returned in RAX / RDX.
2004     if (Subtarget->is64Bit()) {
2005       if (ValVT == MVT::x86mmx) {
2006         if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
2007           ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
2008           ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
2009                                   ValToCopy);
2010           // If we don't have SSE2 available, convert to v4f32 so the generated
2011           // register is legal.
2012           if (!Subtarget->hasSSE2())
2013             ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
2014         }
2015       }
2016     }
2017
2018     Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
2019     Flag = Chain.getValue(1);
2020     RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
2021   }
2022
2023   // All x86 ABIs require that for returning structs by value we copy
2024   // the sret argument into %rax/%eax (depending on ABI) for the return.
2025   // We saved the argument into a virtual register in the entry block,
2026   // so now we copy the value out and into %rax/%eax.
2027   //
2028   // Checking Function.hasStructRetAttr() here is insufficient because the IR
2029   // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
2030   // false, then an sret argument may be implicitly inserted in the SelDAG. In
2031   // either case FuncInfo->setSRetReturnReg() will have been called.
2032   if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
2033     SDValue Val = DAG.getCopyFromReg(Chain, dl, SRetReg, getPointerTy());
2034
2035     unsigned RetValReg
2036         = (Subtarget->is64Bit() && !Subtarget->isTarget64BitILP32()) ?
2037           X86::RAX : X86::EAX;
2038     Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
2039     Flag = Chain.getValue(1);
2040
2041     // RAX/EAX now acts like a return value.
2042     RetOps.push_back(DAG.getRegister(RetValReg, getPointerTy()));
2043   }
2044
2045   RetOps[0] = Chain;  // Update chain.
2046
2047   // Add the flag if we have it.
2048   if (Flag.getNode())
2049     RetOps.push_back(Flag);
2050
2051   return DAG.getNode(X86ISD::RET_FLAG, dl, MVT::Other, RetOps);
2052 }
2053
2054 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
2055   if (N->getNumValues() != 1)
2056     return false;
2057   if (!N->hasNUsesOfValue(1, 0))
2058     return false;
2059
2060   SDValue TCChain = Chain;
2061   SDNode *Copy = *N->use_begin();
2062   if (Copy->getOpcode() == ISD::CopyToReg) {
2063     // If the copy has a glue operand, we conservatively assume it isn't safe to
2064     // perform a tail call.
2065     if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
2066       return false;
2067     TCChain = Copy->getOperand(0);
2068   } else if (Copy->getOpcode() != ISD::FP_EXTEND)
2069     return false;
2070
2071   bool HasRet = false;
2072   for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
2073        UI != UE; ++UI) {
2074     if (UI->getOpcode() != X86ISD::RET_FLAG)
2075       return false;
2076     // If we are returning more than one value, we can definitely
2077     // not make a tail call see PR19530
2078     if (UI->getNumOperands() > 4)
2079       return false;
2080     if (UI->getNumOperands() == 4 &&
2081         UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
2082       return false;
2083     HasRet = true;
2084   }
2085
2086   if (!HasRet)
2087     return false;
2088
2089   Chain = TCChain;
2090   return true;
2091 }
2092
2093 EVT
2094 X86TargetLowering::getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT,
2095                                             ISD::NodeType ExtendKind) const {
2096   MVT ReturnMVT;
2097   // TODO: Is this also valid on 32-bit?
2098   if (Subtarget->is64Bit() && VT == MVT::i1 && ExtendKind == ISD::ZERO_EXTEND)
2099     ReturnMVT = MVT::i8;
2100   else
2101     ReturnMVT = MVT::i32;
2102
2103   EVT MinVT = getRegisterType(Context, ReturnMVT);
2104   return VT.bitsLT(MinVT) ? MinVT : VT;
2105 }
2106
2107 /// Lower the result values of a call into the
2108 /// appropriate copies out of appropriate physical registers.
2109 ///
2110 SDValue
2111 X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
2112                                    CallingConv::ID CallConv, bool isVarArg,
2113                                    const SmallVectorImpl<ISD::InputArg> &Ins,
2114                                    SDLoc dl, SelectionDAG &DAG,
2115                                    SmallVectorImpl<SDValue> &InVals) const {
2116
2117   // Assign locations to each value returned by this call.
2118   SmallVector<CCValAssign, 16> RVLocs;
2119   bool Is64Bit = Subtarget->is64Bit();
2120   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2121                  *DAG.getContext());
2122   CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
2123
2124   // Copy all of the result registers out of their specified physreg.
2125   for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
2126     CCValAssign &VA = RVLocs[i];
2127     EVT CopyVT = VA.getLocVT();
2128
2129     // If this is x86-64, and we disabled SSE, we can't return FP values
2130     if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
2131         ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
2132       report_fatal_error("SSE register return with SSE disabled");
2133     }
2134
2135     // If we prefer to use the value in xmm registers, copy it out as f80 and
2136     // use a truncate to move it from fp stack reg to xmm reg.
2137     bool RoundAfterCopy = false;
2138     if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
2139         isScalarFPTypeInSSEReg(VA.getValVT())) {
2140       CopyVT = MVT::f80;
2141       RoundAfterCopy = (CopyVT != VA.getLocVT());
2142     }
2143
2144     Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
2145                                CopyVT, InFlag).getValue(1);
2146     SDValue Val = Chain.getValue(0);
2147
2148     if (RoundAfterCopy)
2149       Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
2150                         // This truncation won't change the value.
2151                         DAG.getIntPtrConstant(1, dl));
2152
2153     if (VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1)
2154       Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
2155
2156     InFlag = Chain.getValue(2);
2157     InVals.push_back(Val);
2158   }
2159
2160   return Chain;
2161 }
2162
2163 //===----------------------------------------------------------------------===//
2164 //                C & StdCall & Fast Calling Convention implementation
2165 //===----------------------------------------------------------------------===//
//  The StdCall calling convention is the standard for many Windows API
//  routines. It differs from the C calling convention only slightly: the
//  callee, not the caller, cleans up the stack, and symbols are decorated
//  in a special way. It doesn't support any vector arguments.
//  For info on the fast calling convention, see the Fast Calling Convention
//  (tail call) implementation, LowerX86_32FastCCCallTo.
2172
2173 /// CallIsStructReturn - Determines whether a call uses struct return
2174 /// semantics.
2175 enum StructReturnType {
2176   NotStructReturn,
2177   RegStructReturn,
2178   StackStructReturn
2179 };
2180 static StructReturnType
2181 callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
2182   if (Outs.empty())
2183     return NotStructReturn;
2184
2185   const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
2186   if (!Flags.isSRet())
2187     return NotStructReturn;
2188   if (Flags.isInReg())
2189     return RegStructReturn;
2190   return StackStructReturn;
2191 }
2192
2193 /// Determines whether a function uses struct return semantics.
2194 static StructReturnType
2195 argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
2196   if (Ins.empty())
2197     return NotStructReturn;
2198
2199   const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
2200   if (!Flags.isSRet())
2201     return NotStructReturn;
2202   if (Flags.isInReg())
2203     return RegStructReturn;
2204   return StackStructReturn;
2205 }
2206
2207 /// Make a copy of an aggregate at address specified by "Src" to address
2208 /// "Dst" with size and alignment information specified by the specific
2209 /// parameter attribute. The copy will be passed as a byval function parameter.
2210 static SDValue
2211 CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
2212                           ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
2213                           SDLoc dl) {
2214   SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
2215
2216   return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
2217                        /*isVolatile*/false, /*AlwaysInline=*/true,
2218                        /*isTailCall*/false,
2219                        MachinePointerInfo(), MachinePointerInfo());
2220 }
2221
2222 /// Return true if the calling convention is one that
2223 /// supports tail call optimization.
2224 static bool IsTailCallConvention(CallingConv::ID CC) {
2225   return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
2226           CC == CallingConv::HiPE);
2227 }
2228
2229 /// \brief Return true if the calling convention is a C calling convention.
2230 static bool IsCCallConvention(CallingConv::ID CC) {
2231   return (CC == CallingConv::C || CC == CallingConv::X86_64_Win64 ||
2232           CC == CallingConv::X86_64_SysV);
2233 }
2234
2235 bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
2236   if (!CI->isTailCall() || getTargetMachine().Options.DisableTailCalls)
2237     return false;
2238
2239   CallSite CS(CI);
2240   CallingConv::ID CalleeCC = CS.getCallingConv();
2241   if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
2242     return false;
2243
2244   return true;
2245 }
2246
2247 /// Return true if the function is being made into
2248 /// a tailcall target by changing its ABI.
2249 static bool FuncIsMadeTailCallSafe(CallingConv::ID CC,
2250                                    bool GuaranteedTailCallOpt) {
2251   return GuaranteedTailCallOpt && IsTailCallConvention(CC);
2252 }
2253
2254 SDValue
2255 X86TargetLowering::LowerMemArgument(SDValue Chain,
2256                                     CallingConv::ID CallConv,
2257                                     const SmallVectorImpl<ISD::InputArg> &Ins,
2258                                     SDLoc dl, SelectionDAG &DAG,
2259                                     const CCValAssign &VA,
2260                                     MachineFrameInfo *MFI,
2261                                     unsigned i) const {
2262   // Create the nodes corresponding to a load from this parameter slot.
2263   ISD::ArgFlagsTy Flags = Ins[i].Flags;
2264   bool AlwaysUseMutable = FuncIsMadeTailCallSafe(
2265       CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
2266   bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
2267   EVT ValVT;
2268
2269   // If value is passed by pointer we have address passed instead of the value
2270   // itself.
2271   bool ExtendedInMem = VA.isExtInLoc() &&
2272     VA.getValVT().getScalarType() == MVT::i1;
2273
2274   if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
2275     ValVT = VA.getLocVT();
2276   else
2277     ValVT = VA.getValVT();
2278
2279   // FIXME: For now, all byval parameter objects are marked mutable. This can be
2280   // changed with more analysis.
2281   // In case of tail call optimization mark all arguments mutable. Since they
2282   // could be overwritten by lowering of arguments in case of a tail call.
2283   if (Flags.isByVal()) {
2284     unsigned Bytes = Flags.getByValSize();
2285     if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
2286     int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
2287     return DAG.getFrameIndex(FI, getPointerTy());
2288   } else {
2289     int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
2290                                     VA.getLocMemOffset(), isImmutable);
2291     SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
2292     SDValue Val =  DAG.getLoad(ValVT, dl, Chain, FIN,
2293                                MachinePointerInfo::getFixedStack(FI),
2294                                false, false, false, 0);
2295     return ExtendedInMem ?
2296       DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val) : Val;
2297   }
2298 }
2299
2300 // FIXME: Get this from tablegen.
2301 static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
2302                                                 const X86Subtarget *Subtarget) {
2303   assert(Subtarget->is64Bit());
2304
2305   if (Subtarget->isCallingConvWin64(CallConv)) {
2306     static const MCPhysReg GPR64ArgRegsWin64[] = {
2307       X86::RCX, X86::RDX, X86::R8,  X86::R9
2308     };
2309     return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
2310   }
2311
2312   static const MCPhysReg GPR64ArgRegs64Bit[] = {
2313     X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
2314   };
2315   return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
2316 }
2317
2318 // FIXME: Get this from tablegen.
2319 static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
2320                                                 CallingConv::ID CallConv,
2321                                                 const X86Subtarget *Subtarget) {
2322   assert(Subtarget->is64Bit());
2323   if (Subtarget->isCallingConvWin64(CallConv)) {
2324     // The XMM registers which might contain var arg parameters are shadowed
2325     // in their paired GPR.  So we only need to save the GPR to their home
2326     // slots.
2327     // TODO: __vectorcall will change this.
2328     return None;
2329   }
2330
2331   const Function *Fn = MF.getFunction();
2332   bool NoImplicitFloatOps = Fn->hasFnAttribute(Attribute::NoImplicitFloat);
2333   bool isSoftFloat = Subtarget->useSoftFloat();
2334   assert(!(isSoftFloat && NoImplicitFloatOps) &&
2335          "SSE register cannot be used when SSE is disabled!");
2336   if (isSoftFloat || NoImplicitFloatOps || !Subtarget->hasSSE1())
2337     // Kernel mode asks for SSE to be disabled, so there are no XMM argument
2338     // registers.
2339     return None;
2340
2341   static const MCPhysReg XMMArgRegs64Bit[] = {
2342     X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2343     X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2344   };
2345   return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
2346 }
2347
2348 SDValue
2349 X86TargetLowering::LowerFormalArguments(SDValue Chain,
2350                                         CallingConv::ID CallConv,
2351                                         bool isVarArg,
2352                                       const SmallVectorImpl<ISD::InputArg> &Ins,
2353                                         SDLoc dl,
2354                                         SelectionDAG &DAG,
2355                                         SmallVectorImpl<SDValue> &InVals)
2356                                           const {
2357   MachineFunction &MF = DAG.getMachineFunction();
2358   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2359   const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
2360
2361   const Function* Fn = MF.getFunction();
2362   if (Fn->hasExternalLinkage() &&
2363       Subtarget->isTargetCygMing() &&
2364       Fn->getName() == "main")
2365     FuncInfo->setForceFramePointer(true);
2366
2367   MachineFrameInfo *MFI = MF.getFrameInfo();
2368   bool Is64Bit = Subtarget->is64Bit();
2369   bool IsWin64 = Subtarget->isCallingConvWin64(CallConv);
2370
2371   assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
2372          "Var args not supported with calling convention fastcc, ghc or hipe");
2373
2374   // Assign locations to all of the incoming arguments.
2375   SmallVector<CCValAssign, 16> ArgLocs;
2376   CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
2377
2378   // Allocate shadow area for Win64
2379   if (IsWin64)
2380     CCInfo.AllocateStack(32, 8);
2381
2382   CCInfo.AnalyzeFormalArguments(Ins, CC_X86);
2383
2384   unsigned LastVal = ~0U;
2385   SDValue ArgValue;
2386   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2387     CCValAssign &VA = ArgLocs[i];
2388     // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
2389     // places.
2390     assert(VA.getValNo() != LastVal &&
2391            "Don't support value assigned to multiple locs yet");
2392     (void)LastVal;
2393     LastVal = VA.getValNo();
2394
2395     if (VA.isRegLoc()) {
2396       EVT RegVT = VA.getLocVT();
2397       const TargetRegisterClass *RC;
2398       if (RegVT == MVT::i32)
2399         RC = &X86::GR32RegClass;
2400       else if (Is64Bit && RegVT == MVT::i64)
2401         RC = &X86::GR64RegClass;
2402       else if (RegVT == MVT::f32)
2403         RC = &X86::FR32RegClass;
2404       else if (RegVT == MVT::f64)
2405         RC = &X86::FR64RegClass;
2406       else if (RegVT.is512BitVector())
2407         RC = &X86::VR512RegClass;
2408       else if (RegVT.is256BitVector())
2409         RC = &X86::VR256RegClass;
2410       else if (RegVT.is128BitVector())
2411         RC = &X86::VR128RegClass;
2412       else if (RegVT == MVT::x86mmx)
2413         RC = &X86::VR64RegClass;
2414       else if (RegVT == MVT::i1)
2415         RC = &X86::VK1RegClass;
2416       else if (RegVT == MVT::v8i1)
2417         RC = &X86::VK8RegClass;
2418       else if (RegVT == MVT::v16i1)
2419         RC = &X86::VK16RegClass;
2420       else if (RegVT == MVT::v32i1)
2421         RC = &X86::VK32RegClass;
2422       else if (RegVT == MVT::v64i1)
2423         RC = &X86::VK64RegClass;
2424       else
2425         llvm_unreachable("Unknown argument type!");
2426
2427       unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
2428       ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
2429
2430       // If this is an 8 or 16-bit value, it is really passed promoted to 32
2431       // bits.  Insert an assert[sz]ext to capture this, then truncate to the
2432       // right size.
2433       if (VA.getLocInfo() == CCValAssign::SExt)
2434         ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
2435                                DAG.getValueType(VA.getValVT()));
2436       else if (VA.getLocInfo() == CCValAssign::ZExt)
2437         ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
2438                                DAG.getValueType(VA.getValVT()));
2439       else if (VA.getLocInfo() == CCValAssign::BCvt)
2440         ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
2441
2442       if (VA.isExtInLoc()) {
2443         // Handle MMX values passed in XMM regs.
2444         if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
2445           ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
2446         else
2447           ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
2448       }
2449     } else {
2450       assert(VA.isMemLoc());
2451       ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i);
2452     }
2453
2454     // If value is passed via pointer - do a load.
2455     if (VA.getLocInfo() == CCValAssign::Indirect)
2456       ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue,
2457                              MachinePointerInfo(), false, false, false, 0);
2458
2459     InVals.push_back(ArgValue);
2460   }
2461
2462   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2463     // All x86 ABIs require that for returning structs by value we copy the
2464     // sret argument into %rax/%eax (depending on ABI) for the return. Save
2465     // the argument into a virtual register so that we can access it from the
2466     // return points.
2467     if (Ins[i].Flags.isSRet()) {
2468       unsigned Reg = FuncInfo->getSRetReturnReg();
2469       if (!Reg) {
2470         MVT PtrTy = getPointerTy();
2471         Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
2472         FuncInfo->setSRetReturnReg(Reg);
2473       }
2474       SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[i]);
2475       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
2476       break;
2477     }
2478   }
2479
2480   unsigned StackSize = CCInfo.getNextStackOffset();
2481   // Align stack specially for tail calls.
2482   if (FuncIsMadeTailCallSafe(CallConv,
2483                              MF.getTarget().Options.GuaranteedTailCallOpt))
2484     StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
2485
2486   // If the function takes variable number of arguments, make a frame index for
2487   // the start of the first vararg value... for expansion of llvm.va_start. We
2488   // can skip this if there are no va_start calls.
2489   if (MFI->hasVAStart() &&
2490       (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
2491                    CallConv != CallingConv::X86_ThisCall))) {
2492     FuncInfo->setVarArgsFrameIndex(
2493         MFI->CreateFixedObject(1, StackSize, true));
2494   }
2495
2496   MachineModuleInfo &MMI = MF.getMMI();
2497   const Function *WinEHParent = nullptr;
2498   if (IsWin64 && MMI.hasWinEHFuncInfo(Fn))
2499     WinEHParent = MMI.getWinEHParent(Fn);
2500   bool IsWinEHOutlined = WinEHParent && WinEHParent != Fn;
2501   bool IsWinEHParent = WinEHParent && WinEHParent == Fn;
2502
2503   // Figure out if XMM registers are in use.
2504   assert(!(Subtarget->useSoftFloat() &&
2505            Fn->hasFnAttribute(Attribute::NoImplicitFloat)) &&
2506          "SSE register cannot be used when SSE is disabled!");
2507
2508   // 64-bit calling conventions support varargs and register parameters, so we
2509   // have to do extra work to spill them in the prologue.
2510   if (Is64Bit && isVarArg && MFI->hasVAStart()) {
2511     // Find the first unallocated argument registers.
2512     ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
2513     ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
2514     unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
2515     unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
2516     assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
2517            "SSE register cannot be used when SSE is disabled!");
2518
2519     // Gather all the live in physical registers.
2520     SmallVector<SDValue, 6> LiveGPRs;
2521     SmallVector<SDValue, 8> LiveXMMRegs;
2522     SDValue ALVal;
2523     for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
2524       unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
2525       LiveGPRs.push_back(
2526           DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
2527     }
2528     if (!ArgXMMs.empty()) {
2529       unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
2530       ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
2531       for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
2532         unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
2533         LiveXMMRegs.push_back(
2534             DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
2535       }
2536     }
2537
2538     if (IsWin64) {
2539       // Get to the caller-allocated home save location.  Add 8 to account
2540       // for the return address.
2541       int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
2542       FuncInfo->setRegSaveFrameIndex(
2543           MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
2544       // Fixup to set vararg frame on shadow area (4 x i64).
2545       if (NumIntRegs < 4)
2546         FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
2547     } else {
2548       // For X86-64, if there are vararg parameters that are passed via
2549       // registers, then we must store them to their spots on the stack so
2550       // they may be loaded by deferencing the result of va_next.
2551       FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
2552       FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
2553       FuncInfo->setRegSaveFrameIndex(MFI->CreateStackObject(
2554           ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
2555     }
2556
2557     // Store the integer parameter registers.
2558     SmallVector<SDValue, 8> MemOps;
2559     SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
2560                                       getPointerTy());
2561     unsigned Offset = FuncInfo->getVarArgsGPOffset();
2562     for (SDValue Val : LiveGPRs) {
2563       SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
2564                                 DAG.getIntPtrConstant(Offset, dl));
2565       SDValue Store =
2566         DAG.getStore(Val.getValue(1), dl, Val, FIN,
2567                      MachinePointerInfo::getFixedStack(
2568                        FuncInfo->getRegSaveFrameIndex(), Offset),
2569                      false, false, 0);
2570       MemOps.push_back(Store);
2571       Offset += 8;
2572     }
2573
2574     if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
2575       // Now store the XMM (fp + vector) parameter registers.
2576       SmallVector<SDValue, 12> SaveXMMOps;
2577       SaveXMMOps.push_back(Chain);
2578       SaveXMMOps.push_back(ALVal);
2579       SaveXMMOps.push_back(DAG.getIntPtrConstant(
2580                              FuncInfo->getRegSaveFrameIndex(), dl));
2581       SaveXMMOps.push_back(DAG.getIntPtrConstant(
2582                              FuncInfo->getVarArgsFPOffset(), dl));
2583       SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
2584                         LiveXMMRegs.end());
2585       MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
2586                                    MVT::Other, SaveXMMOps));
2587     }
2588
2589     if (!MemOps.empty())
2590       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
2591   } else if (IsWinEHOutlined) {
2592     // Get to the caller-allocated home save location.  Add 8 to account
2593     // for the return address.
2594     int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
2595     FuncInfo->setRegSaveFrameIndex(MFI->CreateFixedObject(
2596         /*Size=*/1, /*SPOffset=*/HomeOffset + 8, /*Immutable=*/false));
2597
2598     MMI.getWinEHFuncInfo(Fn)
2599         .CatchHandlerParentFrameObjIdx[const_cast<Function *>(Fn)] =
2600         FuncInfo->getRegSaveFrameIndex();
2601
2602     // Store the second integer parameter (rdx) into rsp+16 relative to the
2603     // stack pointer at the entry of the function.
2604     SDValue RSFIN =
2605         DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), getPointerTy());
2606     unsigned GPR = MF.addLiveIn(X86::RDX, &X86::GR64RegClass);
2607     SDValue Val = DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64);
2608     Chain = DAG.getStore(
2609         Val.getValue(1), dl, Val, RSFIN,
2610         MachinePointerInfo::getFixedStack(FuncInfo->getRegSaveFrameIndex()),
2611         /*isVolatile=*/true, /*isNonTemporal=*/false, /*Alignment=*/0);
2612   }
2613
2614   if (isVarArg && MFI->hasMustTailInVarArgFunc()) {
2615     // Find the largest legal vector type.
2616     MVT VecVT = MVT::Other;
2617     // FIXME: Only some x86_32 calling conventions support AVX512.
2618     if (Subtarget->hasAVX512() &&
2619         (Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
2620                      CallConv == CallingConv::Intel_OCL_BI)))
2621       VecVT = MVT::v16f32;
2622     else if (Subtarget->hasAVX())
2623       VecVT = MVT::v8f32;
2624     else if (Subtarget->hasSSE2())
2625       VecVT = MVT::v4f32;
2626
2627     // We forward some GPRs and some vector types.
2628     SmallVector<MVT, 2> RegParmTypes;
2629     MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
2630     RegParmTypes.push_back(IntVT);
2631     if (VecVT != MVT::Other)
2632       RegParmTypes.push_back(VecVT);
2633
2634     // Compute the set of forwarded registers. The rest are scratch.
2635     SmallVectorImpl<ForwardedRegister> &Forwards =
2636         FuncInfo->getForwardedMustTailRegParms();
2637     CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
2638
2639     // Conservatively forward AL on x86_64, since it might be used for varargs.
2640     if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
2641       unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
2642       Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
2643     }
2644
2645     // Copy all forwards from physical to virtual registers.
2646     for (ForwardedRegister &F : Forwards) {
2647       // FIXME: Can we use a less constrained schedule?
2648       SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
2649       F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT));
2650       Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal);
2651     }
2652   }
2653
2654   // Some CCs need callee pop.
2655   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
2656                        MF.getTarget().Options.GuaranteedTailCallOpt)) {
2657     FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
2658   } else {
2659     FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
2660     // If this is an sret function, the return should pop the hidden pointer.
2661     if (!Is64Bit && !IsTailCallConvention(CallConv) &&
2662         !Subtarget->getTargetTriple().isOSMSVCRT() &&
2663         argsAreStructReturn(Ins) == StackStructReturn)
2664       FuncInfo->setBytesToPopOnReturn(4);
2665   }
2666
2667   if (!Is64Bit) {
2668     // RegSaveFrameIndex is X86-64 only.
2669     FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
2670     if (CallConv == CallingConv::X86_FastCall ||
2671         CallConv == CallingConv::X86_ThisCall)
2672       // fastcc functions can't have varargs.
2673       FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
2674   }
2675
2676   FuncInfo->setArgumentStackSize(StackSize);
2677
2678   if (IsWinEHParent) {
2679     int UnwindHelpFI = MFI->CreateStackObject(8, 8, /*isSS=*/false);
2680     SDValue StackSlot = DAG.getFrameIndex(UnwindHelpFI, MVT::i64);
2681     MMI.getWinEHFuncInfo(MF.getFunction()).UnwindHelpFrameIdx = UnwindHelpFI;
2682     SDValue Neg2 = DAG.getConstant(-2, dl, MVT::i64);
2683     Chain = DAG.getStore(Chain, dl, Neg2, StackSlot,
2684                          MachinePointerInfo::getFixedStack(UnwindHelpFI),
2685                          /*isVolatile=*/true,
2686                          /*isNonTemporal=*/false, /*Alignment=*/0);
2687   }
2688
2689   return Chain;
2690 }
2691
2692 SDValue
2693 X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
2694                                     SDValue StackPtr, SDValue Arg,
2695                                     SDLoc dl, SelectionDAG &DAG,
2696                                     const CCValAssign &VA,
2697                                     ISD::ArgFlagsTy Flags) const {
2698   unsigned LocMemOffset = VA.getLocMemOffset();
2699   SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
2700   PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
2701   if (Flags.isByVal())
2702     return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
2703
2704   return DAG.getStore(Chain, dl, Arg, PtrOff,
2705                       MachinePointerInfo::getStack(LocMemOffset),
2706                       false, false, 0);
2707 }
2708
2709 /// Emit a load of return address if tail call
2710 /// optimization is performed and it is required.
2711 SDValue
2712 X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
2713                                            SDValue &OutRetAddr, SDValue Chain,
2714                                            bool IsTailCall, bool Is64Bit,
2715                                            int FPDiff, SDLoc dl) const {
2716   // Adjust the Return address stack slot.
2717   EVT VT = getPointerTy();
2718   OutRetAddr = getReturnAddressFrameIndex(DAG);
2719
2720   // Load the "old" Return address.
2721   OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(),
2722                            false, false, false, 0);
2723   return SDValue(OutRetAddr.getNode(), 1);
2724 }
2725
2726 /// Emit a store of the return address if tail call
2727 /// optimization is performed and it is required (FPDiff!=0).
2728 static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
2729                                         SDValue Chain, SDValue RetAddrFrIdx,
2730                                         EVT PtrVT, unsigned SlotSize,
2731                                         int FPDiff, SDLoc dl) {
2732   // Store the return address to the appropriate stack slot.
2733   if (!FPDiff) return Chain;
2734   // Calculate the new stack slot for the return address.
2735   int NewReturnAddrFI =
2736     MF.getFrameInfo()->CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
2737                                          false);
2738   SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
2739   Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
2740                        MachinePointerInfo::getFixedStack(NewReturnAddrFI),
2741                        false, false, 0);
2742   return Chain;
2743 }
2744
     /// Lower an outgoing call described by \p CLI into SelectionDAG nodes.
     ///
     /// The steps, in order: decide whether the call may be emitted as a tail call
     /// (or sibcall), assign argument locations with CC_X86, open the call
     /// sequence, place each argument in its register or stack slot, rewrite the
     /// callee operand for direct calls, and emit either X86ISD::TC_RETURN (tail
     /// calls) or X86ISD::CALL followed by CALLSEQ_END and LowerCallResult.
     ///
     /// \param CLI    call description; CLI.IsTailCall is updated in place when
     ///               tail-call lowering is rejected or forced (musttail).
     /// \param InVals passed through to LowerCallResult for non-tail calls.
     /// \returns the TC_RETURN node for tail calls, otherwise whatever
     ///          LowerCallResult returns.
2745 SDValue
2746 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
2747                              SmallVectorImpl<SDValue> &InVals) const {
2748   SelectionDAG &DAG                     = CLI.DAG;
2749   SDLoc &dl                             = CLI.DL;
2750   SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
2751   SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
2752   SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
2753   SDValue Chain                         = CLI.Chain;
2754   SDValue Callee                        = CLI.Callee;
2755   CallingConv::ID CallConv              = CLI.CallConv;
2756   bool &isTailCall                      = CLI.IsTailCall;
2757   bool isVarArg                         = CLI.IsVarArg;
2758
2759   MachineFunction &MF = DAG.getMachineFunction();
2760   bool Is64Bit        = Subtarget->is64Bit();
2761   bool IsWin64        = Subtarget->isCallingConvWin64(CallConv);
2762   StructReturnType SR = callIsStructReturn(Outs);
2763   bool IsSibcall      = false;
2764   X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
2765
2766   if (MF.getTarget().Options.DisableTailCalls)
2767     isTailCall = false;
2768
2769   if (Subtarget->isPICStyleGOT() &&
2770       !MF.getTarget().Options.GuaranteedTailCallOpt) {
2771     // If we are using a GOT, disable tail calls to external symbols with
2772     // default visibility. Tail calling such a symbol requires using a GOT
2773     // relocation, which forces early binding of the symbol. This breaks code
2774     // that requires lazy function symbol resolution. Using musttail or
2775     // GuaranteedTailCallOpt will override this.
2776     GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
2777     if (!G || (!G->getGlobal()->hasLocalLinkage() &&
2778                G->getGlobal()->hasDefaultVisibility()))
2779       isTailCall = false;
2780   }
2781
2782   bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall();
2783   if (IsMustTail) {
2784     // Force this to be a tail call.  The verifier rules are enough to ensure
2785     // that we can lower this successfully without moving the return address
2786     // around.
2787     isTailCall = true;
2788   } else if (isTailCall) {
2789     // Check if it's really possible to do a tail call.
2790     isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
2791                     isVarArg, SR != NotStructReturn,
2792                     MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
2793                     Outs, OutVals, Ins, DAG);
2794
2795     // Sibcalls are automatically detected tailcalls which do not require
2796     // ABI changes.
2797     if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
2798       IsSibcall = true;
2799
2800     if (isTailCall)
2801       ++NumTailCalls;
2802   }
2803
2804   assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
2805          "Var args not supported with calling convention fastcc, ghc or hipe");
2806
2807   // Analyze operands of the call, assigning locations to each operand.
2808   SmallVector<CCValAssign, 16> ArgLocs;
2809   CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
2810
2811   // Allocate shadow area for Win64
2812   if (IsWin64)
2813     CCInfo.AllocateStack(32, 8);
2814
2815   CCInfo.AnalyzeCallOperands(Outs, CC_X86);
2816
2817   // Get a count of how many bytes are to be pushed on the stack.
2818   unsigned NumBytes = CCInfo.getNextStackOffset();
2819   if (IsSibcall)
2820     // This is a sibcall. The memory operands are available in caller's
2821     // own caller's stack.
2822     NumBytes = 0;
2823   else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
2824            IsTailCallConvention(CallConv))
2825     NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
2826
2827   int FPDiff = 0;
2828   if (isTailCall && !IsSibcall && !IsMustTail) {
2829     // Lower arguments at fp - stackoffset + fpdiff.
2830     unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
2831
         // FPDiff is the number of bytes this function pops on return minus the
         // number of bytes the outgoing arguments need.
2832     FPDiff = NumBytesCallerPushed - NumBytes;
2833
2834     // Set the delta of movement of the returnaddr stackslot.
2835     // But only when it is below the delta recorded so far (the minimum is kept).
2836     if (FPDiff < X86Info->getTCReturnAddrDelta())
2837       X86Info->setTCReturnAddrDelta(FPDiff);
2838   }
2839
2840   unsigned NumBytesToPush = NumBytes;
2841   unsigned NumBytesToPop = NumBytes;
2842
2843   // If we have an inalloca argument, all stack space has already been allocated
2844   // for us and is right at the top of the stack.  We don't support multiple
2845   // arguments passed in memory when using inalloca.
2846   if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
2847     NumBytesToPush = 0;
2848     if (!ArgLocs.back().isMemLoc())
2849       report_fatal_error("cannot use inalloca attribute on a register "
2850                          "parameter");
2851     if (ArgLocs.back().getLocMemOffset() != 0)
2852       report_fatal_error("any parameter with the inalloca attribute must be "
2853                          "the only memory argument");
2854   }
2855
2856   if (!IsSibcall)
2857     Chain = DAG.getCALLSEQ_START(
2858         Chain, DAG.getIntPtrConstant(NumBytesToPush, dl, true), dl);
2859
2860   SDValue RetAddrFrIdx;
2861   // Load return address for tail calls.
2862   if (isTailCall && FPDiff)
2863     Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
2864                                     Is64Bit, FPDiff, dl);
2865
2866   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
2867   SmallVector<SDValue, 8> MemOpChains;
2868   SDValue StackPtr;
2869
2870   // Walk the register/memloc assignments, inserting copies/loads.  In the case
2871   // of tail call optimization arguments are handled later.
2872   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
2873   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2874     // Skip inalloca arguments, they have already been written.
2875     ISD::ArgFlagsTy Flags = Outs[i].Flags;
2876     if (Flags.isInAlloca())
2877       continue;
2878
2879     CCValAssign &VA = ArgLocs[i];
2880     EVT RegVT = VA.getLocVT();
2881     SDValue Arg = OutVals[i];
2882     bool isByVal = Flags.isByVal();
2883
2884     // Promote the value if needed.
2885     switch (VA.getLocInfo()) {
2886     default: llvm_unreachable("Unknown loc info!");
2887     case CCValAssign::Full: break;
2888     case CCValAssign::SExt:
2889       Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
2890       break;
2891     case CCValAssign::ZExt:
2892       Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
2893       break;
2894     case CCValAssign::AExt:
2895       if (Arg.getValueType().isVector() &&
2896           Arg.getValueType().getScalarType() == MVT::i1)
2897         Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
2898       else if (RegVT.is128BitVector()) {
2899         // Special case: passing MMX values in XMM registers.
2900         Arg = DAG.getBitcast(MVT::i64, Arg);
2901         Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
2902         Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
2903       } else
2904         Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
2905       break;
2906     case CCValAssign::BCvt:
2907       Arg = DAG.getBitcast(RegVT, Arg);
2908       break;
2909     case CCValAssign::Indirect: {
2910       // Store the argument.
2911       SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
2912       int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
2913       Chain = DAG.getStore(Chain, dl, Arg, SpillSlot,
2914                            MachinePointerInfo::getFixedStack(FI),
2915                            false, false, 0);
2916       Arg = SpillSlot;
2917       break;
2918     }
2919     }
2920
2921     if (VA.isRegLoc()) {
2922       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
2923       if (isVarArg && IsWin64) {
2924         // Win64 ABI requires argument XMM reg to be copied to the corresponding
2925         // shadow reg if callee is a varargs function.
2926         unsigned ShadowReg = 0;
2927         switch (VA.getLocReg()) {
2928         case X86::XMM0: ShadowReg = X86::RCX; break;
2929         case X86::XMM1: ShadowReg = X86::RDX; break;
2930         case X86::XMM2: ShadowReg = X86::R8; break;
2931         case X86::XMM3: ShadowReg = X86::R9; break;
2932         }
2933         if (ShadowReg)
2934           RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
2935       }
2936     } else if (!IsSibcall && (!isTailCall || isByVal)) {
           // Memory arguments are written here for ordinary calls and for byval
           // arguments of tail calls; sibcalls write nothing at this point.
2937       assert(VA.isMemLoc());
2938       if (!StackPtr.getNode())
2939         StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
2940                                       getPointerTy());
2941       MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
2942                                              dl, DAG, VA, Flags));
2943     }
2944   }
2945
2946   if (!MemOpChains.empty())
2947     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
2948
2949   if (Subtarget->isPICStyleGOT()) {
2950     // ELF / PIC requires GOT in the EBX register before function calls via PLT
2951     // GOT pointer.
2952     if (!isTailCall) {
2953       RegsToPass.push_back(std::make_pair(unsigned(X86::EBX),
2954                DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy())));
2955     } else {
2956       // If we are tail calling and generating PIC/GOT style code load the
2957       // address of the callee into ECX. The value in ecx is used as target of
2958       // the tail jump. This is done to circumvent the ebx/callee-saved problem
2959       // for tail calls on PIC/GOT architectures. Normally we would just put the
2960       // address of GOT into ebx and then call target@PLT. But for tail calls
2961       // ebx would be restored (since ebx is callee saved) before jumping to the
2962       // target@PLT.
2963
2964       // Note: The actual moving to ECX is done further down.
2965       GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
2966       if (G && !G->getGlobal()->hasLocalLinkage() &&
2967           G->getGlobal()->hasDefaultVisibility())
2968         Callee = LowerGlobalAddress(Callee, DAG);
2969       else if (isa<ExternalSymbolSDNode>(Callee))
2970         Callee = LowerExternalSymbol(Callee, DAG);
2971     }
2972   }
2973
2974   if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
2975     // From AMD64 ABI document:
2976     // For calls that may call functions that use varargs or stdargs
2977     // (prototype-less calls or calls to functions containing ellipsis (...) in
2978     // the declaration) %al is used as hidden argument to specify the number
2979     // of SSE registers used. The contents of %al do not need to match exactly
2980     // the number of registers, but must be an upper bound on the number of SSE
2981     // registers used and is in the range 0 - 8 inclusive.
2982
2983     // Count the number of XMM registers allocated.
2984     static const MCPhysReg XMMArgRegs[] = {
2985       X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2986       X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2987     };
2988     unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
2989     assert((Subtarget->hasSSE1() || !NumXMMRegs)
2990            && "SSE registers cannot be used when SSE is disabled");
2991
2992     RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
2993                                         DAG.getConstant(NumXMMRegs, dl,
2994                                                         MVT::i8)));
2995   }
2996
       // Vararg musttail calls pass on every register recorded in the function's
       // forwarded-register list.
2997   if (isVarArg && IsMustTail) {
2998     const auto &Forwards = X86Info->getForwardedMustTailRegParms();
2999     for (const auto &F : Forwards) {
3000       SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
3001       RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
3002     }
3003   }
3004
3005   // For tail calls lower the arguments to the 'real' stack slots.  Sibcalls
3006   // don't need this because the eligibility check rejects calls that require
3007   // shuffling arguments passed in memory.
3008   if (!IsSibcall && isTailCall) {
3009     // Force all the incoming stack arguments to be loaded from the stack
3010     // before any new outgoing arguments are stored to the stack, because the
3011     // outgoing stack slots may alias the incoming argument stack slots, and
3012     // the alias isn't otherwise explicit. This is slightly more conservative
3013     // than necessary, because it means that each store effectively depends
3014     // on every argument instead of just those arguments it would clobber.
3015     SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
3016
3017     SmallVector<SDValue, 8> MemOpChains2;
3018     SDValue FIN;
3019     int FI = 0;
3020     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3021       CCValAssign &VA = ArgLocs[i];
3022       if (VA.isRegLoc())
3023         continue;
3024       assert(VA.isMemLoc());
3025       SDValue Arg = OutVals[i];
3026       ISD::ArgFlagsTy Flags = Outs[i].Flags;
3027       // Skip inalloca arguments.  They don't require any work.
3028       if (Flags.isInAlloca())
3029         continue;
3030       // Create frame index.
3031       int32_t Offset = VA.getLocMemOffset()+FPDiff;
3032       uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
3033       FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
3034       FIN = DAG.getFrameIndex(FI, getPointerTy());
3035
3036       if (Flags.isByVal()) {
3037         // Copy relative to framepointer.
3038         SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
3039         if (!StackPtr.getNode())
3040           StackPtr = DAG.getCopyFromReg(Chain, dl,
3041                                         RegInfo->getStackRegister(),
3042                                         getPointerTy());
3043         Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
3044
3045         MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
3046                                                          ArgChain,
3047                                                          Flags, DAG, dl));
3048       } else {
3049         // Store relative to framepointer.
3050         MemOpChains2.push_back(
3051           DAG.getStore(ArgChain, dl, Arg, FIN,
3052                        MachinePointerInfo::getFixedStack(FI),
3053                        false, false, 0));
3054       }
3055     }
3056
3057     if (!MemOpChains2.empty())
3058       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
3059
3060     // Store the return address to the appropriate stack slot.
3061     Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
3062                                      getPointerTy(), RegInfo->getSlotSize(),
3063                                      FPDiff, dl);
3064   }
3065
3066   // Build a sequence of copy-to-reg nodes chained together with token chain
3067   // and flag operands which copy the outgoing args into registers.
3068   SDValue InFlag;
3069   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
3070     Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
3071                              RegsToPass[i].second, InFlag);
3072     InFlag = Chain.getValue(1);
3073   }
3074
3075   if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
3076     assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
3077     // In the 64-bit large code model, we have to make all calls
3078     // through a register, since the call instruction's 32-bit
3079     // pc-relative offset may not be large enough to hold the whole
3080     // address.
3081   } else if (Callee->getOpcode() == ISD::GlobalAddress) {
3082     // If the callee is a GlobalAddress node (quite common, every direct call
3083     // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
3084     // it.
3085     GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee);
3086
3087     // We should use extra load for direct calls to dllimported functions in
3088     // non-JIT mode.
3089     const GlobalValue *GV = G->getGlobal();
3090     if (!GV->hasDLLImportStorageClass()) {
3091       unsigned char OpFlags = 0;
3092       bool ExtraLoad = false;
3093       unsigned WrapperKind = ISD::DELETED_NODE;
3094
3095       // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
3096       // external symbols must go through the PLT in PIC mode.  If the symbol
3097       // has hidden or protected visibility, or if it is static or local, then
3098       // we don't need to use the PLT - we can directly call it.
3099       if (Subtarget->isTargetELF() &&
3100           DAG.getTarget().getRelocationModel() == Reloc::PIC_ &&
3101           GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
3102         OpFlags = X86II::MO_PLT;
3103       } else if (Subtarget->isPICStyleStubAny() &&
3104                  (GV->isDeclaration() || GV->isWeakForLinker()) &&
3105                  (!Subtarget->getTargetTriple().isMacOSX() ||
3106                   Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
3107         // PC-relative references to external symbols should go through $stub,
3108         // unless we're building with the leopard linker or later, which
3109         // automatically synthesizes these stubs.
3110         OpFlags = X86II::MO_DARWIN_STUB;
3111       } else if (Subtarget->isPICStyleRIPRel() && isa<Function>(GV) &&
3112                  cast<Function>(GV)->hasFnAttribute(Attribute::NonLazyBind)) {
3113         // If the function is marked as non-lazy, generate an indirect call
3114         // which loads from the GOT directly. This avoids runtime overhead
3115         // at the cost of eager binding (and one extra byte of encoding).
3116         OpFlags = X86II::MO_GOTPCREL;
3117         WrapperKind = X86ISD::WrapperRIP;
3118         ExtraLoad = true;
3119       }
3120
3121       Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(),
3122                                           G->getOffset(), OpFlags);
3123
3124       // Add a wrapper if needed.
           // (WrapperKind is only ever set to X86ISD::WrapperRIP above, which is
           // why that opcode appears literally here.)
3125       if (WrapperKind != ISD::DELETED_NODE)
3126         Callee = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Callee);
3127       // Add extra indirection if needed.
3128       if (ExtraLoad)
3129         Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee,
3130                              MachinePointerInfo::getGOT(),
3131                              false, false, false, 0);
3132     }
3133   } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
3134     unsigned char OpFlags = 0;
3135
3136     // On ELF targets, in either X86-64 or X86-32 mode, direct calls to
3137     // external symbols should go through the PLT.
3138     if (Subtarget->isTargetELF() &&
3139         DAG.getTarget().getRelocationModel() == Reloc::PIC_) {
3140       OpFlags = X86II::MO_PLT;
3141     } else if (Subtarget->isPICStyleStubAny() &&
3142                (!Subtarget->getTargetTriple().isMacOSX() ||
3143                 Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
3144       // PC-relative references to external symbols should go through $stub,
3145       // unless we're building with the leopard linker or later, which
3146       // automatically synthesizes these stubs.
3147       OpFlags = X86II::MO_DARWIN_STUB;
3148     }
3149
3150     Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
3151                                          OpFlags);
3152   } else if (Subtarget->isTarget64BitILP32() &&
3153              Callee->getValueType(0) == MVT::i32) {
3154     // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI
3155     Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
3156   }
3157
3158   // Returns a chain & a flag for retval copy to use.
3159   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3160   SmallVector<SDValue, 8> Ops;
3161
       // Non-sibling tail calls end the call sequence here, before the TC_RETURN
       // node is created below; the callee-popped byte count is given as 0.
3162   if (!IsSibcall && isTailCall) {
3163     Chain = DAG.getCALLSEQ_END(Chain,
3164                                DAG.getIntPtrConstant(NumBytesToPop, dl, true),
3165                                DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
3166     InFlag = Chain.getValue(1);
3167   }
3168
3169   Ops.push_back(Chain);
3170   Ops.push_back(Callee);
3171
       // Tail calls carry FPDiff as an additional i32 operand.
3172   if (isTailCall)
3173     Ops.push_back(DAG.getConstant(FPDiff, dl, MVT::i32));
3174
3175   // Add argument registers to the end of the list so that they are known live
3176   // into the call.
3177   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
3178     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
3179                                   RegsToPass[i].second.getValueType()));
3180
3181   // Add a register mask operand representing the call-preserved registers.
3182   const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
3183   const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
3184   assert(Mask && "Missing call preserved mask for calling convention");
3185   Ops.push_back(DAG.getRegisterMask(Mask));
3186
3187   if (InFlag.getNode())
3188     Ops.push_back(InFlag);
3189
3190   if (isTailCall) {
3191     // We used to do:
3192     //// If this is the first return lowered for this function, add the regs
3193     //// to the liveout set for the function.
3194     // This isn't right, although it's probably harmless on x86; liveouts
3195     // should be computed from returns not tail calls.  Consider a void
3196     // function making a tail call to a function returning int.
3197     MF.getFrameInfo()->setHasTailCall();
3198     return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
3199   }
3200
3201   Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
3202   InFlag = Chain.getValue(1);
3203
3204   // Create the CALLSEQ_END node.
3205   unsigned NumBytesForCalleeToPop;
3206   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
3207                        DAG.getTarget().Options.GuaranteedTailCallOpt))
3208     NumBytesForCalleeToPop = NumBytes;    // Callee pops everything
3209   else if (!Is64Bit && !IsTailCallConvention(CallConv) &&
3210            !Subtarget->getTargetTriple().isOSMSVCRT() &&
3211            SR == StackStructReturn)
3212     // If this is a call to a struct-return function, the callee
3213     // pops the hidden struct pointer, so we have to push it back.
3214     // This is common for Darwin/X86, Linux & Mingw32 targets.
3215     // For MSVC Win32 targets, the caller pops the hidden struct pointer.
3216     NumBytesForCalleeToPop = 4;
3217   else
3218     NumBytesForCalleeToPop = 0;  // Callee pops nothing.
3219
3220   // Returns a flag for retval copy to use.
3221   if (!IsSibcall) {
3222     Chain = DAG.getCALLSEQ_END(Chain,
3223                                DAG.getIntPtrConstant(NumBytesToPop, dl, true),
3224                                DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl,
3225                                                      true),
3226                                InFlag, dl);
3227     InFlag = Chain.getValue(1);
3228   }
3229
3230   // Handle result values, copying them out of physregs into vregs that we
3231   // return.
3232   return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
3233                          Ins, dl, DAG, InVals);
3234 }
3235
3236 //===----------------------------------------------------------------------===//
3237 //                Fast Calling Convention (tail call) implementation
3238 //===----------------------------------------------------------------------===//
3239
3240 //  Like std call, the callee cleans up the arguments, except that ECX is
3241 //  reserved for storing the tail called function address. Only 2 registers are
3242 //  free for argument passing (inreg). Tail call optimization is performed
3243 //  provided:
3244 //                * tailcallopt is enabled
3245 //                * caller/callee are fastcc
3246 //  On X86_64 architecture with GOT-style position independent code only local
3247 //  (within module) calls are supported at the moment.
3248 //  To keep the stack aligned according to platform abi the function
3249 //  GetAlignedArgumentStackSize ensures that argument delta is always a multiple
3250 //  of stack alignment. (Dynamic linkers need this - darwin's dyld for example)
3251 //  If a tail called callee has more arguments than the caller, the
3252 //  caller needs to make sure that there is room to move the RETADDR to. This is
3253 //  achieved by reserving an area the size of the argument delta right after the
3254 //  original RETADDR, but before the saved framepointer or the spilled registers,
3255 //  e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4)
3256 //  stack layout:
3257 //    arg1
3258 //    arg2
3259 //    RETADDR
3260 //    [ new RETADDR
3261 //      move area ]
3262 //    (possible EBP)
3263 //    ESI
3264 //    EDI
3265 //    local1 ..
3266
3267 /// GetAlignedArgumentStackSize - Make the stack size align e.g 16n + 12 aligned
3268 /// for a 16 byte align requirement.
3269 unsigned
3270 X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
3271                                                SelectionDAG& DAG) const {
3272   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
3273   const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
3274   unsigned StackAlignment = TFI.getStackAlignment();
3275   uint64_t AlignMask = StackAlignment - 1;
3276   int64_t Offset = StackSize;
3277   unsigned SlotSize = RegInfo->getSlotSize();
3278   if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
3279     // Number smaller than 12 so just add the difference.
3280     Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
3281   } else {
3282     // Mask out lower bits, add stackalignment once plus the 12 bytes.
3283     Offset = ((~AlignMask) & Offset) + StackAlignment +
3284       (StackAlignment-SlotSize);
3285   }
3286   return Offset;
3287 }
3288
3289 /// MatchingStackOffset - Return true if the given stack call argument is
3290 /// already available in the same position (relatively) of the caller's
3291 /// incoming argument stack.
3292 static
3293 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
3294                          MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
3295                          const X86InstrInfo *TII) {
3296   unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
3297   int FI = INT_MAX;
3298   if (Arg.getOpcode() == ISD::CopyFromReg) {
3299     unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
3300     if (!TargetRegisterInfo::isVirtualRegister(VR))
3301       return false;
3302     MachineInstr *Def = MRI->getVRegDef(VR);
3303     if (!Def)
3304       return false;
3305     if (!Flags.isByVal()) {
3306       if (!TII->isLoadFromStackSlot(Def, FI))
3307         return false;
3308     } else {
3309       unsigned Opcode = Def->getOpcode();
3310       if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
3311            Opcode == X86::LEA64_32r) &&
3312           Def->getOperand(1).isFI()) {
3313         FI = Def->getOperand(1).getIndex();
3314         Bytes = Flags.getByValSize();
3315       } else
3316         return false;
3317     }
3318   } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
3319     if (Flags.isByVal())
3320       // ByVal argument is passed in as a pointer but it's now being
3321       // dereferenced. e.g.
3322       // define @foo(%struct.X* %A) {
3323       //   tail call @bar(%struct.X* byval %A)
3324       // }
3325       return false;
3326     SDValue Ptr = Ld->getBasePtr();
3327     FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
3328     if (!FINode)
3329       return false;
3330     FI = FINode->getIndex();
3331   } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
3332     FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
3333     FI = FINode->getIndex();
3334     Bytes = Flags.getByValSize();
3335   } else
3336     return false;
3337
3338   assert(FI != INT_MAX);
3339   if (!MFI->isFixedObjectIndex(FI))
3340     return false;
3341   return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI);
3342 }
3343
/// IsEligibleForTailCallOptimization - Check whether the call is eligible
/// for tail call optimization. Targets which want to do tail call
/// optimization should implement this function.
///
/// When guaranteed tail call optimization is enabled, the answer depends only
/// on the calling conventions (plus the x86_fp80 and Win64 checks that come
/// first). Otherwise the function runs a series of conservative "sibcall"
/// checks and returns false as soon as one of them fails.
bool
X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
                                                     CallingConv::ID CalleeCC,
                                                     bool isVarArg,
                                                     bool isCalleeStructRet,
                                                     bool isCallerStructRet,
                                                     Type *RetTy,
                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
                                    const SmallVectorImpl<SDValue> &OutVals,
                                    const SmallVectorImpl<ISD::InputArg> &Ins,
                                                     SelectionDAG &DAG) const {
  // Only tail-call conventions and C-like conventions are considered at all.
  if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
    return false;

  // Look up the calling function; its return type and calling convention are
  // compared against the callee's below.
  const MachineFunction &MF = DAG.getMachineFunction();
  const Function *CallerF = MF.getFunction();

  // If the function return type is x86_fp80 and the callee return type is not,
  // then the FP_EXTEND of the call result is not a nop. It's not safe to
  // perform a tailcall optimization here.
  if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
    return false;

  CallingConv::ID CallerCC = CallerF->getCallingConv();
  bool CCMatch = CallerCC == CalleeCC;
  bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC);
  bool IsCallerWin64 = Subtarget->isCallingConvWin64(CallerCC);

  // Win64 functions have extra shadow space for argument homing. Don't do the
  // sibcall if the caller and callee have mismatched expectations for this
  // space.
  if (IsCalleeWin64 != IsCallerWin64)
    return false;

  // If -tailcallopt is specified, a call is tail-callable exactly when the
  // callee uses a tail call convention (e.g. fastcc) that matches the
  // caller's; none of the sibcall checks below apply in that mode.
  if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
    if (IsTailCallConvention(CalleeCC) && CCMatch)
      return true;
    return false;
  }

  // Look for obvious safe cases to perform tail call optimization that do not
  // require ABI changes. This is what gcc calls sibcall.

  // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
  // emit a special epilogue.
  const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
  if (RegInfo->needsStackRealignment(MF))
    return false;

  // Also avoid sibcall optimization if either caller or callee uses struct
  // return semantics.
  if (isCalleeStructRet || isCallerStructRet)
    return false;

  // An stdcall/thiscall caller is expected to clean up its arguments; the
  // callee isn't going to do that.
  // FIXME: this is more restrictive than needed. We could produce a tailcall
  // when the stack adjustment matches. For example, with a thiscall that takes
  // only one argument.
  if (!CCMatch && (CallerCC == CallingConv::X86_StdCall ||
                   CallerCC == CallingConv::X86_ThisCall))
    return false;

  // Do not sibcall optimize vararg calls unless all arguments are passed via
  // registers.
  if (isVarArg && !Outs.empty()) {

    // Optimizing for varargs on Win64 is unlikely to be safe without
    // additional testing.
    if (IsCalleeWin64 || IsCallerWin64)
      return false;

    SmallVector<CCValAssign, 16> ArgLocs;
    CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
                   *DAG.getContext());

    // Reject the call as soon as one argument is assigned to memory.
    CCInfo.AnalyzeCallOperands(Outs, CC_X86);
    for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
      if (!ArgLocs[i].isRegLoc())
        return false;
  }

  // If the call result is in ST0 / ST1, it needs to be popped off the x87
  // stack.  Therefore, if it's not used by the call it is not safe to optimize
  // this into a sibcall.
  bool Unused = false;
  for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
    if (!Ins[i].Used) {
      Unused = true;
      break;
    }
  }
  if (Unused) {
    // At least one result is unused: refuse if any result is assigned to
    // FP0 or FP1.
    SmallVector<CCValAssign, 16> RVLocs;
    CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(), RVLocs,
                   *DAG.getContext());
    CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
    for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
      CCValAssign &VA = RVLocs[i];
      if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
        return false;
    }
  }

  // If the calling conventions do not match, then we'd better make sure the
  // results are returned in the same way as what the caller expects.
  if (!CCMatch) {
    // Result locations under the callee's convention.
    SmallVector<CCValAssign, 16> RVLocs1;
    CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), RVLocs1,
                    *DAG.getContext());
    CCInfo1.AnalyzeCallResult(Ins, RetCC_X86);

    // Result locations under the caller's convention.
    SmallVector<CCValAssign, 16> RVLocs2;
    CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), RVLocs2,
                    *DAG.getContext());
    CCInfo2.AnalyzeCallResult(Ins, RetCC_X86);

    // Both assignments must agree on count, kind (register vs. memory),
    // location info, and the concrete register or memory offset.
    if (RVLocs1.size() != RVLocs2.size())
      return false;
    for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
      if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
        return false;
      if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
        return false;
      if (RVLocs1[i].isRegLoc()) {
        if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
          return false;
      } else {
        if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
          return false;
      }
    }
  }

  // If the callee takes no arguments then go on to check the results of the
  // call.
  if (!Outs.empty()) {
    // Check if stack adjustment is needed. For now, do not do this if any
    // argument is passed on the stack.
    SmallVector<CCValAssign, 16> ArgLocs;
    CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
                   *DAG.getContext());

    // Allocate shadow area for Win64
    if (IsCalleeWin64)
      CCInfo.AllocateStack(32, 8);

    CCInfo.AnalyzeCallOperands(Outs, CC_X86);
    if (CCInfo.getNextStackOffset()) {
      // Some stack space is used for the arguments. Give up if the caller
      // itself pops bytes on return.
      MachineFunction &MF = DAG.getMachineFunction();
      if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn())
        return false;

      // Check if the arguments are already laid out in the right way as
      // the caller's fixed stack objects.
      MachineFrameInfo *MFI = MF.getFrameInfo();
      const MachineRegisterInfo *MRI = &MF.getRegInfo();
      const X86InstrInfo *TII = Subtarget->getInstrInfo();
      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
        CCValAssign &VA = ArgLocs[i];
        SDValue Arg = OutVals[i];
        ISD::ArgFlagsTy Flags = Outs[i].Flags;
        // Indirectly passed arguments are never accepted here.
        if (VA.getLocInfo() == CCValAssign::Indirect)
          return false;
        if (!VA.isRegLoc()) {
          if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
                                   MFI, MRI, TII))
            return false;
        }
      }
    }

    // If the tailcall address may be in a register, then make sure it's
    // possible to register allocate for it. In 32-bit, the call address can
    // only target EAX, EDX, or ECX since the tail call must be scheduled after
    // callee-saved registers are restored. These happen to be the same
    // registers used to pass 'inreg' arguments so watch out for those.
    if (!Subtarget->is64Bit() &&
        ((!isa<GlobalAddressSDNode>(Callee) &&
          !isa<ExternalSymbolSDNode>(Callee)) ||
         DAG.getTarget().getRelocationModel() == Reloc::PIC_)) {
      unsigned NumInRegs = 0;
      // In PIC we need an extra register to formulate the address computation
      // for the callee.
      unsigned MaxInRegs =
        (DAG.getTarget().getRelocationModel() == Reloc::PIC_) ? 2 : 3;

      // Count the arguments assigned to EAX, EDX or ECX and give up once
      // MaxInRegs of them are taken.
      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
        CCValAssign &VA = ArgLocs[i];
        if (!VA.isRegLoc())
          continue;
        unsigned Reg = VA.getLocReg();
        switch (Reg) {
        default: break;
        case X86::EAX: case X86::EDX: case X86::ECX:
          if (++NumInRegs == MaxInRegs)
            return false;
          break;
        }
      }
    }
  }

  return true;
}
3553
/// createFastISel - Return the FastISel object produced by
/// X86::createFastISel for the given function lowering info and library info.
FastISel *
X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
                                  const TargetLibraryInfo *libInfo) const {
  return X86::createFastISel(funcInfo, libInfo);
}
3559
3560 //===----------------------------------------------------------------------===//
3561 //                           Other Lowering Hooks
3562 //===----------------------------------------------------------------------===//
3563
3564 static bool MayFoldLoad(SDValue Op) {
3565   return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
3566 }
3567
3568 static bool MayFoldIntoStore(SDValue Op) {
3569   return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
3570 }
3571
3572 static bool isTargetShuffle(unsigned Opcode) {
3573   switch(Opcode) {
3574   default: return false;
3575   case X86ISD::BLENDI:
3576   case X86ISD::PSHUFB:
3577   case X86ISD::PSHUFD:
3578   case X86ISD::PSHUFHW:
3579   case X86ISD::PSHUFLW:
3580   case X86ISD::SHUFP:
3581   case X86ISD::PALIGNR:
3582   case X86ISD::MOVLHPS:
3583   case X86ISD::MOVLHPD:
3584   case X86ISD::MOVHLPS:
3585   case X86ISD::MOVLPS:
3586   case X86ISD::MOVLPD:
3587   case X86ISD::MOVSHDUP:
3588   case X86ISD::MOVSLDUP:
3589   case X86ISD::MOVDDUP:
3590   case X86ISD::MOVSS:
3591   case X86ISD::MOVSD:
3592   case X86ISD::UNPCKL:
3593   case X86ISD::UNPCKH:
3594   case X86ISD::VPERMILPI:
3595   case X86ISD::VPERM2X128:
3596   case X86ISD::VPERMI:
3597     return true;
3598   }
3599 }
3600
3601 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
3602                                     SDValue V1, unsigned TargetMask,
3603                                     SelectionDAG &DAG) {
3604   switch(Opc) {
3605   default: llvm_unreachable("Unknown x86 shuffle node");
3606   case X86ISD::PSHUFD:
3607   case X86ISD::PSHUFHW:
3608   case X86ISD::PSHUFLW:
3609   case X86ISD::VPERMILPI:
3610   case X86ISD::VPERMI:
3611     return DAG.getNode(Opc, dl, VT, V1,
3612                        DAG.getConstant(TargetMask, dl, MVT::i8));
3613   }
3614 }
3615
3616 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
3617                                     SDValue V1, SDValue V2, SelectionDAG &DAG) {
3618   switch(Opc) {
3619   default: llvm_unreachable("Unknown x86 shuffle node");
3620   case X86ISD::MOVLHPS:
3621   case X86ISD::MOVLHPD:
3622   case X86ISD::MOVHLPS:
3623   case X86ISD::MOVLPS:
3624   case X86ISD::MOVLPD:
3625   case X86ISD::MOVSS:
3626   case X86ISD::MOVSD:
3627   case X86ISD::UNPCKL:
3628   case X86ISD::UNPCKH:
3629     return DAG.getNode(Opc, dl, VT, V1, V2);
3630   }
3631 }
3632
3633 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
3634   MachineFunction &MF = DAG.getMachineFunction();
3635   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
3636   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
3637   int ReturnAddrIndex = FuncInfo->getRAIndex();
3638
3639   if (ReturnAddrIndex == 0) {
3640     // Set up a frame object for the return address.
3641     unsigned SlotSize = RegInfo->getSlotSize();
3642     ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize,
3643                                                            -(int64_t)SlotSize,
3644                                                            false);
3645     FuncInfo->setRAIndex(ReturnAddrIndex);
3646   }
3647
3648   return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
3649 }
3650
3651 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
3652                                        bool hasSymbolicDisplacement) {
3653   // Offset should fit into 32 bit immediate field.
3654   if (!isInt<32>(Offset))
3655     return false;
3656
3657   // If we don't have a symbolic displacement - we don't have any extra
3658   // restrictions.
3659   if (!hasSymbolicDisplacement)
3660     return true;
3661
3662   // FIXME: Some tweaks might be needed for medium code model.
3663   if (M != CodeModel::Small && M != CodeModel::Kernel)
3664     return false;
3665
3666   // For small code model we assume that latest object is 16MB before end of 31
3667   // bits boundary. We may also accept pretty large negative constants knowing
3668   // that all objects are in the positive half of address space.
3669   if (M == CodeModel::Small && Offset < 16*1024*1024)
3670     return true;
3671
3672   // For kernel code model we know that all object resist in the negative half
3673   // of 32bits address space. We may not accept negative offsets, since they may
3674   // be just off and we may accept pretty large positive ones.
3675   if (M == CodeModel::Kernel && Offset >= 0)
3676     return true;
3677
3678   return false;
3679 }
3680
3681 /// isCalleePop - Determines whether the callee is required to pop its
3682 /// own arguments. Callee pop is necessary to support tail calls.
3683 bool X86::isCalleePop(CallingConv::ID CallingConv,
3684                       bool is64Bit, bool IsVarArg, bool TailCallOpt) {
3685   switch (CallingConv) {
3686   default:
3687     return false;
3688   case CallingConv::X86_StdCall:
3689   case CallingConv::X86_FastCall:
3690   case CallingConv::X86_ThisCall:
3691     return !is64Bit;
3692   case CallingConv::Fast:
3693   case CallingConv::GHC:
3694   case CallingConv::HiPE:
3695     if (IsVarArg)
3696       return false;
3697     return TailCallOpt;
3698   }
3699 }
3700
3701 /// \brief Return true if the condition is an unsigned comparison operation.
3702 static bool isX86CCUnsigned(unsigned X86CC) {
3703   switch (X86CC) {
3704   default: llvm_unreachable("Invalid integer condition!");
3705   case X86::COND_E:     return true;
3706   case X86::COND_G:     return false;
3707   case X86::COND_GE:    return false;
3708   case X86::COND_L:     return false;
3709   case X86::COND_LE:    return false;
3710   case X86::COND_NE:    return true;
3711   case X86::COND_B:     return true;
3712   case X86::COND_A:     return true;
3713   case X86::COND_BE:    return true;
3714   case X86::COND_AE:    return true;
3715   }
3716   llvm_unreachable("covered switch fell through?!");
3717 }
3718
/// TranslateX86CC - do a one to one translation of a ISD::CondCode to the X86
/// specific condition code, returning the condition code and the LHS/RHS of the
/// comparison to make.
///
/// LHS and RHS are passed by reference and may be modified: for integer
/// comparisons RHS may be replaced by a zero constant, and for floating point
/// comparisons the two operands may be swapped. For floating point SETOEQ and
/// SETUNE the function returns X86::COND_INVALID.
static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, SDLoc DL, bool isFP,
                               SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
  if (!isFP) {
    // Integer comparisons against a few special constants can be rewritten
    // as comparisons against zero.
    if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
      if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
        // X > -1   -> compare X with 0, jump on !sign.
        RHS = DAG.getConstant(0, DL, RHS.getValueType());
        return X86::COND_NS;
      }
      if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
        // X < 0   -> compare X with 0 (RHS is already 0), jump on sign.
        return X86::COND_S;
      }
      if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
        // X < 1   -> X <= 0
        RHS = DAG.getConstant(0, DL, RHS.getValueType());
        return X86::COND_LE;
      }
    }

    // Plain one to one mapping of the remaining integer conditions.
    switch (SetCCOpcode) {
    default: llvm_unreachable("Invalid integer condition!");
    case ISD::SETEQ:  return X86::COND_E;
    case ISD::SETGT:  return X86::COND_G;
    case ISD::SETGE:  return X86::COND_GE;
    case ISD::SETLT:  return X86::COND_L;
    case ISD::SETLE:  return X86::COND_LE;
    case ISD::SETNE:  return X86::COND_NE;
    case ISD::SETULT: return X86::COND_B;
    case ISD::SETUGT: return X86::COND_A;
    case ISD::SETULE: return X86::COND_BE;
    case ISD::SETUGE: return X86::COND_AE;
    }
  }

  // First determine if it is required or is profitable to flip the operands.

  // If LHS is a foldable load, but RHS is not, flip the condition.
  if (ISD::isNON_EXTLoad(LHS.getNode()) &&
      !ISD::isNON_EXTLoad(RHS.getNode())) {
    SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
    std::swap(LHS, RHS);
  }

  // These four conditions are handled by swapping the operands; they are the
  // cases marked "flipped" in the mapping below.
  switch (SetCCOpcode) {
  default: break;
  case ISD::SETOLT:
  case ISD::SETOLE:
  case ISD::SETUGT:
  case ISD::SETUGE:
    std::swap(LHS, RHS);
    break;
  }

  // On a floating point condition, the flags are set as follows:
  // ZF  PF  CF   op
  //  0 | 0 | 0 | X > Y
  //  0 | 0 | 1 | X < Y
  //  1 | 0 | 0 | X == Y
  //  1 | 1 | 1 | unordered
  switch (SetCCOpcode) {
  default: llvm_unreachable("Condcode should be pre-legalized away");
  case ISD::SETUEQ:
  case ISD::SETEQ:   return X86::COND_E;
  case ISD::SETOLT:              // flipped
  case ISD::SETOGT:
  case ISD::SETGT:   return X86::COND_A;
  case ISD::SETOLE:              // flipped
  case ISD::SETOGE:
  case ISD::SETGE:   return X86::COND_AE;
  case ISD::SETUGT:              // flipped
  case ISD::SETULT:
  case ISD::SETLT:   return X86::COND_B;
  case ISD::SETUGE:              // flipped
  case ISD::SETULE:
  case ISD::SETLE:   return X86::COND_BE;
  case ISD::SETONE:
  case ISD::SETNE:   return X86::COND_NE;
  case ISD::SETUO:   return X86::COND_P;
  case ISD::SETO:    return X86::COND_NP;
  // No single condition code is returned for these two; presumably the caller
  // has to handle COND_INVALID itself (verify against callers).
  case ISD::SETOEQ:
  case ISD::SETUNE:  return X86::COND_INVALID;
  }
}
3806
3807 /// hasFPCMov - is there a floating point cmov for the specific X86 condition
3808 /// code. Current x86 isa includes the following FP cmov instructions:
3809 /// fcmovb, fcomvbe, fcomve, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
3810 static bool hasFPCMov(unsigned X86CC) {
3811   switch (X86CC) {
3812   default:
3813     return false;
3814   case X86::COND_B:
3815   case X86::COND_BE:
3816   case X86::COND_E:
3817   case X86::COND_P:
3818   case X86::COND_A:
3819   case X86::COND_AE:
3820   case X86::COND_NE:
3821   case X86::COND_NP:
3822     return true;
3823   }
3824 }
3825
3826 /// isFPImmLegal - Returns true if the target can instruction select the
3827 /// specified FP immediate natively. If false, the legalizer will
3828 /// materialize the FP immediate as a load from a constant pool.
3829 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
3830   for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
3831     if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
3832       return true;
3833   }
3834   return false;
3835 }
3836
3837 bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
3838                                               ISD::LoadExtType ExtTy,
3839                                               EVT NewVT) const {
3840   // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
3841   // relocation target a movq or addq instruction: don't let the load shrink.
3842   SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
3843   if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
3844     if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
3845       return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
3846   return true;
3847 }
3848
3849 /// \brief Returns true if it is beneficial to convert a load of a constant
3850 /// to just the constant itself.
3851 bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
3852                                                           Type *Ty) const {
3853   assert(Ty->isIntegerTy());
3854
3855   unsigned BitSize = Ty->getPrimitiveSizeInBits();
3856   if (BitSize == 0 || BitSize > 64)
3857     return false;
3858   return true;
3859 }
3860
3861 bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT,
3862                                                 unsigned Index) const {
3863   if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
3864     return false;
3865
3866   return (Index == 0 || Index == ResVT.getVectorNumElements());
3867 }
3868
/// isCheapToSpeculateCttz - Return true if the subtarget has BMI.
bool X86TargetLowering::isCheapToSpeculateCttz() const {
  // Speculate cttz only if we can directly use TZCNT.
  return Subtarget->hasBMI();
}
3873
/// isCheapToSpeculateCtlz - Return true if the subtarget has LZCNT.
bool X86TargetLowering::isCheapToSpeculateCtlz() const {
  // Speculate ctlz only if we can directly use LZCNT.
  return Subtarget->hasLZCNT();
}
3878
/// isUndefOrInRange - Return true if Val is undef (negative) or if its value
/// falls within the half-open range [Low, Hi).
static bool isUndefOrInRange(int Val, int Low, int Hi) {
  if (Val < 0)
    return true;
  return Low <= Val && Val < Hi;
}
3884
/// isUndefOrEqual - Return true if Val is undef (less than zero) or equal to
/// CmpVal.
static bool isUndefOrEqual(int Val, int CmpVal) {
  if (Val < 0)
    return true;
  return Val == CmpVal;
}
3890
3891 /// isSequentialOrUndefInRange - Return true if every element in Mask, beginning
3892 /// from position Pos and ending in Pos+Size, falls within the specified
3893 /// sequential range (Low, Low+Size]. or is undef.
3894 static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
3895                                        unsigned Pos, unsigned Size, int Low) {
3896   for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
3897     if (!isUndefOrEqual(Mask[i], Low))
3898       return false;
3899   return true;
3900 }
3901
3902 /// isVEXTRACTIndex - Return true if the specified
3903 /// EXTRACT_SUBVECTOR operand specifies a vector extract that is
3904 /// suitable for instruction that extract 128 or 256 bit vectors
3905 static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
3906   assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
3907   if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
3908     return false;
3909
3910   // The index should be aligned on a vecWidth-bit boundary.
3911   uint64_t Index =
3912     cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
3913
3914   MVT VT = N->getSimpleValueType(0);
3915   unsigned ElSize = VT.getVectorElementType().getSizeInBits();
3916   bool Result = (Index * ElSize) % vecWidth == 0;
3917
3918   return Result;
3919 }
3920
3921 /// isVINSERTIndex - Return true if the specified INSERT_SUBVECTOR
3922 /// operand specifies a subvector insert that is suitable for input to
3923 /// insertion of 128 or 256-bit subvectors
3924 static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) {
3925   assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
3926   if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
3927     return false;
3928   // The index should be aligned on a vecWidth-bit boundary.
3929   uint64_t Index =
3930     cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
3931
3932   MVT VT = N->getSimpleValueType(0);
3933   unsigned ElSize = VT.getVectorElementType().getSizeInBits();
3934   bool Result = (Index * ElSize) % vecWidth == 0;
3935
3936   return Result;
3937 }
3938
/// isVINSERT128Index - Return true if the INSERT_SUBVECTOR node N has a
/// constant index aligned on a 128-bit boundary (see isVINSERTIndex).
bool X86::isVINSERT128Index(SDNode *N) {
  return isVINSERTIndex(N, 128);
}
3942
/// isVINSERT256Index - Return true if the INSERT_SUBVECTOR node N has a
/// constant index aligned on a 256-bit boundary (see isVINSERTIndex).
bool X86::isVINSERT256Index(SDNode *N) {
  return isVINSERTIndex(N, 256);
}
3946
/// isVEXTRACT128Index - Return true if the EXTRACT_SUBVECTOR node N has a
/// constant index aligned on a 128-bit boundary (see isVEXTRACTIndex).
bool X86::isVEXTRACT128Index(SDNode *N) {
  return isVEXTRACTIndex(N, 128);
}
3950
/// isVEXTRACT256Index - Return true if the EXTRACT_SUBVECTOR node N has a
/// constant index aligned on a 256-bit boundary (see isVEXTRACTIndex).
bool X86::isVEXTRACT256Index(SDNode *N) {
  return isVEXTRACTIndex(N, 256);
}
3954
3955 static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
3956   assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
3957   if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
3958     llvm_unreachable("Illegal extract subvector for VEXTRACT");
3959
3960   uint64_t Index =
3961     cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
3962
3963   MVT VecVT = N->getOperand(0).getSimpleValueType();
3964   MVT ElVT = VecVT.getVectorElementType();
3965
3966   unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
3967   return Index / NumElemsPerChunk;
3968 }
3969
3970 static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) {
3971   assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
3972   if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
3973     llvm_unreachable("Illegal insert subvector for VINSERT");
3974
3975   uint64_t Index =
3976     cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
3977
3978   MVT VecVT = N->getSimpleValueType(0);
3979   MVT ElVT = VecVT.getVectorElementType();
3980
3981   unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
3982   return Index / NumElemsPerChunk;
3983 }
3984
/// getExtractVEXTRACT128Immediate - Return the appropriate immediate
/// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128
/// and VEXTRACTI128 instructions.
unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) {
  return getExtractVEXTRACTImmediate(N, 128);
}
3991
/// getExtractVEXTRACT256Immediate - Return the appropriate immediate
/// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF64x4
/// and VEXTRACTI64x4 instructions.
unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) {
  return getExtractVEXTRACTImmediate(N, 256);
}
3998
/// getInsertVINSERT128Immediate - Return the appropriate immediate
/// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128
/// and VINSERTI128 instructions. The immediate is the constant index divided
/// by the number of elements per 128-bit chunk.
unsigned X86::getInsertVINSERT128Immediate(SDNode *N) {
  return getInsertVINSERTImmediate(N, 128);
}
4005
/// getInsertVINSERT256Immediate - Return the appropriate immediate
/// to insert at the specified INSERT_SUBVECTOR index with VINSERTF64x4
/// and VINSERTI64x4 instructions.
unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
  return getInsertVINSERTImmediate(N, 256);
}
4012
4013 /// isZero - Returns true if Elt is a constant integer zero
4014 static bool isZero(SDValue V) {
4015   ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
4016   return C && C->isNullValue();
4017 }
4018
4019 /// isZeroNode - Returns true if Elt is a constant zero or a floating point
4020 /// constant +0.0.
4021 bool X86::isZeroNode(SDValue Elt) {
4022   if (isZero(Elt))
4023     return true;
4024   if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Elt))
4025     return CFP->getValueAPF().isPosZero();
4026   return false;
4027 }
4028
4029 /// getZeroVector - Returns a vector of specified type with all zero elements.
4030 ///
4031 static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
4032                              SelectionDAG &DAG, SDLoc dl) {
4033   assert(VT.isVector() && "Expected a vector type");
4034
4035   // Always build SSE zero vectors as <4 x i32> bitcasted
4036   // to their dest type. This ensures they get CSE'd.
4037   SDValue Vec;
4038   if (VT.is128BitVector()) {  // SSE
4039     if (Subtarget->hasSSE2()) {  // SSE2
4040       SDValue Cst = DAG.getConstant(0, dl, MVT::i32);
4041       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
4042     } else { // SSE1
4043       SDValue Cst = DAG.getConstantFP(+0.0, dl, MVT::f32);
4044       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
4045     }
4046   } else if (VT.is256BitVector()) { // AVX
4047     if (Subtarget->hasInt256()) { // AVX2
4048       SDValue Cst = DAG.getConstant(0, dl, MVT::i32);
4049       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
4050       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
4051     } else {
4052       // 256-bit logic and arithmetic instructions in AVX are all
4053       // floating-point, no support for integer ops. Emit fp zeroed vectors.
4054       SDValue Cst = DAG.getConstantFP(+0.0, dl, MVT::f32);
4055       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
4056       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops);
4057     }
4058   } else if (VT.is512BitVector()) { // AVX-512
4059       SDValue Cst = DAG.getConstant(0, dl, MVT::i32);
4060       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
4061                         Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
4062       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops);
4063   } else if (VT.getScalarType() == MVT::i1) {
4064
4065     assert((Subtarget->hasBWI() || VT.getVectorNumElements() <= 16)
4066             && "Unexpected vector type");
4067     assert((Subtarget->hasVLX() || VT.getVectorNumElements() >= 8)
4068             && "Unexpected vector type");
4069     SDValue Cst = DAG.getConstant(0, dl, MVT::i1);
4070     SmallVector<SDValue, 64> Ops(VT.getVectorNumElements(), Cst);
4071     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
4072   } else
4073     llvm_unreachable("Unexpected vector type");
4074
4075   return DAG.getBitcast(VT, Vec);
4076 }
4077
4078 static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal,
4079                                 SelectionDAG &DAG, SDLoc dl,
4080                                 unsigned vectorWidth) {
4081   assert((vectorWidth == 128 || vectorWidth == 256) &&
4082          "Unsupported vector width");
4083   EVT VT = Vec.getValueType();
4084   EVT ElVT = VT.getVectorElementType();
4085   unsigned Factor = VT.getSizeInBits()/vectorWidth;
4086   EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
4087                                   VT.getVectorNumElements()/Factor);
4088
4089   // Extract from UNDEF is UNDEF.
4090   if (Vec.getOpcode() == ISD::UNDEF)
4091     return DAG.getUNDEF(ResultVT);
4092
4093   // Extract the relevant vectorWidth bits.  Generate an EXTRACT_SUBVECTOR
4094   unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
4095
4096   // This is the index of the first element of the vectorWidth-bit chunk
4097   // we want.
4098   unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / vectorWidth)
4099                                * ElemsPerChunk);
4100
4101   // If the input is a buildvector just emit a smaller one.
4102   if (Vec.getOpcode() == ISD::BUILD_VECTOR)
4103     return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
4104                        makeArrayRef(Vec->op_begin() + NormalizedIdxVal,
4105                                     ElemsPerChunk));
4106
4107   SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal, dl);
4108   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
4109 }
4110
4111 /// Generate a DAG to grab 128-bits from a vector > 128 bits.  This
4112 /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
4113 /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
4114 /// instructions or a simple subregister reference. Idx is an index in the
4115 /// 128 bits we want.  It need not be aligned to a 128-bit boundary.  That makes
4116 /// lowering EXTRACT_VECTOR_ELT operations easier.
4117 static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
4118                                    SelectionDAG &DAG, SDLoc dl) {
4119   assert((Vec.getValueType().is256BitVector() ||
4120           Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
4121   return ExtractSubVector(Vec, IdxVal, DAG, dl, 128);
4122 }
4123
4124 /// Generate a DAG to grab 256-bits from a 512-bit vector.
4125 static SDValue Extract256BitVector(SDValue Vec, unsigned IdxVal,
4126                                    SelectionDAG &DAG, SDLoc dl) {
4127   assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
4128   return ExtractSubVector(Vec, IdxVal, DAG, dl, 256);
4129 }
4130
4131 static SDValue InsertSubVector(SDValue Result, SDValue Vec,
4132                                unsigned IdxVal, SelectionDAG &DAG,
4133                                SDLoc dl, unsigned vectorWidth) {
4134   assert((vectorWidth == 128 || vectorWidth == 256) &&
4135          "Unsupported vector width");
4136   // Inserting UNDEF is Result
4137   if (Vec.getOpcode() == ISD::UNDEF)
4138     return Result;
4139   EVT VT = Vec.getValueType();
4140   EVT ElVT = VT.getVectorElementType();
4141   EVT ResultVT = Result.getValueType();
4142
4143   // Insert the relevant vectorWidth bits.
4144   unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
4145
4146   // This is the index of the first element of the vectorWidth-bit chunk
4147   // we want.
4148   unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/vectorWidth)
4149                                * ElemsPerChunk);
4150
4151   SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal, dl);
4152   return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
4153 }
4154
4155 /// Generate a DAG to put 128-bits into a vector > 128 bits.  This
4156 /// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
4157 /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
4158 /// simple superregister reference.  Idx is an index in the 128 bits
4159 /// we want.  It need not be aligned to a 128-bit boundary.  That makes
4160 /// lowering INSERT_VECTOR_ELT operations easier.
4161 static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4162                                   SelectionDAG &DAG, SDLoc dl) {
4163   assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
4164
4165   // For insertion into the zero index (low half) of a 256-bit vector, it is
4166   // more efficient to generate a blend with immediate instead of an insert*128.
4167   // We are still creating an INSERT_SUBVECTOR below with an undef node to
4168   // extend the subvector to the size of the result vector. Make sure that
4169   // we are not recursing on that node by checking for undef here.
4170   if (IdxVal == 0 && Result.getValueType().is256BitVector() &&
4171       Result.getOpcode() != ISD::UNDEF) {
4172     EVT ResultVT = Result.getValueType();
4173     SDValue ZeroIndex = DAG.getIntPtrConstant(0, dl);
4174     SDValue Undef = DAG.getUNDEF(ResultVT);
4175     SDValue Vec256 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Undef,
4176                                  Vec, ZeroIndex);
4177
4178     // The blend instruction, and therefore its mask, depend on the data type.
4179     MVT ScalarType = ResultVT.getScalarType().getSimpleVT();
4180     if (ScalarType.isFloatingPoint()) {
4181       // Choose either vblendps (float) or vblendpd (double).
4182       unsigned ScalarSize = ScalarType.getSizeInBits();
4183       assert((ScalarSize == 64 || ScalarSize == 32) && "Unknown float type");
4184       unsigned MaskVal = (ScalarSize == 64) ? 0x03 : 0x0f;
4185       SDValue Mask = DAG.getConstant(MaskVal, dl, MVT::i8);
4186       return DAG.getNode(X86ISD::BLENDI, dl, ResultVT, Result, Vec256, Mask);
4187     }
4188
4189     const X86Subtarget &Subtarget =
4190     static_cast<const X86Subtarget &>(DAG.getSubtarget());
4191
4192     // AVX2 is needed for 256-bit integer blend support.
4193     // Integers must be cast to 32-bit because there is only vpblendd;
4194     // vpblendw can't be used for this because it has a handicapped mask.
4195
4196     // If we don't have AVX2, then cast to float. Using a wrong domain blend
4197     // is still more efficient than using the wrong domain vinsertf128 that
4198     // will be created by InsertSubVector().
4199     MVT CastVT = Subtarget.hasAVX2() ? MVT::v8i32 : MVT::v8f32;
4200
4201     SDValue Mask = DAG.getConstant(0x0f, dl, MVT::i8);
4202     Vec256 = DAG.getBitcast(CastVT, Vec256);
4203     Vec256 = DAG.getNode(X86ISD::BLENDI, dl, CastVT, Result, Vec256, Mask);
4204     return DAG.getBitcast(ResultVT, Vec256);
4205   }
4206
4207   return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
4208 }
4209
4210 static SDValue Insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4211                                   SelectionDAG &DAG, SDLoc dl) {
4212   assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
4213   return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
4214 }
4215
4216 /// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128
4217 /// instructions. This is used because creating CONCAT_VECTOR nodes of
4218 /// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
4219 /// large BUILD_VECTORS.
4220 static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
4221                                    unsigned NumElems, SelectionDAG &DAG,
4222                                    SDLoc dl) {
4223   SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
4224   return Insert128BitVector(V, V2, NumElems/2, DAG, dl);
4225 }
4226
4227 static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
4228                                    unsigned NumElems, SelectionDAG &DAG,
4229                                    SDLoc dl) {
4230   SDValue V = Insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
4231   return Insert256BitVector(V, V2, NumElems/2, DAG, dl);
4232 }
4233
4234 /// getOnesVector - Returns a vector of specified type with all bits set.
4235 /// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
4236 /// no AVX2 supprt, use two <4 x i32> inserted in a <8 x i32> appropriately.
4237 /// Then bitcast to their original type, ensuring they get CSE'd.
4238 static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG,
4239                              SDLoc dl) {
4240   assert(VT.isVector() && "Expected a vector type");
4241
4242   SDValue Cst = DAG.getConstant(~0U, dl, MVT::i32);
4243   SDValue Vec;
4244   if (VT.is256BitVector()) {
4245     if (HasInt256) { // AVX2
4246       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
4247       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
4248     } else { // AVX
4249       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
4250       Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
4251     }
4252   } else if (VT.is128BitVector()) {
4253     Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
4254   } else
4255     llvm_unreachable("Unexpected vector type");
4256
4257   return DAG.getBitcast(VT, Vec);
4258 }
4259
4260 /// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd
4261 /// operation of specified width.
4262 static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
4263                        SDValue V2) {
4264   unsigned NumElems = VT.getVectorNumElements();
4265   SmallVector<int, 8> Mask;
4266   Mask.push_back(NumElems);
4267   for (unsigned i = 1; i != NumElems; ++i)
4268     Mask.push_back(i);
4269   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
4270 }
4271
4272 /// getUnpackl - Returns a vector_shuffle node for an unpackl operation.
4273 static SDValue getUnpackl(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
4274                           SDValue V2) {
4275   unsigned NumElems = VT.getVectorNumElements();
4276   SmallVector<int, 8> Mask;
4277   for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
4278     Mask.push_back(i);
4279     Mask.push_back(i + NumElems);
4280   }
4281   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
4282 }
4283
4284 /// getUnpackh - Returns a vector_shuffle node for an unpackh operation.
4285 static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
4286                           SDValue V2) {
4287   unsigned NumElems = VT.getVectorNumElements();
4288   SmallVector<int, 8> Mask;
4289   for (unsigned i = 0, Half = NumElems/2; i != Half; ++i) {
4290     Mask.push_back(i + Half);
4291     Mask.push_back(i + NumElems + Half);
4292   }
4293   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
4294 }
4295
4296 /// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
4297 /// vector of zero or undef vector.  This produces a shuffle where the low
4298 /// element of V2 is swizzled into the zero/undef vector, landing at element
4299 /// Idx.  This produces a shuffle mask like 4,1,2,3 (idx=0) or  0,1,2,4 (idx=3).
4300 static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
4301                                            bool IsZero,
4302                                            const X86Subtarget *Subtarget,
4303                                            SelectionDAG &DAG) {
4304   MVT VT = V2.getSimpleValueType();
4305   SDValue V1 = IsZero
4306     ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
4307   unsigned NumElems = VT.getVectorNumElements();
4308   SmallVector<int, 16> MaskVec;
4309   for (unsigned i = 0; i != NumElems; ++i)
4310     // If this is the insertion idx, put the low elt of V2 here.
4311     MaskVec.push_back(i == Idx ? NumElems : i);
4312   return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, &MaskVec[0]);
4313 }
4314
4315 /// getTargetShuffleMask - Calculates the shuffle mask corresponding to the
4316 /// target specific opcode. Returns true if the Mask could be calculated. Sets
4317 /// IsUnary to true if only uses one source. Note that this will set IsUnary for
4318 /// shuffles which use a single input multiple times, and in those cases it will
4319 /// adjust the mask to only have indices within that single input.
4320 static bool getTargetShuffleMask(SDNode *N, MVT VT,
4321                                  SmallVectorImpl<int> &Mask, bool &IsUnary) {
4322   unsigned NumElems = VT.getVectorNumElements();
4323   SDValue ImmN;
4324
4325   IsUnary = false;
4326   bool IsFakeUnary = false;
4327   switch(N->getOpcode()) {
4328   case X86ISD::BLENDI:
4329     ImmN = N->getOperand(N->getNumOperands()-1);
4330     DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4331     break;
4332   case X86ISD::SHUFP:
4333     ImmN = N->getOperand(N->getNumOperands()-1);
4334     DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4335     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
4336     break;
4337   case X86ISD::UNPCKH:
4338     DecodeUNPCKHMask(VT, Mask);
4339     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
4340     break;
4341   case X86ISD::UNPCKL:
4342     DecodeUNPCKLMask(VT, Mask);
4343     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
4344     break;
4345   case X86ISD::MOVHLPS:
4346     DecodeMOVHLPSMask(NumElems, Mask);
4347     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
4348     break;
4349   case X86ISD::MOVLHPS:
4350     DecodeMOVLHPSMask(NumElems, Mask);
4351     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
4352     break;
4353   case X86ISD::PALIGNR:
4354     ImmN = N->getOperand(N->getNumOperands()-1);
4355     DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4356     break;
4357   case X86ISD::PSHUFD:
4358   case X86ISD::VPERMILPI:
4359     ImmN = N->getOperand(N->getNumOperands()-1);
4360     DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4361     IsUnary = true;
4362     break;
4363   case X86ISD::PSHUFHW:
4364     ImmN = N->getOperand(N->getNumOperands()-1);
4365     DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4366     IsUnary = true;
4367     break;
4368   case X86ISD::PSHUFLW:
4369     ImmN = N->getOperand(N->getNumOperands()-1);
4370     DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4371     IsUnary = true;
4372     break;
4373   case X86ISD::PSHUFB: {
4374     IsUnary = true;
4375     SDValue MaskNode = N->getOperand(1);
4376     while (MaskNode->getOpcode() == ISD::BITCAST)
4377       MaskNode = MaskNode->getOperand(0);
4378
4379     if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) {
4380       // If we have a build-vector, then things are easy.
4381       EVT VT = MaskNode.getValueType();
4382       assert(VT.isVector() &&
4383              "Can't produce a non-vector with a build_vector!");
4384       if (!VT.isInteger())
4385         return false;
4386
4387       int NumBytesPerElement = VT.getVectorElementType().getSizeInBits() / 8;
4388
4389       SmallVector<uint64_t, 32> RawMask;
4390       for (int i = 0, e = MaskNode->getNumOperands(); i < e; ++i) {
4391         SDValue Op = MaskNode->getOperand(i);
4392         if (Op->getOpcode() == ISD::UNDEF) {
4393           RawMask.push_back((uint64_t)SM_SentinelUndef);
4394           continue;
4395         }
4396         auto *CN = dyn_cast<ConstantSDNode>(Op.getNode());
4397         if (!CN)
4398           return false;
4399         APInt MaskElement = CN->getAPIntValue();
4400
4401         // We now have to decode the element which could be any integer size and
4402         // extract each byte of it.
4403         for (int j = 0; j < NumBytesPerElement; ++j) {
4404           // Note that this is x86 and so always little endian: the low byte is
4405           // the first byte of the mask.
4406           RawMask.push_back(MaskElement.getLoBits(8).getZExtValue());
4407           MaskElement = MaskElement.lshr(8);
4408         }
4409       }
4410       DecodePSHUFBMask(RawMask, Mask);
4411       break;
4412     }
4413
4414     auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode);
4415     if (!MaskLoad)
4416       return false;
4417
4418     SDValue Ptr = MaskLoad->getBasePtr();
4419     if (Ptr->getOpcode() == X86ISD::Wrapper ||
4420         Ptr->getOpcode() == X86ISD::WrapperRIP)
4421       Ptr = Ptr->getOperand(0);
4422
4423     auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr);
4424     if (!MaskCP || MaskCP->isMachineConstantPoolEntry())
4425       return false;
4426
4427     if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) {
4428       DecodePSHUFBMask(C, Mask);
4429       if (Mask.empty())
4430         return false;
4431       break;
4432     }
4433
4434     return false;
4435   }
4436   case X86ISD::VPERMI:
4437     ImmN = N->getOperand(N->getNumOperands()-1);
4438     DecodeVPERMMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4439     IsUnary = true;
4440     break;
4441   case X86ISD::MOVSS:
4442   case X86ISD::MOVSD:
4443     DecodeScalarMoveMask(VT, /* IsLoad */ false, Mask);
4444     break;
4445   case X86ISD::VPERM2X128:
4446     ImmN = N->getOperand(N->getNumOperands()-1);
4447     DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4448     if (Mask.empty()) return false;
4449     break;
4450   case X86ISD::MOVSLDUP:
4451     DecodeMOVSLDUPMask(VT, Mask);
4452     IsUnary = true;
4453     break;
4454   case X86ISD::MOVSHDUP:
4455     DecodeMOVSHDUPMask(VT, Mask);
4456     IsUnary = true;
4457     break;
4458   case X86ISD::MOVDDUP:
4459     DecodeMOVDDUPMask(VT, Mask);
4460     IsUnary = true;
4461     break;
4462   case X86ISD::MOVLHPD:
4463   case X86ISD::MOVLPD:
4464   case X86ISD::MOVLPS:
4465     // Not yet implemented
4466     return false;
4467   default: llvm_unreachable("unknown target shuffle node");
4468   }
4469
4470   // If we have a fake unary shuffle, the shuffle mask is spread across two
4471   // inputs that are actually the same node. Re-map the mask to always point
4472   // into the first input.
4473   if (IsFakeUnary)
4474     for (int &M : Mask)
4475       if (M >= (int)Mask.size())
4476         M -= Mask.size();
4477
4478   return true;
4479 }
4480
4481 /// getShuffleScalarElt - Returns the scalar element that will make up the ith
4482 /// element of the result of the vector shuffle.
4483 static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
4484                                    unsigned Depth) {
4485   if (Depth == 6)
4486     return SDValue();  // Limit search depth.
4487
4488   SDValue V = SDValue(N, 0);
4489   EVT VT = V.getValueType();
4490   unsigned Opcode = V.getOpcode();
4491
4492   // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
4493   if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
4494     int Elt = SV->getMaskElt(Index);
4495
4496     if (Elt < 0)
4497       return DAG.getUNDEF(VT.getVectorElementType());
4498
4499     unsigned NumElems = VT.getVectorNumElements();
4500     SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
4501                                          : SV->getOperand(1);
4502     return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
4503   }
4504
4505   // Recurse into target specific vector shuffles to find scalars.
4506   if (isTargetShuffle(Opcode)) {
4507     MVT ShufVT = V.getSimpleValueType();
4508     unsigned NumElems = ShufVT.getVectorNumElements();
4509     SmallVector<int, 16> ShuffleMask;
4510     bool IsUnary;
4511
4512     if (!getTargetShuffleMask(N, ShufVT, ShuffleMask, IsUnary))
4513       return SDValue();
4514
4515     int Elt = ShuffleMask[Index];
4516     if (Elt < 0)
4517       return DAG.getUNDEF(ShufVT.getVectorElementType());
4518
4519     SDValue NewV = (Elt < (int)NumElems) ? N->getOperand(0)
4520                                          : N->getOperand(1);
4521     return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
4522                                Depth+1);
4523   }
4524
4525   // Actual nodes that may contain scalar elements
4526   if (Opcode == ISD::BITCAST) {
4527     V = V.getOperand(0);
4528     EVT SrcVT = V.getValueType();
4529     unsigned NumElems = VT.getVectorNumElements();
4530
4531     if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
4532       return SDValue();
4533   }
4534
4535   if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
4536     return (Index == 0) ? V.getOperand(0)
4537                         : DAG.getUNDEF(VT.getVectorElementType());
4538
4539   if (V.getOpcode() == ISD::BUILD_VECTOR)
4540     return V.getOperand(Index);
4541
4542   return SDValue();
4543 }
4544
4545 /// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
4546 ///
4547 static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
4548                                        unsigned NumNonZero, unsigned NumZero,
4549                                        SelectionDAG &DAG,
4550                                        const X86Subtarget* Subtarget,
4551                                        const TargetLowering &TLI) {
4552   if (NumNonZero > 8)
4553     return SDValue();
4554
4555   SDLoc dl(Op);
4556   SDValue V;
4557   bool First = true;
4558
4559   // SSE4.1 - use PINSRB to insert each byte directly.
4560   if (Subtarget->hasSSE41()) {
4561     for (unsigned i = 0; i < 16; ++i) {
4562       bool isNonZero = (NonZeros & (1 << i)) != 0;
4563       if (isNonZero) {
4564         if (First) {
4565           if (NumZero)
4566             V = getZeroVector(MVT::v16i8, Subtarget, DAG, dl);
4567           else
4568             V = DAG.getUNDEF(MVT::v16i8);
4569           First = false;
4570         }
4571         V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
4572                         MVT::v16i8, V, Op.getOperand(i),
4573                         DAG.getIntPtrConstant(i, dl));
4574       }
4575     }
4576
4577     return V;
4578   }
4579
4580   // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
4581   for (unsigned i = 0; i < 16; ++i) {
4582     bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
4583     if (ThisIsNonZero && First) {
4584       if (NumZero)
4585         V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
4586       else
4587         V = DAG.getUNDEF(MVT::v8i16);
4588       First = false;
4589     }
4590
4591     if ((i & 1) != 0) {
4592       SDValue ThisElt, LastElt;
4593       bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
4594       if (LastIsNonZero) {
4595         LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
4596                               MVT::i16, Op.getOperand(i-1));
4597       }
4598       if (ThisIsNonZero) {
4599         ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
4600         ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
4601                               ThisElt, DAG.getConstant(8, dl, MVT::i8));
4602         if (LastIsNonZero)
4603           ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
4604       } else
4605         ThisElt = LastElt;
4606
4607       if (ThisElt.getNode())
4608         V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
4609                         DAG.getIntPtrConstant(i/2, dl));
4610     }
4611   }
4612
4613   return DAG.getBitcast(MVT::v16i8, V);
4614 }
4615
4616 /// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16.
4617 ///
4618 static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
4619                                      unsigned NumNonZero, unsigned NumZero,
4620                                      SelectionDAG &DAG,
4621                                      const X86Subtarget* Subtarget,
4622                                      const TargetLowering &TLI) {
4623   if (NumNonZero > 4)
4624     return SDValue();
4625
4626   SDLoc dl(Op);
4627   SDValue V;
4628   bool First = true;
4629   for (unsigned i = 0; i < 8; ++i) {
4630     bool isNonZero = (NonZeros & (1 << i)) != 0;
4631     if (isNonZero) {
4632       if (First) {
4633         if (NumZero)
4634           V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
4635         else
4636           V = DAG.getUNDEF(MVT::v8i16);
4637         First = false;
4638       }
4639       V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
4640                       MVT::v8i16, V, Op.getOperand(i),
4641                       DAG.getIntPtrConstant(i, dl));
4642     }
4643   }
4644
4645   return V;
4646 }
4647
4648 /// LowerBuildVectorv4x32 - Custom lower build_vector of v4i32 or v4f32.
4649 static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
4650                                      const X86Subtarget *Subtarget,
4651                                      const TargetLowering &TLI) {
4652   // Find all zeroable elements.
4653   std::bitset<4> Zeroable;
4654   for (int i=0; i < 4; ++i) {
4655     SDValue Elt = Op->getOperand(i);
4656     Zeroable[i] = (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt));
4657   }
4658   assert(Zeroable.size() - Zeroable.count() > 1 &&
4659          "We expect at least two non-zero elements!");
4660
4661   // We only know how to deal with build_vector nodes where elements are either
4662   // zeroable or extract_vector_elt with constant index.
4663   SDValue FirstNonZero;
4664   unsigned FirstNonZeroIdx;
4665   for (unsigned i=0; i < 4; ++i) {
4666     if (Zeroable[i])
4667       continue;
4668     SDValue Elt = Op->getOperand(i);
4669     if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
4670         !isa<ConstantSDNode>(Elt.getOperand(1)))
4671       return SDValue();
4672     // Make sure that this node is extracting from a 128-bit vector.
4673     MVT VT = Elt.getOperand(0).getSimpleValueType();
4674     if (!VT.is128BitVector())
4675       return SDValue();
4676     if (!FirstNonZero.getNode()) {
4677       FirstNonZero = Elt;
4678       FirstNonZeroIdx = i;
4679     }
4680   }
4681
4682   assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
4683   SDValue V1 = FirstNonZero.getOperand(0);
4684   MVT VT = V1.getSimpleValueType();
4685
4686   // See if this build_vector can be lowered as a blend with zero.
4687   SDValue Elt;
4688   unsigned EltMaskIdx, EltIdx;
4689   int Mask[4];
4690   for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
4691     if (Zeroable[EltIdx]) {
4692       // The zero vector will be on the right hand side.
4693       Mask[EltIdx] = EltIdx+4;
4694       continue;
4695     }
4696
4697     Elt = Op->getOperand(EltIdx);
4698     // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
4699     EltMaskIdx = cast<ConstantSDNode>(Elt.getOperand(1))->getZExtValue();
4700     if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
4701       break;
4702     Mask[EltIdx] = EltIdx;
4703   }
4704
4705   if (EltIdx == 4) {
4706     // Let the shuffle legalizer deal with blend operations.
4707     SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
4708     if (V1.getSimpleValueType() != VT)
4709       V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), VT, V1);
4710     return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, &Mask[0]);
4711   }
4712
4713   // See if we can lower this build_vector to a INSERTPS.
4714   if (!Subtarget->hasSSE41())
4715     return SDValue();
4716
4717   SDValue V2 = Elt.getOperand(0);
4718   if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
4719     V1 = SDValue();
4720
4721   bool CanFold = true;
4722   for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
4723     if (Zeroable[i])
4724       continue;
4725
4726     SDValue Current = Op->getOperand(i);
4727     SDValue SrcVector = Current->getOperand(0);
4728     if (!V1.getNode())
4729       V1 = SrcVector;
4730     CanFold = SrcVector == V1 &&
4731       cast<ConstantSDNode>(Current.getOperand(1))->getZExtValue() == i;
4732   }
4733
4734   if (!CanFold)
4735     return SDValue();
4736
4737   assert(V1.getNode() && "Expected at least two non-zero elements!");
4738   if (V1.getSimpleValueType() != MVT::v4f32)
4739     V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), MVT::v4f32, V1);
4740   if (V2.getSimpleValueType() != MVT::v4f32)
4741     V2 = DAG.getNode(ISD::BITCAST, SDLoc(V2), MVT::v4f32, V2);
4742
4743   // Ok, we can emit an INSERTPS instruction.
4744   unsigned ZMask = Zeroable.to_ulong();
4745
4746   unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
4747   assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
4748   SDLoc DL(Op);
4749   SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
4750                                DAG.getIntPtrConstant(InsertPSMask, DL));
4751   return DAG.getBitcast(VT, Result);
4752 }
4753
4754 /// Return a vector logical shift node.
4755 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
4756                          unsigned NumBits, SelectionDAG &DAG,
4757                          const TargetLowering &TLI, SDLoc dl) {
4758   assert(VT.is128BitVector() && "Unknown type for VShift");
4759   MVT ShVT = MVT::v2i64;
4760   unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
4761   SrcOp = DAG.getBitcast(ShVT, SrcOp);
4762   MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(SrcOp.getValueType());
4763   assert(NumBits % 8 == 0 && "Only support byte sized shifts");
4764   SDValue ShiftVal = DAG.getConstant(NumBits/8, dl, ScalarShiftTy);
4765   return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
4766 }
4767
4768 static SDValue
4769 LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) {
4770
4771   // Check if the scalar load can be widened into a vector load. And if
4772   // the address is "base + cst" see if the cst can be "absorbed" into
4773   // the shuffle mask.
4774   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
4775     SDValue Ptr = LD->getBasePtr();
4776     if (!ISD::isNormalLoad(LD) || LD->isVolatile())
4777       return SDValue();
4778     EVT PVT = LD->getValueType(0);
4779     if (PVT != MVT::i32 && PVT != MVT::f32)
4780       return SDValue();
4781
4782     int FI = -1;
4783     int64_t Offset = 0;
4784     if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
4785       FI = FINode->getIndex();
4786       Offset = 0;
4787     } else if (DAG.isBaseWithConstantOffset(Ptr) &&
4788                isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
4789       FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
4790       Offset = Ptr.getConstantOperandVal(1);
4791       Ptr = Ptr.getOperand(0);
4792     } else {
4793       return SDValue();
4794     }
4795
4796     // FIXME: 256-bit vector instructions don't require a strict alignment,
4797     // improve this code to support it better.
4798     unsigned RequiredAlign = VT.getSizeInBits()/8;
4799     SDValue Chain = LD->getChain();
4800     // Make sure the stack object alignment is at least 16 or 32.
4801     MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
4802     if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
4803       if (MFI->isFixedObjectIndex(FI)) {
4804         // Can't change the alignment. FIXME: It's possible to compute
4805         // the exact stack offset and reference FI + adjust offset instead.
4806         // If someone *really* cares about this. That's the way to implement it.
4807         return SDValue();
4808       } else {
4809         MFI->setObjectAlignment(FI, RequiredAlign);
4810       }
4811     }
4812
4813     // (Offset % 16 or 32) must be multiple of 4. Then address is then
4814     // Ptr + (Offset & ~15).
4815     if (Offset < 0)
4816       return SDValue();
4817     if ((Offset % RequiredAlign) & 3)
4818       return SDValue();
4819     int64_t StartOffset = Offset & ~(RequiredAlign-1);
4820     if (StartOffset) {
4821       SDLoc DL(Ptr);
4822       Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
4823                         DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
4824     }
4825
4826     int EltNo = (Offset - StartOffset) >> 2;
4827     unsigned NumElems = VT.getVectorNumElements();
4828
4829     EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
4830     SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
4831                              LD->getPointerInfo().getWithOffset(StartOffset),
4832                              false, false, false, 0);
4833
4834     SmallVector<int, 8> Mask(NumElems, EltNo);
4835
4836     return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]);
4837   }
4838
4839   return SDValue();
4840 }
4841
4842 /// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
4843 /// elements can be replaced by a single large load which has the same value as
4844 /// a build_vector or insert_subvector whose loaded operands are 'Elts'.
4845 ///
4846 /// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a
4847 ///
4848 /// FIXME: we'd also like to handle the case where the last elements are zero
4849 /// rather than undef via VZEXT_LOAD, but we do not detect that case today.
4850 /// There's even a handy isZeroNode for that purpose.
4851 static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
4852                                         SDLoc &DL, SelectionDAG &DAG,
4853                                         bool isAfterLegalize) {
4854   unsigned NumElems = Elts.size();
4855
4856   LoadSDNode *LDBase = nullptr;
4857   unsigned LastLoadedElt = -1U;
4858
4859   // For each element in the initializer, see if we've found a load or an undef.
4860   // If we don't find an initial load element, or later load elements are
4861   // non-consecutive, bail out.
4862   for (unsigned i = 0; i < NumElems; ++i) {
4863     SDValue Elt = Elts[i];
4864     // Look through a bitcast.
4865     if (Elt.getNode() && Elt.getOpcode() == ISD::BITCAST)
4866       Elt = Elt.getOperand(0);
4867     if (!Elt.getNode() ||
4868         (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
4869       return SDValue();
4870     if (!LDBase) {
4871       if (Elt.getNode()->getOpcode() == ISD::UNDEF)
4872         return SDValue();
4873       LDBase = cast<LoadSDNode>(Elt.getNode());
4874       LastLoadedElt = i;
4875       continue;
4876     }
4877     if (Elt.getOpcode() == ISD::UNDEF)
4878       continue;
4879
4880     LoadSDNode *LD = cast<LoadSDNode>(Elt);
4881     EVT LdVT = Elt.getValueType();
4882     // Each loaded element must be the correct fractional portion of the
4883     // requested vector load.
4884     if (LdVT.getSizeInBits() != VT.getSizeInBits() / NumElems)
4885       return SDValue();
4886     if (!DAG.isConsecutiveLoad(LD, LDBase, LdVT.getSizeInBits() / 8, i))
4887       return SDValue();
4888     LastLoadedElt = i;
4889   }
4890
4891   // If we have found an entire vector of loads and undefs, then return a large
4892   // load of the entire vector width starting at the base pointer.  If we found
4893   // consecutive loads for the low half, generate a vzext_load node.
4894   if (LastLoadedElt == NumElems - 1) {
4895     assert(LDBase && "Did not find base load for merging consecutive loads");
4896     EVT EltVT = LDBase->getValueType(0);
4897     // Ensure that the input vector size for the merged loads matches the
4898     // cumulative size of the input elements.
4899     if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems)
4900       return SDValue();
4901
4902     if (isAfterLegalize &&
4903         !DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, VT))
4904       return SDValue();
4905
4906     SDValue NewLd = SDValue();
4907
4908     NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
4909                         LDBase->getPointerInfo(), LDBase->isVolatile(),
4910                         LDBase->isNonTemporal(), LDBase->isInvariant(),
4911                         LDBase->getAlignment());
4912
4913     if (LDBase->hasAnyUseOfValue(1)) {
4914       SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
4915                                      SDValue(LDBase, 1),
4916                                      SDValue(NewLd.getNode(), 1));
4917       DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
4918       DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
4919                              SDValue(NewLd.getNode(), 1));
4920     }
4921
4922     return NewLd;
4923   }
4924
4925   //TODO: The code below fires only for for loading the low v2i32 / v2f32
4926   //of a v4i32 / v4f32. It's probably worth generalizing.
4927   EVT EltVT = VT.getVectorElementType();
4928   if (NumElems == 4 && LastLoadedElt == 1 && (EltVT.getSizeInBits() == 32) &&
4929       DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
4930     SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
4931     SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
4932     SDValue ResNode =
4933         DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, MVT::i64,
4934                                 LDBase->getPointerInfo(),
4935                                 LDBase->getAlignment(),
4936                                 false/*isVolatile*/, true/*ReadMem*/,
4937                                 false/*WriteMem*/);
4938
4939     // Make sure the newly-created LOAD is in the same position as LDBase in
4940     // terms of dependency. We create a TokenFactor for LDBase and ResNode, and
4941     // update uses of LDBase's output chain to use the TokenFactor.
4942     if (LDBase->hasAnyUseOfValue(1)) {
4943       SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
4944                              SDValue(LDBase, 1), SDValue(ResNode.getNode(), 1));
4945       DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
4946       DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
4947                              SDValue(ResNode.getNode(), 1));
4948     }
4949
4950     return DAG.getBitcast(VT, ResNode);
4951   }
4952   return SDValue();
4953 }
4954
/// LowerVectorBroadcast - Attempt to use the vbroadcast instruction
/// to generate a splat value for the following cases:
/// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant.
/// 2. A splat shuffle which uses a scalar_to_vector node which comes from
/// a scalar load, or a constant.
/// The VBROADCAST node is returned when a pattern is found,
/// or SDValue() otherwise.
///
/// \p Op must have a 128-, 256- or 512-bit vector type. Nothing is lowered
/// unless \p Subtarget has AVX; several of the patterns below additionally
/// require AVX2, AVX-512 or VLX.
static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
                                    SelectionDAG &DAG) {
  // VBROADCAST requires AVX.
  // TODO: Splats could be generated for non-AVX CPUs using SSE
  // instructions, but there's less potential gain for only 128-bit vectors.
  if (!Subtarget->hasAVX())
    return SDValue();

  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);

  assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
         "Unsupported vector type for broadcast.");

  // Ld is the scalar being splatted and ConstSplatVal records whether it is a
  // Constant/ConstantFP node. Every switch case that reaches the code after
  // the switch sets both.
  SDValue Ld;
  bool ConstSplatVal;

  switch (Op.getOpcode()) {
    default:
      // Unknown pattern found.
      return SDValue();

    case ISD::BUILD_VECTOR: {
      auto *BVOp = cast<BuildVectorSDNode>(Op.getNode());
      BitVector UndefElements;
      SDValue Splat = BVOp->getSplatValue(&UndefElements);

      // We need a splat of a single value to use broadcast, and it doesn't
      // make any sense if the value is only in one element of the vector.
      if (!Splat || (VT.getVectorNumElements() - UndefElements.count()) <= 1)
        return SDValue();

      Ld = Splat;
      ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
                       Ld.getOpcode() == ISD::ConstantFP);

      // Make sure that all of the users of a non-constant load are from the
      // BUILD_VECTOR node.
      if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
        return SDValue();
      break;
    }

    case ISD::VECTOR_SHUFFLE: {
      ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);

      // Shuffles must have a splat mask where the first element is
      // broadcasted.
      if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0)
        return SDValue();

      SDValue Sc = Op.getOperand(0);
      if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR &&
          Sc.getOpcode() != ISD::BUILD_VECTOR) {

        // The shuffle input is an arbitrary vector: only the register form of
        // the broadcast can be used, which is gated on hasInt256() here.
        if (!Subtarget->hasInt256())
          return SDValue();

        // Use the register form of the broadcast instruction available on AVX2.
        // For 256-bit and wider results, broadcast from the low 128 bits of
        // the input.
        if (VT.getSizeInBits() >= 256)
          Sc = Extract128BitVector(Sc, 0, DAG, dl);
        return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc);
      }

      // The first operand of the scalar_to_vector / build_vector is the
      // scalar that gets splatted.
      Ld = Sc.getOperand(0);
      ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
                       Ld.getOpcode() == ISD::ConstantFP);

      // The scalar_to_vector node and the suspected
      // load node must have exactly one user.
      // Constants may have multiple users.

      // AVX-512 has register version of the broadcast
      // (used here only for 512-bit results with a scalar of at least 32
      // bits), in which case the one-use restriction is waived.
      bool hasRegVer = Subtarget->hasAVX512() && VT.is512BitVector() &&
        Ld.getValueType().getSizeInBits() >= 32;
      if (!ConstSplatVal && ((!Sc.hasOneUse() || !Ld.hasOneUse()) &&
          !hasRegVer))
        return SDValue();
      break;
    }
  }

  unsigned ScalarSize = Ld.getValueType().getSizeInBits();
  bool IsGE256 = (VT.getSizeInBits() >= 256);

  // When optimizing for size, generate up to 5 extra bytes for a broadcast
  // instruction to save 8 or more bytes of constant pool data.
  // TODO: If multiple splats are generated to load the same constant,
  // it may be detrimental to overall size. There needs to be a way to detect
  // that condition to know if this is truly a size win.
  const Function *F = DAG.getMachineFunction().getFunction();
  bool OptForSize = F->hasFnAttribute(Attribute::OptimizeForSize);

  // Handle broadcasting a single constant scalar from the constant pool
  // into a vector.
  // On Sandybridge (no AVX2), it is still better to load a constant vector
  // from the constant pool and not to broadcast it from a scalar.
  // But override that restriction when optimizing for size.
  // TODO: Check if splatting is recommended for other AVX-capable CPUs.
  if (ConstSplatVal && (Subtarget->hasAVX2() || OptForSize)) {
    EVT CVT = Ld.getValueType();
    assert(!CVT.isVector() && "Must not broadcast a vector type");

    // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
    // For size optimization, also splat v2f64 and v2i64, and for size opt
    // with AVX2, also splat i8 and i16.
    // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
    if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
        (OptForSize && (ScalarSize == 64 || Subtarget->hasAVX2()))) {
      const Constant *C = nullptr;
      if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
        C = CI->getConstantIntValue();
      else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
        C = CF->getConstantFPValue();

      assert(C && "Invalid constant type");

      // Place the scalar in the constant pool and broadcast a load of it.
      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
      SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
      unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
      Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP,
                       MachinePointerInfo::getConstantPool(),
                       false, false, false, Alignment);

      return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
    }
  }

  // From here on, constants that were not handled above are treated like any
  // other non-load scalar.
  bool IsLoad = ISD::isNormalLoad(Ld.getNode());

  // Handle AVX2 in-register broadcasts.
  if (!IsLoad && Subtarget->hasInt256() &&
      (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
    return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);

  // The scalar source must be a normal load.
  if (!IsLoad)
    return SDValue();

  // Broadcast of a loaded 32-bit scalar, or of a loaded 64-bit scalar into a
  // 256-bit or wider vector (or into any vector when VLX is available).
  if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
      (Subtarget->hasVLX() && ScalarSize == 64))
    return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);

  // The integer check is needed for the 64-bit into 128-bit so it doesn't match
  // double since there is no vbroadcastsd xmm
  if (Subtarget->hasInt256() && Ld.getValueType().isInteger()) {
    if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
      return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
  }

  // Unsupported broadcast.
  return SDValue();
}
5115
5116 /// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real
5117 /// underlying vector and index.
5118 ///
5119 /// Modifies \p ExtractedFromVec to the real vector and returns the real
5120 /// index.
5121 static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
5122                                          SDValue ExtIdx) {
5123   int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
5124   if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
5125     return Idx;
5126
5127   // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
5128   // lowered this:
5129   //   (extract_vector_elt (v8f32 %vreg1), Constant<6>)
5130   // to:
5131   //   (extract_vector_elt (vector_shuffle<2,u,u,u>
5132   //                           (extract_subvector (v8f32 %vreg0), Constant<4>),
5133   //                           undef)
5134   //                       Constant<0>)
5135   // In this case the vector is the extract_subvector expression and the index
5136   // is 2, as specified by the shuffle.
5137   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
5138   SDValue ShuffleVec = SVOp->getOperand(0);
5139   MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
5140   assert(ShuffleVecVT.getVectorElementType() ==
5141          ExtractedFromVec.getSimpleValueType().getVectorElementType());
5142
5143   int ShuffleIdx = SVOp->getMaskElt(Idx);
5144   if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
5145     ExtractedFromVec = ShuffleVec;
5146     return ShuffleIdx;
5147   }
5148   return Idx;
5149 }
5150
5151 static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
5152   MVT VT = Op.getSimpleValueType();
5153
5154   // Skip if insert_vec_elt is not supported.
5155   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
5156   if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
5157     return SDValue();
5158
5159   SDLoc DL(Op);
5160   unsigned NumElems = Op.getNumOperands();
5161
5162   SDValue VecIn1;
5163   SDValue VecIn2;
5164   SmallVector<unsigned, 4> InsertIndices;
5165   SmallVector<int, 8> Mask(NumElems, -1);
5166
5167   for (unsigned i = 0; i != NumElems; ++i) {
5168     unsigned Opc = Op.getOperand(i).getOpcode();
5169
5170     if (Opc == ISD::UNDEF)
5171       continue;
5172
5173     if (Opc != ISD::EXTRACT_VECTOR_ELT) {
5174       // Quit if more than 1 elements need inserting.
5175       if (InsertIndices.size() > 1)
5176         return SDValue();
5177
5178       InsertIndices.push_back(i);
5179       continue;
5180     }
5181
5182     SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
5183     SDValue ExtIdx = Op.getOperand(i).getOperand(1);
5184     // Quit if non-constant index.
5185     if (!isa<ConstantSDNode>(ExtIdx))
5186       return SDValue();
5187     int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
5188
5189     // Quit if extracted from vector of different type.
5190     if (ExtractedFromVec.getValueType() != VT)
5191       return SDValue();
5192
5193     if (!VecIn1.getNode())
5194       VecIn1 = ExtractedFromVec;
5195     else if (VecIn1 != ExtractedFromVec) {
5196       if (!VecIn2.getNode())
5197         VecIn2 = ExtractedFromVec;
5198       else if (VecIn2 != ExtractedFromVec)
5199         // Quit if more than 2 vectors to shuffle
5200         return SDValue();
5201     }
5202
5203     if (ExtractedFromVec == VecIn1)
5204       Mask[i] = Idx;
5205     else if (ExtractedFromVec == VecIn2)
5206       Mask[i] = Idx + NumElems;
5207   }
5208
5209   if (!VecIn1.getNode())
5210     return SDValue();
5211
5212   VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
5213   SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, &Mask[0]);
5214   for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) {
5215     unsigned Idx = InsertIndices[i];
5216     NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
5217                      DAG.getIntPtrConstant(Idx, DL));
5218   }
5219
5220   return NV;
5221 }
5222
5223 static SDValue ConvertI1VectorToInterger(SDValue Op, SelectionDAG &DAG) {
5224   assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
5225          Op.getScalarValueSizeInBits() == 1 &&
5226          "Can not convert non-constant vector");
5227   uint64_t Immediate = 0;
5228   for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
5229     SDValue In = Op.getOperand(idx);
5230     if (In.getOpcode() != ISD::UNDEF)
5231       Immediate |= cast<ConstantSDNode>(In)->getZExtValue() << idx;
5232   }
5233   SDLoc dl(Op);
5234   MVT VT =
5235    MVT::getIntegerVT(std::max((int)Op.getValueType().getSizeInBits(), 8));
5236   return DAG.getConstant(Immediate, dl, VT);
5237 }
5238 // Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
5239 SDValue
5240 X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
5241
5242   MVT VT = Op.getSimpleValueType();
5243   assert((VT.getVectorElementType() == MVT::i1) &&
5244          "Unexpected type in LowerBUILD_VECTORvXi1!");
5245
5246   SDLoc dl(Op);
5247   if (ISD::isBuildVectorAllZeros(Op.getNode())) {
5248     SDValue Cst = DAG.getTargetConstant(0, dl, MVT::i1);
5249     SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
5250     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
5251   }
5252
5253   if (ISD::isBuildVectorAllOnes(Op.getNode())) {
5254     SDValue Cst = DAG.getTargetConstant(1, dl, MVT::i1);
5255     SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
5256     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
5257   }
5258
5259   if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
5260     SDValue Imm = ConvertI1VectorToInterger(Op, DAG);
5261     if (Imm.getValueSizeInBits() == VT.getSizeInBits())
5262       return DAG.getBitcast(VT, Imm);
5263     SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
5264     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
5265                         DAG.getIntPtrConstant(0, dl));
5266   }
5267
5268   // Vector has one or more non-const elements
5269   uint64_t Immediate = 0;
5270   SmallVector<unsigned, 16> NonConstIdx;
5271   bool IsSplat = true;
5272   bool HasConstElts = false;
5273   int SplatIdx = -1;
5274   for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
5275     SDValue In = Op.getOperand(idx);
5276     if (In.getOpcode() == ISD::UNDEF)
5277       continue;
5278     if (!isa<ConstantSDNode>(In))
5279       NonConstIdx.push_back(idx);
5280     else {
5281       Immediate |= cast<ConstantSDNode>(In)->getZExtValue() << idx;
5282       HasConstElts = true;
5283     }
5284     if (SplatIdx == -1)
5285       SplatIdx = idx;
5286     else if (In != Op.getOperand(SplatIdx))
5287       IsSplat = false;
5288   }
5289
5290   // for splat use " (select i1 splat_elt, all-ones, all-zeroes)"
5291   if (IsSplat)
5292     return DAG.getNode(ISD::SELECT, dl, VT, Op.getOperand(SplatIdx),
5293                        DAG.getConstant(1, dl, VT),
5294                        DAG.getConstant(0, dl, VT));
5295
5296   // insert elements one by one
5297   SDValue DstVec;
5298   SDValue Imm;
5299   if (Immediate) {
5300     MVT ImmVT = MVT::getIntegerVT(std::max((int)VT.getSizeInBits(), 8));
5301     Imm = DAG.getConstant(Immediate, dl, ImmVT);
5302   }
5303   else if (HasConstElts)
5304     Imm = DAG.getConstant(0, dl, VT);
5305   else
5306     Imm = DAG.getUNDEF(VT);
5307   if (Imm.getValueSizeInBits() == VT.getSizeInBits())
5308     DstVec = DAG.getBitcast(VT, Imm);
5309   else {
5310     SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
5311     DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
5312                          DAG.getIntPtrConstant(0, dl));
5313   }
5314
5315   for (unsigned i = 0; i < NonConstIdx.size(); ++i) {
5316     unsigned InsertIdx = NonConstIdx[i];
5317     DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
5318                          Op.getOperand(InsertIdx),
5319                          DAG.getIntPtrConstant(InsertIdx, dl));
5320   }
5321   return DstVec;
5322 }
5323
/// \brief Return true if \p N implements a horizontal binop and return the
/// operands for the horizontal binop into V0 and V1.
///
/// This is a helper function of LowerToHorizontalOp().
/// This function checks that the build_vector \p N in input implements a
/// horizontal operation. Parameter \p Opcode defines the kind of horizontal
/// operation to match.
/// For example, if \p Opcode is equal to ISD::ADD, then this function
/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
/// is equal to ISD::SUB, then this function checks if this is a horizontal
/// arithmetic sub.
///
/// This function only analyzes elements of \p N whose indices are
/// in range [BaseIdx, LastIdx).
///
/// Elements in the first half of that range must read from \p V0 and elements
/// in the second half from \p V1; either output stays UNDEF if all elements of
/// its half are undef. \p V0 and \p V1 are overwritten even when the function
/// returns false.
static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
                              SelectionDAG &DAG,
                              unsigned BaseIdx, unsigned LastIdx,
                              SDValue &V0, SDValue &V1) {
  EVT VT = N->getValueType(0);

  assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
  assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
         "Invalid Vector in input!");

  // For ADD/FADD the two extracts may appear in either operand order.
  bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
  bool CanFold = true;
  // Lane index the next matched element is expected to extract first.
  unsigned ExpectedVExtractIdx = BaseIdx;
  unsigned NumElts = LastIdx - BaseIdx;
  V0 = DAG.getUNDEF(VT);
  V1 = DAG.getUNDEF(VT);

  // Check if N implements a horizontal binop.
  for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
    SDValue Op = N->getOperand(i + BaseIdx);

    // Skip UNDEFs.
    if (Op->getOpcode() == ISD::UNDEF) {
      // Update the expected vector extract index.
      // The expected index restarts at BaseIdx when the second half begins.
      if (i * 2 == NumElts)
        ExpectedVExtractIdx = BaseIdx;
      ExpectedVExtractIdx += 2;
      continue;
    }

    // Each element must be a single-use node with the requested opcode.
    CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();

    if (!CanFold)
      break;

    SDValue Op0 = Op.getOperand(0);
    SDValue Op1 = Op.getOperand(1);

    // Try to match the following pattern:
    // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
    CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
        Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
        Op0.getOperand(0) == Op1.getOperand(0) &&
        isa<ConstantSDNode>(Op0.getOperand(1)) &&
        isa<ConstantSDNode>(Op1.getOperand(1)));
    if (!CanFold)
      break;

    unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
    unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();

    // The first non-undef element of each half fixes that half's source
    // vector, which must have the same type as the build_vector.
    if (i * 2 < NumElts) {
      if (V0.getOpcode() == ISD::UNDEF) {
        V0 = Op0.getOperand(0);
        if (V0.getValueType() != VT)
          return false;
      }
    } else {
      if (V1.getOpcode() == ISD::UNDEF) {
        V1 = Op0.getOperand(0);
        if (V1.getValueType() != VT)
          return false;
      }
      // The expected index restarts at BaseIdx when the second half begins.
      if (i * 2 == NumElts)
        ExpectedVExtractIdx = BaseIdx;
    }

    // The extracts must read adjacent lanes (I, I+1) of the source vector
    // assigned to the current half.
    SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
    if (I0 == ExpectedVExtractIdx)
      CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
    else if (IsCommutable && I1 == ExpectedVExtractIdx) {
      // Try to match the following dag sequence:
      // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
      CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
    } else
      CanFold = false;

    ExpectedVExtractIdx += 2;
  }

  return CanFold;
}
5420
5421 /// \brief Emit a sequence of two 128-bit horizontal add/sub followed by
5422 /// a concat_vector.
5423 ///
5424 /// This is a helper function of LowerToHorizontalOp().
5425 /// This function expects two 256-bit vectors called V0 and V1.
5426 /// At first, each vector is split into two separate 128-bit vectors.
5427 /// Then, the resulting 128-bit vectors are used to implement two
5428 /// horizontal binary operations.
5429 ///
5430 /// The kind of horizontal binary operation is defined by \p X86Opcode.
5431 ///
5432 /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
5433 /// the two new horizontal binop.
5434 /// When Mode is set, the first horizontal binop dag node would take as input
5435 /// the lower 128-bit of V0 and the upper 128-bit of V0. The second
5436 /// horizontal binop dag node would take as input the lower 128-bit of V1
5437 /// and the upper 128-bit of V1.
5438 ///   Example:
5439 ///     HADD V0_LO, V0_HI
5440 ///     HADD V1_LO, V1_HI
5441 ///
5442 /// Otherwise, the first horizontal binop dag node takes as input the lower
5443 /// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
5444 /// dag node takes the the upper 128-bit of V0 and the upper 128-bit of V1.
5445 ///   Example:
5446 ///     HADD V0_LO, V1_LO
5447 ///     HADD V0_HI, V1_HI
5448 ///
5449 /// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
5450 /// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
5451 /// the upper 128-bits of the result.
5452 static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
5453                                      SDLoc DL, SelectionDAG &DAG,
5454                                      unsigned X86Opcode, bool Mode,
5455                                      bool isUndefLO, bool isUndefHI) {
5456   EVT VT = V0.getValueType();
5457   assert(VT.is256BitVector() && VT == V1.getValueType() &&
5458          "Invalid nodes in input!");
5459
5460   unsigned NumElts = VT.getVectorNumElements();
5461   SDValue V0_LO = Extract128BitVector(V0, 0, DAG, DL);
5462   SDValue V0_HI = Extract128BitVector(V0, NumElts/2, DAG, DL);
5463   SDValue V1_LO = Extract128BitVector(V1, 0, DAG, DL);
5464   SDValue V1_HI = Extract128BitVector(V1, NumElts/2, DAG, DL);
5465   EVT NewVT = V0_LO.getValueType();
5466
5467   SDValue LO = DAG.getUNDEF(NewVT);
5468   SDValue HI = DAG.getUNDEF(NewVT);
5469
5470   if (Mode) {
5471     // Don't emit a horizontal binop if the result is expected to be UNDEF.
5472     if (!isUndefLO && V0->getOpcode() != ISD::UNDEF)
5473       LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
5474     if (!isUndefHI && V1->getOpcode() != ISD::UNDEF)
5475       HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
5476   } else {
5477     // Don't emit a horizontal binop if the result is expected to be UNDEF.
5478     if (!isUndefLO && (V0_LO->getOpcode() != ISD::UNDEF ||
5479                        V1_LO->getOpcode() != ISD::UNDEF))
5480       LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
5481
5482     if (!isUndefHI && (V0_HI->getOpcode() != ISD::UNDEF ||
5483                        V1_HI->getOpcode() != ISD::UNDEF))
5484       HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
5485   }
5486
5487   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
5488 }
5489
5490 /// Try to fold a build_vector that performs an 'addsub' to an X86ISD::ADDSUB
5491 /// node.
5492 static SDValue LowerToAddSub(const BuildVectorSDNode *BV,
5493                              const X86Subtarget *Subtarget, SelectionDAG &DAG) {
5494   EVT VT = BV->getValueType(0);
5495   if ((!Subtarget->hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
5496       (!Subtarget->hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)))
5497     return SDValue();
5498
5499   SDLoc DL(BV);
5500   unsigned NumElts = VT.getVectorNumElements();
5501   SDValue InVec0 = DAG.getUNDEF(VT);
5502   SDValue InVec1 = DAG.getUNDEF(VT);
5503
5504   assert((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
5505           VT == MVT::v2f64) && "build_vector with an invalid type found!");
5506
5507   // Odd-numbered elements in the input build vector are obtained from
5508   // adding two integer/float elements.
5509   // Even-numbered elements in the input build vector are obtained from
5510   // subtracting two integer/float elements.
5511   unsigned ExpectedOpcode = ISD::FSUB;
5512   unsigned NextExpectedOpcode = ISD::FADD;
5513   bool AddFound = false;
5514   bool SubFound = false;
5515
5516   for (unsigned i = 0, e = NumElts; i != e; ++i) {
5517     SDValue Op = BV->getOperand(i);
5518
5519     // Skip 'undef' values.
5520     unsigned Opcode = Op.getOpcode();
5521     if (Opcode == ISD::UNDEF) {
5522       std::swap(ExpectedOpcode, NextExpectedOpcode);
5523       continue;
5524     }
5525
5526     // Early exit if we found an unexpected opcode.
5527     if (Opcode != ExpectedOpcode)
5528       return SDValue();
5529
5530     SDValue Op0 = Op.getOperand(0);
5531     SDValue Op1 = Op.getOperand(1);
5532
5533     // Try to match the following pattern:
5534     // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
5535     // Early exit if we cannot match that sequence.
5536     if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
5537         Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
5538         !isa<ConstantSDNode>(Op0.getOperand(1)) ||
5539         !isa<ConstantSDNode>(Op1.getOperand(1)) ||
5540         Op0.getOperand(1) != Op1.getOperand(1))
5541       return SDValue();
5542
5543     unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
5544     if (I0 != i)
5545       return SDValue();
5546
5547     // We found a valid add/sub node. Update the information accordingly.
5548     if (i & 1)
5549       AddFound = true;
5550     else
5551       SubFound = true;
5552
5553     // Update InVec0 and InVec1.
5554     if (InVec0.getOpcode() == ISD::UNDEF) {
5555       InVec0 = Op0.getOperand(0);
5556       if (InVec0.getValueType() != VT)
5557         return SDValue();
5558     }
5559     if (InVec1.getOpcode() == ISD::UNDEF) {
5560       InVec1 = Op1.getOperand(0);
5561       if (InVec1.getValueType() != VT)
5562         return SDValue();
5563     }
5564
5565     // Make sure that operands in input to each add/sub node always
5566     // come from a same pair of vectors.
5567     if (InVec0 != Op0.getOperand(0)) {
5568       if (ExpectedOpcode == ISD::FSUB)
5569         return SDValue();
5570
5571       // FADD is commutable. Try to commute the operands
5572       // and then test again.
5573       std::swap(Op0, Op1);
5574       if (InVec0 != Op0.getOperand(0))
5575         return SDValue();
5576     }
5577
5578     if (InVec1 != Op1.getOperand(0))
5579       return SDValue();
5580
5581     // Update the pair of expected opcodes.
5582     std::swap(ExpectedOpcode, NextExpectedOpcode);
5583   }
5584
5585   // Don't try to fold this build_vector into an ADDSUB if the inputs are undef.
5586   if (AddFound && SubFound && InVec0.getOpcode() != ISD::UNDEF &&
5587       InVec1.getOpcode() != ISD::UNDEF)
5588     return DAG.getNode(X86ISD::ADDSUB, DL, VT, InVec0, InVec1);
5589
5590   return SDValue();
5591 }
5592
5593 /// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
5594 static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
5595                                    const X86Subtarget *Subtarget,
5596                                    SelectionDAG &DAG) {
5597   EVT VT = BV->getValueType(0);
5598   unsigned NumElts = VT.getVectorNumElements();
5599   unsigned NumUndefsLO = 0;
5600   unsigned NumUndefsHI = 0;
5601   unsigned Half = NumElts/2;
5602
5603   // Count the number of UNDEF operands in the build_vector in input.
5604   for (unsigned i = 0, e = Half; i != e; ++i)
5605     if (BV->getOperand(i)->getOpcode() == ISD::UNDEF)
5606       NumUndefsLO++;
5607
5608   for (unsigned i = Half, e = NumElts; i != e; ++i)
5609     if (BV->getOperand(i)->getOpcode() == ISD::UNDEF)
5610       NumUndefsHI++;
5611
5612   // Early exit if this is either a build_vector of all UNDEFs or all the
5613   // operands but one are UNDEF.
5614   if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
5615     return SDValue();
5616
5617   SDLoc DL(BV);
5618   SDValue InVec0, InVec1;
5619   if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget->hasSSE3()) {
5620     // Try to match an SSE3 float HADD/HSUB.
5621     if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
5622       return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
5623
5624     if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
5625       return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
5626   } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget->hasSSSE3()) {
5627     // Try to match an SSSE3 integer HADD/HSUB.
5628     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
5629       return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);
5630
5631     if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
5632       return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
5633   }
5634
5635   if (!Subtarget->hasAVX())
5636     return SDValue();
5637
5638   if ((VT == MVT::v8f32 || VT == MVT::v4f64)) {
5639     // Try to match an AVX horizontal add/sub of packed single/double
5640     // precision floating point values from 256-bit vectors.
5641     SDValue InVec2, InVec3;
5642     if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
5643         isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
5644         ((InVec0.getOpcode() == ISD::UNDEF ||
5645           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
5646         ((InVec1.getOpcode() == ISD::UNDEF ||
5647           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
5648       return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
5649
5650     if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
5651         isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
5652         ((InVec0.getOpcode() == ISD::UNDEF ||
5653           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
5654         ((InVec1.getOpcode() == ISD::UNDEF ||
5655           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
5656       return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
5657   } else if (VT == MVT::v8i32 || VT == MVT::v16i16) {
5658     // Try to match an AVX2 horizontal add/sub of signed integers.
5659     SDValue InVec2, InVec3;
5660     unsigned X86Opcode;
5661     bool CanFold = true;
5662
5663     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
5664         isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
5665         ((InVec0.getOpcode() == ISD::UNDEF ||
5666           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
5667         ((InVec1.getOpcode() == ISD::UNDEF ||
5668           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
5669       X86Opcode = X86ISD::HADD;
5670     else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
5671         isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) &&
5672         ((InVec0.getOpcode() == ISD::UNDEF ||
5673           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
5674         ((InVec1.getOpcode() == ISD::UNDEF ||
5675           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
5676       X86Opcode = X86ISD::HSUB;
5677     else
5678       CanFold = false;
5679
5680     if (CanFold) {
5681       // Fold this build_vector into a single horizontal add/sub.
5682       // Do this only if the target has AVX2.
5683       if (Subtarget->hasAVX2())
5684         return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);
5685
5686       // Do not try to expand this build_vector into a pair of horizontal
5687       // add/sub if we can emit a pair of scalar add/sub.
5688       if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
5689         return SDValue();
5690
5691       // Convert this build_vector into a pair of horizontal binop followed by
5692       // a concat vector.
5693       bool isUndefLO = NumUndefsLO == Half;
5694       bool isUndefHI = NumUndefsHI == Half;
5695       return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
5696                                    isUndefLO, isUndefHI);
5697     }
5698   }
5699
5700   if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
5701        VT == MVT::v16i16) && Subtarget->hasAVX()) {
5702     unsigned X86Opcode;
5703     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
5704       X86Opcode = X86ISD::HADD;
5705     else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
5706       X86Opcode = X86ISD::HSUB;
5707     else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
5708       X86Opcode = X86ISD::FHADD;
5709     else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
5710       X86Opcode = X86ISD::FHSUB;
5711     else
5712       return SDValue();
5713
5714     // Don't try to expand this build_vector into a pair of horizontal add/sub
5715     // if we can simply emit a pair of scalar add/sub.
5716     if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
5717       return SDValue();
5718
5719     // Convert this build_vector into two horizontal add/sub followed by
5720     // a concat vector.
5721     bool isUndefLO = NumUndefsLO == Half;
5722     bool isUndefHI = NumUndefsHI == Half;
5723     return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
5724                                  isUndefLO, isUndefHI);
5725   }
5726
5727   return SDValue();
5728 }
5729
5730 SDValue
5731 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
5732   SDLoc dl(Op);
5733
5734   MVT VT = Op.getSimpleValueType();
5735   MVT ExtVT = VT.getVectorElementType();
5736   unsigned NumElems = Op.getNumOperands();
5737
5738   // Generate vectors for predicate vectors.
5739   if (VT.getScalarType() == MVT::i1 && Subtarget->hasAVX512())
5740     return LowerBUILD_VECTORvXi1(Op, DAG);
5741
5742   // Vectors containing all zeros can be matched by pxor and xorps later
5743   if (ISD::isBuildVectorAllZeros(Op.getNode())) {
5744     // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
5745     // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
5746     if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
5747       return Op;
5748
5749     return getZeroVector(VT, Subtarget, DAG, dl);
5750   }
5751
5752   // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
5753   // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
5754   // vpcmpeqd on 256-bit vectors.
5755   if (Subtarget->hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
5756     if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasInt256()))
5757       return Op;
5758
5759     if (!VT.is512BitVector())
5760       return getOnesVector(VT, Subtarget->hasInt256(), DAG, dl);
5761   }
5762
5763   BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
5764   if (SDValue AddSub = LowerToAddSub(BV, Subtarget, DAG))
5765     return AddSub;
5766   if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
5767     return HorizontalOp;
5768   if (SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG))
5769     return Broadcast;
5770
5771   unsigned EVTBits = ExtVT.getSizeInBits();
5772
5773   unsigned NumZero  = 0;
5774   unsigned NumNonZero = 0;
5775   unsigned NonZeros = 0;
5776   bool IsAllConstants = true;
5777   SmallSet<SDValue, 8> Values;
5778   for (unsigned i = 0; i < NumElems; ++i) {
5779     SDValue Elt = Op.getOperand(i);
5780     if (Elt.getOpcode() == ISD::UNDEF)
5781       continue;
5782     Values.insert(Elt);
5783     if (Elt.getOpcode() != ISD::Constant &&
5784         Elt.getOpcode() != ISD::ConstantFP)
5785       IsAllConstants = false;
5786     if (X86::isZeroNode(Elt))
5787       NumZero++;
5788     else {
5789       NonZeros |= (1 << i);
5790       NumNonZero++;
5791     }
5792   }
5793
5794   // All undef vector. Return an UNDEF.  All zero vectors were handled above.
5795   if (NumNonZero == 0)
5796     return DAG.getUNDEF(VT);
5797
5798   // Special case for single non-zero, non-undef, element.
5799   if (NumNonZero == 1) {
5800     unsigned Idx = countTrailingZeros(NonZeros);
5801     SDValue Item = Op.getOperand(Idx);
5802
5803     // If this is an insertion of an i64 value on x86-32, and if the top bits of
5804     // the value are obviously zero, truncate the value to i32 and do the
5805     // insertion that way.  Only do this if the value is non-constant or if the
5806     // value is a constant being inserted into element 0.  It is cheaper to do
5807     // a constant pool load than it is to do a movd + shuffle.
5808     if (ExtVT == MVT::i64 && !Subtarget->is64Bit() &&
5809         (!IsAllConstants || Idx == 0)) {
5810       if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
5811         // Handle SSE only.
5812         assert(VT == MVT::v2i64 && "Expected an SSE value type!");
5813         EVT VecVT = MVT::v4i32;
5814
5815         // Truncate the value (which may itself be a constant) to i32, and
5816         // convert it to a vector with movd (S2V+shuffle to zero extend).
5817         Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
5818         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
5819         return DAG.getBitcast(VT, getShuffleVectorZeroOrUndef(
5820                                       Item, Idx * 2, true, Subtarget, DAG));
5821       }
5822     }
5823
5824     // If we have a constant or non-constant insertion into the low element of
5825     // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
5826     // the rest of the elements.  This will be matched as movd/movq/movss/movsd
5827     // depending on what the source datatype is.
5828     if (Idx == 0) {
5829       if (NumZero == 0)
5830         return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
5831
5832       if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
5833           (ExtVT == MVT::i64 && Subtarget->is64Bit())) {
5834         if (VT.is512BitVector()) {
5835           SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl);
5836           return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec,
5837                              Item, DAG.getIntPtrConstant(0, dl));
5838         }
5839         assert((VT.is128BitVector() || VT.is256BitVector()) &&
5840                "Expected an SSE value type!");
5841         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
5842         // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
5843         return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
5844       }
5845
5846       // We can't directly insert an i8 or i16 into a vector, so zero extend
5847       // it to i32 first.
5848       if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
5849         Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
5850         if (VT.is256BitVector()) {
5851           if (Subtarget->hasAVX()) {
5852             Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v8i32, Item);
5853             Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
5854           } else {
5855             // Without AVX, we need to extend to a 128-bit vector and then
5856             // insert into the 256-bit vector.
5857             Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
5858             SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl);
5859             Item = Insert128BitVector(ZeroVec, Item, 0, DAG, dl);
5860           }
5861         } else {
5862           assert(VT.is128BitVector() && "Expected an SSE value type!");
5863           Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
5864           Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
5865         }
5866         return DAG.getBitcast(VT, Item);
5867       }
5868     }
5869
5870     // Is it a vector logical left shift?
5871     if (NumElems == 2 && Idx == 1 &&
5872         X86::isZeroNode(Op.getOperand(0)) &&
5873         !X86::isZeroNode(Op.getOperand(1))) {
5874       unsigned NumBits = VT.getSizeInBits();
5875       return getVShift(true, VT,
5876                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
5877                                    VT, Op.getOperand(1)),
5878                        NumBits/2, DAG, *this, dl);
5879     }
5880
5881     if (IsAllConstants) // Otherwise, it's better to do a constpool load.
5882       return SDValue();
5883
5884     // Otherwise, if this is a vector with i32 or f32 elements, and the element
5885     // is a non-constant being inserted into an element other than the low one,
5886     // we can't use a constant pool load.  Instead, use SCALAR_TO_VECTOR (aka
5887     // movd/movss) to move this into the low element, then shuffle it into
5888     // place.
5889     if (EVTBits == 32) {
5890       Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
5891       return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
5892     }
5893   }
5894
5895   // Splat is obviously ok. Let legalizer expand it to a shuffle.
5896   if (Values.size() == 1) {
5897     if (EVTBits == 32) {
5898       // Instead of a shuffle like this:
5899       // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
5900       // Check if it's possible to issue this instead.
5901       // shuffle (vload ptr)), undef, <1, 1, 1, 1>
5902       unsigned Idx = countTrailingZeros(NonZeros);
5903       SDValue Item = Op.getOperand(Idx);
5904       if (Op.getNode()->isOnlyUserOf(Item.getNode()))
5905         return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
5906     }
5907     return SDValue();
5908   }
5909
5910   // A vector full of immediates; various special cases are already
5911   // handled, so this is best done with a single constant-pool load.
5912   if (IsAllConstants)
5913     return SDValue();
5914
5915   // For AVX-length vectors, see if we can use a vector load to get all of the
5916   // elements, otherwise build the individual 128-bit pieces and use
5917   // shuffles to put them in place.
5918   if (VT.is256BitVector() || VT.is512BitVector()) {
5919     SmallVector<SDValue, 64> V(Op->op_begin(), Op->op_begin() + NumElems);
5920
5921     // Check for a build vector of consecutive loads.
5922     if (SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false))
5923       return LD;
5924
5925     EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
5926
5927     // Build both the lower and upper subvector.
5928     SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
5929                                 makeArrayRef(&V[0], NumElems/2));
5930     SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
5931                                 makeArrayRef(&V[NumElems / 2], NumElems/2));
5932
5933     // Recreate the wider vector with the lower and upper part.
5934     if (VT.is256BitVector())
5935       return Concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
5936     return Concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
5937   }
5938
5939   // Let legalizer expand 2-wide build_vectors.
5940   if (EVTBits == 64) {
5941     if (NumNonZero == 1) {
5942       // One half is zero or undef.
5943       unsigned Idx = countTrailingZeros(NonZeros);
5944       SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
5945                                  Op.getOperand(Idx));
5946       return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
5947     }
5948     return SDValue();
5949   }
5950
5951   // If element VT is < 32 bits, convert it to inserts into a zero vector.
5952   if (EVTBits == 8 && NumElems == 16)
5953     if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG,
5954                                         Subtarget, *this))
5955       return V;
5956
5957   if (EVTBits == 16 && NumElems == 8)
5958     if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG,
5959                                       Subtarget, *this))
5960       return V;
5961
5962   // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
5963   if (EVTBits == 32 && NumElems == 4)
5964     if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget, *this))
5965       return V;
5966
5967   // If element VT is == 32 bits, turn it into a number of shuffles.
5968   SmallVector<SDValue, 8> V(NumElems);
5969   if (NumElems == 4 && NumZero > 0) {
5970     for (unsigned i = 0; i < 4; ++i) {
5971       bool isZero = !(NonZeros & (1 << i));
5972       if (isZero)
5973         V[i] = getZeroVector(VT, Subtarget, DAG, dl);
5974       else
5975         V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
5976     }
5977
5978     for (unsigned i = 0; i < 2; ++i) {
5979       switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
5980         default: break;
5981         case 0:
5982           V[i] = V[i*2];  // Must be a zero vector.
5983           break;
5984         case 1:
5985           V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]);
5986           break;
5987         case 2:
5988           V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]);
5989           break;
5990         case 3:
5991           V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]);
5992           break;
5993       }
5994     }
5995
5996     bool Reverse1 = (NonZeros & 0x3) == 2;
5997     bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
5998     int MaskVec[] = {
5999       Reverse1 ? 1 : 0,
6000       Reverse1 ? 0 : 1,
6001       static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
6002       static_cast<int>(Reverse2 ? NumElems   : NumElems+1)
6003     };
6004     return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]);
6005   }
6006
6007   if (Values.size() > 1 && VT.is128BitVector()) {
6008     // Check for a build vector of consecutive loads.
6009     for (unsigned i = 0; i < NumElems; ++i)
6010       V[i] = Op.getOperand(i);
6011
6012     // Check for elements which are consecutive loads.
6013     if (SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false))
6014       return LD;
6015
6016     // Check for a build vector from mostly shuffle plus few inserting.
6017     if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
6018       return Sh;
6019
6020     // For SSE 4.1, use insertps to put the high elements into the low element.
6021     if (Subtarget->hasSSE41()) {
6022       SDValue Result;
6023       if (Op.getOperand(0).getOpcode() != ISD::UNDEF)
6024         Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
6025       else
6026         Result = DAG.getUNDEF(VT);
6027
6028       for (unsigned i = 1; i < NumElems; ++i) {
6029         if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue;
6030         Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
6031                              Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
6032       }
6033       return Result;
6034     }
6035
6036     // Otherwise, expand into a number of unpckl*, start by extending each of
6037     // our (non-undef) elements to the full vector width with the element in the
6038     // bottom slot of the vector (which generates no code for SSE).
6039     for (unsigned i = 0; i < NumElems; ++i) {
6040       if (Op.getOperand(i).getOpcode() != ISD::UNDEF)
6041         V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
6042       else
6043         V[i] = DAG.getUNDEF(VT);
6044     }
6045
6046     // Next, we iteratively mix elements, e.g. for v4f32:
6047     //   Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
6048     //         : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
6049     //   Step 2: unpcklps X, Y ==>    <3, 2, 1, 0>
6050     unsigned EltStride = NumElems >> 1;
6051     while (EltStride != 0) {
6052       for (unsigned i = 0; i < EltStride; ++i) {
6053         // If V[i+EltStride] is undef and this is the first round of mixing,
6054         // then it is safe to just drop this shuffle: V[i] is already in the
6055         // right place, the one element (since it's the first round) being
6056         // inserted as undef can be dropped.  This isn't safe for successive
6057         // rounds because they will permute elements within both vectors.
6058         if (V[i+EltStride].getOpcode() == ISD::UNDEF &&
6059             EltStride == NumElems/2)
6060           continue;
6061
6062         V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]);
6063       }
6064       EltStride >>= 1;
6065     }
6066     return V[0];
6067   }
6068   return SDValue();
6069 }
6070
6071 // LowerAVXCONCAT_VECTORS - 256-bit AVX can use the vinsertf128 instruction
6072 // to create 256-bit vectors from two other 128-bit ones.
6073 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
6074   SDLoc dl(Op);
6075   MVT ResVT = Op.getSimpleValueType();
6076
6077   assert((ResVT.is256BitVector() ||
6078           ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
6079
6080   SDValue V1 = Op.getOperand(0);
6081   SDValue V2 = Op.getOperand(1);
6082   unsigned NumElems = ResVT.getVectorNumElements();
6083   if (ResVT.is256BitVector())
6084     return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
6085
6086   if (Op.getNumOperands() == 4) {
6087     MVT HalfVT = MVT::getVectorVT(ResVT.getScalarType(),
6088                                 ResVT.getVectorNumElements()/2);
6089     SDValue V3 = Op.getOperand(2);
6090     SDValue V4 = Op.getOperand(3);
6091     return Concat256BitVectors(Concat128BitVectors(V1, V2, HalfVT, NumElems/2, DAG, dl),
6092       Concat128BitVectors(V3, V4, HalfVT, NumElems/2, DAG, dl), ResVT, NumElems, DAG, dl);
6093   }
6094   return Concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
6095 }
6096
6097 static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
6098                                        const X86Subtarget *Subtarget,
6099                                        SelectionDAG & DAG) {
6100   SDLoc dl(Op);
6101   MVT ResVT = Op.getSimpleValueType();
6102   unsigned NumOfOperands = Op.getNumOperands();
6103
6104   assert(isPowerOf2_32(NumOfOperands) &&
6105          "Unexpected number of operands in CONCAT_VECTORS");
6106
6107   if (NumOfOperands > 2) {
6108     MVT HalfVT = MVT::getVectorVT(ResVT.getScalarType(),
6109                                   ResVT.getVectorNumElements()/2);
6110     SmallVector<SDValue, 2> Ops;
6111     for (unsigned i = 0; i < NumOfOperands/2; i++)
6112       Ops.push_back(Op.getOperand(i));
6113     SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops);
6114     Ops.clear();
6115     for (unsigned i = NumOfOperands/2; i < NumOfOperands; i++)
6116       Ops.push_back(Op.getOperand(i));
6117     SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops);
6118     return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
6119   }
6120
6121   SDValue V1 = Op.getOperand(0);
6122   SDValue V2 = Op.getOperand(1);
6123   bool IsZeroV1 = ISD::isBuildVectorAllZeros(V1.getNode());
6124   bool IsZeroV2 = ISD::isBuildVectorAllZeros(V2.getNode());
6125
6126   if (IsZeroV1 && IsZeroV2)
6127     return getZeroVector(ResVT, Subtarget, DAG, dl);
6128
6129   SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
6130   SDValue Undef = DAG.getUNDEF(ResVT);
6131   unsigned NumElems = ResVT.getVectorNumElements();
6132   SDValue ShiftBits = DAG.getConstant(NumElems/2, dl, MVT::i8);
6133
6134   V2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V2, ZeroIdx);
6135   V2 = DAG.getNode(X86ISD::VSHLI, dl, ResVT, V2, ShiftBits);
6136   if (IsZeroV1)
6137     return V2;
6138
6139   V1 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx);
6140   // Zero the upper bits of V1
6141   V1 = DAG.getNode(X86ISD::VSHLI, dl, ResVT, V1, ShiftBits);
6142   V1 = DAG.getNode(X86ISD::VSRLI, dl, ResVT, V1, ShiftBits);
6143   if (IsZeroV2)
6144     return V1;
6145   return DAG.getNode(ISD::OR, dl, ResVT, V1, V2);
6146 }
6147
6148 static SDValue LowerCONCAT_VECTORS(SDValue Op,
6149                                    const X86Subtarget *Subtarget,
6150                                    SelectionDAG &DAG) {
6151   MVT VT = Op.getSimpleValueType();
6152   if (VT.getVectorElementType() == MVT::i1)
6153     return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
6154
6155   assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
6156          (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
6157           Op.getNumOperands() == 4)));
6158
6159   // AVX can use the vinsertf128 instruction to create 256-bit vectors
6160   // from two other 128-bit ones.
6161
6162   // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
6163   return LowerAVXCONCAT_VECTORS(Op, DAG);
6164 }
6165
6166
6167 //===----------------------------------------------------------------------===//
6168 // Vector shuffle lowering
6169 //
6170 // This is an experimental code path for lowering vector shuffles on x86. It is
6171 // designed to handle arbitrary vector shuffles and blends, gracefully
6172 // degrading performance as necessary. It works hard to recognize idiomatic
6173 // shuffles and lower them to optimal instruction patterns without leaving
6174 // a framework that allows reasonably efficient handling of all vector shuffle
6175 // patterns.
6176 //===----------------------------------------------------------------------===//
6177
6178 /// \brief Tiny helper function to identify a no-op mask.
6179 ///
6180 /// This is a somewhat boring predicate function. It checks whether the mask
6181 /// array input, which is assumed to be a single-input shuffle mask of the kind
6182 /// used by the X86 shuffle instructions (not a fully general
6183 /// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
6184 /// in-place shuffle are 'no-op's.
6185 static bool isNoopShuffleMask(ArrayRef<int> Mask) {
6186   for (int i = 0, Size = Mask.size(); i < Size; ++i)
6187     if (Mask[i] != -1 && Mask[i] != i)
6188       return false;
6189   return true;
6190 }
6191
6192 /// \brief Helper function to classify a mask as a single-input mask.
6193 ///
6194 /// This isn't a generic single-input test because in the vector shuffle
6195 /// lowering we canonicalize single inputs to be the first input operand. This
6196 /// means we can more quickly test for a single input by only checking whether
6197 /// an input from the second operand exists. We also assume that the size of
6198 /// mask corresponds to the size of the input vectors which isn't true in the
6199 /// fully general case.
6200 static bool isSingleInputShuffleMask(ArrayRef<int> Mask) {
6201   for (int M : Mask)
6202     if (M >= (int)Mask.size())
6203       return false;
6204   return true;
6205 }
6206
6207 /// \brief Test whether there are elements crossing 128-bit lanes in this
6208 /// shuffle mask.
6209 ///
6210 /// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
6211 /// and we routinely test for these.
6212 static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
6213   int LaneSize = 128 / VT.getScalarSizeInBits();
6214   int Size = Mask.size();
6215   for (int i = 0; i < Size; ++i)
6216     if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
6217       return true;
6218   return false;
6219 }
6220
6221 /// \brief Test whether a shuffle mask is equivalent within each 128-bit lane.
6222 ///
6223 /// This checks a shuffle mask to see if it is performing the same
6224 /// 128-bit lane-relative shuffle in each 128-bit lane. This trivially implies
6225 /// that it is also not lane-crossing. It may however involve a blend from the
6226 /// same lane of a second vector.
6227 ///
6228 /// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
6229 /// non-trivial to compute in the face of undef lanes. The representation is
6230 /// *not* suitable for use with existing 128-bit shuffles as it will contain
6231 /// entries from both V1 and V2 inputs to the wider mask.
6232 static bool
6233 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
6234                                 SmallVectorImpl<int> &RepeatedMask) {
6235   int LaneSize = 128 / VT.getScalarSizeInBits();
6236   RepeatedMask.resize(LaneSize, -1);
6237   int Size = Mask.size();
6238   for (int i = 0; i < Size; ++i) {
6239     if (Mask[i] < 0)
6240       continue;
6241     if ((Mask[i] % Size) / LaneSize != i / LaneSize)
6242       // This entry crosses lanes, so there is no way to model this shuffle.
6243       return false;
6244
6245     // Ok, handle the in-lane shuffles by detecting if and when they repeat.
6246     if (RepeatedMask[i % LaneSize] == -1)
6247       // This is the first non-undef entry in this slot of a 128-bit lane.
6248       RepeatedMask[i % LaneSize] =
6249           Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + Size;
6250     else if (RepeatedMask[i % LaneSize] + (i / LaneSize) * LaneSize != Mask[i])
6251       // Found a mismatch with the repeated mask.
6252       return false;
6253   }
6254   return true;
6255 }
6256
6257 /// \brief Test whether a shuffle mask is equivalent within each 256-bit lane.
6258 ///
6259 /// This checks a shuffle mask to see if it is performing the same
6260 /// 256-bit lane-relative shuffle in each 256-bit lane. This trivially implies
6261 /// that it is also not lane-crossing. It may however involve a blend from the
6262 /// same lane of a second vector.
6263 ///
6264 /// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
6265 /// non-trivial to compute in the face of undef lanes. The representation is
6266 /// *not* suitable for use with existing 256-bit shuffles as it will contain
6267 /// entries from both V1 and V2 inputs to the wider mask.
6268 static bool
6269 is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
6270                                 SmallVectorImpl<int> &RepeatedMask) {
6271   int LaneSize = 256 / VT.getScalarSizeInBits();
6272   RepeatedMask.resize(LaneSize, -1);
6273   int Size = Mask.size();
6274   for (int i = 0; i < Size; ++i) {
6275     if (Mask[i] < 0)
6276       continue;
6277     if ((Mask[i] % Size) / LaneSize != i / LaneSize)
6278       // This entry crosses lanes, so there is no way to model this shuffle.
6279       return false;
6280
6281     // Ok, handle the in-lane shuffles by detecting if and when they repeat.
6282     if (RepeatedMask[i % LaneSize] == -1)
6283       // This is the first non-undef entry in this slot of a 256-bit lane.
6284       RepeatedMask[i % LaneSize] =
6285           Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + Size;
6286     else if (RepeatedMask[i % LaneSize] + (i / LaneSize) * LaneSize != Mask[i])
6287       // Found a mismatch with the repeated mask.
6288       return false;
6289   }
6290   return true;
6291 }
6292
6293 /// \brief Checks whether a shuffle mask is equivalent to an explicit list of
6294 /// arguments.
6295 ///
6296 /// This is a fast way to test a shuffle mask against a fixed pattern:
6297 ///
6298 ///   if (isShuffleEquivalent(Mask, 3, 2, {1, 0})) { ... }
6299 ///
6300 /// It returns true if the mask is exactly as wide as the argument list, and
6301 /// each element of the mask is either -1 (signifying undef) or the value given
6302 /// in the argument.
6303 static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
6304                                 ArrayRef<int> ExpectedMask) {
6305   if (Mask.size() != ExpectedMask.size())
6306     return false;
6307
6308   int Size = Mask.size();
6309
6310   // If the values are build vectors, we can look through them to find
6311   // equivalent inputs that make the shuffles equivalent.
6312   auto *BV1 = dyn_cast<BuildVectorSDNode>(V1);
6313   auto *BV2 = dyn_cast<BuildVectorSDNode>(V2);
6314
6315   for (int i = 0; i < Size; ++i)
6316     if (Mask[i] != -1 && Mask[i] != ExpectedMask[i]) {
6317       auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
6318       auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2;
6319       if (!MaskBV || !ExpectedBV ||
6320           MaskBV->getOperand(Mask[i] % Size) !=
6321               ExpectedBV->getOperand(ExpectedMask[i] % Size))
6322         return false;
6323     }
6324
6325   return true;
6326 }
6327
6328 /// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
6329 ///
6330 /// This helper function produces an 8-bit shuffle immediate corresponding to
6331 /// the ubiquitous shuffle encoding scheme used in x86 instructions for
6332 /// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
6333 /// example.
6334 ///
6335 /// NB: We rely heavily on "undef" masks preserving the input lane.
6336 static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, SDLoc DL,
6337                                           SelectionDAG &DAG) {
6338   assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
6339   assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
6340   assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
6341   assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
6342   assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
6343
6344   unsigned Imm = 0;
6345   Imm |= (Mask[0] == -1 ? 0 : Mask[0]) << 0;
6346   Imm |= (Mask[1] == -1 ? 1 : Mask[1]) << 2;
6347   Imm |= (Mask[2] == -1 ? 2 : Mask[2]) << 4;
6348   Imm |= (Mask[3] == -1 ? 3 : Mask[3]) << 6;
6349   return DAG.getConstant(Imm, DL, MVT::i8);
6350 }
6351
6352 /// \brief Get a 8-bit shuffle, 1 bit per lane, immediate for a mask.
6353 ///
6354 /// This helper function produces an 8-bit shuffle immediate corresponding to
6355 /// the ubiquitous shuffle encoding scheme used in x86 instructions for
6356 /// shuffling 8 lanes.
6357 static SDValue get1bitLaneShuffleImm8ForMask(ArrayRef<int> Mask, SDLoc DL,
6358                                              SelectionDAG &DAG) {
6359   assert(Mask.size() <= 8 &&
6360          "Up to 8 elts may be in Imm8 1-bit lane shuffle mask");
6361   unsigned Imm = 0;
6362   for (unsigned i = 0; i < Mask.size(); ++i)
6363     if (Mask[i] >= 0)
6364       Imm |= (Mask[i] % 2) << i;
6365   return DAG.getConstant(Imm, DL, MVT::i8);
6366 }
6367
6368 /// \brief Try to emit a blend instruction for a shuffle using bit math.
6369 ///
6370 /// This is used as a fallback approach when first class blend instructions are
6371 /// unavailable. Currently it is only suitable for integer vectors, but could
6372 /// be generalized for floating point vectors if desirable.
6373 static SDValue lowerVectorShuffleAsBitBlend(SDLoc DL, MVT VT, SDValue V1,
6374                                             SDValue V2, ArrayRef<int> Mask,
6375                                             SelectionDAG &DAG) {
6376   assert(VT.isInteger() && "Only supports integer vector types!");
6377   MVT EltVT = VT.getScalarType();
6378   int NumEltBits = EltVT.getSizeInBits();
6379   SDValue Zero = DAG.getConstant(0, DL, EltVT);
6380   SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), DL,
6381                                     EltVT);
6382   SmallVector<SDValue, 16> MaskOps;
6383   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
6384     if (Mask[i] != -1 && Mask[i] != i && Mask[i] != i + Size)
6385       return SDValue(); // Shuffled input!
6386     MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
6387   }
6388
6389   SDValue V1Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, MaskOps);
6390   V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
6391   // We have to cast V2 around.
6392   MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
6393   V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::ANDNP, DL, MaskVT,
6394                                       DAG.getBitcast(MaskVT, V1Mask),
6395                                       DAG.getBitcast(MaskVT, V2)));
6396   return DAG.getNode(ISD::OR, DL, VT, V1, V2);
6397 }
6398
6399 /// \brief Try to emit a blend instruction for a shuffle.
6400 ///
6401 /// This doesn't do any checks for the availability of instructions for blending
6402 /// these values. It relies on the availability of the X86ISD::BLENDI pattern to
6403 /// be matched in the backend with the type given. What it does check for is
6404 /// that the shuffle mask is in fact a blend.
6405 static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
6406                                          SDValue V2, ArrayRef<int> Mask,
6407                                          const X86Subtarget *Subtarget,
6408                                          SelectionDAG &DAG) {
6409   unsigned BlendMask = 0;
6410   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
6411     if (Mask[i] >= Size) {
6412       if (Mask[i] != i + Size)
6413         return SDValue(); // Shuffled V2 input!
6414       BlendMask |= 1u << i;
6415       continue;
6416     }
6417     if (Mask[i] >= 0 && Mask[i] != i)
6418       return SDValue(); // Shuffled V1 input!
6419   }
6420   switch (VT.SimpleTy) {
6421   case MVT::v2f64:
6422   case MVT::v4f32:
6423   case MVT::v4f64:
6424   case MVT::v8f32:
6425     return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
6426                        DAG.getConstant(BlendMask, DL, MVT::i8));
6427
6428   case MVT::v4i64:
6429   case MVT::v8i32:
6430     assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
6431     // FALLTHROUGH
6432   case MVT::v2i64:
6433   case MVT::v4i32:
6434     // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
6435     // that instruction.
6436     if (Subtarget->hasAVX2()) {
6437       // Scale the blend by the number of 32-bit dwords per element.
6438       int Scale =  VT.getScalarSizeInBits() / 32;
6439       BlendMask = 0;
6440       for (int i = 0, Size = Mask.size(); i < Size; ++i)
6441         if (Mask[i] >= Size)
6442           for (int j = 0; j < Scale; ++j)
6443             BlendMask |= 1u << (i * Scale + j);
6444
6445       MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
6446       V1 = DAG.getBitcast(BlendVT, V1);
6447       V2 = DAG.getBitcast(BlendVT, V2);
6448       return DAG.getBitcast(
6449           VT, DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
6450                           DAG.getConstant(BlendMask, DL, MVT::i8)));
6451     }
6452     // FALLTHROUGH
6453   case MVT::v8i16: {
6454     // For integer shuffles we need to expand the mask and cast the inputs to
6455     // v8i16s prior to blending.
6456     int Scale = 8 / VT.getVectorNumElements();
6457     BlendMask = 0;
6458     for (int i = 0, Size = Mask.size(); i < Size; ++i)
6459       if (Mask[i] >= Size)
6460         for (int j = 0; j < Scale; ++j)
6461           BlendMask |= 1u << (i * Scale + j);
6462
6463     V1 = DAG.getBitcast(MVT::v8i16, V1);
6464     V2 = DAG.getBitcast(MVT::v8i16, V2);
6465     return DAG.getBitcast(VT,
6466                           DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
6467                                       DAG.getConstant(BlendMask, DL, MVT::i8)));
6468   }
6469
6470   case MVT::v16i16: {
6471     assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
6472     SmallVector<int, 8> RepeatedMask;
6473     if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
6474       // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
6475       assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
6476       BlendMask = 0;
6477       for (int i = 0; i < 8; ++i)
6478         if (RepeatedMask[i] >= 16)
6479           BlendMask |= 1u << i;
6480       return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
6481                          DAG.getConstant(BlendMask, DL, MVT::i8));
6482     }
6483   }
6484     // FALLTHROUGH
6485   case MVT::v16i8:
6486   case MVT::v32i8: {
6487     assert((VT.getSizeInBits() == 128 || Subtarget->hasAVX2()) &&
6488            "256-bit byte-blends require AVX2 support!");
6489
6490     // Scale the blend by the number of bytes per element.
6491     int Scale = VT.getScalarSizeInBits() / 8;
6492
6493     // This form of blend is always done on bytes. Compute the byte vector
6494     // type.
6495     MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
6496
6497     // Compute the VSELECT mask. Note that VSELECT is really confusing in the
6498     // mix of LLVM's code generator and the x86 backend. We tell the code
6499     // generator that boolean values in the elements of an x86 vector register
6500     // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
6501     // mapping a select to operand #1, and 'false' mapping to operand #2. The
6502     // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
6503     // of the element (the remaining are ignored) and 0 in that high bit would
6504     // mean operand #1 while 1 in the high bit would mean operand #2. So while
6505     // the LLVM model for boolean values in vector elements gets the relevant
6506     // bit set, it is set backwards and over constrained relative to x86's
6507     // actual model.
6508     SmallVector<SDValue, 32> VSELECTMask;
6509     for (int i = 0, Size = Mask.size(); i < Size; ++i)
6510       for (int j = 0; j < Scale; ++j)
6511         VSELECTMask.push_back(
6512             Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
6513                         : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
6514                                           MVT::i8));
6515
6516     V1 = DAG.getBitcast(BlendVT, V1);
6517     V2 = DAG.getBitcast(BlendVT, V2);
6518     return DAG.getBitcast(VT, DAG.getNode(ISD::VSELECT, DL, BlendVT,
6519                                           DAG.getNode(ISD::BUILD_VECTOR, DL,
6520                                                       BlendVT, VSELECTMask),
6521                                           V1, V2));
6522   }
6523
6524   default:
6525     llvm_unreachable("Not a supported integer vector type!");
6526   }
6527 }
6528
6529 /// \brief Try to lower as a blend of elements from two inputs followed by
6530 /// a single-input permutation.
6531 ///
6532 /// This matches the pattern where we can blend elements from two inputs and
6533 /// then reduce the shuffle to a single-input permutation.
6534 static SDValue lowerVectorShuffleAsBlendAndPermute(SDLoc DL, MVT VT, SDValue V1,
6535                                                    SDValue V2,
6536                                                    ArrayRef<int> Mask,
6537                                                    SelectionDAG &DAG) {
6538   // We build up the blend mask while checking whether a blend is a viable way
6539   // to reduce the shuffle.
6540   SmallVector<int, 32> BlendMask(Mask.size(), -1);
6541   SmallVector<int, 32> PermuteMask(Mask.size(), -1);
6542
6543   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
6544     if (Mask[i] < 0)
6545       continue;
6546
6547     assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
6548
6549     if (BlendMask[Mask[i] % Size] == -1)
6550       BlendMask[Mask[i] % Size] = Mask[i];
6551     else if (BlendMask[Mask[i] % Size] != Mask[i])
6552       return SDValue(); // Can't blend in the needed input!
6553
6554     PermuteMask[i] = Mask[i] % Size;
6555   }
6556
6557   SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
6558   return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
6559 }
6560
6561 /// \brief Generic routine to decompose a shuffle and blend into indepndent
6562 /// blends and permutes.
6563 ///
6564 /// This matches the extremely common pattern for handling combined
6565 /// shuffle+blend operations on newer X86 ISAs where we have very fast blend
6566 /// operations. It will try to pick the best arrangement of shuffles and
6567 /// blends.
6568 static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(SDLoc DL, MVT VT,
6569                                                           SDValue V1,
6570                                                           SDValue V2,
6571                                                           ArrayRef<int> Mask,
6572                                                           SelectionDAG &DAG) {
6573   // Shuffle the input elements into the desired positions in V1 and V2 and
6574   // blend them together.
6575   SmallVector<int, 32> V1Mask(Mask.size(), -1);
6576   SmallVector<int, 32> V2Mask(Mask.size(), -1);
6577   SmallVector<int, 32> BlendMask(Mask.size(), -1);
6578   for (int i = 0, Size = Mask.size(); i < Size; ++i)
6579     if (Mask[i] >= 0 && Mask[i] < Size) {
6580       V1Mask[i] = Mask[i];
6581       BlendMask[i] = i;
6582     } else if (Mask[i] >= Size) {
6583       V2Mask[i] = Mask[i] - Size;
6584       BlendMask[i] = i + Size;
6585     }
6586
6587   // Try to lower with the simpler initial blend strategy unless one of the
6588   // input shuffles would be a no-op. We prefer to shuffle inputs as the
6589   // shuffle may be able to fold with a load or other benefit. However, when
6590   // we'll have to do 2x as many shuffles in order to achieve this, blending
6591   // first is a better strategy.
6592   if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask))
6593     if (SDValue BlendPerm =
6594             lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG))
6595       return BlendPerm;
6596
6597   V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
6598   V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
6599   return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
6600 }
6601
6602 /// \brief Try to lower a vector shuffle as a byte rotation.
6603 ///
6604 /// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
6605 /// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
6606 /// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
6607 /// try to generically lower a vector shuffle through such an pattern. It
6608 /// does not check for the profitability of lowering either as PALIGNR or
6609 /// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
6610 /// This matches shuffle vectors that look like:
6611 ///
6612 ///   v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
6613 ///
6614 /// Essentially it concatenates V1 and V2, shifts right by some number of
6615 /// elements, and takes the low elements as the result. Note that while this is
6616 /// specified as a *right shift* because x86 is little-endian, it is a *left
6617 /// rotate* of the vector lanes.
6618 static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1,
6619                                               SDValue V2,
6620                                               ArrayRef<int> Mask,
6621                                               const X86Subtarget *Subtarget,
6622                                               SelectionDAG &DAG) {
6623   assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
6624
6625   int NumElts = Mask.size();
6626   int NumLanes = VT.getSizeInBits() / 128;
6627   int NumLaneElts = NumElts / NumLanes;
6628
6629   // We need to detect various ways of spelling a rotation:
6630   //   [11, 12, 13, 14, 15,  0,  1,  2]
6631   //   [-1, 12, 13, 14, -1, -1,  1, -1]
6632   //   [-1, -1, -1, -1, -1, -1,  1,  2]
6633   //   [ 3,  4,  5,  6,  7,  8,  9, 10]
6634   //   [-1,  4,  5,  6, -1, -1,  9, -1]
6635   //   [-1,  4,  5,  6, -1, -1, -1, -1]
6636   int Rotation = 0;
6637   SDValue Lo, Hi;
6638   for (int l = 0; l < NumElts; l += NumLaneElts) {
6639     for (int i = 0; i < NumLaneElts; ++i) {
6640       if (Mask[l + i] == -1)
6641         continue;
6642       assert(Mask[l + i] >= 0 && "Only -1 is a valid negative mask element!");
6643
6644       // Get the mod-Size index and lane correct it.
6645       int LaneIdx = (Mask[l + i] % NumElts) - l;
6646       // Make sure it was in this lane.
6647       if (LaneIdx < 0 || LaneIdx >= NumLaneElts)
6648         return SDValue();
6649
6650       // Determine where a rotated vector would have started.
6651       int StartIdx = i - LaneIdx;
6652       if (StartIdx == 0)
6653         // The identity rotation isn't interesting, stop.
6654         return SDValue();
6655
6656       // If we found the tail of a vector the rotation must be the missing
6657       // front. If we found the head of a vector, it must be how much of the
6658       // head.
6659       int CandidateRotation = StartIdx < 0 ? -StartIdx : NumLaneElts - StartIdx;
6660
6661       if (Rotation == 0)
6662         Rotation = CandidateRotation;
6663       else if (Rotation != CandidateRotation)
6664         // The rotations don't match, so we can't match this mask.
6665         return SDValue();
6666
6667       // Compute which value this mask is pointing at.
6668       SDValue MaskV = Mask[l + i] < NumElts ? V1 : V2;
6669
6670       // Compute which of the two target values this index should be assigned
6671       // to. This reflects whether the high elements are remaining or the low
6672       // elements are remaining.
6673       SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
6674
6675       // Either set up this value if we've not encountered it before, or check
6676       // that it remains consistent.
6677       if (!TargetV)
6678         TargetV = MaskV;
6679       else if (TargetV != MaskV)
6680         // This may be a rotation, but it pulls from the inputs in some
6681         // unsupported interleaving.
6682         return SDValue();
6683     }
6684   }
6685
6686   // Check that we successfully analyzed the mask, and normalize the results.
6687   assert(Rotation != 0 && "Failed to locate a viable rotation!");
6688   assert((Lo || Hi) && "Failed to find a rotated input vector!");
6689   if (!Lo)
6690     Lo = Hi;
6691   else if (!Hi)
6692     Hi = Lo;
6693
6694   // The actual rotate instruction rotates bytes, so we need to scale the
6695   // rotation based on how many bytes are in the vector lane.
6696   int Scale = 16 / NumLaneElts;
6697
6698   // SSSE3 targets can use the palignr instruction.
6699   if (Subtarget->hasSSSE3()) {
6700     // Cast the inputs to i8 vector of correct length to match PALIGNR.
6701     MVT AlignVT = MVT::getVectorVT(MVT::i8, 16 * NumLanes);
6702     Lo = DAG.getBitcast(AlignVT, Lo);
6703     Hi = DAG.getBitcast(AlignVT, Hi);
6704
6705     return DAG.getBitcast(
6706         VT, DAG.getNode(X86ISD::PALIGNR, DL, AlignVT, Hi, Lo,
6707                         DAG.getConstant(Rotation * Scale, DL, MVT::i8)));
6708   }
6709
6710   assert(VT.getSizeInBits() == 128 &&
6711          "Rotate-based lowering only supports 128-bit lowering!");
6712   assert(Mask.size() <= 16 &&
6713          "Can shuffle at most 16 bytes in a 128-bit vector!");
6714
6715   // Default SSE2 implementation
6716   int LoByteShift = 16 - Rotation * Scale;
6717   int HiByteShift = Rotation * Scale;
6718
6719   // Cast the inputs to v2i64 to match PSLLDQ/PSRLDQ.
6720   Lo = DAG.getBitcast(MVT::v2i64, Lo);
6721   Hi = DAG.getBitcast(MVT::v2i64, Hi);
6722
6723   SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, Lo,
6724                                 DAG.getConstant(LoByteShift, DL, MVT::i8));
6725   SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, Hi,
6726                                 DAG.getConstant(HiByteShift, DL, MVT::i8));
6727   return DAG.getBitcast(VT,
6728                         DAG.getNode(ISD::OR, DL, MVT::v2i64, LoShift, HiShift));
6729 }
6730
6731 /// \brief Compute whether each element of a shuffle is zeroable.
6732 ///
6733 /// A "zeroable" vector shuffle element is one which can be lowered to zero.
6734 /// Either it is an undef element in the shuffle mask, the element of the input
6735 /// referenced is undef, or the element of the input referenced is known to be
6736 /// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
6737 /// as many lanes with this technique as possible to simplify the remaining
6738 /// shuffle.
6739 static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
6740                                                      SDValue V1, SDValue V2) {
6741   SmallBitVector Zeroable(Mask.size(), false);
6742
6743   while (V1.getOpcode() == ISD::BITCAST)
6744     V1 = V1->getOperand(0);
6745   while (V2.getOpcode() == ISD::BITCAST)
6746     V2 = V2->getOperand(0);
6747
6748   bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
6749   bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
6750
6751   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
6752     int M = Mask[i];
6753     // Handle the easy cases.
6754     if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
6755       Zeroable[i] = true;
6756       continue;
6757     }
6758
6759     // If this is an index into a build_vector node (which has the same number
6760     // of elements), dig out the input value and use it.
6761     SDValue V = M < Size ? V1 : V2;
6762     if (V.getOpcode() != ISD::BUILD_VECTOR || Size != (int)V.getNumOperands())
6763       continue;
6764
6765     SDValue Input = V.getOperand(M % Size);
6766     // The UNDEF opcode check really should be dead code here, but not quite
6767     // worth asserting on (it isn't invalid, just unexpected).
6768     if (Input.getOpcode() == ISD::UNDEF || X86::isZeroNode(Input))
6769       Zeroable[i] = true;
6770   }
6771
6772   return Zeroable;
6773 }
6774
6775 /// \brief Try to emit a bitmask instruction for a shuffle.
6776 ///
6777 /// This handles cases where we can model a blend exactly as a bitmask due to
6778 /// one of the inputs being zeroable.
6779 static SDValue lowerVectorShuffleAsBitMask(SDLoc DL, MVT VT, SDValue V1,
6780                                            SDValue V2, ArrayRef<int> Mask,
6781                                            SelectionDAG &DAG) {
6782   MVT EltVT = VT.getScalarType();
6783   int NumEltBits = EltVT.getSizeInBits();
6784   MVT IntEltVT = MVT::getIntegerVT(NumEltBits);
6785   SDValue Zero = DAG.getConstant(0, DL, IntEltVT);
6786   SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), DL,
6787                                     IntEltVT);
6788   if (EltVT.isFloatingPoint()) {
6789     Zero = DAG.getBitcast(EltVT, Zero);
6790     AllOnes = DAG.getBitcast(EltVT, AllOnes);
6791   }
6792   SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
6793   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
6794   SDValue V;
6795   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
6796     if (Zeroable[i])
6797       continue;
6798     if (Mask[i] % Size != i)
6799       return SDValue(); // Not a blend.
6800     if (!V)
6801       V = Mask[i] < Size ? V1 : V2;
6802     else if (V != (Mask[i] < Size ? V1 : V2))
6803       return SDValue(); // Can only let one input through the mask.
6804
6805     VMaskOps[i] = AllOnes;
6806   }
6807   if (!V)
6808     return SDValue(); // No non-zeroable elements!
6809
6810   SDValue VMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, VMaskOps);
6811   V = DAG.getNode(VT.isFloatingPoint()
6812                   ? (unsigned) X86ISD::FAND : (unsigned) ISD::AND,
6813                   DL, VT, V, VMask);
6814   return V;
6815 }
6816
6817 /// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros).
6818 ///
6819 /// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
6820 /// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
6821 /// matches elements from one of the input vectors shuffled to the left or
6822 /// right with zeroable elements 'shifted in'. It handles both the strictly
6823 /// bit-wise element shifts and the byte shift across an entire 128-bit double
6824 /// quad word lane.
6825 ///
6826 /// PSHL : (little-endian) left bit shift.
6827 /// [ zz, 0, zz,  2 ]
6828 /// [ -1, 4, zz, -1 ]
6829 /// PSRL : (little-endian) right bit shift.
6830 /// [  1, zz,  3, zz]
6831 /// [ -1, -1,  7, zz]
6832 /// PSLLDQ : (little-endian) left byte shift
6833 /// [ zz,  0,  1,  2,  3,  4,  5,  6]
6834 /// [ zz, zz, -1, -1,  2,  3,  4, -1]
6835 /// [ zz, zz, zz, zz, zz, zz, -1,  1]
6836 /// PSRLDQ : (little-endian) right byte shift
6837 /// [  5, 6,  7, zz, zz, zz, zz, zz]
6838 /// [ -1, 5,  6,  7, zz, zz, zz, zz]
6839 /// [  1, 2, -1, -1, -1, -1, zz, zz]
6840 static SDValue lowerVectorShuffleAsShift(SDLoc DL, MVT VT, SDValue V1,
6841                                          SDValue V2, ArrayRef<int> Mask,
6842                                          SelectionDAG &DAG) {
6843   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
6844
6845   int Size = Mask.size();
6846   assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
6847
6848   auto CheckZeros = [&](int Shift, int Scale, bool Left) {
6849     for (int i = 0; i < Size; i += Scale)
6850       for (int j = 0; j < Shift; ++j)
6851         if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
6852           return false;
6853
6854     return true;
6855   };
6856
6857   auto MatchShift = [&](int Shift, int Scale, bool Left, SDValue V) {
6858     for (int i = 0; i != Size; i += Scale) {
6859       unsigned Pos = Left ? i + Shift : i;
6860       unsigned Low = Left ? i : i + Shift;
6861       unsigned Len = Scale - Shift;
6862       if (!isSequentialOrUndefInRange(Mask, Pos, Len,
6863                                       Low + (V == V1 ? 0 : Size)))
6864         return SDValue();
6865     }
6866
6867     int ShiftEltBits = VT.getScalarSizeInBits() * Scale;
6868     bool ByteShift = ShiftEltBits > 64;
6869     unsigned OpCode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
6870                            : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
6871     int ShiftAmt = Shift * VT.getScalarSizeInBits() / (ByteShift ? 8 : 1);
6872
6873     // Normalize the scale for byte shifts to still produce an i64 element
6874     // type.
6875     Scale = ByteShift ? Scale / 2 : Scale;
6876
6877     // We need to round trip through the appropriate type for the shift.
6878     MVT ShiftSVT = MVT::getIntegerVT(VT.getScalarSizeInBits() * Scale);
6879     MVT ShiftVT = MVT::getVectorVT(ShiftSVT, Size / Scale);
6880     assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
6881            "Illegal integer vector type");
6882     V = DAG.getBitcast(ShiftVT, V);
6883
6884     V = DAG.getNode(OpCode, DL, ShiftVT, V,
6885                     DAG.getConstant(ShiftAmt, DL, MVT::i8));
6886     return DAG.getBitcast(VT, V);
6887   };
6888
6889   // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
6890   // keep doubling the size of the integer elements up to that. We can
6891   // then shift the elements of the integer vector by whole multiples of
6892   // their width within the elements of the larger integer vector. Test each
6893   // multiple to see if we can find a match with the moved element indices
6894   // and that the shifted in elements are all zeroable.
6895   for (int Scale = 2; Scale * VT.getScalarSizeInBits() <= 128; Scale *= 2)
6896     for (int Shift = 1; Shift != Scale; ++Shift)
6897       for (bool Left : {true, false})
6898         if (CheckZeros(Shift, Scale, Left))
6899           for (SDValue V : {V1, V2})
6900             if (SDValue Match = MatchShift(Shift, Scale, Left, V))
6901               return Match;
6902
6903   // no match
6904   return SDValue();
6905 }
6906
6907 /// \brief Lower a vector shuffle as a zero or any extension.
6908 ///
6909 /// Given a specific number of elements, element bit width, and extension
6910 /// stride, produce either a zero or any extension based on the available
6911 /// features of the subtarget.
6912 static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
6913     SDLoc DL, MVT VT, int Scale, bool AnyExt, SDValue InputV,
6914     const X86Subtarget *Subtarget, SelectionDAG &DAG) {
6915   assert(Scale > 1 && "Need a scale to extend.");
6916   int NumElements = VT.getVectorNumElements();
6917   int EltBits = VT.getScalarSizeInBits();
6918   assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
6919          "Only 8, 16, and 32 bit elements can be extended.");
6920   assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
6921
6922   // Found a valid zext mask! Try various lowering strategies based on the
6923   // input type and available ISA extensions.
6924   if (Subtarget->hasSSE41()) {
6925     MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
6926                                  NumElements / Scale);
6927     return DAG.getBitcast(VT, DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV));
6928   }
6929
6930   // For any extends we can cheat for larger element sizes and use shuffle
6931   // instructions that can fold with a load and/or copy.
6932   if (AnyExt && EltBits == 32) {
6933     int PSHUFDMask[4] = {0, -1, 1, -1};
6934     return DAG.getBitcast(
6935         VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
6936                         DAG.getBitcast(MVT::v4i32, InputV),
6937                         getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
6938   }
6939   if (AnyExt && EltBits == 16 && Scale > 2) {
6940     int PSHUFDMask[4] = {0, -1, 0, -1};
6941     InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
6942                          DAG.getBitcast(MVT::v4i32, InputV),
6943                          getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
6944     int PSHUFHWMask[4] = {1, -1, -1, -1};
6945     return DAG.getBitcast(
6946         VT, DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16,
6947                         DAG.getBitcast(MVT::v8i16, InputV),
6948                         getV4X86ShuffleImm8ForMask(PSHUFHWMask, DL, DAG)));
6949   }
6950
6951   // If this would require more than 2 unpack instructions to expand, use
6952   // pshufb when available. We can only use more than 2 unpack instructions
6953   // when zero extending i8 elements which also makes it easier to use pshufb.
6954   if (Scale > 4 && EltBits == 8 && Subtarget->hasSSSE3()) {
6955     assert(NumElements == 16 && "Unexpected byte vector width!");
6956     SDValue PSHUFBMask[16];
6957     for (int i = 0; i < 16; ++i)
6958       PSHUFBMask[i] =
6959           DAG.getConstant((i % Scale == 0) ? i / Scale : 0x80, DL, MVT::i8);
6960     InputV = DAG.getBitcast(MVT::v16i8, InputV);
6961     return DAG.getBitcast(VT,
6962                           DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
6963                                       DAG.getNode(ISD::BUILD_VECTOR, DL,
6964                                                   MVT::v16i8, PSHUFBMask)));
6965   }
6966
6967   // Otherwise emit a sequence of unpacks.
6968   do {
6969     MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
6970     SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
6971                          : getZeroVector(InputVT, Subtarget, DAG, DL);
6972     InputV = DAG.getBitcast(InputVT, InputV);
6973     InputV = DAG.getNode(X86ISD::UNPCKL, DL, InputVT, InputV, Ext);
6974     Scale /= 2;
6975     EltBits *= 2;
6976     NumElements /= 2;
6977   } while (Scale > 1);
6978   return DAG.getBitcast(VT, InputV);
6979 }
6980
6981 /// \brief Try to lower a vector shuffle as a zero extension on any microarch.
6982 ///
6983 /// This routine will try to do everything in its power to cleverly lower
6984 /// a shuffle which happens to match the pattern of a zero extend. It doesn't
6985 /// check for the profitability of this lowering,  it tries to aggressively
6986 /// match this pattern. It will use all of the micro-architectural details it
6987 /// can to emit an efficient lowering. It handles both blends with all-zero
6988 /// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
6989 /// masking out later).
6990 ///
6991 /// The reason we have dedicated lowering for zext-style shuffles is that they
6992 /// are both incredibly common and often quite performance sensitive.
6993 static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
6994     SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
6995     const X86Subtarget *Subtarget, SelectionDAG &DAG) {
6996   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
6997
6998   int Bits = VT.getSizeInBits();
6999   int NumElements = VT.getVectorNumElements();
7000   assert(VT.getScalarSizeInBits() <= 32 &&
7001          "Exceeds 32-bit integer zero extension limit");
7002   assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
7003
7004   // Define a helper function to check a particular ext-scale and lower to it if
7005   // valid.
7006   auto Lower = [&](int Scale) -> SDValue {
7007     SDValue InputV;
7008     bool AnyExt = true;
7009     for (int i = 0; i < NumElements; ++i) {
7010       if (Mask[i] == -1)
7011         continue; // Valid anywhere but doesn't tell us anything.
7012       if (i % Scale != 0) {
7013         // Each of the extended elements need to be zeroable.
7014         if (!Zeroable[i])
7015           return SDValue();
7016
7017         // We no longer are in the anyext case.
7018         AnyExt = false;
7019         continue;
7020       }
7021
7022       // Each of the base elements needs to be consecutive indices into the
7023       // same input vector.
7024       SDValue V = Mask[i] < NumElements ? V1 : V2;
7025       if (!InputV)
7026         InputV = V;
7027       else if (InputV != V)
7028         return SDValue(); // Flip-flopping inputs.
7029
7030       if (Mask[i] % NumElements != i / Scale)
7031         return SDValue(); // Non-consecutive strided elements.
7032     }
7033
7034     // If we fail to find an input, we have a zero-shuffle which should always
7035     // have already been handled.
7036     // FIXME: Maybe handle this here in case during blending we end up with one?
7037     if (!InputV)
7038       return SDValue();
7039
7040     return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
7041         DL, VT, Scale, AnyExt, InputV, Subtarget, DAG);
7042   };
7043
7044   // The widest scale possible for extending is to a 64-bit integer.
7045   assert(Bits % 64 == 0 &&
7046          "The number of bits in a vector must be divisible by 64 on x86!");
7047   int NumExtElements = Bits / 64;
7048
7049   // Each iteration, try extending the elements half as much, but into twice as
7050   // many elements.
7051   for (; NumExtElements < NumElements; NumExtElements *= 2) {
7052     assert(NumElements % NumExtElements == 0 &&
7053            "The input vector size must be divisible by the extended size.");
7054     if (SDValue V = Lower(NumElements / NumExtElements))
7055       return V;
7056   }
7057
7058   // General extends failed, but 128-bit vectors may be able to use MOVQ.
7059   if (Bits != 128)
7060     return SDValue();
7061
7062   // Returns one of the source operands if the shuffle can be reduced to a
7063   // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
7064   auto CanZExtLowHalf = [&]() {
7065     for (int i = NumElements / 2; i != NumElements; ++i)
7066       if (!Zeroable[i])
7067         return SDValue();
7068     if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
7069       return V1;
7070     if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
7071       return V2;
7072     return SDValue();
7073   };
7074
7075   if (SDValue V = CanZExtLowHalf()) {
7076     V = DAG.getBitcast(MVT::v2i64, V);
7077     V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
7078     return DAG.getBitcast(VT, V);
7079   }
7080
7081   // No viable ext lowering found.
7082   return SDValue();
7083 }
7084
7085 /// \brief Try to get a scalar value for a specific element of a vector.
7086 ///
7087 /// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
7088 static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
7089                                               SelectionDAG &DAG) {
7090   MVT VT = V.getSimpleValueType();
7091   MVT EltVT = VT.getVectorElementType();
7092   while (V.getOpcode() == ISD::BITCAST)
7093     V = V.getOperand(0);
7094   // If the bitcasts shift the element size, we can't extract an equivalent
7095   // element from it.
7096   MVT NewVT = V.getSimpleValueType();
7097   if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
7098     return SDValue();
7099
7100   if (V.getOpcode() == ISD::BUILD_VECTOR ||
7101       (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
7102     // Ensure the scalar operand is the same size as the destination.
7103     // FIXME: Add support for scalar truncation where possible.
7104     SDValue S = V.getOperand(Idx);
7105     if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
7106       return DAG.getNode(ISD::BITCAST, SDLoc(V), EltVT, S);
7107   }
7108
7109   return SDValue();
7110 }
7111
7112 /// \brief Helper to test for a load that can be folded with x86 shuffles.
7113 ///
7114 /// This is particularly important because the set of instructions varies
7115 /// significantly based on whether the operand is a load or not.
7116 static bool isShuffleFoldableLoad(SDValue V) {
7117   while (V.getOpcode() == ISD::BITCAST)
7118     V = V.getOperand(0);
7119
7120   return ISD::isNON_EXTLoad(V.getNode());
7121 }
7122
7123 /// \brief Try to lower insertion of a single element into a zero vector.
7124 ///
7125 /// This is a common pattern that we have especially efficient patterns to lower
7126 /// across all subtarget feature sets.
7127 static SDValue lowerVectorShuffleAsElementInsertion(
7128     SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
7129     const X86Subtarget *Subtarget, SelectionDAG &DAG) {
7130   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
7131   MVT ExtVT = VT;
7132   MVT EltVT = VT.getVectorElementType();
7133
7134   int V2Index = std::find_if(Mask.begin(), Mask.end(),
7135                              [&Mask](int M) { return M >= (int)Mask.size(); }) -
7136                 Mask.begin();
7137   bool IsV1Zeroable = true;
7138   for (int i = 0, Size = Mask.size(); i < Size; ++i)
7139     if (i != V2Index && !Zeroable[i]) {
7140       IsV1Zeroable = false;
7141       break;
7142     }
7143
7144   // Check for a single input from a SCALAR_TO_VECTOR node.
7145   // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
7146   // all the smarts here sunk into that routine. However, the current
7147   // lowering of BUILD_VECTOR makes that nearly impossible until the old
7148   // vector shuffle lowering is dead.
7149   if (SDValue V2S = getScalarValueForVectorElement(
7150           V2, Mask[V2Index] - Mask.size(), DAG)) {
7151     // We need to zext the scalar if it is smaller than an i32.
7152     V2S = DAG.getBitcast(EltVT, V2S);
7153     if (EltVT == MVT::i8 || EltVT == MVT::i16) {
7154       // Using zext to expand a narrow element won't work for non-zero
7155       // insertions.
7156       if (!IsV1Zeroable)
7157         return SDValue();
7158
7159       // Zero-extend directly to i32.
7160       ExtVT = MVT::v4i32;
7161       V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
7162     }
7163     V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
7164   } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
7165              EltVT == MVT::i16) {
7166     // Either not inserting from the low element of the input or the input
7167     // element size is too small to use VZEXT_MOVL to clear the high bits.
7168     return SDValue();
7169   }
7170
7171   if (!IsV1Zeroable) {
7172     // If V1 can't be treated as a zero vector we have fewer options to lower
7173     // this. We can't support integer vectors or non-zero targets cheaply, and
7174     // the V1 elements can't be permuted in any way.
7175     assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
7176     if (!VT.isFloatingPoint() || V2Index != 0)
7177       return SDValue();
7178     SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
7179     V1Mask[V2Index] = -1;
7180     if (!isNoopShuffleMask(V1Mask))
7181       return SDValue();
7182     // This is essentially a special case blend operation, but if we have
7183     // general purpose blend operations, they are always faster. Bail and let
7184     // the rest of the lowering handle these as blends.
7185     if (Subtarget->hasSSE41())
7186       return SDValue();
7187
7188     // Otherwise, use MOVSD or MOVSS.
7189     assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
7190            "Only two types of floating point element types to handle!");
7191     return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
7192                        ExtVT, V1, V2);
7193   }
7194
7195   // This lowering only works for the low element with floating point vectors.
7196   if (VT.isFloatingPoint() && V2Index != 0)
7197     return SDValue();
7198
7199   V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
7200   if (ExtVT != VT)
7201     V2 = DAG.getBitcast(VT, V2);
7202
7203   if (V2Index != 0) {
7204     // If we have 4 or fewer lanes we can cheaply shuffle the element into
7205     // the desired position. Otherwise it is more efficient to do a vector
7206     // shift left. We know that we can do a vector shift left because all
7207     // the inputs are zero.
7208     if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
7209       SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
7210       V2Shuffle[V2Index] = 0;
7211       V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
7212     } else {
7213       V2 = DAG.getBitcast(MVT::v2i64, V2);
7214       V2 = DAG.getNode(
7215           X86ISD::VSHLDQ, DL, MVT::v2i64, V2,
7216           DAG.getConstant(
7217               V2Index * EltVT.getSizeInBits()/8, DL,
7218               DAG.getTargetLoweringInfo().getScalarShiftAmountTy(MVT::v2i64)));
7219       V2 = DAG.getBitcast(VT, V2);
7220     }
7221   }
7222   return V2;
7223 }
7224
7225 /// \brief Try to lower broadcast of a single element.
7226 ///
7227 /// For convenience, this code also bundles all of the subtarget feature set
7228 /// filtering. While a little annoying to re-dispatch on type here, there isn't
7229 /// a convenient way to factor it out.
7230 static SDValue lowerVectorShuffleAsBroadcast(SDLoc DL, MVT VT, SDValue V,
7231                                              ArrayRef<int> Mask,
7232                                              const X86Subtarget *Subtarget,
7233                                              SelectionDAG &DAG) {
7234   if (!Subtarget->hasAVX())
7235     return SDValue();
7236   if (VT.isInteger() && !Subtarget->hasAVX2())
7237     return SDValue();
7238
7239   // Check that the mask is a broadcast.
7240   int BroadcastIdx = -1;
7241   for (int M : Mask)
7242     if (M >= 0 && BroadcastIdx == -1)
7243       BroadcastIdx = M;
7244     else if (M >= 0 && M != BroadcastIdx)
7245       return SDValue();
7246
7247   assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
7248                                             "a sorted mask where the broadcast "
7249                                             "comes from V1.");
7250
7251   // Go up the chain of (vector) values to find a scalar load that we can
7252   // combine with the broadcast.
7253   for (;;) {
7254     switch (V.getOpcode()) {
7255     case ISD::CONCAT_VECTORS: {
7256       int OperandSize = Mask.size() / V.getNumOperands();
7257       V = V.getOperand(BroadcastIdx / OperandSize);
7258       BroadcastIdx %= OperandSize;
7259       continue;
7260     }
7261
7262     case ISD::INSERT_SUBVECTOR: {
7263       SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
7264       auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
7265       if (!ConstantIdx)
7266         break;
7267
7268       int BeginIdx = (int)ConstantIdx->getZExtValue();
7269       int EndIdx =
7270           BeginIdx + (int)VInner.getValueType().getVectorNumElements();
7271       if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) {
7272         BroadcastIdx -= BeginIdx;
7273         V = VInner;
7274       } else {
7275         V = VOuter;
7276       }
7277       continue;
7278     }
7279     }
7280     break;
7281   }
7282
7283   // Check if this is a broadcast of a scalar. We special case lowering
7284   // for scalars so that we can more effectively fold with loads.
7285   if (V.getOpcode() == ISD::BUILD_VECTOR ||
7286       (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
7287     V = V.getOperand(BroadcastIdx);
7288
7289     // If the scalar isn't a load, we can't broadcast from it in AVX1.
7290     // Only AVX2 has register broadcasts.
7291     if (!Subtarget->hasAVX2() && !isShuffleFoldableLoad(V))
7292       return SDValue();
7293   } else if (BroadcastIdx != 0 || !Subtarget->hasAVX2()) {
7294     // We can't broadcast from a vector register without AVX2, and we can only
7295     // broadcast from the zero-element of a vector register.
7296     return SDValue();
7297   }
7298
7299   return DAG.getNode(X86ISD::VBROADCAST, DL, VT, V);
7300 }
7301
7302 // Check for whether we can use INSERTPS to perform the shuffle. We only use
7303 // INSERTPS when the V1 elements are already in the correct locations
7304 // because otherwise we can just always use two SHUFPS instructions which
7305 // are much smaller to encode than a SHUFPS and an INSERTPS. We can also
7306 // perform INSERTPS if a single V1 element is out of place and all V2
7307 // elements are zeroable.
7308 static SDValue lowerVectorShuffleAsInsertPS(SDValue Op, SDValue V1, SDValue V2,
7309                                             ArrayRef<int> Mask,
7310                                             SelectionDAG &DAG) {
7311   assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!");
7312   assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
7313   assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
7314   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
7315
7316   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
7317
7318   unsigned ZMask = 0;
7319   int V1DstIndex = -1;
7320   int V2DstIndex = -1;
7321   bool V1UsedInPlace = false;
7322
7323   for (int i = 0; i < 4; ++i) {
7324     // Synthesize a zero mask from the zeroable elements (includes undefs).
7325     if (Zeroable[i]) {
7326       ZMask |= 1 << i;
7327       continue;
7328     }
7329
7330     // Flag if we use any V1 inputs in place.
7331     if (i == Mask[i]) {
7332       V1UsedInPlace = true;
7333       continue;
7334     }
7335
7336     // We can only insert a single non-zeroable element.
7337     if (V1DstIndex != -1 || V2DstIndex != -1)
7338       return SDValue();
7339
7340     if (Mask[i] < 4) {
7341       // V1 input out of place for insertion.
7342       V1DstIndex = i;
7343     } else {
7344       // V2 input for insertion.
7345       V2DstIndex = i;
7346     }
7347   }
7348
7349   // Don't bother if we have no (non-zeroable) element for insertion.
7350   if (V1DstIndex == -1 && V2DstIndex == -1)
7351     return SDValue();
7352
7353   // Determine element insertion src/dst indices. The src index is from the
7354   // start of the inserted vector, not the start of the concatenated vector.
7355   unsigned V2SrcIndex = 0;
7356   if (V1DstIndex != -1) {
7357     // If we have a V1 input out of place, we use V1 as the V2 element insertion
7358     // and don't use the original V2 at all.
7359     V2SrcIndex = Mask[V1DstIndex];
7360     V2DstIndex = V1DstIndex;
7361     V2 = V1;
7362   } else {
7363     V2SrcIndex = Mask[V2DstIndex] - 4;
7364   }
7365
7366   // If no V1 inputs are used in place, then the result is created only from
7367   // the zero mask and the V2 insertion - so remove V1 dependency.
7368   if (!V1UsedInPlace)
7369     V1 = DAG.getUNDEF(MVT::v4f32);
7370
7371   unsigned InsertPSMask = V2SrcIndex << 6 | V2DstIndex << 4 | ZMask;
7372   assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
7373
7374   // Insert the V2 element into the desired position.
7375   SDLoc DL(Op);
7376   return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
7377                      DAG.getConstant(InsertPSMask, DL, MVT::i8));
7378 }
7379
7380 /// \brief Try to lower a shuffle as a permute of the inputs followed by an
7381 /// UNPCK instruction.
7382 ///
7383 /// This specifically targets cases where we end up with alternating between
7384 /// the two inputs, and so can permute them into something that feeds a single
7385 /// UNPCK instruction. Note that this routine only targets integer vectors
7386 /// because for floating point vectors we have a generalized SHUFPS lowering
7387 /// strategy that handles everything that doesn't *exactly* match an unpack,
7388 /// making this clever lowering unnecessary.
7389 static SDValue lowerVectorShuffleAsUnpack(SDLoc DL, MVT VT, SDValue V1,
7390                                           SDValue V2, ArrayRef<int> Mask,
7391                                           SelectionDAG &DAG) {
7392   assert(!VT.isFloatingPoint() &&
7393          "This routine only supports integer vectors.");
7394   assert(!isSingleInputShuffleMask(Mask) &&
7395          "This routine should only be used when blending two inputs.");
7396   assert(Mask.size() >= 2 && "Single element masks are invalid.");
7397
7398   int Size = Mask.size();
7399
7400   int NumLoInputs = std::count_if(Mask.begin(), Mask.end(), [Size](int M) {
7401     return M >= 0 && M % Size < Size / 2;
7402   });
7403   int NumHiInputs = std::count_if(
7404       Mask.begin(), Mask.end(), [Size](int M) { return M % Size >= Size / 2; });
7405
7406   bool UnpackLo = NumLoInputs >= NumHiInputs;
7407
7408   auto TryUnpack = [&](MVT UnpackVT, int Scale) {
7409     SmallVector<int, 32> V1Mask(Mask.size(), -1);
7410     SmallVector<int, 32> V2Mask(Mask.size(), -1);
7411
7412     for (int i = 0; i < Size; ++i) {
7413       if (Mask[i] < 0)
7414         continue;
7415
7416       // Each element of the unpack contains Scale elements from this mask.
7417       int UnpackIdx = i / Scale;
7418
7419       // We only handle the case where V1 feeds the first slots of the unpack.
7420       // We rely on canonicalization to ensure this is the case.
7421       if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
7422         return SDValue();
7423
7424       // Setup the mask for this input. The indexing is tricky as we have to
7425       // handle the unpack stride.
7426       SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
7427       VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
7428           Mask[i] % Size;
7429     }
7430
7431     // If we will have to shuffle both inputs to use the unpack, check whether
7432     // we can just unpack first and shuffle the result. If so, skip this unpack.
7433     if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
7434         !isNoopShuffleMask(V2Mask))
7435       return SDValue();
7436
7437     // Shuffle the inputs into place.
7438     V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
7439     V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
7440
7441     // Cast the inputs to the type we will use to unpack them.
7442     V1 = DAG.getBitcast(UnpackVT, V1);
7443     V2 = DAG.getBitcast(UnpackVT, V2);
7444
7445     // Unpack the inputs and cast the result back to the desired type.
7446     return DAG.getBitcast(
7447         VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
7448                         UnpackVT, V1, V2));
7449   };
7450
7451   // We try each unpack from the largest to the smallest to try and find one
7452   // that fits this mask.
7453   int OrigNumElements = VT.getVectorNumElements();
7454   int OrigScalarSize = VT.getScalarSizeInBits();
7455   for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2) {
7456     int Scale = ScalarSize / OrigScalarSize;
7457     int NumElements = OrigNumElements / Scale;
7458     MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), NumElements);
7459     if (SDValue Unpack = TryUnpack(UnpackVT, Scale))
7460       return Unpack;
7461   }
7462
7463   // If none of the unpack-rooted lowerings worked (or were profitable) try an
7464   // initial unpack.
7465   if (NumLoInputs == 0 || NumHiInputs == 0) {
7466     assert((NumLoInputs > 0 || NumHiInputs > 0) &&
7467            "We have to have *some* inputs!");
7468     int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
7469
7470     // FIXME: We could consider the total complexity of the permute of each
7471     // possible unpacking. Or at the least we should consider how many
7472     // half-crossings are created.
7473     // FIXME: We could consider commuting the unpacks.
7474
7475     SmallVector<int, 32> PermMask;
7476     PermMask.assign(Size, -1);
7477     for (int i = 0; i < Size; ++i) {
7478       if (Mask[i] < 0)
7479         continue;
7480
7481       assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
7482
7483       PermMask[i] =
7484           2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
7485     }
7486     return DAG.getVectorShuffle(
7487         VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
7488                             DL, VT, V1, V2),
7489         DAG.getUNDEF(VT), PermMask);
7490   }
7491
7492   return SDValue();
7493 }
7494
7495 /// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
7496 ///
7497 /// This is the basis function for the 2-lane 64-bit shuffles as we have full
7498 /// support for floating point shuffles but not integer shuffles. These
7499 /// instructions will incur a domain crossing penalty on some chips though so
7500 /// it is better to avoid lowering through this for integer vectors where
7501 /// possible.
7502 static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
7503                                        const X86Subtarget *Subtarget,
7504                                        SelectionDAG &DAG) {
7505   SDLoc DL(Op);
7506   assert(Op.getSimpleValueType() == MVT::v2f64 && "Bad shuffle type!");
7507   assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
7508   assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
7509   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
7510   ArrayRef<int> Mask = SVOp->getMask();
7511   assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
7512
7513   if (isSingleInputShuffleMask(Mask)) {
7514     // Use low duplicate instructions for masks that match their pattern.
7515     if (Subtarget->hasSSE3())
7516       if (isShuffleEquivalent(V1, V2, Mask, {0, 0}))
7517         return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, V1);
7518
7519     // Straight shuffle of a single input vector. Simulate this by using the
7520     // single input as both of the "inputs" to this instruction..
7521     unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
7522
7523     if (Subtarget->hasAVX()) {
7524       // If we have AVX, we can use VPERMILPS which will allow folding a load
7525       // into the shuffle.
7526       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
7527                          DAG.getConstant(SHUFPDMask, DL, MVT::i8));
7528     }
7529
7530     return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V1,
7531                        DAG.getConstant(SHUFPDMask, DL, MVT::i8));
7532   }
7533   assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!");
7534   assert(Mask[1] >= 2 && "Non-canonicalized blend!");
7535
7536   // If we have a single input, insert that into V1 if we can do so cheaply.
7537   if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
7538     if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
7539             DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG))
7540       return Insertion;
7541     // Try inverting the insertion since for v2 masks it is easy to do and we
7542     // can't reliably sort the mask one way or the other.
7543     int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
7544                           Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
7545     if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
7546             DL, MVT::v2f64, V2, V1, InverseMask, Subtarget, DAG))
7547       return Insertion;
7548   }
7549
7550   // Try to use one of the special instruction patterns to handle two common
7551   // blend patterns if a zero-blend above didn't work.
7552   if (isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
7553       isShuffleEquivalent(V1, V2, Mask, {1, 3}))
7554     if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
7555       // We can either use a special instruction to load over the low double or
7556       // to move just the low double.
7557       return DAG.getNode(
7558           isShuffleFoldableLoad(V1S) ? X86ISD::MOVLPD : X86ISD::MOVSD,
7559           DL, MVT::v2f64, V2,
7560           DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
7561
7562   if (Subtarget->hasSSE41())
7563     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
7564                                                   Subtarget, DAG))
7565       return Blend;
7566
7567   // Use dedicated unpack instructions for masks that match their pattern.
7568   if (isShuffleEquivalent(V1, V2, Mask, {0, 2}))
7569     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2f64, V1, V2);
7570   if (isShuffleEquivalent(V1, V2, Mask, {1, 3}))
7571     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2f64, V1, V2);
7572
7573   unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
7574   return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
7575                      DAG.getConstant(SHUFPDMask, DL, MVT::i8));
7576 }
7577
7578 /// \brief Handle lowering of 2-lane 64-bit integer shuffles.
7579 ///
7580 /// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
7581 /// the integer unit to minimize domain crossing penalties. However, for blends
7582 /// it falls back to the floating point shuffle operation with appropriate bit
7583 /// casting.
7584 static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
7585                                        const X86Subtarget *Subtarget,
7586                                        SelectionDAG &DAG) {
7587   SDLoc DL(Op);
7588   assert(Op.getSimpleValueType() == MVT::v2i64 && "Bad shuffle type!");
7589   assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
7590   assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
7591   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
7592   ArrayRef<int> Mask = SVOp->getMask();
7593   assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
7594
7595   if (isSingleInputShuffleMask(Mask)) {
7596     // Check for being able to broadcast a single element.
7597     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v2i64, V1,
7598                                                           Mask, Subtarget, DAG))
7599       return Broadcast;
7600
7601     // Straight shuffle of a single input vector. For everything from SSE2
7602     // onward this has a single fast instruction with no scary immediates.
7603     // We have to map the mask as it is actually a v4i32 shuffle instruction.
7604     V1 = DAG.getBitcast(MVT::v4i32, V1);
7605     int WidenedMask[4] = {
7606         std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
7607         std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
7608     return DAG.getBitcast(
7609         MVT::v2i64,
7610         DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
7611                     getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
7612   }
7613   assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
7614   assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
7615   assert(Mask[0] < 2 && "We sort V1 to be the first input.");
7616   assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
7617
7618   // If we have a blend of two PACKUS operations an the blend aligns with the
7619   // low and half halves, we can just merge the PACKUS operations. This is
7620   // particularly important as it lets us merge shuffles that this routine itself
7621   // creates.
7622   auto GetPackNode = [](SDValue V) {
7623     while (V.getOpcode() == ISD::BITCAST)
7624       V = V.getOperand(0);
7625
7626     return V.getOpcode() == X86ISD::PACKUS ? V : SDValue();
7627   };
7628   if (SDValue V1Pack = GetPackNode(V1))
7629     if (SDValue V2Pack = GetPackNode(V2))
7630       return DAG.getBitcast(MVT::v2i64,
7631                             DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8,
7632                                         Mask[0] == 0 ? V1Pack.getOperand(0)
7633                                                      : V1Pack.getOperand(1),
7634                                         Mask[1] == 2 ? V2Pack.getOperand(0)
7635                                                      : V2Pack.getOperand(1)));
7636
7637   // Try to use shift instructions.
7638   if (SDValue Shift =
7639           lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask, DAG))
7640     return Shift;
7641
7642   // When loading a scalar and then shuffling it into a vector we can often do
7643   // the insertion cheaply.
7644   if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
7645           DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
7646     return Insertion;
7647   // Try inverting the insertion since for v2 masks it is easy to do and we
7648   // can't reliably sort the mask one way or the other.
7649   int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
7650   if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
7651           DL, MVT::v2i64, V2, V1, InverseMask, Subtarget, DAG))
7652     return Insertion;
7653
7654   // We have different paths for blend lowering, but they all must use the
7655   // *exact* same predicate.
7656   bool IsBlendSupported = Subtarget->hasSSE41();
7657   if (IsBlendSupported)
7658     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
7659                                                   Subtarget, DAG))
7660       return Blend;
7661
7662   // Use dedicated unpack instructions for masks that match their pattern.
7663   if (isShuffleEquivalent(V1, V2, Mask, {0, 2}))
7664     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, V1, V2);
7665   if (isShuffleEquivalent(V1, V2, Mask, {1, 3}))
7666     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2i64, V1, V2);
7667
7668   // Try to use byte rotation instructions.
7669   // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
7670   if (Subtarget->hasSSSE3())
7671     if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
7672             DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
7673       return Rotate;
7674
7675   // If we have direct support for blends, we should lower by decomposing into
7676   // a permute. That will be faster than the domain cross.
7677   if (IsBlendSupported)
7678     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2,
7679                                                       Mask, DAG);
7680
7681   // We implement this with SHUFPD which is pretty lame because it will likely
7682   // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
7683   // However, all the alternatives are still more cycles and newer chips don't
7684   // have this problem. It would be really nice if x86 had better shuffles here.
7685   V1 = DAG.getBitcast(MVT::v2f64, V1);
7686   V2 = DAG.getBitcast(MVT::v2f64, V2);
7687   return DAG.getBitcast(MVT::v2i64,
7688                         DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
7689 }
7690
7691 /// \brief Test whether this can be lowered with a single SHUFPS instruction.
7692 ///
7693 /// This is used to disable more specialized lowerings when the shufps lowering
7694 /// will happen to be efficient.
7695 static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
7696   // This routine only handles 128-bit shufps.
7697   assert(Mask.size() == 4 && "Unsupported mask size!");
7698
7699   // To lower with a single SHUFPS we need to have the low half and high half
7700   // each requiring a single input.
7701   if (Mask[0] != -1 && Mask[1] != -1 && (Mask[0] < 4) != (Mask[1] < 4))
7702     return false;
7703   if (Mask[2] != -1 && Mask[3] != -1 && (Mask[2] < 4) != (Mask[3] < 4))
7704     return false;
7705
7706   return true;
7707 }
7708
7709 /// \brief Lower a vector shuffle using the SHUFPS instruction.
7710 ///
7711 /// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
7712 /// It makes no assumptions about whether this is the *best* lowering, it simply
7713 /// uses it.
7714 static SDValue lowerVectorShuffleWithSHUFPS(SDLoc DL, MVT VT,
7715                                             ArrayRef<int> Mask, SDValue V1,
7716                                             SDValue V2, SelectionDAG &DAG) {
7717   SDValue LowV = V1, HighV = V2;
7718   int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};
7719
7720   int NumV2Elements =
7721       std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
7722
7723   if (NumV2Elements == 1) {
7724     int V2Index =
7725         std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) -
7726         Mask.begin();
7727
7728     // Compute the index adjacent to V2Index and in the same half by toggling
7729     // the low bit.
7730     int V2AdjIndex = V2Index ^ 1;
7731
7732     if (Mask[V2AdjIndex] == -1) {
7733       // Handles all the cases where we have a single V2 element and an undef.
7734       // This will only ever happen in the high lanes because we commute the
7735       // vector otherwise.
7736       if (V2Index < 2)
7737         std::swap(LowV, HighV);
7738       NewMask[V2Index] -= 4;
7739     } else {
7740       // Handle the case where the V2 element ends up adjacent to a V1 element.
7741       // To make this work, blend them together as the first step.
7742       int V1Index = V2AdjIndex;
7743       int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
7744       V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
7745                        getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
7746
7747       // Now proceed to reconstruct the final blend as we have the necessary
7748       // high or low half formed.
7749       if (V2Index < 2) {
7750         LowV = V2;
7751         HighV = V1;
7752       } else {
7753         HighV = V2;
7754       }
7755       NewMask[V1Index] = 2; // We put the V1 element in V2[2].
7756       NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
7757     }
7758   } else if (NumV2Elements == 2) {
7759     if (Mask[0] < 4 && Mask[1] < 4) {
7760       // Handle the easy case where we have V1 in the low lanes and V2 in the
7761       // high lanes.
7762       NewMask[2] -= 4;
7763       NewMask[3] -= 4;
7764     } else if (Mask[2] < 4 && Mask[3] < 4) {
7765       // We also handle the reversed case because this utility may get called
7766       // when we detect a SHUFPS pattern but can't easily commute the shuffle to
7767       // arrange things in the right direction.
7768       NewMask[0] -= 4;
7769       NewMask[1] -= 4;
7770       HighV = V1;
7771       LowV = V2;
7772     } else {
7773       // We have a mixture of V1 and V2 in both low and high lanes. Rather than
7774       // trying to place elements directly, just blend them and set up the final
7775       // shuffle to place them.
7776
7777       // The first two blend mask elements are for V1, the second two are for
7778       // V2.
7779       int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
7780                           Mask[2] < 4 ? Mask[2] : Mask[3],
7781                           (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
7782                           (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
7783       V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
7784                        getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
7785
7786       // Now we do a normal shuffle of V1 by giving V1 as both operands to
7787       // a blend.
7788       LowV = HighV = V1;
7789       NewMask[0] = Mask[0] < 4 ? 0 : 2;
7790       NewMask[1] = Mask[0] < 4 ? 2 : 0;
7791       NewMask[2] = Mask[2] < 4 ? 1 : 3;
7792       NewMask[3] = Mask[2] < 4 ? 3 : 1;
7793     }
7794   }
7795   return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
7796                      getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
7797 }
7798
7799 /// \brief Lower 4-lane 32-bit floating point shuffles.
7800 ///
7801 /// Uses instructions exclusively from the floating point unit to minimize
7802 /// domain crossing penalties, as these are sufficient to implement all v4f32
7803 /// shuffles.
7804 static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
7805                                        const X86Subtarget *Subtarget,
7806                                        SelectionDAG &DAG) {
7807   SDLoc DL(Op);
7808   assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!");
7809   assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
7810   assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
7811   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
7812   ArrayRef<int> Mask = SVOp->getMask();
7813   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
7814
7815   int NumV2Elements =
7816       std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
7817
7818   if (NumV2Elements == 0) {
7819     // Check for being able to broadcast a single element.
7820     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4f32, V1,
7821                                                           Mask, Subtarget, DAG))
7822       return Broadcast;
7823
7824     // Use even/odd duplicate instructions for masks that match their pattern.
7825     if (Subtarget->hasSSE3()) {
7826       if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
7827         return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
7828       if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3}))
7829         return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
7830     }
7831
7832     if (Subtarget->hasAVX()) {
7833       // If we have AVX, we can use VPERMILPS which will allow folding a load
7834       // into the shuffle.
7835       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
7836                          getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
7837     }
7838
7839     // Otherwise, use a straight shuffle of a single input vector. We pass the
7840     // input vector to both operands to simulate this with a SHUFPS.
7841     return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
7842                        getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
7843   }
7844
7845   // There are special ways we can lower some single-element blends. However, we
7846   // have custom ways we can lower more complex single-element blends below that
7847   // we defer to if both this and BLENDPS fail to match, so restrict this to
7848   // when the V2 input is targeting element 0 of the mask -- that is the fast
7849   // case here.
7850   if (NumV2Elements == 1 && Mask[0] >= 4)
7851     if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v4f32, V1, V2,
7852                                                          Mask, Subtarget, DAG))
7853       return V;
7854
7855   if (Subtarget->hasSSE41()) {
7856     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
7857                                                   Subtarget, DAG))
7858       return Blend;
7859
7860     // Use INSERTPS if we can complete the shuffle efficiently.
7861     if (SDValue V = lowerVectorShuffleAsInsertPS(Op, V1, V2, Mask, DAG))
7862       return V;
7863
7864     if (!isSingleSHUFPSMask(Mask))
7865       if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute(
7866               DL, MVT::v4f32, V1, V2, Mask, DAG))
7867         return BlendPerm;
7868   }
7869
7870   // Use dedicated unpack instructions for masks that match their pattern.
7871   if (isShuffleEquivalent(V1, V2, Mask, {0, 4, 1, 5}))
7872     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f32, V1, V2);
7873   if (isShuffleEquivalent(V1, V2, Mask, {2, 6, 3, 7}))
7874     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V1, V2);
7875   if (isShuffleEquivalent(V1, V2, Mask, {4, 0, 5, 1}))
7876     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f32, V2, V1);
7877   if (isShuffleEquivalent(V1, V2, Mask, {6, 2, 7, 3}))
7878     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V2, V1);
7879
7880   // Otherwise fall back to a SHUFPS lowering strategy.
7881   return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
7882 }
7883
7884 /// \brief Lower 4-lane i32 vector shuffles.
7885 ///
7886 /// We try to handle these with integer-domain shuffles where we can, but for
7887 /// blends we use the floating point domain blend instructions.
7888 static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
7889                                        const X86Subtarget *Subtarget,
7890                                        SelectionDAG &DAG) {
7891   SDLoc DL(Op);
7892   assert(Op.getSimpleValueType() == MVT::v4i32 && "Bad shuffle type!");
7893   assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
7894   assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
7895   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
7896   ArrayRef<int> Mask = SVOp->getMask();
7897   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
7898
7899   // Whenever we can lower this as a zext, that instruction is strictly faster
7900   // than any alternative. It also allows us to fold memory operands into the
7901   // shuffle in many cases.
7902   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2,
7903                                                          Mask, Subtarget, DAG))
7904     return ZExt;
7905
7906   int NumV2Elements =
7907       std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
7908
7909   if (NumV2Elements == 0) {
7910     // Check for being able to broadcast a single element.
7911     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i32, V1,
7912                                                           Mask, Subtarget, DAG))
7913       return Broadcast;
7914
7915     // Straight shuffle of a single input vector. For everything from SSE2
7916     // onward this has a single fast instruction with no scary immediates.
7917     // We coerce the shuffle pattern to be compatible with UNPCK instructions
7918     // but we aren't actually going to use the UNPCK instruction because doing
7919     // so prevents folding a load into this instruction or making a copy.
7920     const int UnpackLoMask[] = {0, 0, 1, 1};
7921     const int UnpackHiMask[] = {2, 2, 3, 3};
7922     if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 1, 1}))
7923       Mask = UnpackLoMask;
7924     else if (isShuffleEquivalent(V1, V2, Mask, {2, 2, 3, 3}))
7925       Mask = UnpackHiMask;
7926
7927     return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
7928                        getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
7929   }
7930
7931   // Try to use shift instructions.
7932   if (SDValue Shift =
7933           lowerVectorShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, DAG))
7934     return Shift;
7935
7936   // There are special ways we can lower some single-element blends.
7937   if (NumV2Elements == 1)
7938     if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v4i32, V1, V2,
7939                                                          Mask, Subtarget, DAG))
7940       return V;
7941
7942   // We have different paths for blend lowering, but they all must use the
7943   // *exact* same predicate.
7944   bool IsBlendSupported = Subtarget->hasSSE41();
7945   if (IsBlendSupported)
7946     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
7947                                                   Subtarget, DAG))
7948       return Blend;
7949
7950   if (SDValue Masked =
7951           lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask, DAG))
7952     return Masked;
7953
7954   // Use dedicated unpack instructions for masks that match their pattern.
7955   if (isShuffleEquivalent(V1, V2, Mask, {0, 4, 1, 5}))
7956     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i32, V1, V2);
7957   if (isShuffleEquivalent(V1, V2, Mask, {2, 6, 3, 7}))
7958     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V1, V2);
7959   if (isShuffleEquivalent(V1, V2, Mask, {4, 0, 5, 1}))
7960     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i32, V2, V1);
7961   if (isShuffleEquivalent(V1, V2, Mask, {6, 2, 7, 3}))
7962     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V2, V1);
7963
7964   // Try to use byte rotation instructions.
7965   // Its more profitable for pre-SSSE3 to use shuffles/unpacks.
7966   if (Subtarget->hasSSSE3())
7967     if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
7968             DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
7969       return Rotate;
7970
7971   // If we have direct support for blends, we should lower by decomposing into
7972   // a permute. That will be faster than the domain cross.
7973   if (IsBlendSupported)
7974     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2,
7975                                                       Mask, DAG);
7976
7977   // Try to lower by permuting the inputs into an unpack instruction.
7978   if (SDValue Unpack =
7979           lowerVectorShuffleAsUnpack(DL, MVT::v4i32, V1, V2, Mask, DAG))
7980     return Unpack;
7981
7982   // We implement this with SHUFPS because it can blend from two vectors.
7983   // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
7984   // up the inputs, bypassing domain shift penalties that we would encur if we
7985   // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
7986   // relevant.
7987   return DAG.getBitcast(
7988       MVT::v4i32,
7989       DAG.getVectorShuffle(MVT::v4f32, DL, DAG.getBitcast(MVT::v4f32, V1),
7990                            DAG.getBitcast(MVT::v4f32, V2), Mask));
7991 }
7992
7993 /// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
7994 /// shuffle lowering, and the most complex part.
7995 ///
7996 /// The lowering strategy is to try to form pairs of input lanes which are
7997 /// targeted at the same half of the final vector, and then use a dword shuffle
7998 /// to place them onto the right half, and finally unpack the paired lanes into
7999 /// their final position.
8000 ///
8001 /// The exact breakdown of how to form these dword pairs and align them on the
8002 /// correct sides is really tricky. See the comments within the function for
8003 /// more of the details.
8004 ///
8005 /// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
8006 /// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
8007 /// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
8008 /// vector, form the analogous 128-bit 8-element Mask.
8009 static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
8010     SDLoc DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
8011     const X86Subtarget *Subtarget, SelectionDAG &DAG) {
8012   assert(VT.getScalarType() == MVT::i16 && "Bad input type!");
8013   MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
8014
8015   assert(Mask.size() == 8 && "Shuffle mask length doen't match!");
8016   MutableArrayRef<int> LoMask = Mask.slice(0, 4);
8017   MutableArrayRef<int> HiMask = Mask.slice(4, 4);
8018
8019   SmallVector<int, 4> LoInputs;
8020   std::copy_if(LoMask.begin(), LoMask.end(), std::back_inserter(LoInputs),
8021                [](int M) { return M >= 0; });
8022   std::sort(LoInputs.begin(), LoInputs.end());
8023   LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
8024   SmallVector<int, 4> HiInputs;
8025   std::copy_if(HiMask.begin(), HiMask.end(), std::back_inserter(HiInputs),
8026                [](int M) { return M >= 0; });
8027   std::sort(HiInputs.begin(), HiInputs.end());
8028   HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
8029   int NumLToL =
8030       std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin();
8031   int NumHToL = LoInputs.size() - NumLToL;
8032   int NumLToH =
8033       std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin();
8034   int NumHToH = HiInputs.size() - NumLToH;
8035   MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
8036   MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
8037   MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
8038   MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
8039
8040   // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
8041   // such inputs we can swap two of the dwords across the half mark and end up
8042   // with <=2 inputs to each half in each half. Once there, we can fall through
8043   // to the generic code below. For example:
8044   //
8045   // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
8046   // Mask:  [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
8047   //
8048   // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
8049   // and an existing 2-into-2 on the other half. In this case we may have to
8050   // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
8051   // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
8052   // Fortunately, we don't have to handle anything but a 2-into-2 pattern
8053   // because any other situation (including a 3-into-1 or 1-into-3 in the other
8054   // half than the one we target for fixing) will be fixed when we re-enter this
8055   // path. We will also combine away any sequence of PSHUFD instructions that
8056   // result into a single instruction. Here is an example of the tricky case:
8057   //
8058   // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
8059   // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
8060   //
8061   // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
8062   //
8063   // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
8064   // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
8065   //
8066   // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
8067   // Mask:  [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
8068   //
8069   // The result is fine to be handled by the generic logic.
8070   auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
8071                           ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
8072                           int AOffset, int BOffset) {
8073     assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
8074            "Must call this with A having 3 or 1 inputs from the A half.");
8075     assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
8076            "Must call this with B having 1 or 3 inputs from the B half.");
8077     assert(AToAInputs.size() + BToAInputs.size() == 4 &&
8078            "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
8079
8080     // Compute the index of dword with only one word among the three inputs in
8081     // a half by taking the sum of the half with three inputs and subtracting
8082     // the sum of the actual three inputs. The difference is the remaining
8083     // slot.
8084     int ADWord, BDWord;
8085     int &TripleDWord = AToAInputs.size() == 3 ? ADWord : BDWord;
8086     int &OneInputDWord = AToAInputs.size() == 3 ? BDWord : ADWord;
8087     int TripleInputOffset = AToAInputs.size() == 3 ? AOffset : BOffset;
8088     ArrayRef<int> TripleInputs = AToAInputs.size() == 3 ? AToAInputs : BToAInputs;
8089     int OneInput = AToAInputs.size() == 3 ? BToAInputs[0] : AToAInputs[0];
8090     int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
8091     int TripleNonInputIdx =
8092         TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
8093     TripleDWord = TripleNonInputIdx / 2;
8094
8095     // We use xor with one to compute the adjacent DWord to whichever one the
8096     // OneInput is in.
8097     OneInputDWord = (OneInput / 2) ^ 1;
8098
8099     // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
8100     // and BToA inputs. If there is also such a problem with the BToB and AToB
8101     // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
8102     // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
8103     // is essential that we don't *create* a 3<-1 as then we might oscillate.
8104     if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
8105       // Compute how many inputs will be flipped by swapping these DWords. We
8106       // need
8107       // to balance this to ensure we don't form a 3-1 shuffle in the other
8108       // half.
8109       int NumFlippedAToBInputs =
8110           std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
8111           std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
8112       int NumFlippedBToBInputs =
8113           std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
8114           std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
8115       if ((NumFlippedAToBInputs == 1 &&
8116            (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
8117           (NumFlippedBToBInputs == 1 &&
8118            (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
8119         // We choose whether to fix the A half or B half based on whether that
8120         // half has zero flipped inputs. At zero, we may not be able to fix it
8121         // with that half. We also bias towards fixing the B half because that
8122         // will more commonly be the high half, and we have to bias one way.
8123         auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
8124                                                        ArrayRef<int> Inputs) {
8125           int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
8126           bool IsFixIdxInput = std::find(Inputs.begin(), Inputs.end(),
8127                                          PinnedIdx ^ 1) != Inputs.end();
8128           // Determine whether the free index is in the flipped dword or the
8129           // unflipped dword based on where the pinned index is. We use this bit
8130           // in an xor to conditionally select the adjacent dword.
8131           int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
8132           bool IsFixFreeIdxInput = std::find(Inputs.begin(), Inputs.end(),
8133                                              FixFreeIdx) != Inputs.end();
8134           if (IsFixIdxInput == IsFixFreeIdxInput)
8135             FixFreeIdx += 1;
8136           IsFixFreeIdxInput = std::find(Inputs.begin(), Inputs.end(),
8137                                         FixFreeIdx) != Inputs.end();
8138           assert(IsFixIdxInput != IsFixFreeIdxInput &&
8139                  "We need to be changing the number of flipped inputs!");
8140           int PSHUFHalfMask[] = {0, 1, 2, 3};
8141           std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
8142           V = DAG.getNode(FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
8143                           MVT::v8i16, V,
8144                           getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
8145
8146           for (int &M : Mask)
8147             if (M != -1 && M == FixIdx)
8148               M = FixFreeIdx;
8149             else if (M != -1 && M == FixFreeIdx)
8150               M = FixIdx;
8151         };
8152         if (NumFlippedBToBInputs != 0) {
8153           int BPinnedIdx =
8154               BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
8155           FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
8156         } else {
8157           assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
8158           int APinnedIdx =
8159               AToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
8160           FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
8161         }
8162       }
8163     }
8164
8165     int PSHUFDMask[] = {0, 1, 2, 3};
8166     PSHUFDMask[ADWord] = BDWord;
8167     PSHUFDMask[BDWord] = ADWord;
8168     V = DAG.getBitcast(
8169         VT,
8170         DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
8171                     getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
8172
8173     // Adjust the mask to match the new locations of A and B.
8174     for (int &M : Mask)
8175       if (M != -1 && M/2 == ADWord)
8176         M = 2 * BDWord + M % 2;
8177       else if (M != -1 && M/2 == BDWord)
8178         M = 2 * ADWord + M % 2;
8179
8180     // Recurse back into this routine to re-compute state now that this isn't
8181     // a 3 and 1 problem.
8182     return lowerV8I16GeneralSingleInputVectorShuffle(DL, VT, V, Mask, Subtarget,
8183                                                      DAG);
8184   };
8185   if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
8186     return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
8187   else if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
8188     return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
8189
8190   // At this point there are at most two inputs to the low and high halves from
8191   // each half. That means the inputs can always be grouped into dwords and
8192   // those dwords can then be moved to the correct half with a dword shuffle.
8193   // We use at most one low and one high word shuffle to collect these paired
8194   // inputs into dwords, and finally a dword shuffle to place them.
8195   int PSHUFLMask[4] = {-1, -1, -1, -1};
8196   int PSHUFHMask[4] = {-1, -1, -1, -1};
8197   int PSHUFDMask[4] = {-1, -1, -1, -1};
8198
8199   // First fix the masks for all the inputs that are staying in their
8200   // original halves. This will then dictate the targets of the cross-half
8201   // shuffles.
8202   auto fixInPlaceInputs =
8203       [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
8204                     MutableArrayRef<int> SourceHalfMask,
8205                     MutableArrayRef<int> HalfMask, int HalfOffset) {
8206     if (InPlaceInputs.empty())
8207       return;
8208     if (InPlaceInputs.size() == 1) {
8209       SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
8210           InPlaceInputs[0] - HalfOffset;
8211       PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
8212       return;
8213     }
8214     if (IncomingInputs.empty()) {
8215       // Just fix all of the in place inputs.
8216       for (int Input : InPlaceInputs) {
8217         SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
8218         PSHUFDMask[Input / 2] = Input / 2;
8219       }
8220       return;
8221     }
8222
8223     assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
8224     SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
8225         InPlaceInputs[0] - HalfOffset;
8226     // Put the second input next to the first so that they are packed into
8227     // a dword. We find the adjacent index by toggling the low bit.
8228     int AdjIndex = InPlaceInputs[0] ^ 1;
8229     SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
8230     std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
8231     PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
8232   };
8233   fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
8234   fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
8235
8236   // Now gather the cross-half inputs and place them into a free dword of
8237   // their target half.
8238   // FIXME: This operation could almost certainly be simplified dramatically to
8239   // look more like the 3-1 fixing operation.
8240   auto moveInputsToRightHalf = [&PSHUFDMask](
8241       MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
8242       MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
8243       MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
8244       int DestOffset) {
8245     auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
8246       return SourceHalfMask[Word] != -1 && SourceHalfMask[Word] != Word;
8247     };
8248     auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
8249                                                int Word) {
8250       int LowWord = Word & ~1;
8251       int HighWord = Word | 1;
8252       return isWordClobbered(SourceHalfMask, LowWord) ||
8253              isWordClobbered(SourceHalfMask, HighWord);
8254     };
8255
8256     if (IncomingInputs.empty())
8257       return;
8258
8259     if (ExistingInputs.empty()) {
8260       // Map any dwords with inputs from them into the right half.
8261       for (int Input : IncomingInputs) {
8262         // If the source half mask maps over the inputs, turn those into
8263         // swaps and use the swapped lane.
8264         if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
8265           if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] == -1) {
8266             SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
8267                 Input - SourceOffset;
8268             // We have to swap the uses in our half mask in one sweep.
8269             for (int &M : HalfMask)
8270               if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
8271                 M = Input;
8272               else if (M == Input)
8273                 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
8274           } else {
8275             assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
8276                        Input - SourceOffset &&
8277                    "Previous placement doesn't match!");
8278           }
8279           // Note that this correctly re-maps both when we do a swap and when
8280           // we observe the other side of the swap above. We rely on that to
8281           // avoid swapping the members of the input list directly.
8282           Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
8283         }
8284
8285         // Map the input's dword into the correct half.
8286         if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] == -1)
8287           PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
8288         else
8289           assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
8290                      Input / 2 &&
8291                  "Previous placement doesn't match!");
8292       }
8293
8294       // And just directly shift any other-half mask elements to be same-half
8295       // as we will have mirrored the dword containing the element into the
8296       // same position within that half.
8297       for (int &M : HalfMask)
8298         if (M >= SourceOffset && M < SourceOffset + 4) {
8299           M = M - SourceOffset + DestOffset;
8300           assert(M >= 0 && "This should never wrap below zero!");
8301         }
8302       return;
8303     }
8304
8305     // Ensure we have the input in a viable dword of its current half. This
8306     // is particularly tricky because the original position may be clobbered
8307     // by inputs being moved and *staying* in that half.
8308     if (IncomingInputs.size() == 1) {
8309       if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
8310         int InputFixed = std::find(std::begin(SourceHalfMask),
8311                                    std::end(SourceHalfMask), -1) -
8312                          std::begin(SourceHalfMask) + SourceOffset;
8313         SourceHalfMask[InputFixed - SourceOffset] =
8314             IncomingInputs[0] - SourceOffset;
8315         std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
8316                      InputFixed);
8317         IncomingInputs[0] = InputFixed;
8318       }
8319     } else if (IncomingInputs.size() == 2) {
8320       if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
8321           isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
8322         // We have two non-adjacent or clobbered inputs we need to extract from
8323         // the source half. To do this, we need to map them into some adjacent
8324         // dword slot in the source mask.
8325         int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
8326                               IncomingInputs[1] - SourceOffset};
8327
8328         // If there is a free slot in the source half mask adjacent to one of
8329         // the inputs, place the other input in it. We use (Index XOR 1) to
8330         // compute an adjacent index.
8331         if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
8332             SourceHalfMask[InputsFixed[0] ^ 1] == -1) {
8333           SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
8334           SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
8335           InputsFixed[1] = InputsFixed[0] ^ 1;
8336         } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
8337                    SourceHalfMask[InputsFixed[1] ^ 1] == -1) {
8338           SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
8339           SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
8340           InputsFixed[0] = InputsFixed[1] ^ 1;
8341         } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] == -1 &&
8342                    SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] == -1) {
8343           // The two inputs are in the same DWord but it is clobbered and the
8344           // adjacent DWord isn't used at all. Move both inputs to the free
8345           // slot.
8346           SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
8347           SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
8348           InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
8349           InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
8350         } else {
8351           // The only way we hit this point is if there is no clobbering
8352           // (because there are no off-half inputs to this half) and there is no
8353           // free slot adjacent to one of the inputs. In this case, we have to
8354           // swap an input with a non-input.
8355           for (int i = 0; i < 4; ++i)
8356             assert((SourceHalfMask[i] == -1 || SourceHalfMask[i] == i) &&
8357                    "We can't handle any clobbers here!");
8358           assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
8359                  "Cannot have adjacent inputs here!");
8360
8361           SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
8362           SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
8363
8364           // We also have to update the final source mask in this case because
8365           // it may need to undo the above swap.
8366           for (int &M : FinalSourceHalfMask)
8367             if (M == (InputsFixed[0] ^ 1) + SourceOffset)
8368               M = InputsFixed[1] + SourceOffset;
8369             else if (M == InputsFixed[1] + SourceOffset)
8370               M = (InputsFixed[0] ^ 1) + SourceOffset;
8371
8372           InputsFixed[1] = InputsFixed[0] ^ 1;
8373         }
8374
8375         // Point everything at the fixed inputs.
8376         for (int &M : HalfMask)
8377           if (M == IncomingInputs[0])
8378             M = InputsFixed[0] + SourceOffset;
8379           else if (M == IncomingInputs[1])
8380             M = InputsFixed[1] + SourceOffset;
8381
8382         IncomingInputs[0] = InputsFixed[0] + SourceOffset;
8383         IncomingInputs[1] = InputsFixed[1] + SourceOffset;
8384       }
8385     } else {
8386       llvm_unreachable("Unhandled input size!");
8387     }
8388
8389     // Now hoist the DWord down to the right half.
8390     int FreeDWord = (PSHUFDMask[DestOffset / 2] == -1 ? 0 : 1) + DestOffset / 2;
8391     assert(PSHUFDMask[FreeDWord] == -1 && "DWord not free");
8392     PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
8393     for (int &M : HalfMask)
8394       for (int Input : IncomingInputs)
8395         if (M == Input)
8396           M = FreeDWord * 2 + Input % 2;
8397   };
8398   moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
8399                         /*SourceOffset*/ 4, /*DestOffset*/ 0);
8400   moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
8401                         /*SourceOffset*/ 0, /*DestOffset*/ 4);
8402
8403   // Now enact all the shuffles we've computed to move the inputs into their
8404   // target half.
8405   if (!isNoopShuffleMask(PSHUFLMask))
8406     V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
8407                     getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
8408   if (!isNoopShuffleMask(PSHUFHMask))
8409     V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
8410                     getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
8411   if (!isNoopShuffleMask(PSHUFDMask))
8412     V = DAG.getBitcast(
8413         VT,
8414         DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
8415                     getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
8416
8417   // At this point, each half should contain all its inputs, and we can then
8418   // just shuffle them into their final position.
8419   assert(std::count_if(LoMask.begin(), LoMask.end(),
8420                        [](int M) { return M >= 4; }) == 0 &&
8421          "Failed to lift all the high half inputs to the low mask!");
8422   assert(std::count_if(HiMask.begin(), HiMask.end(),
8423                        [](int M) { return M >= 0 && M < 4; }) == 0 &&
8424          "Failed to lift all the low half inputs to the high mask!");
8425
8426   // Do a half shuffle for the low mask.
8427   if (!isNoopShuffleMask(LoMask))
8428     V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
8429                     getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
8430
8431   // Do a half shuffle with the high mask after shifting its values down.
8432   for (int &M : HiMask)
8433     if (M >= 0)
8434       M -= 4;
8435   if (!isNoopShuffleMask(HiMask))
8436     V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
8437                     getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
8438
8439   return V;
8440 }
8441
8442 /// \brief Helper to form a PSHUFB-based shuffle+blend.
8443 static SDValue lowerVectorShuffleAsPSHUFB(SDLoc DL, MVT VT, SDValue V1,
8444                                           SDValue V2, ArrayRef<int> Mask,
8445                                           SelectionDAG &DAG, bool &V1InUse,
8446                                           bool &V2InUse) {
8447   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
8448   SDValue V1Mask[16];
8449   SDValue V2Mask[16];
8450   V1InUse = false;
8451   V2InUse = false;
8452
8453   int Size = Mask.size();
8454   int Scale = 16 / Size;
8455   for (int i = 0; i < 16; ++i) {
8456     if (Mask[i / Scale] == -1) {
8457       V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8);
8458     } else {
8459       const int ZeroMask = 0x80;
8460       int V1Idx = Mask[i / Scale] < Size ? Mask[i / Scale] * Scale + i % Scale
8461                                           : ZeroMask;
8462       int V2Idx = Mask[i / Scale] < Size
8463                       ? ZeroMask
8464                       : (Mask[i / Scale] - Size) * Scale + i % Scale;
8465       if (Zeroable[i / Scale])
8466         V1Idx = V2Idx = ZeroMask;
8467       V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
8468       V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
8469       V1InUse |= (ZeroMask != V1Idx);
8470       V2InUse |= (ZeroMask != V2Idx);
8471     }
8472   }
8473
8474   if (V1InUse)
8475     V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
8476                      DAG.getBitcast(MVT::v16i8, V1),
8477                      DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V1Mask));
8478   if (V2InUse)
8479     V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
8480                      DAG.getBitcast(MVT::v16i8, V2),
8481                      DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V2Mask));
8482
8483   // If we need shuffled inputs from both, blend the two.
8484   SDValue V;
8485   if (V1InUse && V2InUse)
8486     V = DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
8487   else
8488     V = V1InUse ? V1 : V2;
8489
8490   // Cast the result back to the correct type.
8491   return DAG.getBitcast(VT, V);
8492 }
8493
8494 /// \brief Generic lowering of 8-lane i16 shuffles.
8495 ///
8496 /// This handles both single-input shuffles and combined shuffle/blends with
8497 /// two inputs. The single input shuffles are immediately delegated to
8498 /// a dedicated lowering routine.
8499 ///
8500 /// The blends are lowered in one of three fundamental ways. If there are few
8501 /// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
8502 /// of the input is significantly cheaper when lowered as an interleaving of
8503 /// the two inputs, try to interleave them. Otherwise, blend the low and high
8504 /// halves of the inputs separately (making them have relatively few inputs)
8505 /// and then concatenate them.
8506 static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
8507                                        const X86Subtarget *Subtarget,
8508                                        SelectionDAG &DAG) {
8509   SDLoc DL(Op);
8510   assert(Op.getSimpleValueType() == MVT::v8i16 && "Bad shuffle type!");
8511   assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
8512   assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
8513   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
8514   ArrayRef<int> OrigMask = SVOp->getMask();
8515   int MaskStorage[8] = {OrigMask[0], OrigMask[1], OrigMask[2], OrigMask[3],
8516                         OrigMask[4], OrigMask[5], OrigMask[6], OrigMask[7]};
8517   MutableArrayRef<int> Mask(MaskStorage);
8518
8519   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
8520
8521   // Whenever we can lower this as a zext, that instruction is strictly faster
8522   // than any alternative.
8523   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
8524           DL, MVT::v8i16, V1, V2, OrigMask, Subtarget, DAG))
8525     return ZExt;
8526
8527   auto isV1 = [](int M) { return M >= 0 && M < 8; };
8528   (void)isV1;
8529   auto isV2 = [](int M) { return M >= 8; };
8530
8531   int NumV2Inputs = std::count_if(Mask.begin(), Mask.end(), isV2);
8532
8533   if (NumV2Inputs == 0) {
8534     // Check for being able to broadcast a single element.
8535     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i16, V1,
8536                                                           Mask, Subtarget, DAG))
8537       return Broadcast;
8538
8539     // Try to use shift instructions.
8540     if (SDValue Shift =
8541             lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask, DAG))
8542       return Shift;
8543
8544     // Use dedicated unpack instructions for masks that match their pattern.
8545     if (isShuffleEquivalent(V1, V1, Mask, {0, 0, 1, 1, 2, 2, 3, 3}))
8546       return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V1, V1);
8547     if (isShuffleEquivalent(V1, V1, Mask, {4, 4, 5, 5, 6, 6, 7, 7}))
8548       return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V1, V1);
8549
8550     // Try to use byte rotation instructions.
8551     if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V1,
8552                                                         Mask, Subtarget, DAG))
8553       return Rotate;
8554
8555     return lowerV8I16GeneralSingleInputVectorShuffle(DL, MVT::v8i16, V1, Mask,
8556                                                      Subtarget, DAG);
8557   }
8558
8559   assert(std::any_of(Mask.begin(), Mask.end(), isV1) &&
8560          "All single-input shuffles should be canonicalized to be V1-input "
8561          "shuffles.");
8562
8563   // Try to use shift instructions.
8564   if (SDValue Shift =
8565           lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, DAG))
8566     return Shift;
8567
8568   // There are special ways we can lower some single-element blends.
8569   if (NumV2Inputs == 1)
8570     if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v8i16, V1, V2,
8571                                                          Mask, Subtarget, DAG))
8572       return V;
8573
8574   // We have different paths for blend lowering, but they all must use the
8575   // *exact* same predicate.
8576   bool IsBlendSupported = Subtarget->hasSSE41();
8577   if (IsBlendSupported)
8578     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
8579                                                   Subtarget, DAG))
8580       return Blend;
8581
8582   if (SDValue Masked =
8583           lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask, DAG))
8584     return Masked;
8585
8586   // Use dedicated unpack instructions for masks that match their pattern.
8587   if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 1, 9, 2, 10, 3, 11}))
8588     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V1, V2);
8589   if (isShuffleEquivalent(V1, V2, Mask, {4, 12, 5, 13, 6, 14, 7, 15}))
8590     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V1, V2);
8591
8592   // Try to use byte rotation instructions.
8593   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
8594           DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
8595     return Rotate;
8596
8597   if (SDValue BitBlend =
8598           lowerVectorShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
8599     return BitBlend;
8600
8601   if (SDValue Unpack =
8602           lowerVectorShuffleAsUnpack(DL, MVT::v8i16, V1, V2, Mask, DAG))
8603     return Unpack;
8604
8605   // If we can't directly blend but can use PSHUFB, that will be better as it
8606   // can both shuffle and set up the inefficient blend.
8607   if (!IsBlendSupported && Subtarget->hasSSSE3()) {
8608     bool V1InUse, V2InUse;
8609     return lowerVectorShuffleAsPSHUFB(DL, MVT::v8i16, V1, V2, Mask, DAG,
8610                                       V1InUse, V2InUse);
8611   }
8612
8613   // We can always bit-blend if we have to so the fallback strategy is to
8614   // decompose into single-input permutes and blends.
8615   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2,
8616                                                       Mask, DAG);
8617 }
8618
8619 /// \brief Check whether a compaction lowering can be done by dropping even
8620 /// elements and compute how many times even elements must be dropped.
8621 ///
8622 /// This handles shuffles which take every Nth element where N is a power of
8623 /// two. Example shuffle masks:
8624 ///
8625 ///  N = 1:  0,  2,  4,  6,  8, 10, 12, 14,  0,  2,  4,  6,  8, 10, 12, 14
8626 ///  N = 1:  0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
8627 ///  N = 2:  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12
8628 ///  N = 2:  0,  4,  8, 12, 16, 20, 24, 28,  0,  4,  8, 12, 16, 20, 24, 28
8629 ///  N = 3:  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8
8630 ///  N = 3:  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24
8631 ///
8632 /// Any of these lanes can of course be undef.
8633 ///
8634 /// This routine only supports N <= 3.
8635 /// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
8636 /// for larger N.
8637 ///
8638 /// \returns N above, or the number of times even elements must be dropped if
8639 /// there is such a number. Otherwise returns zero.
8640 static int canLowerByDroppingEvenElements(ArrayRef<int> Mask) {
8641   // Figure out whether we're looping over two inputs or just one.
8642   bool IsSingleInput = isSingleInputShuffleMask(Mask);
8643
8644   // The modulus for the shuffle vector entries is based on whether this is
8645   // a single input or not.
8646   int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
8647   assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
8648          "We should only be called with masks with a power-of-2 size!");
8649
8650   uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
8651
8652   // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
8653   // and 2^3 simultaneously. This is because we may have ambiguity with
8654   // partially undef inputs.
8655   bool ViableForN[3] = {true, true, true};
8656
8657   for (int i = 0, e = Mask.size(); i < e; ++i) {
8658     // Ignore undef lanes, we'll optimistically collapse them to the pattern we
8659     // want.
8660     if (Mask[i] == -1)
8661       continue;
8662
8663     bool IsAnyViable = false;
8664     for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
8665       if (ViableForN[j]) {
8666         uint64_t N = j + 1;
8667
8668         // The shuffle mask must be equal to (i * 2^N) % M.
8669         if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
8670           IsAnyViable = true;
8671         else
8672           ViableForN[j] = false;
8673       }
8674     // Early exit if we exhaust the possible powers of two.
8675     if (!IsAnyViable)
8676       break;
8677   }
8678
8679   for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
8680     if (ViableForN[j])
8681       return j + 1;
8682
8683   // Return 0 as there is no viable power of two.
8684   return 0;
8685 }
8686
8687 /// \brief Generic lowering of v16i8 shuffles.
8688 ///
8689 /// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
8690 /// detect any complexity reducing interleaving. If that doesn't help, it uses
8691 /// UNPCK to spread the i8 elements across two i16-element vectors, and uses
8692 /// the existing lowering for v8i16 blends on each half, finally PACK-ing them
8693 /// back together.
8694 static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
8695                                        const X86Subtarget *Subtarget,
8696                                        SelectionDAG &DAG) {
8697   SDLoc DL(Op);
8698   assert(Op.getSimpleValueType() == MVT::v16i8 && "Bad shuffle type!");
8699   assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
8700   assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
8701   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
8702   ArrayRef<int> Mask = SVOp->getMask();
8703   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
8704
8705   // Try to use shift instructions.
8706   if (SDValue Shift =
8707           lowerVectorShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask, DAG))
8708     return Shift;
8709
8710   // Try to use byte rotation instructions.
8711   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
8712           DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
8713     return Rotate;
8714
8715   // Try to use a zext lowering.
8716   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
8717           DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
8718     return ZExt;
8719
8720   int NumV2Elements =
8721       std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 16; });
8722
8723   // For single-input shuffles, there are some nicer lowering tricks we can use.
8724   if (NumV2Elements == 0) {
8725     // Check for being able to broadcast a single element.
8726     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i8, V1,
8727                                                           Mask, Subtarget, DAG))
8728       return Broadcast;
8729
8730     // Check whether we can widen this to an i16 shuffle by duplicating bytes.
8731     // Notably, this handles splat and partial-splat shuffles more efficiently.
8732     // However, it only makes sense if the pre-duplication shuffle simplifies
8733     // things significantly. Currently, this means we need to be able to
8734     // express the pre-duplication shuffle as an i16 shuffle.
8735     //
8736     // FIXME: We should check for other patterns which can be widened into an
8737     // i16 shuffle as well.
8738     auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
8739       for (int i = 0; i < 16; i += 2)
8740         if (Mask[i] != -1 && Mask[i + 1] != -1 && Mask[i] != Mask[i + 1])
8741           return false;
8742
8743       return true;
8744     };
8745     auto tryToWidenViaDuplication = [&]() -> SDValue {
8746       if (!canWidenViaDuplication(Mask))
8747         return SDValue();
8748       SmallVector<int, 4> LoInputs;
8749       std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(LoInputs),
8750                    [](int M) { return M >= 0 && M < 8; });
8751       std::sort(LoInputs.begin(), LoInputs.end());
8752       LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
8753                      LoInputs.end());
8754       SmallVector<int, 4> HiInputs;
8755       std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(HiInputs),
8756                    [](int M) { return M >= 8; });
8757       std::sort(HiInputs.begin(), HiInputs.end());
8758       HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
8759                      HiInputs.end());
8760
8761       bool TargetLo = LoInputs.size() >= HiInputs.size();
8762       ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
8763       ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
8764
8765       int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
8766       SmallDenseMap<int, int, 8> LaneMap;
8767       for (int I : InPlaceInputs) {
8768         PreDupI16Shuffle[I/2] = I/2;
8769         LaneMap[I] = I;
8770       }
8771       int j = TargetLo ? 0 : 4, je = j + 4;
8772       for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
8773         // Check if j is already a shuffle of this input. This happens when
8774         // there are two adjacent bytes after we move the low one.
8775         if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
8776           // If we haven't yet mapped the input, search for a slot into which
8777           // we can map it.
8778           while (j < je && PreDupI16Shuffle[j] != -1)
8779             ++j;
8780
8781           if (j == je)
8782             // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
8783             return SDValue();
8784
8785           // Map this input with the i16 shuffle.
8786           PreDupI16Shuffle[j] = MovingInputs[i] / 2;
8787         }
8788
8789         // Update the lane map based on the mapping we ended up with.
8790         LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
8791       }
8792       V1 = DAG.getBitcast(
8793           MVT::v16i8,
8794           DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
8795                                DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
8796
8797       // Unpack the bytes to form the i16s that will be shuffled into place.
8798       V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
8799                        MVT::v16i8, V1, V1);
8800
8801       int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
8802       for (int i = 0; i < 16; ++i)
8803         if (Mask[i] != -1) {
8804           int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
8805           assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
8806           if (PostDupI16Shuffle[i / 2] == -1)
8807             PostDupI16Shuffle[i / 2] = MappedMask;
8808           else
8809             assert(PostDupI16Shuffle[i / 2] == MappedMask &&
8810                    "Conflicting entrties in the original shuffle!");
8811         }
8812       return DAG.getBitcast(
8813           MVT::v16i8,
8814           DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
8815                                DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
8816     };
8817     if (SDValue V = tryToWidenViaDuplication())
8818       return V;
8819   }
8820
8821   // Use dedicated unpack instructions for masks that match their pattern.
8822   if (isShuffleEquivalent(V1, V2, Mask, {// Low half.
8823                                          0, 16, 1, 17, 2, 18, 3, 19,
8824                                          // High half.
8825                                          4, 20, 5, 21, 6, 22, 7, 23}))
8826     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V1, V2);
8827   if (isShuffleEquivalent(V1, V2, Mask, {// Low half.
8828                                          8, 24, 9, 25, 10, 26, 11, 27,
8829                                          // High half.
8830                                          12, 28, 13, 29, 14, 30, 15, 31}))
8831     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V1, V2);
8832
8833   // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
8834   // with PSHUFB. It is important to do this before we attempt to generate any
8835   // blends but after all of the single-input lowerings. If the single input
8836   // lowerings can find an instruction sequence that is faster than a PSHUFB, we
8837   // want to preserve that and we can DAG combine any longer sequences into
8838   // a PSHUFB in the end. But once we start blending from multiple inputs,
8839   // the complexity of DAG combining bad patterns back into PSHUFB is too high,
8840   // and there are *very* few patterns that would actually be faster than the
8841   // PSHUFB approach because of its ability to zero lanes.
8842   //
8843   // FIXME: The only exceptions to the above are blends which are exact
8844   // interleavings with direct instructions supporting them. We currently don't
8845   // handle those well here.
8846   if (Subtarget->hasSSSE3()) {
8847     bool V1InUse = false;
8848     bool V2InUse = false;
8849
8850     SDValue PSHUFB = lowerVectorShuffleAsPSHUFB(DL, MVT::v16i8, V1, V2, Mask,
8851                                                 DAG, V1InUse, V2InUse);
8852
8853     // If both V1 and V2 are in use and we can use a direct blend or an unpack,
8854     // do so. This avoids using them to handle blends-with-zero which is
8855     // important as a single pshufb is significantly faster for that.
8856     if (V1InUse && V2InUse) {
8857       if (Subtarget->hasSSE41())
8858         if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i8, V1, V2,
8859                                                       Mask, Subtarget, DAG))
8860           return Blend;
8861
8862       // We can use an unpack to do the blending rather than an or in some
8863       // cases. Even though the or may be (very minorly) more efficient, we
8864       // preference this lowering because there are common cases where part of
8865       // the complexity of the shuffles goes away when we do the final blend as
8866       // an unpack.
8867       // FIXME: It might be worth trying to detect if the unpack-feeding
8868       // shuffles will both be pshufb, in which case we shouldn't bother with
8869       // this.
8870       if (SDValue Unpack =
8871               lowerVectorShuffleAsUnpack(DL, MVT::v16i8, V1, V2, Mask, DAG))
8872         return Unpack;
8873     }
8874
8875     return PSHUFB;
8876   }
8877
8878   // There are special ways we can lower some single-element blends.
8879   if (NumV2Elements == 1)
8880     if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v16i8, V1, V2,
8881                                                          Mask, Subtarget, DAG))
8882       return V;
8883
8884   if (SDValue BitBlend =
8885           lowerVectorShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
8886     return BitBlend;
8887
8888   // Check whether a compaction lowering can be done. This handles shuffles
8889   // which take every Nth element for some even N. See the helper function for
8890   // details.
8891   //
8892   // We special case these as they can be particularly efficiently handled with
8893   // the PACKUSB instruction on x86 and they show up in common patterns of
8894   // rearranging bytes to truncate wide elements.
8895   if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask)) {
8896     // NumEvenDrops is the power of two stride of the elements. Another way of
8897     // thinking about it is that we need to drop the even elements this many
8898     // times to get the original input.
8899     bool IsSingleInput = isSingleInputShuffleMask(Mask);
8900
8901     // First we need to zero all the dropped bytes.
8902     assert(NumEvenDrops <= 3 &&
8903            "No support for dropping even elements more than 3 times.");
8904     // We use the mask type to pick which bytes are preserved based on how many
8905     // elements are dropped.
8906     MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 };
8907     SDValue ByteClearMask = DAG.getBitcast(
8908         MVT::v16i8, DAG.getConstant(0xFF, DL, MaskVTs[NumEvenDrops - 1]));
8909     V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);
8910     if (!IsSingleInput)
8911       V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);
8912
8913     // Now pack things back together.
8914     V1 = DAG.getBitcast(MVT::v8i16, V1);
8915     V2 = IsSingleInput ? V1 : DAG.getBitcast(MVT::v8i16, V2);
8916     SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2);
8917     for (int i = 1; i < NumEvenDrops; ++i) {
8918       Result = DAG.getBitcast(MVT::v8i16, Result);
8919       Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
8920     }
8921
8922     return Result;
8923   }
8924
8925   // Handle multi-input cases by blending single-input shuffles.
8926   if (NumV2Elements > 0)
8927     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2,
8928                                                       Mask, DAG);
8929
8930   // The fallback path for single-input shuffles widens this into two v8i16
8931   // vectors with unpacks, shuffles those, and then pulls them back together
8932   // with a pack.
8933   SDValue V = V1;
8934
8935   int LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
8936   int HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
8937   for (int i = 0; i < 16; ++i)
8938     if (Mask[i] >= 0)
8939       (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
8940
8941   SDValue Zero = getZeroVector(MVT::v8i16, Subtarget, DAG, DL);
8942
8943   SDValue VLoHalf, VHiHalf;
8944   // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
8945   // them out and avoid using UNPCK{L,H} to extract the elements of V as
8946   // i16s.
8947   if (std::none_of(std::begin(LoBlendMask), std::end(LoBlendMask),
8948                    [](int M) { return M >= 0 && M % 2 == 1; }) &&
8949       std::none_of(std::begin(HiBlendMask), std::end(HiBlendMask),
8950                    [](int M) { return M >= 0 && M % 2 == 1; })) {
8951     // Use a mask to drop the high bytes.
8952     VLoHalf = DAG.getBitcast(MVT::v8i16, V);
8953     VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
8954                      DAG.getConstant(0x00FF, DL, MVT::v8i16));
8955
8956     // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
8957     VHiHalf = DAG.getUNDEF(MVT::v8i16);
8958
8959     // Squash the masks to point directly into VLoHalf.
8960     for (int &M : LoBlendMask)
8961       if (M >= 0)
8962         M /= 2;
8963     for (int &M : HiBlendMask)
8964       if (M >= 0)
8965         M /= 2;
8966   } else {
8967     // Otherwise just unpack the low half of V into VLoHalf and the high half into
8968     // VHiHalf so that we can blend them as i16s.
8969     VLoHalf = DAG.getBitcast(
8970         MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
8971     VHiHalf = DAG.getBitcast(
8972         MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
8973   }
8974
8975   SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
8976   SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
8977
8978   return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
8979 }
8980
8981 /// \brief Dispatching routine to lower various 128-bit x86 vector shuffles.
8982 ///
8983 /// This routine breaks down the specific type of 128-bit shuffle and
8984 /// dispatches to the lowering routines accordingly.
8985 static SDValue lower128BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
8986                                         MVT VT, const X86Subtarget *Subtarget,
8987                                         SelectionDAG &DAG) {
8988   switch (VT.SimpleTy) {
8989   case MVT::v2i64:
8990     return lowerV2I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
8991   case MVT::v2f64:
8992     return lowerV2F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
8993   case MVT::v4i32:
8994     return lowerV4I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
8995   case MVT::v4f32:
8996     return lowerV4F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
8997   case MVT::v8i16:
8998     return lowerV8I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
8999   case MVT::v16i8:
9000     return lowerV16I8VectorShuffle(Op, V1, V2, Subtarget, DAG);
9001
9002   default:
9003     llvm_unreachable("Unimplemented!");
9004   }
9005 }
9006
9007 /// \brief Helper function to test whether a shuffle mask could be
9008 /// simplified by widening the elements being shuffled.
9009 ///
9010 /// Appends the mask for wider elements in WidenedMask if valid. Otherwise
9011 /// leaves it in an unspecified state.
9012 ///
9013 /// NOTE: This must handle normal vector shuffle masks and *target* vector
9014 /// shuffle masks. The latter have the special property of a '-2' representing
9015 /// a zero-ed lane of a vector.
9016 static bool canWidenShuffleElements(ArrayRef<int> Mask,
9017                                     SmallVectorImpl<int> &WidenedMask) {
9018   for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
9019     // If both elements are undef, its trivial.
9020     if (Mask[i] == SM_SentinelUndef && Mask[i + 1] == SM_SentinelUndef) {
9021       WidenedMask.push_back(SM_SentinelUndef);
9022       continue;
9023     }
9024
9025     // Check for an undef mask and a mask value properly aligned to fit with
9026     // a pair of values. If we find such a case, use the non-undef mask's value.
9027     if (Mask[i] == SM_SentinelUndef && Mask[i + 1] >= 0 && Mask[i + 1] % 2 == 1) {
9028       WidenedMask.push_back(Mask[i + 1] / 2);
9029       continue;
9030     }
9031     if (Mask[i + 1] == SM_SentinelUndef && Mask[i] >= 0 && Mask[i] % 2 == 0) {
9032       WidenedMask.push_back(Mask[i] / 2);
9033       continue;
9034     }
9035
9036     // When zeroing, we need to spread the zeroing across both lanes to widen.
9037     if (Mask[i] == SM_SentinelZero || Mask[i + 1] == SM_SentinelZero) {
9038       if ((Mask[i] == SM_SentinelZero || Mask[i] == SM_SentinelUndef) &&
9039           (Mask[i + 1] == SM_SentinelZero || Mask[i + 1] == SM_SentinelUndef)) {
9040         WidenedMask.push_back(SM_SentinelZero);
9041         continue;
9042       }
9043       return false;
9044     }
9045
9046     // Finally check if the two mask values are adjacent and aligned with
9047     // a pair.
9048     if (Mask[i] != SM_SentinelUndef && Mask[i] % 2 == 0 && Mask[i] + 1 == Mask[i + 1]) {
9049       WidenedMask.push_back(Mask[i] / 2);
9050       continue;
9051     }
9052
9053     // Otherwise we can't safely widen the elements used in this shuffle.
9054     return false;
9055   }
9056   assert(WidenedMask.size() == Mask.size() / 2 &&
9057          "Incorrect size of mask after widening the elements!");
9058
9059   return true;
9060 }
9061
9062 /// \brief Generic routine to split vector shuffle into half-sized shuffles.
9063 ///
9064 /// This routine just extracts two subvectors, shuffles them independently, and
9065 /// then concatenates them back together. This should work effectively with all
9066 /// AVX vector shuffle types.
9067 static SDValue splitAndLowerVectorShuffle(SDLoc DL, MVT VT, SDValue V1,
9068                                           SDValue V2, ArrayRef<int> Mask,
9069                                           SelectionDAG &DAG) {
9070   assert(VT.getSizeInBits() >= 256 &&
9071          "Only for 256-bit or wider vector shuffles!");
9072   assert(V1.getSimpleValueType() == VT && "Bad operand type!");
9073   assert(V2.getSimpleValueType() == VT && "Bad operand type!");
9074
9075   ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
9076   ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
9077
9078   int NumElements = VT.getVectorNumElements();
9079   int SplitNumElements = NumElements / 2;
9080   MVT ScalarVT = VT.getScalarType();
9081   MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2);
9082
9083   // Rather than splitting build-vectors, just build two narrower build
9084   // vectors. This helps shuffling with splats and zeros.
9085   auto SplitVector = [&](SDValue V) {
9086     while (V.getOpcode() == ISD::BITCAST)
9087       V = V->getOperand(0);
9088
9089     MVT OrigVT = V.getSimpleValueType();
9090     int OrigNumElements = OrigVT.getVectorNumElements();
9091     int OrigSplitNumElements = OrigNumElements / 2;
9092     MVT OrigScalarVT = OrigVT.getScalarType();
9093     MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2);
9094
9095     SDValue LoV, HiV;
9096
9097     auto *BV = dyn_cast<BuildVectorSDNode>(V);
9098     if (!BV) {
9099       LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
9100                         DAG.getIntPtrConstant(0, DL));
9101       HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
9102                         DAG.getIntPtrConstant(OrigSplitNumElements, DL));
9103     } else {
9104
9105       SmallVector<SDValue, 16> LoOps, HiOps;
9106       for (int i = 0; i < OrigSplitNumElements; ++i) {
9107         LoOps.push_back(BV->getOperand(i));
9108         HiOps.push_back(BV->getOperand(i + OrigSplitNumElements));
9109       }
9110       LoV = DAG.getNode(ISD::BUILD_VECTOR, DL, OrigSplitVT, LoOps);
9111       HiV = DAG.getNode(ISD::BUILD_VECTOR, DL, OrigSplitVT, HiOps);
9112     }
9113     return std::make_pair(DAG.getBitcast(SplitVT, LoV),
9114                           DAG.getBitcast(SplitVT, HiV));
9115   };
9116
9117   SDValue LoV1, HiV1, LoV2, HiV2;
9118   std::tie(LoV1, HiV1) = SplitVector(V1);
9119   std::tie(LoV2, HiV2) = SplitVector(V2);
9120
9121   // Now create two 4-way blends of these half-width vectors.
9122   auto HalfBlend = [&](ArrayRef<int> HalfMask) {
9123     bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
9124     SmallVector<int, 32> V1BlendMask, V2BlendMask, BlendMask;
9125     for (int i = 0; i < SplitNumElements; ++i) {
9126       int M = HalfMask[i];
9127       if (M >= NumElements) {
9128         if (M >= NumElements + SplitNumElements)
9129           UseHiV2 = true;
9130         else
9131           UseLoV2 = true;
9132         V2BlendMask.push_back(M - NumElements);
9133         V1BlendMask.push_back(-1);
9134         BlendMask.push_back(SplitNumElements + i);
9135       } else if (M >= 0) {
9136         if (M >= SplitNumElements)
9137           UseHiV1 = true;
9138         else
9139           UseLoV1 = true;
9140         V2BlendMask.push_back(-1);
9141         V1BlendMask.push_back(M);
9142         BlendMask.push_back(i);
9143       } else {
9144         V2BlendMask.push_back(-1);
9145         V1BlendMask.push_back(-1);
9146         BlendMask.push_back(-1);
9147       }
9148     }
9149
9150     // Because the lowering happens after all combining takes place, we need to
9151     // manually combine these blend masks as much as possible so that we create
9152     // a minimal number of high-level vector shuffle nodes.
9153
9154     // First try just blending the halves of V1 or V2.
9155     if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
9156       return DAG.getUNDEF(SplitVT);
9157     if (!UseLoV2 && !UseHiV2)
9158       return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
9159     if (!UseLoV1 && !UseHiV1)
9160       return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
9161
9162     SDValue V1Blend, V2Blend;
9163     if (UseLoV1 && UseHiV1) {
9164       V1Blend =
9165         DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
9166     } else {
9167       // We only use half of V1 so map the usage down into the final blend mask.
9168       V1Blend = UseLoV1 ? LoV1 : HiV1;
9169       for (int i = 0; i < SplitNumElements; ++i)
9170         if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
9171           BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
9172     }
9173     if (UseLoV2 && UseHiV2) {
9174       V2Blend =
9175         DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
9176     } else {
9177       // We only use half of V2 so map the usage down into the final blend mask.
9178       V2Blend = UseLoV2 ? LoV2 : HiV2;
9179       for (int i = 0; i < SplitNumElements; ++i)
9180         if (BlendMask[i] >= SplitNumElements)
9181           BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
9182     }
9183     return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
9184   };
9185   SDValue Lo = HalfBlend(LoMask);
9186   SDValue Hi = HalfBlend(HiMask);
9187   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
9188 }
9189
9190 /// \brief Either split a vector in halves or decompose the shuffles and the
9191 /// blend.
9192 ///
9193 /// This is provided as a good fallback for many lowerings of non-single-input
9194 /// shuffles with more than one 128-bit lane. In those cases, we want to select
9195 /// between splitting the shuffle into 128-bit components and stitching those
9196 /// back together vs. extracting the single-input shuffles and blending those
9197 /// results.
9198 static SDValue lowerVectorShuffleAsSplitOrBlend(SDLoc DL, MVT VT, SDValue V1,
9199                                                 SDValue V2, ArrayRef<int> Mask,
9200                                                 SelectionDAG &DAG) {
9201   assert(!isSingleInputShuffleMask(Mask) && "This routine must not be used to "
9202                                             "lower single-input shuffles as it "
9203                                             "could then recurse on itself.");
9204   int Size = Mask.size();
9205
9206   // If this can be modeled as a broadcast of two elements followed by a blend,
9207   // prefer that lowering. This is especially important because broadcasts can
9208   // often fold with memory operands.
9209   auto DoBothBroadcast = [&] {
9210     int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
9211     for (int M : Mask)
9212       if (M >= Size) {
9213         if (V2BroadcastIdx == -1)
9214           V2BroadcastIdx = M - Size;
9215         else if (M - Size != V2BroadcastIdx)
9216           return false;
9217       } else if (M >= 0) {
9218         if (V1BroadcastIdx == -1)
9219           V1BroadcastIdx = M;
9220         else if (M != V1BroadcastIdx)
9221           return false;
9222       }
9223     return true;
9224   };
9225   if (DoBothBroadcast())
9226     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
9227                                                       DAG);
9228
9229   // If the inputs all stem from a single 128-bit lane of each input, then we
9230   // split them rather than blending because the split will decompose to
9231   // unusually few instructions.
9232   int LaneCount = VT.getSizeInBits() / 128;
9233   int LaneSize = Size / LaneCount;
9234   SmallBitVector LaneInputs[2];
9235   LaneInputs[0].resize(LaneCount, false);
9236   LaneInputs[1].resize(LaneCount, false);
9237   for (int i = 0; i < Size; ++i)
9238     if (Mask[i] >= 0)
9239       LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
9240   if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
9241     return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
9242
9243   // Otherwise, just fall back to decomposed shuffles and a blend. This requires
9244   // that the decomposed single-input shuffles don't end up here.
9245   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
9246 }
9247
9248 /// \brief Lower a vector shuffle crossing multiple 128-bit lanes as
9249 /// a permutation and blend of those lanes.
9250 ///
9251 /// This essentially blends the out-of-lane inputs to each lane into the lane
9252 /// from a permuted copy of the vector. This lowering strategy results in four
9253 /// instructions in the worst case for a single-input cross lane shuffle which
9254 /// is lower than any other fully general cross-lane shuffle strategy I'm aware
9255 /// of. Special cases for each particular shuffle pattern should be handled
9256 /// prior to trying this lowering.
9257 static SDValue lowerVectorShuffleAsLanePermuteAndBlend(SDLoc DL, MVT VT,
9258                                                        SDValue V1, SDValue V2,
9259                                                        ArrayRef<int> Mask,
9260                                                        SelectionDAG &DAG) {
9261   // FIXME: This should probably be generalized for 512-bit vectors as well.
9262   assert(VT.getSizeInBits() == 256 && "Only for 256-bit vector shuffles!");
9263   int LaneSize = Mask.size() / 2;
9264
9265   // If there are only inputs from one 128-bit lane, splitting will in fact be
9266   // less expensive. The flags track whether the given lane contains an element
9267   // that crosses to another lane.
9268   bool LaneCrossing[2] = {false, false};
9269   for (int i = 0, Size = Mask.size(); i < Size; ++i)
9270     if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
9271       LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
9272   if (!LaneCrossing[0] || !LaneCrossing[1])
9273     return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
9274
9275   if (isSingleInputShuffleMask(Mask)) {
9276     SmallVector<int, 32> FlippedBlendMask;
9277     for (int i = 0, Size = Mask.size(); i < Size; ++i)
9278       FlippedBlendMask.push_back(
9279           Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize)
9280                                   ? Mask[i]
9281                                   : Mask[i] % LaneSize +
9282                                         (i / LaneSize) * LaneSize + Size));
9283
9284     // Flip the vector, and blend the results which should now be in-lane. The
9285     // VPERM2X128 mask uses the low 2 bits for the low source and bits 4 and
9286     // 5 for the high source. The value 3 selects the high half of source 2 and
9287     // the value 2 selects the low half of source 2. We only use source 2 to
9288     // allow folding it into a memory operand.
9289     unsigned PERMMask = 3 | 2 << 4;
9290     SDValue Flipped = DAG.getNode(X86ISD::VPERM2X128, DL, VT, DAG.getUNDEF(VT),
9291                                   V1, DAG.getConstant(PERMMask, DL, MVT::i8));
9292     return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask);
9293   }
9294
9295   // This now reduces to two single-input shuffles of V1 and V2 which at worst
9296   // will be handled by the above logic and a blend of the results, much like
9297   // other patterns in AVX.
9298   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
9299 }
9300
9301 /// \brief Handle lowering 2-lane 128-bit shuffles.
9302 static SDValue lowerV2X128VectorShuffle(SDLoc DL, MVT VT, SDValue V1,
9303                                         SDValue V2, ArrayRef<int> Mask,
9304                                         const X86Subtarget *Subtarget,
9305                                         SelectionDAG &DAG) {
9306   // TODO: If minimizing size and one of the inputs is a zero vector and the
9307   // the zero vector has only one use, we could use a VPERM2X128 to save the
9308   // instruction bytes needed to explicitly generate the zero vector.
9309
9310   // Blends are faster and handle all the non-lane-crossing cases.
9311   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
9312                                                 Subtarget, DAG))
9313     return Blend;
9314
9315   bool IsV1Zero = ISD::isBuildVectorAllZeros(V1.getNode());
9316   bool IsV2Zero = ISD::isBuildVectorAllZeros(V2.getNode());
9317
9318   // If either input operand is a zero vector, use VPERM2X128 because its mask
9319   // allows us to replace the zero input with an implicit zero.
9320   if (!IsV1Zero && !IsV2Zero) {
9321     // Check for patterns which can be matched with a single insert of a 128-bit
9322     // subvector.
9323     bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1});
9324     if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) {
9325       MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
9326                                    VT.getVectorNumElements() / 2);
9327       SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
9328                                 DAG.getIntPtrConstant(0, DL));
9329       SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
9330                                 OnlyUsesV1 ? V1 : V2,
9331                                 DAG.getIntPtrConstant(0, DL));
9332       return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
9333     }
9334   }
9335
9336   // Otherwise form a 128-bit permutation. After accounting for undefs,
9337   // convert the 64-bit shuffle mask selection values into 128-bit
9338   // selection bits by dividing the indexes by 2 and shifting into positions
9339   // defined by a vperm2*128 instruction's immediate control byte.
9340
9341   // The immediate permute control byte looks like this:
9342   //    [1:0] - select 128 bits from sources for low half of destination
9343   //    [2]   - ignore
9344   //    [3]   - zero low half of destination
9345   //    [5:4] - select 128 bits from sources for high half of destination
9346   //    [6]   - ignore
9347   //    [7]   - zero high half of destination
9348
9349   int MaskLO = Mask[0];
9350   if (MaskLO == SM_SentinelUndef)
9351     MaskLO = Mask[1] == SM_SentinelUndef ? 0 : Mask[1];
9352
9353   int MaskHI = Mask[2];
9354   if (MaskHI == SM_SentinelUndef)
9355     MaskHI = Mask[3] == SM_SentinelUndef ? 0 : Mask[3];
9356
9357   unsigned PermMask = MaskLO / 2 | (MaskHI / 2) << 4;
9358
9359   // If either input is a zero vector, replace it with an undef input.
9360   // Shuffle mask values <  4 are selecting elements of V1.
9361   // Shuffle mask values >= 4 are selecting elements of V2.
9362   // Adjust each half of the permute mask by clearing the half that was
9363   // selecting the zero vector and setting the zero mask bit.
9364   if (IsV1Zero) {
9365     V1 = DAG.getUNDEF(VT);
9366     if (MaskLO < 4)
9367       PermMask = (PermMask & 0xf0) | 0x08;
9368     if (MaskHI < 4)
9369       PermMask = (PermMask & 0x0f) | 0x80;
9370   }
9371   if (IsV2Zero) {
9372     V2 = DAG.getUNDEF(VT);
9373     if (MaskLO >= 4)
9374       PermMask = (PermMask & 0xf0) | 0x08;
9375     if (MaskHI >= 4)
9376       PermMask = (PermMask & 0x0f) | 0x80;
9377   }
9378
9379   return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
9380                      DAG.getConstant(PermMask, DL, MVT::i8));
9381 }
9382
9383 /// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then
9384 /// shuffling each lane.
9385 ///
9386 /// This will only succeed when the result of fixing the 128-bit lanes results
9387 /// in a single-input non-lane-crossing shuffle with a repeating shuffle mask in
9388 /// each 128-bit lanes. This handles many cases where we can quickly blend away
9389 /// the lane crosses early and then use simpler shuffles within each lane.
9390 ///
9391 /// FIXME: It might be worthwhile at some point to support this without
9392 /// requiring the 128-bit lane-relative shuffles to be repeating, but currently
9393 /// in x86 only floating point has interesting non-repeating shuffles, and even
9394 /// those are still *marginally* more expensive.
9395 static SDValue lowerVectorShuffleByMerging128BitLanes(
9396     SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
9397     const X86Subtarget *Subtarget, SelectionDAG &DAG) {
9398   assert(!isSingleInputShuffleMask(Mask) &&
9399          "This is only useful with multiple inputs.");
9400
9401   int Size = Mask.size();
9402   int LaneSize = 128 / VT.getScalarSizeInBits();
9403   int NumLanes = Size / LaneSize;
9404   assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles.");
9405
9406   // See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also
9407   // check whether the in-128-bit lane shuffles share a repeating pattern.
9408   SmallVector<int, 4> Lanes;
9409   Lanes.resize(NumLanes, -1);
9410   SmallVector<int, 4> InLaneMask;
9411   InLaneMask.resize(LaneSize, -1);
9412   for (int i = 0; i < Size; ++i) {
9413     if (Mask[i] < 0)
9414       continue;
9415
9416     int j = i / LaneSize;
9417
9418     if (Lanes[j] < 0) {
9419       // First entry we've seen for this lane.
9420       Lanes[j] = Mask[i] / LaneSize;
9421     } else if (Lanes[j] != Mask[i] / LaneSize) {
9422       // This doesn't match the lane selected previously!
9423       return SDValue();
9424     }
9425
9426     // Check that within each lane we have a consistent shuffle mask.
9427     int k = i % LaneSize;
9428     if (InLaneMask[k] < 0) {
9429       InLaneMask[k] = Mask[i] % LaneSize;
9430     } else if (InLaneMask[k] != Mask[i] % LaneSize) {
9431       // This doesn't fit a repeating in-lane mask.
9432       return SDValue();
9433     }
9434   }
9435
9436   // First shuffle the lanes into place.
9437   MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64,
9438                                 VT.getSizeInBits() / 64);
9439   SmallVector<int, 8> LaneMask;
9440   LaneMask.resize(NumLanes * 2, -1);
9441   for (int i = 0; i < NumLanes; ++i)
9442     if (Lanes[i] >= 0) {
9443       LaneMask[2 * i + 0] = 2*Lanes[i] + 0;
9444       LaneMask[2 * i + 1] = 2*Lanes[i] + 1;
9445     }
9446
9447   V1 = DAG.getBitcast(LaneVT, V1);
9448   V2 = DAG.getBitcast(LaneVT, V2);
9449   SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask);
9450
9451   // Cast it back to the type we actually want.
9452   LaneShuffle = DAG.getBitcast(VT, LaneShuffle);
9453
9454   // Now do a simple shuffle that isn't lane crossing.
9455   SmallVector<int, 8> NewMask;
9456   NewMask.resize(Size, -1);
9457   for (int i = 0; i < Size; ++i)
9458     if (Mask[i] >= 0)
9459       NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize;
9460   assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) &&
9461          "Must not introduce lane crosses at this point!");
9462
9463   return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask);
9464 }
9465
9466 /// \brief Test whether the specified input (0 or 1) is in-place blended by the
9467 /// given mask.
9468 ///
9469 /// This returns true if the elements from a particular input are already in the
9470 /// slot required by the given mask and require no permutation.
9471 static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
9472   assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
9473   int Size = Mask.size();
9474   for (int i = 0; i < Size; ++i)
9475     if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
9476       return false;
9477
9478   return true;
9479 }
9480
9481 static SDValue lowerVectorShuffleWithSHUFPD(SDLoc DL, MVT VT,
9482                                             ArrayRef<int> Mask, SDValue V1,
9483                                             SDValue V2, SelectionDAG &DAG) {
9484
9485   // Mask for V8F64: 0/1,  8/9,  2/3,  10/11, 4/5, ..
9486   // Mask for V4F64; 0/1,  4/5,  2/3,  6/7..
9487   assert(VT.getScalarSizeInBits() == 64 && "Unexpected data type for VSHUFPD");
9488   int NumElts = VT.getVectorNumElements();
9489   bool ShufpdMask = true;
9490   bool CommutableMask = true;
9491   unsigned Immediate = 0;
9492   for (int i = 0; i < NumElts; ++i) {
9493     if (Mask[i] < 0)
9494       continue;
9495     int Val = (i & 6) + NumElts * (i & 1);
9496     int CommutVal = (i & 0xe) + NumElts * ((i & 1)^1);
9497     if (Mask[i] < Val ||  Mask[i] > Val + 1)
9498       ShufpdMask = false;
9499     if (Mask[i] < CommutVal ||  Mask[i] > CommutVal + 1)
9500       CommutableMask = false;
9501     Immediate |= (Mask[i] % 2) << i;
9502   }
9503   if (ShufpdMask)
9504     return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
9505                        DAG.getConstant(Immediate, DL, MVT::i8));
9506   if (CommutableMask)
9507     return DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
9508                        DAG.getConstant(Immediate, DL, MVT::i8));
9509   return SDValue();
9510 }
9511
9512 /// \brief Handle lowering of 4-lane 64-bit floating point shuffles.
9513 ///
9514 /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
9515 /// isn't available.
9516 static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
9517                                        const X86Subtarget *Subtarget,
9518                                        SelectionDAG &DAG) {
9519   SDLoc DL(Op);
9520   assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
9521   assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
9522   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
9523   ArrayRef<int> Mask = SVOp->getMask();
9524   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
9525
9526   SmallVector<int, 4> WidenedMask;
9527   if (canWidenShuffleElements(Mask, WidenedMask))
9528     return lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask, Subtarget,
9529                                     DAG);
9530
9531   if (isSingleInputShuffleMask(Mask)) {
9532     // Check for being able to broadcast a single element.
9533     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4f64, V1,
9534                                                           Mask, Subtarget, DAG))
9535       return Broadcast;
9536
9537     // Use low duplicate instructions for masks that match their pattern.
9538     if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
9539       return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
9540
9541     if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
9542       // Non-half-crossing single input shuffles can be lowerid with an
9543       // interleaved permutation.
9544       unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
9545                               ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
9546       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
9547                          DAG.getConstant(VPERMILPMask, DL, MVT::i8));
9548     }
9549
9550     // With AVX2 we have direct support for this permutation.
9551     if (Subtarget->hasAVX2())
9552       return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
9553                          getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
9554
9555     // Otherwise, fall back.
9556     return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask,
9557                                                    DAG);
9558   }
9559
9560   // X86 has dedicated unpack instructions that can handle specific blend
9561   // operations: UNPCKH and UNPCKL.
9562   if (isShuffleEquivalent(V1, V2, Mask, {0, 4, 2, 6}))
9563     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f64, V1, V2);
9564   if (isShuffleEquivalent(V1, V2, Mask, {1, 5, 3, 7}))
9565     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f64, V1, V2);
9566   if (isShuffleEquivalent(V1, V2, Mask, {4, 0, 6, 2}))
9567     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f64, V2, V1);
9568   if (isShuffleEquivalent(V1, V2, Mask, {5, 1, 7, 3}))
9569     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f64, V2, V1);
9570
9571   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
9572                                                 Subtarget, DAG))
9573     return Blend;
9574
9575   // Check if the blend happens to exactly fit that of SHUFPD.
9576   if (SDValue Op =
9577       lowerVectorShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG))
9578     return Op;
9579
9580   // Try to simplify this by merging 128-bit lanes to enable a lane-based
9581   // shuffle. However, if we have AVX2 and either inputs are already in place,
9582   // we will be able to shuffle even across lanes the other input in a single
9583   // instruction so skip this pattern.
9584   if (!(Subtarget->hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
9585                                  isShuffleMaskInputInPlace(1, Mask))))
9586     if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
9587             DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
9588       return Result;
9589
9590   // If we have AVX2 then we always want to lower with a blend because at v4 we
9591   // can fully permute the elements.
9592   if (Subtarget->hasAVX2())
9593     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,
9594                                                       Mask, DAG);
9595
9596   // Otherwise fall back on generic lowering.
9597   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG);
9598 }
9599
9600 /// \brief Handle lowering of 4-lane 64-bit integer shuffles.
9601 ///
9602 /// This routine is only called when we have AVX2 and thus a reasonable
9603 /// instruction set for v4i64 shuffling.
     ///
     /// The strategies below are tried strictly in order and the first one that
     /// yields a node is returned: 2 x 128-bit lowering (when the mask can be
     /// widened), blend, broadcast, PSHUFD (single-input mask repeated in both
     /// 128-bit lanes), VPERMI (any other single-input mask), shift, UNPCKL/UNPCKH,
     /// 128-bit lane merging, and finally a decomposed shuffle + blend.
     ///
     /// \param Op the shuffle node; it is cast<> to ShuffleVectorSDNode and supplies
     ///        the 4-element mask and the debug location.
     /// \param V1,V2 the two shuffle inputs; both are asserted to be v4i64.
     /// \param Subtarget asserted to have AVX2; also forwarded to the helpers.
     /// \param DAG the SelectionDAG in which new nodes are created.
     /// \returns the value produced by the first strategy that applies.
9604 static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
9605                                        const X86Subtarget *Subtarget,
9606                                        SelectionDAG &DAG) {
9607   SDLoc DL(Op);
9608   assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
9609   assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
9610   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
9611   ArrayRef<int> Mask = SVOp->getMask();
9612   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
9613   assert(Subtarget->hasAVX2() && "We can only lower v4i64 with AVX2!");
9614
       // WidenedMask is handed to canWidenShuffleElements but never read here:
       // only the yes/no answer is used, and the original Mask (not WidenedMask)
       // is what gets forwarded to the 2 x 128-bit lowering.
9615   SmallVector<int, 4> WidenedMask;
9616   if (canWidenShuffleElements(Mask, WidenedMask))
9617     return lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask, Subtarget,
9618                                     DAG);
9619
9620   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
9621                                                 Subtarget, DAG))
9622     return Blend;
9623
9624   // Check for being able to broadcast a single element.
9625   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i64, V1,
9626                                                         Mask, Subtarget, DAG))
9627     return Broadcast;
9628
9629   // When the shuffle is mirrored between the 128-bit lanes of the unit, we can
9630   // use lower latency instructions that will operate on both 128-bit lanes.
9631   SmallVector<int, 2> RepeatedMask;
9632   if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
9633     if (isSingleInputShuffleMask(Mask)) {
           // Re-express the 2-element i64 per-lane mask as a 4-element i32 mask:
           // i64 index R becomes the i32 index pair {2*R, 2*R+1}. Undef (negative)
           // entries keep the -1 initializer.
9634       int PSHUFDMask[] = {-1, -1, -1, -1};
9635       for (int i = 0; i < 2; ++i)
9636         if (RepeatedMask[i] >= 0) {
9637           PSHUFDMask[2 * i] = 2 * RepeatedMask[i];
9638           PSHUFDMask[2 * i + 1] = 2 * RepeatedMask[i] + 1;
9639         }
           // PSHUFD is emitted on the v8i32 view of V1; the result is bitcast back
           // to v4i64.
9640       return DAG.getBitcast(
9641           MVT::v4i64,
9642           DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
9643                       DAG.getBitcast(MVT::v8i32, V1),
9644                       getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
9645     }
9646   }
9647
9648   // AVX2 provides a direct instruction for permuting a single input across
9649   // lanes.
9650   if (isSingleInputShuffleMask(Mask))
9651     return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
9652                        getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
9653
9654   // Try to use shift instructions.
9655   if (SDValue Shift =
9656           lowerVectorShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, DAG))
9657     return Shift;
9658
9659   // Use dedicated unpack instructions for masks that match their pattern.
       // The last two checks cover the same patterns with V1 and V2 swapped.
9660   if (isShuffleEquivalent(V1, V2, Mask, {0, 4, 2, 6}))
9661     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V1, V2);
9662   if (isShuffleEquivalent(V1, V2, Mask, {1, 5, 3, 7}))
9663     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V1, V2);
9664   if (isShuffleEquivalent(V1, V2, Mask, {4, 0, 6, 2}))
9665     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V2, V1);
9666   if (isShuffleEquivalent(V1, V2, Mask, {5, 1, 7, 3}))
9667     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V2, V1);
9668
9669   // Try to simplify this by merging 128-bit lanes to enable a lane-based
9670   // shuffle. However, if we have AVX2 and either inputs are already in place,
9671   // we will be able to shuffle even across lanes the other input in a single
9672   // instruction so skip this pattern.
       // NOTE(review): hasAVX2() is already asserted at the top of this function,
       // so the AVX2 term below looks redundant here; only the "input in place"
       // checks can change the outcome when that assertion holds.
9673   if (!(Subtarget->hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
9674                                  isShuffleMaskInputInPlace(1, Mask))))
9675     if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
9676             DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
9677       return Result;
9678
9679   // Otherwise fall back on generic blend lowering.
9680   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2,
9681                                                     Mask, DAG);
9682 }
9683
9684 /// \brief Handle lowering of 8-lane 32-bit floating point shuffles.
9685 ///
9686 /// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
9687 /// isn't available.
     ///
     /// Strategies are tried in order and the first match is returned: blend,
     /// broadcast, the lane-repeated forms (MOVSLDUP/MOVSHDUP, VPERMILPI, unpacks,
     /// SHUFPS), the single-input forms (VPERMILPV, VPERMV with AVX2, lane permute
     /// + blend), 128-bit lane merging, and finally a decomposed shuffle + blend
     /// (AVX2) or split-or-blend.
     ///
     /// \param Op the shuffle node; it is cast<> to ShuffleVectorSDNode and supplies
     ///        the 8-element mask and the debug location.
     /// \param V1,V2 the two shuffle inputs; both are asserted to be v8f32.
     /// \param Subtarget queried for AVX2 and forwarded to the helpers.
     /// \param DAG the SelectionDAG in which new nodes are created.
     /// \returns the value produced by the first strategy that applies.
9688 static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
9689                                        const X86Subtarget *Subtarget,
9690                                        SelectionDAG &DAG) {
9691   SDLoc DL(Op);
9692   assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
9693   assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
9694   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
9695   ArrayRef<int> Mask = SVOp->getMask();
9696   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
9697
9698   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
9699                                                 Subtarget, DAG))
9700     return Blend;
9701
9702   // Check for being able to broadcast a single element.
9703   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8f32, V1,
9704                                                         Mask, Subtarget, DAG))
9705     return Broadcast;
9706
9707   // If the shuffle mask is repeated in each 128-bit lane, we have many more
9708   // options to efficiently lower the shuffle.
9709   SmallVector<int, 4> RepeatedMask;
9710   if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
9711     assert(RepeatedMask.size() == 4 &&
9712            "Repeated masks must be half the mask width!");
9713
9714     // Use even/odd duplicate instructions for masks that match their pattern.
9715     if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6}))
9716       return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
9717     if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3, 5, 5, 7, 7}))
9718       return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
9719
         // A single-input repeated mask maps onto the immediate form, using the
         // 4-element per-lane mask to build the immediate.
9720     if (isSingleInputShuffleMask(Mask))
9721       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
9722                          getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
9723
9724     // Use dedicated unpack instructions for masks that match their pattern.
         // The last two checks cover the same patterns with V1 and V2 swapped.
9725     if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 1, 9, 4, 12, 5, 13}))
9726       return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f32, V1, V2);
9727     if (isShuffleEquivalent(V1, V2, Mask, {2, 10, 3, 11, 6, 14, 7, 15}))
9728       return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f32, V1, V2);
9729     if (isShuffleEquivalent(V1, V2, Mask, {8, 0, 9, 1, 12, 4, 13, 5}))
9730       return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f32, V2, V1);
9731     if (isShuffleEquivalent(V1, V2, Mask, {10, 2, 11, 3, 14, 6, 15, 7}))
9732       return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f32, V2, V1);
9733
9734     // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
9735     // have already handled any direct blends. We also need to squash the
9736     // repeated mask into a simulated v4f32 mask.
         // Entries >= 8 (which select from V2 in an 8-element shuffle) are reduced
         // by 4; RepeatedMask is modified in place, which is fine because this
         // branch returns immediately afterwards.
9737     for (int i = 0; i < 4; ++i)
9738       if (RepeatedMask[i] >= 8)
9739         RepeatedMask[i] -= 4;
9740     return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
9741   }
9742
9743   // If we have a single input shuffle with different shuffle patterns in the
9744   // two 128-bit lanes use the variable mask to VPERMILPS.
9745   if (isSingleInputShuffleMask(Mask)) {
         // Materialize the mask as i32 constants (undef for negative entries).
         // It is built up front because both the VPERMILPV and the VPERMV paths
         // consume it; it goes unused on the final fallback path.
9746     SDValue VPermMask[8];
9747     for (int i = 0; i < 8; ++i)
9748       VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32)
9749                                  : DAG.getConstant(Mask[i], DL, MVT::i32);
9750     if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))
9751       return DAG.getNode(
9752           X86ISD::VPERMILPV, DL, MVT::v8f32, V1,
9753           DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask));
9754
         // Note the operand order differs from VPERMILPV above: VPERMV is given
         // the mask first and the source second, with the v8i32 mask bitcast to
         // v8f32.
9755     if (Subtarget->hasAVX2())
9756       return DAG.getNode(
9757           X86ISD::VPERMV, DL, MVT::v8f32,
9758           DAG.getBitcast(MVT::v8f32, DAG.getNode(ISD::BUILD_VECTOR, DL,
9759                                                  MVT::v8i32, VPermMask)),
9760           V1);
9761
9762     // Otherwise, fall back.
9763     return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
9764                                                    DAG);
9765   }
9766
9767   // Try to simplify this by merging 128-bit lanes to enable a lane-based
9768   // shuffle.
9769   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
9770           DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
9771     return Result;
9772
9773   // If we have AVX2 then we always want to lower with a blend because at v8 we
9774   // can fully permute the elements.
9775   if (Subtarget->hasAVX2())
9776     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
9777                                                       Mask, DAG);
9778
9779   // Otherwise fall back on generic lowering.
9780   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG);
9781 }
9782
9783 /// \brief Handle lowering of 8-lane 32-bit integer shuffles.
9784 ///
9785 /// This routine is only called when we have AVX2 and thus a reasonable
9786 /// instruction set for v8i32 shuffling.
     ///
     /// Strategies are tried in order and the first match is returned: zero/any
     /// extension, blend, broadcast, the lane-repeated forms (PSHUFD, unpacks),
     /// shift, byte rotate, VPERMV for single-input masks, 128-bit lane merging,
     /// and finally a decomposed shuffle + blend.
     ///
     /// \param Op the shuffle node; it is cast<> to ShuffleVectorSDNode and supplies
     ///        the 8-element mask and the debug location.
     /// \param V1,V2 the two shuffle inputs; both are asserted to be v8i32.
     /// \param Subtarget asserted to have AVX2; also forwarded to the helpers.
     /// \param DAG the SelectionDAG in which new nodes are created.
     /// \returns the value produced by the first strategy that applies.
9787 static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
9788                                        const X86Subtarget *Subtarget,
9789                                        SelectionDAG &DAG) {
9790   SDLoc DL(Op);
9791   assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
9792   assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
9793   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
9794   ArrayRef<int> Mask = SVOp->getMask();
9795   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
9796   assert(Subtarget->hasAVX2() && "We can only lower v8i32 with AVX2!");
9797
9798   // Whenever we can lower this as a zext, that instruction is strictly faster
9799   // than any alternative. It also allows us to fold memory operands into the
9800   // shuffle in many cases.
9801   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2,
9802                                                          Mask, Subtarget, DAG))
9803     return ZExt;
9804
9805   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
9806                                                 Subtarget, DAG))
9807     return Blend;
9808
9809   // Check for being able to broadcast a single element.
9810   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i32, V1,
9811                                                         Mask, Subtarget, DAG))
9812     return Broadcast;
9813
9814   // If the shuffle mask is repeated in each 128-bit lane we can use more
9815   // efficient instructions that mirror the shuffles across the two 128-bit
9816   // lanes.
9817   SmallVector<int, 4> RepeatedMask;
9818   if (is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask)) {
9819     assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
9820     if (isSingleInputShuffleMask(Mask))
9821       return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
9822                          getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
9823
9824     // Use dedicated unpack instructions for masks that match their pattern.
         // The last two checks cover the same patterns with V1 and V2 swapped.
9825     if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 1, 9, 4, 12, 5, 13}))
9826       return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i32, V1, V2);
9827     if (isShuffleEquivalent(V1, V2, Mask, {2, 10, 3, 11, 6, 14, 7, 15}))
9828       return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i32, V1, V2);
9829     if (isShuffleEquivalent(V1, V2, Mask, {8, 0, 9, 1, 12, 4, 13, 5}))
9830       return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i32, V2, V1);
9831     if (isShuffleEquivalent(V1, V2, Mask, {10, 2, 11, 3, 14, 6, 15, 7}))
9832       return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i32, V2, V1);
         // A repeated two-input mask that matches no unpack pattern falls through
         // to the generic strategies below rather than returning here.
9833   }
9834
9835   // Try to use shift instructions.
9836   if (SDValue Shift =
9837           lowerVectorShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, DAG))
9838     return Shift;
9839
       // Try to use byte rotation instructions.
9840   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
9841           DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
9842     return Rotate;
9843
9844   // If the shuffle patterns aren't repeated but it is a single input, directly
9845   // generate a cross-lane VPERMD instruction.
9846   if (isSingleInputShuffleMask(Mask)) {
         // Materialize the mask as i32 constants (undef for negative entries).
         // VPERMV is given the mask first and the source second.
9847     SDValue VPermMask[8];
9848     for (int i = 0; i < 8; ++i)
9849       VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32)
9850                                  : DAG.getConstant(Mask[i], DL, MVT::i32);
9851     return DAG.getNode(
9852         X86ISD::VPERMV, DL, MVT::v8i32,
9853         DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask), V1);
9854   }
9855
9856   // Try to simplify this by merging 128-bit lanes to enable a lane-based
9857   // shuffle.
9858   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
9859           DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
9860     return Result;
9861
9862   // Otherwise fall back on generic blend lowering.
9863   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2,
9864                                                     Mask, DAG);
9865 }
9866
9867 /// \brief Handle lowering of 16-lane 16-bit integer shuffles.
9868 ///
9869 /// This routine is only called when we have AVX2 and thus a reasonable
9870 /// instruction set for v16i16 shuffling.
     ///
     /// Strategies are tried in order and the first match is returned: zero/any
     /// extension, broadcast, blend, unpacks, shift, byte rotate, the single-input
     /// forms (lane permute + blend, the v8i16 single-input lowering, PSHUFB),
     /// 128-bit lane merging, and finally split-or-blend.
     ///
     /// \param Op the shuffle node; it is cast<> to ShuffleVectorSDNode and supplies
     ///        the 16-element mask and the debug location.
     /// \param V1,V2 the two shuffle inputs; both are asserted to be v16i16.
     /// \param Subtarget asserted to have AVX2; also forwarded to the helpers.
     /// \param DAG the SelectionDAG in which new nodes are created.
     /// \returns the value produced by the first strategy that applies.
9871 static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
9872                                         const X86Subtarget *Subtarget,
9873                                         SelectionDAG &DAG) {
9874   SDLoc DL(Op);
9875   assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
9876   assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
9877   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
9878   ArrayRef<int> Mask = SVOp->getMask();
9879   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
9880   assert(Subtarget->hasAVX2() && "We can only lower v16i16 with AVX2!");
9881
9882   // Whenever we can lower this as a zext, that instruction is strictly faster
9883   // than any alternative. It also allows us to fold memory operands into the
9884   // shuffle in many cases.
9885   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v16i16, V1, V2,
9886                                                          Mask, Subtarget, DAG))
9887     return ZExt;
9888
9889   // Check for being able to broadcast a single element.
9890   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i16, V1,
9891                                                         Mask, Subtarget, DAG))
9892     return Broadcast;
9893
9894   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
9895                                                 Subtarget, DAG))
9896     return Blend;
9897
9898   // Use dedicated unpack instructions for masks that match their pattern.
9899   if (isShuffleEquivalent(V1, V2, Mask,
9900                           {// First 128-bit lane:
9901                            0, 16, 1, 17, 2, 18, 3, 19,
9902                            // Second 128-bit lane:
9903                            8, 24, 9, 25, 10, 26, 11, 27}))
9904     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i16, V1, V2);
9905   if (isShuffleEquivalent(V1, V2, Mask,
9906                           {// First 128-bit lane:
9907                            4, 20, 5, 21, 6, 22, 7, 23,
9908                            // Second 128-bit lane:
9909                            12, 28, 13, 29, 14, 30, 15, 31}))
9910     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i16, V1, V2);
9911
9912   // Try to use shift instructions.
9913   if (SDValue Shift =
9914           lowerVectorShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask, DAG))
9915     return Shift;
9916
9917   // Try to use byte rotation instructions.
9918   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
9919           DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
9920     return Rotate;
9921
9922   if (isSingleInputShuffleMask(Mask)) {
9923     // There are no generalized cross-lane shuffle operations available on i16
9924     // element types.
9925     if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
9926       return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
9927                                                      Mask, DAG);
9928
9929     SmallVector<int, 8> RepeatedMask;
9930     if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
9931       // As this is a single-input shuffle, the repeated mask should be
9932       // a strictly valid v8i16 mask that we can pass through to the v8i16
9933       // lowering to handle even the v16 case.
9934       return lowerV8I16GeneralSingleInputVectorShuffle(
9935           DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
9936     }
9937
         // In-lane but not repeated: build a byte-level PSHUFB control vector.
         // Each i16 index M expands to the byte pair {2*M, 2*M+1}. Indices for
         // the upper eight elements are reduced by 8 first, so the byte indices
         // are relative to the start of their own 128-bit lane (the assert below
         // checks that M ends up in [0, 8)).
9938     SDValue PSHUFBMask[32];
9939     for (int i = 0; i < 16; ++i) {
9940       if (Mask[i] == -1) {
9941         PSHUFBMask[2 * i] = PSHUFBMask[2 * i + 1] = DAG.getUNDEF(MVT::i8);
9942         continue;
9943       }
9944
9945       int M = i < 8 ? Mask[i] : Mask[i] - 8;
9946       assert(M >= 0 && M < 8 && "Invalid single-input mask!");
9947       PSHUFBMask[2 * i] = DAG.getConstant(2 * M, DL, MVT::i8);
9948       PSHUFBMask[2 * i + 1] = DAG.getConstant(2 * M + 1, DL, MVT::i8);
9949     }
         // PSHUFB is emitted on the v32i8 view of V1; the result is bitcast back
         // to v16i16.
9950     return DAG.getBitcast(MVT::v16i16,
9951                           DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8,
9952                                       DAG.getBitcast(MVT::v32i8, V1),
9953                                       DAG.getNode(ISD::BUILD_VECTOR, DL,
9954                                                   MVT::v32i8, PSHUFBMask)));
9955   }
9956
9957   // Try to simplify this by merging 128-bit lanes to enable a lane-based
9958   // shuffle.
9959   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
9960           DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
9961     return Result;
9962
9963   // Otherwise fall back on generic lowering.
9964   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG);
9965 }
9966
9967 /// \brief Handle lowering of 32-lane 8-bit integer shuffles.
9968 ///
9969 /// This routine is only called when we have AVX2 and thus a reasonable
9970 /// instruction set for v32i8 shuffling.
     ///
     /// Strategies are tried in order and the first match is returned: zero/any
     /// extension, broadcast, blend, unpacks, shift, byte rotate, the single-input
     /// forms (lane permute + blend, PSHUFB), 128-bit lane merging, and finally
     /// split-or-blend.
     ///
     /// \param Op the shuffle node; it is cast<> to ShuffleVectorSDNode and supplies
     ///        the 32-element mask and the debug location.
     /// \param V1,V2 the two shuffle inputs; both are asserted to be v32i8.
     /// \param Subtarget asserted to have AVX2; also forwarded to the helpers.
     /// \param DAG the SelectionDAG in which new nodes are created.
     /// \returns the value produced by the first strategy that applies.
9971 static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
9972                                        const X86Subtarget *Subtarget,
9973                                        SelectionDAG &DAG) {
9974   SDLoc DL(Op);
9975   assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
9976   assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
9977   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
9978   ArrayRef<int> Mask = SVOp->getMask();
9979   assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
9980   assert(Subtarget->hasAVX2() && "We can only lower v32i8 with AVX2!");
9981
9982   // Whenever we can lower this as a zext, that instruction is strictly faster
9983   // than any alternative. It also allows us to fold memory operands into the
9984   // shuffle in many cases.
9985   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2,
9986                                                          Mask, Subtarget, DAG))
9987     return ZExt;
9988
9989   // Check for being able to broadcast a single element.
9990   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v32i8, V1,
9991                                                         Mask, Subtarget, DAG))
9992     return Broadcast;
9993
9994   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
9995                                                 Subtarget, DAG))
9996     return Blend;
9997
9998   // Use dedicated unpack instructions for masks that match their pattern.
9999   // Note that these are repeated 128-bit lane unpacks, not unpacks across the
10000   // whole 256-bit vector.
10001   if (isShuffleEquivalent(
10002           V1, V2, Mask,
10003           {// First 128-bit lane:
10004            0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39,
10005            // Second 128-bit lane:
10006            16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55}))
10007     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v32i8, V1, V2);
10008   if (isShuffleEquivalent(
10009           V1, V2, Mask,
10010           {// First 128-bit lane:
10011            8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47,
10012            // Second 128-bit lane:
10013            24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63}))
10014     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v32i8, V1, V2);
10015
10016   // Try to use shift instructions.
10017   if (SDValue Shift =
10018           lowerVectorShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask, DAG))
10019     return Shift;
10020
10021   // Try to use byte rotation instructions.
10022   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
10023           DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
10024     return Rotate;
10025
10026   if (isSingleInputShuffleMask(Mask)) {
10027     // There are no generalized cross-lane shuffle operations available on i8
10028     // element types.
10029     if (is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
10030       return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2,
10031                                                      Mask, DAG);
10032
          // Build the PSHUFB control vector: undef for negative entries, otherwise
          // the byte index with 16 subtracted when it is >= 16, so that every
          // control byte is relative to the start of a 128-bit lane. The
          // lane-crossing case was already handled above.
10033     SDValue PSHUFBMask[32];
10034     for (int i = 0; i < 32; ++i)
10035       PSHUFBMask[i] =
10036           Mask[i] < 0
10037               ? DAG.getUNDEF(MVT::i8)
10038               : DAG.getConstant(Mask[i] < 16 ? Mask[i] : Mask[i] - 16, DL,
10039                                 MVT::i8);
10040
10041     return DAG.getNode(
10042         X86ISD::PSHUFB, DL, MVT::v32i8, V1,
10043         DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask));
10044   }
10045
10046   // Try to simplify this by merging 128-bit lanes to enable a lane-based
10047   // shuffle.
10048   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
10049           DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
10050     return Result;
10051
10052   // Otherwise fall back on generic lowering.
10053   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG);
10054 }
10055
10056 /// \brief High-level routine to lower various 256-bit x86 vector shuffles.
10057 ///
10058 /// This routine either breaks down the specific type of a 256-bit x86 vector
10059 /// shuffle or splits it into two 128-bit shuffles and fuses the results back
10060 /// together based on the available instructions.
      ///
      /// \param Op the shuffle node; it is cast<> to ShuffleVectorSDNode and supplies
      ///        the mask and the debug location.
      /// \param V1,V2 the two shuffle inputs.
      /// \param VT the vector type being shuffled; it selects the per-type routine
      ///        in the switch below and must be one of the types listed there.
      /// \param Subtarget queried for AVX2 and forwarded to the helpers.
      /// \param DAG the SelectionDAG in which new nodes are created.
      /// \returns the lowered value.
10061 static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10062                                         MVT VT, const X86Subtarget *Subtarget,
10063                                         SelectionDAG &DAG) {
10064   SDLoc DL(Op);
10065   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10066   ArrayRef<int> Mask = SVOp->getMask();
10067
10068   // If we have a single input to the zero element, insert that into V1 if we
10069   // can do so cheaply.
        // Mask entries >= NumElts select from V2, so this counts how many result
        // elements come from V2.
10070   int NumElts = VT.getVectorNumElements();
10071   int NumV2Elements = std::count_if(Mask.begin(), Mask.end(), [NumElts](int M) {
10072     return M >= NumElts;
10073   });
10074
        // Only attempted when the single V2 element lands in result element 0.
10075   if (NumV2Elements == 1 && Mask[0] >= NumElts)
10076     if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
10077                               DL, VT, V1, V2, Mask, Subtarget, DAG))
10078       return Insertion;
10079
10080   // There is a really nice hard cut-over between AVX1 and AVX2 that means we can
10081   // check for those subtargets here and avoid much of the subtarget querying in
10082   // the per-vector-type lowering routines. With AVX1 we have essentially *zero*
10083   // ability to manipulate a 256-bit vector with integer types. Since we'll use
10084   // floating point types there eventually, just immediately cast everything to
10085   // a float and operate entirely in that domain.
10086   if (VT.isInteger() && !Subtarget->hasAVX2()) {
10087     int ElementBits = VT.getScalarSizeInBits();
10088     if (ElementBits < 32)
10089       // No floating point type available, decompose into 128-bit vectors.
10090       return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
10091
          // Same element width and count, but floating point. A fresh generic
          // shuffle is emitted in that type with the same mask and its result is
          // bitcast back to VT.
10092     MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
10093                                 VT.getVectorNumElements());
10094     V1 = DAG.getBitcast(FpVT, V1);
10095     V2 = DAG.getBitcast(FpVT, V2);
10096     return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
10097   }
10098
        // Dispatch to the per-type routine. Integer types only reach this point
        // when AVX2 is available (see the early return above), which is what the
        // integer routines assert.
10099   switch (VT.SimpleTy) {
10100   case MVT::v4f64:
10101     return lowerV4F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
10102   case MVT::v4i64:
10103     return lowerV4I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
10104   case MVT::v8f32:
10105     return lowerV8F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
10106   case MVT::v8i32:
10107     return lowerV8I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
10108   case MVT::v16i16:
10109     return lowerV16I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
10110   case MVT::v32i8:
10111     return lowerV32I8VectorShuffle(Op, V1, V2, Subtarget, DAG);
10112
10113   default:
10114     llvm_unreachable("Not a valid 256-bit x86 vector type!");
10115   }
10116 }
10117
      /// \brief Try to lower a shuffle as a single X86ISD::VALIGN node.
      ///
      /// Matches masks whose defined entries all satisfy Mask[i] == i + C for one
      /// constant C >= 0; undef (negative) entries are skipped.
      ///
      /// \param VT the vector type; its elements are asserted to be at least 32 bits.
      /// \param Mask the shuffle mask; the first VT.getVectorNumElements() entries
      ///        are examined.
      /// \param V1,V2 the shuffle inputs; they are passed to the node as (V2, V1).
      /// \returns the VALIGN node with C as an i8 immediate, or an empty SDValue if
      ///          the mask does not have the required form.
      ///
      /// NOTE(review): if every examined mask entry is undef, AlignVal is still -1
      /// when the node is built; presumably callers never pass such a mask --
      /// confirm.
10118 static SDValue lowerVectorShuffleWithVALIGN(SDLoc DL, MVT VT,
10119                                             ArrayRef<int> Mask, SDValue V1,
10120                                             SDValue V2, SelectionDAG &DAG) {
10121
10122   assert(VT.getScalarSizeInBits() >= 32 && "Unexpected data type for VALIGN");
10123   // VALIGN pattern 2, 3, 4, 5, .. (sequential, shifted right)
        // AlignVal is the common offset Mask[i] - i; -1 means "not yet seen".
10124   int AlignVal = -1;
10125   for (int i = 0; i < (signed)VT.getVectorNumElements(); ++i) {
10126     if (Mask[i] < 0)
10127       continue;
          // A negative offset cannot be expressed; give up.
10128     if (Mask[i] < i)
10129       return SDValue();
10130     if (AlignVal == -1)
10131       AlignVal = Mask[i] - i;
10132     else if (Mask[i] - i != AlignVal)
10133       return SDValue();
10134   }
10135   // Vector source operands should be swapped
10136   return DAG.getNode(X86ISD::VALIGN, DL, VT, V2, V1,
10137                      DAG.getConstant(AlignVal, DL, MVT::i8));
10138 }
10139
      /// \brief Lower a shuffle with a variable-mask permute node.
      ///
      /// Builds a BUILD_VECTOR of integer constants from Mask (undef for negative
      /// entries), using an integer element type as wide as VT's elements, and
      /// emits X86ISD::VPERMV (mask, V1) when the mask is single-input or
      /// X86ISD::VPERMV3 (mask, V1, V2) otherwise.
      ///
      /// \param VT the vector type; its elements are asserted to be at least 16 bits.
      /// \param Mask the shuffle mask; the first VT.getVectorNumElements() entries
      ///        are used.
      /// \param V1,V2 the shuffle inputs; V2 is only used on the VPERMV3 path.
      /// \returns the permute node; this routine never returns an empty SDValue.
10140 static SDValue lowerVectorShuffleWithPERMV(SDLoc DL, MVT VT,
10141                                            ArrayRef<int> Mask, SDValue V1,
10142                                            SDValue V2, SelectionDAG &DAG) {
10143
10144   assert(VT.getScalarSizeInBits() >= 16 && "Unexpected data type for PERMV");
10145
        // The mask vector has the same element count as VT, with integer elements
        // of the same bit width.
10146   MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
10147   MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements());
10148
10149   SmallVector<SDValue, 32>  VPermMask;
10150   for (unsigned i = 0; i < VT.getVectorNumElements(); ++i)
10151     VPermMask.push_back(Mask[i] < 0 ? DAG.getUNDEF(MaskEltVT) :
10152                         DAG.getConstant(Mask[i], DL,MaskEltVT));
10153   SDValue MaskNode = DAG.getNode(ISD::BUILD_VECTOR, DL, MaskVecVT,
10154                                  VPermMask);
10155   if (isSingleInputShuffleMask(Mask))
10156     return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1);
10157
10158   return DAG.getNode(X86ISD::VPERMV3, DL, VT, MaskNode, V1, V2);
10159 }
10160
10161
10162 /// \brief Handle lowering of 8-lane 64-bit shuffles (v8f64 or v8i64).
      ///
      /// Strategies are tried in order and the first match is returned: UNPCKL /
      /// UNPCKH, VALIGN, SHUFPD, then for single-input masks VPERMILPI (no 128-bit
      /// lane crossing) or VPERMI (mask repeated in each 256-bit half), and finally
      /// the variable-mask permute lowering, which always produces a node.
      ///
      /// \param Op the shuffle node; it is cast<> to ShuffleVectorSDNode and supplies
      ///        the result type, the 8-element mask and the debug location.
      /// \param V1,V2 the two shuffle inputs; each is asserted to be v8f64 or v8i64.
      /// \param Subtarget not used by this routine.
      /// \param DAG the SelectionDAG in which new nodes are created.
      /// \returns the value produced by the first strategy that applies.
10163 static SDValue lowerV8X64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10164                                        const X86Subtarget *Subtarget,
10165                                        SelectionDAG &DAG) {
10166   SDLoc DL(Op);
10167   MVT VT = Op.getSimpleValueType();
10168   assert((V1.getSimpleValueType() == MVT::v8f64 ||
10169           V1.getSimpleValueType() == MVT::v8i64) && "Bad operand type!");
10170   assert((V2.getSimpleValueType() == MVT::v8f64 ||
10171           V2.getSimpleValueType() == MVT::v8i64) && "Bad operand type!");
10172   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10173   ArrayRef<int> Mask = SVOp->getMask();
10174   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
10175
10176   // X86 has dedicated unpack instructions that can handle specific blend
10177   // operations: UNPCKH and UNPCKL.
10178   if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 2, 10, 4, 12, 6, 14}))
10179     return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
10180   if (isShuffleEquivalent(V1, V2, Mask, {1, 9, 3, 11, 5, 13, 7, 15}))
10181     return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
10182
        // Note: the local `Op` in the next two conditions shadows the `Op`
        // parameter within each if-statement.
10183   if (SDValue Op = lowerVectorShuffleWithVALIGN(DL, VT, Mask, V1, V2, DAG))
10184     return Op;
10185
10186   if (SDValue Op = lowerVectorShuffleWithSHUFPD(DL, VT, Mask, V1, V2, DAG))
10187     return Op;
10188
10189   // PERMILPD instruction - mask 0/1, 0/1, 2/3, 2/3, 4/5, 4/5, 6/7, 6/7
10190   if (isSingleInputShuffleMask(Mask)) {
10191     if (!is128BitLaneCrossingShuffleMask(VT, Mask))
10192       return DAG.getNode(X86ISD::VPERMILPI, DL, VT, V1,
10193                          get1bitLaneShuffleImm8ForMask(Mask, DL, DAG));
10194
          // Lane-crossing, but the same 4-element pattern in each 256-bit half:
          // use the immediate-controlled VPERMI.
10195     SmallVector<int, 4> RepeatedMask;
10196     if (is256BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
10197       return DAG.getNode(X86ISD::VPERMI, DL, VT, V1,
10198                          getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
10199   }
        // Everything else (including single-input masks not handled above) goes
        // through the variable-mask permute.
10200   return lowerVectorShuffleWithPERMV(DL, VT, Mask, V1, V2, DAG);
10201 }
10202
10203 /// \brief Handle lowering of 16-lane 32-bit integer shuffles.
10204 static SDValue lowerV16X32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10205                                        const X86Subtarget *Subtarget,
10206                                        SelectionDAG &DAG) {
10207   MVT VT = Op.getSimpleValueType();
10208   SDLoc DL(Op);
10209   assert((V1.getSimpleValueType() == MVT::v16i32 ||
10210           V1.getSimpleValueType() == MVT::v16f32) && "Bad operand type!");
10211   assert((V2.getSimpleValueType() == MVT::v16i32 ||
10212           V2.getSimpleValueType() == MVT::v16f32) && "Bad operand type!");
10213   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10214   ArrayRef<int> Mask = SVOp->getMask();
10215   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
10216
10217   // Use dedicated unpack instructions for masks that match their pattern.
10218   if (isShuffleEquivalent(V1, V2, Mask,
10219                           {// First 128-bit lane.
10220                            0, 16, 1, 17, 4, 20, 5, 21,
10221                            // Second 128-bit lane.
10222                            8, 24, 9, 25, 12, 28, 13, 29}))
10223     return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
10224   if (isShuffleEquivalent(V1, V2, Mask,
10225                           {// First 128-bit lane.
10226                            2, 18, 3, 19, 6, 22, 7, 23,
10227                            // Second 128-bit lane.
10228                            10, 26, 11, 27, 14, 30, 15, 31}))
10229     return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
10230
10231   if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10,
10232                                          12, 12, 14, 14}))
10233     return DAG.getNode(X86ISD::MOVSLDUP, DL, VT, V1);
10234   if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11,
10235                                          13, 13, 15, 15}))
10236     return DAG.getNode(X86ISD::MOVSHDUP, DL, VT, V1);
10237
10238   SmallVector<int, 4> RepeatedMask;
10239   if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask)) {
10240     if (isSingleInputShuffleMask(Mask)) {
10241       unsigned Opc = VT.isInteger() ? X86ISD::PSHUFD : X86ISD::VPERMILPI;
10242       return DAG.getNode(Opc, DL, VT, V1,
10243                          getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
10244     }
10245
10246     for (int i = 0; i < 4; ++i)
10247       if (RepeatedMask[i] >= 16)
10248         RepeatedMask[i] -= 12;
10249      return lowerVectorShuffleWithSHUFPS(DL, VT, RepeatedMask, V1, V2, DAG);
10250   }
10251
10252   if (SDValue Op = lowerVectorShuffleWithVALIGN(DL, VT, Mask, V1, V2, DAG))
10253     return Op;
10254
10255   return lowerVectorShuffleWithPERMV(DL, VT, Mask, V1, V2, DAG);
10256 }
10257
10258 /// \brief Handle lowering of 32-lane 16-bit integer shuffles.
10259 static SDValue lowerV32I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10260                                         const X86Subtarget *Subtarget,
10261                                         SelectionDAG &DAG) {
10262   SDLoc DL(Op);
10263   assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
10264   assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
10265   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10266   ArrayRef<int> Mask = SVOp->getMask();
10267   assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
10268   assert(Subtarget->hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
10269
10270   // FIXME: Implement direct support for this type!
10271   return splitAndLowerVectorShuffle(DL, MVT::v32i16, V1, V2, Mask, DAG);
10272 }
10273
10274 /// \brief Handle lowering of 64-lane 8-bit integer shuffles.
10275 static SDValue lowerV64I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10276                                        const X86Subtarget *Subtarget,
10277                                        SelectionDAG &DAG) {
10278   SDLoc DL(Op);
10279   assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
10280   assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
10281   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10282   ArrayRef<int> Mask = SVOp->getMask();
10283   assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
10284   assert(Subtarget->hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
10285
10286   // FIXME: Implement direct support for this type!
10287   return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
10288 }
10289
10290 /// \brief High-level routine to lower various 512-bit x86 vector shuffles.
10291 ///
10292 /// This routine either breaks down the specific type of a 512-bit x86 vector
10293 /// shuffle or splits it into two 256-bit shuffles and fuses the results back
10294 /// together based on the available instructions.
10295 static SDValue lower512BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10296                                         MVT VT, const X86Subtarget *Subtarget,
10297                                         SelectionDAG &DAG) {
10298   SDLoc DL(Op);
10299   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10300   ArrayRef<int> Mask = SVOp->getMask();
10301   assert(Subtarget->hasAVX512() &&
10302          "Cannot lower 512-bit vectors w/ basic ISA!");
10303
10304   // Check for being able to broadcast a single element.
10305   if (SDValue Broadcast =
10306           lowerVectorShuffleAsBroadcast(DL, VT, V1, Mask, Subtarget, DAG))
10307     return Broadcast;
10308
10309   // Dispatch to each element type for lowering. If we don't have supprot for
10310   // specific element type shuffles at 512 bits, immediately split them and
10311   // lower them. Each lowering routine of a given type is allowed to assume that
10312   // the requisite ISA extensions for that element type are available.
10313   switch (VT.SimpleTy) {
10314   case MVT::v8f64:
10315   case MVT::v8i64:
10316     return lowerV8X64VectorShuffle(Op, V1, V2, Subtarget, DAG);
10317   case MVT::v16f32:
10318   case MVT::v16i32:
10319     return lowerV16X32VectorShuffle(Op, V1, V2, Subtarget, DAG);
10320   case MVT::v32i16:
10321     if (Subtarget->hasBWI())
10322       return lowerV32I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
10323     break;
10324   case MVT::v64i8:
10325     if (Subtarget->hasBWI())
10326       return lowerV64I8VectorShuffle(Op, V1, V2, Subtarget, DAG);
10327     break;
10328
10329   default:
10330     llvm_unreachable("Not a valid 512-bit x86 vector type!");
10331   }
10332
10333   // Otherwise fall back on splitting.
10334   return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
10335 }
10336
10337 /// \brief Top-level lowering for x86 vector shuffles.
10338 ///
10339 /// This handles decomposition, canonicalization, and lowering of all x86
10340 /// vector shuffles. Most of the specific lowering strategies are encapsulated
10341 /// above in helper routines. The canonicalization attempts to widen shuffles
10342 /// to involve fewer lanes of wider elements, consolidate symmetric patterns
10343 /// s.t. only one of the two inputs needs to be tested, etc.
10344 static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
10345                                   SelectionDAG &DAG) {
10346   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10347   ArrayRef<int> Mask = SVOp->getMask();
10348   SDValue V1 = Op.getOperand(0);
10349   SDValue V2 = Op.getOperand(1);
10350   MVT VT = Op.getSimpleValueType();
10351   int NumElements = VT.getVectorNumElements();
10352   SDLoc dl(Op);
10353
10354   assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles");
10355
10356   bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
10357   bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
10358   if (V1IsUndef && V2IsUndef)
10359     return DAG.getUNDEF(VT);
10360
10361   // When we create a shuffle node we put the UNDEF node to second operand,
10362   // but in some cases the first operand may be transformed to UNDEF.
10363   // In this case we should just commute the node.
10364   if (V1IsUndef)
10365     return DAG.getCommutedVectorShuffle(*SVOp);
10366
10367   // Check for non-undef masks pointing at an undef vector and make the masks
10368   // undef as well. This makes it easier to match the shuffle based solely on
10369   // the mask.
10370   if (V2IsUndef)
10371     for (int M : Mask)
10372       if (M >= NumElements) {
10373         SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
10374         for (int &M : NewMask)
10375           if (M >= NumElements)
10376             M = -1;
10377         return DAG.getVectorShuffle(VT, dl, V1, V2, NewMask);
10378       }
10379
10380   // We actually see shuffles that are entirely re-arrangements of a set of
10381   // zero inputs. This mostly happens while decomposing complex shuffles into
10382   // simple ones. Directly lower these as a buildvector of zeros.
10383   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
10384   if (Zeroable.all())
10385     return getZeroVector(VT, Subtarget, DAG, dl);
10386
10387   // Try to collapse shuffles into using a vector type with fewer elements but
10388   // wider element types. We cap this to not form integers or floating point
10389   // elements wider than 64 bits, but it might be interesting to form i128
10390   // integers to handle flipping the low and high halves of AVX 256-bit vectors.
10391   SmallVector<int, 16> WidenedMask;
10392   if (VT.getScalarSizeInBits() < 64 &&
10393       canWidenShuffleElements(Mask, WidenedMask)) {
10394     MVT NewEltVT = VT.isFloatingPoint()
10395                        ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
10396                        : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
10397     MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
10398     // Make sure that the new vector type is legal. For example, v2f64 isn't
10399     // legal on SSE1.
10400     if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
10401       V1 = DAG.getBitcast(NewVT, V1);
10402       V2 = DAG.getBitcast(NewVT, V2);
10403       return DAG.getBitcast(
10404           VT, DAG.getVectorShuffle(NewVT, dl, V1, V2, WidenedMask));
10405     }
10406   }
10407
10408   int NumV1Elements = 0, NumUndefElements = 0, NumV2Elements = 0;
10409   for (int M : SVOp->getMask())
10410     if (M < 0)
10411       ++NumUndefElements;
10412     else if (M < NumElements)
10413       ++NumV1Elements;
10414     else
10415       ++NumV2Elements;
10416
10417   // Commute the shuffle as needed such that more elements come from V1 than
10418   // V2. This allows us to match the shuffle pattern strictly on how many
10419   // elements come from V1 without handling the symmetric cases.
10420   if (NumV2Elements > NumV1Elements)
10421     return DAG.getCommutedVectorShuffle(*SVOp);
10422
10423   // When the number of V1 and V2 elements are the same, try to minimize the
10424   // number of uses of V2 in the low half of the vector. When that is tied,
10425   // ensure that the sum of indices for V1 is equal to or lower than the sum
10426   // indices for V2. When those are equal, try to ensure that the number of odd
10427   // indices for V1 is lower than the number of odd indices for V2.
10428   if (NumV1Elements == NumV2Elements) {
10429     int LowV1Elements = 0, LowV2Elements = 0;
10430     for (int M : SVOp->getMask().slice(0, NumElements / 2))
10431       if (M >= NumElements)
10432         ++LowV2Elements;
10433       else if (M >= 0)
10434         ++LowV1Elements;
10435     if (LowV2Elements > LowV1Elements) {
10436       return DAG.getCommutedVectorShuffle(*SVOp);
10437     } else if (LowV2Elements == LowV1Elements) {
10438       int SumV1Indices = 0, SumV2Indices = 0;
10439       for (int i = 0, Size = SVOp->getMask().size(); i < Size; ++i)
10440         if (SVOp->getMask()[i] >= NumElements)
10441           SumV2Indices += i;
10442         else if (SVOp->getMask()[i] >= 0)
10443           SumV1Indices += i;
10444       if (SumV2Indices < SumV1Indices) {
10445         return DAG.getCommutedVectorShuffle(*SVOp);
10446       } else if (SumV2Indices == SumV1Indices) {
10447         int NumV1OddIndices = 0, NumV2OddIndices = 0;
10448         for (int i = 0, Size = SVOp->getMask().size(); i < Size; ++i)
10449           if (SVOp->getMask()[i] >= NumElements)
10450             NumV2OddIndices += i % 2;
10451           else if (SVOp->getMask()[i] >= 0)
10452             NumV1OddIndices += i % 2;
10453         if (NumV2OddIndices < NumV1OddIndices)
10454           return DAG.getCommutedVectorShuffle(*SVOp);
10455       }
10456     }
10457   }
10458
10459   // For each vector width, delegate to a specialized lowering routine.
10460   if (VT.getSizeInBits() == 128)
10461     return lower128BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
10462
10463   if (VT.getSizeInBits() == 256)
10464     return lower256BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
10465
10466   // Force AVX-512 vectors to be scalarized for now.
10467   // FIXME: Implement AVX-512 support!
10468   if (VT.getSizeInBits() == 512)
10469     return lower512BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
10470
10471   llvm_unreachable("Unimplemented!");
10472 }
10473
10474 // This function assumes its argument is a BUILD_VECTOR of constants or
10475 // undef SDNodes. i.e: ISD::isBuildVectorOfConstantSDNodes(BuildVector) is
10476 // true.
10477 static bool BUILD_VECTORtoBlendMask(BuildVectorSDNode *BuildVector,
10478                                     unsigned &MaskValue) {
10479   MaskValue = 0;
10480   unsigned NumElems = BuildVector->getNumOperands();
10481   // There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
10482   unsigned NumLanes = (NumElems - 1) / 8 + 1;
10483   unsigned NumElemsInLane = NumElems / NumLanes;
10484
10485   // Blend for v16i16 should be symetric for the both lanes.
10486   for (unsigned i = 0; i < NumElemsInLane; ++i) {
10487     SDValue EltCond = BuildVector->getOperand(i);
10488     SDValue SndLaneEltCond =
10489         (NumLanes == 2) ? BuildVector->getOperand(i + NumElemsInLane) : EltCond;
10490
10491     int Lane1Cond = -1, Lane2Cond = -1;
10492     if (isa<ConstantSDNode>(EltCond))
10493       Lane1Cond = !isZero(EltCond);
10494     if (isa<ConstantSDNode>(SndLaneEltCond))
10495       Lane2Cond = !isZero(SndLaneEltCond);
10496
10497     if (Lane1Cond == Lane2Cond || Lane2Cond < 0)
10498       // Lane1Cond != 0, means we want the first argument.
10499       // Lane1Cond == 0, means we want the second argument.
10500       // The encoding of this argument is 0 for the first argument, 1
10501       // for the second. Therefore, invert the condition.
10502       MaskValue |= !Lane1Cond << i;
10503     else if (Lane1Cond < 0)
10504       MaskValue |= !Lane2Cond << i;
10505     else
10506       return false;
10507   }
10508   return true;
10509 }
10510
10511 /// \brief Try to lower a VSELECT instruction to a vector shuffle.
10512 static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
10513                                            const X86Subtarget *Subtarget,
10514                                            SelectionDAG &DAG) {
10515   SDValue Cond = Op.getOperand(0);
10516   SDValue LHS = Op.getOperand(1);
10517   SDValue RHS = Op.getOperand(2);
10518   SDLoc dl(Op);
10519   MVT VT = Op.getSimpleValueType();
10520
10521   if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
10522     return SDValue();
10523   auto *CondBV = cast<BuildVectorSDNode>(Cond);
10524
10525   // Only non-legal VSELECTs reach this lowering, convert those into generic
10526   // shuffles and re-use the shuffle lowering path for blends.
10527   SmallVector<int, 32> Mask;
10528   for (int i = 0, Size = VT.getVectorNumElements(); i < Size; ++i) {
10529     SDValue CondElt = CondBV->getOperand(i);
10530     Mask.push_back(
10531         isa<ConstantSDNode>(CondElt) ? i + (isZero(CondElt) ? Size : 0) : -1);
10532   }
10533   return DAG.getVectorShuffle(VT, dl, LHS, RHS, Mask);
10534 }
10535
10536 SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
10537   // A vselect where all conditions and data are constants can be optimized into
10538   // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
10539   if (ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(0).getNode()) &&
10540       ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(1).getNode()) &&
10541       ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode()))
10542     return SDValue();
10543
10544   // Try to lower this to a blend-style vector shuffle. This can handle all
10545   // constant condition cases.
10546   if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
10547     return BlendOp;
10548
10549   // Variable blends are only legal from SSE4.1 onward.
10550   if (!Subtarget->hasSSE41())
10551     return SDValue();
10552
10553   // Only some types will be legal on some subtargets. If we can emit a legal
10554   // VSELECT-matching blend, return Op, and but if we need to expand, return
10555   // a null value.
10556   switch (Op.getSimpleValueType().SimpleTy) {
10557   default:
10558     // Most of the vector types have blends past SSE4.1.
10559     return Op;
10560
10561   case MVT::v32i8:
10562     // The byte blends for AVX vectors were introduced only in AVX2.
10563     if (Subtarget->hasAVX2())
10564       return Op;
10565
10566     return SDValue();
10567
10568   case MVT::v8i16:
10569   case MVT::v16i16:
10570     // AVX-512 BWI and VLX features support VSELECT with i16 elements.
10571     if (Subtarget->hasBWI() && Subtarget->hasVLX())
10572       return Op;
10573
10574     // FIXME: We should custom lower this by fixing the condition and using i8
10575     // blends.
10576     return SDValue();
10577   }
10578 }
10579
10580 static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
10581   MVT VT = Op.getSimpleValueType();
10582   SDLoc dl(Op);
10583
10584   if (!Op.getOperand(0).getSimpleValueType().is128BitVector())
10585     return SDValue();
10586
10587   if (VT.getSizeInBits() == 8) {
10588     SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
10589                                   Op.getOperand(0), Op.getOperand(1));
10590     SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
10591                                   DAG.getValueType(VT));
10592     return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
10593   }
10594
10595   if (VT.getSizeInBits() == 16) {
10596     unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
10597     // If Idx is 0, it's cheaper to do a move instead of a pextrw.
10598     if (Idx == 0)
10599       return DAG.getNode(
10600           ISD::TRUNCATE, dl, MVT::i16,
10601           DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
10602                       DAG.getBitcast(MVT::v4i32, Op.getOperand(0)),
10603                       Op.getOperand(1)));
10604     SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
10605                                   Op.getOperand(0), Op.getOperand(1));
10606     SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
10607                                   DAG.getValueType(VT));
10608     return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
10609   }
10610
10611   if (VT == MVT::f32) {
10612     // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
10613     // the result back to FR32 register. It's only worth matching if the
10614     // result has a single use which is a store or a bitcast to i32.  And in
10615     // the case of a store, it's not worth it if the index is a constant 0,
10616     // because a MOVSSmr can be used instead, which is smaller and faster.
10617     if (!Op.hasOneUse())
10618       return SDValue();
10619     SDNode *User = *Op.getNode()->use_begin();
10620     if ((User->getOpcode() != ISD::STORE ||
10621          (isa<ConstantSDNode>(Op.getOperand(1)) &&
10622           cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) &&
10623         (User->getOpcode() != ISD::BITCAST ||
10624          User->getValueType(0) != MVT::i32))
10625       return SDValue();
10626     SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
10627                                   DAG.getBitcast(MVT::v4i32, Op.getOperand(0)),
10628                                   Op.getOperand(1));
10629     return DAG.getBitcast(MVT::f32, Extract);
10630   }
10631
10632   if (VT == MVT::i32 || VT == MVT::i64) {
10633     // ExtractPS/pextrq works with constant index.
10634     if (isa<ConstantSDNode>(Op.getOperand(1)))
10635       return Op;
10636   }
10637   return SDValue();
10638 }
10639
10640 /// Extract one bit from mask vector, like v16i1 or v8i1.
10641 /// AVX-512 feature.
10642 SDValue
10643 X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const {
10644   SDValue Vec = Op.getOperand(0);
10645   SDLoc dl(Vec);
10646   MVT VecVT = Vec.getSimpleValueType();
10647   SDValue Idx = Op.getOperand(1);
10648   MVT EltVT = Op.getSimpleValueType();
10649
10650   assert((EltVT == MVT::i1) && "Unexpected operands in ExtractBitFromMaskVector");
10651   assert((VecVT.getVectorNumElements() <= 16 || Subtarget->hasBWI()) &&
10652          "Unexpected vector type in ExtractBitFromMaskVector");
10653
10654   // variable index can't be handled in mask registers,
10655   // extend vector to VR512
10656   if (!isa<ConstantSDNode>(Idx)) {
10657     MVT ExtVT = (VecVT == MVT::v8i1 ?  MVT::v8i64 : MVT::v16i32);
10658     SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Vec);
10659     SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
10660                               ExtVT.getVectorElementType(), Ext, Idx);
10661     return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
10662   }
10663
10664   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
10665   const TargetRegisterClass* rc = getRegClassFor(VecVT);
10666   if (!Subtarget->hasDQI() && (VecVT.getVectorNumElements() <= 8))
10667     rc = getRegClassFor(MVT::v16i1);
10668   unsigned MaxSift = rc->getSize()*8 - 1;
10669   Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
10670                     DAG.getConstant(MaxSift - IdxVal, dl, MVT::i8));
10671   Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec,
10672                     DAG.getConstant(MaxSift, dl, MVT::i8));
10673   return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i1, Vec,
10674                        DAG.getIntPtrConstant(0, dl));
10675 }
10676
10677 SDValue
10678 X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
10679                                            SelectionDAG &DAG) const {
10680   SDLoc dl(Op);
10681   SDValue Vec = Op.getOperand(0);
10682   MVT VecVT = Vec.getSimpleValueType();
10683   SDValue Idx = Op.getOperand(1);
10684
10685   if (Op.getSimpleValueType() == MVT::i1)
10686     return ExtractBitFromMaskVector(Op, DAG);
10687
10688   if (!isa<ConstantSDNode>(Idx)) {
10689     if (VecVT.is512BitVector() ||
10690         (VecVT.is256BitVector() && Subtarget->hasInt256() &&
10691          VecVT.getVectorElementType().getSizeInBits() == 32)) {
10692
10693       MVT MaskEltVT =
10694         MVT::getIntegerVT(VecVT.getVectorElementType().getSizeInBits());
10695       MVT MaskVT = MVT::getVectorVT(MaskEltVT, VecVT.getSizeInBits() /
10696                                     MaskEltVT.getSizeInBits());
10697
10698       Idx = DAG.getZExtOrTrunc(Idx, dl, MaskEltVT);
10699       SDValue Mask = DAG.getNode(X86ISD::VINSERT, dl, MaskVT,
10700                                 getZeroVector(MaskVT, Subtarget, DAG, dl),
10701                                 Idx, DAG.getConstant(0, dl, getPointerTy()));
10702       SDValue Perm = DAG.getNode(X86ISD::VPERMV, dl, VecVT, Mask, Vec);
10703       return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(),
10704                         Perm, DAG.getConstant(0, dl, getPointerTy()));
10705     }
10706     return SDValue();
10707   }
10708
10709   // If this is a 256-bit vector result, first extract the 128-bit vector and
10710   // then extract the element from the 128-bit vector.
10711   if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
10712
10713     unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
10714     // Get the 128-bit vector.
10715     Vec = Extract128BitVector(Vec, IdxVal, DAG, dl);
10716     MVT EltVT = VecVT.getVectorElementType();
10717
10718     unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
10719
10720     //if (IdxVal >= NumElems/2)
10721     //  IdxVal -= NumElems/2;
10722     IdxVal -= (IdxVal/ElemsPerChunk)*ElemsPerChunk;
10723     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
10724                        DAG.getConstant(IdxVal, dl, MVT::i32));
10725   }
10726
10727   assert(VecVT.is128BitVector() && "Unexpected vector length");
10728
10729   if (Subtarget->hasSSE41()) {
10730     SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG);
10731     if (Res.getNode())
10732       return Res;
10733   }
10734
10735   MVT VT = Op.getSimpleValueType();
10736   // TODO: handle v16i8.
10737   if (VT.getSizeInBits() == 16) {
10738     SDValue Vec = Op.getOperand(0);
10739     unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
10740     if (Idx == 0)
10741       return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
10742                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
10743                                      DAG.getBitcast(MVT::v4i32, Vec),
10744                                      Op.getOperand(1)));
10745     // Transform it so it match pextrw which produces a 32-bit result.
10746     MVT EltVT = MVT::i32;
10747     SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT,
10748                                   Op.getOperand(0), Op.getOperand(1));
10749     SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract,
10750                                   DAG.getValueType(VT));
10751     return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
10752   }
10753
10754   if (VT.getSizeInBits() == 32) {
10755     unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
10756     if (Idx == 0)
10757       return Op;
10758
10759     // SHUFPS the element to the lowest double word, then movss.
10760     int Mask[4] = { static_cast<int>(Idx), -1, -1, -1 };
10761     MVT VVT = Op.getOperand(0).getSimpleValueType();
10762     SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
10763                                        DAG.getUNDEF(VVT), Mask);
10764     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
10765                        DAG.getIntPtrConstant(0, dl));
10766   }
10767
10768   if (VT.getSizeInBits() == 64) {
10769     // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
10770     // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
10771     //        to match extract_elt for f64.
10772     unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
10773     if (Idx == 0)
10774       return Op;
10775
10776     // UNPCKHPD the element to the lowest double word, then movsd.
10777     // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
10778     // to a f64mem, the whole operation is folded into a single MOVHPDmr.
10779     int Mask[2] = { 1, -1 };
10780     MVT VVT = Op.getOperand(0).getSimpleValueType();
10781     SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
10782                                        DAG.getUNDEF(VVT), Mask);
10783     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
10784                        DAG.getIntPtrConstant(0, dl));
10785   }
10786
10787   return SDValue();
10788 }
10789
10790 /// Insert one bit to mask vector, like v16i1 or v8i1.
10791 /// AVX-512 feature.
10792 SDValue
10793 X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
10794   SDLoc dl(Op);
10795   SDValue Vec = Op.getOperand(0);
10796   SDValue Elt = Op.getOperand(1);
10797   SDValue Idx = Op.getOperand(2);
10798   MVT VecVT = Vec.getSimpleValueType();
10799
10800   if (!isa<ConstantSDNode>(Idx)) {
10801     // Non constant index. Extend source and destination,
10802     // insert element and then truncate the result.
10803     MVT ExtVecVT = (VecVT == MVT::v8i1 ?  MVT::v8i64 : MVT::v16i32);
10804     MVT ExtEltVT = (VecVT == MVT::v8i1 ?  MVT::i64 : MVT::i32);
10805     SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
10806       DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec),
10807       DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx);
10808     return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
10809   }
10810
10811   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
10812   SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt);
10813   if (IdxVal)
10814     EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
10815                            DAG.getConstant(IdxVal, dl, MVT::i8));
10816   if (Vec.getOpcode() == ISD::UNDEF)
10817     return EltInVec;
10818   return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
10819 }
10820
10821 SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
10822                                                   SelectionDAG &DAG) const {
10823   MVT VT = Op.getSimpleValueType();
10824   MVT EltVT = VT.getVectorElementType();
10825
10826   if (EltVT == MVT::i1)
10827     return InsertBitToMaskVector(Op, DAG);
10828
10829   SDLoc dl(Op);
10830   SDValue N0 = Op.getOperand(0);
10831   SDValue N1 = Op.getOperand(1);
10832   SDValue N2 = Op.getOperand(2);
10833   if (!isa<ConstantSDNode>(N2))
10834     return SDValue();
10835   auto *N2C = cast<ConstantSDNode>(N2);
10836   unsigned IdxVal = N2C->getZExtValue();
10837
10838   // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
10839   // into that, and then insert the subvector back into the result.
10840   if (VT.is256BitVector() || VT.is512BitVector()) {
10841     // With a 256-bit vector, we can insert into the zero element efficiently
10842     // using a blend if we have AVX or AVX2 and the right data type.
10843     if (VT.is256BitVector() && IdxVal == 0) {
10844       // TODO: It is worthwhile to cast integer to floating point and back
10845       // and incur a domain crossing penalty if that's what we'll end up
10846       // doing anyway after extracting to a 128-bit vector.
10847       if ((Subtarget->hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
10848           (Subtarget->hasAVX2() && EltVT == MVT::i32)) {
10849         SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
10850         N2 = DAG.getIntPtrConstant(1, dl);
10851         return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, N2);
10852       }
10853     }
10854
10855     // Get the desired 128-bit vector chunk.
10856     SDValue V = Extract128BitVector(N0, IdxVal, DAG, dl);
10857
10858     // Insert the element into the desired chunk.
10859     unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
10860     unsigned IdxIn128 = IdxVal - (IdxVal / NumEltsIn128) * NumEltsIn128;
10861
10862     V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
10863                     DAG.getConstant(IdxIn128, dl, MVT::i32));
10864
10865     // Insert the changed part back into the bigger vector
10866     return Insert128BitVector(N0, V, IdxVal, DAG, dl);
10867   }
10868   assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
10869
10870   if (Subtarget->hasSSE41()) {
10871     if (EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) {
10872       unsigned Opc;
10873       if (VT == MVT::v8i16) {
10874         Opc = X86ISD::PINSRW;
10875       } else {
10876         assert(VT == MVT::v16i8);
10877         Opc = X86ISD::PINSRB;
10878       }
10879
      // Transform it so it matches pinsr{b,w}, which expects a GR32 as its
      // second argument.
10882       if (N1.getValueType() != MVT::i32)
10883         N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
10884       if (N2.getValueType() != MVT::i32)
10885         N2 = DAG.getIntPtrConstant(IdxVal, dl);
10886       return DAG.getNode(Opc, dl, VT, N0, N1, N2);
10887     }
10888
10889     if (EltVT == MVT::f32) {
10890       // Bits [7:6] of the constant are the source select. This will always be
10891       //   zero here. The DAG Combiner may combine an extract_elt index into
10892       //   these bits. For example (insert (extract, 3), 2) could be matched by
10893       //   putting the '3' into bits [7:6] of X86ISD::INSERTPS.
10894       // Bits [5:4] of the constant are the destination select. This is the
10895       //   value of the incoming immediate.
10896       // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
10897       //   combine either bitwise AND or insert of float 0.0 to set these bits.
10898
10899       const Function *F = DAG.getMachineFunction().getFunction();
10900       bool MinSize = F->hasFnAttribute(Attribute::MinSize);
10901       if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
10902         // If this is an insertion of 32-bits into the low 32-bits of
10903         // a vector, we prefer to generate a blend with immediate rather
10904         // than an insertps. Blends are simpler operations in hardware and so
10905         // will always have equal or better performance than insertps.
10906         // But if optimizing for size and there's a load folding opportunity,
10907         // generate insertps because blendps does not have a 32-bit memory
10908         // operand form.
10909         N2 = DAG.getIntPtrConstant(1, dl);
10910         N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
10911         return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, N2);
10912       }
10913       N2 = DAG.getIntPtrConstant(IdxVal << 4, dl);
10914       // Create this as a scalar to vector..
10915       N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
10916       return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
10917     }
10918
10919     if (EltVT == MVT::i32 || EltVT == MVT::i64) {
10920       // PINSR* works with constant index.
10921       return Op;
10922     }
10923   }
10924
10925   if (EltVT == MVT::i8)
10926     return SDValue();
10927
10928   if (EltVT.getSizeInBits() == 16) {
    // Transform it so it matches pinsrw, which expects a 16-bit value in a
    // GR32 as its second argument.
10931     if (N1.getValueType() != MVT::i32)
10932       N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
10933     if (N2.getValueType() != MVT::i32)
10934       N2 = DAG.getIntPtrConstant(IdxVal, dl);
10935     return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2);
10936   }
10937   return SDValue();
10938 }
10939
10940 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
10941   SDLoc dl(Op);
10942   MVT OpVT = Op.getSimpleValueType();
10943
10944   // If this is a 256-bit vector result, first insert into a 128-bit
10945   // vector and then insert into the 256-bit vector.
10946   if (!OpVT.is128BitVector()) {
10947     // Insert into a 128-bit vector.
10948     unsigned SizeFactor = OpVT.getSizeInBits()/128;
10949     MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
10950                                  OpVT.getVectorNumElements() / SizeFactor);
10951
10952     Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
10953
10954     // Insert the 128-bit vector.
10955     return Insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
10956   }
10957
10958   if (OpVT == MVT::v1i64 &&
10959       Op.getOperand(0).getValueType() == MVT::i64)
10960     return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0));
10961
10962   SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
10963   assert(OpVT.is128BitVector() && "Expected an SSE type!");
10964   return DAG.getBitcast(
10965       OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
10966 }
10967
10968 // Lower a node with an EXTRACT_SUBVECTOR opcode.  This may result in
10969 // a simple subregister reference or explicit instructions to grab
10970 // upper bits of a vector.
10971 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
10972                                       SelectionDAG &DAG) {
10973   SDLoc dl(Op);
10974   SDValue In =  Op.getOperand(0);
10975   SDValue Idx = Op.getOperand(1);
10976   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
10977   MVT ResVT   = Op.getSimpleValueType();
10978   MVT InVT    = In.getSimpleValueType();
10979
10980   if (Subtarget->hasFp256()) {
10981     if (ResVT.is128BitVector() &&
10982         (InVT.is256BitVector() || InVT.is512BitVector()) &&
10983         isa<ConstantSDNode>(Idx)) {
10984       return Extract128BitVector(In, IdxVal, DAG, dl);
10985     }
10986     if (ResVT.is256BitVector() && InVT.is512BitVector() &&
10987         isa<ConstantSDNode>(Idx)) {
10988       return Extract256BitVector(In, IdxVal, DAG, dl);
10989     }
10990   }
10991   return SDValue();
10992 }
10993
10994 // Lower a node with an INSERT_SUBVECTOR opcode.  This may result in a
10995 // simple superregister reference or explicit instructions to insert
10996 // the upper bits of a vector.
10997 static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
10998                                      SelectionDAG &DAG) {
10999   if (!Subtarget->hasAVX())
11000     return SDValue();
11001
11002   SDLoc dl(Op);
11003   SDValue Vec = Op.getOperand(0);
11004   SDValue SubVec = Op.getOperand(1);
11005   SDValue Idx = Op.getOperand(2);
11006
11007   if (!isa<ConstantSDNode>(Idx))
11008     return SDValue();
11009
11010   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
11011   MVT OpVT = Op.getSimpleValueType();
11012   MVT SubVecVT = SubVec.getSimpleValueType();
11013
11014   // Fold two 16-byte subvector loads into one 32-byte load:
11015   // (insert_subvector (insert_subvector undef, (load addr), 0),
11016   //                   (load addr + 16), Elts/2)
11017   // --> load32 addr
11018   if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
11019       Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
11020       OpVT.is256BitVector() && SubVecVT.is128BitVector() &&
11021       !Subtarget->isUnalignedMem32Slow()) {
11022     SDValue SubVec2 = Vec.getOperand(1);
11023     if (auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2))) {
11024       if (Idx2->getZExtValue() == 0) {
11025         SDValue Ops[] = { SubVec2, SubVec };
11026         SDValue LD = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false);
11027         if (LD.getNode())
11028           return LD;
11029       }
11030     }
11031   }
11032
11033   if ((OpVT.is256BitVector() || OpVT.is512BitVector()) &&
11034       SubVecVT.is128BitVector())
11035     return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
11036
11037   if (OpVT.is512BitVector() && SubVecVT.is256BitVector())
11038     return Insert256BitVector(Vec, SubVec, IdxVal, DAG, dl);
11039
11040   if (OpVT.getVectorElementType() == MVT::i1) {
11041     if (IdxVal == 0  && Vec.getOpcode() == ISD::UNDEF) // the operation is legal
11042       return Op;
11043     SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
11044     SDValue Undef = DAG.getUNDEF(OpVT);
11045     unsigned NumElems = OpVT.getVectorNumElements();
11046     SDValue ShiftBits = DAG.getConstant(NumElems/2, dl, MVT::i8);
11047
11048     if (IdxVal == OpVT.getVectorNumElements() / 2) {
11049       // Zero upper bits of the Vec
11050       Vec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec, ShiftBits);
11051       Vec = DAG.getNode(X86ISD::VSRLI, dl, OpVT, Vec, ShiftBits);
11052
11053       SDValue Vec2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef,
11054                                  SubVec, ZeroIdx);
11055       Vec2 = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec2, ShiftBits);
11056       return DAG.getNode(ISD::OR, dl, OpVT, Vec, Vec2);
11057     }
11058     if (IdxVal == 0) {
11059       SDValue Vec2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef,
11060                                  SubVec, ZeroIdx);
11061       // Zero upper bits of the Vec2
11062       Vec2 = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec2, ShiftBits);
11063       Vec2 = DAG.getNode(X86ISD::VSRLI, dl, OpVT, Vec2, ShiftBits);
11064       // Zero lower bits of the Vec
11065       Vec = DAG.getNode(X86ISD::VSRLI, dl, OpVT, Vec, ShiftBits);
11066       Vec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec, ShiftBits);
11067       // Merge them together
11068       return DAG.getNode(ISD::OR, dl, OpVT, Vec, Vec2);
11069     }
11070   }
11071   return SDValue();
11072 }
11073
11074 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
11075 // their target countpart wrapped in the X86ISD::Wrapper node. Suppose N is
11076 // one of the above mentioned nodes. It has to be wrapped because otherwise
11077 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
11078 // be used to form addressing mode. These wrapped nodes will be selected
11079 // into MOV32ri.
11080 SDValue
11081 X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
11082   ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
11083
11084   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
11085   // global base reg.
11086   unsigned char OpFlag = 0;
11087   unsigned WrapperKind = X86ISD::Wrapper;
11088   CodeModel::Model M = DAG.getTarget().getCodeModel();
11089
11090   if (Subtarget->isPICStyleRIPRel() &&
11091       (M == CodeModel::Small || M == CodeModel::Kernel))
11092     WrapperKind = X86ISD::WrapperRIP;
11093   else if (Subtarget->isPICStyleGOT())
11094     OpFlag = X86II::MO_GOTOFF;
11095   else if (Subtarget->isPICStyleStubPIC())
11096     OpFlag = X86II::MO_PIC_BASE_OFFSET;
11097
11098   SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(),
11099                                              CP->getAlignment(),
11100                                              CP->getOffset(), OpFlag);
11101   SDLoc DL(CP);
11102   Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
11103   // With PIC, the address is actually $g + Offset.
11104   if (OpFlag) {
11105     Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
11106                          DAG.getNode(X86ISD::GlobalBaseReg,
11107                                      SDLoc(), getPointerTy()),
11108                          Result);
11109   }
11110
11111   return Result;
11112 }
11113
11114 SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
11115   JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
11116
11117   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
11118   // global base reg.
11119   unsigned char OpFlag = 0;
11120   unsigned WrapperKind = X86ISD::Wrapper;
11121   CodeModel::Model M = DAG.getTarget().getCodeModel();
11122
11123   if (Subtarget->isPICStyleRIPRel() &&
11124       (M == CodeModel::Small || M == CodeModel::Kernel))
11125     WrapperKind = X86ISD::WrapperRIP;
11126   else if (Subtarget->isPICStyleGOT())
11127     OpFlag = X86II::MO_GOTOFF;
11128   else if (Subtarget->isPICStyleStubPIC())
11129     OpFlag = X86II::MO_PIC_BASE_OFFSET;
11130
11131   SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(),
11132                                           OpFlag);
11133   SDLoc DL(JT);
11134   Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
11135
11136   // With PIC, the address is actually $g + Offset.
11137   if (OpFlag)
11138     Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
11139                          DAG.getNode(X86ISD::GlobalBaseReg,
11140                                      SDLoc(), getPointerTy()),
11141                          Result);
11142
11143   return Result;
11144 }
11145
11146 SDValue
11147 X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
11148   const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
11149
11150   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
11151   // global base reg.
11152   unsigned char OpFlag = 0;
11153   unsigned WrapperKind = X86ISD::Wrapper;
11154   CodeModel::Model M = DAG.getTarget().getCodeModel();
11155
11156   if (Subtarget->isPICStyleRIPRel() &&
11157       (M == CodeModel::Small || M == CodeModel::Kernel)) {
11158     if (Subtarget->isTargetDarwin() || Subtarget->isTargetELF())
11159       OpFlag = X86II::MO_GOTPCREL;
11160     WrapperKind = X86ISD::WrapperRIP;
11161   } else if (Subtarget->isPICStyleGOT()) {
11162     OpFlag = X86II::MO_GOT;
11163   } else if (Subtarget->isPICStyleStubPIC()) {
11164     OpFlag = X86II::MO_DARWIN_NONLAZY_PIC_BASE;
11165   } else if (Subtarget->isPICStyleStubNoDynamic()) {
11166     OpFlag = X86II::MO_DARWIN_NONLAZY;
11167   }
11168
11169   SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag);
11170
11171   SDLoc DL(Op);
11172   Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
11173
11174   // With PIC, the address is actually $g + Offset.
11175   if (DAG.getTarget().getRelocationModel() == Reloc::PIC_ &&
11176       !Subtarget->is64Bit()) {
11177     Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
11178                          DAG.getNode(X86ISD::GlobalBaseReg,
11179                                      SDLoc(), getPointerTy()),
11180                          Result);
11181   }
11182
11183   // For symbols that require a load from a stub to get the address, emit the
11184   // load.
11185   if (isGlobalStubReference(OpFlag))
11186     Result = DAG.getLoad(getPointerTy(), DL, DAG.getEntryNode(), Result,
11187                          MachinePointerInfo::getGOT(), false, false, false, 0);
11188
11189   return Result;
11190 }
11191
11192 SDValue
11193 X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
11194   // Create the TargetBlockAddressAddress node.
11195   unsigned char OpFlags =
11196     Subtarget->ClassifyBlockAddressReference();
11197   CodeModel::Model M = DAG.getTarget().getCodeModel();
11198   const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
11199   int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
11200   SDLoc dl(Op);
11201   SDValue Result = DAG.getTargetBlockAddress(BA, getPointerTy(), Offset,
11202                                              OpFlags);
11203
11204   if (Subtarget->isPICStyleRIPRel() &&
11205       (M == CodeModel::Small || M == CodeModel::Kernel))
11206     Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
11207   else
11208     Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
11209
11210   // With PIC, the address is actually $g + Offset.
11211   if (isGlobalRelativeToPICBase(OpFlags)) {
11212     Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
11213                          DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
11214                          Result);
11215   }
11216
11217   return Result;
11218 }
11219
11220 SDValue
11221 X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, SDLoc dl,
11222                                       int64_t Offset, SelectionDAG &DAG) const {
11223   // Create the TargetGlobalAddress node, folding in the constant
11224   // offset if it is legal.
11225   unsigned char OpFlags =
11226       Subtarget->ClassifyGlobalReference(GV, DAG.getTarget());
11227   CodeModel::Model M = DAG.getTarget().getCodeModel();
11228   SDValue Result;
11229   if (OpFlags == X86II::MO_NO_FLAG &&
11230       X86::isOffsetSuitableForCodeModel(Offset, M)) {
11231     // A direct static reference to a global.
11232     Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset);
11233     Offset = 0;
11234   } else {
11235     Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags);
11236   }
11237
11238   if (Subtarget->isPICStyleRIPRel() &&
11239       (M == CodeModel::Small || M == CodeModel::Kernel))
11240     Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
11241   else
11242     Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
11243
11244   // With PIC, the address is actually $g + Offset.
11245   if (isGlobalRelativeToPICBase(OpFlags)) {
11246     Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
11247                          DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
11248                          Result);
11249   }
11250
11251   // For globals that require a load from a stub to get the address, emit the
11252   // load.
11253   if (isGlobalStubReference(OpFlags))
11254     Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result,
11255                          MachinePointerInfo::getGOT(), false, false, false, 0);
11256
11257   // If there was a non-zero offset that we didn't fold, create an explicit
11258   // addition for it.
11259   if (Offset != 0)
11260     Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result,
11261                          DAG.getConstant(Offset, dl, getPointerTy()));
11262
11263   return Result;
11264 }
11265
11266 SDValue
11267 X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
11268   const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
11269   int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
11270   return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG);
11271 }
11272
11273 static SDValue
11274 GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
11275            SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
11276            unsigned char OperandFlags, bool LocalDynamic = false) {
11277   MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
11278   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
11279   SDLoc dl(GA);
11280   SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
11281                                            GA->getValueType(0),
11282                                            GA->getOffset(),
11283                                            OperandFlags);
11284
11285   X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
11286                                            : X86ISD::TLSADDR;
11287
11288   if (InFlag) {
11289     SDValue Ops[] = { Chain,  TGA, *InFlag };
11290     Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
11291   } else {
11292     SDValue Ops[]  = { Chain, TGA };
11293     Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
11294   }
11295
11296   // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
11297   MFI->setAdjustsStack(true);
11298   MFI->setHasCalls(true);
11299
11300   SDValue Flag = Chain.getValue(1);
11301   return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
11302 }
11303
11304 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
11305 static SDValue
11306 LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
11307                                 const EVT PtrVT) {
11308   SDValue InFlag;
11309   SDLoc dl(GA);  // ? function entry point might be better
11310   SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
11311                                    DAG.getNode(X86ISD::GlobalBaseReg,
11312                                                SDLoc(), PtrVT), InFlag);
11313   InFlag = Chain.getValue(1);
11314
11315   return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
11316 }
11317
11318 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
11319 static SDValue
11320 LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
11321                                 const EVT PtrVT) {
11322   return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
11323                     X86::RAX, X86II::MO_TLSGD);
11324 }
11325
11326 static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
11327                                            SelectionDAG &DAG,
11328                                            const EVT PtrVT,
11329                                            bool is64Bit) {
11330   SDLoc dl(GA);
11331
11332   // Get the start address of the TLS block for this module.
11333   X86MachineFunctionInfo* MFI = DAG.getMachineFunction()
11334       .getInfo<X86MachineFunctionInfo>();
11335   MFI->incNumLocalDynamicTLSAccesses();
11336
11337   SDValue Base;
11338   if (is64Bit) {
11339     Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
11340                       X86II::MO_TLSLD, /*LocalDynamic=*/true);
11341   } else {
11342     SDValue InFlag;
11343     SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
11344         DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
11345     InFlag = Chain.getValue(1);
11346     Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
11347                       X86II::MO_TLSLDM, /*LocalDynamic=*/true);
11348   }
11349
11350   // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
11351   // of Base.
11352
11353   // Build x@dtpoff.
11354   unsigned char OperandFlags = X86II::MO_DTPOFF;
11355   unsigned WrapperKind = X86ISD::Wrapper;
11356   SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
11357                                            GA->getValueType(0),
11358                                            GA->getOffset(), OperandFlags);
11359   SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
11360
11361   // Add x@dtpoff with the base.
11362   return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
11363 }
11364
11365 // Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
11366 static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
11367                                    const EVT PtrVT, TLSModel::Model model,
11368                                    bool is64Bit, bool isPIC) {
11369   SDLoc dl(GA);
11370
11371   // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
11372   Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
11373                                                          is64Bit ? 257 : 256));
11374
11375   SDValue ThreadPointer =
11376       DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
11377                   MachinePointerInfo(Ptr), false, false, false, 0);
11378
11379   unsigned char OperandFlags = 0;
11380   // Most TLS accesses are not RIP relative, even on x86-64.  One exception is
11381   // initialexec.
11382   unsigned WrapperKind = X86ISD::Wrapper;
11383   if (model == TLSModel::LocalExec) {
11384     OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
11385   } else if (model == TLSModel::InitialExec) {
11386     if (is64Bit) {
11387       OperandFlags = X86II::MO_GOTTPOFF;
11388       WrapperKind = X86ISD::WrapperRIP;
11389     } else {
11390       OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
11391     }
11392   } else {
11393     llvm_unreachable("Unexpected model");
11394   }
11395
11396   // emit "addl x@ntpoff,%eax" (local exec)
11397   // or "addl x@indntpoff,%eax" (initial exec)
11398   // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
11399   SDValue TGA =
11400       DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
11401                                  GA->getOffset(), OperandFlags);
11402   SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
11403
11404   if (model == TLSModel::InitialExec) {
11405     if (isPIC && !is64Bit) {
11406       Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
11407                            DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
11408                            Offset);
11409     }
11410
11411     Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
11412                          MachinePointerInfo::getGOT(), false, false, false, 0);
11413   }
11414
11415   // The address of the thread local variable is the add of the thread
11416   // pointer with the offset of the variable.
11417   return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
11418 }
11419
11420 SDValue
11421 X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
11422
11423   GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
11424   const GlobalValue *GV = GA->getGlobal();
11425
11426   if (Subtarget->isTargetELF()) {
11427     TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
11428     switch (model) {
11429       case TLSModel::GeneralDynamic:
11430         if (Subtarget->is64Bit())
11431           return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy());
11432         return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy());
11433       case TLSModel::LocalDynamic:
11434         return LowerToTLSLocalDynamicModel(GA, DAG, getPointerTy(),
11435                                            Subtarget->is64Bit());
11436       case TLSModel::InitialExec:
11437       case TLSModel::LocalExec:
11438         return LowerToTLSExecModel(
11439             GA, DAG, getPointerTy(), model, Subtarget->is64Bit(),
11440             DAG.getTarget().getRelocationModel() == Reloc::PIC_);
11441     }
11442     llvm_unreachable("Unknown TLS model.");
11443   }
11444
11445   if (Subtarget->isTargetDarwin()) {
11446     // Darwin only has one model of TLS.  Lower to that.
11447     unsigned char OpFlag = 0;
11448     unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ?
11449                            X86ISD::WrapperRIP : X86ISD::Wrapper;
11450
11451     // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
11452     // global base reg.
11453     bool PIC32 = (DAG.getTarget().getRelocationModel() == Reloc::PIC_) &&
11454                  !Subtarget->is64Bit();
11455     if (PIC32)
11456       OpFlag = X86II::MO_TLVP_PIC_BASE;
11457     else
11458       OpFlag = X86II::MO_TLVP;
11459     SDLoc DL(Op);
11460     SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
11461                                                 GA->getValueType(0),
11462                                                 GA->getOffset(), OpFlag);
11463     SDValue Offset = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
11464
11465     // With PIC32, the address is actually $g + Offset.
11466     if (PIC32)
11467       Offset = DAG.getNode(ISD::ADD, DL, getPointerTy(),
11468                            DAG.getNode(X86ISD::GlobalBaseReg,
11469                                        SDLoc(), getPointerTy()),
11470                            Offset);
11471
11472     // Lowering the machine isd will make sure everything is in the right
11473     // location.
11474     SDValue Chain = DAG.getEntryNode();
11475     SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
11476     SDValue Args[] = { Chain, Offset };
11477     Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
11478
11479     // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
11480     MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
11481     MFI->setAdjustsStack(true);
11482
11483     // And our return value (tls address) is in the standard call return value
11484     // location.
11485     unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
11486     return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy(),
11487                               Chain.getValue(1));
11488   }
11489
11490   if (Subtarget->isTargetKnownWindowsMSVC() ||
11491       Subtarget->isTargetWindowsGNU()) {
11492     // Just use the implicit TLS architecture
11493     // Need to generate someting similar to:
11494     //   mov     rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
11495     //                                  ; from TEB
11496     //   mov     ecx, dword [rel _tls_index]: Load index (from C runtime)
11497     //   mov     rcx, qword [rdx+rcx*8]
11498     //   mov     eax, .tls$:tlsvar
11499     //   [rax+rcx] contains the address
11500     // Windows 64bit: gs:0x58
11501     // Windows 32bit: fs:__tls_array
11502
11503     SDLoc dl(GA);
11504     SDValue Chain = DAG.getEntryNode();
11505
11506     // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
11507     // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
11508     // use its literal value of 0x2C.
11509     Value *Ptr = Constant::getNullValue(Subtarget->is64Bit()
11510                                         ? Type::getInt8PtrTy(*DAG.getContext(),
11511                                                              256)
11512                                         : Type::getInt32PtrTy(*DAG.getContext(),
11513                                                               257));
11514
11515     SDValue TlsArray =
11516         Subtarget->is64Bit()
11517             ? DAG.getIntPtrConstant(0x58, dl)
11518             : (Subtarget->isTargetWindowsGNU()
11519                    ? DAG.getIntPtrConstant(0x2C, dl)
11520                    : DAG.getExternalSymbol("_tls_array", getPointerTy()));
11521
11522     SDValue ThreadPointer =
11523         DAG.getLoad(getPointerTy(), dl, Chain, TlsArray,
11524                     MachinePointerInfo(Ptr), false, false, false, 0);
11525
11526     SDValue res;
11527     if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
11528       res = ThreadPointer;
11529     } else {
11530       // Load the _tls_index variable
11531       SDValue IDX = DAG.getExternalSymbol("_tls_index", getPointerTy());
11532       if (Subtarget->is64Bit())
11533         IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, getPointerTy(), Chain, IDX,
11534                              MachinePointerInfo(), MVT::i32, false, false,
11535                              false, 0);
11536       else
11537         IDX = DAG.getLoad(getPointerTy(), dl, Chain, IDX, MachinePointerInfo(),
11538                           false, false, false, 0);
11539
11540       SDValue Scale = DAG.getConstant(Log2_64_Ceil(TD->getPointerSize()), dl,
11541                                       getPointerTy());
11542       IDX = DAG.getNode(ISD::SHL, dl, getPointerTy(), IDX, Scale);
11543
11544       res = DAG.getNode(ISD::ADD, dl, getPointerTy(), ThreadPointer, IDX);
11545     }
11546
11547     res = DAG.getLoad(getPointerTy(), dl, Chain, res, MachinePointerInfo(),
11548                       false, false, false, 0);
11549
11550     // Get the offset of start of .tls section
11551     SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
11552                                              GA->getValueType(0),
11553                                              GA->getOffset(), X86II::MO_SECREL);
11554     SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), TGA);
11555
11556     // The address of the thread local variable is the add of the thread
11557     // pointer with the offset of the variable.
11558     return DAG.getNode(ISD::ADD, dl, getPointerTy(), res, Offset);
11559   }
11560
11561   llvm_unreachable("TLS not implemented for this target.");
11562 }
11563
11564 /// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values
11565 /// and take a 2 x i32 value to shift plus a shift amount.
11566 static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
11567   assert(Op.getNumOperands() == 3 && "Not a double-shift!");
11568   MVT VT = Op.getSimpleValueType();
11569   unsigned VTBits = VT.getSizeInBits();
11570   SDLoc dl(Op);
11571   bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
11572   SDValue ShOpLo = Op.getOperand(0);
11573   SDValue ShOpHi = Op.getOperand(1);
11574   SDValue ShAmt  = Op.getOperand(2);
11575   // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the
11576   // generic ISD nodes haven't. Insert an AND to be safe, it's optimized away
11577   // during isel.
11578   SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
11579                                   DAG.getConstant(VTBits - 1, dl, MVT::i8));
11580   SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
11581                                      DAG.getConstant(VTBits - 1, dl, MVT::i8))
11582                        : DAG.getConstant(0, dl, VT);
11583
11584   SDValue Tmp2, Tmp3;
11585   if (Op.getOpcode() == ISD::SHL_PARTS) {
11586     Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
11587     Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);
11588   } else {
11589     Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
11590     Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
11591   }
11592
11593   // If the shift amount is larger or equal than the width of a part we can't
11594   // rely on the results of shld/shrd. Insert a test and select the appropriate
11595   // values for large shift amounts.
11596   SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
11597                                 DAG.getConstant(VTBits, dl, MVT::i8));
11598   SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
11599                              AndNode, DAG.getConstant(0, dl, MVT::i8));
11600
11601   SDValue Hi, Lo;
11602   SDValue CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
11603   SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
11604   SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };
11605
11606   if (Op.getOpcode() == ISD::SHL_PARTS) {
11607     Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
11608     Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
11609   } else {
11610     Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
11611     Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
11612   }
11613
11614   SDValue Ops[2] = { Lo, Hi };
11615   return DAG.getMergeValues(Ops, dl);
11616 }
11617
11618 SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
11619                                            SelectionDAG &DAG) const {
11620   MVT SrcVT = Op.getOperand(0).getSimpleValueType();
11621   SDLoc dl(Op);
11622
11623   if (SrcVT.isVector()) {
11624     if (SrcVT.getVectorElementType() == MVT::i1) {
11625       MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
11626       return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
11627                          DAG.getNode(ISD::SIGN_EXTEND, dl, IntegerVT,
11628                                      Op.getOperand(0)));
11629     }
11630     return SDValue();
11631   }
11632
11633   assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
11634          "Unknown SINT_TO_FP to lower!");
11635
11636   // These are really Legal; return the operand so the caller accepts it as
11637   // Legal.
11638   if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
11639     return Op;
11640   if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
11641       Subtarget->is64Bit()) {
11642     return Op;
11643   }
11644
11645   unsigned Size = SrcVT.getSizeInBits()/8;
11646   MachineFunction &MF = DAG.getMachineFunction();
11647   int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false);
11648   SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
11649   SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
11650                                StackSlot,
11651                                MachinePointerInfo::getFixedStack(SSFI),
11652                                false, false, 0);
11653   return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
11654 }
11655
11656 SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
11657                                      SDValue StackSlot,
11658                                      SelectionDAG &DAG) const {
11659   // Build the FILD
11660   SDLoc DL(Op);
11661   SDVTList Tys;
11662   bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
11663   if (useSSE)
11664     Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
11665   else
11666     Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
11667
11668   unsigned ByteSize = SrcVT.getSizeInBits()/8;
11669
11670   FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
11671   MachineMemOperand *MMO;
11672   if (FI) {
11673     int SSFI = FI->getIndex();
11674     MMO =
11675       DAG.getMachineFunction()
11676       .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
11677                             MachineMemOperand::MOLoad, ByteSize, ByteSize);
11678   } else {
11679     MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
11680     StackSlot = StackSlot.getOperand(1);
11681   }
11682   SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
11683   SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
11684                                            X86ISD::FILD, DL,
11685                                            Tys, Ops, SrcVT, MMO);
11686
11687   if (useSSE) {
11688     Chain = Result.getValue(1);
11689     SDValue InFlag = Result.getValue(2);
11690
11691     // FIXME: Currently the FST is flagged to the FILD_FLAG. This
11692     // shouldn't be necessary except that RFP cannot be live across
11693     // multiple blocks. When stackifier is fixed, they can be uncoupled.
11694     MachineFunction &MF = DAG.getMachineFunction();
11695     unsigned SSFISize = Op.getValueType().getSizeInBits()/8;
11696     int SSFI = MF.getFrameInfo()->CreateStackObject(SSFISize, SSFISize, false);
11697     SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
11698     Tys = DAG.getVTList(MVT::Other);
11699     SDValue Ops[] = {
11700       Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
11701     };
11702     MachineMemOperand *MMO =
11703       DAG.getMachineFunction()
11704       .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
11705                             MachineMemOperand::MOStore, SSFISize, SSFISize);
11706
11707     Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
11708                                     Ops, Op.getValueType(), MMO);
11709     Result = DAG.getLoad(Op.getValueType(), DL, Chain, StackSlot,
11710                          MachinePointerInfo::getFixedStack(SSFI),
11711                          false, false, false, 0);
11712   }
11713
11714   return Result;
11715 }
11716
11717 // LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion.
11718 SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
11719                                                SelectionDAG &DAG) const {
11720   // This algorithm is not obvious. Here it is what we're trying to output:
11721   /*
11722      movq       %rax,  %xmm0
11723      punpckldq  (c0),  %xmm0  // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
11724      subpd      (c1),  %xmm0  // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
11725      #ifdef __SSE3__
11726        haddpd   %xmm0, %xmm0
11727      #else
11728        pshufd   $0x4e, %xmm0, %xmm1
11729        addpd    %xmm1, %xmm0
11730      #endif
11731   */
11732
11733   SDLoc dl(Op);
11734   LLVMContext *Context = DAG.getContext();
11735
11736   // Build some magic constants.
11737   static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
11738   Constant *C0 = ConstantDataVector::get(*Context, CV0);
11739   SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16);
11740
11741   SmallVector<Constant*,2> CV1;
11742   CV1.push_back(
11743     ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
11744                                       APInt(64, 0x4330000000000000ULL))));
11745   CV1.push_back(
11746     ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
11747                                       APInt(64, 0x4530000000000000ULL))));
11748   Constant *C1 = ConstantVector::get(CV1);
11749   SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16);
11750
11751   // Load the 64-bit value into an XMM register.
11752   SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
11753                             Op.getOperand(0));
11754   SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
11755                               MachinePointerInfo::getConstantPool(),
11756                               false, false, false, 16);
11757   SDValue Unpck1 =
11758       getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
11759
11760   SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
11761                               MachinePointerInfo::getConstantPool(),
11762                               false, false, false, 16);
11763   SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
11764   SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
11765   SDValue Result;
11766
11767   if (Subtarget->hasSSE3()) {
11768     // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
11769     Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
11770   } else {
11771     SDValue S2F = DAG.getBitcast(MVT::v4i32, Sub);
11772     SDValue Shuffle = getTargetShuffleNode(X86ISD::PSHUFD, dl, MVT::v4i32,
11773                                            S2F, 0x4E, DAG);
11774     Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
11775                          DAG.getBitcast(MVT::v2f64, Shuffle), Sub);
11776   }
11777
11778   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
11779                      DAG.getIntPtrConstant(0, dl));
11780 }
11781
11782 // LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion.
11783 SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
11784                                                SelectionDAG &DAG) const {
11785   SDLoc dl(Op);
11786   // FP constant to bias correct the final result.
11787   SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,
11788                                    MVT::f64);
11789
11790   // Load the 32-bit value into an XMM register.
11791   SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
11792                              Op.getOperand(0));
11793
11794   // Zero out the upper parts of the register.
11795   Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
11796
11797   Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
11798                      DAG.getBitcast(MVT::v2f64, Load),
11799                      DAG.getIntPtrConstant(0, dl));
11800
11801   // Or the load with the bias.
11802   SDValue Or = DAG.getNode(
11803       ISD::OR, dl, MVT::v2i64,
11804       DAG.getBitcast(MVT::v2i64,
11805                      DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Load)),
11806       DAG.getBitcast(MVT::v2i64,
11807                      DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
11808   Or =
11809       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
11810                   DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
11811
11812   // Subtract the bias.
11813   SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
11814
11815   // Handle final rounding.
11816   EVT DestVT = Op.getValueType();
11817
11818   if (DestVT.bitsLT(MVT::f64))
11819     return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
11820                        DAG.getIntPtrConstant(0, dl));
11821   if (DestVT.bitsGT(MVT::f64))
11822     return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
11823
11824   // Handle final rounding.
11825   return Sub;
11826 }
11827
11828 static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
11829                                      const X86Subtarget &Subtarget) {
11830   // The algorithm is the following:
11831   // #ifdef __SSE4_1__
11832   //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
11833   //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
11834   //                                 (uint4) 0x53000000, 0xaa);
11835   // #else
11836   //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
11837   //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
11838   // #endif
11839   //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
11840   //     return (float4) lo + fhi;
11841
11842   SDLoc DL(Op);
11843   SDValue V = Op->getOperand(0);
11844   EVT VecIntVT = V.getValueType();
11845   bool Is128 = VecIntVT == MVT::v4i32;
11846   EVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
11847   // If we convert to something else than the supported type, e.g., to v4f64,
11848   // abort early.
11849   if (VecFloatVT != Op->getValueType(0))
11850     return SDValue();
11851
11852   unsigned NumElts = VecIntVT.getVectorNumElements();
11853   assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
11854          "Unsupported custom type");
11855   assert(NumElts <= 8 && "The size of the constant array must be fixed");
11856
11857   // In the #idef/#else code, we have in common:
11858   // - The vector of constants:
11859   // -- 0x4b000000
11860   // -- 0x53000000
11861   // - A shift:
11862   // -- v >> 16
11863
11864   // Create the splat vector for 0x4b000000.
11865   SDValue CstLow = DAG.getConstant(0x4b000000, DL, MVT::i32);
11866   SDValue CstLowArray[] = {CstLow, CstLow, CstLow, CstLow,
11867                            CstLow, CstLow, CstLow, CstLow};
11868   SDValue VecCstLow = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT,
11869                                   makeArrayRef(&CstLowArray[0], NumElts));
11870   // Create the splat vector for 0x53000000.
11871   SDValue CstHigh = DAG.getConstant(0x53000000, DL, MVT::i32);
11872   SDValue CstHighArray[] = {CstHigh, CstHigh, CstHigh, CstHigh,
11873                             CstHigh, CstHigh, CstHigh, CstHigh};
11874   SDValue VecCstHigh = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT,
11875                                    makeArrayRef(&CstHighArray[0], NumElts));
11876
11877   // Create the right shift.
11878   SDValue CstShift = DAG.getConstant(16, DL, MVT::i32);
11879   SDValue CstShiftArray[] = {CstShift, CstShift, CstShift, CstShift,
11880                              CstShift, CstShift, CstShift, CstShift};
11881   SDValue VecCstShift = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT,
11882                                     makeArrayRef(&CstShiftArray[0], NumElts));
11883   SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
11884
11885   SDValue Low, High;
11886   if (Subtarget.hasSSE41()) {
11887     EVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
11888     //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
11889     SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
11890     SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
11891     // Low will be bitcasted right away, so do not bother bitcasting back to its
11892     // original type.
11893     Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
11894                       VecCstLowBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
11895     //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
11896     //                                 (uint4) 0x53000000, 0xaa);
11897     SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
11898     SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
11899     // High will be bitcasted right away, so do not bother bitcasting back to
11900     // its original type.
11901     High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
11902                        VecCstHighBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
11903   } else {
11904     SDValue CstMask = DAG.getConstant(0xffff, DL, MVT::i32);
11905     SDValue VecCstMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT, CstMask,
11906                                      CstMask, CstMask, CstMask);
11907     //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
11908     SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
11909     Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
11910
11911     //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
11912     High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
11913   }
11914
11915   // Create the vector constant for -(0x1.0p39f + 0x1.0p23f).
11916   SDValue CstFAdd = DAG.getConstantFP(
11917       APFloat(APFloat::IEEEsingle, APInt(32, 0xD3000080)), DL, MVT::f32);
11918   SDValue CstFAddArray[] = {CstFAdd, CstFAdd, CstFAdd, CstFAdd,
11919                             CstFAdd, CstFAdd, CstFAdd, CstFAdd};
11920   SDValue VecCstFAdd = DAG.getNode(ISD::BUILD_VECTOR, DL, VecFloatVT,
11921                                    makeArrayRef(&CstFAddArray[0], NumElts));
11922
11923   //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
11924   SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
11925   SDValue FHigh =
11926       DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd);
11927   //     return (float4) lo + fhi;
11928   SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
11929   return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
11930 }
11931
11932 SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
11933                                                SelectionDAG &DAG) const {
11934   SDValue N0 = Op.getOperand(0);
11935   MVT SVT = N0.getSimpleValueType();
11936   SDLoc dl(Op);
11937
11938   switch (SVT.SimpleTy) {
11939   default:
11940     llvm_unreachable("Custom UINT_TO_FP is not supported!");
11941   case MVT::v4i8:
11942   case MVT::v4i16:
11943   case MVT::v8i8:
11944   case MVT::v8i16: {
11945     MVT NVT = MVT::getVectorVT(MVT::i32, SVT.getVectorNumElements());
11946     return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
11947                        DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0));
11948   }
11949   case MVT::v4i32:
11950   case MVT::v8i32:
11951     return lowerUINT_TO_FP_vXi32(Op, DAG, *Subtarget);
11952   case MVT::v16i8:
11953   case MVT::v16i16:
11954     if (Subtarget->hasAVX512())
11955       return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
11956                          DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, N0));
11957   }
11958   llvm_unreachable(nullptr);
11959 }
11960
11961 SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
11962                                            SelectionDAG &DAG) const {
11963   SDValue N0 = Op.getOperand(0);
11964   SDLoc dl(Op);
11965
11966   if (Op.getValueType().isVector())
11967     return lowerUINT_TO_FP_vec(Op, DAG);
11968
11969   // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
11970   // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
11971   // the optimization here.
11972   if (DAG.SignBitIsZero(N0))
11973     return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);
11974
11975   MVT SrcVT = N0.getSimpleValueType();
11976   MVT DstVT = Op.getSimpleValueType();
11977   if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
11978     return LowerUINT_TO_FP_i64(Op, DAG);
11979   if (SrcVT == MVT::i32 && X86ScalarSSEf64)
11980     return LowerUINT_TO_FP_i32(Op, DAG);
11981   if (Subtarget->is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
11982     return SDValue();
11983
11984   // Make a 64-bit buffer, and use it to build an FILD.
11985   SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
11986   if (SrcVT == MVT::i32) {
11987     SDValue WordOff = DAG.getConstant(4, dl, getPointerTy());
11988     SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl,
11989                                      getPointerTy(), StackSlot, WordOff);
11990     SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
11991                                   StackSlot, MachinePointerInfo(),
11992                                   false, false, 0);
11993     SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
11994                                   OffsetSlot, MachinePointerInfo(),
11995                                   false, false, 0);
11996     SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
11997     return Fild;
11998   }
11999
12000   assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
12001   SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
12002                                StackSlot, MachinePointerInfo(),
12003                                false, false, 0);
12004   // For i64 source, we need to add the appropriate power of 2 if the input
12005   // was negative.  This is the same as the optimization in
12006   // DAGTypeLegalizer::ExpandIntOp_UNIT_TO_FP, and for it to be safe here,
12007   // we must be careful to do the computation in x87 extended precision, not
12008   // in SSE. (The generic code can't know it's OK to do this, or how to.)
12009   int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
12010   MachineMemOperand *MMO =
12011     DAG.getMachineFunction()
12012     .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
12013                           MachineMemOperand::MOLoad, 8, 8);
12014
12015   SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
12016   SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
12017   SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
12018                                          MVT::i64, MMO);
12019
12020   APInt FF(32, 0x5F800000ULL);
12021
12022   // Check whether the sign bit is set.
12023   SDValue SignSet = DAG.getSetCC(dl,
12024                                  getSetCCResultType(*DAG.getContext(), MVT::i64),
12025                                  Op.getOperand(0),
12026                                  DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
12027
12028   // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
12029   SDValue FudgePtr = DAG.getConstantPool(
12030                              ConstantInt::get(*DAG.getContext(), FF.zext(64)),
12031                                          getPointerTy());
12032
12033   // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
12034   SDValue Zero = DAG.getIntPtrConstant(0, dl);
12035   SDValue Four = DAG.getIntPtrConstant(4, dl);
12036   SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet,
12037                                Zero, Four);
12038   FudgePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(), FudgePtr, Offset);
12039
12040   // Load the value out, extending it from f32 to f80.
12041   // FIXME: Avoid the extend by constructing the right constant pool?
12042   SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(),
12043                                  FudgePtr, MachinePointerInfo::getConstantPool(),
12044                                  MVT::f32, false, false, false, 4);
12045   // Extend everything to 80 bits to force it to be done on x87.
12046   SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
12047   return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
12048                      DAG.getIntPtrConstant(0, dl));
12049 }
12050
12051 std::pair<SDValue,SDValue>
12052 X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
12053                                     bool IsSigned, bool IsReplace) const {
12054   SDLoc DL(Op);
12055
12056   EVT DstTy = Op.getValueType();
12057
12058   if (!IsSigned && !isIntegerTypeFTOL(DstTy)) {
12059     assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
12060     DstTy = MVT::i64;
12061   }
12062
12063   assert(DstTy.getSimpleVT() <= MVT::i64 &&
12064          DstTy.getSimpleVT() >= MVT::i16 &&
12065          "Unknown FP_TO_INT to lower!");
12066
12067   // These are really Legal.
12068   if (DstTy == MVT::i32 &&
12069       isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
12070     return std::make_pair(SDValue(), SDValue());
12071   if (Subtarget->is64Bit() &&
12072       DstTy == MVT::i64 &&
12073       isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
12074     return std::make_pair(SDValue(), SDValue());
12075
12076   // We lower FP->int64 either into FISTP64 followed by a load from a temporary
12077   // stack slot, or into the FTOL runtime function.
12078   MachineFunction &MF = DAG.getMachineFunction();
12079   unsigned MemSize = DstTy.getSizeInBits()/8;
12080   int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
12081   SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
12082
12083   unsigned Opc;
12084   if (!IsSigned && isIntegerTypeFTOL(DstTy))
12085     Opc = X86ISD::WIN_FTOL;
12086   else
12087     switch (DstTy.getSimpleVT().SimpleTy) {
12088     default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
12089     case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
12090     case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
12091     case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
12092     }
12093
12094   SDValue Chain = DAG.getEntryNode();
12095   SDValue Value = Op.getOperand(0);
12096   EVT TheVT = Op.getOperand(0).getValueType();
12097   // FIXME This causes a redundant load/store if the SSE-class value is already
12098   // in memory, such as if it is on the callstack.
12099   if (isScalarFPTypeInSSEReg(TheVT)) {
12100     assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
12101     Chain = DAG.getStore(Chain, DL, Value, StackSlot,
12102                          MachinePointerInfo::getFixedStack(SSFI),
12103                          false, false, 0);
12104     SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
12105     SDValue Ops[] = {
12106       Chain, StackSlot, DAG.getValueType(TheVT)
12107     };
12108
12109     MachineMemOperand *MMO =
12110       MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
12111                               MachineMemOperand::MOLoad, MemSize, MemSize);
12112     Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO);
12113     Chain = Value.getValue(1);
12114     SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
12115     StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
12116   }
12117
12118   MachineMemOperand *MMO =
12119     MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
12120                             MachineMemOperand::MOStore, MemSize, MemSize);
12121
12122   if (Opc != X86ISD::WIN_FTOL) {
12123     // Build the FP_TO_INT*_IN_MEM
12124     SDValue Ops[] = { Chain, Value, StackSlot };
12125     SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
12126                                            Ops, DstTy, MMO);
12127     return std::make_pair(FIST, StackSlot);
12128   } else {
12129     SDValue ftol = DAG.getNode(X86ISD::WIN_FTOL, DL,
12130       DAG.getVTList(MVT::Other, MVT::Glue),
12131       Chain, Value);
12132     SDValue eax = DAG.getCopyFromReg(ftol, DL, X86::EAX,
12133       MVT::i32, ftol.getValue(1));
12134     SDValue edx = DAG.getCopyFromReg(eax.getValue(1), DL, X86::EDX,
12135       MVT::i32, eax.getValue(2));
12136     SDValue Ops[] = { eax, edx };
12137     SDValue pair = IsReplace
12138       ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops)
12139       : DAG.getMergeValues(Ops, DL);
12140     return std::make_pair(pair, SDValue());
12141   }
12142 }
12143
12144 static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
12145                               const X86Subtarget *Subtarget) {
12146   MVT VT = Op->getSimpleValueType(0);
12147   SDValue In = Op->getOperand(0);
12148   MVT InVT = In.getSimpleValueType();
12149   SDLoc dl(Op);
12150
12151   if (VT.is512BitVector() || InVT.getScalarType() == MVT::i1)
12152     return DAG.getNode(ISD::ZERO_EXTEND, dl, VT, In);
12153
12154   // Optimize vectors in AVX mode:
12155   //
12156   //   v8i16 -> v8i32
12157   //   Use vpunpcklwd for 4 lower elements  v8i16 -> v4i32.
12158   //   Use vpunpckhwd for 4 upper elements  v8i16 -> v4i32.
12159   //   Concat upper and lower parts.
12160   //
12161   //   v4i32 -> v4i64
12162   //   Use vpunpckldq for 4 lower elements  v4i32 -> v2i64.
12163   //   Use vpunpckhdq for 4 upper elements  v4i32 -> v2i64.
12164   //   Concat upper and lower parts.
12165   //
12166
12167   if (((VT != MVT::v16i16) || (InVT != MVT::v16i8)) &&
12168       ((VT != MVT::v8i32) || (InVT != MVT::v8i16)) &&
12169       ((VT != MVT::v4i64) || (InVT != MVT::v4i32)))
12170     return SDValue();
12171
12172   if (Subtarget->hasInt256())
12173     return DAG.getNode(X86ISD::VZEXT, dl, VT, In);
12174
12175   SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl);
12176   SDValue Undef = DAG.getUNDEF(InVT);
12177   bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND;
12178   SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
12179   SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
12180
12181   MVT HVT = MVT::getVectorVT(VT.getVectorElementType(),
12182                              VT.getVectorNumElements()/2);
12183
12184   OpLo = DAG.getBitcast(HVT, OpLo);
12185   OpHi = DAG.getBitcast(HVT, OpHi);
12186
12187   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
12188 }
12189
12190 static  SDValue LowerZERO_EXTEND_AVX512(SDValue Op,
12191                   const X86Subtarget *Subtarget, SelectionDAG &DAG) {
12192   MVT VT = Op->getSimpleValueType(0);
12193   SDValue In = Op->getOperand(0);
12194   MVT InVT = In.getSimpleValueType();
12195   SDLoc DL(Op);
12196   unsigned int NumElts = VT.getVectorNumElements();
12197   if (NumElts != 8 && NumElts != 16 && !Subtarget->hasBWI())
12198     return SDValue();
12199
12200   if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1)
12201     return DAG.getNode(X86ISD::VZEXT, DL, VT, In);
12202
12203   assert(InVT.getVectorElementType() == MVT::i1);
12204   MVT ExtVT = NumElts == 8 ? MVT::v8i64 : MVT::v16i32;
12205   SDValue One =
12206    DAG.getConstant(APInt(ExtVT.getScalarSizeInBits(), 1), DL, ExtVT);
12207   SDValue Zero =
12208    DAG.getConstant(APInt::getNullValue(ExtVT.getScalarSizeInBits()), DL, ExtVT);
12209
12210   SDValue V = DAG.getNode(ISD::VSELECT, DL, ExtVT, In, One, Zero);
12211   if (VT.is512BitVector())
12212     return V;
12213   return DAG.getNode(X86ISD::VTRUNC, DL, VT, V);
12214 }
12215
12216 static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
12217                                SelectionDAG &DAG) {
12218   if (Subtarget->hasFp256()) {
12219     SDValue Res = LowerAVXExtend(Op, DAG, Subtarget);
12220     if (Res.getNode())
12221       return Res;
12222   }
12223
12224   return SDValue();
12225 }
12226
12227 static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
12228                                 SelectionDAG &DAG) {
12229   SDLoc DL(Op);
12230   MVT VT = Op.getSimpleValueType();
12231   SDValue In = Op.getOperand(0);
12232   MVT SVT = In.getSimpleValueType();
12233
12234   if (VT.is512BitVector() || SVT.getVectorElementType() == MVT::i1)
12235     return LowerZERO_EXTEND_AVX512(Op, Subtarget, DAG);
12236
12237   if (Subtarget->hasFp256()) {
12238     SDValue Res = LowerAVXExtend(Op, DAG, Subtarget);
12239     if (Res.getNode())
12240       return Res;
12241   }
12242
12243   assert(!VT.is256BitVector() || !SVT.is128BitVector() ||
12244          VT.getVectorNumElements() != SVT.getVectorNumElements());
12245   return SDValue();
12246 }
12247
12248 SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
12249   SDLoc DL(Op);
12250   MVT VT = Op.getSimpleValueType();
12251   SDValue In = Op.getOperand(0);
12252   MVT InVT = In.getSimpleValueType();
12253
12254   if (VT == MVT::i1) {
12255     assert((InVT.isInteger() && (InVT.getSizeInBits() <= 64)) &&
12256            "Invalid scalar TRUNCATE operation");
12257     if (InVT.getSizeInBits() >= 32)
12258       return SDValue();
12259     In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In);
12260     return DAG.getNode(ISD::TRUNCATE, DL, VT, In);
12261   }
12262   assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
12263          "Invalid TRUNCATE operation");
12264
12265   // move vector to mask - truncate solution for SKX
12266   if (VT.getVectorElementType() == MVT::i1) {
12267     if (InVT.is512BitVector() && InVT.getScalarSizeInBits() <= 16 &&
12268         Subtarget->hasBWI())
12269       return Op; // legal, will go to VPMOVB2M, VPMOVW2M
12270     if ((InVT.is256BitVector() || InVT.is128BitVector())
12271         && InVT.getScalarSizeInBits() <= 16 &&
12272         Subtarget->hasBWI() && Subtarget->hasVLX())
12273       return Op; // legal, will go to VPMOVB2M, VPMOVW2M
12274     if (InVT.is512BitVector() && InVT.getScalarSizeInBits() >= 32 &&
12275         Subtarget->hasDQI())
12276       return Op; // legal, will go to VPMOVD2M, VPMOVQ2M
12277     if ((InVT.is256BitVector() || InVT.is128BitVector())
12278         && InVT.getScalarSizeInBits() >= 32 &&
12279         Subtarget->hasDQI() && Subtarget->hasVLX())
12280       return Op; // legal, will go to VPMOVB2M, VPMOVQ2M
12281   }
12282   if (InVT.is512BitVector() || VT.getVectorElementType() == MVT::i1) {
12283     if (VT.getVectorElementType().getSizeInBits() >=8)
12284       return DAG.getNode(X86ISD::VTRUNC, DL, VT, In);
12285
12286     assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type");
12287     unsigned NumElts = InVT.getVectorNumElements();
12288     assert ((NumElts == 8 || NumElts == 16) && "Unexpected vector type");
12289     if (InVT.getSizeInBits() < 512) {
12290       MVT ExtVT = (NumElts == 16)? MVT::v16i32 : MVT::v8i64;
12291       In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
12292       InVT = ExtVT;
12293     }
12294
12295     SDValue OneV =
12296      DAG.getConstant(APInt::getSignBit(InVT.getScalarSizeInBits()), DL, InVT);
12297     SDValue And = DAG.getNode(ISD::AND, DL, InVT, OneV, In);
12298     return DAG.getNode(X86ISD::TESTM, DL, VT, And, And);
12299   }
12300
12301   if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
12302     // On AVX2, v4i64 -> v4i32 becomes VPERMD.
12303     if (Subtarget->hasInt256()) {
12304       static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
12305       In = DAG.getBitcast(MVT::v8i32, In);
12306       In = DAG.getVectorShuffle(MVT::v8i32, DL, In, DAG.getUNDEF(MVT::v8i32),
12307                                 ShufMask);
12308       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
12309                          DAG.getIntPtrConstant(0, DL));
12310     }
12311
12312     SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
12313                                DAG.getIntPtrConstant(0, DL));
12314     SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
12315                                DAG.getIntPtrConstant(2, DL));
12316     OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
12317     OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
12318     static const int ShufMask[] = {0, 2, 4, 6};
12319     return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
12320   }
12321
12322   if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
12323     // On AVX2, v8i32 -> v8i16 becomed PSHUFB.
12324     if (Subtarget->hasInt256()) {
12325       In = DAG.getBitcast(MVT::v32i8, In);
12326
12327       SmallVector<SDValue,32> pshufbMask;
12328       for (unsigned i = 0; i < 2; ++i) {
12329         pshufbMask.push_back(DAG.getConstant(0x0, DL, MVT::i8));
12330         pshufbMask.push_back(DAG.getConstant(0x1, DL, MVT::i8));
12331         pshufbMask.push_back(DAG.getConstant(0x4, DL, MVT::i8));
12332         pshufbMask.push_back(DAG.getConstant(0x5, DL, MVT::i8));
12333         pshufbMask.push_back(DAG.getConstant(0x8, DL, MVT::i8));
12334         pshufbMask.push_back(DAG.getConstant(0x9, DL, MVT::i8));
12335         pshufbMask.push_back(DAG.getConstant(0xc, DL, MVT::i8));
12336         pshufbMask.push_back(DAG.getConstant(0xd, DL, MVT::i8));
12337         for (unsigned j = 0; j < 8; ++j)
12338           pshufbMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
12339       }
12340       SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, pshufbMask);
12341       In = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, In, BV);
12342       In = DAG.getBitcast(MVT::v4i64, In);
12343
12344       static const int ShufMask[] = {0,  2,  -1,  -1};
12345       In = DAG.getVectorShuffle(MVT::v4i64, DL,  In, DAG.getUNDEF(MVT::v4i64),
12346                                 &ShufMask[0]);
12347       In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
12348                        DAG.getIntPtrConstant(0, DL));
12349       return DAG.getBitcast(VT, In);
12350     }
12351
12352     SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
12353                                DAG.getIntPtrConstant(0, DL));
12354
12355     SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
12356                                DAG.getIntPtrConstant(4, DL));
12357
12358     OpLo = DAG.getBitcast(MVT::v16i8, OpLo);
12359     OpHi = DAG.getBitcast(MVT::v16i8, OpHi);
12360
12361     // The PSHUFB mask:
12362     static const int ShufMask1[] = {0,  1,  4,  5,  8,  9, 12, 13,
12363                                    -1, -1, -1, -1, -1, -1, -1, -1};
12364
12365     SDValue Undef = DAG.getUNDEF(MVT::v16i8);
12366     OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, Undef, ShufMask1);
12367     OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, Undef, ShufMask1);
12368
12369     OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
12370     OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
12371
12372     // The MOVLHPS Mask:
12373     static const int ShufMask2[] = {0, 1, 4, 5};
12374     SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
12375     return DAG.getBitcast(MVT::v8i16, res);
12376   }
12377
12378   // Handle truncation of V256 to V128 using shuffles.
12379   if (!VT.is128BitVector() || !InVT.is256BitVector())
12380     return SDValue();
12381
12382   assert(Subtarget->hasFp256() && "256-bit vector without AVX!");
12383
12384   unsigned NumElems = VT.getVectorNumElements();
12385   MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2);
12386
12387   SmallVector<int, 16> MaskVec(NumElems * 2, -1);
12388   // Prepare truncation shuffle mask
12389   for (unsigned i = 0; i != NumElems; ++i)
12390     MaskVec[i] = i * 2;
12391   SDValue V = DAG.getVectorShuffle(NVT, DL, DAG.getBitcast(NVT, In),
12392                                    DAG.getUNDEF(NVT), &MaskVec[0]);
12393   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
12394                      DAG.getIntPtrConstant(0, DL));
12395 }
12396
12397 SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op,
12398                                            SelectionDAG &DAG) const {
12399   assert(!Op.getSimpleValueType().isVector());
12400
12401   std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
12402     /*IsSigned=*/ true, /*IsReplace=*/ false);
12403   SDValue FIST = Vals.first, StackSlot = Vals.second;
12404   // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
12405   if (!FIST.getNode()) return Op;
12406
12407   if (StackSlot.getNode())
12408     // Load the result.
12409     return DAG.getLoad(Op.getValueType(), SDLoc(Op),
12410                        FIST, StackSlot, MachinePointerInfo(),
12411                        false, false, false, 0);
12412
12413   // The node is the result.
12414   return FIST;
12415 }
12416
12417 SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op,
12418                                            SelectionDAG &DAG) const {
12419   std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
12420     /*IsSigned=*/ false, /*IsReplace=*/ false);
12421   SDValue FIST = Vals.first, StackSlot = Vals.second;
12422   assert(FIST.getNode() && "Unexpected failure");
12423
12424   if (StackSlot.getNode())
12425     // Load the result.
12426     return DAG.getLoad(Op.getValueType(), SDLoc(Op),
12427                        FIST, StackSlot, MachinePointerInfo(),
12428                        false, false, false, 0);
12429
12430   // The node is the result.
12431   return FIST;
12432 }
12433
12434 static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
12435   SDLoc DL(Op);
12436   MVT VT = Op.getSimpleValueType();
12437   SDValue In = Op.getOperand(0);
12438   MVT SVT = In.getSimpleValueType();
12439
12440   assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
12441
12442   return DAG.getNode(X86ISD::VFPEXT, DL, VT,
12443                      DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
12444                                  In, DAG.getUNDEF(SVT)));
12445 }
12446
12447 /// The only differences between FABS and FNEG are the mask and the logic op.
12448 /// FNEG also has a folding opportunity for FNEG(FABS(x)).
12449 static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
12450   assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
12451          "Wrong opcode for lowering FABS or FNEG.");
12452
12453   bool IsFABS = (Op.getOpcode() == ISD::FABS);
12454
12455   // If this is a FABS and it has an FNEG user, bail out to fold the combination
12456   // into an FNABS. We'll lower the FABS after that if it is still in use.
12457   if (IsFABS)
12458     for (SDNode *User : Op->uses())
12459       if (User->getOpcode() == ISD::FNEG)
12460         return Op;
12461
12462   SDValue Op0 = Op.getOperand(0);
12463   bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
12464
12465   SDLoc dl(Op);
12466   MVT VT = Op.getSimpleValueType();
12467   // Assume scalar op for initialization; update for vector if needed.
12468   // Note that there are no scalar bitwise logical SSE/AVX instructions, so we
12469   // generate a 16-byte vector constant and logic op even for the scalar case.
12470   // Using a 16-byte mask allows folding the load of the mask with
12471   // the logic op, so it can save (~4 bytes) on code size.
12472   MVT EltVT = VT;
12473   unsigned NumElts = VT == MVT::f64 ? 2 : 4;
12474   // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
12475   // decide if we should generate a 16-byte constant mask when we only need 4 or
12476   // 8 bytes for the scalar case.
12477   if (VT.isVector()) {
12478     EltVT = VT.getVectorElementType();
12479     NumElts = VT.getVectorNumElements();
12480   }
12481
12482   unsigned EltBits = EltVT.getSizeInBits();
12483   LLVMContext *Context = DAG.getContext();
12484   // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
12485   APInt MaskElt =
12486     IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignBit(EltBits);
12487   Constant *C = ConstantInt::get(*Context, MaskElt);
12488   C = ConstantVector::getSplat(NumElts, C);
12489   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
12490   SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy());
12491   unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
12492   SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
12493                              MachinePointerInfo::getConstantPool(),
12494                              false, false, false, Alignment);
12495
12496   if (VT.isVector()) {
12497     // For a vector, cast operands to a vector type, perform the logic op,
12498     // and cast the result back to the original value type.
12499     MVT VecVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
12500     SDValue MaskCasted = DAG.getBitcast(VecVT, Mask);
12501     SDValue Operand = IsFNABS ? DAG.getBitcast(VecVT, Op0.getOperand(0))
12502                               : DAG.getBitcast(VecVT, Op0);
12503     unsigned BitOp = IsFABS ? ISD::AND : IsFNABS ? ISD::OR : ISD::XOR;
12504     return DAG.getBitcast(VT,
12505                           DAG.getNode(BitOp, dl, VecVT, Operand, MaskCasted));
12506   }
12507
12508   // If not vector, then scalar.
12509   unsigned BitOp = IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
12510   SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
12511   return DAG.getNode(BitOp, dl, VT, Operand, Mask);
12512 }
12513
12514 static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
12515   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
12516   LLVMContext *Context = DAG.getContext();
12517   SDValue Op0 = Op.getOperand(0);
12518   SDValue Op1 = Op.getOperand(1);
12519   SDLoc dl(Op);
12520   MVT VT = Op.getSimpleValueType();
12521   MVT SrcVT = Op1.getSimpleValueType();
12522
12523   // If second operand is smaller, extend it first.
12524   if (SrcVT.bitsLT(VT)) {
12525     Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1);
12526     SrcVT = VT;
12527   }
12528   // And if it is bigger, shrink it first.
12529   if (SrcVT.bitsGT(VT)) {
12530     Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1, dl));
12531     SrcVT = VT;
12532   }
12533
12534   // At this point the operands and the result should have the same
12535   // type, and that won't be f80 since that is not custom lowered.
12536
12537   const fltSemantics &Sem =
12538       VT == MVT::f64 ? APFloat::IEEEdouble : APFloat::IEEEsingle;
12539   const unsigned SizeInBits = VT.getSizeInBits();
12540
12541   SmallVector<Constant *, 4> CV(
12542       VT == MVT::f64 ? 2 : 4,
12543       ConstantFP::get(*Context, APFloat(Sem, APInt(SizeInBits, 0))));
12544
12545   // First, clear all bits but the sign bit from the second operand (sign).
12546   CV[0] = ConstantFP::get(*Context,
12547                           APFloat(Sem, APInt::getHighBitsSet(SizeInBits, 1)));
12548   Constant *C = ConstantVector::get(CV);
12549   SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(), 16);
12550   SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx,
12551                               MachinePointerInfo::getConstantPool(),
12552                               false, false, false, 16);
12553   SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1);
12554
12555   // Next, clear the sign bit from the first operand (magnitude).
12556   // If it's a constant, we can clear it here.
12557   if (ConstantFPSDNode *Op0CN = dyn_cast<ConstantFPSDNode>(Op0)) {
12558     APFloat APF = Op0CN->getValueAPF();
12559     // If the magnitude is a positive zero, the sign bit alone is enough.
12560     if (APF.isPosZero())
12561       return SignBit;
12562     APF.clearSign();
12563     CV[0] = ConstantFP::get(*Context, APF);
12564   } else {
12565     CV[0] = ConstantFP::get(
12566         *Context,
12567         APFloat(Sem, APInt::getLowBitsSet(SizeInBits, SizeInBits - 1)));
12568   }
12569   C = ConstantVector::get(CV);
12570   CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(), 16);
12571   SDValue Val = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
12572                             MachinePointerInfo::getConstantPool(),
12573                             false, false, false, 16);
12574   // If the magnitude operand wasn't a constant, we need to AND out the sign.
12575   if (!isa<ConstantFPSDNode>(Op0))
12576     Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Val);
12577
12578   // OR the magnitude value with the sign bit.
12579   return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit);
12580 }
12581
12582 static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
12583   SDValue N0 = Op.getOperand(0);
12584   SDLoc dl(Op);
12585   MVT VT = Op.getSimpleValueType();
12586
12587   // Lower ISD::FGETSIGN to (AND (X86ISD::FGETSIGNx86 ...) 1).
12588   SDValue xFGETSIGN = DAG.getNode(X86ISD::FGETSIGNx86, dl, VT, N0,
12589                                   DAG.getConstant(1, dl, VT));
12590   return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, dl, VT));
12591 }
12592
12593 // Check whether an OR'd tree is PTEST-able.
12594 static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget *Subtarget,
12595                                       SelectionDAG &DAG) {
12596   assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
12597
12598   if (!Subtarget->hasSSE41())
12599     return SDValue();
12600
12601   if (!Op->hasOneUse())
12602     return SDValue();
12603
12604   SDNode *N = Op.getNode();
12605   SDLoc DL(N);
12606
12607   SmallVector<SDValue, 8> Opnds;
12608   DenseMap<SDValue, unsigned> VecInMap;
12609   SmallVector<SDValue, 8> VecIns;
12610   EVT VT = MVT::Other;
12611
12612   // Recognize a special case where a vector is casted into wide integer to
12613   // test all 0s.
12614   Opnds.push_back(N->getOperand(0));
12615   Opnds.push_back(N->getOperand(1));
12616
12617   for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
12618     SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
12619     // BFS traverse all OR'd operands.
12620     if (I->getOpcode() == ISD::OR) {
12621       Opnds.push_back(I->getOperand(0));
12622       Opnds.push_back(I->getOperand(1));
12623       // Re-evaluate the number of nodes to be traversed.
12624       e += 2; // 2 more nodes (LHS and RHS) are pushed.
12625       continue;
12626     }
12627
12628     // Quit if a non-EXTRACT_VECTOR_ELT
12629     if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
12630       return SDValue();
12631
12632     // Quit if without a constant index.
12633     SDValue Idx = I->getOperand(1);
12634     if (!isa<ConstantSDNode>(Idx))
12635       return SDValue();
12636
12637     SDValue ExtractedFromVec = I->getOperand(0);
12638     DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);
12639     if (M == VecInMap.end()) {
12640       VT = ExtractedFromVec.getValueType();
12641       // Quit if not 128/256-bit vector.
12642       if (!VT.is128BitVector() && !VT.is256BitVector())
12643         return SDValue();
12644       // Quit if not the same type.
12645       if (VecInMap.begin() != VecInMap.end() &&
12646           VT != VecInMap.begin()->first.getValueType())
12647         return SDValue();
12648       M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
12649       VecIns.push_back(ExtractedFromVec);
12650     }
12651     M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
12652   }
12653
12654   assert((VT.is128BitVector() || VT.is256BitVector()) &&
12655          "Not extracted from 128-/256-bit vector.");
12656
12657   unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;
12658
12659   for (DenseMap<SDValue, unsigned>::const_iterator
12660         I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
12661     // Quit if not all elements are used.
12662     if (I->second != FullMask)
12663       return SDValue();
12664   }
12665
12666   EVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
12667
12668   // Cast all vectors into TestVT for PTEST.
12669   for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
12670     VecIns[i] = DAG.getBitcast(TestVT, VecIns[i]);
12671
12672   // If more than one full vectors are evaluated, OR them first before PTEST.
12673   for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
12674     // Each iteration will OR 2 nodes and append the result until there is only
12675     // 1 node left, i.e. the final OR'd value of all vectors.
12676     SDValue LHS = VecIns[Slot];
12677     SDValue RHS = VecIns[Slot + 1];
12678     VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
12679   }
12680
12681   return DAG.getNode(X86ISD::PTEST, DL, MVT::i32,
12682                      VecIns.back(), VecIns.back());
12683 }
12684
12685 /// \brief return true if \c Op has a use that doesn't just read flags.
12686 static bool hasNonFlagsUse(SDValue Op) {
12687   for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
12688        ++UI) {
12689     SDNode *User = *UI;
12690     unsigned UOpNo = UI.getOperandNo();
12691     if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
12692       // Look pass truncate.
12693       UOpNo = User->use_begin().getOperandNo();
12694       User = *User->use_begin();
12695     }
12696
12697     if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
12698         !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
12699       return true;
12700   }
12701   return false;
12702 }
12703
12704 /// Emit nodes that will be selected as "test Op0,Op0", or something
12705 /// equivalent.
12706 SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl,
12707                                     SelectionDAG &DAG) const {
12708   if (Op.getValueType() == MVT::i1) {
12709     SDValue ExtOp = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, Op);
12710     return DAG.getNode(X86ISD::CMP, dl, MVT::i32, ExtOp,
12711                        DAG.getConstant(0, dl, MVT::i8));
12712   }
12713   // CF and OF aren't always set the way we want. Determine which
12714   // of these we need.
12715   bool NeedCF = false;
12716   bool NeedOF = false;
12717   switch (X86CC) {
12718   default: break;
12719   case X86::COND_A: case X86::COND_AE:
12720   case X86::COND_B: case X86::COND_BE:
12721     NeedCF = true;
12722     break;
12723   case X86::COND_G: case X86::COND_GE:
12724   case X86::COND_L: case X86::COND_LE:
12725   case X86::COND_O: case X86::COND_NO: {
12726     // Check if we really need to set the
12727     // Overflow flag. If NoSignedWrap is present
12728     // that is not actually needed.
12729     switch (Op->getOpcode()) {
12730     case ISD::ADD:
12731     case ISD::SUB:
12732     case ISD::MUL:
12733     case ISD::SHL: {
12734       const auto *BinNode = cast<BinaryWithFlagsSDNode>(Op.getNode());
12735       if (BinNode->Flags.hasNoSignedWrap())
12736         break;
12737     }
12738     default:
12739       NeedOF = true;
12740       break;
12741     }
12742     break;
12743   }
12744   }
12745   // See if we can use the EFLAGS value from the operand instead of
12746   // doing a separate TEST. TEST always sets OF and CF to 0, so unless
12747   // we prove that the arithmetic won't overflow, we can't use OF or CF.
12748   if (Op.getResNo() != 0 || NeedOF || NeedCF) {
12749     // Emit a CMP with 0, which is the TEST pattern.
12750     //if (Op.getValueType() == MVT::i1)
12751     //  return DAG.getNode(X86ISD::CMP, dl, MVT::i1, Op,
12752     //                     DAG.getConstant(0, MVT::i1));
12753     return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
12754                        DAG.getConstant(0, dl, Op.getValueType()));
12755   }
12756   unsigned Opcode = 0;
12757   unsigned NumOperands = 0;
12758
12759   // Truncate operations may prevent the merge of the SETCC instruction
12760   // and the arithmetic instruction before it. Attempt to truncate the operands
12761   // of the arithmetic instruction and use a reduced bit-width instruction.
12762   bool NeedTruncation = false;
12763   SDValue ArithOp = Op;
12764   if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
12765     SDValue Arith = Op->getOperand(0);
12766     // Both the trunc and the arithmetic op need to have one user each.
12767     if (Arith->hasOneUse())
12768       switch (Arith.getOpcode()) {
12769         default: break;
12770         case ISD::ADD:
12771         case ISD::SUB:
12772         case ISD::AND:
12773         case ISD::OR:
12774         case ISD::XOR: {
12775           NeedTruncation = true;
12776           ArithOp = Arith;
12777         }
12778       }
12779   }
12780
12781   // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
12782   // which may be the result of a CAST.  We use the variable 'Op', which is the
12783   // non-casted variable when we check for possible users.
12784   switch (ArithOp.getOpcode()) {
12785   case ISD::ADD:
12786     // Due to an isel shortcoming, be conservative if this add is likely to be
12787     // selected as part of a load-modify-store instruction. When the root node
12788     // in a match is a store, isel doesn't know how to remap non-chain non-flag
12789     // uses of other nodes in the match, such as the ADD in this case. This
12790     // leads to the ADD being left around and reselected, with the result being
12791     // two adds in the output.  Alas, even if none our users are stores, that
12792     // doesn't prove we're O.K.  Ergo, if we have any parents that aren't
12793     // CopyToReg or SETCC, eschew INC/DEC.  A better fix seems to require
12794     // climbing the DAG back to the root, and it doesn't seem to be worth the
12795     // effort.
12796     for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
12797          UE = Op.getNode()->use_end(); UI != UE; ++UI)
12798       if (UI->getOpcode() != ISD::CopyToReg &&
12799           UI->getOpcode() != ISD::SETCC &&
12800           UI->getOpcode() != ISD::STORE)
12801         goto default_case;
12802
12803     if (ConstantSDNode *C =
12804         dyn_cast<ConstantSDNode>(ArithOp.getNode()->getOperand(1))) {
12805       // An add of one will be selected as an INC.
12806       if (C->getAPIntValue() == 1 && !Subtarget->slowIncDec()) {
12807         Opcode = X86ISD::INC;
12808         NumOperands = 1;
12809         break;
12810       }
12811
12812       // An add of negative one (subtract of one) will be selected as a DEC.
12813       if (C->getAPIntValue().isAllOnesValue() && !Subtarget->slowIncDec()) {
12814         Opcode = X86ISD::DEC;
12815         NumOperands = 1;
12816         break;
12817       }
12818     }
12819
12820     // Otherwise use a regular EFLAGS-setting add.
12821     Opcode = X86ISD::ADD;
12822     NumOperands = 2;
12823     break;
12824   case ISD::SHL:
12825   case ISD::SRL:
12826     // If we have a constant logical shift that's only used in a comparison
12827     // against zero turn it into an equivalent AND. This allows turning it into
12828     // a TEST instruction later.
12829     if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) && Op->hasOneUse() &&
12830         isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) {
12831       EVT VT = Op.getValueType();
12832       unsigned BitWidth = VT.getSizeInBits();
12833       unsigned ShAmt = Op->getConstantOperandVal(1);
12834       if (ShAmt >= BitWidth) // Avoid undefined shifts.
12835         break;
12836       APInt Mask = ArithOp.getOpcode() == ISD::SRL
12837                        ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt)
12838                        : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);
12839       if (!Mask.isSignedIntN(32)) // Avoid large immediates.
12840         break;
12841       SDValue New = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0),
12842                                 DAG.getConstant(Mask, dl, VT));
12843       DAG.ReplaceAllUsesWith(Op, New);
12844       Op = New;
12845     }
12846     break;
12847
12848   case ISD::AND:
12849     // If the primary and result isn't used, don't bother using X86ISD::AND,
12850     // because a TEST instruction will be better.
12851     if (!hasNonFlagsUse(Op))
12852       break;
12853     // FALL THROUGH
12854   case ISD::SUB:
12855   case ISD::OR:
12856   case ISD::XOR:
12857     // Due to the ISEL shortcoming noted above, be conservative if this op is
12858     // likely to be selected as part of a load-modify-store instruction.
12859     for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
12860            UE = Op.getNode()->use_end(); UI != UE; ++UI)
12861       if (UI->getOpcode() == ISD::STORE)
12862         goto default_case;
12863
12864     // Otherwise use a regular EFLAGS-setting instruction.
12865     switch (ArithOp.getOpcode()) {
12866     default: llvm_unreachable("unexpected operator!");
12867     case ISD::SUB: Opcode = X86ISD::SUB; break;
12868     case ISD::XOR: Opcode = X86ISD::XOR; break;
12869     case ISD::AND: Opcode = X86ISD::AND; break;
12870     case ISD::OR: {
12871       if (!NeedTruncation && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
12872         SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG);
12873         if (EFLAGS.getNode())
12874           return EFLAGS;
12875       }
12876       Opcode = X86ISD::OR;
12877       break;
12878     }
12879     }
12880
12881     NumOperands = 2;
12882     break;
12883   case X86ISD::ADD:
12884   case X86ISD::SUB:
12885   case X86ISD::INC:
12886   case X86ISD::DEC:
12887   case X86ISD::OR:
12888   case X86ISD::XOR:
12889   case X86ISD::AND:
12890     return SDValue(Op.getNode(), 1);
12891   default:
12892   default_case:
12893     break;
12894   }
12895
12896   // If we found that truncation is beneficial, perform the truncation and
12897   // update 'Op'.
12898   if (NeedTruncation) {
12899     EVT VT = Op.getValueType();
12900     SDValue WideVal = Op->getOperand(0);
12901     EVT WideVT = WideVal.getValueType();
12902     unsigned ConvertedOp = 0;
12903     // Use a target machine opcode to prevent further DAGCombine
12904     // optimizations that may separate the arithmetic operations
12905     // from the setcc node.
12906     switch (WideVal.getOpcode()) {
12907       default: break;
12908       case ISD::ADD: ConvertedOp = X86ISD::ADD; break;
12909       case ISD::SUB: ConvertedOp = X86ISD::SUB; break;
12910       case ISD::AND: ConvertedOp = X86ISD::AND; break;
12911       case ISD::OR:  ConvertedOp = X86ISD::OR;  break;
12912       case ISD::XOR: ConvertedOp = X86ISD::XOR; break;
12913     }
12914
12915     if (ConvertedOp) {
12916       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
12917       if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
12918         SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
12919         SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
12920         Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1);
12921       }
12922     }
12923   }
12924
12925   if (Opcode == 0)
12926     // Emit a CMP with 0, which is the TEST pattern.
12927     return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
12928                        DAG.getConstant(0, dl, Op.getValueType()));
12929
12930   SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
12931   SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
12932
12933   SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
12934   DAG.ReplaceAllUsesWith(Op, New);
12935   return SDValue(New.getNode(), 1);
12936 }
12937
12938 /// Emit nodes that will be selected as "cmp Op0,Op1", or something
12939 /// equivalent.
12940 SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
12941                                    SDLoc dl, SelectionDAG &DAG) const {
12942   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1)) {
12943     if (C->getAPIntValue() == 0)
12944       return EmitTest(Op0, X86CC, dl, DAG);
12945
12946      if (Op0.getValueType() == MVT::i1)
12947        llvm_unreachable("Unexpected comparison operation for MVT::i1 operands");
12948   }
12949
12950   if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
12951        Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
12952     // Do the comparison at i32 if it's smaller, besides the Atom case.
12953     // This avoids subregister aliasing issues. Keep the smaller reference
12954     // if we're optimizing for size, however, as that'll allow better folding
12955     // of memory operations.
12956     if (Op0.getValueType() != MVT::i32 && Op0.getValueType() != MVT::i64 &&
12957         !DAG.getMachineFunction().getFunction()->hasFnAttribute(
12958             Attribute::MinSize) &&
12959         !Subtarget->isAtom()) {
12960       unsigned ExtendOp =
12961           isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
12962       Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0);
12963       Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1);
12964     }
12965     // Use SUB instead of CMP to enable CSE between SUB and CMP.
12966     SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
12967     SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs,
12968                               Op0, Op1);
12969     return SDValue(Sub.getNode(), 1);
12970   }
12971   return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
12972 }
12973
12974 /// Convert a comparison if required by the subtarget.
12975 SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
12976                                                  SelectionDAG &DAG) const {
12977   // If the subtarget does not support the FUCOMI instruction, floating-point
12978   // comparisons have to be converted.
12979   if (Subtarget->hasCMov() ||
12980       Cmp.getOpcode() != X86ISD::CMP ||
12981       !Cmp.getOperand(0).getValueType().isFloatingPoint() ||
12982       !Cmp.getOperand(1).getValueType().isFloatingPoint())
12983     return Cmp;
12984
12985   // The instruction selector will select an FUCOM instruction instead of
12986   // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
12987   // build an SDNode sequence that transfers the result from FPSW into EFLAGS:
12988   // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
12989   SDLoc dl(Cmp);
12990   SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
12991   SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
12992   SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
12993                             DAG.getConstant(8, dl, MVT::i8));
12994   SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
12995   return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
12996 }
12997
12998 /// The minimum architected relative accuracy is 2^-12. We need one
12999 /// Newton-Raphson step to have a good float result (24 bits of precision).
13000 SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op,
13001                                             DAGCombinerInfo &DCI,
13002                                             unsigned &RefinementSteps,
13003                                             bool &UseOneConstNR) const {
13004   EVT VT = Op.getValueType();
13005   const char *RecipOp;
13006
13007   // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
13008   // TODO: Add support for AVX512 (v16f32).
13009   // It is likely not profitable to do this for f64 because a double-precision
13010   // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
13011   // instructions: convert to single, rsqrtss, convert back to double, refine
13012   // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
13013   // along with FMA, this could be a throughput win.
13014   if (VT == MVT::f32 && Subtarget->hasSSE1())
13015     RecipOp = "sqrtf";
13016   else if ((VT == MVT::v4f32 && Subtarget->hasSSE1()) ||
13017            (VT == MVT::v8f32 && Subtarget->hasAVX()))
13018     RecipOp = "vec-sqrtf";
13019   else
13020     return SDValue();
13021
13022   TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals;
13023   if (!Recips.isEnabled(RecipOp))
13024     return SDValue();
13025
13026   RefinementSteps = Recips.getRefinementSteps(RecipOp);
13027   UseOneConstNR = false;
13028   return DCI.DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
13029 }
13030
13031 /// The minimum architected relative accuracy is 2^-12. We need one
13032 /// Newton-Raphson step to have a good float result (24 bits of precision).
13033 SDValue X86TargetLowering::getRecipEstimate(SDValue Op,
13034                                             DAGCombinerInfo &DCI,
13035                                             unsigned &RefinementSteps) const {
13036   EVT VT = Op.getValueType();
13037   const char *RecipOp;
13038
13039   // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
13040   // TODO: Add support for AVX512 (v16f32).
13041   // It is likely not profitable to do this for f64 because a double-precision
13042   // reciprocal estimate with refinement on x86 prior to FMA requires
13043   // 15 instructions: convert to single, rcpss, convert back to double, refine
13044   // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
13045   // along with FMA, this could be a throughput win.
13046   if (VT == MVT::f32 && Subtarget->hasSSE1())
13047     RecipOp = "divf";
13048   else if ((VT == MVT::v4f32 && Subtarget->hasSSE1()) ||
13049            (VT == MVT::v8f32 && Subtarget->hasAVX()))
13050     RecipOp = "vec-divf";
13051   else
13052     return SDValue();
13053
13054   TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals;
13055   if (!Recips.isEnabled(RecipOp))
13056     return SDValue();
13057
13058   RefinementSteps = Recips.getRefinementSteps(RecipOp);
13059   return DCI.DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
13060 }
13061
13062 /// If we have at least two divisions that use the same divisor, convert to
13063 /// multplication by a reciprocal. This may need to be adjusted for a given
13064 /// CPU if a division's cost is not at least twice the cost of a multiplication.
13065 /// This is because we still need one division to calculate the reciprocal and
13066 /// then we need two multiplies by that reciprocal as replacements for the
13067 /// original divisions.
13068 bool X86TargetLowering::combineRepeatedFPDivisors(unsigned NumUsers) const {
13069   return NumUsers > 1;
13070 }
13071
13072 static bool isAllOnes(SDValue V) {
13073   ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
13074   return C && C->isAllOnesValue();
13075 }
13076
13077 /// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node
13078 /// if it's possible.
13079 SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
13080                                      SDLoc dl, SelectionDAG &DAG) const {
13081   SDValue Op0 = And.getOperand(0);
13082   SDValue Op1 = And.getOperand(1);
13083   if (Op0.getOpcode() == ISD::TRUNCATE)
13084     Op0 = Op0.getOperand(0);
13085   if (Op1.getOpcode() == ISD::TRUNCATE)
13086     Op1 = Op1.getOperand(0);
13087
13088   SDValue LHS, RHS;
13089   if (Op1.getOpcode() == ISD::SHL)
13090     std::swap(Op0, Op1);
13091   if (Op0.getOpcode() == ISD::SHL) {
13092     if (ConstantSDNode *And00C = dyn_cast<ConstantSDNode>(Op0.getOperand(0)))
13093       if (And00C->getZExtValue() == 1) {
13094         // If we looked past a truncate, check that it's only truncating away
13095         // known zeros.
13096         unsigned BitWidth = Op0.getValueSizeInBits();
13097         unsigned AndBitWidth = And.getValueSizeInBits();
13098         if (BitWidth > AndBitWidth) {
13099           APInt Zeros, Ones;
13100           DAG.computeKnownBits(Op0, Zeros, Ones);
13101           if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth)
13102             return SDValue();
13103         }
13104         LHS = Op1;
13105         RHS = Op0.getOperand(1);
13106       }
13107   } else if (Op1.getOpcode() == ISD::Constant) {
13108     ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
13109     uint64_t AndRHSVal = AndRHS->getZExtValue();
13110     SDValue AndLHS = Op0;
13111
13112     if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
13113       LHS = AndLHS.getOperand(0);
13114       RHS = AndLHS.getOperand(1);
13115     }
13116
13117     // Use BT if the immediate can't be encoded in a TEST instruction.
13118     if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) {
13119       LHS = AndLHS;
13120       RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl, LHS.getValueType());
13121     }
13122   }
13123
13124   if (LHS.getNode()) {
13125     // If LHS is i8, promote it to i32 with any_extend.  There is no i8 BT
13126     // instruction.  Since the shift amount is in-range-or-undefined, we know
13127     // that doing a bittest on the i32 value is ok.  We extend to i32 because
13128     // the encoding for the i16 version is larger than the i32 version.
13129     // Also promote i16 to i32 for performance / code size reason.
13130     if (LHS.getValueType() == MVT::i8 ||
13131         LHS.getValueType() == MVT::i16)
13132       LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);
13133
13134     // If the operand types disagree, extend the shift amount to match.  Since
13135     // BT ignores high bits (like shifts) we can use anyextend.
13136     if (LHS.getValueType() != RHS.getValueType())
13137       RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS);
13138
13139     SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS);
13140     X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
13141     return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
13142                        DAG.getConstant(Cond, dl, MVT::i8), BT);
13143   }
13144
13145   return SDValue();
13146 }
13147
13148 /// \brief - Turns an ISD::CondCode into a value suitable for SSE floating point
13149 /// mask CMPs.
13150 static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
13151                               SDValue &Op1) {
13152   unsigned SSECC;
13153   bool Swap = false;
13154
13155   // SSE Condition code mapping:
13156   //  0 - EQ
13157   //  1 - LT
13158   //  2 - LE
13159   //  3 - UNORD
13160   //  4 - NEQ
13161   //  5 - NLT
13162   //  6 - NLE
13163   //  7 - ORD
13164   switch (SetCCOpcode) {
13165   default: llvm_unreachable("Unexpected SETCC condition");
13166   case ISD::SETOEQ:
13167   case ISD::SETEQ:  SSECC = 0; break;
13168   case ISD::SETOGT:
13169   case ISD::SETGT:  Swap = true; // Fallthrough
13170   case ISD::SETLT:
13171   case ISD::SETOLT: SSECC = 1; break;
13172   case ISD::SETOGE:
13173   case ISD::SETGE:  Swap = true; // Fallthrough
13174   case ISD::SETLE:
13175   case ISD::SETOLE: SSECC = 2; break;
13176   case ISD::SETUO:  SSECC = 3; break;
13177   case ISD::SETUNE:
13178   case ISD::SETNE:  SSECC = 4; break;
13179   case ISD::SETULE: Swap = true; // Fallthrough
13180   case ISD::SETUGE: SSECC = 5; break;
13181   case ISD::SETULT: Swap = true; // Fallthrough
13182   case ISD::SETUGT: SSECC = 6; break;
13183   case ISD::SETO:   SSECC = 7; break;
13184   case ISD::SETUEQ:
13185   case ISD::SETONE: SSECC = 8; break;
13186   }
13187   if (Swap)
13188     std::swap(Op0, Op1);
13189
13190   return SSECC;
13191 }
13192
13193 // Lower256IntVSETCC - Break a VSETCC 256-bit integer VSETCC into two new 128
13194 // ones, and then concatenate the result back.
13195 static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
13196   MVT VT = Op.getSimpleValueType();
13197
13198   assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
13199          "Unsupported value type for operation");
13200
13201   unsigned NumElems = VT.getVectorNumElements();
13202   SDLoc dl(Op);
13203   SDValue CC = Op.getOperand(2);
13204
13205   // Extract the LHS vectors
13206   SDValue LHS = Op.getOperand(0);
13207   SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
13208   SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
13209
13210   // Extract the RHS vectors
13211   SDValue RHS = Op.getOperand(1);
13212   SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl);
13213   SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl);
13214
13215   // Issue the operation on the smaller types and concatenate the result back
13216   MVT EltVT = VT.getVectorElementType();
13217   MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
13218   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
13219                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
13220                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
13221 }
13222
13223 static SDValue LowerBoolVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
13224   SDValue Op0 = Op.getOperand(0);
13225   SDValue Op1 = Op.getOperand(1);
13226   SDValue CC = Op.getOperand(2);
13227   MVT VT = Op.getSimpleValueType();
13228   SDLoc dl(Op);
13229
13230   assert(Op0.getValueType().getVectorElementType() == MVT::i1 &&
13231          "Unexpected type for boolean compare operation");
13232   ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
13233   SDValue NotOp0 = DAG.getNode(ISD::XOR, dl, VT, Op0,
13234                                DAG.getConstant(-1, dl, VT));
13235   SDValue NotOp1 = DAG.getNode(ISD::XOR, dl, VT, Op1,
13236                                DAG.getConstant(-1, dl, VT));
13237   switch (SetCCOpcode) {
13238   default: llvm_unreachable("Unexpected SETCC condition");
13239   case ISD::SETNE:
13240     // (x != y) -> ~(x ^ y)
13241     return DAG.getNode(ISD::XOR, dl, VT,
13242                        DAG.getNode(ISD::XOR, dl, VT, Op0, Op1),
13243                        DAG.getConstant(-1, dl, VT));
13244   case ISD::SETEQ:
13245     // (x == y) -> (x ^ y)
13246     return DAG.getNode(ISD::XOR, dl, VT, Op0, Op1);
13247   case ISD::SETUGT:
13248   case ISD::SETGT:
13249     // (x > y) -> (x & ~y)
13250     return DAG.getNode(ISD::AND, dl, VT, Op0, NotOp1);
13251   case ISD::SETULT:
13252   case ISD::SETLT:
13253     // (x < y) -> (~x & y)
13254     return DAG.getNode(ISD::AND, dl, VT, NotOp0, Op1);
13255   case ISD::SETULE:
13256   case ISD::SETLE:
13257     // (x <= y) -> (~x | y)
13258     return DAG.getNode(ISD::OR, dl, VT, NotOp0, Op1);
13259   case ISD::SETUGE:
13260   case ISD::SETGE:
13261     // (x >=y) -> (x | ~y)
13262     return DAG.getNode(ISD::OR, dl, VT, Op0, NotOp1);
13263   }
13264 }
13265
13266 static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG,
13267                                      const X86Subtarget *Subtarget) {
13268   SDValue Op0 = Op.getOperand(0);
13269   SDValue Op1 = Op.getOperand(1);
13270   SDValue CC = Op.getOperand(2);
13271   MVT VT = Op.getSimpleValueType();
13272   SDLoc dl(Op);
13273
13274   assert(Op0.getValueType().getVectorElementType().getSizeInBits() >= 8 &&
13275          Op.getValueType().getScalarType() == MVT::i1 &&
13276          "Cannot set masked compare for this operation");
13277
13278   ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
13279   unsigned  Opc = 0;
13280   bool Unsigned = false;
13281   bool Swap = false;
13282   unsigned SSECC;
13283   switch (SetCCOpcode) {
13284   default: llvm_unreachable("Unexpected SETCC condition");
13285   case ISD::SETNE:  SSECC = 4; break;
13286   case ISD::SETEQ:  Opc = X86ISD::PCMPEQM; break;
13287   case ISD::SETUGT: SSECC = 6; Unsigned = true; break;
13288   case ISD::SETLT:  Swap = true; //fall-through
13289   case ISD::SETGT:  Opc = X86ISD::PCMPGTM; break;
13290   case ISD::SETULT: SSECC = 1; Unsigned = true; break;
13291   case ISD::SETUGE: SSECC = 5; Unsigned = true; break; //NLT
13292   case ISD::SETGE:  Swap = true; SSECC = 2; break; // LE + swap
13293   case ISD::SETULE: Unsigned = true; //fall-through
13294   case ISD::SETLE:  SSECC = 2; break;
13295   }
13296
13297   if (Swap)
13298     std::swap(Op0, Op1);
13299   if (Opc)
13300     return DAG.getNode(Opc, dl, VT, Op0, Op1);
13301   Opc = Unsigned ? X86ISD::CMPMU: X86ISD::CMPM;
13302   return DAG.getNode(Opc, dl, VT, Op0, Op1,
13303                      DAG.getConstant(SSECC, dl, MVT::i8));
13304 }
13305
13306 /// \brief Try to turn a VSETULT into a VSETULE by modifying its second
13307 /// operand \p Op1.  If non-trivial (for example because it's not constant)
13308 /// return an empty value.
13309 static SDValue ChangeVSETULTtoVSETULE(SDLoc dl, SDValue Op1, SelectionDAG &DAG)
13310 {
13311   BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1.getNode());
13312   if (!BV)
13313     return SDValue();
13314
13315   MVT VT = Op1.getSimpleValueType();
13316   MVT EVT = VT.getVectorElementType();
13317   unsigned n = VT.getVectorNumElements();
13318   SmallVector<SDValue, 8> ULTOp1;
13319
13320   for (unsigned i = 0; i < n; ++i) {
13321     ConstantSDNode *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
13322     if (!Elt || Elt->isOpaque() || Elt->getValueType(0) != EVT)
13323       return SDValue();
13324
13325     // Avoid underflow.
13326     APInt Val = Elt->getAPIntValue();
13327     if (Val == 0)
13328       return SDValue();
13329
13330     ULTOp1.push_back(DAG.getConstant(Val - 1, dl, EVT));
13331   }
13332
13333   return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, ULTOp1);
13334 }
13335
13336 static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
13337                            SelectionDAG &DAG) {
13338   SDValue Op0 = Op.getOperand(0);
13339   SDValue Op1 = Op.getOperand(1);
13340   SDValue CC = Op.getOperand(2);
13341   MVT VT = Op.getSimpleValueType();
13342   ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
13343   bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
13344   SDLoc dl(Op);
13345
13346   if (isFP) {
13347 #ifndef NDEBUG
13348     MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
13349     assert(EltVT == MVT::f32 || EltVT == MVT::f64);
13350 #endif
13351
13352     unsigned SSECC = translateX86FSETCC(SetCCOpcode, Op0, Op1);
13353     unsigned Opc = X86ISD::CMPP;
13354     if (Subtarget->hasAVX512() && VT.getVectorElementType() == MVT::i1) {
13355       assert(VT.getVectorNumElements() <= 16);
13356       Opc = X86ISD::CMPM;
13357     }
13358     // In the two special cases we can't handle, emit two comparisons.
13359     if (SSECC == 8) {
13360       unsigned CC0, CC1;
13361       unsigned CombineOpc;
13362       if (SetCCOpcode == ISD::SETUEQ) {
13363         CC0 = 3; CC1 = 0; CombineOpc = ISD::OR;
13364       } else {
13365         assert(SetCCOpcode == ISD::SETONE);
13366         CC0 = 7; CC1 = 4; CombineOpc = ISD::AND;
13367       }
13368
13369       SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
13370                                  DAG.getConstant(CC0, dl, MVT::i8));
13371       SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
13372                                  DAG.getConstant(CC1, dl, MVT::i8));
13373       return DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
13374     }
13375     // Handle all other FP comparisons here.
13376     return DAG.getNode(Opc, dl, VT, Op0, Op1,
13377                        DAG.getConstant(SSECC, dl, MVT::i8));
13378   }
13379
13380   // Break 256-bit integer vector compare into smaller ones.
13381   if (VT.is256BitVector() && !Subtarget->hasInt256())
13382     return Lower256IntVSETCC(Op, DAG);
13383
13384   EVT OpVT = Op1.getValueType();
13385   if (OpVT.getVectorElementType() == MVT::i1)
13386     return LowerBoolVSETCC_AVX512(Op, DAG);
13387
13388   bool MaskResult = (VT.getVectorElementType() == MVT::i1);
13389   if (Subtarget->hasAVX512()) {
13390     if (Op1.getValueType().is512BitVector() ||
13391         (Subtarget->hasBWI() && Subtarget->hasVLX()) ||
13392         (MaskResult && OpVT.getVectorElementType().getSizeInBits() >= 32))
13393       return LowerIntVSETCC_AVX512(Op, DAG, Subtarget);
13394
13395     // In AVX-512 architecture setcc returns mask with i1 elements,
13396     // But there is no compare instruction for i8 and i16 elements in KNL.
13397     // We are not talking about 512-bit operands in this case, these
13398     // types are illegal.
13399     if (MaskResult &&
13400         (OpVT.getVectorElementType().getSizeInBits() < 32 &&
13401          OpVT.getVectorElementType().getSizeInBits() >= 8))
13402       return DAG.getNode(ISD::TRUNCATE, dl, VT,
13403                          DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC));
13404   }
13405
13406   // We are handling one of the integer comparisons here.  Since SSE only has
13407   // GT and EQ comparisons for integer, swapping operands and multiple
13408   // operations may be required for some comparisons.
13409   unsigned Opc;
13410   bool Swap = false, Invert = false, FlipSigns = false, MinMax = false;
13411   bool Subus = false;
13412
13413   switch (SetCCOpcode) {
13414   default: llvm_unreachable("Unexpected SETCC condition");
13415   case ISD::SETNE:  Invert = true;
13416   case ISD::SETEQ:  Opc = X86ISD::PCMPEQ; break;
13417   case ISD::SETLT:  Swap = true;
13418   case ISD::SETGT:  Opc = X86ISD::PCMPGT; break;
13419   case ISD::SETGE:  Swap = true;
13420   case ISD::SETLE:  Opc = X86ISD::PCMPGT;
13421                     Invert = true; break;
13422   case ISD::SETULT: Swap = true;
13423   case ISD::SETUGT: Opc = X86ISD::PCMPGT;
13424                     FlipSigns = true; break;
13425   case ISD::SETUGE: Swap = true;
13426   case ISD::SETULE: Opc = X86ISD::PCMPGT;
13427                     FlipSigns = true; Invert = true; break;
13428   }
13429
13430   // Special case: Use min/max operations for SETULE/SETUGE
13431   MVT VET = VT.getVectorElementType();
13432   bool hasMinMax =
13433        (Subtarget->hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32))
13434     || (Subtarget->hasSSE2()  && (VET == MVT::i8));
13435
13436   if (hasMinMax) {
13437     switch (SetCCOpcode) {
13438     default: break;
13439     case ISD::SETULE: Opc = X86ISD::UMIN; MinMax = true; break;
13440     case ISD::SETUGE: Opc = X86ISD::UMAX; MinMax = true; break;
13441     }
13442
13443     if (MinMax) { Swap = false; Invert = false; FlipSigns = false; }
13444   }
13445
13446   bool hasSubus = Subtarget->hasSSE2() && (VET == MVT::i8 || VET == MVT::i16);
13447   if (!MinMax && hasSubus) {
13448     // As another special case, use PSUBUS[BW] when it's profitable. E.g. for
13449     // Op0 u<= Op1:
13450     //   t = psubus Op0, Op1
13451     //   pcmpeq t, <0..0>
13452     switch (SetCCOpcode) {
13453     default: break;
13454     case ISD::SETULT: {
13455       // If the comparison is against a constant we can turn this into a
13456       // setule.  With psubus, setule does not require a swap.  This is
13457       // beneficial because the constant in the register is no longer
13458       // destructed as the destination so it can be hoisted out of a loop.
13459       // Only do this pre-AVX since vpcmp* is no longer destructive.
13460       if (Subtarget->hasAVX())
13461         break;
13462       SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG);
13463       if (ULEOp1.getNode()) {
13464         Op1 = ULEOp1;
13465         Subus = true; Invert = false; Swap = false;
13466       }
13467       break;
13468     }
13469     // Psubus is better than flip-sign because it requires no inversion.
13470     case ISD::SETUGE: Subus = true; Invert = false; Swap = true;  break;
13471     case ISD::SETULE: Subus = true; Invert = false; Swap = false; break;
13472     }
13473
13474     if (Subus) {
13475       Opc = X86ISD::SUBUS;
13476       FlipSigns = false;
13477     }
13478   }
13479
13480   if (Swap)
13481     std::swap(Op0, Op1);
13482
13483   // Check that the operation in question is available (most are plain SSE2,
13484   // but PCMPGTQ and PCMPEQQ have different requirements).
13485   if (VT == MVT::v2i64) {
13486     if (Opc == X86ISD::PCMPGT && !Subtarget->hasSSE42()) {
13487       assert(Subtarget->hasSSE2() && "Don't know how to lower!");
13488
13489       // First cast everything to the right type.
13490       Op0 = DAG.getBitcast(MVT::v4i32, Op0);
13491       Op1 = DAG.getBitcast(MVT::v4i32, Op1);
13492
13493       // Since SSE has no unsigned integer comparisons, we need to flip the sign
13494       // bits of the inputs before performing those operations. The lower
13495       // compare is always unsigned.
13496       SDValue SB;
13497       if (FlipSigns) {
13498         SB = DAG.getConstant(0x80000000U, dl, MVT::v4i32);
13499       } else {
13500         SDValue Sign = DAG.getConstant(0x80000000U, dl, MVT::i32);
13501         SDValue Zero = DAG.getConstant(0x00000000U, dl, MVT::i32);
13502         SB = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
13503                          Sign, Zero, Sign, Zero);
13504       }
13505       Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB);
13506       Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB);
13507
13508       // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
13509       SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
13510       SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
13511
13512       // Create masks for only the low parts/high parts of the 64 bit integers.
13513       static const int MaskHi[] = { 1, 1, 3, 3 };
13514       static const int MaskLo[] = { 0, 0, 2, 2 };
13515       SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
13516       SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
13517       SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
13518
13519       SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
13520       Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
13521
13522       if (Invert)
13523         Result = DAG.getNOT(dl, Result, MVT::v4i32);
13524
13525       return DAG.getBitcast(VT, Result);
13526     }
13527
13528     if (Opc == X86ISD::PCMPEQ && !Subtarget->hasSSE41()) {
13529       // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
13530       // pcmpeqd + pshufd + pand.
13531       assert(Subtarget->hasSSE2() && !FlipSigns && "Don't know how to lower!");
13532
13533       // First cast everything to the right type.
13534       Op0 = DAG.getBitcast(MVT::v4i32, Op0);
13535       Op1 = DAG.getBitcast(MVT::v4i32, Op1);
13536
13537       // Do the compare.
13538       SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
13539
13540       // Make sure the lower and upper halves are both all-ones.
13541       static const int Mask[] = { 1, 0, 3, 2 };
13542       SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
13543       Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
13544
13545       if (Invert)
13546         Result = DAG.getNOT(dl, Result, MVT::v4i32);
13547
13548       return DAG.getBitcast(VT, Result);
13549     }
13550   }
13551
13552   // Since SSE has no unsigned integer comparisons, we need to flip the sign
13553   // bits of the inputs before performing those operations.
13554   if (FlipSigns) {
13555     EVT EltVT = VT.getVectorElementType();
13556     SDValue SB = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), dl,
13557                                  VT);
13558     Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SB);
13559     Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SB);
13560   }
13561
13562   SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
13563
13564   // If the logical-not of the result is required, perform that now.
13565   if (Invert)
13566     Result = DAG.getNOT(dl, Result, VT);
13567
13568   if (MinMax)
13569     Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
13570
13571   if (Subus)
13572     Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
13573                          getZeroVector(VT, Subtarget, DAG, dl));
13574
13575   return Result;
13576 }
13577
13578 SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
13579
13580   MVT VT = Op.getSimpleValueType();
13581
13582   if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
13583
13584   assert(((!Subtarget->hasAVX512() && VT == MVT::i8) || (VT == MVT::i1))
13585          && "SetCC type must be 8-bit or 1-bit integer");
13586   SDValue Op0 = Op.getOperand(0);
13587   SDValue Op1 = Op.getOperand(1);
13588   SDLoc dl(Op);
13589   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
13590
13591   // Optimize to BT if possible.
13592   // Lower (X & (1 << N)) == 0 to BT(X, N).
13593   // Lower ((X >>u N) & 1) != 0 to BT(X, N).
13594   // Lower ((X >>s N) & 1) != 0 to BT(X, N).
13595   if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() &&
13596       Op1.getOpcode() == ISD::Constant &&
13597       cast<ConstantSDNode>(Op1)->isNullValue() &&
13598       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
13599     SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG);
13600     if (NewSetCC.getNode()) {
13601       if (VT == MVT::i1)
13602         return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewSetCC);
13603       return NewSetCC;
13604     }
13605   }
13606
13607   // Look for X == 0, X == 1, X != 0, or X != 1.  We can simplify some forms of
13608   // these.
13609   if (Op1.getOpcode() == ISD::Constant &&
13610       (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 ||
13611        cast<ConstantSDNode>(Op1)->isNullValue()) &&
13612       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
13613
13614     // If the input is a setcc, then reuse the input setcc or use a new one with
13615     // the inverted condition.
13616     if (Op0.getOpcode() == X86ISD::SETCC) {
13617       X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
13618       bool Invert = (CC == ISD::SETNE) ^
13619         cast<ConstantSDNode>(Op1)->isNullValue();
13620       if (!Invert)
13621         return Op0;
13622
13623       CCode = X86::GetOppositeBranchCondition(CCode);
13624       SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
13625                                   DAG.getConstant(CCode, dl, MVT::i8),
13626                                   Op0.getOperand(1));
13627       if (VT == MVT::i1)
13628         return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
13629       return SetCC;
13630     }
13631   }
13632   if ((Op0.getValueType() == MVT::i1) && (Op1.getOpcode() == ISD::Constant) &&
13633       (cast<ConstantSDNode>(Op1)->getZExtValue() == 1) &&
13634       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
13635
13636     ISD::CondCode NewCC = ISD::getSetCCInverse(CC, true);
13637     return DAG.getSetCC(dl, VT, Op0, DAG.getConstant(0, dl, MVT::i1), NewCC);
13638   }
13639
13640   bool isFP = Op1.getSimpleValueType().isFloatingPoint();
13641   unsigned X86CC = TranslateX86CC(CC, dl, isFP, Op0, Op1, DAG);
13642   if (X86CC == X86::COND_INVALID)
13643     return SDValue();
13644
13645   SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG);
13646   EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
13647   SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
13648                               DAG.getConstant(X86CC, dl, MVT::i8), EFLAGS);
13649   if (VT == MVT::i1)
13650     return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
13651   return SetCC;
13652 }
13653
13654 // isX86LogicalCmp - Return true if opcode is a X86 logical comparison.
13655 static bool isX86LogicalCmp(SDValue Op) {
13656   unsigned Opc = Op.getNode()->getOpcode();
13657   if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
13658       Opc == X86ISD::SAHF)
13659     return true;
13660   if (Op.getResNo() == 1 &&
13661       (Opc == X86ISD::ADD ||
13662        Opc == X86ISD::SUB ||
13663        Opc == X86ISD::ADC ||
13664        Opc == X86ISD::SBB ||
13665        Opc == X86ISD::SMUL ||
13666        Opc == X86ISD::UMUL ||
13667        Opc == X86ISD::INC ||
13668        Opc == X86ISD::DEC ||
13669        Opc == X86ISD::OR ||
13670        Opc == X86ISD::XOR ||
13671        Opc == X86ISD::AND))
13672     return true;
13673
13674   if (Op.getResNo() == 2 && Opc == X86ISD::UMUL)
13675     return true;
13676
13677   return false;
13678 }
13679
13680 static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
13681   if (V.getOpcode() != ISD::TRUNCATE)
13682     return false;
13683
13684   SDValue VOp0 = V.getOperand(0);
13685   unsigned InBits = VOp0.getValueSizeInBits();
13686   unsigned Bits = V.getValueSizeInBits();
13687   return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
13688 }
13689
      // LowerSELECT - Custom-lower a select whose operands are (Cond, Op1, Op2).
      // The comments below treat Op1 as the value chosen when Cond holds.
      //
      // Strategies, tried in this order:
      //  1. f32/f64 select fed by a single-use ISD::SETCC on the same FP type,
      //     when the required SSE level is present: build an FSETCC and use
      //     X86ISD::SELECT (AVX-512), a vector VSELECT (AVX, non-constant arms),
      //     or an FAND/FANDN/FOR sequence.
      //  2. vXi1 selects whose arms are constant build-vectors or bitcasts:
      //     select on the scalar form and bitcast (or extract) back.
      //  3. v4i1/v2i1 selects: widen both arms to v8i1, select, extract.
      //  4. Selects between -1 and another value keyed on (x ==/!= 0):
      //     SETCC_CARRY based sequences.
      //  5. Everything else: reuse an existing flag-producing node, or emit a
      //     BT/TEST, and finish with X86ISD::CMOV (or a SETCC_CARRY special case).
13690 SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
      // addTest stays true until some node is found that can feed the CMOV
      // directly; if it is still true near the end, a TEST of Cond is emitted.
13691   bool addTest = true;
13692   SDValue Cond  = Op.getOperand(0);
13693   SDValue Op1 = Op.getOperand(1);
13694   SDValue Op2 = Op.getOperand(2);
13695   SDLoc DL(Op);
13696   EVT VT = Op1.getValueType();
13697   SDValue CC;
13698
13699   // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
13700   // are available or VBLENDV if AVX is available.
13701   // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
13702   if (Cond.getOpcode() == ISD::SETCC &&
13703       ((Subtarget->hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) ||
13704        (Subtarget->hasSSE1() && VT == MVT::f32)) &&
13705       VT == Cond.getOperand(0).getValueType() && Cond->hasOneUse()) {
13706     SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
13707     int SSECC = translateX86FSETCC(
13708         cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);
13709
      // NOTE(review): 8 appears to be translateX86FSETCC's "no usable SSE
      // predicate" marker -- confirm against that helper.
13710     if (SSECC != 8) {
      // With AVX-512 the compare yields an i1 that X86ISD::SELECT consumes
      // directly.
13711       if (Subtarget->hasAVX512()) {
13712         SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CondOp0, CondOp1,
13713                                   DAG.getConstant(SSECC, DL, MVT::i8));
13714         return DAG.getNode(X86ISD::SELECT, DL, VT, Cmp, Op1, Op2);
13715       }
13716
      // Otherwise the compare result has the same FP type as the operands and
      // is used as a mask by the blends below.
13717       SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
13718                                 DAG.getConstant(SSECC, DL, MVT::i8));
13719
13720       // If we have AVX, we can use a variable vector select (VBLENDV) instead
13721       // of 3 logic instructions for size savings and potentially speed.
13722       // Unfortunately, there is no scalar form of VBLENDV.
13723
13724       // If either operand is a constant, don't try this. We can expect to
13725       // optimize away at least one of the logic instructions later in that
13726       // case, so that sequence would be faster than a variable blend.
13727
13728       // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
13729       // uses XMM0 as the selection register. That may need just as many
13730       // instructions as the AND/ANDN/OR sequence due to register moves, so
13731       // don't bother.
13732
13733       if (Subtarget->hasAVX() &&
13734           !isa<ConstantFPSDNode>(Op1) && !isa<ConstantFPSDNode>(Op2)) {
13735
13736         // Convert to vectors, do a VSELECT, and convert back to scalar.
13737         // All of the conversions should be optimized away.
13738
13739         EVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
13740         SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
13741         SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
13742         SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
13743
      // The VSELECT mask is given an integer vector type of the same shape.
13744         EVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
13745         VCmp = DAG.getBitcast(VCmpVT, VCmp);
13746
13747         SDValue VSel = DAG.getNode(ISD::VSELECT, DL, VecVT, VCmp, VOp1, VOp2);
13748
13749         return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
13750                            VSel, DAG.getIntPtrConstant(0, DL));
13751       }
      // Logic-op fallback: combine (Cmp FANDN Op2) with (Cmp FAND Op1) via FOR.
13752       SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
13753       SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
13754       return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
13755     }
13756   }
13757
      // vXi1 select: when each arm is either a constant build-vector (converted
      // by ConvertI1VectorToInterger) or a bitcast (its source operand is used),
      // select on those operands instead of on the mask vectors.
13758     if (VT.isVector() && VT.getScalarType() == MVT::i1) {
13759       SDValue Op1Scalar;
13760       if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode()))
13761         Op1Scalar = ConvertI1VectorToInterger(Op1, DAG);
13762       else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0))
13763         Op1Scalar = Op1.getOperand(0);
13764       SDValue Op2Scalar;
13765       if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode()))
13766         Op2Scalar = ConvertI1VectorToInterger(Op2, DAG);
13767       else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0))
13768         Op2Scalar = Op2.getOperand(0);
      // Only proceed when both arms produced a replacement operand.
13769       if (Op1Scalar.getNode() && Op2Scalar.getNode()) {
13770         SDValue newSelect = DAG.getNode(ISD::SELECT, DL,
13771                                         Op1Scalar.getValueType(),
13772                                         Cond, Op1Scalar, Op2Scalar);
      // Same total width: a plain bitcast back to VT is enough. Otherwise the
      // result is reinterpreted as v8i1 and VT is extracted from index 0.
13773         if (newSelect.getValueSizeInBits() == VT.getSizeInBits())
13774           return DAG.getBitcast(VT, newSelect);
13775         SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect);
13776         return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec,
13777                            DAG.getIntPtrConstant(0, DL));
13778     }
13779   }
13780
      // v4i1/v2i1: insert each arm at index 0 of an undef v8i1, select in v8i1,
      // then extract the original-width subvector from index 0.
13781   if (VT == MVT::v4i1 || VT == MVT::v2i1) {
13782     SDValue zeroConst = DAG.getIntPtrConstant(0, DL);
13783     Op1 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
13784                       DAG.getUNDEF(MVT::v8i1), Op1, zeroConst);
13785     Op2 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
13786                       DAG.getUNDEF(MVT::v8i1), Op2, zeroConst);
13787     SDValue newSelect = DAG.getNode(ISD::SELECT, DL, MVT::v8i1,
13788                                     Cond, Op1, Op2);
13789     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, newSelect, zeroConst);
13790   }
13791
      // Lower a generic SETCC condition now so the pattern matches below see the
      // X86-specific form; keep the original if LowerSETCC returns nothing.
13792   if (Cond.getOpcode() == ISD::SETCC) {
13793     SDValue NewCond = LowerSETCC(Cond, DAG);
13794     if (NewCond.getNode())
13795       Cond = NewCond;
13796   }
13797
13798   // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
13799   // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
13800   // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
13801   // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
13802   if (Cond.getOpcode() == X86ISD::SETCC &&
13803       Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
13804       isZero(Cond.getOperand(1).getOperand(1))) {
13805     SDValue Cmp = Cond.getOperand(1);
13806
13807     unsigned CondCode =cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
13808
13809     if ((isAllOnes(Op1) || isAllOnes(Op2)) &&
13810         (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
      // Y is the arm that is not the all-ones constant (Op1 if both are).
13811       SDValue Y = isAllOnes(Op2) ? Op1 : Op2;
13812
13813       SDValue CmpOp0 = Cmp.getOperand(0);
13814       // Apply further optimizations for special cases
13815       // (select (x != 0), -1, 0) -> neg & sbb
13816       // (select (x == 0), 0, -1) -> neg & sbb
13817       if (ConstantSDNode *YC = dyn_cast<ConstantSDNode>(Y))
13818         if (YC->isNullValue() &&
13819             (isAllOnes(Op1) == (CondCode == X86::COND_NE))) {
      // 0 - x borrows exactly when x != 0; SETCC_CARRY on COND_B turns that
      // flag (result 1 of the SUB) into the select result.
13820           SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
13821           SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs,
13822                                     DAG.getConstant(0, DL,
13823                                                     CmpOp0.getValueType()),
13824                                     CmpOp0);
13825           SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
13826                                     DAG.getConstant(X86::COND_B, DL, MVT::i8),
13827                                     SDValue(Neg.getNode(), 1));
13828           return Res;
13829         }
13830
      // General case: compare x against 1 instead of 0, so that the unsigned
      // "below" condition holds exactly when x == 0.
13831       Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
13832                         CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType()));
13833       Cmp = ConvertCmpIfNecessary(Cmp, DAG);
13834
13835       SDValue Res =   // Res = 0 or -1.
13836         DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
13837                     DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp);
13838
      // Invert when the -1 arm belongs to the x != 0 outcome.
13839       if (isAllOnes(Op1) != (CondCode == X86::COND_E))
13840         Res = DAG.getNOT(DL, Res, Res.getValueType());
13841
      // The OR with Y is skipped only when Op2 is the constant zero.
13842       ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2);
13843       if (!N2C || !N2C->isNullValue())
13844         Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
13845       return Res;
13846     }
13847   }
13848
13849   // Look past (and (setcc_carry (cmp ...)), 1).
13850   if (Cond.getOpcode() == ISD::AND &&
13851       Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
13852     ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
13853     if (C && C->getAPIntValue() == 1)
13854       Cond = Cond.getOperand(0);
13855   }
13856
13857   // If condition flag is set by a X86ISD::CMP, then use it as the condition
13858   // setting operand in place of the X86ISD::SETCC.
13859   unsigned CondOpcode = Cond.getOpcode();
13860   if (CondOpcode == X86ISD::SETCC ||
13861       CondOpcode == X86ISD::SETCC_CARRY) {
13862     CC = Cond.getOperand(0);
13863
13864     SDValue Cmp = Cond.getOperand(1);
13865     unsigned Opc = Cmp.getOpcode();
      // Note: this VT (the select's result type as an MVT) shadows the outer
      // EVT VT for the rest of this block.
13866     MVT VT = Op.getSimpleValueType();
13867
      // A scalar FP value not held in an SSE register can only use the
      // condition codes accepted by hasFPCMov.
13868     bool IllegalFPCMov = false;
13869     if (VT.isFloatingPoint() && !VT.isVector() &&
13870         !isScalarFPTypeInSSEReg(VT))  // FPStack?
13871       IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
13872
13873     if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
13874         Opc == X86ISD::BT) { // FIXME
13875       Cond = Cmp;
13876       addTest = false;
13877     }
13878   } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
13879              CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
13880              ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
13881               Cond.getOperand(0).getValueType() != MVT::i8)) {
      // The condition is the overflow bit of an arithmetic-with-overflow node
      // (i8 multiplies excluded): re-emit it as the matching X86 node and select
      // on that node's flag result with the corresponding condition code.
13882     SDValue LHS = Cond.getOperand(0);
13883     SDValue RHS = Cond.getOperand(1);
13884     unsigned X86Opcode;
13885     unsigned X86Cond;
13886     SDVTList VTs;
13887     switch (CondOpcode) {
13888     case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
13889     case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
13890     case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
13891     case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
13892     case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
13893     case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
13894     default: llvm_unreachable("unexpected overflowing operator");
13895     }
      // UMUL is built with two value results, so its flag result is index 2;
      // the other nodes carry it at index 1.
13896     if (CondOpcode == ISD::UMULO)
13897       VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
13898                           MVT::i32);
13899     else
13900       VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
13901
13902     SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS);
13903
13904     if (CondOpcode == ISD::UMULO)
13905       Cond = X86Op.getValue(2);
13906     else
13907       Cond = X86Op.getValue(1);
13908
13909     CC = DAG.getConstant(X86Cond, DL, MVT::i8);
13910     addTest = false;
13911   }
13912
13913   if (addTest) {
13914     // Look past the truncate if the high bits are known zero.
13915     if (isTruncWithZeroHighBitsInput(Cond, DAG))
13916         Cond = Cond.getOperand(0);
13917
13918     // We know the result of AND is compared against zero. Try to match
13919     // it to BT.
13920     if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
13921       SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG);
13922       if (NewSetCC.getNode()) {
13923         CC = NewSetCC.getOperand(0);
13924         Cond = NewSetCC.getOperand(1);
13925         addTest = false;
13926       }
13927     }
13928   }
13929
      // Nothing reusable was found: emit a test of Cond and select on COND_NE.
13930   if (addTest) {
13931     CC = DAG.getConstant(X86::COND_NE, DL, MVT::i8);
13932     Cond = EmitTest(Cond, X86::COND_NE, DL, DAG);
13933   }
13934
13935   // a <  b ? -1 :  0 -> RES = ~setcc_carry
13936   // a <  b ?  0 : -1 -> RES = setcc_carry
13937   // a >= b ? -1 :  0 -> RES = setcc_carry
13938   // a >= b ?  0 : -1 -> RES = ~setcc_carry
13939   if (Cond.getOpcode() == X86ISD::SUB) {
13940     Cond = ConvertCmpIfNecessary(Cond, DAG);
13941     unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
13942
      // Applies only when one arm is all-ones and the other is zero.
13943     if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
13944         (isAllOnes(Op1) || isAllOnes(Op2)) && (isZero(Op1) || isZero(Op2))) {
13945       SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
13946                                 DAG.getConstant(X86::COND_B, DL, MVT::i8),
13947                                 Cond);
13948       if (isAllOnes(Op1) != (CondCode == X86::COND_B))
13949         return DAG.getNOT(DL, Res, Res.getValueType());
13950       return Res;
13951     }
13952   }
13953
13954   // X86 doesn't have an i8 cmov. If both operands are the result of a truncate
13955   // widen the cmov and push the truncate through. This avoids introducing a new
13956   // branch during isel and doesn't add any extensions.
13957   if (Op.getValueType() == MVT::i8 &&
13958       Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
13959     SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
13960     if (T1.getValueType() == T2.getValueType() &&
13961         // Blacklist CopyFromReg to avoid partial register stalls.
13962         T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
13963       SDVTList VTs = DAG.getVTList(T1.getValueType(), MVT::Glue);
13964       SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VTs, T2, T1, CC, Cond);
13965       return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
13966     }
13967   }
13968
13969   // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
13970   // condition is true.
      // Hence the operand order: Op2 first, then Op1, then the condition code and
      // the flag-producing node.
13971   SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
13972   SDValue Ops[] = { Op2, Op1, CC, Cond };
13973   return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops);
13974 }
13975
13976 static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op,
13977                                        const X86Subtarget *Subtarget,
13978                                        SelectionDAG &DAG) {
13979   MVT VT = Op->getSimpleValueType(0);
13980   SDValue In = Op->getOperand(0);
13981   MVT InVT = In.getSimpleValueType();
13982   MVT VTElt = VT.getVectorElementType();
13983   MVT InVTElt = InVT.getVectorElementType();
13984   SDLoc dl(Op);
13985
13986   // SKX processor
13987   if ((InVTElt == MVT::i1) &&
13988       (((Subtarget->hasBWI() && Subtarget->hasVLX() &&
13989         VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() <= 16)) ||
13990
13991        ((Subtarget->hasBWI() && VT.is512BitVector() &&
13992         VTElt.getSizeInBits() <= 16)) ||
13993
13994        ((Subtarget->hasDQI() && Subtarget->hasVLX() &&
13995         VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() >= 32)) ||
13996
13997        ((Subtarget->hasDQI() && VT.is512BitVector() &&
13998         VTElt.getSizeInBits() >= 32))))
13999     return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
14000
14001   unsigned int NumElts = VT.getVectorNumElements();
14002
14003   if (NumElts != 8 && NumElts != 16 && !Subtarget->hasBWI())
14004     return SDValue();
14005
14006   if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1) {
14007     if (In.getOpcode() == X86ISD::VSEXT || In.getOpcode() == X86ISD::VZEXT)
14008       return DAG.getNode(In.getOpcode(), dl, VT, In.getOperand(0));
14009     return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
14010   }
14011
14012   assert (InVT.getVectorElementType() == MVT::i1 && "Unexpected vector type");
14013   MVT ExtVT = NumElts == 8 ? MVT::v8i64 : MVT::v16i32;
14014   SDValue NegOne =
14015    DAG.getConstant(APInt::getAllOnesValue(ExtVT.getScalarSizeInBits()), dl,
14016                    ExtVT);
14017   SDValue Zero =
14018    DAG.getConstant(APInt::getNullValue(ExtVT.getScalarSizeInBits()), dl, ExtVT);
14019
14020   SDValue V = DAG.getNode(ISD::VSELECT, dl, ExtVT, In, NegOne, Zero);
14021   if (VT.is512BitVector())
14022     return V;
14023   return DAG.getNode(X86ISD::VTRUNC, dl, VT, V);
14024 }
14025
      // LowerSIGN_EXTEND_VECTOR_INREG - Lower an in-register vector sign extension.
      // The result type has the same total width as the input but wider elements
      // (both asserted below). Only v2i64, v4i32 and v8i16 results built from
      // i8/i16/i32 input elements are handled; every other combination returns an
      // empty SDValue.
      //
      // With SSE4.1 this is a single X86ISD::VSEXT. Otherwise the low input lanes
      // are widened with UNPCKL and sign-filled with arithmetic right shifts.
14026 static SDValue LowerSIGN_EXTEND_VECTOR_INREG(SDValue Op,
14027                                              const X86Subtarget *Subtarget,
14028                                              SelectionDAG &DAG) {
14029   SDValue In = Op->getOperand(0);
14030   MVT VT = Op->getSimpleValueType(0);
14031   MVT InVT = In.getSimpleValueType();
14032   assert(VT.getSizeInBits() == InVT.getSizeInBits());
14033
14034   MVT InSVT = InVT.getScalarType();
14035   assert(VT.getScalarType().getScalarSizeInBits() > InSVT.getScalarSizeInBits());
14036
14037   if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16)
14038     return SDValue();
14039   if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
14040     return SDValue();
14041
14042   SDLoc dl(Op);
14043
14044   // SSE41 targets can use the pmovsx* instructions directly.
14045   if (Subtarget->hasSSE41())
14046     return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
14047
14048   // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
14049   SDValue Curr = In;
14050   MVT CurrVT = InVT;
14051
14052   // As SRAI is only available on i16/i32 types, we expand only up to i32
14053   // and handle i64 separately.
      // Each iteration interleaves undef lanes with Curr (UNPCKL) and then views
      // the result as half as many lanes of twice the width. The shift amount
      // computed below relies on the source bits sitting at the top of each
      // widened lane.
14054   while (CurrVT != VT && CurrVT.getScalarType() != MVT::i32) {
14055     Curr = DAG.getNode(X86ISD::UNPCKL, dl, CurrVT, DAG.getUNDEF(CurrVT), Curr);
14056     MVT CurrSVT = MVT::getIntegerVT(CurrVT.getScalarSizeInBits() * 2);
14057     CurrVT = MVT::getVectorVT(CurrSVT, CurrVT.getVectorNumElements() / 2);
14058     Curr = DAG.getBitcast(CurrVT, Curr);
14059   }
14060
      // If any widening happened, shift right arithmetically by the number of
      // bits gained so the lane holds the sign-extended source value.
14061   SDValue SignExt = Curr;
14062   if (CurrVT != InVT) {
14063     unsigned SignExtShift =
14064         CurrVT.getScalarSizeInBits() - InSVT.getScalarSizeInBits();
14065     SignExt = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
14066                           DAG.getConstant(SignExtShift, dl, MVT::i8));
14067   }
14068
14069   if (CurrVT == VT)
14070     return SignExt;
14071
      // v2i64 result: there is no 64-bit SRAI, so build each i64 from two i32
      // lanes. Sign holds each lane shifted right by 31; the shuffle pairs lane i
      // of SignExt with lane i of Sign for i = 0 and 1.
14072   if (VT == MVT::v2i64 && CurrVT == MVT::v4i32) {
14073     SDValue Sign = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
14074                                DAG.getConstant(31, dl, MVT::i8));
14075     SDValue Ext = DAG.getVectorShuffle(CurrVT, dl, SignExt, Sign, {0, 4, 1, 5});
14076     return DAG.getBitcast(VT, Ext);
14077   }
14078
14079   return SDValue();
14080 }
14081
      // LowerSIGN_EXTEND - Custom-lower a vector sign extension.
      //
      // 512-bit results and i1-element inputs are forwarded to
      // LowerSIGN_EXTEND_AVX512. Otherwise only v4i32->v4i64, v8i16->v8i32 and
      // v16i8->v16i16 are handled (an empty SDValue is returned for anything
      // else): as a single X86ISD::VSEXT when hasInt256() is true, or as two
      // half-width VSEXTs concatenated together when it is not.
14082 static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
14083                                 SelectionDAG &DAG) {
14084   MVT VT = Op->getSimpleValueType(0);
14085   SDValue In = Op->getOperand(0);
14086   MVT InVT = In.getSimpleValueType();
14087   SDLoc dl(Op);
14088
14089   if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
14090     return LowerSIGN_EXTEND_AVX512(Op, Subtarget, DAG);
14091
      // Bail out unless (VT, InVT) is one of the three supported pairs.
14092   if ((VT != MVT::v4i64 || InVT != MVT::v4i32) &&
14093       (VT != MVT::v8i32 || InVT != MVT::v8i16) &&
14094       (VT != MVT::v16i16 || InVT != MVT::v16i8))
14095     return SDValue();
14096
14097   if (Subtarget->hasInt256())
14098     return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
14099
14100   // Optimize vectors in AVX mode
14101   // Sign extend  v8i16 to v8i32 and
14102   //              v4i32 to v4i64
      // (the type check above also lets v16i8 to v16i16 reach this path)
14103   //
14104   // Divide input vector into two parts
14105   // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1}
14106   // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
14107   // concat the vectors to original VT
14108
14109   unsigned NumElems = InVT.getVectorNumElements();
14110   SDValue Undef = DAG.getUNDEF(InVT);
14111
      // OpLo: lower half of In in the low lanes, remaining lanes undefined.
14112   SmallVector<int,8> ShufMask1(NumElems, -1);
14113   for (unsigned i = 0; i != NumElems/2; ++i)
14114     ShufMask1[i] = i;
14115
14116   SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask1[0]);
14117
      // OpHi: upper half of In moved down into the low lanes.
14118   SmallVector<int,8> ShufMask2(NumElems, -1);
14119   for (unsigned i = 0; i != NumElems/2; ++i)
14120     ShufMask2[i] = i + NumElems/2;
14121
14122   SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask2[0]);
14123
      // Result element type, half the result's lane count.
14124   MVT HalfVT = MVT::getVectorVT(VT.getScalarType(),
14125                                 VT.getVectorNumElements()/2);
14126
14127   OpLo = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpLo);
14128   OpHi = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpHi);
14129
14130   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
14131 }
14132
14133 // Lower vector extended loads using a shuffle. If SSSE3 is not available we
14134 // may emit an illegal shuffle but the expansion is still better than scalar
14135 // code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise
14136 // we'll emit a shuffle and an arithmetic shift.
14137 // FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
14138 // TODO: It is possible to support ZExt by zeroing the undef values during
14139 // the shuffle phase or after the shuffle.
      //
      // Op must be an integer-vector load with EXTLOAD or SEXTLOAD extension (both
      // asserted below). Besides returning the replacement value, every path
      // redirects users of the original load's chain result (value 1) to the chain
      // of the newly created load(s).
14140 static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget,
14141                                  SelectionDAG &DAG) {
14142   MVT RegVT = Op.getSimpleValueType();
14143   assert(RegVT.isVector() && "We only custom lower vector sext loads.");
14144   assert(RegVT.isInteger() &&
14145          "We only custom lower integer vector sext loads.");
14146
14147   // Nothing useful we can do without SSE2 shuffles.
14148   assert(Subtarget->hasSSE2() && "We only custom lower sext loads with SSE2.");
14149
14150   LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
14151   SDLoc dl(Ld);
14152   EVT MemVT = Ld->getMemoryVT();
14153   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14154   unsigned RegSz = RegVT.getSizeInBits();
14155
14156   ISD::LoadExtType Ext = Ld->getExtensionType();
14157
14158   assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)
14159          && "Only anyext and sext are currently implemented.");
14160   assert(MemVT != RegVT && "Cannot extend to the same type");
14161   assert(MemVT.isVector() && "Must load a vector from memory");
14162
14163   unsigned NumElems = RegVT.getVectorNumElements();
14164   unsigned MemSz = MemVT.getSizeInBits();
14165   assert(RegSz > MemSz && "Register size must be greater than the mem size");
14166
14167   if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget->hasInt256()) {
14168     // The only way in which we have a legal 256-bit vector result but not the
14169     // integer 256-bit operations needed to directly lower a sextload is if we
14170     // have AVX1 but not AVX2. In that case, we can always emit a sextload to
14171     // a 128-bit vector and a normal sign_extend to 256-bits that should get
14172     // correctly legalized. We do this late to allow the canonical form of
14173     // sextload to persist throughout the rest of the DAG combiner -- it wants
14174     // to fold together any extensions it can, and so will fuse a sign_extend
14175     // of an sextload into a sextload targeting a wider value.
14176     SDValue Load;
14177     if (MemSz == 128) {
14178       // Just switch this to a normal load.
14179       assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, "
14180                                        "it must be a legal 128-bit vector "
14181                                        "type!");
14182       Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(),
14183                   Ld->getPointerInfo(), Ld->isVolatile(), Ld->isNonTemporal(),
14184                   Ld->isInvariant(), Ld->getAlignment());
14185     } else {
14186       assert(MemSz < 128 &&
14187              "Can't extend a type wider than 128 bits to a 256 bit vector!");
14188       // Do an sext load to a 128-bit vector type. We want to use the same
14189       // number of elements, but elements half as wide. This will end up being
14190       // recursively lowered by this routine, but will succeed as we definitely
14191       // have all the necessary features if we're using AVX1.
14192       EVT HalfEltVT =
14193           EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2);
14194       EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems);
14195       Load =
14196           DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(),
14197                          Ld->getPointerInfo(), MemVT, Ld->isVolatile(),
14198                          Ld->isNonTemporal(), Ld->isInvariant(),
14199                          Ld->getAlignment());
14200     }
14201
14202     // Replace chain users with the new chain.
14203     assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
14204     DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
14205
14206     // Finally, do a normal sign-extend to the desired register.
14207     return DAG.getSExtOrTrunc(Load, dl, RegVT);
14208   }
14209
14210   // All sizes must be a power of two.
14211   assert(isPowerOf2_32(RegSz * MemSz * NumElems) &&
14212          "Non-power-of-two elements are not custom lowered!");
14213
14214   // Attempt to load the original value using scalar loads.
14215   // Find the largest scalar type that divides the total loaded size.
      // The loop keeps the last legal integer type whose size divides MemSz.
14216   MVT SclrLoadTy = MVT::i8;
14217   for (MVT Tp : MVT::integer_valuetypes()) {
14218     if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
14219       SclrLoadTy = Tp;
14220     }
14221   }
14222
14223   // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
14224   if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
14225       (64 <= MemSz))
14226     SclrLoadTy = MVT::f64;
14227
14228   // Calculate the number of scalar loads that we need to perform
14229   // in order to load our vector from memory.
14230   unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
14231
14232   assert((Ext != ISD::SEXTLOAD || NumLoads == 1) &&
14233          "Can only lower sext loads with a single scalar load!");
14234
      // Sign-extending loads into registers of 256 bits or more assemble the
      // loaded data in a 128-bit vector; every other case uses the full register
      // width.
14235   unsigned loadRegZize = RegSz;
14236   if (Ext == ISD::SEXTLOAD && RegSz >= 256)
14237     loadRegZize = 128;
14238
14239   // Represent our vector as a sequence of elements which are the
14240   // largest scalar that we can load.
14241   EVT LoadUnitVecVT = EVT::getVectorVT(
14242       *DAG.getContext(), SclrLoadTy, loadRegZize / SclrLoadTy.getSizeInBits());
14243
14244   // Represent the data using the same element type that is stored in
14245   // memory. In practice, we ''widen'' MemVT.
14246   EVT WideVecVT =
14247       EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
14248                        loadRegZize / MemVT.getScalarType().getSizeInBits());
14249
14250   assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
14251          "Invalid vector type");
14252
14253   // We can't shuffle using an illegal type.
14254   assert(TLI.isTypeLegal(WideVecVT) &&
14255          "We only lower types that form legal widened vector types");
14256
      // Chains collects the chain result of every scalar load; Increment is the
      // scalar load size in bytes, used to advance Ptr between loads.
14257   SmallVector<SDValue, 8> Chains;
14258   SDValue Ptr = Ld->getBasePtr();
14259   SDValue Increment =
14260       DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, dl, TLI.getPointerTy());
14261   SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
14262
14263   for (unsigned i = 0; i < NumLoads; ++i) {
14264     // Perform a single load.
      // NOTE(review): every scalar load reuses Ld's pointer info and alignment
      // unchanged although Ptr advances by Increment each iteration -- confirm
      // this is acceptable for i > 0.
14265     SDValue ScalarLoad =
14266         DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
14267                     Ld->isVolatile(), Ld->isNonTemporal(), Ld->isInvariant(),
14268                     Ld->getAlignment());
14269     Chains.push_back(ScalarLoad.getValue(1));
14270     // Create the first element type using SCALAR_TO_VECTOR in order to avoid
14271     // another round of DAGCombining.
14272     if (i == 0)
14273       Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
14274     else
14275       Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
14276                         ScalarLoad, DAG.getIntPtrConstant(i, dl));
14277
14278     Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
14279   }
14280
      // Merge the scalar loads' chains; TF replaces the original load's chain
      // result on each return path below.
14281   SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
14282
14283   // Bitcast the loaded value to a vector of the original element type, in
14284   // the size of the target vector type.
14285   SDValue SlicedVec = DAG.getBitcast(WideVecVT, Res);
14286   unsigned SizeRatio = RegSz / MemSz;
14287
14288   if (Ext == ISD::SEXTLOAD) {
14289     // If we have SSE4.1, we can directly emit a VSEXT node.
14290     if (Subtarget->hasSSE41()) {
14291       SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec);
14292       DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
14293       return Sext;
14294     }
14295
14296     // Otherwise we'll shuffle the small elements in the high bits of the
14297     // larger type and perform an arithmetic shift. If the shift is not legal
14298     // it's better to scalarize.
14299     assert(TLI.isOperationLegalOrCustom(ISD::SRA, RegVT) &&
14300            "We can't implement a sext load without an arithmetic right shift!");
14301
14302     // Redistribute the loaded elements into the different locations.
      // Element i goes to the last narrow slot of the i-th group of SizeRatio
      // slots; all other slots stay undefined (-1).
14303     SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
14304     for (unsigned i = 0; i != NumElems; ++i)
14305       ShuffleVec[i * SizeRatio + SizeRatio - 1] = i;
14306
14307     SDValue Shuff = DAG.getVectorShuffle(
14308         WideVecVT, dl, SlicedVec, DAG.getUNDEF(WideVecVT), &ShuffleVec[0]);
14309
14310     Shuff = DAG.getBitcast(RegVT, Shuff);
14311
14312     // Build the arithmetic shift.
      // Shift by the width difference between register and memory elements.
14313     unsigned Amt = RegVT.getVectorElementType().getSizeInBits() -
14314                    MemVT.getVectorElementType().getSizeInBits();
14315     Shuff =
14316         DAG.getNode(ISD::SRA, dl, RegVT, Shuff,
14317                     DAG.getConstant(Amt, dl, RegVT));
14318
14319     DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
14320     return Shuff;
14321   }
14322
14323   // Redistribute the loaded elements into the different locations.
      // EXTLOAD: element i goes to the first narrow slot of the i-th group; the
      // remaining slots are left undefined.
14324   SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
14325   for (unsigned i = 0; i != NumElems; ++i)
14326     ShuffleVec[i * SizeRatio] = i;
14327
14328   SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
14329                                        DAG.getUNDEF(WideVecVT), &ShuffleVec[0]);
14330
14331   // Bitcast to the requested type.
14332   Shuff = DAG.getBitcast(RegVT, Shuff);
14333   DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
14334   return Shuff;
14335 }
14336
14337 // isAndOrOfSetCCs - Return true if Op is an ISD::AND or ISD::OR whose two
14338 // operands are both X86ISD::SETCC values with exactly one use. Opc is always
14339 // set to Op's opcode, even when the result is false.
14340 static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
14341   Opc = Op.getOpcode();
14342   if (Opc == ISD::OR || Opc == ISD::AND) {
14343     SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);
14344     return LHS.getOpcode() == X86ISD::SETCC && LHS.hasOneUse() &&
14345            RHS.getOpcode() == X86ISD::SETCC && RHS.hasOneUse();
14346   }
14347   return false;
14348 }
14349
14350 // isXor1OfSetCC - Return true if Op is an ISD::XOR whose first operand is an
14351 // X86ISD::SETCC with exactly one use and whose second operand is constant 1.
14352 static bool isXor1OfSetCC(SDValue Op) {
14353   if (Op.getOpcode() != ISD::XOR)
14354     return false;
14355   ConstantSDNode *RHSConst = dyn_cast<ConstantSDNode>(Op.getOperand(1));
14356   if (!RHSConst || RHSConst->getAPIntValue() != 1)
14357     return false;
14358   // The XOR'ed value must be a SETCC that has a single use.
14359   SDValue LHS = Op.getOperand(0);
14360   return LHS.getOpcode() == X86ISD::SETCC && LHS.hasOneUse();
14361 }
14362
/// Lower a conditional branch whose operands are (chain, condition,
/// destination) into one or two X86ISD::BRCOND nodes.
///
/// The routine tries to branch directly on flags that are already produced by
/// an X86 compare, a bit test, or an overflow-checking arithmetic node. The
/// local flag `addTest` stays true only while no such flag producer has been
/// found; in that case an explicit test of the condition is emitted through
/// EmitTest() near the end. `Inverted` is set for the
/// setcc(overflow-bit == 0) form and flips the condition code that is finally
/// used.
///
/// \returns the X86ISD::BRCOND node built from (Chain, Dest, CC, Cond). For
/// the floating-point patterns handled below, an additional BRCOND is linked
/// into Chain first.
14363 SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
14364   bool addTest = true;
14365   SDValue Chain = Op.getOperand(0);
14366   SDValue Cond  = Op.getOperand(1);
14367   SDValue Dest  = Op.getOperand(2);
14368   SDLoc dl(Op);
14369   SDValue CC;
14370   bool Inverted = false;
14371
14372   if (Cond.getOpcode() == ISD::SETCC) {
14373     // Check for setcc([su]{add,sub,mul}o == 0).
14374     if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
14375         isa<ConstantSDNode>(Cond.getOperand(1)) &&
14376         cast<ConstantSDNode>(Cond.getOperand(1))->isNullValue() &&
14377         Cond.getOperand(0).getResNo() == 1 &&
14378         (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
14379          Cond.getOperand(0).getOpcode() == ISD::UADDO ||
14380          Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
14381          Cond.getOperand(0).getOpcode() == ISD::USUBO ||
14382          Cond.getOperand(0).getOpcode() == ISD::SMULO ||
14383          Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
14384       Inverted = true;
14385       Cond = Cond.getOperand(0);
14386     } else {
14387       SDValue NewCond = LowerSETCC(Cond, DAG);
14388       if (NewCond.getNode())
14389         Cond = NewCond;
14390     }
14391   }
14392 #if 0
14393   // FIXME: LowerXALUO doesn't handle these!!
14394   else if (Cond.getOpcode() == X86ISD::ADD  ||
14395            Cond.getOpcode() == X86ISD::SUB  ||
14396            Cond.getOpcode() == X86ISD::SMUL ||
14397            Cond.getOpcode() == X86ISD::UMUL)
14398     Cond = LowerXALUO(Cond, DAG);
14399 #endif
14400
14401   // Look past (and (setcc_carry (cmp ...)), 1).
14402   if (Cond.getOpcode() == ISD::AND &&
14403       Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
14404     ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
14405     if (C && C->getAPIntValue() == 1)
14406       Cond = Cond.getOperand(0);
14407   }
14408
14409   // If condition flag is set by a X86ISD::CMP, then use it as the condition
14410   // setting operand in place of the X86ISD::SETCC.
14411   unsigned CondOpcode = Cond.getOpcode();
14412   if (CondOpcode == X86ISD::SETCC ||
14413       CondOpcode == X86ISD::SETCC_CARRY) {
14414     CC = Cond.getOperand(0);
14415
14416     SDValue Cmp = Cond.getOperand(1);
14417     unsigned Opc = Cmp.getOpcode();
14418     // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
14419     if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
14420       Cond = Cmp;
14421       addTest = false;
14422     } else {
14423       switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
14424       default: break;
14425       case X86::COND_O:
14426       case X86::COND_B:
14427         // These can only come from an arithmetic instruction with overflow,
14428         // e.g. SADDO, UADDO.
14429         Cond = Cond.getNode()->getOperand(1);
14430         addTest = false;
14431         break;
14432       }
14433     }
14434   }
        // Cond may have been replaced above, so refresh the cached opcode.
14435   CondOpcode = Cond.getOpcode();
14436   if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
14437       CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
14438       ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
14439        Cond.getOperand(0).getValueType() != MVT::i8)) {
14440     SDValue LHS = Cond.getOperand(0);
14441     SDValue RHS = Cond.getOperand(1);
14442     unsigned X86Opcode;
14443     unsigned X86Cond;
14444     SDVTList VTs;
14445     // Keep this in sync with LowerXALUO, otherwise we might create redundant
14446     // instructions that can't be removed afterwards (i.e. X86ISD::ADD and
14447     // X86ISD::INC).
14448     switch (CondOpcode) {
14449     case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
14450     case ISD::SADDO:
14451       if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
14452         if (C->isOne()) {
14453           X86Opcode = X86ISD::INC; X86Cond = X86::COND_O;
14454           break;
14455         }
14456       X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
14457     case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
14458     case ISD::SSUBO:
14459       if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
14460         if (C->isOne()) {
14461           X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O;
14462           break;
14463         }
14464       X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
14465     case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
14466     case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
14467     default: llvm_unreachable("unexpected overflowing operator");
14468     }
14469     if (Inverted)
14470       X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond);
          // The UMUL node is created with three result types and its flag
          // result is value 2; every other node here uses value 1 for flags.
14471     if (CondOpcode == ISD::UMULO)
14472       VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
14473                           MVT::i32);
14474     else
14475       VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
14476
14477     SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS);
14478
14479     if (CondOpcode == ISD::UMULO)
14480       Cond = X86Op.getValue(2);
14481     else
14482       Cond = X86Op.getValue(1);
14483
14484     CC = DAG.getConstant(X86Cond, dl, MVT::i8);
14485     addTest = false;
14486   } else {
14487     unsigned CondOpc;
14488     if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
14489       SDValue Cmp = Cond.getOperand(0).getOperand(1);
14490       if (CondOpc == ISD::OR) {
14491         // Also, recognize the pattern generated by an FCMP_UNE. We can emit
14492         // two branches instead of an explicit OR instruction with a
14493         // separate test.
14494         if (Cmp == Cond.getOperand(1).getOperand(1) &&
14495             isX86LogicalCmp(Cmp)) {
14496           CC = Cond.getOperand(0).getOperand(0);
14497           Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
14498                               Chain, Dest, CC, Cmp);
14499           CC = Cond.getOperand(1).getOperand(0);
14500           Cond = Cmp;
14501           addTest = false;
14502         }
14503       } else { // ISD::AND
14504         // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
14505         // two branches instead of an explicit AND instruction with a
14506         // separate test. However, we only do this if this block doesn't
14507         // have a fall-through edge, because this requires an explicit
14508         // jmp when the condition is false.
14509         if (Cmp == Cond.getOperand(1).getOperand(1) &&
14510             isX86LogicalCmp(Cmp) &&
14511             Op.getNode()->hasOneUse()) {
14512           X86::CondCode CCode =
14513             (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
14514           CCode = X86::GetOppositeBranchCondition(CCode);
14515           CC = DAG.getConstant(CCode, dl, MVT::i8);
14516           SDNode *User = *Op.getNode()->use_begin();
14517           // Look for an unconditional branch following this conditional branch.
14518           // We need this because we need to reverse the successors in order
14519           // to implement FCMP_OEQ.
14520           if (User->getOpcode() == ISD::BR) {
14521             SDValue FalseBB = User->getOperand(1);
14522             SDNode *NewBR =
14523               DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
14524             assert(NewBR == User);
14525             (void)NewBR;
14526             Dest = FalseBB;
14527
14528             Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
14529                                 Chain, Dest, CC, Cmp);
                  // NOTE: this declaration shadows the CCode declared in the
                  // enclosing block.
14530             X86::CondCode CCode =
14531               (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
14532             CCode = X86::GetOppositeBranchCondition(CCode);
14533             CC = DAG.getConstant(CCode, dl, MVT::i8);
14534             Cond = Cmp;
14535             addTest = false;
14536           }
14537         }
14538       }
14539     } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
14540       // Recognize xorb (setcc), 1 patterns. The xor inverts the condition.
14541       // It should be transformed by the DAG combiner except when the
14542       // condition is set by an arithmetic-with-overflow node.
14543       X86::CondCode CCode =
14544         (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
14545       CCode = X86::GetOppositeBranchCondition(CCode);
14546       CC = DAG.getConstant(CCode, dl, MVT::i8);
14547       Cond = Cond.getOperand(0).getOperand(1);
14548       addTest = false;
14549     } else if (Cond.getOpcode() == ISD::SETCC &&
14550                cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
14551       // For FCMP_OEQ, we can emit
14552       // two branches instead of an explicit AND instruction with a
14553       // separate test. However, we only do this if this block doesn't
14554       // have a fall-through edge, because this requires an explicit
14555       // jmp when the condition is false.
14556       if (Op.getNode()->hasOneUse()) {
14557         SDNode *User = *Op.getNode()->use_begin();
14558         // Look for an unconditional branch following this conditional branch.
14559         // We need this because we need to reverse the successors in order
14560         // to implement FCMP_OEQ.
14561         if (User->getOpcode() == ISD::BR) {
14562           SDValue FalseBB = User->getOperand(1);
14563           SDNode *NewBR =
14564             DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
14565           assert(NewBR == User);
14566           (void)NewBR;
                // The unconditional branch now targets the old Dest; both
                // conditional branches created below target the old false
                // block instead.
14567           Dest = FalseBB;
14568
14569           SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
14570                                     Cond.getOperand(0), Cond.getOperand(1));
14571           Cmp = ConvertCmpIfNecessary(Cmp, DAG);
14572           CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
14573           Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
14574                               Chain, Dest, CC, Cmp);
14575           CC = DAG.getConstant(X86::COND_P, dl, MVT::i8);
14576           Cond = Cmp;
14577           addTest = false;
14578         }
14579       }
14580     } else if (Cond.getOpcode() == ISD::SETCC &&
14581                cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
14582       // For FCMP_UNE, we can emit
14583       // two branches instead of an explicit OR instruction with a
14584       // separate test. However, we only do this if this block doesn't
14585       // have a fall-through edge, because this requires an explicit
14586       // jmp when the condition is false.
14587       if (Op.getNode()->hasOneUse()) {
14588         SDNode *User = *Op.getNode()->use_begin();
14589         // Look for an unconditional branch following this conditional branch.
14590         // We need this because we need to reverse the successors in order
14591         // to implement FCMP_UNE.
14592         if (User->getOpcode() == ISD::BR) {
14593           SDValue FalseBB = User->getOperand(1);
14594           SDNode *NewBR =
14595             DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
14596           assert(NewBR == User);
14597           (void)NewBR;
14598
14599           SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
14600                                     Cond.getOperand(0), Cond.getOperand(1));
14601           Cmp = ConvertCmpIfNecessary(Cmp, DAG);
14602           CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
14603           Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
14604                               Chain, Dest, CC, Cmp);
14605           CC = DAG.getConstant(X86::COND_NP, dl, MVT::i8);
14606           Cond = Cmp;
14607           addTest = false;
                // The BRCOND above was built with the original Dest; only the
                // final BRCOND created at the end of this function targets
                // FalseBB.
14608           Dest = FalseBB;
14609         }
14610       }
14611     }
14612   }
14613
14614   if (addTest) {
14615     // Look past the truncate if the high bits are known zero.
14616     if (isTruncWithZeroHighBitsInput(Cond, DAG))
14617         Cond = Cond.getOperand(0);
14618
14619     // We know the result of AND is compared against zero. Try to match
14620     // it to BT.
14621     if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
14622       SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG);
14623       if (NewSetCC.getNode()) {
14624         CC = NewSetCC.getOperand(0);
14625         Cond = NewSetCC.getOperand(1);
14626         addTest = false;
14627       }
14628     }
14629   }
14630
        // No reusable flag producer was found: emit an explicit test of Cond
        // via EmitTest(), branching on E when Inverted and on NE otherwise.
14631   if (addTest) {
14632     X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
14633     CC = DAG.getConstant(X86Cond, dl, MVT::i8);
14634     Cond = EmitTest(Cond, X86Cond, dl, DAG);
14635   }
14636   Cond = ConvertCmpIfNecessary(Cond, DAG);
14637   return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
14638                      Chain, Dest, CC, Cond);
14639 }
14640
14641 // Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
14642 // Calls to _alloca are needed to probe the stack when allocating more than 4k
14643 // bytes in one go. Touching the stack at 4K increments is necessary to ensure
14644 // that the guard pages used by the OS virtual memory manager are allocated in
14645 // correct sequence.
14646 SDValue
14647 X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
14648                                            SelectionDAG &DAG) const {
14649   MachineFunction &MF = DAG.getMachineFunction();
14650   bool SplitStack = MF.shouldSplitStack();
14651   bool Lower = (Subtarget->isOSWindows() && !Subtarget->isTargetMachO()) ||
14652                SplitStack;
14653   SDLoc dl(Op);
14654
14655   if (!Lower) {
14656     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14657     SDNode* Node = Op.getNode();
14658
14659     unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
14660     assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
14661         " not tell us which reg is the stack pointer!");
14662     EVT VT = Node->getValueType(0);
14663     SDValue Tmp1 = SDValue(Node, 0);
14664     SDValue Tmp2 = SDValue(Node, 1);
14665     SDValue Tmp3 = Node->getOperand(2);
14666     SDValue Chain = Tmp1.getOperand(0);
14667
14668     // Chain the dynamic stack allocation so that it doesn't modify the stack
14669     // pointer when other instructions are using the stack.
14670     Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, dl, true),
14671         SDLoc(Node));
14672
14673     SDValue Size = Tmp2.getOperand(1);
14674     SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
14675     Chain = SP.getValue(1);
14676     unsigned Align = cast<ConstantSDNode>(Tmp3)->getZExtValue();
14677     const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
14678     unsigned StackAlign = TFI.getStackAlignment();
14679     Tmp1 = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
14680     if (Align > StackAlign)
14681       Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1,
14682           DAG.getConstant(-(uint64_t)Align, dl, VT));
14683     Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain
14684
14685     Tmp2 = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
14686         DAG.getIntPtrConstant(0, dl, true), SDValue(),
14687         SDLoc(Node));
14688
14689     SDValue Ops[2] = { Tmp1, Tmp2 };
14690     return DAG.getMergeValues(Ops, dl);
14691   }
14692
14693   // Get the inputs.
14694   SDValue Chain = Op.getOperand(0);
14695   SDValue Size  = Op.getOperand(1);
14696   unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
14697   EVT VT = Op.getNode()->getValueType(0);
14698
14699   bool Is64Bit = Subtarget->is64Bit();
14700   EVT SPTy = getPointerTy();
14701
14702   if (SplitStack) {
14703     MachineRegisterInfo &MRI = MF.getRegInfo();
14704
14705     if (Is64Bit) {
14706       // The 64 bit implementation of segmented stacks needs to clobber both r10
14707       // r11. This makes it impossible to use it along with nested parameters.
14708       const Function *F = MF.getFunction();
14709
14710       for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
14711            I != E; ++I)
14712         if (I->hasNestAttr())
14713           report_fatal_error("Cannot use segmented stacks with functions that "
14714                              "have nested arguments.");
14715     }
14716
14717     const TargetRegisterClass *AddrRegClass =
14718       getRegClassFor(getPointerTy());
14719     unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
14720     Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
14721     SDValue Value = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
14722                                 DAG.getRegister(Vreg, SPTy));
14723     SDValue Ops1[2] = { Value, Chain };
14724     return DAG.getMergeValues(Ops1, dl);
14725   } else {
14726     SDValue Flag;
14727     const unsigned Reg = (Subtarget->isTarget64BitLP64() ? X86::RAX : X86::EAX);
14728
14729     Chain = DAG.getCopyToReg(Chain, dl, Reg, Size, Flag);
14730     Flag = Chain.getValue(1);
14731     SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
14732
14733     Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag);
14734
14735     const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
14736     unsigned SPReg = RegInfo->getStackRegister();
14737     SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
14738     Chain = SP.getValue(1);
14739
14740     if (Align) {
14741       SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
14742                        DAG.getConstant(-(uint64_t)Align, dl, VT));
14743       Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
14744     }
14745
14746     SDValue Ops1[2] = { SP, Chain };
14747     return DAG.getMergeValues(Ops1, dl);
14748   }
14749 }
14750
14751 SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
14752   MachineFunction &MF = DAG.getMachineFunction();
14753   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
14754
14755   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
14756   SDLoc DL(Op);
14757
14758   if (!Subtarget->is64Bit() || Subtarget->isTargetWin64()) {
14759     // vastart just stores the address of the VarArgsFrameIndex slot into the
14760     // memory location argument.
14761     SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
14762                                    getPointerTy());
14763     return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
14764                         MachinePointerInfo(SV), false, false, 0);
14765   }
14766
14767   // __va_list_tag:
14768   //   gp_offset         (0 - 6 * 8)
14769   //   fp_offset         (48 - 48 + 8 * 16)
14770   //   overflow_arg_area (point to parameters coming in memory).
14771   //   reg_save_area
14772   SmallVector<SDValue, 8> MemOps;
14773   SDValue FIN = Op.getOperand(1);
14774   // Store gp_offset
14775   SDValue Store = DAG.getStore(Op.getOperand(0), DL,
14776                                DAG.getConstant(FuncInfo->getVarArgsGPOffset(),
14777                                                DL, MVT::i32),
14778                                FIN, MachinePointerInfo(SV), false, false, 0);
14779   MemOps.push_back(Store);
14780
14781   // Store fp_offset
14782   FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
14783                     FIN, DAG.getIntPtrConstant(4, DL));
14784   Store = DAG.getStore(Op.getOperand(0), DL,
14785                        DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL,
14786                                        MVT::i32),
14787                        FIN, MachinePointerInfo(SV, 4), false, false, 0);
14788   MemOps.push_back(Store);
14789
14790   // Store ptr to overflow_arg_area
14791   FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
14792                     FIN, DAG.getIntPtrConstant(4, DL));
14793   SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
14794                                     getPointerTy());
14795   Store = DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN,
14796                        MachinePointerInfo(SV, 8),
14797                        false, false, 0);
14798   MemOps.push_back(Store);
14799
14800   // Store ptr to reg_save_area.
14801   FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
14802                     FIN, DAG.getIntPtrConstant(8, DL));
14803   SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
14804                                     getPointerTy());
14805   Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN,
14806                        MachinePointerInfo(SV, 16), false, false, 0);
14807   MemOps.push_back(Store);
14808   return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
14809 }
14810
14811 SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
14812   assert(Subtarget->is64Bit() &&
14813          "LowerVAARG only handles 64-bit va_arg!");
14814   assert((Subtarget->isTargetLinux() ||
14815           Subtarget->isTargetDarwin()) &&
14816           "Unhandled target in LowerVAARG");
14817   assert(Op.getNode()->getNumOperands() == 4);
14818   SDValue Chain = Op.getOperand(0);
14819   SDValue SrcPtr = Op.getOperand(1);
14820   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
14821   unsigned Align = Op.getConstantOperandVal(3);
14822   SDLoc dl(Op);
14823
14824   EVT ArgVT = Op.getNode()->getValueType(0);
14825   Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
14826   uint32_t ArgSize = getDataLayout()->getTypeAllocSize(ArgTy);
14827   uint8_t ArgMode;
14828
14829   // Decide which area this value should be read from.
14830   // TODO: Implement the AMD64 ABI in its entirety. This simple
14831   // selection mechanism works only for the basic types.
14832   if (ArgVT == MVT::f80) {
14833     llvm_unreachable("va_arg for f80 not yet implemented");
14834   } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
14835     ArgMode = 2;  // Argument passed in XMM register. Use fp_offset.
14836   } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
14837     ArgMode = 1;  // Argument passed in GPR64 register(s). Use gp_offset.
14838   } else {
14839     llvm_unreachable("Unhandled argument type in LowerVAARG");
14840   }
14841
14842   if (ArgMode == 2) {
14843     // Sanity Check: Make sure using fp_offset makes sense.
14844     assert(!Subtarget->useSoftFloat() &&
14845            !(DAG.getMachineFunction().getFunction()->hasFnAttribute(
14846                Attribute::NoImplicitFloat)) &&
14847            Subtarget->hasSSE1());
14848   }
14849
14850   // Insert VAARG_64 node into the DAG
14851   // VAARG_64 returns two values: Variable Argument Address, Chain
14852   SDValue InstOps[] = {Chain, SrcPtr, DAG.getConstant(ArgSize, dl, MVT::i32),
14853                        DAG.getConstant(ArgMode, dl, MVT::i8),
14854                        DAG.getConstant(Align, dl, MVT::i32)};
14855   SDVTList VTs = DAG.getVTList(getPointerTy(), MVT::Other);
14856   SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl,
14857                                           VTs, InstOps, MVT::i64,
14858                                           MachinePointerInfo(SV),
14859                                           /*Align=*/0,
14860                                           /*Volatile=*/false,
14861                                           /*ReadMem=*/true,
14862                                           /*WriteMem=*/true);
14863   Chain = VAARG.getValue(1);
14864
14865   // Load the next argument and return it
14866   return DAG.getLoad(ArgVT, dl,
14867                      Chain,
14868                      VAARG,
14869                      MachinePointerInfo(),
14870                      false, false, false, 0);
14871 }
14872
14873 static SDValue LowerVACOPY(SDValue Op, const X86Subtarget *Subtarget,
14874                            SelectionDAG &DAG) {
14875   // X86-64 va_list is a struct { i32, i32, i8*, i8* }.
14876   assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!");
14877   SDValue Chain = Op.getOperand(0);
14878   SDValue DstPtr = Op.getOperand(1);
14879   SDValue SrcPtr = Op.getOperand(2);
14880   const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
14881   const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
14882   SDLoc DL(Op);
14883
14884   return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
14885                        DAG.getIntPtrConstant(24, DL), 8, /*isVolatile*/false,
14886                        false, false,
14887                        MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
14888 }
14889
14890 // getTargetVShiftByConstNode - Handle vector element shifts where the shift
14891 // amount is a constant. Takes immediate version of shift as input.
14892 static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, MVT VT,
14893                                           SDValue SrcOp, uint64_t ShiftAmt,
14894                                           SelectionDAG &DAG) {
14895   MVT ElementType = VT.getVectorElementType();
14896
14897   // Fold this packed shift into its first operand if ShiftAmt is 0.
14898   if (ShiftAmt == 0)
14899     return SrcOp;
14900
14901   // Check for ShiftAmt >= element width
14902   if (ShiftAmt >= ElementType.getSizeInBits()) {
14903     if (Opc == X86ISD::VSRAI)
14904       ShiftAmt = ElementType.getSizeInBits() - 1;
14905     else
14906       return DAG.getConstant(0, dl, VT);
14907   }
14908
14909   assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
14910          && "Unknown target vector shift-by-constant node");
14911
14912   // Fold this packed vector shift into a build vector if SrcOp is a
14913   // vector of Constants or UNDEFs, and SrcOp valuetype is the same as VT.
14914   if (VT == SrcOp.getSimpleValueType() &&
14915       ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
14916     SmallVector<SDValue, 8> Elts;
14917     unsigned NumElts = SrcOp->getNumOperands();
14918     ConstantSDNode *ND;
14919
14920     switch(Opc) {
14921     default: llvm_unreachable(nullptr);
14922     case X86ISD::VSHLI:
14923       for (unsigned i=0; i!=NumElts; ++i) {
14924         SDValue CurrentOp = SrcOp->getOperand(i);
14925         if (CurrentOp->getOpcode() == ISD::UNDEF) {
14926           Elts.push_back(CurrentOp);
14927           continue;
14928         }
14929         ND = cast<ConstantSDNode>(CurrentOp);
14930         const APInt &C = ND->getAPIntValue();
14931         Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType));
14932       }
14933       break;
14934     case X86ISD::VSRLI:
14935       for (unsigned i=0; i!=NumElts; ++i) {
14936         SDValue CurrentOp = SrcOp->getOperand(i);
14937         if (CurrentOp->getOpcode() == ISD::UNDEF) {
14938           Elts.push_back(CurrentOp);
14939           continue;
14940         }
14941         ND = cast<ConstantSDNode>(CurrentOp);
14942         const APInt &C = ND->getAPIntValue();
14943         Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType));
14944       }
14945       break;
14946     case X86ISD::VSRAI:
14947       for (unsigned i=0; i!=NumElts; ++i) {
14948         SDValue CurrentOp = SrcOp->getOperand(i);
14949         if (CurrentOp->getOpcode() == ISD::UNDEF) {
14950           Elts.push_back(CurrentOp);
14951           continue;
14952         }
14953         ND = cast<ConstantSDNode>(CurrentOp);
14954         const APInt &C = ND->getAPIntValue();
14955         Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType));
14956       }
14957       break;
14958     }
14959
14960     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Elts);
14961   }
14962
14963   return DAG.getNode(Opc, dl, VT, SrcOp,
14964                      DAG.getConstant(ShiftAmt, dl, MVT::i8));
14965 }
14966
14967 // getTargetVShiftNode - Handle vector element shifts where the shift amount
14968 // may or may not be a constant. Takes immediate version of shift as input.
14969 static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT,
14970                                    SDValue SrcOp, SDValue ShAmt,
14971                                    SelectionDAG &DAG) {
14972   MVT SVT = ShAmt.getSimpleValueType();
14973   assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");
14974
14975   // Catch shift-by-constant.
14976   if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
14977     return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
14978                                       CShAmt->getZExtValue(), DAG);
14979
14980   // Change opcode to non-immediate version
14981   switch (Opc) {
14982     default: llvm_unreachable("Unknown target vector shift node");
14983     case X86ISD::VSHLI: Opc = X86ISD::VSHL; break;
14984     case X86ISD::VSRLI: Opc = X86ISD::VSRL; break;
14985     case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;
14986   }
14987
14988   const X86Subtarget &Subtarget =
14989       static_cast<const X86Subtarget &>(DAG.getSubtarget());
14990   if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
14991       ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) {
14992     // Let the shuffle legalizer expand this shift amount node.
14993     SDValue Op0 = ShAmt.getOperand(0);
14994     Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(Op0), MVT::v8i16, Op0);
14995     ShAmt = getShuffleVectorZeroOrUndef(Op0, 0, true, &Subtarget, DAG);
14996   } else {
14997     // Need to build a vector containing shift amount.
14998     // SSE/AVX packed shifts only use the lower 64-bit of the shift count.
14999     SmallVector<SDValue, 4> ShOps;
15000     ShOps.push_back(ShAmt);
15001     if (SVT == MVT::i32) {
15002       ShOps.push_back(DAG.getConstant(0, dl, SVT));
15003       ShOps.push_back(DAG.getUNDEF(SVT));
15004     }
15005     ShOps.push_back(DAG.getUNDEF(SVT));
15006
15007     MVT BVT = SVT == MVT::i32 ? MVT::v4i32 : MVT::v2i64;
15008     ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, BVT, ShOps);
15009   }
15010
15011   // The return type has to be a 128-bit type with the same element
15012   // type as the input type.
15013   MVT EltVT = VT.getVectorElementType();
15014   EVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());
15015
15016   ShAmt = DAG.getBitcast(ShVT, ShAmt);
15017   return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
15018 }
15019
15020 /// \brief Return (and \p Op, \p Mask) for compare instructions or
15021 /// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
15022 /// necessary casting for \p Mask when lowering masking intrinsics.
15023 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
15024                                     SDValue PreservedSrc,
15025                                     const X86Subtarget *Subtarget,
15026                                     SelectionDAG &DAG) {
15027     EVT VT = Op.getValueType();
15028     EVT MaskVT = EVT::getVectorVT(*DAG.getContext(),
15029                                   MVT::i1, VT.getVectorNumElements());
15030     EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
15031                                      Mask.getValueType().getSizeInBits());
15032     SDLoc dl(Op);
15033
15034     assert(MaskVT.isSimple() && "invalid mask type");
15035
15036     if (isAllOnes(Mask))
15037       return Op;
15038
15039     // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements
15040     // are extracted by EXTRACT_SUBVECTOR.
15041     SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
15042                                 DAG.getBitcast(BitcastVT, Mask),
15043                                 DAG.getIntPtrConstant(0, dl));
15044
15045     switch (Op.getOpcode()) {
15046       default: break;
15047       case X86ISD::PCMPEQM:
15048       case X86ISD::PCMPGTM:
15049       case X86ISD::CMPM:
15050       case X86ISD::CMPMU:
15051         return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
15052     }
15053     if (PreservedSrc.getOpcode() == ISD::UNDEF)
15054       PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
15055     return DAG.getNode(ISD::VSELECT, dl, VT, VMask, Op, PreservedSrc);
15056 }
15057
15058 /// \brief Creates an SDNode for a predicated scalar operation.
15059 /// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
15060 /// The mask is comming as MVT::i8 and it should be truncated
15061 /// to MVT::i1 while lowering masking intrinsics.
15062 /// The main difference between ScalarMaskingNode and VectorMaskingNode is using
15063 /// "X86select" instead of "vselect". We just can't create the "vselect" node for
15064 /// a scalar instruction.
15065 static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
15066                                     SDValue PreservedSrc,
15067                                     const X86Subtarget *Subtarget,
15068                                     SelectionDAG &DAG) {
15069     if (isAllOnes(Mask))
15070       return Op;
15071
15072     EVT VT = Op.getValueType();
15073     SDLoc dl(Op);
15074     // The mask should be of type MVT::i1
15075     SDValue IMask = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Mask);
15076
15077     if (PreservedSrc.getOpcode() == ISD::UNDEF)
15078       PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
15079     return DAG.getNode(X86ISD::SELECT, dl, VT, IMask, Op, PreservedSrc);
15080 }
15081
15082 static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
15083                                        SelectionDAG &DAG) {
15084   SDLoc dl(Op);
15085   unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
15086   EVT VT = Op.getValueType();
15087   const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
15088   if (IntrData) {
15089     switch(IntrData->Type) {
15090     case INTR_TYPE_1OP:
15091       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1));
15092     case INTR_TYPE_2OP:
15093       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
15094         Op.getOperand(2));
15095     case INTR_TYPE_3OP:
15096       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
15097         Op.getOperand(2), Op.getOperand(3));
15098     case INTR_TYPE_1OP_MASK_RM: {
15099       SDValue Src = Op.getOperand(1);
15100       SDValue PassThru = Op.getOperand(2);
15101       SDValue Mask = Op.getOperand(3);
15102       SDValue RoundingMode;
15103       if (Op.getNumOperands() == 4)
15104         RoundingMode = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
15105       else
15106         RoundingMode = Op.getOperand(4);
15107       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
15108       if (IntrWithRoundingModeOpcode != 0) {
15109         unsigned Round = cast<ConstantSDNode>(RoundingMode)->getZExtValue();
15110         if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION)
15111           return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
15112                                       dl, Op.getValueType(), Src, RoundingMode),
15113                                       Mask, PassThru, Subtarget, DAG);
15114       }
15115       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
15116                                               RoundingMode),
15117                                   Mask, PassThru, Subtarget, DAG);
15118     }
15119     case INTR_TYPE_1OP_MASK: {
15120       SDValue Src = Op.getOperand(1);
15121       SDValue Passthru = Op.getOperand(2);
15122       SDValue Mask = Op.getOperand(3);
15123       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
15124                                   Mask, Passthru, Subtarget, DAG);
15125     }
15126     case INTR_TYPE_SCALAR_MASK_RM: {
15127       SDValue Src1 = Op.getOperand(1);
15128       SDValue Src2 = Op.getOperand(2);
15129       SDValue Src0 = Op.getOperand(3);
15130       SDValue Mask = Op.getOperand(4);
15131       // There are 2 kinds of intrinsics in this group:
15132       // (1) With supress-all-exceptions (sae) or rounding mode- 6 operands
15133       // (2) With rounding mode and sae - 7 operands.
15134       if (Op.getNumOperands() == 6) {
15135         SDValue Sae  = Op.getOperand(5);
15136         unsigned Opc = IntrData->Opc1 ? IntrData->Opc1 : IntrData->Opc0;
15137         return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2,
15138                                                 Sae),
15139                                     Mask, Src0, Subtarget, DAG);
15140       }
15141       assert(Op.getNumOperands() == 7 && "Unexpected intrinsic form");
15142       SDValue RoundingMode  = Op.getOperand(5);
15143       SDValue Sae  = Op.getOperand(6);
15144       return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
15145                                               RoundingMode, Sae),
15146                                   Mask, Src0, Subtarget, DAG);
15147     }
15148     case INTR_TYPE_2OP_MASK: {
15149       SDValue Src1 = Op.getOperand(1);
15150       SDValue Src2 = Op.getOperand(2);
15151       SDValue PassThru = Op.getOperand(3);
15152       SDValue Mask = Op.getOperand(4);
15153       // We specify 2 possible opcodes for intrinsics with rounding modes.
15154       // First, we check if the intrinsic may have non-default rounding mode,
15155       // (IntrData->Opc1 != 0), then we check the rounding mode operand.
15156       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
15157       if (IntrWithRoundingModeOpcode != 0) {
15158         SDValue Rnd = Op.getOperand(5);
15159         unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
15160         if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) {
15161           return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
15162                                       dl, Op.getValueType(),
15163                                       Src1, Src2, Rnd),
15164                                       Mask, PassThru, Subtarget, DAG);
15165         }
15166       }
15167       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
15168                                               Src1,Src2),
15169                                   Mask, PassThru, Subtarget, DAG);
15170     }
15171     case INTR_TYPE_3OP_MASK: {
15172       SDValue Src1 = Op.getOperand(1);
15173       SDValue Src2 = Op.getOperand(2);
15174       SDValue Src3 = Op.getOperand(3);
15175       SDValue PassThru = Op.getOperand(4);
15176       SDValue Mask = Op.getOperand(5);
15177       // We specify 2 possible opcodes for intrinsics with rounding modes.
15178       // First, we check if the intrinsic may have non-default rounding mode,
15179       // (IntrData->Opc1 != 0), then we check the rounding mode operand.
15180       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
15181       if (IntrWithRoundingModeOpcode != 0) {
15182         SDValue Rnd = Op.getOperand(6);
15183         unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
15184         if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) {
15185           return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
15186                                       dl, Op.getValueType(),
15187                                       Src1, Src2, Src3, Rnd),
15188                                       Mask, PassThru, Subtarget, DAG);
15189         }
15190       }
15191       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
15192                                               Src1, Src2, Src3),
15193                                   Mask, PassThru, Subtarget, DAG);
15194     }
15195     case FMA_OP_MASK: {
15196       SDValue Src1 = Op.getOperand(1);
15197       SDValue Src2 = Op.getOperand(2);
15198       SDValue Src3 = Op.getOperand(3);
15199       SDValue Mask = Op.getOperand(4);
15200       // We specify 2 possible opcodes for intrinsics with rounding modes.
15201       // First, we check if the intrinsic may have non-default rounding mode,
15202       // (IntrData->Opc1 != 0), then we check the rounding mode operand.
15203       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
15204       if (IntrWithRoundingModeOpcode != 0) {
15205         SDValue Rnd = Op.getOperand(5);
15206         if (cast<ConstantSDNode>(Rnd)->getZExtValue() !=
15207             X86::STATIC_ROUNDING::CUR_DIRECTION)
15208           return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
15209                                                   dl, Op.getValueType(),
15210                                                   Src1, Src2, Src3, Rnd),
15211                                       Mask, Src1, Subtarget, DAG);
15212       }
15213       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
15214                                               dl, Op.getValueType(),
15215                                               Src1, Src2, Src3),
15216                                   Mask, Src1, Subtarget, DAG);
15217     }
15218     case CMP_MASK:
15219     case CMP_MASK_CC: {
15220       // Comparison intrinsics with masks.
15221       // Example of transformation:
15222       // (i8 (int_x86_avx512_mask_pcmpeq_q_128
15223       //             (v2i64 %a), (v2i64 %b), (i8 %mask))) ->
15224       // (i8 (bitcast
15225       //   (v8i1 (insert_subvector undef,
15226       //           (v2i1 (and (PCMPEQM %a, %b),
15227       //                      (extract_subvector
15228       //                         (v8i1 (bitcast %mask)), 0))), 0))))
15229       EVT VT = Op.getOperand(1).getValueType();
15230       EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
15231                                     VT.getVectorNumElements());
15232       SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3);
15233       EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
15234                                        Mask.getValueType().getSizeInBits());
15235       SDValue Cmp;
15236       if (IntrData->Type == CMP_MASK_CC) {
15237         SDValue CC = Op.getOperand(3);
15238         CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC);
15239         // We specify 2 possible opcodes for intrinsics with rounding modes.
15240         // First, we check if the intrinsic may have non-default rounding mode,
15241         // (IntrData->Opc1 != 0), then we check the rounding mode operand.
15242         if (IntrData->Opc1 != 0) {
15243           SDValue Rnd = Op.getOperand(5);
15244           if (cast<ConstantSDNode>(Rnd)->getZExtValue() !=
15245               X86::STATIC_ROUNDING::CUR_DIRECTION)
15246             Cmp = DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
15247                               Op.getOperand(2), CC, Rnd);
15248         }
15249         //default rounding mode
15250         if(!Cmp.getNode())
15251             Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
15252                               Op.getOperand(2), CC);
15253
15254       } else {
15255         assert(IntrData->Type == CMP_MASK && "Unexpected intrinsic type!");
15256         Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
15257                           Op.getOperand(2));
15258       }
15259       SDValue CmpMask = getVectorMaskingNode(Cmp, Mask,
15260                                              DAG.getTargetConstant(0, dl,
15261                                                                    MaskVT),
15262                                              Subtarget, DAG);
15263       SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
15264                                 DAG.getUNDEF(BitcastVT), CmpMask,
15265                                 DAG.getIntPtrConstant(0, dl));
15266       return DAG.getBitcast(Op.getValueType(), Res);
15267     }
15268     case COMI: { // Comparison intrinsics
15269       ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
15270       SDValue LHS = Op.getOperand(1);
15271       SDValue RHS = Op.getOperand(2);
15272       unsigned X86CC = TranslateX86CC(CC, dl, true, LHS, RHS, DAG);
15273       assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!");
15274       SDValue Cond = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
15275       SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
15276                                   DAG.getConstant(X86CC, dl, MVT::i8), Cond);
15277       return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
15278     }
15279     case VSHIFT:
15280       return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
15281                                  Op.getOperand(1), Op.getOperand(2), DAG);
15282     case VSHIFT_MASK:
15283       return getVectorMaskingNode(getTargetVShiftNode(IntrData->Opc0, dl,
15284                                                       Op.getSimpleValueType(),
15285                                                       Op.getOperand(1),
15286                                                       Op.getOperand(2), DAG),
15287                                   Op.getOperand(4), Op.getOperand(3), Subtarget,
15288                                   DAG);
15289     case COMPRESS_EXPAND_IN_REG: {
15290       SDValue Mask = Op.getOperand(3);
15291       SDValue DataToCompress = Op.getOperand(1);
15292       SDValue PassThru = Op.getOperand(2);
15293       if (isAllOnes(Mask)) // return data as is
15294         return Op.getOperand(1);
15295       EVT VT = Op.getValueType();
15296       EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
15297                                     VT.getVectorNumElements());
15298       EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
15299                                        Mask.getValueType().getSizeInBits());
15300       SDLoc dl(Op);
15301       SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
15302                                   DAG.getBitcast(BitcastVT, Mask),
15303                                   DAG.getIntPtrConstant(0, dl));
15304
15305       return DAG.getNode(IntrData->Opc0, dl, VT, VMask, DataToCompress,
15306                          PassThru);
15307     }
15308     case BLEND: {
15309       SDValue Mask = Op.getOperand(3);
15310       EVT VT = Op.getValueType();
15311       EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
15312                                     VT.getVectorNumElements());
15313       EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
15314                                        Mask.getValueType().getSizeInBits());
15315       SDLoc dl(Op);
15316       SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
15317                                   DAG.getBitcast(BitcastVT, Mask),
15318                                   DAG.getIntPtrConstant(0, dl));
15319       return DAG.getNode(IntrData->Opc0, dl, VT, VMask, Op.getOperand(1),
15320                          Op.getOperand(2));
15321     }
15322     default:
15323       break;
15324     }
15325   }
15326
15327   switch (IntNo) {
15328   default: return SDValue();    // Don't custom lower most intrinsics.
15329
15330   case Intrinsic::x86_avx2_permd:
15331   case Intrinsic::x86_avx2_permps:
15332     // Operands intentionally swapped. Mask is last operand to intrinsic,
15333     // but second operand for node/instruction.
15334     return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(),
15335                        Op.getOperand(2), Op.getOperand(1));
15336
15337   // ptest and testp intrinsics. The intrinsic these come from are designed to
15338   // return an integer value, not just an instruction so lower it to the ptest
15339   // or testp pattern and a setcc for the result.
15340   case Intrinsic::x86_sse41_ptestz:
15341   case Intrinsic::x86_sse41_ptestc:
15342   case Intrinsic::x86_sse41_ptestnzc:
15343   case Intrinsic::x86_avx_ptestz_256:
15344   case Intrinsic::x86_avx_ptestc_256:
15345   case Intrinsic::x86_avx_ptestnzc_256:
15346   case Intrinsic::x86_avx_vtestz_ps:
15347   case Intrinsic::x86_avx_vtestc_ps:
15348   case Intrinsic::x86_avx_vtestnzc_ps:
15349   case Intrinsic::x86_avx_vtestz_pd:
15350   case Intrinsic::x86_avx_vtestc_pd:
15351   case Intrinsic::x86_avx_vtestnzc_pd:
15352   case Intrinsic::x86_avx_vtestz_ps_256:
15353   case Intrinsic::x86_avx_vtestc_ps_256:
15354   case Intrinsic::x86_avx_vtestnzc_ps_256:
15355   case Intrinsic::x86_avx_vtestz_pd_256:
15356   case Intrinsic::x86_avx_vtestc_pd_256:
15357   case Intrinsic::x86_avx_vtestnzc_pd_256: {
15358     bool IsTestPacked = false;
15359     unsigned X86CC;
15360     switch (IntNo) {
15361     default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
15362     case Intrinsic::x86_avx_vtestz_ps:
15363     case Intrinsic::x86_avx_vtestz_pd:
15364     case Intrinsic::x86_avx_vtestz_ps_256:
15365     case Intrinsic::x86_avx_vtestz_pd_256:
15366       IsTestPacked = true; // Fallthrough
15367     case Intrinsic::x86_sse41_ptestz:
15368     case Intrinsic::x86_avx_ptestz_256:
15369       // ZF = 1
15370       X86CC = X86::COND_E;
15371       break;
15372     case Intrinsic::x86_avx_vtestc_ps:
15373     case Intrinsic::x86_avx_vtestc_pd:
15374     case Intrinsic::x86_avx_vtestc_ps_256:
15375     case Intrinsic::x86_avx_vtestc_pd_256:
15376       IsTestPacked = true; // Fallthrough
15377     case Intrinsic::x86_sse41_ptestc:
15378     case Intrinsic::x86_avx_ptestc_256:
15379       // CF = 1
15380       X86CC = X86::COND_B;
15381       break;
15382     case Intrinsic::x86_avx_vtestnzc_ps:
15383     case Intrinsic::x86_avx_vtestnzc_pd:
15384     case Intrinsic::x86_avx_vtestnzc_ps_256:
15385     case Intrinsic::x86_avx_vtestnzc_pd_256:
15386       IsTestPacked = true; // Fallthrough
15387     case Intrinsic::x86_sse41_ptestnzc:
15388     case Intrinsic::x86_avx_ptestnzc_256:
15389       // ZF and CF = 0
15390       X86CC = X86::COND_A;
15391       break;
15392     }
15393
15394     SDValue LHS = Op.getOperand(1);
15395     SDValue RHS = Op.getOperand(2);
15396     unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST;
15397     SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
15398     SDValue CC = DAG.getConstant(X86CC, dl, MVT::i8);
15399     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test);
15400     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
15401   }
15402   case Intrinsic::x86_avx512_kortestz_w:
15403   case Intrinsic::x86_avx512_kortestc_w: {
15404     unsigned X86CC = (IntNo == Intrinsic::x86_avx512_kortestz_w)? X86::COND_E: X86::COND_B;
15405     SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
15406     SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
15407     SDValue CC = DAG.getConstant(X86CC, dl, MVT::i8);
15408     SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
15409     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i1, CC, Test);
15410     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
15411   }
15412
15413   case Intrinsic::x86_sse42_pcmpistria128:
15414   case Intrinsic::x86_sse42_pcmpestria128:
15415   case Intrinsic::x86_sse42_pcmpistric128:
15416   case Intrinsic::x86_sse42_pcmpestric128:
15417   case Intrinsic::x86_sse42_pcmpistrio128:
15418   case Intrinsic::x86_sse42_pcmpestrio128:
15419   case Intrinsic::x86_sse42_pcmpistris128:
15420   case Intrinsic::x86_sse42_pcmpestris128:
15421   case Intrinsic::x86_sse42_pcmpistriz128:
15422   case Intrinsic::x86_sse42_pcmpestriz128: {
15423     unsigned Opcode;
15424     unsigned X86CC;
15425     switch (IntNo) {
15426     default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
15427     case Intrinsic::x86_sse42_pcmpistria128:
15428       Opcode = X86ISD::PCMPISTRI;
15429       X86CC = X86::COND_A;
15430       break;
15431     case Intrinsic::x86_sse42_pcmpestria128:
15432       Opcode = X86ISD::PCMPESTRI;
15433       X86CC = X86::COND_A;
15434       break;
15435     case Intrinsic::x86_sse42_pcmpistric128:
15436       Opcode = X86ISD::PCMPISTRI;
15437       X86CC = X86::COND_B;
15438       break;
15439     case Intrinsic::x86_sse42_pcmpestric128:
15440       Opcode = X86ISD::PCMPESTRI;
15441       X86CC = X86::COND_B;
15442       break;
15443     case Intrinsic::x86_sse42_pcmpistrio128:
15444       Opcode = X86ISD::PCMPISTRI;
15445       X86CC = X86::COND_O;
15446       break;
15447     case Intrinsic::x86_sse42_pcmpestrio128:
15448       Opcode = X86ISD::PCMPESTRI;
15449       X86CC = X86::COND_O;
15450       break;
15451     case Intrinsic::x86_sse42_pcmpistris128:
15452       Opcode = X86ISD::PCMPISTRI;
15453       X86CC = X86::COND_S;
15454       break;
15455     case Intrinsic::x86_sse42_pcmpestris128:
15456       Opcode = X86ISD::PCMPESTRI;
15457       X86CC = X86::COND_S;
15458       break;
15459     case Intrinsic::x86_sse42_pcmpistriz128:
15460       Opcode = X86ISD::PCMPISTRI;
15461       X86CC = X86::COND_E;
15462       break;
15463     case Intrinsic::x86_sse42_pcmpestriz128:
15464       Opcode = X86ISD::PCMPESTRI;
15465       X86CC = X86::COND_E;
15466       break;
15467     }
15468     SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
15469     SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
15470     SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps);
15471     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
15472                                 DAG.getConstant(X86CC, dl, MVT::i8),
15473                                 SDValue(PCMP.getNode(), 1));
15474     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
15475   }
15476
15477   case Intrinsic::x86_sse42_pcmpistri128:
15478   case Intrinsic::x86_sse42_pcmpestri128: {
15479     unsigned Opcode;
15480     if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
15481       Opcode = X86ISD::PCMPISTRI;
15482     else
15483       Opcode = X86ISD::PCMPESTRI;
15484
15485     SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
15486     SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
15487     return DAG.getNode(Opcode, dl, VTs, NewOps);
15488   }
15489
15490   case Intrinsic::x86_seh_lsda: {
15491     // Compute the symbol for the LSDA. We know it'll get emitted later.
15492     MachineFunction &MF = DAG.getMachineFunction();
15493     SDValue Op1 = Op.getOperand(1);
15494     auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
15495     MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
15496         GlobalValue::getRealLinkageName(Fn->getName()));
15497     StringRef Name = LSDASym->getName();
15498     assert(Name.data()[Name.size()] == '\0' && "not null terminated");
15499
15500     // Generate a simple absolute symbol reference. This intrinsic is only
15501     // supported on 32-bit Windows, which isn't PIC.
15502     SDValue Result =
15503         DAG.getTargetExternalSymbol(Name.data(), VT, X86II::MO_NOPREFIX);
15504     return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
15505   }
15506   }
15507 }
15508
15509 static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
15510                               SDValue Src, SDValue Mask, SDValue Base,
15511                               SDValue Index, SDValue ScaleOp, SDValue Chain,
15512                               const X86Subtarget * Subtarget) {
15513   SDLoc dl(Op);
15514   ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
15515   assert(C && "Invalid scale type");
15516   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
15517   EVT MaskVT = MVT::getVectorVT(MVT::i1,
15518                              Index.getSimpleValueType().getVectorNumElements());
15519   SDValue MaskInReg;
15520   ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
15521   if (MaskC)
15522     MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), dl, MaskVT);
15523   else
15524     MaskInReg = DAG.getBitcast(MaskVT, Mask);
15525   SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
15526   SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
15527   SDValue Segment = DAG.getRegister(0, MVT::i32);
15528   if (Src.getOpcode() == ISD::UNDEF)
15529     Src = getZeroVector(Op.getValueType(), Subtarget, DAG, dl);
15530   SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain};
15531   SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
15532   SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
15533   return DAG.getMergeValues(RetOps, dl);
15534 }
15535
15536 static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
15537                                SDValue Src, SDValue Mask, SDValue Base,
15538                                SDValue Index, SDValue ScaleOp, SDValue Chain) {
15539   SDLoc dl(Op);
15540   ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
15541   assert(C && "Invalid scale type");
15542   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
15543   SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
15544   SDValue Segment = DAG.getRegister(0, MVT::i32);
15545   EVT MaskVT = MVT::getVectorVT(MVT::i1,
15546                              Index.getSimpleValueType().getVectorNumElements());
15547   SDValue MaskInReg;
15548   ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
15549   if (MaskC)
15550     MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), dl, MaskVT);
15551   else
15552     MaskInReg = DAG.getBitcast(MaskVT, Mask);
15553   SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
15554   SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain};
15555   SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
15556   return SDValue(Res, 1);
15557 }
15558
15559 static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
15560                                SDValue Mask, SDValue Base, SDValue Index,
15561                                SDValue ScaleOp, SDValue Chain) {
15562   SDLoc dl(Op);
15563   ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
15564   assert(C && "Invalid scale type");
15565   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
15566   SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
15567   SDValue Segment = DAG.getRegister(0, MVT::i32);
15568   EVT MaskVT =
15569     MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
15570   SDValue MaskInReg;
15571   ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
15572   if (MaskC)
15573     MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), dl, MaskVT);
15574   else
15575     MaskInReg = DAG.getBitcast(MaskVT, Mask);
15576   //SDVTList VTs = DAG.getVTList(MVT::Other);
15577   SDValue Ops[] = {MaskInReg, Base, Scale, Index, Disp, Segment, Chain};
15578   SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
15579   return SDValue(Res, 0);
15580 }
15581
15582 // getReadPerformanceCounter - Handles the lowering of builtin intrinsics that
15583 // read performance monitor counters (x86_rdpmc).
15584 static void getReadPerformanceCounter(SDNode *N, SDLoc DL,
15585                               SelectionDAG &DAG, const X86Subtarget *Subtarget,
15586                               SmallVectorImpl<SDValue> &Results) {
15587   assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
15588   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
15589   SDValue LO, HI;
15590
15591   // The ECX register is used to select the index of the performance counter
15592   // to read.
15593   SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX,
15594                                    N->getOperand(2));
15595   SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain);
15596
15597   // Reads the content of a 64-bit performance counter and returns it in the
15598   // registers EDX:EAX.
15599   if (Subtarget->is64Bit()) {
15600     LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
15601     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
15602                             LO.getValue(2));
15603   } else {
15604     LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
15605     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
15606                             LO.getValue(2));
15607   }
15608   Chain = HI.getValue(1);
15609
15610   if (Subtarget->is64Bit()) {
15611     // The EAX register is loaded with the low-order 32 bits. The EDX register
15612     // is loaded with the supported high-order bits of the counter.
15613     SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
15614                               DAG.getConstant(32, DL, MVT::i8));
15615     Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
15616     Results.push_back(Chain);
15617     return;
15618   }
15619
15620   // Use a buildpair to merge the two 32-bit values into a 64-bit one.
15621   SDValue Ops[] = { LO, HI };
15622   SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
15623   Results.push_back(Pair);
15624   Results.push_back(Chain);
15625 }
15626
15627 // getReadTimeStampCounter - Handles the lowering of builtin intrinsics that
15628 // read the time stamp counter (x86_rdtsc and x86_rdtscp). This function is
15629 // also used to custom lower READCYCLECOUNTER nodes.
15630 static void getReadTimeStampCounter(SDNode *N, SDLoc DL, unsigned Opcode,
15631                               SelectionDAG &DAG, const X86Subtarget *Subtarget,
15632                               SmallVectorImpl<SDValue> &Results) {
15633   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
15634   SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0));
15635   SDValue LO, HI;
15636
15637   // The processor's time-stamp counter (a 64-bit MSR) is stored into the
15638   // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
15639   // and the EAX register is loaded with the low-order 32 bits.
15640   if (Subtarget->is64Bit()) {
15641     LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
15642     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
15643                             LO.getValue(2));
15644   } else {
15645     LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
15646     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
15647                             LO.getValue(2));
15648   }
15649   SDValue Chain = HI.getValue(1);
15650
15651   if (Opcode == X86ISD::RDTSCP_DAG) {
15652     assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
15653
15654     // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
15655     // the ECX register. Add 'ecx' explicitly to the chain.
15656     SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32,
15657                                      HI.getValue(2));
15658     // Explicitly store the content of ECX at the location passed in input
15659     // to the 'rdtscp' intrinsic.
15660     Chain = DAG.getStore(ecx.getValue(1), DL, ecx, N->getOperand(2),
15661                          MachinePointerInfo(), false, false, 0);
15662   }
15663
15664   if (Subtarget->is64Bit()) {
15665     // The EDX register is loaded with the high-order 32 bits of the MSR, and
15666     // the EAX register is loaded with the low-order 32 bits.
15667     SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
15668                               DAG.getConstant(32, DL, MVT::i8));
15669     Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
15670     Results.push_back(Chain);
15671     return;
15672   }
15673
15674   // Use a buildpair to merge the two 32-bit values into a 64-bit one.
15675   SDValue Ops[] = { LO, HI };
15676   SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
15677   Results.push_back(Pair);
15678   Results.push_back(Chain);
15679 }
15680
15681 static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget,
15682                                      SelectionDAG &DAG) {
15683   SmallVector<SDValue, 2> Results;
15684   SDLoc DL(Op);
15685   getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget,
15686                           Results);
15687   return DAG.getMergeValues(Results, DL);
15688 }
15689
15690
/// Custom-lower a chained intrinsic node.
///
/// Operand 0 of \p Op is the incoming chain and operand 1 is the intrinsic ID.
/// The ID is looked up with getIntrinsicWithChain(); when no table entry
/// exists an empty SDValue is returned. Otherwise the entry's Type selects one
/// of the lowering recipes below, using the entry's Opc0/Opc1 as the node
/// opcode(s) to emit.
static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
                                      SelectionDAG &DAG) {
  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();

  const IntrinsicData* IntrData = getIntrinsicWithChain(IntNo);
  if (!IntrData)
    return SDValue();

  SDLoc dl(Op);
  switch(IntrData->Type) {
  default:
    llvm_unreachable("Unknown Intrinsic Type");
    break;
  case RDSEED:
  case RDRAND: {
    // Emit the node with the right value type.
    // Results of the new node: 0 = value, 1 = glue, 2 = chain.
    SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other);
    SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));

    // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
    // Otherwise return the value from Rand, which is always 0, cast to the
    // type of the intrinsic's second result.
    SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
                      DAG.getConstant(1, dl, Op->getValueType(1)),
                      DAG.getConstant(X86::COND_B, dl, MVT::i32),
                      SDValue(Result.getNode(), 1) };
    SDValue isValid = DAG.getNode(X86ISD::CMOV, dl,
                                  DAG.getVTList(Op->getValueType(1), MVT::Glue),
                                  Ops);

    // Return { result, isValid, chain }.
    return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
                       SDValue(Result.getNode(), 2));
  }
  case GATHER: {
    // Node operands as read below:
    //   (chain, id, src, base, index, mask, scale)
    SDValue Chain = Op.getOperand(0);
    SDValue Src   = Op.getOperand(2);
    SDValue Base  = Op.getOperand(3);
    SDValue Index = Op.getOperand(4);
    SDValue Mask  = Op.getOperand(5);
    SDValue Scale = Op.getOperand(6);
    return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale,
                         Chain, Subtarget);
  }
  case SCATTER: {
    // Node operands as read below:
    //   (chain, id, base, mask, index, src, scale)
    SDValue Chain = Op.getOperand(0);
    SDValue Base  = Op.getOperand(2);
    SDValue Mask  = Op.getOperand(3);
    SDValue Index = Op.getOperand(4);
    SDValue Src   = Op.getOperand(5);
    SDValue Scale = Op.getOperand(6);
    return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
                          Scale, Chain);
  }
  case PREFETCH: {
    // Node operands as read below:
    //   (chain, id, mask, index, base, scale, hint)
    // The constant hint (0 or 1) picks between the entry's two opcodes.
    SDValue Hint = Op.getOperand(6);
    unsigned HintVal = cast<ConstantSDNode>(Hint)->getZExtValue();
    assert(HintVal < 2 && "Wrong prefetch hint in intrinsic: should be 0 or 1");
    unsigned Opcode = (HintVal ? IntrData->Opc1 : IntrData->Opc0);
    SDValue Chain = Op.getOperand(0);
    SDValue Mask  = Op.getOperand(2);
    SDValue Index = Op.getOperand(3);
    SDValue Base  = Op.getOperand(4);
    SDValue Scale = Op.getOperand(5);
    return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain);
  }
  // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
  case RDTSC: {
    SmallVector<SDValue, 2> Results;
    getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
                            Results);
    return DAG.getMergeValues(Results, dl);
  }
  // Read Performance Monitoring Counters.
  case RDPMC: {
    SmallVector<SDValue, 2> Results;
    getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results);
    return DAG.getMergeValues(Results, dl);
  }
  // XTEST intrinsics.
  case XTEST: {
    // Emit the test node (value + chain), turn its value into an i8 via
    // SETCC(COND_NE), and zero-extend that to the intrinsic's result type.
    SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
    SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
    SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
                                DAG.getConstant(X86::COND_NE, dl, MVT::i8),
                                InTrans);
    SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
    return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
                       Ret, SDValue(InTrans.getNode(), 1));
  }
  // ADC/ADCX/SBB
  case ADX: {
    // Node operands as read below:
    //   (chain, id, carry-in, lhs, rhs, result pointer)
    SmallVector<SDValue, 2> Results;
    SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
    SDVTList VTs = DAG.getVTList(Op.getOperand(3)->getValueType(0), MVT::Other);
    // Add -1 (i8) to the carry-in operand; result 1 of this node is fed to the
    // arithmetic node below as its carry input. Presumably this re-creates CF
    // from a non-zero carry-in byte.
    SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2),
                                DAG.getConstant(-1, dl, MVT::i8));
    SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3),
                              Op.getOperand(4), GenCF.getValue(1));
    // Store the arithmetic result through the pointer in operand 5.
    SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0),
                                 Op.getOperand(5), MachinePointerInfo(),
                                 false, false, 0);
    // The intrinsic's value result is the carry-out, read with COND_B.
    SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
                                DAG.getConstant(X86::COND_B, dl, MVT::i8),
                                Res.getValue(1));
    Results.push_back(SetCC);
    Results.push_back(Store);
    return DAG.getMergeValues(Results, dl);
  }
  case COMPRESS_TO_MEM: {
    // Node operands as read below: (chain, id, addr, data, mask).
    // Note: this 'dl' shadows the function-level 'dl'; both wrap Op.
    SDLoc dl(Op);
    SDValue Mask = Op.getOperand(4);
    SDValue DataToCompress = Op.getOperand(3);
    SDValue Addr = Op.getOperand(2);
    SDValue Chain = Op.getOperand(0);

    EVT VT = DataToCompress.getValueType();
    if (isAllOnes(Mask)) // return just a store
      return DAG.getStore(Chain, dl, DataToCompress, Addr,
                          MachinePointerInfo(), false, false,
                          VT.getScalarSizeInBits()/8);

    // Reinterpret the mask as a vector of i1 (one element per mask bit) and
    // keep the leading elements, one per data element.
    EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
                                  VT.getVectorNumElements());
    EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
                                     Mask.getValueType().getSizeInBits());
    SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
                                DAG.getBitcast(BitcastVT, Mask),
                                DAG.getIntPtrConstant(0, dl));

    // Compress under the mask, then store the compressed vector with
    // element-size alignment.
    SDValue Compressed =  DAG.getNode(IntrData->Opc0, dl, VT, VMask,
                                      DataToCompress, DAG.getUNDEF(VT));
    return DAG.getStore(Chain, dl, Compressed, Addr,
                        MachinePointerInfo(), false, false,
                        VT.getScalarSizeInBits()/8);
  }
  case EXPAND_FROM_MEM: {
    // Node operands as read below: (chain, id, addr, pass-through, mask).
    // Note: this 'dl' shadows the function-level 'dl'; both wrap Op.
    SDLoc dl(Op);
    SDValue Mask = Op.getOperand(4);
    SDValue PathThru = Op.getOperand(3);
    SDValue Addr = Op.getOperand(2);
    SDValue Chain = Op.getOperand(0);
    EVT VT = Op.getValueType();

    if (isAllOnes(Mask)) // return just a load
      return DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(), false, false,
                         false, VT.getScalarSizeInBits()/8);
    // Same mask conversion as in COMPRESS_TO_MEM: bitcast to a vector of i1
    // and extract one element per data element starting at index 0.
    EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
                                  VT.getVectorNumElements());
    EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
                                     Mask.getValueType().getSizeInBits());
    SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
                                DAG.getBitcast(BitcastVT, Mask),
                                DAG.getIntPtrConstant(0, dl));

    SDValue DataToExpand = DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(),
                                       false, false, false,
                                       VT.getScalarSizeInBits()/8);

    // NOTE(review): the chain returned here is the incoming chain, not the
    // load's output chain (DataToExpand.getValue(1)) - confirm this is
    // intentional.
    SDValue Results[] = {
        DAG.getNode(IntrData->Opc0, dl, VT, VMask, DataToExpand, PathThru),
        Chain};
    return DAG.getMergeValues(Results, dl);
  }
  }
}
15858
15859 SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
15860                                            SelectionDAG &DAG) const {
15861   MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
15862   MFI->setReturnAddressIsTaken(true);
15863
15864   if (verifyReturnAddressArgumentIsConstant(Op, DAG))
15865     return SDValue();
15866
15867   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
15868   SDLoc dl(Op);
15869   EVT PtrVT = getPointerTy();
15870
15871   if (Depth > 0) {
15872     SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
15873     const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
15874     SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
15875     return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
15876                        DAG.getNode(ISD::ADD, dl, PtrVT,
15877                                    FrameAddr, Offset),
15878                        MachinePointerInfo(), false, false, false, 0);
15879   }
15880
15881   // Just load the return address.
15882   SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
15883   return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
15884                      RetAddrFI, MachinePointerInfo(), false, false, false, 0);
15885 }
15886
15887 SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
15888   MachineFunction &MF = DAG.getMachineFunction();
15889   MachineFrameInfo *MFI = MF.getFrameInfo();
15890   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
15891   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
15892   EVT VT = Op.getValueType();
15893
15894   MFI->setFrameAddressIsTaken(true);
15895
15896   if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
15897     // Depth > 0 makes no sense on targets which use Windows unwind codes.  It
15898     // is not possible to crawl up the stack without looking at the unwind codes
15899     // simultaneously.
15900     int FrameAddrIndex = FuncInfo->getFAIndex();
15901     if (!FrameAddrIndex) {
15902       // Set up a frame object for the return address.
15903       unsigned SlotSize = RegInfo->getSlotSize();
15904       FrameAddrIndex = MF.getFrameInfo()->CreateFixedObject(
15905           SlotSize, /*Offset=*/0, /*IsImmutable=*/false);
15906       FuncInfo->setFAIndex(FrameAddrIndex);
15907     }
15908     return DAG.getFrameIndex(FrameAddrIndex, VT);
15909   }
15910
15911   unsigned FrameReg =
15912       RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
15913   SDLoc dl(Op);  // FIXME probably not meaningful
15914   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
15915   assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
15916           (FrameReg == X86::EBP && VT == MVT::i32)) &&
15917          "Invalid Frame Register!");
15918   SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
15919   while (Depth--)
15920     FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
15921                             MachinePointerInfo(),
15922                             false, false, false, 0);
15923   return FrameAddr;
15924 }
15925
15926 // FIXME? Maybe this could be a TableGen attribute on some registers and
15927 // this table could be generated automatically from RegInfo.
15928 unsigned X86TargetLowering::getRegisterByName(const char* RegName,
15929                                               EVT VT) const {
15930   unsigned Reg = StringSwitch<unsigned>(RegName)
15931                        .Case("esp", X86::ESP)
15932                        .Case("rsp", X86::RSP)
15933                        .Default(0);
15934   if (Reg)
15935     return Reg;
15936   report_fatal_error("Invalid register name global variable");
15937 }
15938
15939 SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
15940                                                      SelectionDAG &DAG) const {
15941   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
15942   return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
15943 }
15944
15945 SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
15946   SDValue Chain     = Op.getOperand(0);
15947   SDValue Offset    = Op.getOperand(1);
15948   SDValue Handler   = Op.getOperand(2);
15949   SDLoc dl      (Op);
15950
15951   EVT PtrVT = getPointerTy();
15952   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
15953   unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
15954   assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
15955           (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
15956          "Invalid Frame Register!");
15957   SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
15958   unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
15959
15960   SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
15961                                  DAG.getIntPtrConstant(RegInfo->getSlotSize(),
15962                                                        dl));
15963   StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
15964   Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(),
15965                        false, false, 0);
15966   Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
15967
15968   return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
15969                      DAG.getRegister(StoreAddrReg, PtrVT));
15970 }
15971
15972 SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
15973                                                SelectionDAG &DAG) const {
15974   SDLoc DL(Op);
15975   return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
15976                      DAG.getVTList(MVT::i32, MVT::Other),
15977                      Op.getOperand(0), Op.getOperand(1));
15978 }
15979
15980 SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
15981                                                 SelectionDAG &DAG) const {
15982   SDLoc DL(Op);
15983   return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
15984                      Op.getOperand(0), Op.getOperand(1));
15985 }
15986
15987 static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
15988   return Op.getOperand(0);
15989 }
15990
/// Lower INIT_TRAMPOLINE by emitting the stores that write the trampoline's
/// machine code into memory.
///
/// Node operands as read below: 0 = chain, 1 = trampoline address,
/// 2 = nested function pointer, 3 = 'nest' parameter value, 4 = source value
/// for the trampoline address, 5 = source value naming the nested function
/// (only read on the 32-bit path).
///
/// Every store is chained directly to the incoming chain and the results are
/// joined with a TokenFactor, which is the returned value.
///
/// 64-bit layout written (byte offsets from the trampoline address):
///    0: 2-byte opcode, movabsq into r11     2: nested function pointer
///   10: 2-byte opcode, movabsq into r10    12: 'nest' value
///   20: 2-byte opcode, jmpq *...           22: ModRM byte selecting r11
///
/// 32-bit layout written:
///    0: 1-byte mov-immediate opcode for the nest register
///    1: 'nest' value
///    5: 1-byte jmp opcode (0xE9)
///    6: displacement = function pointer - (trampoline address + 10)
SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
                                                SelectionDAG &DAG) const {
  SDValue Root = Op.getOperand(0);
  SDValue Trmp = Op.getOperand(1); // trampoline
  SDValue FPtr = Op.getOperand(2); // nested function
  SDValue Nest = Op.getOperand(3); // 'nest' parameter value
  SDLoc dl (Op);

  const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
  const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();

  if (Subtarget->is64Bit()) {
    SDValue OutChains[6];

    // Large code-model.
    const unsigned char JMP64r  = 0xFF; // 64-bit jmp through register opcode.
    const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.

    // Only the low three bits of each register encoding are used here.
    const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
    const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;

    const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix

    // Load the pointer to the nested function into R11.
    // Each 16-bit opcode constant keeps the REX prefix in its low byte and the
    // opcode byte in its high byte.
    unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
    SDValue Addr = Trmp;
    OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
                                Addr, MachinePointerInfo(TrmpAddr),
                                false, false, 0);

    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(2, dl, MVT::i64));
    OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
                                MachinePointerInfo(TrmpAddr, 2),
                                false, false, 2);

    // Load the 'nest' parameter value into R10.
    // R10 is specified in X86CallingConv.td
    OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(10, dl, MVT::i64));
    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
                                Addr, MachinePointerInfo(TrmpAddr, 10),
                                false, false, 0);

    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(12, dl, MVT::i64));
    OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
                                MachinePointerInfo(TrmpAddr, 12),
                                false, false, 2);

    // Jump to the nested function.
    OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(20, dl, MVT::i64));
    OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
                                Addr, MachinePointerInfo(TrmpAddr, 20),
                                false, false, 0);

    // ModRM: mod = 3, reg field = 4, r/m = low bits of R11.
    unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(22, dl, MVT::i64));
    OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
                                Addr, MachinePointerInfo(TrmpAddr, 22),
                                false, false, 0);

    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
  } else {
    const Function *Func =
      cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
    CallingConv::ID CC = Func->getCallingConv();
    // Assigned by every reachable case of the switch below.
    unsigned NestReg;

    switch (CC) {
    default:
      llvm_unreachable("Unsupported calling convention");
    case CallingConv::C:
    case CallingConv::X86_StdCall: {
      // Pass 'nest' parameter in ECX.
      // Must be kept in sync with X86CallingConv.td
      NestReg = X86::ECX;

      // Check that ECX wasn't needed by an 'inreg' parameter.
      FunctionType *FTy = Func->getFunctionType();
      const AttributeSet &Attrs = Func->getAttributes();

      if (!Attrs.isEmpty() && !Func->isVarArg()) {
        unsigned InRegCount = 0;
        // Attribute indices for parameters start at 1.
        unsigned Idx = 1;

        // Count the 32-bit units occupied by 'inreg' parameters.
        for (FunctionType::param_iterator I = FTy->param_begin(),
             E = FTy->param_end(); I != E; ++I, ++Idx)
          if (Attrs.hasAttribute(Idx, Attribute::InReg))
            // FIXME: should only count parameters that are lowered to integers.
            InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32;

        if (InRegCount > 2) {
          report_fatal_error("Nest register in use - reduce number of inreg"
                             " parameters!");
        }
      }
      break;
    }
    case CallingConv::X86_FastCall:
    case CallingConv::X86_ThisCall:
    case CallingConv::Fast:
      // Pass 'nest' parameter in EAX.
      // Must be kept in sync with X86CallingConv.td
      NestReg = X86::EAX;
      break;
    }

    SDValue OutChains[4];
    SDValue Addr, Disp;

    // The jump displacement is measured from trampoline address + 10, i.e.
    // the first byte after the 4-byte displacement stored at offset 6.
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(10, dl, MVT::i32));
    Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);

    // This is storing the opcode for MOV32ri.
    const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
    const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
    OutChains[0] = DAG.getStore(Root, dl,
                                DAG.getConstant(MOV32ri|N86Reg, dl, MVT::i8),
                                Trmp, MachinePointerInfo(TrmpAddr),
                                false, false, 0);

    // Immediate operand of the mov: the 'nest' value.
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(1, dl, MVT::i32));
    OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
                                MachinePointerInfo(TrmpAddr, 1),
                                false, false, 1);

    const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(5, dl, MVT::i32));
    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8),
                                Addr, MachinePointerInfo(TrmpAddr, 5),
                                false, false, 1);

    // Operand of the jmp: the displacement computed above.
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(6, dl, MVT::i32));
    OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
                                MachinePointerInfo(TrmpAddr, 6),
                                false, false, 1);

    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
  }
}
16140
16141 SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
16142                                             SelectionDAG &DAG) const {
16143   /*
16144    The rounding mode is in bits 11:10 of FPSR, and has the following
16145    settings:
16146      00 Round to nearest
16147      01 Round to -inf
16148      10 Round to +inf
16149      11 Round to 0
16150
16151   FLT_ROUNDS, on the other hand, expects the following:
16152     -1 Undefined
16153      0 Round to 0
16154      1 Round to nearest
16155      2 Round to +inf
16156      3 Round to -inf
16157
16158   To perform the conversion, we do:
16159     (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3)
16160   */
16161
16162   MachineFunction &MF = DAG.getMachineFunction();
16163   const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
16164   unsigned StackAlignment = TFI.getStackAlignment();
16165   MVT VT = Op.getSimpleValueType();
16166   SDLoc DL(Op);
16167
16168   // Save FP Control Word to stack slot
16169   int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false);
16170   SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
16171
16172   MachineMemOperand *MMO =
16173    MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
16174                            MachineMemOperand::MOStore, 2, 2);
16175
16176   SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
16177   SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
16178                                           DAG.getVTList(MVT::Other),
16179                                           Ops, MVT::i16, MMO);
16180
16181   // Load FP Control Word from stack slot
16182   SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot,
16183                             MachinePointerInfo(), false, false, false, 0);
16184
16185   // Transform as necessary
16186   SDValue CWD1 =
16187     DAG.getNode(ISD::SRL, DL, MVT::i16,
16188                 DAG.getNode(ISD::AND, DL, MVT::i16,
16189                             CWD, DAG.getConstant(0x800, DL, MVT::i16)),
16190                 DAG.getConstant(11, DL, MVT::i8));
16191   SDValue CWD2 =
16192     DAG.getNode(ISD::SRL, DL, MVT::i16,
16193                 DAG.getNode(ISD::AND, DL, MVT::i16,
16194                             CWD, DAG.getConstant(0x400, DL, MVT::i16)),
16195                 DAG.getConstant(9, DL, MVT::i8));
16196
16197   SDValue RetVal =
16198     DAG.getNode(ISD::AND, DL, MVT::i16,
16199                 DAG.getNode(ISD::ADD, DL, MVT::i16,
16200                             DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),
16201                             DAG.getConstant(1, DL, MVT::i16)),
16202                 DAG.getConstant(3, DL, MVT::i16));
16203
16204   return DAG.getNode((VT.getSizeInBits() < 16 ?
16205                       ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
16206 }
16207
16208 static SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) {
16209   MVT VT = Op.getSimpleValueType();
16210   EVT OpVT = VT;
16211   unsigned NumBits = VT.getSizeInBits();
16212   SDLoc dl(Op);
16213
16214   Op = Op.getOperand(0);
16215   if (VT == MVT::i8) {
16216     // Zero extend to i32 since there is not an i8 bsr.
16217     OpVT = MVT::i32;
16218     Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
16219   }
16220
16221   // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
16222   SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
16223   Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
16224
16225   // If src is zero (i.e. bsr sets ZF), returns NumBits.
16226   SDValue Ops[] = {
16227     Op,
16228     DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
16229     DAG.getConstant(X86::COND_E, dl, MVT::i8),
16230     Op.getValue(1)
16231   };
16232   Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
16233
16234   // Finally xor with NumBits-1.
16235   Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
16236                    DAG.getConstant(NumBits - 1, dl, OpVT));
16237
16238   if (VT == MVT::i8)
16239     Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
16240   return Op;
16241 }
16242
16243 static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) {
16244   MVT VT = Op.getSimpleValueType();
16245   EVT OpVT = VT;
16246   unsigned NumBits = VT.getSizeInBits();
16247   SDLoc dl(Op);
16248
16249   Op = Op.getOperand(0);
16250   if (VT == MVT::i8) {
16251     // Zero extend to i32 since there is not an i8 bsr.
16252     OpVT = MVT::i32;
16253     Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
16254   }
16255
16256   // Issue a bsr (scan bits in reverse).
16257   SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
16258   Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
16259
16260   // And xor with NumBits-1.
16261   Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
16262                    DAG.getConstant(NumBits - 1, dl, OpVT));
16263
16264   if (VT == MVT::i8)
16265     Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
16266   return Op;
16267 }
16268
16269 static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
16270   MVT VT = Op.getSimpleValueType();
16271   unsigned NumBits = VT.getSizeInBits();
16272   SDLoc dl(Op);
16273   Op = Op.getOperand(0);
16274
16275   // Issue a bsf (scan bits forward) which also sets EFLAGS.
16276   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
16277   Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op);
16278
16279   // If src is zero (i.e. bsf sets ZF), returns NumBits.
16280   SDValue Ops[] = {
16281     Op,
16282     DAG.getConstant(NumBits, dl, VT),
16283     DAG.getConstant(X86::COND_E, dl, MVT::i8),
16284     Op.getValue(1)
16285   };
16286   return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
16287 }
16288
16289 // Lower256IntArith - Break a 256-bit integer operation into two new 128-bit
16290 // ones, and then concatenate the result back.
16291 static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
16292   MVT VT = Op.getSimpleValueType();
16293
16294   assert(VT.is256BitVector() && VT.isInteger() &&
16295          "Unsupported value type for operation");
16296
16297   unsigned NumElems = VT.getVectorNumElements();
16298   SDLoc dl(Op);
16299
16300   // Extract the LHS vectors
16301   SDValue LHS = Op.getOperand(0);
16302   SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
16303   SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
16304
16305   // Extract the RHS vectors
16306   SDValue RHS = Op.getOperand(1);
16307   SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl);
16308   SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl);
16309
16310   MVT EltVT = VT.getVectorElementType();
16311   MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
16312
16313   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
16314                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
16315                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
16316 }
16317
16318 static SDValue LowerADD(SDValue Op, SelectionDAG &DAG) {
16319   if (Op.getValueType() == MVT::i1)
16320     return DAG.getNode(ISD::XOR, SDLoc(Op), Op.getValueType(),
16321                        Op.getOperand(0), Op.getOperand(1));
16322   assert(Op.getSimpleValueType().is256BitVector() &&
16323          Op.getSimpleValueType().isInteger() &&
16324          "Only handle AVX 256-bit vector integer operation");
16325   return Lower256IntArith(Op, DAG);
16326 }
16327
16328 static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) {
16329   if (Op.getValueType() == MVT::i1)
16330     return DAG.getNode(ISD::XOR, SDLoc(Op), Op.getValueType(),
16331                        Op.getOperand(0), Op.getOperand(1));
16332   assert(Op.getSimpleValueType().is256BitVector() &&
16333          Op.getSimpleValueType().isInteger() &&
16334          "Only handle AVX 256-bit vector integer operation");
16335   return Lower256IntArith(Op, DAG);
16336 }
16337
16338 static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
16339                         SelectionDAG &DAG) {
16340   SDLoc dl(Op);
16341   MVT VT = Op.getSimpleValueType();
16342
16343   if (VT == MVT::i1)
16344     return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1));
16345
16346   // Decompose 256-bit ops into smaller 128-bit ops.
16347   if (VT.is256BitVector() && !Subtarget->hasInt256())
16348     return Lower256IntArith(Op, DAG);
16349
16350   SDValue A = Op.getOperand(0);
16351   SDValue B = Op.getOperand(1);
16352
16353   // Lower v16i8/v32i8 mul as promotion to v8i16/v16i16 vector
16354   // pairs, multiply and truncate.
16355   if (VT == MVT::v16i8 || VT == MVT::v32i8) {
16356     if (Subtarget->hasInt256()) {
16357       if (VT == MVT::v32i8) {
16358         MVT SubVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() / 2);
16359         SDValue Lo = DAG.getIntPtrConstant(0, dl);
16360         SDValue Hi = DAG.getIntPtrConstant(VT.getVectorNumElements() / 2, dl);
16361         SDValue ALo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, A, Lo);
16362         SDValue BLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, B, Lo);
16363         SDValue AHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, A, Hi);
16364         SDValue BHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, B, Hi);
16365         return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
16366                            DAG.getNode(ISD::MUL, dl, SubVT, ALo, BLo),
16367                            DAG.getNode(ISD::MUL, dl, SubVT, AHi, BHi));
16368       }
16369
16370       MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
16371       return DAG.getNode(
16372           ISD::TRUNCATE, dl, VT,
16373           DAG.getNode(ISD::MUL, dl, ExVT,
16374                       DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, A),
16375                       DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, B)));
16376     }
16377
16378     assert(VT == MVT::v16i8 &&
16379            "Pre-AVX2 support only supports v16i8 multiplication");
16380     MVT ExVT = MVT::v8i16;
16381
16382     // Extract the lo parts and sign extend to i16
16383     SDValue ALo, BLo;
16384     if (Subtarget->hasSSE41()) {
16385       ALo = DAG.getNode(X86ISD::VSEXT, dl, ExVT, A);
16386       BLo = DAG.getNode(X86ISD::VSEXT, dl, ExVT, B);
16387     } else {
16388       const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
16389                               -1, 4, -1, 5, -1, 6, -1, 7};
16390       ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
16391       BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
16392       ALo = DAG.getBitcast(ExVT, ALo);
16393       BLo = DAG.getBitcast(ExVT, BLo);
16394       ALo = DAG.getNode(ISD::SRA, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
16395       BLo = DAG.getNode(ISD::SRA, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
16396     }
16397
16398     // Extract the hi parts and sign extend to i16
16399     SDValue AHi, BHi;
16400     if (Subtarget->hasSSE41()) {
16401       const int ShufMask[] = {8,  9,  10, 11, 12, 13, 14, 15,
16402                               -1, -1, -1, -1, -1, -1, -1, -1};
16403       AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
16404       BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
16405       AHi = DAG.getNode(X86ISD::VSEXT, dl, ExVT, AHi);
16406       BHi = DAG.getNode(X86ISD::VSEXT, dl, ExVT, BHi);
16407     } else {
16408       const int ShufMask[] = {-1, 8,  -1, 9,  -1, 10, -1, 11,
16409                               -1, 12, -1, 13, -1, 14, -1, 15};
16410       AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
16411       BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
16412       AHi = DAG.getBitcast(ExVT, AHi);
16413       BHi = DAG.getBitcast(ExVT, BHi);
16414       AHi = DAG.getNode(ISD::SRA, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
16415       BHi = DAG.getNode(ISD::SRA, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
16416     }
16417
16418     // Multiply, mask the lower 8bits of the lo/hi results and pack
16419     SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
16420     SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
16421     RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT));
16422     RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT));
16423     return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
16424   }
16425
16426   // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
16427   if (VT == MVT::v4i32) {
16428     assert(Subtarget->hasSSE2() && !Subtarget->hasSSE41() &&
16429            "Should not custom lower when pmuldq is available!");
16430
16431     // Extract the odd parts.
16432     static const int UnpackMask[] = { 1, -1, 3, -1 };
16433     SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
16434     SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
16435
16436     // Multiply the even parts.
16437     SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B);
16438     // Now multiply odd parts.
16439     SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds);
16440
16441     Evens = DAG.getBitcast(VT, Evens);
16442     Odds = DAG.getBitcast(VT, Odds);
16443
16444     // Merge the two vectors back together with a shuffle. This expands into 2
16445     // shuffles.
16446     static const int ShufMask[] = { 0, 4, 2, 6 };
16447     return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
16448   }
16449
16450   assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
16451          "Only know how to lower V2I64/V4I64/V8I64 multiply");
16452
16453   //  Ahi = psrlqi(a, 32);
16454   //  Bhi = psrlqi(b, 32);
16455   //
16456   //  AloBlo = pmuludq(a, b);
16457   //  AloBhi = pmuludq(a, Bhi);
16458   //  AhiBlo = pmuludq(Ahi, b);
16459
16460   //  AloBhi = psllqi(AloBhi, 32);
16461   //  AhiBlo = psllqi(AhiBlo, 32);
16462   //  return AloBlo + AloBhi + AhiBlo;
16463
16464   SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
16465   SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
16466
16467   // Bit cast to 32-bit vectors for MULUDQ
16468   EVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 :
16469                                   (VT == MVT::v4i64) ? MVT::v8i32 : MVT::v16i32;
16470   A = DAG.getBitcast(MulVT, A);
16471   B = DAG.getBitcast(MulVT, B);
16472   Ahi = DAG.getBitcast(MulVT, Ahi);
16473   Bhi = DAG.getBitcast(MulVT, Bhi);
16474
16475   SDValue AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
16476   SDValue AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
16477   SDValue AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
16478
16479   AloBhi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AloBhi, 32, DAG);
16480   AhiBlo = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AhiBlo, 32, DAG);
16481
16482   SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi);
16483   return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
16484 }
16485
16486 SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
16487   assert(Subtarget->isTargetWin64() && "Unexpected target");
16488   EVT VT = Op.getValueType();
16489   assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
16490          "Unexpected return type for lowering");
16491
16492   RTLIB::Libcall LC;
16493   bool isSigned;
16494   switch (Op->getOpcode()) {
16495   default: llvm_unreachable("Unexpected request for libcall!");
16496   case ISD::SDIV:      isSigned = true;  LC = RTLIB::SDIV_I128;    break;
16497   case ISD::UDIV:      isSigned = false; LC = RTLIB::UDIV_I128;    break;
16498   case ISD::SREM:      isSigned = true;  LC = RTLIB::SREM_I128;    break;
16499   case ISD::UREM:      isSigned = false; LC = RTLIB::UREM_I128;    break;
16500   case ISD::SDIVREM:   isSigned = true;  LC = RTLIB::SDIVREM_I128; break;
16501   case ISD::UDIVREM:   isSigned = false; LC = RTLIB::UDIVREM_I128; break;
16502   }
16503
16504   SDLoc dl(Op);
16505   SDValue InChain = DAG.getEntryNode();
16506
16507   TargetLowering::ArgListTy Args;
16508   TargetLowering::ArgListEntry Entry;
16509   for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
16510     EVT ArgVT = Op->getOperand(i).getValueType();
16511     assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
16512            "Unexpected argument type for lowering");
16513     SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
16514     Entry.Node = StackPtr;
16515     InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MachinePointerInfo(),
16516                            false, false, 16);
16517     Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
16518     Entry.Ty = PointerType::get(ArgTy,0);
16519     Entry.isSExt = false;
16520     Entry.isZExt = false;
16521     Args.push_back(Entry);
16522   }
16523
16524   SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
16525                                          getPointerTy());
16526
16527   TargetLowering::CallLoweringInfo CLI(DAG);
16528   CLI.setDebugLoc(dl).setChain(InChain)
16529     .setCallee(getLibcallCallingConv(LC),
16530                static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()),
16531                Callee, std::move(Args), 0)
16532     .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);
16533
16534   std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
16535   return DAG.getBitcast(VT, CallInfo.first);
16536 }
16537
16538 static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget,
16539                              SelectionDAG &DAG) {
16540   SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
16541   EVT VT = Op0.getValueType();
16542   SDLoc dl(Op);
16543
16544   assert((VT == MVT::v4i32 && Subtarget->hasSSE2()) ||
16545          (VT == MVT::v8i32 && Subtarget->hasInt256()));
16546
16547   // PMULxD operations multiply each even value (starting at 0) of LHS with
16548   // the related value of RHS and produce a widen result.
16549   // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
16550   // => <2 x i64> <ae|cg>
16551   //
16552   // In other word, to have all the results, we need to perform two PMULxD:
16553   // 1. one with the even values.
16554   // 2. one with the odd values.
16555   // To achieve #2, with need to place the odd values at an even position.
16556   //
16557   // Place the odd value at an even position (basically, shift all values 1
16558   // step to the left):
16559   const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1};
16560   // <a|b|c|d> => <b|undef|d|undef>
16561   SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0, Mask);
16562   // <e|f|g|h> => <f|undef|h|undef>
16563   SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1, Mask);
16564
16565   // Emit two multiplies, one for the lower 2 ints and one for the higher 2
16566   // ints.
16567   MVT MulVT = VT == MVT::v4i32 ? MVT::v2i64 : MVT::v4i64;
16568   bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
16569   unsigned Opcode =
16570       (!IsSigned || !Subtarget->hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
16571   // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
16572   // => <2 x i64> <ae|cg>
16573   SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
16574   // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
16575   // => <2 x i64> <bf|dh>
16576   SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1));
16577
16578   // Shuffle it back into the right order.
16579   SDValue Highs, Lows;
16580   if (VT == MVT::v8i32) {
16581     const int HighMask[] = {1, 9, 3, 11, 5, 13, 7, 15};
16582     Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
16583     const int LowMask[] = {0, 8, 2, 10, 4, 12, 6, 14};
16584     Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
16585   } else {
16586     const int HighMask[] = {1, 5, 3, 7};
16587     Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
16588     const int LowMask[] = {0, 4, 2, 6};
16589     Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
16590   }
16591
16592   // If we have a signed multiply but no PMULDQ fix up the high parts of a
16593   // unsigned multiply.
16594   if (IsSigned && !Subtarget->hasSSE41()) {
16595     SDValue ShAmt =
16596         DAG.getConstant(31, dl,
16597                         DAG.getTargetLoweringInfo().getShiftAmountTy(VT));
16598     SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
16599                              DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
16600     SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
16601                              DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0);
16602
16603     SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
16604     Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
16605   }
16606
16607   // The first result of MUL_LOHI is actually the low value, followed by the
16608   // high value.
16609   SDValue Ops[] = {Lows, Highs};
16610   return DAG.getMergeValues(Ops, dl);
16611 }
16612
16613 // Return true if the requred (according to Opcode) shift-imm form is natively
16614 // supported by the Subtarget
16615 static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget *Subtarget,
16616                                         unsigned Opcode) {
16617   if (VT.getScalarSizeInBits() < 16)
16618     return false;
16619
16620   if (VT.is512BitVector() &&
16621       (VT.getScalarSizeInBits() > 16 || Subtarget->hasBWI()))
16622     return true;
16623
16624   bool LShift = VT.is128BitVector() ||
16625     (VT.is256BitVector() && Subtarget->hasInt256());
16626
16627   bool AShift = LShift && (Subtarget->hasVLX() ||
16628     (VT != MVT::v2i64 && VT != MVT::v4i64));
16629   return (Opcode == ISD::SRA) ? AShift : LShift;
16630 }
16631
16632 // The shift amount is a variable, but it is the same for all vector lanes.
16633 // These instrcutions are defined together with shift-immediate.
16634 static
16635 bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget *Subtarget,
16636                                       unsigned Opcode) {
16637   return SupportedVectorShiftWithImm(VT, Subtarget, Opcode);
16638 }
16639
16640 // Return true if the requred (according to Opcode) variable-shift form is
16641 // natively supported by the Subtarget
16642 static bool SupportedVectorVarShift(MVT VT, const X86Subtarget *Subtarget,
16643                                     unsigned Opcode) {
16644
16645   if (!Subtarget->hasInt256() || VT.getScalarSizeInBits() < 16)
16646     return false;
16647
16648   // vXi16 supported only on AVX-512, BWI
16649   if (VT.getScalarSizeInBits() == 16 && !Subtarget->hasBWI())
16650     return false;
16651
16652   if (VT.is512BitVector() || Subtarget->hasVLX())
16653     return true;
16654
16655   bool LShift = VT.is128BitVector() || VT.is256BitVector();
16656   bool AShift = LShift &&  VT != MVT::v2i64 && VT != MVT::v4i64;
16657   return (Opcode == ISD::SRA) ? AShift : LShift;
16658 }
16659
16660 static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
16661                                          const X86Subtarget *Subtarget) {
16662   MVT VT = Op.getSimpleValueType();
16663   SDLoc dl(Op);
16664   SDValue R = Op.getOperand(0);
16665   SDValue Amt = Op.getOperand(1);
16666
16667   unsigned X86Opc = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
16668     (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
16669
16670   // Optimize shl/srl/sra with constant shift amount.
16671   if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
16672     if (auto *ShiftConst = BVAmt->getConstantSplatNode()) {
16673       uint64_t ShiftAmt = ShiftConst->getZExtValue();
16674
16675       if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
16676         return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
16677
16678       if (VT == MVT::v16i8 || (Subtarget->hasInt256() && VT == MVT::v32i8)) {
16679         unsigned NumElts = VT.getVectorNumElements();
16680         MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
16681
16682         if (Op.getOpcode() == ISD::SHL) {
16683           // Simple i8 add case
16684           if (ShiftAmt == 1)
16685             return DAG.getNode(ISD::ADD, dl, VT, R, R);
16686
16687           // Make a large shift.
16688           SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT,
16689                                                    R, ShiftAmt, DAG);
16690           SHL = DAG.getBitcast(VT, SHL);
16691           // Zero out the rightmost bits.
16692           SmallVector<SDValue, 32> V(
16693               NumElts, DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, MVT::i8));
16694           return DAG.getNode(ISD::AND, dl, VT, SHL,
16695                              DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
16696         }
16697         if (Op.getOpcode() == ISD::SRL) {
16698           // Make a large shift.
16699           SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT,
16700                                                    R, ShiftAmt, DAG);
16701           SRL = DAG.getBitcast(VT, SRL);
16702           // Zero out the leftmost bits.
16703           SmallVector<SDValue, 32> V(
16704               NumElts, DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, MVT::i8));
16705           return DAG.getNode(ISD::AND, dl, VT, SRL,
16706                              DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
16707         }
16708         if (Op.getOpcode() == ISD::SRA) {
16709           if (ShiftAmt == 7) {
16710             // R s>> 7  ===  R s< 0
16711             SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
16712             return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
16713           }
16714
16715           // R s>> a === ((R u>> a) ^ m) - m
16716           SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
16717           SmallVector<SDValue, 32> V(NumElts,
16718                                      DAG.getConstant(128 >> ShiftAmt, dl,
16719                                                      MVT::i8));
16720           SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V);
16721           Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
16722           Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
16723           return Res;
16724         }
16725         llvm_unreachable("Unknown shift opcode.");
16726       }
16727     }
16728   }
16729
16730   // Special case in 32-bit mode, where i64 is expanded into high and low parts.
16731   if (!Subtarget->is64Bit() &&
16732       (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64)) &&
16733       Amt.getOpcode() == ISD::BITCAST &&
16734       Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
16735     Amt = Amt.getOperand(0);
16736     unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
16737                      VT.getVectorNumElements();
16738     unsigned RatioInLog2 = Log2_32_Ceil(Ratio);
16739     uint64_t ShiftAmt = 0;
16740     for (unsigned i = 0; i != Ratio; ++i) {
16741       ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i));
16742       if (!C)
16743         return SDValue();
16744       // 6 == Log2(64)
16745       ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2)));
16746     }
16747     // Check remaining shift amounts.
16748     for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
16749       uint64_t ShAmt = 0;
16750       for (unsigned j = 0; j != Ratio; ++j) {
16751         ConstantSDNode *C =
16752           dyn_cast<ConstantSDNode>(Amt.getOperand(i + j));
16753         if (!C)
16754           return SDValue();
16755         // 6 == Log2(64)
16756         ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2)));
16757       }
16758       if (ShAmt != ShiftAmt)
16759         return SDValue();
16760     }
16761     return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
16762   }
16763
16764   return SDValue();
16765 }
16766
16767 static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
16768                                         const X86Subtarget* Subtarget) {
16769   MVT VT = Op.getSimpleValueType();
16770   SDLoc dl(Op);
16771   SDValue R = Op.getOperand(0);
16772   SDValue Amt = Op.getOperand(1);
16773
16774   unsigned X86OpcI = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
16775     (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
16776
16777   unsigned X86OpcV = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHL :
16778     (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRL : X86ISD::VSRA;
16779
16780   if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode())) {
16781     SDValue BaseShAmt;
16782     EVT EltVT = VT.getVectorElementType();
16783
16784     if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Amt)) {
16785       // Check if this build_vector node is doing a splat.
16786       // If so, then set BaseShAmt equal to the splat value.
16787       BaseShAmt = BV->getSplatValue();
16788       if (BaseShAmt && BaseShAmt.getOpcode() == ISD::UNDEF)
16789         BaseShAmt = SDValue();
16790     } else {
16791       if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR)
16792         Amt = Amt.getOperand(0);
16793
16794       ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt);
16795       if (SVN && SVN->isSplat()) {
16796         unsigned SplatIdx = (unsigned)SVN->getSplatIndex();
16797         SDValue InVec = Amt.getOperand(0);
16798         if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
16799           assert((SplatIdx < InVec.getValueType().getVectorNumElements()) &&
16800                  "Unexpected shuffle index found!");
16801           BaseShAmt = InVec.getOperand(SplatIdx);
16802         } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
16803            if (ConstantSDNode *C =
16804                dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
16805              if (C->getZExtValue() == SplatIdx)
16806                BaseShAmt = InVec.getOperand(1);
16807            }
16808         }
16809
16810         if (!BaseShAmt)
16811           // Avoid introducing an extract element from a shuffle.
16812           BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InVec,
16813                                   DAG.getIntPtrConstant(SplatIdx, dl));
16814       }
16815     }
16816
16817     if (BaseShAmt.getNode()) {
16818       assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
16819       if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
16820         BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
16821       else if (EltVT.bitsLT(MVT::i32))
16822         BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
16823
16824       return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, DAG);
16825     }
16826   }
16827
16828   // Special case in 32-bit mode, where i64 is expanded into high and low parts.
16829   if (!Subtarget->is64Bit() && VT == MVT::v2i64  &&
16830       Amt.getOpcode() == ISD::BITCAST &&
16831       Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
16832     Amt = Amt.getOperand(0);
16833     unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
16834                      VT.getVectorNumElements();
16835     std::vector<SDValue> Vals(Ratio);
16836     for (unsigned i = 0; i != Ratio; ++i)
16837       Vals[i] = Amt.getOperand(i);
16838     for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
16839       for (unsigned j = 0; j != Ratio; ++j)
16840         if (Vals[j] != Amt.getOperand(i + j))
16841           return SDValue();
16842     }
16843     return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1));
16844   }
16845   return SDValue();
16846 }
16847
16848 static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
16849                           SelectionDAG &DAG) {
16850   MVT VT = Op.getSimpleValueType();
16851   SDLoc dl(Op);
16852   SDValue R = Op.getOperand(0);
16853   SDValue Amt = Op.getOperand(1);
16854
16855   assert(VT.isVector() && "Custom lowering only for vector shifts!");
16856   assert(Subtarget->hasSSE2() && "Only custom lower when we have SSE2!");
16857
16858   if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget))
16859     return V;
16860
16861   if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget))
16862       return V;
16863
16864   if (SupportedVectorVarShift(VT, Subtarget, Op.getOpcode()))
16865     return Op;
16866
16867   // 2i64 vector logical shifts can efficiently avoid scalarization - do the
16868   // shifts per-lane and then shuffle the partial results back together.
16869   if (VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) {
16870     // Splat the shift amounts so the scalar shifts above will catch it.
16871     SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
16872     SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
16873     SDValue R0 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt0);
16874     SDValue R1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt1);
16875     return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
16876   }
16877
16878   // If possible, lower this packed shift into a vector multiply instead of
16879   // expanding it into a sequence of scalar shifts.
16880   // Do this only if the vector shift count is a constant build_vector.
16881   if (Op.getOpcode() == ISD::SHL &&
16882       (VT == MVT::v8i16 || VT == MVT::v4i32 ||
16883        (Subtarget->hasInt256() && VT == MVT::v16i16)) &&
16884       ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
16885     SmallVector<SDValue, 8> Elts;
16886     EVT SVT = VT.getScalarType();
16887     unsigned SVTBits = SVT.getSizeInBits();
16888     const APInt &One = APInt(SVTBits, 1);
16889     unsigned NumElems = VT.getVectorNumElements();
16890
16891     for (unsigned i=0; i !=NumElems; ++i) {
16892       SDValue Op = Amt->getOperand(i);
16893       if (Op->getOpcode() == ISD::UNDEF) {
16894         Elts.push_back(Op);
16895         continue;
16896       }
16897
16898       ConstantSDNode *ND = cast<ConstantSDNode>(Op);
16899       const APInt &C = APInt(SVTBits, ND->getAPIntValue().getZExtValue());
16900       uint64_t ShAmt = C.getZExtValue();
16901       if (ShAmt >= SVTBits) {
16902         Elts.push_back(DAG.getUNDEF(SVT));
16903         continue;
16904       }
16905       Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT));
16906     }
16907     SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Elts);
16908     return DAG.getNode(ISD::MUL, dl, VT, R, BV);
16909   }
16910
16911   // Lower SHL with variable shift amount.
16912   if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
16913     Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
16914
16915     Op = DAG.getNode(ISD::ADD, dl, VT, Op,
16916                      DAG.getConstant(0x3f800000U, dl, VT));
16917     Op = DAG.getBitcast(MVT::v4f32, Op);
16918     Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
16919     return DAG.getNode(ISD::MUL, dl, VT, Op, R);
16920   }
16921
16922   // If possible, lower this shift as a sequence of two shifts by
16923   // constant plus a MOVSS/MOVSD instead of scalarizing it.
16924   // Example:
16925   //   (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
16926   //
16927   // Could be rewritten as:
16928   //   (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
16929   //
16930   // The advantage is that the two shifts from the example would be
16931   // lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing
16932   // the vector shift into four scalar shifts plus four pairs of vector
16933   // insert/extract.
16934   if ((VT == MVT::v8i16 || VT == MVT::v4i32) &&
16935       ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
16936     unsigned TargetOpcode = X86ISD::MOVSS;
16937     bool CanBeSimplified;
16938     // The splat value for the first packed shift (the 'X' from the example).
16939     SDValue Amt1 = Amt->getOperand(0);
16940     // The splat value for the second packed shift (the 'Y' from the example).
16941     SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) :
16942                                         Amt->getOperand(2);
16943
16944     // See if it is possible to replace this node with a sequence of
16945     // two shifts followed by a MOVSS/MOVSD
16946     if (VT == MVT::v4i32) {
16947       // Check if it is legal to use a MOVSS.
16948       CanBeSimplified = Amt2 == Amt->getOperand(2) &&
16949                         Amt2 == Amt->getOperand(3);
16950       if (!CanBeSimplified) {
16951         // Otherwise, check if we can still simplify this node using a MOVSD.
16952         CanBeSimplified = Amt1 == Amt->getOperand(1) &&
16953                           Amt->getOperand(2) == Amt->getOperand(3);
16954         TargetOpcode = X86ISD::MOVSD;
16955         Amt2 = Amt->getOperand(2);
16956       }
16957     } else {
16958       // Do similar checks for the case where the machine value type
16959       // is MVT::v8i16.
16960       CanBeSimplified = Amt1 == Amt->getOperand(1);
16961       for (unsigned i=3; i != 8 && CanBeSimplified; ++i)
16962         CanBeSimplified = Amt2 == Amt->getOperand(i);
16963
16964       if (!CanBeSimplified) {
16965         TargetOpcode = X86ISD::MOVSD;
16966         CanBeSimplified = true;
16967         Amt2 = Amt->getOperand(4);
16968         for (unsigned i=0; i != 4 && CanBeSimplified; ++i)
16969           CanBeSimplified = Amt1 == Amt->getOperand(i);
16970         for (unsigned j=4; j != 8 && CanBeSimplified; ++j)
16971           CanBeSimplified = Amt2 == Amt->getOperand(j);
16972       }
16973     }
16974
16975     if (CanBeSimplified && isa<ConstantSDNode>(Amt1) &&
16976         isa<ConstantSDNode>(Amt2)) {
16977       // Replace this node with two shifts followed by a MOVSS/MOVSD.
16978       EVT CastVT = MVT::v4i32;
16979       SDValue Splat1 =
16980         DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), dl, VT);
16981       SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
16982       SDValue Splat2 =
16983         DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), dl, VT);
16984       SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
16985       if (TargetOpcode == X86ISD::MOVSD)
16986         CastVT = MVT::v2i64;
16987       SDValue BitCast1 = DAG.getBitcast(CastVT, Shift1);
16988       SDValue BitCast2 = DAG.getBitcast(CastVT, Shift2);
16989       SDValue Result = getTargetShuffleNode(TargetOpcode, dl, CastVT, BitCast2,
16990                                             BitCast1, DAG);
16991       return DAG.getBitcast(VT, Result);
16992     }
16993   }
16994
16995   if (VT == MVT::v16i8 && Op->getOpcode() == ISD::SHL) {
16996     // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
16997     Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(5, dl, VT));
16998
16999     SDValue VSelM = DAG.getConstant(0x80, dl, VT);
17000     SDValue OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
17001     OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
17002
17003     // r = VSELECT(r, shl(r, 4), a);
17004     SDValue M = DAG.getNode(ISD::SHL, dl, VT, R, DAG.getConstant(4, dl, VT));
17005     R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R);
17006
17007     // a += a
17008     Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);
17009     OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
17010     OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
17011
17012     // r = VSELECT(r, shl(r, 2), a);
17013     M = DAG.getNode(ISD::SHL, dl, VT, R, DAG.getConstant(2, dl, VT));
17014     R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R);
17015
17016     // a += a
17017     Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);
17018     OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
17019     OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
17020
17021     // return VSELECT(r, r+r, a);
17022     R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel,
17023                     DAG.getNode(ISD::ADD, dl, VT, R, R), R);
17024     return R;
17025   }
17026
17027   // It's worth extending once and using the v8i32 shifts for 16-bit types, but
17028   // the extra overheads to get from v16i8 to v8i32 make the existing SSE
17029   // solution better.
17030   if (Subtarget->hasInt256() && VT == MVT::v8i16) {
17031     MVT ExtVT = MVT::v8i32;
17032     unsigned ExtOpc =
17033         Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
17034     R = DAG.getNode(ExtOpc, dl, ExtVT, R);
17035     Amt = DAG.getNode(ISD::ANY_EXTEND, dl, ExtVT, Amt);
17036     return DAG.getNode(ISD::TRUNCATE, dl, VT,
17037                        DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt));
17038   }
17039
17040   if (Subtarget->hasInt256() && VT == MVT::v16i16) {
17041     MVT ExtVT = MVT::v8i32;
17042     SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
17043     SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Amt, Z);
17044     SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Amt, Z);
17045     SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, R, R);
17046     SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, R, R);
17047     ALo = DAG.getBitcast(ExtVT, ALo);
17048     AHi = DAG.getBitcast(ExtVT, AHi);
17049     RLo = DAG.getBitcast(ExtVT, RLo);
17050     RHi = DAG.getBitcast(ExtVT, RHi);
17051     SDValue Lo = DAG.getNode(Op.getOpcode(), dl, ExtVT, RLo, ALo);
17052     SDValue Hi = DAG.getNode(Op.getOpcode(), dl, ExtVT, RHi, AHi);
17053     Lo = DAG.getNode(ISD::SRL, dl, ExtVT, Lo, DAG.getConstant(16, dl, ExtVT));
17054     Hi = DAG.getNode(ISD::SRL, dl, ExtVT, Hi, DAG.getConstant(16, dl, ExtVT));
17055     return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
17056   }
17057
17058   // Decompose 256-bit shifts into smaller 128-bit shifts.
17059   if (VT.is256BitVector()) {
17060     unsigned NumElems = VT.getVectorNumElements();
17061     MVT EltVT = VT.getVectorElementType();
17062     EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
17063
17064     // Extract the two vectors
17065     SDValue V1 = Extract128BitVector(R, 0, DAG, dl);
17066     SDValue V2 = Extract128BitVector(R, NumElems/2, DAG, dl);
17067
17068     // Recreate the shift amount vectors
17069     SDValue Amt1, Amt2;
17070     if (Amt.getOpcode() == ISD::BUILD_VECTOR) {
17071       // Constant shift amount
17072       SmallVector<SDValue, 8> Ops(Amt->op_begin(), Amt->op_begin() + NumElems);
17073       ArrayRef<SDValue> Amt1Csts = makeArrayRef(Ops).slice(0, NumElems / 2);
17074       ArrayRef<SDValue> Amt2Csts = makeArrayRef(Ops).slice(NumElems / 2);
17075
17076       Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt1Csts);
17077       Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt2Csts);
17078     } else {
17079       // Variable shift amount
17080       Amt1 = Extract128BitVector(Amt, 0, DAG, dl);
17081       Amt2 = Extract128BitVector(Amt, NumElems/2, DAG, dl);
17082     }
17083
17084     // Issue new vector shifts for the smaller types
17085     V1 = DAG.getNode(Op.getOpcode(), dl, NewVT, V1, Amt1);
17086     V2 = DAG.getNode(Op.getOpcode(), dl, NewVT, V2, Amt2);
17087
17088     // Concatenate the result back
17089     return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, V1, V2);
17090   }
17091
17092   return SDValue();
17093 }
17094
17095 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
        // Custom lowering for the ISD::{S,U}{ADD,SUB,MUL}O opcodes handled by the
        // switch below. The result is a MERGE_VALUES of (arithmetic result,
        // X86ISD::SETCC reading the flags result of the arithmetic node).
        //
17096   // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
17097   // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
17098   // looks for this combo and may remove the "setcc" instruction if the "setcc"
17099   // has only one use.
17100   SDNode *N = Op.getNode();
17101   SDValue LHS = N->getOperand(0);
17102   SDValue RHS = N->getOperand(1);
        // BaseOp is the X86ISD arithmetic opcode to emit; Cond is the X86
        // condition code the SETCC tests on that node's flags result.
17103   unsigned BaseOp = 0;
17104   unsigned Cond = 0;
17105   SDLoc DL(Op);
17106   switch (Op.getOpcode()) {
17107   default: llvm_unreachable("Unknown ovf instruction!");
17108   case ISD::SADDO:
17109     // An add of one will be selected as an INC. Note that INC doesn't
17110     // set CF, so we can't do this for UADDO.
17111     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
17112       if (C->isOne()) {
17113         BaseOp = X86ISD::INC;
17114         Cond = X86::COND_O;
              // Leaves the switch; the shared tail below builds the nodes.
17115         break;
17116       }
17117     BaseOp = X86ISD::ADD;
17118     Cond = X86::COND_O;
17119     break;
17120   case ISD::UADDO:
17121     BaseOp = X86ISD::ADD;
17122     Cond = X86::COND_B;
17123     break;
17124   case ISD::SSUBO:
17125     // A subtract of one will be selected as a DEC. Note that DEC doesn't
17126     // set CF, so we can't do this for USUBO.
17127     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
17128       if (C->isOne()) {
17129         BaseOp = X86ISD::DEC;
17130         Cond = X86::COND_O;
              // Leaves the switch; the shared tail below builds the nodes.
17131         break;
17132       }
17133     BaseOp = X86ISD::SUB;
17134     Cond = X86::COND_O;
17135     break;
17136   case ISD::USUBO:
17137     BaseOp = X86ISD::SUB;
17138     Cond = X86::COND_B;
17139     break;
17140   case ISD::SMULO:
          // i8 multiplies use a dedicated opcode.
17141     BaseOp = N->getValueType(0) == MVT::i8 ? X86ISD::SMUL8 : X86ISD::SMUL;
17142     Cond = X86::COND_O;
17143     break;
17144   case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
17145     if (N->getValueType(0) == MVT::i8) {
17146       BaseOp = X86ISD::UMUL8;
17147       Cond = X86::COND_O;
17148       break;
17149     }
          // Non-i8 UMUL has three results (two of the value type plus i32 flags),
          // so it cannot share the two-result tail below: the flags are result
          // #2 here, and the SETCC is built with a fixed i8 type.
17150     SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0),
17151                                  MVT::i32);
17152     SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS);
17153
17154     SDValue SetCC =
17155       DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
17156                   DAG.getConstant(X86::COND_O, DL, MVT::i32),
17157                   SDValue(Sum.getNode(), 2));
17158
17159     return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
17160   }
17161   }
17162
        // Shared tail: the arithmetic node yields (value, i32 flags); the SETCC
        // takes the type of the original node's second result and reads flags
        // result #1.
17163   // Also sets EFLAGS.
17164   SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
17165   SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
17166
17167   SDValue SetCC =
17168     DAG.getNode(X86ISD::SETCC, DL, N->getValueType(1),
17169                 DAG.getConstant(Cond, DL, MVT::i32),
17170                 SDValue(Sum.getNode(), 1));
17171
17172   return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
17173 }
17174
17175 /// Returns true if the operand type is exactly twice the native width, and
17176 /// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
17177 /// Used to know whether to use cmpxchg8/16b when expanding atomic operations
17178 /// (otherwise we leave them alone to become __sync_fetch_and_... calls).
17179 bool X86TargetLowering::needsCmpXchgNb(const Type *MemType) const {
17180   unsigned OpWidth = MemType->getPrimitiveSizeInBits();
17181
17182   if (OpWidth == 64)
17183     return !Subtarget->is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b
17184   else if (OpWidth == 128)
17185     return Subtarget->hasCmpxchg16b();
17186   else
17187     return false;
17188 }
17189
17190 bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
17191   return needsCmpXchgNb(SI->getValueOperand()->getType());
17192 }
17193
17194 // Note: this turns large loads into lock cmpxchg8b/16b.
17195 // FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b.
17196 bool X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
17197   auto PTy = cast<PointerType>(LI->getPointerOperand()->getType());
17198   return needsCmpXchgNb(PTy->getElementType());
17199 }
17200
17201 TargetLoweringBase::AtomicRMWExpansionKind
17202 X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
17203   unsigned NativeWidth = Subtarget->is64Bit() ? 64 : 32;
17204   const Type *MemType = AI->getType();
17205
17206   // If the operand is too big, we must see if cmpxchg8/16b is available
17207   // and default to library calls otherwise.
17208   if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
17209     return needsCmpXchgNb(MemType) ? AtomicRMWExpansionKind::CmpXChg
17210                                    : AtomicRMWExpansionKind::None;
17211   }
17212
17213   AtomicRMWInst::BinOp Op = AI->getOperation();
17214   switch (Op) {
17215   default:
17216     llvm_unreachable("Unknown atomic operation");
17217   case AtomicRMWInst::Xchg:
17218   case AtomicRMWInst::Add:
17219   case AtomicRMWInst::Sub:
17220     // It's better to use xadd, xsub or xchg for these in all cases.
17221     return AtomicRMWExpansionKind::None;
17222   case AtomicRMWInst::Or:
17223   case AtomicRMWInst::And:
17224   case AtomicRMWInst::Xor:
17225     // If the atomicrmw's result isn't actually used, we can just add a "lock"
17226     // prefix to a normal instruction for these operations.
17227     return !AI->use_empty() ? AtomicRMWExpansionKind::CmpXChg
17228                             : AtomicRMWExpansionKind::None;
17229   case AtomicRMWInst::Nand:
17230   case AtomicRMWInst::Max:
17231   case AtomicRMWInst::Min:
17232   case AtomicRMWInst::UMax:
17233   case AtomicRMWInst::UMin:
17234     // These always require a non-trivial set of data operations on x86. We must
17235     // use a cmpxchg loop.
17236     return AtomicRMWExpansionKind::CmpXChg;
17237   }
17238 }
17239
17240 static bool hasMFENCE(const X86Subtarget& Subtarget) {
17241   // Use mfence if we have SSE2 or we're on x86-64 (even if we asked for
17242   // no-sse2). There isn't any reason to disable it if the target processor
17243   // supports it.
17244   return Subtarget.hasSSE2() || Subtarget.is64Bit();
17245 }
17246
17247 LoadInst *
17248 X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
17249   unsigned NativeWidth = Subtarget->is64Bit() ? 64 : 32;
17250   const Type *MemType = AI->getType();
17251   // Accesses larger than the native width are turned into cmpxchg/libcalls, so
17252   // there is no benefit in turning such RMWs into loads, and it is actually
17253   // harmful as it introduces a mfence.
17254   if (MemType->getPrimitiveSizeInBits() > NativeWidth)
17255     return nullptr;
17256
17257   auto Builder = IRBuilder<>(AI);
17258   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
17259   auto SynchScope = AI->getSynchScope();
17260   // We must restrict the ordering to avoid generating loads with Release or
17261   // ReleaseAcquire orderings.
17262   auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
17263   auto Ptr = AI->getPointerOperand();
17264
17265   // Before the load we need a fence. Here is an example lifted from
17266   // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
17267   // is required:
17268   // Thread 0:
17269   //   x.store(1, relaxed);
17270   //   r1 = y.fetch_add(0, release);
17271   // Thread 1:
17272   //   y.fetch_add(42, acquire);
17273   //   r2 = x.load(relaxed);
17274   // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
17275   // lowered to just a load without a fence. A mfence flushes the store buffer,
17276   // making the optimization clearly correct.
17277   // FIXME: it is required if isAtLeastRelease(Order) but it is not clear
17278   // otherwise, we might be able to be more agressive on relaxed idempotent
17279   // rmw. In practice, they do not look useful, so we don't try to be
17280   // especially clever.
17281   if (SynchScope == SingleThread)
17282     // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
17283     // the IR level, so we must wrap it in an intrinsic.
17284     return nullptr;
17285
17286   if (!hasMFENCE(*Subtarget))
17287     // FIXME: it might make sense to use a locked operation here but on a
17288     // different cache-line to prevent cache-line bouncing. In practice it
17289     // is probably a small win, and x86 processors without mfence are rare
17290     // enough that we do not bother.
17291     return nullptr;
17292
17293   Function *MFence =
17294       llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
17295   Builder.CreateCall(MFence, {});
17296
17297   // Finally we can emit the atomic load.
17298   LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr,
17299           AI->getType()->getPrimitiveSizeInBits());
17300   Loaded->setAtomic(Order, SynchScope);
17301   AI->replaceAllUsesWith(Loaded);
17302   AI->eraseFromParent();
17303   return Loaded;
17304 }
17305
17306 static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget,
17307                                  SelectionDAG &DAG) {
        // Lowers a fence node. Operand 0 is the chain; operands 1 and 2 are
        // constants holding the AtomicOrdering and SynchronizationScope. The
        // result is a chain-only (MVT::Other) node: MFENCE, a locked OR on the
        // stack, or MEMBARRIER.
17308   SDLoc dl(Op);
17309   AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
17310     cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
17311   SynchronizationScope FenceScope = static_cast<SynchronizationScope>(
17312     cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
17313
17314   // The only fence that needs an instruction is a sequentially-consistent
17315   // cross-thread fence.
17316   if (FenceOrdering == SequentiallyConsistent && FenceScope == CrossThread) {
17317     if (hasMFENCE(*Subtarget))
17318       return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
17319
          // No mfence: emit a locked OR of zero into the memory at [ESP]
          // directly as a machine node. The first five operands form the
          // memory reference; they are followed by the value operand and the
          // chain.
17320     SDValue Chain = Op.getOperand(0);
17321     SDValue Zero = DAG.getConstant(0, dl, MVT::i32);
17322     SDValue Ops[] = {
17323       DAG.getRegister(X86::ESP, MVT::i32),     // Base
17324       DAG.getTargetConstant(1, dl, MVT::i8),   // Scale
17325       DAG.getRegister(0, MVT::i32),            // Index
17326       DAG.getTargetConstant(0, dl, MVT::i32),  // Disp
17327       DAG.getRegister(0, MVT::i32),            // Segment.
17328       Zero,
17329       Chain
17330     };
17331     SDNode *Res = DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops);
17332     return SDValue(Res, 0);
17333   }
17334
17335   // MEMBARRIER is a compiler barrier; it codegens to a no-op.
17336   return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
17337 }
17338
17339 static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget,
17340                              SelectionDAG &DAG) {
        // Lowers an atomic compare-and-swap node (Op is cast to AtomicSDNode
        // below) to X86ISD::LCMPXCHG_DAG. All three results of Op are replaced
        // in place through ReplaceAllUsesOfValueWith, and an empty SDValue is
        // returned.
17341   MVT T = Op.getSimpleValueType();
17342   SDLoc DL(Op);
        // Reg is the accumulator register matching the value type; size is the
        // width of that type in bytes.
17343   unsigned Reg = 0;
17344   unsigned size = 0;
17345   switch(T.SimpleTy) {
17346   default: llvm_unreachable("Invalid value type!");
17347   case MVT::i8:  Reg = X86::AL;  size = 1; break;
17348   case MVT::i16: Reg = X86::AX;  size = 2; break;
17349   case MVT::i32: Reg = X86::EAX; size = 4; break;
17350   case MVT::i64:
17351     assert(Subtarget->is64Bit() && "Node not type legal!");
17352     Reg = X86::RAX; size = 8;
17353     break;
17354   }
        // Copy operand 2 (presumably the expected/compare value -- confirm
        // against the node definition) into the accumulator. The copy is
        // chained on operand 0 and produces glue for the cmpxchg node.
17355   SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
17356                                   Op.getOperand(2), SDValue());
        // Operands: chain, operand 1, operand 3, access size, glue.
17357   SDValue Ops[] = { cpIn.getValue(0),
17358                     Op.getOperand(1),
17359                     Op.getOperand(3),
17360                     DAG.getTargetConstant(size, DL, MVT::i8),
17361                     cpIn.getValue(1) };
17362   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
17363   MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
17364   SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
17365                                            Ops, T, MMO);
17366
        // Read the accumulator back, then EFLAGS, each glued to the previous
        // node so nothing is scheduled between them.
17367   SDValue cpOut =
17368     DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
17369   SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
17370                                       MVT::i32, cpOut.getValue(2));
        // The success result is COND_E evaluated on the copied-out EFLAGS.
17371   SDValue Success = DAG.getNode(X86ISD::SETCC, DL, Op->getValueType(1),
17372                                 DAG.getConstant(X86::COND_E, DL, MVT::i8),
17373                                 EFLAGS);
17374
        // Result 0 -> value read from the accumulator, result 1 -> success
        // flag, result 2 -> chain out of the EFLAGS copy.
17375   DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut);
17376   DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
17377   DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1));
17378   return SDValue();
17379 }
17380
17381 static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget,
17382                             SelectionDAG &DAG) {
        // Custom lowering for BITCAST. Returns a replacement value, Op itself
        // when the conversion is already legal, or an empty SDValue when the
        // conversion has to be expanded.
17383   MVT SrcVT = Op.getOperand(0).getSimpleValueType();
17384   MVT DstVT = Op.getSimpleValueType();
17385
        // 64-bit integer vector sources: only the conversion to f64 is handled
        // here, by widening to a 128-bit vector and extracting the low f64.
17386   if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) {
17387     assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
17388     if (DstVT != MVT::f64)
17389       // This conversion needs to be expanded.
17390       return SDValue();
17391
17392     SDValue InVec = Op->getOperand(0);
17393     SDLoc dl(Op);
17394     unsigned NumElts = SrcVT.getVectorNumElements();
17395     EVT SVT = SrcVT.getVectorElementType();
17396
17397     // Widen the vector in input in the case of MVT::v2i32.
17398     // Example: from MVT::v2i32 to MVT::v4i32.
17399     SmallVector<SDValue, 16> Elts;
17400     for (unsigned i = 0, e = NumElts; i != e; ++i)
17401       Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, InVec,
17402                                  DAG.getIntPtrConstant(i, dl)));
17403
17404     // Explicitly mark the extra elements as Undef.
17405     Elts.append(NumElts, DAG.getUNDEF(SVT));
17406
          // Build the vector with twice the element count, reinterpret it as
          // v2f64 and take element 0, which holds the original 64 bits.
17407     EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
17408     SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Elts);
17409     SDValue ToV2F64 = DAG.getBitcast(MVT::v2f64, BV);
17410     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64,
17411                        DAG.getIntPtrConstant(0, dl));
17412   }
17413
        // Every other custom BITCAST is expected only on 64-bit subtargets
        // with MMX but without SSE2, and must produce an i64 or a 64-bit
        // vector.
17414   assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() &&
17415          Subtarget->hasMMX() && "Unexpected custom BITCAST");
17416   assert((DstVT == MVT::i64 ||
17417           (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
17418          "Unexpected custom BITCAST");
17419   // i64 <=> MMX conversions are Legal.
17420   if (SrcVT==MVT::i64 && DstVT.isVector())
17421     return Op;
17422   if (DstVT==MVT::i64 && SrcVT.isVector())
17423     return Op;
17424   // MMX <=> MMX conversions are Legal.
17425   if (SrcVT.isVector() && DstVT.isVector())
17426     return Op;
17427   // All other conversions need to be expanded.
17428   return SDValue();
17429 }
17430
17431 /// Compute the horizontal sum of bytes in V for the elements of VT.
17432 ///
17433 /// Requires V to be a byte vector and VT to be an integer vector type with
17434 /// wider elements than V's type. The width of the elements of VT determines
17435 /// how many bytes of V are summed horizontally to produce each element of the
17436 /// result.
      ///
      /// V and VT must have the same total size in bits (asserted below). The
      /// returned value has type VT. Element types i64, i32 and i16 are handled;
      /// anything else trips the final assertion.
17437 static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
17438                                       const X86Subtarget *Subtarget,
17439                                       SelectionDAG &DAG) {
17440   SDLoc DL(V);
17441   MVT ByteVecVT = V.getSimpleValueType();
17442   MVT EltVT = VT.getVectorElementType();
17443   int NumElts = VT.getVectorNumElements();
17444   assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
17445          "Expected value to have byte element type.");
17446   assert(EltVT != MVT::i8 &&
17447          "Horizontal byte sum only makes sense for wider elements!");
17448   unsigned VecSize = VT.getSizeInBits();
17449   assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
17450
17451   // The PSADBW instruction horizontally adds all bytes and leaves the result
17452   // in i64 chunks, thus directly computes the pop count for v2i64 and v4i64.
17453   if (EltVT == MVT::i64) {
17454     SDValue Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
17455     V = DAG.getNode(X86ISD::PSADBW, DL, ByteVecVT, V, Zeros);
17456     return DAG.getBitcast(VT, V);
17457   }
17458
17459   if (EltVT == MVT::i32) {
17460     // We unpack the low half and high half into i32s interleaved with zeros so
17461     // that we can use PSADBW to horizontally sum them. The most useful part of
17462     // this is that it lines up the results of two PSADBW instructions to be
17463     // two v2i64 vectors which concatenated are the 4 population counts. We can
17464     // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
17465     SDValue Zeros = getZeroVector(VT, Subtarget, DAG, DL);
17466     SDValue Low = DAG.getNode(X86ISD::UNPCKL, DL, VT, V, Zeros);
17467     SDValue High = DAG.getNode(X86ISD::UNPCKH, DL, VT, V, Zeros);
17468
17469     // Do the horizontal sums into two v2i64s.
17470     Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
17471     Low = DAG.getNode(X86ISD::PSADBW, DL, ByteVecVT,
17472                       DAG.getBitcast(ByteVecVT, Low), Zeros);
17473     High = DAG.getNode(X86ISD::PSADBW, DL, ByteVecVT,
17474                        DAG.getBitcast(ByteVecVT, High), Zeros);
17475
17476     // Merge them together.
          // The pack operates on the sums viewed as i16 vectors of the same
          // total size.
17477     MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
17478     V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
17479                     DAG.getBitcast(ShortVecVT, Low),
17480                     DAG.getBitcast(ShortVecVT, High));
17481
17482     return DAG.getBitcast(VT, V);
17483   }
17484
17485   // The only element type left is i16.
17486   assert(EltVT == MVT::i16 && "Unknown how to handle type");
17487
17488   // To obtain pop count for each i16 element starting from the pop count for
17489   // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
17490   // right by 8. It is important to shift as i16s as i8 vector shift isn't
17491   // directly supported.
        // Shifter is a splat of 8 in VT, used for both the left and right shift.
17492   SmallVector<SDValue, 16> Shifters(NumElts, DAG.getConstant(8, DL, EltVT));
17493   SDValue Shifter = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Shifters);
17494   SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), Shifter);
17495   V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
17496                   DAG.getBitcast(ByteVecVT, V));
17497   return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), Shifter);
17498 }
17499
17500 static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, SDLoc DL,
17501                                         const X86Subtarget *Subtarget,
17502                                         SelectionDAG &DAG) {
        // Computes the per-element population count of the vector Op using two
        // PSHUFB lookups into a 16-entry nibble pop-count table. Op is the
        // vector to count (not the CTPOP node); the result has Op's type.
17503   MVT VT = Op.getSimpleValueType();
17504   MVT EltVT = VT.getVectorElementType();
17505   unsigned VecSize = VT.getSizeInBits();
17506
17507   // Implement a lookup table in register by using an algorithm based on:
17508   // http://wm.ite.pl/articles/sse-popcount.html
17509   //
17510   // The general idea is that every lower byte nibble in the input vector is an
17511   // index into a in-register pre-computed pop count table. We then split up the
17512   // input vector in two new ones: (1) a vector with only the shifted-right
17513   // higher nibbles for each byte and (2) a vector with the lower nibbles (and
17514   // masked out higher ones) for each byte. PSHUFB is used separately with both
17515   // to index the in-register table. Next, both are added and the result is a
17516   // i8 vector where each element contains the pop count for input byte.
17517   //
17518   // To obtain the pop count for elements != i8, we follow up with the same
17519   // approach and use additional tricks as described below.
17520   //
        // LUT[n] is the number of set bits in the 4-bit value n.
17521   const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
17522                        /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
17523                        /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
17524                        /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
17525
17526   int NumByteElts = VecSize / 8;
17527   MVT ByteVecVT = MVT::getVectorVT(MVT::i8, NumByteElts);
17528   SDValue In = DAG.getBitcast(ByteVecVT, Op);
        // Materialize the table as a byte vector, repeating it every 16 bytes so
        // vectors wider than 16 bytes get a full copy in each 16-byte chunk.
17529   SmallVector<SDValue, 16> LUTVec;
17530   for (int i = 0; i < NumByteElts; ++i)
17531     LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
17532   SDValue InRegLUT = DAG.getNode(ISD::BUILD_VECTOR, DL, ByteVecVT, LUTVec);
17533   SmallVector<SDValue, 16> Mask0F(NumByteElts,
17534                                   DAG.getConstant(0x0F, DL, MVT::i8));
17535   SDValue M0F = DAG.getNode(ISD::BUILD_VECTOR, DL, ByteVecVT, Mask0F);
17536
17537   // High nibbles
17538   SmallVector<SDValue, 16> Four(NumByteElts, DAG.getConstant(4, DL, MVT::i8));
17539   SDValue FourV = DAG.getNode(ISD::BUILD_VECTOR, DL, ByteVecVT, Four);
17540   SDValue HighNibbles = DAG.getNode(ISD::SRL, DL, ByteVecVT, In, FourV);
17541
17542   // Low nibbles
17543   SDValue LowNibbles = DAG.getNode(ISD::AND, DL, ByteVecVT, In, M0F);
17544
17545   // The input vector is used as the shuffle mask that index elements into the
17546   // LUT. After counting low and high nibbles, add the vector to obtain the
17547   // final pop count per i8 element.
17548   SDValue HighPopCnt =
17549       DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, HighNibbles);
17550   SDValue LowPopCnt =
17551       DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, LowNibbles);
17552   SDValue PopCnt = DAG.getNode(ISD::ADD, DL, ByteVecVT, HighPopCnt, LowPopCnt);
17553
17554   if (EltVT == MVT::i8)
17555     return PopCnt;
17556
        // Wider elements: sum the per-byte counts that belong to each element.
17557   return LowerHorizontalByteSum(PopCnt, VT, Subtarget, DAG);
17558 }
17559
17560 static SDValue LowerVectorCTPOPBitmath(SDValue Op, SDLoc DL,
17561                                        const X86Subtarget *Subtarget,
17562                                        SelectionDAG &DAG) {
        // Computes the per-element population count of the 128-bit vector Op
        // using shifts, masks, adds and a subtract only. Op is the vector to
        // count (not the CTPOP node); the result has Op's type.
17563   MVT VT = Op.getSimpleValueType();
17564   assert(VT.is128BitVector() &&
17565          "Only 128-bit vector bitmath lowering supported.");
17566
17567   int VecSize = VT.getSizeInBits();
17568   MVT EltVT = VT.getVectorElementType();
17569   int Len = EltVT.getSizeInBits();
17570
17571   // This is the vectorized version of the "best" algorithm from
17572   // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
17573   // with a minor tweak to use a series of adds + shifts instead of vector
17574   // multiplications. Implemented for all integer vector types. We only use
17575   // this when we don't have SSSE3 which allows a LUT-based lowering that is
17576   // much faster, even faster than using native popcnt instructions.
17577
        // GetShift: apply OpCode to V with a splat of Shifter, in V's own type.
17578   auto GetShift = [&](unsigned OpCode, SDValue V, int Shifter) {
17579     MVT VT = V.getSimpleValueType();
17580     SmallVector<SDValue, 32> Shifters(
17581         VT.getVectorNumElements(),
17582         DAG.getConstant(Shifter, DL, VT.getVectorElementType()));
17583     return DAG.getNode(OpCode, DL, VT, V,
17584                        DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Shifters));
17585   };
        // GetMask: AND V with a splat of Mask, in V's own type.
17586   auto GetMask = [&](SDValue V, APInt Mask) {
17587     MVT VT = V.getSimpleValueType();
17588     SmallVector<SDValue, 32> Masks(
17589         VT.getVectorNumElements(),
17590         DAG.getConstant(Mask, DL, VT.getVectorElementType()));
17591     return DAG.getNode(ISD::AND, DL, VT, V,
17592                        DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Masks));
17593   };
17594
17595   // We don't want to incur the implicit masks required to SRL vNi8 vectors on
17596   // x86, so set the SRL type to have elements at least i16 wide. This is
17597   // correct because all of our SRLs are followed immediately by a mask anyways
17598   // that handles any bits that sneak into the high bits of the byte elements.
17599   MVT SrlVT = Len > 8 ? VT : MVT::getVectorVT(MVT::i16, VecSize / 16);
17600
17601   SDValue V = Op;
17602
        // Each mask below is the given byte pattern splatted to the element
        // width Len.
17603   // v = v - ((v >> 1) & 0x55555555...)
17604   SDValue Srl =
17605       DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 1));
17606   SDValue And = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x55)));
17607   V = DAG.getNode(ISD::SUB, DL, VT, V, And);
17608
17609   // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
17610   SDValue AndLHS = GetMask(V, APInt::getSplat(Len, APInt(8, 0x33)));
17611   Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 2));
17612   SDValue AndRHS = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x33)));
17613   V = DAG.getNode(ISD::ADD, DL, VT, AndLHS, AndRHS);
17614
17615   // v = (v + (v >> 4)) & 0x0F0F0F0F...
17616   Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 4));
17617   SDValue Add = DAG.getNode(ISD::ADD, DL, VT, V, Srl);
17618   V = GetMask(Add, APInt::getSplat(Len, APInt(8, 0x0F)));
17619
17620   // At this point, V contains the byte-wise population count, and we are
17621   // merely doing a horizontal sum if necessary to get the wider element
17622   // counts.
17623   if (EltVT == MVT::i8)
17624     return V;
17625
17626   return LowerHorizontalByteSum(
17627       DAG.getBitcast(MVT::getVectorVT(MVT::i8, VecSize / 8), V), VT, Subtarget,
17628       DAG);
17629 }
17630
17631 static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget *Subtarget,
17632                                 SelectionDAG &DAG) {
17633   MVT VT = Op.getSimpleValueType();
17634   // FIXME: Need to add AVX-512 support here!
17635   assert((VT.is256BitVector() || VT.is128BitVector()) &&
17636          "Unknown CTPOP type to handle");
17637   SDLoc DL(Op.getNode());
17638   SDValue Op0 = Op.getOperand(0);
17639
17640   if (!Subtarget->hasSSSE3()) {
17641     // We can't use the fast LUT approach, so fall back on vectorized bitmath.
17642     assert(VT.is128BitVector() && "Only 128-bit vectors supported in SSE!");
17643     return LowerVectorCTPOPBitmath(Op0, DL, Subtarget, DAG);
17644   }
17645
17646   if (VT.is256BitVector() && !Subtarget->hasInt256()) {
17647     unsigned NumElems = VT.getVectorNumElements();
17648
17649     // Extract each 128-bit vector, compute pop count and concat the result.
17650     SDValue LHS = Extract128BitVector(Op0, 0, DAG, DL);
17651     SDValue RHS = Extract128BitVector(Op0, NumElems/2, DAG, DL);
17652
17653     return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
17654                        LowerVectorCTPOPInRegLUT(LHS, DL, Subtarget, DAG),
17655                        LowerVectorCTPOPInRegLUT(RHS, DL, Subtarget, DAG));
17656   }
17657
17658   return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
17659 }
17660
17661 static SDValue LowerCTPOP(SDValue Op, const X86Subtarget *Subtarget,
17662                           SelectionDAG &DAG) {
17663   assert(Op.getValueType().isVector() &&
17664          "We only do custom lowering for vector population count.");
17665   return LowerVectorCTPOP(Op, Subtarget, DAG);
17666 }
17667
17668 static SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) {
17669   SDNode *Node = Op.getNode();
17670   SDLoc dl(Node);
17671   EVT T = Node->getValueType(0);
17672   SDValue negOp = DAG.getNode(ISD::SUB, dl, T,
17673                               DAG.getConstant(0, dl, T), Node->getOperand(2));
17674   return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl,
17675                        cast<AtomicSDNode>(Node)->getMemoryVT(),
17676                        Node->getOperand(0),
17677                        Node->getOperand(1), negOp,
17678                        cast<AtomicSDNode>(Node)->getMemOperand(),
17679                        cast<AtomicSDNode>(Node)->getOrdering(),
17680                        cast<AtomicSDNode>(Node)->getSynchScope());
17681 }
17682
17683 static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
17684   SDNode *Node = Op.getNode();
17685   SDLoc dl(Node);
17686   EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
17687
17688   // Convert seq_cst store -> xchg
17689   // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
17690   // FIXME: On 32-bit, store -> fist or movq would be more efficient
17691   //        (The only way to get a 16-byte store is cmpxchg16b)
17692   // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
17693   if (cast<AtomicSDNode>(Node)->getOrdering() == SequentiallyConsistent ||
17694       !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
17695     SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
17696                                  cast<AtomicSDNode>(Node)->getMemoryVT(),
17697                                  Node->getOperand(0),
17698                                  Node->getOperand(1), Node->getOperand(2),
17699                                  cast<AtomicSDNode>(Node)->getMemOperand(),
17700                                  cast<AtomicSDNode>(Node)->getOrdering(),
17701                                  cast<AtomicSDNode>(Node)->getSynchScope());
17702     return Swap.getValue(1);
17703   }
17704   // Other atomic stores have a simple pattern.
17705   return Op;
17706 }
17707
17708 static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
17709   EVT VT = Op.getNode()->getSimpleValueType(0);
17710
17711   // Let legalize expand this if it isn't a legal type yet.
17712   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
17713     return SDValue();
17714
17715   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
17716
17717   unsigned Opc;
17718   bool ExtraOp = false;
17719   switch (Op.getOpcode()) {
17720   default: llvm_unreachable("Invalid code");
17721   case ISD::ADDC: Opc = X86ISD::ADD; break;
17722   case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break;
17723   case ISD::SUBC: Opc = X86ISD::SUB; break;
17724   case ISD::SUBE: Opc = X86ISD::SBB; ExtraOp = true; break;
17725   }
17726
17727   if (!ExtraOp)
17728     return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
17729                        Op.getOperand(1));
17730   return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
17731                      Op.getOperand(1), Op.getOperand(2));
17732 }
17733
17734 static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget *Subtarget,
17735                             SelectionDAG &DAG) {
        // Lowers a sincos node into a call to __sincos_stret (f64) or
        // __sincosf_stret (any other argument type). Only 64-bit Darwin
        // subtargets are expected here.
17736   assert(Subtarget->isTargetDarwin() && Subtarget->is64Bit());
17737
17738   // For MacOSX, we want to call an alternative entry point: __sincos_stret,
17739   // which returns the values as { float, float } (in XMM0) or
17740   // { double, double } (which is returned in XMM0, XMM1).
17741   SDLoc dl(Op);
17742   SDValue Arg = Op.getOperand(0);
17743   EVT ArgVT = Arg.getValueType();
17744   Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
17745
17746   TargetLowering::ArgListTy Args;
17747   TargetLowering::ArgListEntry Entry;
17748
        // The call takes a single argument, passed without sign/zero extension.
17749   Entry.Node = Arg;
17750   Entry.Ty = ArgTy;
17751   Entry.isSExt = false;
17752   Entry.isZExt = false;
17753   Args.push_back(Entry);
17754
17755   bool isF64 = ArgVT == MVT::f64;
17756   // Only optimize x86_64 for now. i386 is a bit messy. For f32,
17757   // the small struct {f32, f32} is returned in (eax, edx). For f64,
17758   // the results are returned via SRet in memory.
17759   const char *LibcallName =  isF64 ? "__sincos_stret" : "__sincosf_stret";
17760   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
17761   SDValue Callee = DAG.getExternalSymbol(LibcallName, TLI.getPointerTy());
17762
        // The f64 result is modeled as a two-element struct; otherwise it is
        // modeled as a 4-element vector of the argument type, from which
        // elements 0 and 1 are extracted below.
17763   Type *RetTy = isF64
17764     ? (Type*)StructType::get(ArgTy, ArgTy, nullptr)
17765     : (Type*)VectorType::get(ArgTy, 4);
17766
        // The call is chained on the DAG entry node.
17767   TargetLowering::CallLoweringInfo CLI(DAG);
17768   CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
17769     .setCallee(CallingConv::C, RetTy, Callee, std::move(Args), 0);
17770
17771   std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
17772
17773   if (isF64)
17774     // Returned in xmm0 and xmm1.
17775     return CallResult.first;
17776
17777   // Returned in bits 0:31 and 32:63 of xmm0.
17778   SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
17779                                CallResult.first, DAG.getIntPtrConstant(0, dl));
17780   SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
17781                                CallResult.first, DAG.getIntPtrConstant(1, dl));
17782   SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
17783   return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
17784 }
17785
17786 static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget *Subtarget,
17787                              SelectionDAG &DAG) {
17788   assert(Subtarget->hasAVX512() &&
17789          "MGATHER/MSCATTER are supported on AVX-512 arch only");
17790
17791   MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
17792   EVT VT = N->getValue().getValueType();
17793   assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
17794   SDLoc dl(Op);
17795
17796   // X86 scatter kills mask register, so its type should be added to
17797   // the list of return values
17798   if (N->getNumValues() == 1) {
17799     SDValue Index = N->getIndex();
17800     if (!Subtarget->hasVLX() && !VT.is512BitVector() &&
17801         !Index.getValueType().is512BitVector())
17802       Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
17803
17804     SDVTList VTs = DAG.getVTList(N->getMask().getValueType(), MVT::Other);
17805     SDValue Ops[] = { N->getOperand(0), N->getOperand(1),  N->getOperand(2),
17806                       N->getOperand(3), Index };
17807
17808     SDValue NewScatter = DAG.getMaskedScatter(VTs, VT, dl, Ops, N->getMemOperand());
17809     DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
17810     return SDValue(NewScatter.getNode(), 0);
17811   }
17812   return Op;
17813 }
17814
17815 static SDValue LowerMGATHER(SDValue Op, const X86Subtarget *Subtarget,
17816                             SelectionDAG &DAG) {
17817   assert(Subtarget->hasAVX512() &&
17818          "MGATHER/MSCATTER are supported on AVX-512 arch only");
17819
17820   MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
17821   EVT VT = Op.getValueType();
17822   assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
17823   SDLoc dl(Op);
17824
17825   SDValue Index = N->getIndex();
17826   if (!Subtarget->hasVLX() && !VT.is512BitVector() &&
17827       !Index.getValueType().is512BitVector()) {
17828     Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
17829     SDValue Ops[] = { N->getOperand(0), N->getOperand(1),  N->getOperand(2),
17830                       N->getOperand(3), Index };
17831     DAG.UpdateNodeOperands(N, Ops);
17832   }
17833   return Op;
17834 }
17835
17836 SDValue X86TargetLowering::LowerGC_TRANSITION_START(SDValue Op,
17837                                                     SelectionDAG &DAG) const {
17838   // TODO: Eventually, the lowering of these nodes should be informed by or
17839   // deferred to the GC strategy for the function in which they appear. For
17840   // now, however, they must be lowered to something. Since they are logically
17841   // no-ops in the case of a null GC strategy (or a GC strategy which does not
17842   // require special handling for these nodes), lower them as literal NOOPs for
17843   // the time being.
17844   SmallVector<SDValue, 2> Ops;
17845
17846   Ops.push_back(Op.getOperand(0));
17847   if (Op->getGluedNode())
17848     Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
17849
17850   SDLoc OpDL(Op);
17851   SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
17852   SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
17853
17854   return NOOP;
17855 }
17856
17857 SDValue X86TargetLowering::LowerGC_TRANSITION_END(SDValue Op,
17858                                                   SelectionDAG &DAG) const {
17859   // TODO: Eventually, the lowering of these nodes should be informed by or
17860   // deferred to the GC strategy for the function in which they appear. For
17861   // now, however, they must be lowered to something. Since they are logically
17862   // no-ops in the case of a null GC strategy (or a GC strategy which does not
17863   // require special handling for these nodes), lower them as literal NOOPs for
17864   // the time being.
17865   SmallVector<SDValue, 2> Ops;
17866
17867   Ops.push_back(Op.getOperand(0));
17868   if (Op->getGluedNode())
17869     Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
17870
17871   SDLoc OpDL(Op);
17872   SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
17873   SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
17874
17875   return NOOP;
17876 }
17877
17878 /// LowerOperation - Provide custom lowering hooks for some operations.
17879 ///
17880 SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
17881   switch (Op.getOpcode()) {
17882   default: llvm_unreachable("Should not custom lower this!");
17883   case ISD::ATOMIC_FENCE:       return LowerATOMIC_FENCE(Op, Subtarget, DAG);
17884   case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
17885     return LowerCMP_SWAP(Op, Subtarget, DAG);
17886   case ISD::CTPOP:              return LowerCTPOP(Op, Subtarget, DAG);
17887   case ISD::ATOMIC_LOAD_SUB:    return LowerLOAD_SUB(Op,DAG);
17888   case ISD::ATOMIC_STORE:       return LowerATOMIC_STORE(Op,DAG);
17889   case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
17890   case ISD::CONCAT_VECTORS:     return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
17891   case ISD::VECTOR_SHUFFLE:     return lowerVectorShuffle(Op, Subtarget, DAG);
17892   case ISD::VSELECT:            return LowerVSELECT(Op, DAG);
17893   case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
17894   case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
17895   case ISD::EXTRACT_SUBVECTOR:  return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
17896   case ISD::INSERT_SUBVECTOR:   return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
17897   case ISD::SCALAR_TO_VECTOR:   return LowerSCALAR_TO_VECTOR(Op, DAG);
17898   case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
17899   case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
17900   case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
17901   case ISD::ExternalSymbol:     return LowerExternalSymbol(Op, DAG);
17902   case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
17903   case ISD::SHL_PARTS:
17904   case ISD::SRA_PARTS:
17905   case ISD::SRL_PARTS:          return LowerShiftParts(Op, DAG);
17906   case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG);
17907   case ISD::UINT_TO_FP:         return LowerUINT_TO_FP(Op, DAG);
17908   case ISD::TRUNCATE:           return LowerTRUNCATE(Op, DAG);
17909   case ISD::ZERO_EXTEND:        return LowerZERO_EXTEND(Op, Subtarget, DAG);
17910   case ISD::SIGN_EXTEND:        return LowerSIGN_EXTEND(Op, Subtarget, DAG);
17911   case ISD::ANY_EXTEND:         return LowerANY_EXTEND(Op, Subtarget, DAG);
17912   case ISD::SIGN_EXTEND_VECTOR_INREG:
17913     return LowerSIGN_EXTEND_VECTOR_INREG(Op, Subtarget, DAG);
17914   case ISD::FP_TO_SINT:         return LowerFP_TO_SINT(Op, DAG);
17915   case ISD::FP_TO_UINT:         return LowerFP_TO_UINT(Op, DAG);
17916   case ISD::FP_EXTEND:          return LowerFP_EXTEND(Op, DAG);
17917   case ISD::LOAD:               return LowerExtendedLoad(Op, Subtarget, DAG);
17918   case ISD::FABS:
17919   case ISD::FNEG:               return LowerFABSorFNEG(Op, DAG);
17920   case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
17921   case ISD::FGETSIGN:           return LowerFGETSIGN(Op, DAG);
17922   case ISD::SETCC:              return LowerSETCC(Op, DAG);
17923   case ISD::SELECT:             return LowerSELECT(Op, DAG);
17924   case ISD::BRCOND:             return LowerBRCOND(Op, DAG);
17925   case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
17926   case ISD::VASTART:            return LowerVASTART(Op, DAG);
17927   case ISD::VAARG:              return LowerVAARG(Op, DAG);
17928   case ISD::VACOPY:             return LowerVACOPY(Op, Subtarget, DAG);
17929   case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, Subtarget, DAG);
17930   case ISD::INTRINSIC_VOID:
17931   case ISD::INTRINSIC_W_CHAIN:  return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
17932   case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
17933   case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
17934   case ISD::FRAME_TO_ARGS_OFFSET:
17935                                 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
17936   case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
17937   case ISD::EH_RETURN:          return LowerEH_RETURN(Op, DAG);
17938   case ISD::EH_SJLJ_SETJMP:     return lowerEH_SJLJ_SETJMP(Op, DAG);
17939   case ISD::EH_SJLJ_LONGJMP:    return lowerEH_SJLJ_LONGJMP(Op, DAG);
17940   case ISD::INIT_TRAMPOLINE:    return LowerINIT_TRAMPOLINE(Op, DAG);
17941   case ISD::ADJUST_TRAMPOLINE:  return LowerADJUST_TRAMPOLINE(Op, DAG);
17942   case ISD::FLT_ROUNDS_:        return LowerFLT_ROUNDS_(Op, DAG);
17943   case ISD::CTLZ:               return LowerCTLZ(Op, DAG);
17944   case ISD::CTLZ_ZERO_UNDEF:    return LowerCTLZ_ZERO_UNDEF(Op, DAG);
17945   case ISD::CTTZ:               return LowerCTTZ(Op, DAG);
17946   case ISD::MUL:                return LowerMUL(Op, Subtarget, DAG);
17947   case ISD::UMUL_LOHI:
17948   case ISD::SMUL_LOHI:          return LowerMUL_LOHI(Op, Subtarget, DAG);
17949   case ISD::SRA:
17950   case ISD::SRL:
17951   case ISD::SHL:                return LowerShift(Op, Subtarget, DAG);
17952   case ISD::SADDO:
17953   case ISD::UADDO:
17954   case ISD::SSUBO:
17955   case ISD::USUBO:
17956   case ISD::SMULO:
17957   case ISD::UMULO:              return LowerXALUO(Op, DAG);
17958   case ISD::READCYCLECOUNTER:   return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
17959   case ISD::BITCAST:            return LowerBITCAST(Op, Subtarget, DAG);
17960   case ISD::ADDC:
17961   case ISD::ADDE:
17962   case ISD::SUBC:
17963   case ISD::SUBE:               return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
17964   case ISD::ADD:                return LowerADD(Op, DAG);
17965   case ISD::SUB:                return LowerSUB(Op, DAG);
17966   case ISD::FSINCOS:            return LowerFSINCOS(Op, Subtarget, DAG);
17967   case ISD::MGATHER:            return LowerMGATHER(Op, Subtarget, DAG);
17968   case ISD::MSCATTER:           return LowerMSCATTER(Op, Subtarget, DAG);
17969   case ISD::GC_TRANSITION_START:
17970                                 return LowerGC_TRANSITION_START(Op, DAG);
17971   case ISD::GC_TRANSITION_END:  return LowerGC_TRANSITION_END(Op, DAG);
17972   }
17973 }
17974
17975 /// ReplaceNodeResults - Replace a node with an illegal result type
17976 /// with a new node built out of custom code.
17977 void X86TargetLowering::ReplaceNodeResults(SDNode *N,
17978                                            SmallVectorImpl<SDValue>&Results,
17979                                            SelectionDAG &DAG) const {
17980   SDLoc dl(N);
17981   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
17982   switch (N->getOpcode()) {
17983   default:
17984     llvm_unreachable("Do not know how to custom type legalize this operation!");
17985   // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
17986   case X86ISD::FMINC:
17987   case X86ISD::FMIN:
17988   case X86ISD::FMAXC:
17989   case X86ISD::FMAX: {
17990     EVT VT = N->getValueType(0);
17991     if (VT != MVT::v2f32)
17992       llvm_unreachable("Unexpected type (!= v2f32) on FMIN/FMAX.");
17993     SDValue UNDEF = DAG.getUNDEF(VT);
17994     SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
17995                               N->getOperand(0), UNDEF);
17996     SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
17997                               N->getOperand(1), UNDEF);
17998     Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
17999     return;
18000   }
18001   case ISD::SIGN_EXTEND_INREG:
18002   case ISD::ADDC:
18003   case ISD::ADDE:
18004   case ISD::SUBC:
18005   case ISD::SUBE:
18006     // We don't want to expand or promote these.
18007     return;
18008   case ISD::SDIV:
18009   case ISD::UDIV:
18010   case ISD::SREM:
18011   case ISD::UREM:
18012   case ISD::SDIVREM:
18013   case ISD::UDIVREM: {
18014     SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
18015     Results.push_back(V);
18016     return;
18017   }
18018   case ISD::FP_TO_SINT:
18019     // FP_TO_INT*_IN_MEM is not legal for f16 inputs.  Do not convert
18020     // (FP_TO_SINT (load f16)) to FP_TO_INT*.
18021     if (N->getOperand(0).getValueType() == MVT::f16)
18022       break;
18023     // fallthrough
18024   case ISD::FP_TO_UINT: {
18025     bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
18026
18027     if (!IsSigned && !isIntegerTypeFTOL(SDValue(N, 0).getValueType()))
18028       return;
18029
18030     std::pair<SDValue,SDValue> Vals =
18031         FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
18032     SDValue FIST = Vals.first, StackSlot = Vals.second;
18033     if (FIST.getNode()) {
18034       EVT VT = N->getValueType(0);
18035       // Return a load from the stack slot.
18036       if (StackSlot.getNode())
18037         Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot,
18038                                       MachinePointerInfo(),
18039                                       false, false, false, 0));
18040       else
18041         Results.push_back(FIST);
18042     }
18043     return;
18044   }
18045   case ISD::UINT_TO_FP: {
18046     assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
18047     if (N->getOperand(0).getValueType() != MVT::v2i32 ||
18048         N->getValueType(0) != MVT::v2f32)
18049       return;
18050     SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64,
18051                                  N->getOperand(0));
18052     SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,
18053                                      MVT::f64);
18054     SDValue VBias = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2f64, Bias, Bias);
18055     SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
18056                              DAG.getBitcast(MVT::v2i64, VBias));
18057     Or = DAG.getBitcast(MVT::v2f64, Or);
18058     SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
18059     Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
18060     return;
18061   }
18062   case ISD::FP_ROUND: {
18063     if (!TLI.isTypeLegal(N->getOperand(0).getValueType()))
18064         return;
18065     SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
18066     Results.push_back(V);
18067     return;
18068   }
18069   case ISD::FP_EXTEND: {
18070     // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
18071     // No other ValueType for FP_EXTEND should reach this point.
18072     assert(N->getValueType(0) == MVT::v2f32 &&
18073            "Do not know how to legalize this Node");
18074     return;
18075   }
18076   case ISD::INTRINSIC_W_CHAIN: {
18077     unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
18078     switch (IntNo) {
18079     default : llvm_unreachable("Do not know how to custom type "
18080                                "legalize this intrinsic operation!");
18081     case Intrinsic::x86_rdtsc:
18082       return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
18083                                      Results);
18084     case Intrinsic::x86_rdtscp:
18085       return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget,
18086                                      Results);
18087     case Intrinsic::x86_rdpmc:
18088       return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results);
18089     }
18090   }
18091   case ISD::READCYCLECOUNTER: {
18092     return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
18093                                    Results);
18094   }
18095   case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
18096     EVT T = N->getValueType(0);
18097     assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
18098     bool Regs64bit = T == MVT::i128;
18099     EVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
18100     SDValue cpInL, cpInH;
18101     cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
18102                         DAG.getConstant(0, dl, HalfT));
18103     cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
18104                         DAG.getConstant(1, dl, HalfT));
18105     cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
18106                              Regs64bit ? X86::RAX : X86::EAX,
18107                              cpInL, SDValue());
18108     cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
18109                              Regs64bit ? X86::RDX : X86::EDX,
18110                              cpInH, cpInL.getValue(1));
18111     SDValue swapInL, swapInH;
18112     swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
18113                           DAG.getConstant(0, dl, HalfT));
18114     swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
18115                           DAG.getConstant(1, dl, HalfT));
18116     swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl,
18117                                Regs64bit ? X86::RBX : X86::EBX,
18118                                swapInL, cpInH.getValue(1));
18119     swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl,
18120                                Regs64bit ? X86::RCX : X86::ECX,
18121                                swapInH, swapInL.getValue(1));
18122     SDValue Ops[] = { swapInH.getValue(0),
18123                       N->getOperand(1),
18124                       swapInH.getValue(1) };
18125     SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
18126     MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
18127     unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_DAG :
18128                                   X86ISD::LCMPXCHG8_DAG;
18129     SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
18130     SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
18131                                         Regs64bit ? X86::RAX : X86::EAX,
18132                                         HalfT, Result.getValue(1));
18133     SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
18134                                         Regs64bit ? X86::RDX : X86::EDX,
18135                                         HalfT, cpOutL.getValue(2));
18136     SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
18137
18138     SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
18139                                         MVT::i32, cpOutH.getValue(2));
18140     SDValue Success =
18141         DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
18142                     DAG.getConstant(X86::COND_E, dl, MVT::i8), EFLAGS);
18143     Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
18144
18145     Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
18146     Results.push_back(Success);
18147     Results.push_back(EFLAGS.getValue(1));
18148     return;
18149   }
18150   case ISD::ATOMIC_SWAP:
18151   case ISD::ATOMIC_LOAD_ADD:
18152   case ISD::ATOMIC_LOAD_SUB:
18153   case ISD::ATOMIC_LOAD_AND:
18154   case ISD::ATOMIC_LOAD_OR:
18155   case ISD::ATOMIC_LOAD_XOR:
18156   case ISD::ATOMIC_LOAD_NAND:
18157   case ISD::ATOMIC_LOAD_MIN:
18158   case ISD::ATOMIC_LOAD_MAX:
18159   case ISD::ATOMIC_LOAD_UMIN:
18160   case ISD::ATOMIC_LOAD_UMAX:
18161   case ISD::ATOMIC_LOAD: {
18162     // Delegate to generic TypeLegalization. Situations we can really handle
18163     // should have already been dealt with by AtomicExpandPass.cpp.
18164     break;
18165   }
18166   case ISD::BITCAST: {
18167     assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
18168     EVT DstVT = N->getValueType(0);
18169     EVT SrcVT = N->getOperand(0)->getValueType(0);
18170
18171     if (SrcVT != MVT::f64 ||
18172         (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8))
18173       return;
18174
18175     unsigned NumElts = DstVT.getVectorNumElements();
18176     EVT SVT = DstVT.getVectorElementType();
18177     EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
18178     SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
18179                                    MVT::v2f64, N->getOperand(0));
18180     SDValue ToVecInt = DAG.getBitcast(WiderVT, Expanded);
18181
18182     if (ExperimentalVectorWideningLegalization) {
18183       // If we are legalizing vectors by widening, we already have the desired
18184       // legal vector type, just return it.
18185       Results.push_back(ToVecInt);
18186       return;
18187     }
18188
18189     SmallVector<SDValue, 8> Elts;
18190     for (unsigned i = 0, e = NumElts; i != e; ++i)
18191       Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT,
18192                                    ToVecInt, DAG.getIntPtrConstant(i, dl)));
18193
18194     Results.push_back(DAG.getNode(ISD::BUILD_VECTOR, dl, DstVT, Elts));
18195   }
18196   }
18197 }
18198
18199 const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
18200   switch ((X86ISD::NodeType)Opcode) {
18201   case X86ISD::FIRST_NUMBER:       break;
18202   case X86ISD::BSF:                return "X86ISD::BSF";
18203   case X86ISD::BSR:                return "X86ISD::BSR";
18204   case X86ISD::SHLD:               return "X86ISD::SHLD";
18205   case X86ISD::SHRD:               return "X86ISD::SHRD";
18206   case X86ISD::FAND:               return "X86ISD::FAND";
18207   case X86ISD::FANDN:              return "X86ISD::FANDN";
18208   case X86ISD::FOR:                return "X86ISD::FOR";
18209   case X86ISD::FXOR:               return "X86ISD::FXOR";
18210   case X86ISD::FILD:               return "X86ISD::FILD";
18211   case X86ISD::FILD_FLAG:          return "X86ISD::FILD_FLAG";
18212   case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
18213   case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
18214   case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
18215   case X86ISD::FLD:                return "X86ISD::FLD";
18216   case X86ISD::FST:                return "X86ISD::FST";
18217   case X86ISD::CALL:               return "X86ISD::CALL";
18218   case X86ISD::RDTSC_DAG:          return "X86ISD::RDTSC_DAG";
18219   case X86ISD::RDTSCP_DAG:         return "X86ISD::RDTSCP_DAG";
18220   case X86ISD::RDPMC_DAG:          return "X86ISD::RDPMC_DAG";
18221   case X86ISD::BT:                 return "X86ISD::BT";
18222   case X86ISD::CMP:                return "X86ISD::CMP";
18223   case X86ISD::COMI:               return "X86ISD::COMI";
18224   case X86ISD::UCOMI:              return "X86ISD::UCOMI";
18225   case X86ISD::CMPM:               return "X86ISD::CMPM";
18226   case X86ISD::CMPMU:              return "X86ISD::CMPMU";
18227   case X86ISD::CMPM_RND:           return "X86ISD::CMPM_RND";
18228   case X86ISD::SETCC:              return "X86ISD::SETCC";
18229   case X86ISD::SETCC_CARRY:        return "X86ISD::SETCC_CARRY";
18230   case X86ISD::FSETCC:             return "X86ISD::FSETCC";
18231   case X86ISD::FGETSIGNx86:        return "X86ISD::FGETSIGNx86";
18232   case X86ISD::CMOV:               return "X86ISD::CMOV";
18233   case X86ISD::BRCOND:             return "X86ISD::BRCOND";
18234   case X86ISD::RET_FLAG:           return "X86ISD::RET_FLAG";
18235   case X86ISD::REP_STOS:           return "X86ISD::REP_STOS";
18236   case X86ISD::REP_MOVS:           return "X86ISD::REP_MOVS";
18237   case X86ISD::GlobalBaseReg:      return "X86ISD::GlobalBaseReg";
18238   case X86ISD::Wrapper:            return "X86ISD::Wrapper";
18239   case X86ISD::WrapperRIP:         return "X86ISD::WrapperRIP";
18240   case X86ISD::MOVDQ2Q:            return "X86ISD::MOVDQ2Q";
18241   case X86ISD::MMX_MOVD2W:         return "X86ISD::MMX_MOVD2W";
18242   case X86ISD::MMX_MOVW2D:         return "X86ISD::MMX_MOVW2D";
18243   case X86ISD::PEXTRB:             return "X86ISD::PEXTRB";
18244   case X86ISD::PEXTRW:             return "X86ISD::PEXTRW";
18245   case X86ISD::INSERTPS:           return "X86ISD::INSERTPS";
18246   case X86ISD::PINSRB:             return "X86ISD::PINSRB";
18247   case X86ISD::PINSRW:             return "X86ISD::PINSRW";
18248   case X86ISD::MMX_PINSRW:         return "X86ISD::MMX_PINSRW";
18249   case X86ISD::PSHUFB:             return "X86ISD::PSHUFB";
18250   case X86ISD::ANDNP:              return "X86ISD::ANDNP";
18251   case X86ISD::PSIGN:              return "X86ISD::PSIGN";
18252   case X86ISD::BLENDI:             return "X86ISD::BLENDI";
18253   case X86ISD::SHRUNKBLEND:        return "X86ISD::SHRUNKBLEND";
18254   case X86ISD::ADDUS:              return "X86ISD::ADDUS";
18255   case X86ISD::SUBUS:              return "X86ISD::SUBUS";
18256   case X86ISD::HADD:               return "X86ISD::HADD";
18257   case X86ISD::HSUB:               return "X86ISD::HSUB";
18258   case X86ISD::FHADD:              return "X86ISD::FHADD";
18259   case X86ISD::FHSUB:              return "X86ISD::FHSUB";
18260   case X86ISD::UMAX:               return "X86ISD::UMAX";
18261   case X86ISD::UMIN:               return "X86ISD::UMIN";
18262   case X86ISD::SMAX:               return "X86ISD::SMAX";
18263   case X86ISD::SMIN:               return "X86ISD::SMIN";
18264   case X86ISD::FMAX:               return "X86ISD::FMAX";
18265   case X86ISD::FMAX_RND:           return "X86ISD::FMAX_RND";
18266   case X86ISD::FMIN:               return "X86ISD::FMIN";
18267   case X86ISD::FMIN_RND:           return "X86ISD::FMIN_RND";
18268   case X86ISD::FMAXC:              return "X86ISD::FMAXC";
18269   case X86ISD::FMINC:              return "X86ISD::FMINC";
18270   case X86ISD::FRSQRT:             return "X86ISD::FRSQRT";
18271   case X86ISD::FRCP:               return "X86ISD::FRCP";
18272   case X86ISD::TLSADDR:            return "X86ISD::TLSADDR";
18273   case X86ISD::TLSBASEADDR:        return "X86ISD::TLSBASEADDR";
18274   case X86ISD::TLSCALL:            return "X86ISD::TLSCALL";
18275   case X86ISD::EH_SJLJ_SETJMP:     return "X86ISD::EH_SJLJ_SETJMP";
18276   case X86ISD::EH_SJLJ_LONGJMP:    return "X86ISD::EH_SJLJ_LONGJMP";
18277   case X86ISD::EH_RETURN:          return "X86ISD::EH_RETURN";
18278   case X86ISD::TC_RETURN:          return "X86ISD::TC_RETURN";
18279   case X86ISD::FNSTCW16m:          return "X86ISD::FNSTCW16m";
18280   case X86ISD::FNSTSW16r:          return "X86ISD::FNSTSW16r";
18281   case X86ISD::LCMPXCHG_DAG:       return "X86ISD::LCMPXCHG_DAG";
18282   case X86ISD::LCMPXCHG8_DAG:      return "X86ISD::LCMPXCHG8_DAG";
18283   case X86ISD::LCMPXCHG16_DAG:     return "X86ISD::LCMPXCHG16_DAG";
18284   case X86ISD::VZEXT_MOVL:         return "X86ISD::VZEXT_MOVL";
18285   case X86ISD::VZEXT_LOAD:         return "X86ISD::VZEXT_LOAD";
18286   case X86ISD::VZEXT:              return "X86ISD::VZEXT";
18287   case X86ISD::VSEXT:              return "X86ISD::VSEXT";
18288   case X86ISD::VTRUNC:             return "X86ISD::VTRUNC";
18289   case X86ISD::VTRUNCM:            return "X86ISD::VTRUNCM";
18290   case X86ISD::VINSERT:            return "X86ISD::VINSERT";
18291   case X86ISD::VFPEXT:             return "X86ISD::VFPEXT";
18292   case X86ISD::VFPROUND:           return "X86ISD::VFPROUND";
18293   case X86ISD::VSHLDQ:             return "X86ISD::VSHLDQ";
18294   case X86ISD::VSRLDQ:             return "X86ISD::VSRLDQ";
18295   case X86ISD::VSHL:               return "X86ISD::VSHL";
18296   case X86ISD::VSRL:               return "X86ISD::VSRL";
18297   case X86ISD::VSRA:               return "X86ISD::VSRA";
18298   case X86ISD::VSHLI:              return "X86ISD::VSHLI";
18299   case X86ISD::VSRLI:              return "X86ISD::VSRLI";
18300   case X86ISD::VSRAI:              return "X86ISD::VSRAI";
18301   case X86ISD::CMPP:               return "X86ISD::CMPP";
18302   case X86ISD::PCMPEQ:             return "X86ISD::PCMPEQ";
18303   case X86ISD::PCMPGT:             return "X86ISD::PCMPGT";
18304   case X86ISD::PCMPEQM:            return "X86ISD::PCMPEQM";
18305   case X86ISD::PCMPGTM:            return "X86ISD::PCMPGTM";
18306   case X86ISD::ADD:                return "X86ISD::ADD";
18307   case X86ISD::SUB:                return "X86ISD::SUB";
18308   case X86ISD::ADC:                return "X86ISD::ADC";
18309   case X86ISD::SBB:                return "X86ISD::SBB";
18310   case X86ISD::SMUL:               return "X86ISD::SMUL";
18311   case X86ISD::UMUL:               return "X86ISD::UMUL";
18312   case X86ISD::SMUL8:              return "X86ISD::SMUL8";
18313   case X86ISD::UMUL8:              return "X86ISD::UMUL8";
18314   case X86ISD::SDIVREM8_SEXT_HREG: return "X86ISD::SDIVREM8_SEXT_HREG";
18315   case X86ISD::UDIVREM8_ZEXT_HREG: return "X86ISD::UDIVREM8_ZEXT_HREG";
18316   case X86ISD::INC:                return "X86ISD::INC";
18317   case X86ISD::DEC:                return "X86ISD::DEC";
18318   case X86ISD::OR:                 return "X86ISD::OR";
18319   case X86ISD::XOR:                return "X86ISD::XOR";
18320   case X86ISD::AND:                return "X86ISD::AND";
18321   case X86ISD::BEXTR:              return "X86ISD::BEXTR";
18322   case X86ISD::MUL_IMM:            return "X86ISD::MUL_IMM";
18323   case X86ISD::PTEST:              return "X86ISD::PTEST";
18324   case X86ISD::TESTP:              return "X86ISD::TESTP";
18325   case X86ISD::TESTM:              return "X86ISD::TESTM";
18326   case X86ISD::TESTNM:             return "X86ISD::TESTNM";
18327   case X86ISD::KORTEST:            return "X86ISD::KORTEST";
18328   case X86ISD::PACKSS:             return "X86ISD::PACKSS";
18329   case X86ISD::PACKUS:             return "X86ISD::PACKUS";
18330   case X86ISD::PALIGNR:            return "X86ISD::PALIGNR";
18331   case X86ISD::VALIGN:             return "X86ISD::VALIGN";
18332   case X86ISD::PSHUFD:             return "X86ISD::PSHUFD";
18333   case X86ISD::PSHUFHW:            return "X86ISD::PSHUFHW";
18334   case X86ISD::PSHUFLW:            return "X86ISD::PSHUFLW";
18335   case X86ISD::SHUFP:              return "X86ISD::SHUFP";
18336   case X86ISD::SHUF128:            return "X86ISD::SHUF128";
18337   case X86ISD::MOVLHPS:            return "X86ISD::MOVLHPS";
18338   case X86ISD::MOVLHPD:            return "X86ISD::MOVLHPD";
18339   case X86ISD::MOVHLPS:            return "X86ISD::MOVHLPS";
18340   case X86ISD::MOVLPS:             return "X86ISD::MOVLPS";
18341   case X86ISD::MOVLPD:             return "X86ISD::MOVLPD";
18342   case X86ISD::MOVDDUP:            return "X86ISD::MOVDDUP";
18343   case X86ISD::MOVSHDUP:           return "X86ISD::MOVSHDUP";
18344   case X86ISD::MOVSLDUP:           return "X86ISD::MOVSLDUP";
18345   case X86ISD::MOVSD:              return "X86ISD::MOVSD";
18346   case X86ISD::MOVSS:              return "X86ISD::MOVSS";
18347   case X86ISD::UNPCKL:             return "X86ISD::UNPCKL";
18348   case X86ISD::UNPCKH:             return "X86ISD::UNPCKH";
18349   case X86ISD::VBROADCAST:         return "X86ISD::VBROADCAST";
18350   case X86ISD::SUBV_BROADCAST:     return "X86ISD::SUBV_BROADCAST";
18351   case X86ISD::VEXTRACT:           return "X86ISD::VEXTRACT";
18352   case X86ISD::VPERMILPV:          return "X86ISD::VPERMILPV";
18353   case X86ISD::VPERMILPI:          return "X86ISD::VPERMILPI";
18354   case X86ISD::VPERM2X128:         return "X86ISD::VPERM2X128";
18355   case X86ISD::VPERMV:             return "X86ISD::VPERMV";
18356   case X86ISD::VPERMV3:            return "X86ISD::VPERMV3";
18357   case X86ISD::VPERMIV3:           return "X86ISD::VPERMIV3";
18358   case X86ISD::VPERMI:             return "X86ISD::VPERMI";
18359   case X86ISD::VFIXUPIMM:          return "X86ISD::VFIXUPIMM";
18360   case X86ISD::VRANGE:             return "X86ISD::VRANGE";
18361   case X86ISD::PMULUDQ:            return "X86ISD::PMULUDQ";
18362   case X86ISD::PMULDQ:             return "X86ISD::PMULDQ";
18363   case X86ISD::PSADBW:             return "X86ISD::PSADBW";
18364   case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
18365   case X86ISD::VAARG_64:           return "X86ISD::VAARG_64";
18366   case X86ISD::WIN_ALLOCA:         return "X86ISD::WIN_ALLOCA";
18367   case X86ISD::MEMBARRIER:         return "X86ISD::MEMBARRIER";
18368   case X86ISD::MFENCE:             return "X86ISD::MFENCE";
18369   case X86ISD::SFENCE:             return "X86ISD::SFENCE";
18370   case X86ISD::LFENCE:             return "X86ISD::LFENCE";
18371   case X86ISD::SEG_ALLOCA:         return "X86ISD::SEG_ALLOCA";
18372   case X86ISD::WIN_FTOL:           return "X86ISD::WIN_FTOL";
18373   case X86ISD::SAHF:               return "X86ISD::SAHF";
18374   case X86ISD::RDRAND:             return "X86ISD::RDRAND";
18375   case X86ISD::RDSEED:             return "X86ISD::RDSEED";
18376   case X86ISD::FMADD:              return "X86ISD::FMADD";
18377   case X86ISD::FMSUB:              return "X86ISD::FMSUB";
18378   case X86ISD::FNMADD:             return "X86ISD::FNMADD";
18379   case X86ISD::FNMSUB:             return "X86ISD::FNMSUB";
18380   case X86ISD::FMADDSUB:           return "X86ISD::FMADDSUB";
18381   case X86ISD::FMSUBADD:           return "X86ISD::FMSUBADD";
18382   case X86ISD::FMADD_RND:          return "X86ISD::FMADD_RND";
18383   case X86ISD::FNMADD_RND:         return "X86ISD::FNMADD_RND";
18384   case X86ISD::FMSUB_RND:          return "X86ISD::FMSUB_RND";
18385   case X86ISD::FNMSUB_RND:         return "X86ISD::FNMSUB_RND";
18386   case X86ISD::FMADDSUB_RND:       return "X86ISD::FMADDSUB_RND";
18387   case X86ISD::FMSUBADD_RND:       return "X86ISD::FMSUBADD_RND";
18388   case X86ISD::RNDSCALE:           return "X86ISD::RNDSCALE";
18389   case X86ISD::PCMPESTRI:          return "X86ISD::PCMPESTRI";
18390   case X86ISD::PCMPISTRI:          return "X86ISD::PCMPISTRI";
18391   case X86ISD::XTEST:              return "X86ISD::XTEST";
18392   case X86ISD::COMPRESS:           return "X86ISD::COMPRESS";
18393   case X86ISD::EXPAND:             return "X86ISD::EXPAND";
18394   case X86ISD::SELECT:             return "X86ISD::SELECT";
18395   case X86ISD::ADDSUB:             return "X86ISD::ADDSUB";
18396   case X86ISD::RCP28:              return "X86ISD::RCP28";
18397   case X86ISD::EXP2:               return "X86ISD::EXP2";
18398   case X86ISD::RSQRT28:            return "X86ISD::RSQRT28";
18399   case X86ISD::FADD_RND:           return "X86ISD::FADD_RND";
18400   case X86ISD::FSUB_RND:           return "X86ISD::FSUB_RND";
18401   case X86ISD::FMUL_RND:           return "X86ISD::FMUL_RND";
18402   case X86ISD::FDIV_RND:           return "X86ISD::FDIV_RND";
18403   case X86ISD::FSQRT_RND:          return "X86ISD::FSQRT_RND";
18404   case X86ISD::FGETEXP_RND:        return "X86ISD::FGETEXP_RND";
18405   case X86ISD::ADDS:               return "X86ISD::ADDS";
18406   case X86ISD::SUBS:               return "X86ISD::SUBS";
18407   }
18408   return nullptr;
18409 }
18410
18411 // isLegalAddressingMode - Return true if the addressing mode represented
18412 // by AM is legal for this target, for a load/store of the specified type.
18413 bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM,
18414                                               Type *Ty,
18415                                               unsigned AS) const {
18416   // X86 supports extremely general addressing modes.
18417   CodeModel::Model M = getTargetMachine().getCodeModel();
18418   Reloc::Model R = getTargetMachine().getRelocationModel();
18419
18420   // X86 allows a sign-extended 32-bit immediate field as a displacement.
18421   if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
18422     return false;
18423
18424   if (AM.BaseGV) {
18425     unsigned GVFlags =
18426       Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine());
18427
18428     // If a reference to this global requires an extra load, we can't fold it.
18429     if (isGlobalStubReference(GVFlags))
18430       return false;
18431
18432     // If BaseGV requires a register for the PIC base, we cannot also have a
18433     // BaseReg specified.
18434     if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
18435       return false;
18436
18437     // If lower 4G is not available, then we must use rip-relative addressing.
18438     if ((M != CodeModel::Small || R != Reloc::Static) &&
18439         Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1))
18440       return false;
18441   }
18442
18443   switch (AM.Scale) {
18444   case 0:
18445   case 1:
18446   case 2:
18447   case 4:
18448   case 8:
18449     // These scales always work.
18450     break;
18451   case 3:
18452   case 5:
18453   case 9:
18454     // These scales are formed with basereg+scalereg.  Only accept if there is
18455     // no basereg yet.
18456     if (AM.HasBaseReg)
18457       return false;
18458     break;
18459   default:  // Other stuff never works.
18460     return false;
18461   }
18462
18463   return true;
18464 }
18465
18466 bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
18467   unsigned Bits = Ty->getScalarSizeInBits();
18468
18469   // 8-bit shifts are always expensive, but versions with a scalar amount aren't
18470   // particularly cheaper than those without.
18471   if (Bits == 8)
18472     return false;
18473
18474   // On AVX2 there are new vpsllv[dq] instructions (and other shifts), that make
18475   // variable shifts just as cheap as scalar ones.
18476   if (Subtarget->hasInt256() && (Bits == 32 || Bits == 64))
18477     return false;
18478
18479   // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
18480   // fully general vector.
18481   return true;
18482 }
18483
18484 bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
18485   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
18486     return false;
18487   unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
18488   unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
18489   return NumBits1 > NumBits2;
18490 }
18491
18492 bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
18493   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
18494     return false;
18495
18496   if (!isTypeLegal(EVT::getEVT(Ty1)))
18497     return false;
18498
18499   assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
18500
18501   // Assuming the caller doesn't have a zeroext or signext return parameter,
18502   // truncation all the way down to i1 is valid.
18503   return true;
18504 }
18505
18506 bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
18507   return isInt<32>(Imm);
18508 }
18509
18510 bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
18511   // Can also use sub to handle negated immediates.
18512   return isInt<32>(Imm);
18513 }
18514
18515 bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
18516   if (!VT1.isInteger() || !VT2.isInteger())
18517     return false;
18518   unsigned NumBits1 = VT1.getSizeInBits();
18519   unsigned NumBits2 = VT2.getSizeInBits();
18520   return NumBits1 > NumBits2;
18521 }
18522
18523 bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
18524   // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
18525   return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit();
18526 }
18527
18528 bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
18529   // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
18530   return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit();
18531 }
18532
18533 bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
18534   EVT VT1 = Val.getValueType();
18535   if (isZExtFree(VT1, VT2))
18536     return true;
18537
18538   if (Val.getOpcode() != ISD::LOAD)
18539     return false;
18540
18541   if (!VT1.isSimple() || !VT1.isInteger() ||
18542       !VT2.isSimple() || !VT2.isInteger())
18543     return false;
18544
18545   switch (VT1.getSimpleVT().SimpleTy) {
18546   default: break;
18547   case MVT::i8:
18548   case MVT::i16:
18549   case MVT::i32:
18550     // X86 has 8, 16, and 32-bit zero-extending loads.
18551     return true;
18552   }
18553
18554   return false;
18555 }
18556
18557 bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const { return true; }
18558
18559 bool
18560 X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
18561   if (!(Subtarget->hasFMA() || Subtarget->hasFMA4()))
18562     return false;
18563
18564   VT = VT.getScalarType();
18565
18566   if (!VT.isSimple())
18567     return false;
18568
18569   switch (VT.getSimpleVT().SimpleTy) {
18570   case MVT::f32:
18571   case MVT::f64:
18572     return true;
18573   default:
18574     break;
18575   }
18576
18577   return false;
18578 }
18579
18580 bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
18581   // i16 instructions are longer (0x66 prefix) and potentially slower.
18582   return !(VT1 == MVT::i32 && VT2 == MVT::i16);
18583 }
18584
18585 /// isShuffleMaskLegal - Targets can use this to indicate that they only
18586 /// support *some* VECTOR_SHUFFLE operations, those with specific masks.
18587 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
18588 /// are assumed to be legal.
18589 bool
18590 X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
18591                                       EVT VT) const {
18592   if (!VT.isSimple())
18593     return false;
18594
18595   // Not for i1 vectors
18596   if (VT.getScalarType() == MVT::i1)
18597     return false;
18598
18599   // Very little shuffling can be done for 64-bit vectors right now.
18600   if (VT.getSizeInBits() == 64)
18601     return false;
18602
18603   // We only care that the types being shuffled are legal. The lowering can
18604   // handle any possible shuffle mask that results.
18605   return isTypeLegal(VT.getSimpleVT());
18606 }
18607
18608 bool
18609 X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
18610                                           EVT VT) const {
18611   // Just delegate to the generic legality, clear masks aren't special.
18612   return isShuffleMaskLegal(Mask, VT);
18613 }
18614
18615 //===----------------------------------------------------------------------===//
18616 //                           X86 Scheduler Hooks
18617 //===----------------------------------------------------------------------===//
18618
18619 /// Utility function to emit xbegin specifying the start of an RTM region.
18620 static MachineBasicBlock *EmitXBegin(MachineInstr *MI, MachineBasicBlock *MBB,
18621                                      const TargetInstrInfo *TII) {
18622   DebugLoc DL = MI->getDebugLoc();
18623
18624   const BasicBlock *BB = MBB->getBasicBlock();
18625   MachineFunction::iterator I = MBB;
18626   ++I;
18627
18628   // For the v = xbegin(), we generate
18629   //
18630   // thisMBB:
18631   //  xbegin sinkMBB
18632   //
18633   // mainMBB:
18634   //  eax = -1
18635   //
18636   // sinkMBB:
18637   //  v = eax
18638
18639   MachineBasicBlock *thisMBB = MBB;
18640   MachineFunction *MF = MBB->getParent();
18641   MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
18642   MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
18643   MF->insert(I, mainMBB);
18644   MF->insert(I, sinkMBB);
18645
18646   // Transfer the remainder of BB and its successor edges to sinkMBB.
18647   sinkMBB->splice(sinkMBB->begin(), MBB,
18648                   std::next(MachineBasicBlock::iterator(MI)), MBB->end());
18649   sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
18650
18651   // thisMBB:
18652   //  xbegin sinkMBB
18653   //  # fallthrough to mainMBB
18654   //  # abortion to sinkMBB
18655   BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(sinkMBB);
18656   thisMBB->addSuccessor(mainMBB);
18657   thisMBB->addSuccessor(sinkMBB);
18658
18659   // mainMBB:
18660   //  EAX = -1
18661   BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), X86::EAX).addImm(-1);
18662   mainMBB->addSuccessor(sinkMBB);
18663
18664   // sinkMBB:
18665   // EAX is live into the sinkMBB
18666   sinkMBB->addLiveIn(X86::EAX);
18667   BuildMI(*sinkMBB, sinkMBB->begin(), DL,
18668           TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
18669     .addReg(X86::EAX);
18670
18671   MI->eraseFromParent();
18672   return sinkMBB;
18673 }
18674
18675 // FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
18676 // or XMM0_V32I8 in AVX all of this code can be replaced with that
18677 // in the .td file.
18678 static MachineBasicBlock *EmitPCMPSTRM(MachineInstr *MI, MachineBasicBlock *BB,
18679                                        const TargetInstrInfo *TII) {
18680   unsigned Opc;
18681   switch (MI->getOpcode()) {
18682   default: llvm_unreachable("illegal opcode!");
18683   case X86::PCMPISTRM128REG:  Opc = X86::PCMPISTRM128rr;  break;
18684   case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break;
18685   case X86::PCMPISTRM128MEM:  Opc = X86::PCMPISTRM128rm;  break;
18686   case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break;
18687   case X86::PCMPESTRM128REG:  Opc = X86::PCMPESTRM128rr;  break;
18688   case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break;
18689   case X86::PCMPESTRM128MEM:  Opc = X86::PCMPESTRM128rm;  break;
18690   case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break;
18691   }
18692
18693   DebugLoc dl = MI->getDebugLoc();
18694   MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
18695
18696   unsigned NumArgs = MI->getNumOperands();
18697   for (unsigned i = 1; i < NumArgs; ++i) {
18698     MachineOperand &Op = MI->getOperand(i);
18699     if (!(Op.isReg() && Op.isImplicit()))
18700       MIB.addOperand(Op);
18701   }
18702   if (MI->hasOneMemOperand())
18703     MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
18704
18705   BuildMI(*BB, MI, dl,
18706     TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
18707     .addReg(X86::XMM0);
18708
18709   MI->eraseFromParent();
18710   return BB;
18711 }
18712
18713 // FIXME: Custom handling because TableGen doesn't support multiple implicit
18714 // defs in an instruction pattern
18715 static MachineBasicBlock *EmitPCMPSTRI(MachineInstr *MI, MachineBasicBlock *BB,
18716                                        const TargetInstrInfo *TII) {
18717   unsigned Opc;
18718   switch (MI->getOpcode()) {
18719   default: llvm_unreachable("illegal opcode!");
18720   case X86::PCMPISTRIREG:  Opc = X86::PCMPISTRIrr;  break;
18721   case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break;
18722   case X86::PCMPISTRIMEM:  Opc = X86::PCMPISTRIrm;  break;
18723   case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break;
18724   case X86::PCMPESTRIREG:  Opc = X86::PCMPESTRIrr;  break;
18725   case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break;
18726   case X86::PCMPESTRIMEM:  Opc = X86::PCMPESTRIrm;  break;
18727   case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break;
18728   }
18729
18730   DebugLoc dl = MI->getDebugLoc();
18731   MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
18732
18733   unsigned NumArgs = MI->getNumOperands(); // remove the results
18734   for (unsigned i = 1; i < NumArgs; ++i) {
18735     MachineOperand &Op = MI->getOperand(i);
18736     if (!(Op.isReg() && Op.isImplicit()))
18737       MIB.addOperand(Op);
18738   }
18739   if (MI->hasOneMemOperand())
18740     MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
18741
18742   BuildMI(*BB, MI, dl,
18743     TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
18744     .addReg(X86::ECX);
18745
18746   MI->eraseFromParent();
18747   return BB;
18748 }
18749
18750 static MachineBasicBlock *EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB,
18751                                       const X86Subtarget *Subtarget) {
18752   DebugLoc dl = MI->getDebugLoc();
18753   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
18754   // Address into RAX/EAX, other two args into ECX, EDX.
18755   unsigned MemOpc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r;
18756   unsigned MemReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
18757   MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
18758   for (int i = 0; i < X86::AddrNumOperands; ++i)
18759     MIB.addOperand(MI->getOperand(i));
18760
18761   unsigned ValOps = X86::AddrNumOperands;
18762   BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
18763     .addReg(MI->getOperand(ValOps).getReg());
18764   BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
18765     .addReg(MI->getOperand(ValOps+1).getReg());
18766
18767   // The instruction doesn't actually take any operands though.
18768   BuildMI(*BB, MI, dl, TII->get(X86::MONITORrrr));
18769
18770   MI->eraseFromParent(); // The pseudo is gone now.
18771   return BB;
18772 }
18773
18774 MachineBasicBlock *
18775 X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr *MI,
18776                                                  MachineBasicBlock *MBB) const {
18777   // Emit va_arg instruction on X86-64.
18778
18779   // Operands to this pseudo-instruction:
18780   // 0  ) Output        : destination address (reg)
18781   // 1-5) Input         : va_list address (addr, i64mem)
18782   // 6  ) ArgSize       : Size (in bytes) of vararg type
18783   // 7  ) ArgMode       : 0=overflow only, 1=use gp_offset, 2=use fp_offset
18784   // 8  ) Align         : Alignment of type
18785   // 9  ) EFLAGS (implicit-def)
18786
18787   assert(MI->getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
18788   static_assert(X86::AddrNumOperands == 5,
18789                 "VAARG_64 assumes 5 address operands");
18790
18791   unsigned DestReg = MI->getOperand(0).getReg();
18792   MachineOperand &Base = MI->getOperand(1);
18793   MachineOperand &Scale = MI->getOperand(2);
18794   MachineOperand &Index = MI->getOperand(3);
18795   MachineOperand &Disp = MI->getOperand(4);
18796   MachineOperand &Segment = MI->getOperand(5);
18797   unsigned ArgSize = MI->getOperand(6).getImm();
18798   unsigned ArgMode = MI->getOperand(7).getImm();
18799   unsigned Align = MI->getOperand(8).getImm();
18800
18801   // Memory Reference
18802   assert(MI->hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
18803   MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
18804   MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
18805
18806   // Machine Information
18807   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
18808   MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
18809   const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
18810   const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
18811   DebugLoc DL = MI->getDebugLoc();
18812
18813   // struct va_list {
18814   //   i32   gp_offset
18815   //   i32   fp_offset
18816   //   i64   overflow_area (address)
18817   //   i64   reg_save_area (address)
18818   // }
18819   // sizeof(va_list) = 24
18820   // alignment(va_list) = 8
18821
18822   unsigned TotalNumIntRegs = 6;
18823   unsigned TotalNumXMMRegs = 8;
18824   bool UseGPOffset = (ArgMode == 1);
18825   bool UseFPOffset = (ArgMode == 2);
18826   unsigned MaxOffset = TotalNumIntRegs * 8 +
18827                        (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
18828
18829   /* Align ArgSize to a multiple of 8 */
18830   unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
18831   bool NeedsAlign = (Align > 8);
18832
18833   MachineBasicBlock *thisMBB = MBB;
18834   MachineBasicBlock *overflowMBB;
18835   MachineBasicBlock *offsetMBB;
18836   MachineBasicBlock *endMBB;
18837
18838   unsigned OffsetDestReg = 0;    // Argument address computed by offsetMBB
18839   unsigned OverflowDestReg = 0;  // Argument address computed by overflowMBB
18840   unsigned OffsetReg = 0;
18841
18842   if (!UseGPOffset && !UseFPOffset) {
18843     // If we only pull from the overflow region, we don't create a branch.
18844     // We don't need to alter control flow.
18845     OffsetDestReg = 0; // unused
18846     OverflowDestReg = DestReg;
18847
18848     offsetMBB = nullptr;
18849     overflowMBB = thisMBB;
18850     endMBB = thisMBB;
18851   } else {
18852     // First emit code to check if gp_offset (or fp_offset) is below the bound.
18853     // If so, pull the argument from reg_save_area. (branch to offsetMBB)
18854     // If not, pull from overflow_area. (branch to overflowMBB)
18855     //
18856     //       thisMBB
18857     //         |     .
18858     //         |        .
18859     //     offsetMBB   overflowMBB
18860     //         |        .
18861     //         |     .
18862     //        endMBB
18863
18864     // Registers for the PHI in endMBB
18865     OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
18866     OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
18867
18868     const BasicBlock *LLVM_BB = MBB->getBasicBlock();
18869     MachineFunction *MF = MBB->getParent();
18870     overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
18871     offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
18872     endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
18873
18874     MachineFunction::iterator MBBIter = MBB;
18875     ++MBBIter;
18876
18877     // Insert the new basic blocks
18878     MF->insert(MBBIter, offsetMBB);
18879     MF->insert(MBBIter, overflowMBB);
18880     MF->insert(MBBIter, endMBB);
18881
18882     // Transfer the remainder of MBB and its successor edges to endMBB.
18883     endMBB->splice(endMBB->begin(), thisMBB,
18884                    std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
18885     endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
18886
18887     // Make offsetMBB and overflowMBB successors of thisMBB
18888     thisMBB->addSuccessor(offsetMBB);
18889     thisMBB->addSuccessor(overflowMBB);
18890
18891     // endMBB is a successor of both offsetMBB and overflowMBB
18892     offsetMBB->addSuccessor(endMBB);
18893     overflowMBB->addSuccessor(endMBB);
18894
18895     // Load the offset value into a register
18896     OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
18897     BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
18898       .addOperand(Base)
18899       .addOperand(Scale)
18900       .addOperand(Index)
18901       .addDisp(Disp, UseFPOffset ? 4 : 0)
18902       .addOperand(Segment)
18903       .setMemRefs(MMOBegin, MMOEnd);
18904
18905     // Check if there is enough room left to pull this argument.
18906     BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
18907       .addReg(OffsetReg)
18908       .addImm(MaxOffset + 8 - ArgSizeA8);
18909
18910     // Branch to "overflowMBB" if offset >= max
18911     // Fall through to "offsetMBB" otherwise
18912     BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
18913       .addMBB(overflowMBB);
18914   }
18915
18916   // In offsetMBB, emit code to use the reg_save_area.
18917   if (offsetMBB) {
18918     assert(OffsetReg != 0);
18919
18920     // Read the reg_save_area address.
18921     unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
18922     BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
18923       .addOperand(Base)
18924       .addOperand(Scale)
18925       .addOperand(Index)
18926       .addDisp(Disp, 16)
18927       .addOperand(Segment)
18928       .setMemRefs(MMOBegin, MMOEnd);
18929
18930     // Zero-extend the offset
18931     unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
18932       BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
18933         .addImm(0)
18934         .addReg(OffsetReg)
18935         .addImm(X86::sub_32bit);
18936
18937     // Add the offset to the reg_save_area to get the final address.
18938     BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
18939       .addReg(OffsetReg64)
18940       .addReg(RegSaveReg);
18941
18942     // Compute the offset for the next argument
18943     unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
18944     BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
18945       .addReg(OffsetReg)
18946       .addImm(UseFPOffset ? 16 : 8);
18947
18948     // Store it back into the va_list.
18949     BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
18950       .addOperand(Base)
18951       .addOperand(Scale)
18952       .addOperand(Index)
18953       .addDisp(Disp, UseFPOffset ? 4 : 0)
18954       .addOperand(Segment)
18955       .addReg(NextOffsetReg)
18956       .setMemRefs(MMOBegin, MMOEnd);
18957
18958     // Jump to endMBB
18959     BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
18960       .addMBB(endMBB);
18961   }
18962
18963   //
18964   // Emit code to use overflow area
18965   //
18966
18967   // Load the overflow_area address into a register.
18968   unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
18969   BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
18970     .addOperand(Base)
18971     .addOperand(Scale)
18972     .addOperand(Index)
18973     .addDisp(Disp, 8)
18974     .addOperand(Segment)
18975     .setMemRefs(MMOBegin, MMOEnd);
18976
18977   // If we need to align it, do so. Otherwise, just copy the address
18978   // to OverflowDestReg.
18979   if (NeedsAlign) {
18980     // Align the overflow address
18981     assert((Align & (Align-1)) == 0 && "Alignment must be a power of 2");
18982     unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);
18983
18984     // aligned_addr = (addr + (align-1)) & ~(align-1)
18985     BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
18986       .addReg(OverflowAddrReg)
18987       .addImm(Align-1);
18988
18989     BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
18990       .addReg(TmpReg)
18991       .addImm(~(uint64_t)(Align-1));
18992   } else {
18993     BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
18994       .addReg(OverflowAddrReg);
18995   }
18996
18997   // Compute the next overflow address after this argument.
18998   // (the overflow address should be kept 8-byte aligned)
18999   unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
19000   BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
19001     .addReg(OverflowDestReg)
19002     .addImm(ArgSizeA8);
19003
19004   // Store the new overflow address.
19005   BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
19006     .addOperand(Base)
19007     .addOperand(Scale)
19008     .addOperand(Index)
19009     .addDisp(Disp, 8)
19010     .addOperand(Segment)
19011     .addReg(NextAddrReg)
19012     .setMemRefs(MMOBegin, MMOEnd);
19013
19014   // If we branched, emit the PHI to the front of endMBB.
19015   if (offsetMBB) {
19016     BuildMI(*endMBB, endMBB->begin(), DL,
19017             TII->get(X86::PHI), DestReg)
19018       .addReg(OffsetDestReg).addMBB(offsetMBB)
19019       .addReg(OverflowDestReg).addMBB(overflowMBB);
19020   }
19021
19022   // Erase the pseudo instruction
19023   MI->eraseFromParent();
19024
19025   return endMBB;
19026 }
19027
19028 MachineBasicBlock *
19029 X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
19030                                                  MachineInstr *MI,
19031                                                  MachineBasicBlock *MBB) const {
19032   // Emit code to save XMM registers to the stack. The ABI says that the
19033   // number of registers to save is given in %al, so it's theoretically
19034   // possible to do an indirect jump trick to avoid saving all of them,
19035   // however this code takes a simpler approach and just executes all
19036   // of the stores if %al is non-zero. It's less code, and it's probably
19037   // easier on the hardware branch predictor, and stores aren't all that
19038   // expensive anyway.
19039
19040   // Create the new basic blocks. One block contains all the XMM stores,
19041   // and one block is the final destination regardless of whether any
19042   // stores were performed.
19043   const BasicBlock *LLVM_BB = MBB->getBasicBlock();
19044   MachineFunction *F = MBB->getParent();
19045   MachineFunction::iterator MBBIter = MBB;
19046   ++MBBIter;
19047   MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
19048   MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
19049   F->insert(MBBIter, XMMSaveMBB);
19050   F->insert(MBBIter, EndMBB);
19051
19052   // Transfer the remainder of MBB and its successor edges to EndMBB.
19053   EndMBB->splice(EndMBB->begin(), MBB,
19054                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
19055   EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
19056
19057   // The original block will now fall through to the XMM save block.
19058   MBB->addSuccessor(XMMSaveMBB);
19059   // The XMMSaveMBB will fall through to the end block.
19060   XMMSaveMBB->addSuccessor(EndMBB);
19061
19062   // Now add the instructions.
19063   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
19064   DebugLoc DL = MI->getDebugLoc();
19065
19066   unsigned CountReg = MI->getOperand(0).getReg();
19067   int64_t RegSaveFrameIndex = MI->getOperand(1).getImm();
19068   int64_t VarArgsFPOffset = MI->getOperand(2).getImm();
19069
19070   if (!Subtarget->isTargetWin64()) {
19071     // If %al is 0, branch around the XMM save block.
19072     BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
19073     BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB);
19074     MBB->addSuccessor(EndMBB);
19075   }
19076
19077   // Make sure the last operand is EFLAGS, which gets clobbered by the branch
19078   // that was just emitted, but clearly shouldn't be "saved".
19079   assert((MI->getNumOperands() <= 3 ||
19080           !MI->getOperand(MI->getNumOperands() - 1).isReg() ||
19081           MI->getOperand(MI->getNumOperands() - 1).getReg() == X86::EFLAGS)
19082          && "Expected last argument to be EFLAGS");
19083   unsigned MOVOpc = Subtarget->hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr;
19084   // In the XMM save block, save all the XMM argument registers.
19085   for (int i = 3, e = MI->getNumOperands() - 1; i != e; ++i) {
19086     int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
19087     MachineMemOperand *MMO =
19088       F->getMachineMemOperand(
19089           MachinePointerInfo::getFixedStack(RegSaveFrameIndex, Offset),
19090         MachineMemOperand::MOStore,
19091         /*Size=*/16, /*Align=*/16);
19092     BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
19093       .addFrameIndex(RegSaveFrameIndex)
19094       .addImm(/*Scale=*/1)
19095       .addReg(/*IndexReg=*/0)
19096       .addImm(/*Disp=*/Offset)
19097       .addReg(/*Segment=*/0)
19098       .addReg(MI->getOperand(i).getReg())
19099       .addMemOperand(MMO);
19100   }
19101
19102   MI->eraseFromParent();   // The pseudo instruction is gone now.
19103
19104   return EndMBB;
19105 }
19106
19107 // The EFLAGS operand of SelectItr might be missing a kill marker
19108 // because there were multiple uses of EFLAGS, and ISel didn't know
19109 // which to mark. Figure out whether SelectItr should have had a
19110 // kill marker, and set it if it should. Returns the correct kill
19111 // marker value.
19112 static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
19113                                      MachineBasicBlock* BB,
19114                                      const TargetRegisterInfo* TRI) {
19115   // Scan forward through BB for a use/def of EFLAGS.
19116   MachineBasicBlock::iterator miI(std::next(SelectItr));
19117   for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
19118     const MachineInstr& mi = *miI;
19119     if (mi.readsRegister(X86::EFLAGS))
19120       return false;
19121     if (mi.definesRegister(X86::EFLAGS))
19122       break; // Should have kill-flag - update below.
19123   }
19124
19125   // If we hit the end of the block, check whether EFLAGS is live into a
19126   // successor.
19127   if (miI == BB->end()) {
19128     for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
19129                                           sEnd = BB->succ_end();
19130          sItr != sEnd; ++sItr) {
19131       MachineBasicBlock* succ = *sItr;
19132       if (succ->isLiveIn(X86::EFLAGS))
19133         return false;
19134     }
19135   }
19136
19137   // We found a def, or hit the end of the basic block and EFLAGS wasn't live
19138   // out. SelectMI should have a kill flag on EFLAGS.
19139   SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
19140   return true;
19141 }
19142
19143 MachineBasicBlock *
19144 X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
19145                                      MachineBasicBlock *BB) const {
19146   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
19147   DebugLoc DL = MI->getDebugLoc();
19148
19149   // To "insert" a SELECT_CC instruction, we actually have to insert the
19150   // diamond control-flow pattern.  The incoming instruction knows the
19151   // destination vreg to set, the condition code register to branch on, the
19152   // true/false values to select between, and a branch opcode to use.
19153   const BasicBlock *LLVM_BB = BB->getBasicBlock();
19154   MachineFunction::iterator It = BB;
19155   ++It;
19156
19157   //  thisMBB:
19158   //  ...
19159   //   TrueVal = ...
19160   //   cmpTY ccX, r1, r2
19161   //   bCC copy1MBB
19162   //   fallthrough --> copy0MBB
19163   MachineBasicBlock *thisMBB = BB;
19164   MachineFunction *F = BB->getParent();
19165
19166   // We also lower double CMOVs:
19167   //   (CMOV (CMOV F, T, cc1), T, cc2)
19168   // to two successives branches.  For that, we look for another CMOV as the
19169   // following instruction.
19170   //
19171   // Without this, we would add a PHI between the two jumps, which ends up
19172   // creating a few copies all around. For instance, for
19173   //
19174   //    (sitofp (zext (fcmp une)))
19175   //
19176   // we would generate:
19177   //
19178   //         ucomiss %xmm1, %xmm0
19179   //         movss  <1.0f>, %xmm0
19180   //         movaps  %xmm0, %xmm1
19181   //         jne     .LBB5_2
19182   //         xorps   %xmm1, %xmm1
19183   // .LBB5_2:
19184   //         jp      .LBB5_4
19185   //         movaps  %xmm1, %xmm0
19186   // .LBB5_4:
19187   //         retq
19188   //
19189   // because this custom-inserter would have generated:
19190   //
19191   //   A
19192   //   | \
19193   //   |  B
19194   //   | /
19195   //   C
19196   //   | \
19197   //   |  D
19198   //   | /
19199   //   E
19200   //
19201   // A: X = ...; Y = ...
19202   // B: empty
19203   // C: Z = PHI [X, A], [Y, B]
19204   // D: empty
19205   // E: PHI [X, C], [Z, D]
19206   //
19207   // If we lower both CMOVs in a single step, we can instead generate:
19208   //
19209   //   A
19210   //   | \
19211   //   |  C
19212   //   | /|
19213   //   |/ |
19214   //   |  |
19215   //   |  D
19216   //   | /
19217   //   E
19218   //
19219   // A: X = ...; Y = ...
19220   // D: empty
19221   // E: PHI [X, A], [X, C], [Y, D]
19222   //
19223   // Which, in our sitofp/fcmp example, gives us something like:
19224   //
19225   //         ucomiss %xmm1, %xmm0
19226   //         movss  <1.0f>, %xmm0
19227   //         jne     .LBB5_4
19228   //         jp      .LBB5_4
19229   //         xorps   %xmm0, %xmm0
19230   // .LBB5_4:
19231   //         retq
19232   //
19233   MachineInstr *NextCMOV = nullptr;
19234   MachineBasicBlock::iterator NextMIIt =
19235       std::next(MachineBasicBlock::iterator(MI));
19236   if (NextMIIt != BB->end() && NextMIIt->getOpcode() == MI->getOpcode() &&
19237       NextMIIt->getOperand(2).getReg() == MI->getOperand(2).getReg() &&
19238       NextMIIt->getOperand(1).getReg() == MI->getOperand(0).getReg())
19239     NextCMOV = &*NextMIIt;
19240
19241   MachineBasicBlock *jcc1MBB = nullptr;
19242
19243   // If we have a double CMOV, we lower it to two successive branches to
19244   // the same block.  EFLAGS is used by both, so mark it as live in the second.
19245   if (NextCMOV) {
19246     jcc1MBB = F->CreateMachineBasicBlock(LLVM_BB);
19247     F->insert(It, jcc1MBB);
19248     jcc1MBB->addLiveIn(X86::EFLAGS);
19249   }
19250
19251   MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
19252   MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
19253   F->insert(It, copy0MBB);
19254   F->insert(It, sinkMBB);
19255
19256   // If the EFLAGS register isn't dead in the terminator, then claim that it's
19257   // live into the sink and copy blocks.
19258   const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
19259
19260   MachineInstr *LastEFLAGSUser = NextCMOV ? NextCMOV : MI;
19261   if (!LastEFLAGSUser->killsRegister(X86::EFLAGS) &&
19262       !checkAndUpdateEFLAGSKill(LastEFLAGSUser, BB, TRI)) {
19263     copy0MBB->addLiveIn(X86::EFLAGS);
19264     sinkMBB->addLiveIn(X86::EFLAGS);
19265   }
19266
19267   // Transfer the remainder of BB and its successor edges to sinkMBB.
19268   sinkMBB->splice(sinkMBB->begin(), BB,
19269                   std::next(MachineBasicBlock::iterator(MI)), BB->end());
19270   sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
19271
19272   // Add the true and fallthrough blocks as its successors.
19273   if (NextCMOV) {
19274     // The fallthrough block may be jcc1MBB, if we have a double CMOV.
19275     BB->addSuccessor(jcc1MBB);
19276
19277     // In that case, jcc1MBB will itself fallthrough the copy0MBB, and
19278     // jump to the sinkMBB.
19279     jcc1MBB->addSuccessor(copy0MBB);
19280     jcc1MBB->addSuccessor(sinkMBB);
19281   } else {
19282     BB->addSuccessor(copy0MBB);
19283   }
19284
19285   // The true block target of the first (or only) branch is always sinkMBB.
19286   BB->addSuccessor(sinkMBB);
19287
19288   // Create the conditional branch instruction.
19289   unsigned Opc =
19290     X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm());
19291   BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);
19292
19293   if (NextCMOV) {
19294     unsigned Opc2 = X86::GetCondBranchFromCond(
19295         (X86::CondCode)NextCMOV->getOperand(3).getImm());
19296     BuildMI(jcc1MBB, DL, TII->get(Opc2)).addMBB(sinkMBB);
19297   }
19298
19299   //  copy0MBB:
19300   //   %FalseValue = ...
19301   //   # fallthrough to sinkMBB
19302   copy0MBB->addSuccessor(sinkMBB);
19303
19304   //  sinkMBB:
19305   //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
19306   //  ...
19307   MachineInstrBuilder MIB =
19308       BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI),
19309               MI->getOperand(0).getReg())
19310           .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
19311           .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
19312
19313   // If we have a double CMOV, the second Jcc provides the same incoming
19314   // value as the first Jcc (the True operand of the SELECT_CC/CMOV nodes).
19315   if (NextCMOV) {
19316     MIB.addReg(MI->getOperand(2).getReg()).addMBB(jcc1MBB);
19317     // Copy the PHI result to the register defined by the second CMOV.
19318     BuildMI(*sinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())),
19319             DL, TII->get(TargetOpcode::COPY), NextCMOV->getOperand(0).getReg())
19320         .addReg(MI->getOperand(0).getReg());
19321     NextCMOV->eraseFromParent();
19322   }
19323
19324   MI->eraseFromParent();   // The pseudo instruction is gone now.
19325   return sinkMBB;
19326 }
19327
19328 MachineBasicBlock *
19329 X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI,
19330                                         MachineBasicBlock *BB) const {
19331   MachineFunction *MF = BB->getParent();
19332   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
19333   DebugLoc DL = MI->getDebugLoc();
19334   const BasicBlock *LLVM_BB = BB->getBasicBlock();
19335
19336   assert(MF->shouldSplitStack());
19337
19338   const bool Is64Bit = Subtarget->is64Bit();
19339   const bool IsLP64 = Subtarget->isTarget64BitLP64();
19340
19341   const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
19342   const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
19343
19344   // BB:
19345   //  ... [Till the alloca]
19346   // If stacklet is not large enough, jump to mallocMBB
19347   //
19348   // bumpMBB:
19349   //  Allocate by subtracting from RSP
19350   //  Jump to continueMBB
19351   //
19352   // mallocMBB:
19353   //  Allocate by call to runtime
19354   //
19355   // continueMBB:
19356   //  ...
19357   //  [rest of original BB]
19358   //
19359
19360   MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
19361   MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
19362   MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
19363
19364   MachineRegisterInfo &MRI = MF->getRegInfo();
19365   const TargetRegisterClass *AddrRegClass =
19366     getRegClassFor(getPointerTy());
19367
19368   unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
19369     bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
19370     tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
19371     SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
19372     sizeVReg = MI->getOperand(1).getReg(),
19373     physSPReg = IsLP64 || Subtarget->isTargetNaCl64() ? X86::RSP : X86::ESP;
19374
19375   MachineFunction::iterator MBBIter = BB;
19376   ++MBBIter;
19377
19378   MF->insert(MBBIter, bumpMBB);
19379   MF->insert(MBBIter, mallocMBB);
19380   MF->insert(MBBIter, continueMBB);
19381
19382   continueMBB->splice(continueMBB->begin(), BB,
19383                       std::next(MachineBasicBlock::iterator(MI)), BB->end());
19384   continueMBB->transferSuccessorsAndUpdatePHIs(BB);
19385
19386   // Add code to the main basic block to check if the stack limit has been hit,
19387   // and if so, jump to mallocMBB otherwise to bumpMBB.
19388   BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
19389   BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
19390     .addReg(tmpSPVReg).addReg(sizeVReg);
19391   BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
19392     .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
19393     .addReg(SPLimitVReg);
19394   BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB);
19395
19396   // bumpMBB simply decreases the stack pointer, since we know the current
19397   // stacklet has enough space.
19398   BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
19399     .addReg(SPLimitVReg);
19400   BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
19401     .addReg(SPLimitVReg);
19402   BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
19403
19404   // Calls into a routine in libgcc to allocate more space from the heap.
19405   const uint32_t *RegMask =
19406       Subtarget->getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
19407   if (IsLP64) {
19408     BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
19409       .addReg(sizeVReg);
19410     BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
19411       .addExternalSymbol("__morestack_allocate_stack_space")
19412       .addRegMask(RegMask)
19413       .addReg(X86::RDI, RegState::Implicit)
19414       .addReg(X86::RAX, RegState::ImplicitDefine);
19415   } else if (Is64Bit) {
19416     BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
19417       .addReg(sizeVReg);
19418     BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
19419       .addExternalSymbol("__morestack_allocate_stack_space")
19420       .addRegMask(RegMask)
19421       .addReg(X86::EDI, RegState::Implicit)
19422       .addReg(X86::EAX, RegState::ImplicitDefine);
19423   } else {
19424     BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
19425       .addImm(12);
19426     BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
19427     BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
19428       .addExternalSymbol("__morestack_allocate_stack_space")
19429       .addRegMask(RegMask)
19430       .addReg(X86::EAX, RegState::ImplicitDefine);
19431   }
19432
19433   if (!Is64Bit)
19434     BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
19435       .addImm(16);
19436
19437   BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
19438     .addReg(IsLP64 ? X86::RAX : X86::EAX);
19439   BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
19440
19441   // Set up the CFG correctly.
19442   BB->addSuccessor(bumpMBB);
19443   BB->addSuccessor(mallocMBB);
19444   mallocMBB->addSuccessor(continueMBB);
19445   bumpMBB->addSuccessor(continueMBB);
19446
19447   // Take care of the PHI nodes.
19448   BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
19449           MI->getOperand(0).getReg())
19450     .addReg(mallocPtrVReg).addMBB(mallocMBB)
19451     .addReg(bumpSPPtrVReg).addMBB(bumpMBB);
19452
19453   // Delete the original pseudo instruction.
19454   MI->eraseFromParent();
19455
19456   // And we're done.
19457   return continueMBB;
19458 }
19459
19460 MachineBasicBlock *
19461 X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI,
19462                                         MachineBasicBlock *BB) const {
19463   DebugLoc DL = MI->getDebugLoc();
19464
19465   assert(!Subtarget->isTargetMachO());
19466
19467   X86FrameLowering::emitStackProbeCall(*BB->getParent(), *BB, MI, DL);
19468
19469   MI->eraseFromParent();   // The pseudo instruction is gone now.
19470   return BB;
19471 }
19472
19473 MachineBasicBlock *
19474 X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI,
19475                                       MachineBasicBlock *BB) const {
19476   // This is pretty easy.  We're taking the value that we received from
19477   // our load from the relocation, sticking it in either RDI (x86-64)
19478   // or EAX and doing an indirect call.  The return value will then
19479   // be in the normal return register.
19480   MachineFunction *F = BB->getParent();
19481   const X86InstrInfo *TII = Subtarget->getInstrInfo();
19482   DebugLoc DL = MI->getDebugLoc();
19483
19484   assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?");
19485   assert(MI->getOperand(3).isGlobal() && "This should be a global");
19486
19487   // Get a register mask for the lowered call.
19488   // FIXME: The 32-bit calls have non-standard calling conventions. Use a
19489   // proper register mask.
19490   const uint32_t *RegMask =
19491       Subtarget->getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
19492   if (Subtarget->is64Bit()) {
19493     MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
19494                                       TII->get(X86::MOV64rm), X86::RDI)
19495     .addReg(X86::RIP)
19496     .addImm(0).addReg(0)
19497     .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
19498                       MI->getOperand(3).getTargetFlags())
19499     .addReg(0);
19500     MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
19501     addDirectMem(MIB, X86::RDI);
19502     MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
19503   } else if (F->getTarget().getRelocationModel() != Reloc::PIC_) {
19504     MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
19505                                       TII->get(X86::MOV32rm), X86::EAX)
19506     .addReg(0)
19507     .addImm(0).addReg(0)
19508     .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
19509                       MI->getOperand(3).getTargetFlags())
19510     .addReg(0);
19511     MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
19512     addDirectMem(MIB, X86::EAX);
19513     MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
19514   } else {
19515     MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
19516                                       TII->get(X86::MOV32rm), X86::EAX)
19517     .addReg(TII->getGlobalBaseReg(F))
19518     .addImm(0).addReg(0)
19519     .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
19520                       MI->getOperand(3).getTargetFlags())
19521     .addReg(0);
19522     MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
19523     addDirectMem(MIB, X86::EAX);
19524     MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
19525   }
19526
19527   MI->eraseFromParent(); // The pseudo instruction is gone now.
19528   return BB;
19529 }
19530
19531 MachineBasicBlock *
19532 X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
19533                                     MachineBasicBlock *MBB) const {
19534   DebugLoc DL = MI->getDebugLoc();
19535   MachineFunction *MF = MBB->getParent();
19536   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
19537   MachineRegisterInfo &MRI = MF->getRegInfo();
19538
19539   const BasicBlock *BB = MBB->getBasicBlock();
19540   MachineFunction::iterator I = MBB;
19541   ++I;
19542
19543   // Memory Reference
19544   MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
19545   MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
19546
19547   unsigned DstReg;
19548   unsigned MemOpndSlot = 0;
19549
19550   unsigned CurOp = 0;
19551
19552   DstReg = MI->getOperand(CurOp++).getReg();
19553   const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
19554   assert(RC->hasType(MVT::i32) && "Invalid destination!");
19555   unsigned mainDstReg = MRI.createVirtualRegister(RC);
19556   unsigned restoreDstReg = MRI.createVirtualRegister(RC);
19557
19558   MemOpndSlot = CurOp;
19559
19560   MVT PVT = getPointerTy();
19561   assert((PVT == MVT::i64 || PVT == MVT::i32) &&
19562          "Invalid Pointer Size!");
19563
19564   // For v = setjmp(buf), we generate
19565   //
19566   // thisMBB:
19567   //  buf[LabelOffset] = restoreMBB
19568   //  SjLjSetup restoreMBB
19569   //
19570   // mainMBB:
19571   //  v_main = 0
19572   //
19573   // sinkMBB:
19574   //  v = phi(main, restore)
19575   //
19576   // restoreMBB:
19577   //  if base pointer being used, load it from frame
19578   //  v_restore = 1
19579
19580   MachineBasicBlock *thisMBB = MBB;
19581   MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
19582   MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
19583   MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
19584   MF->insert(I, mainMBB);
19585   MF->insert(I, sinkMBB);
19586   MF->push_back(restoreMBB);
19587
19588   MachineInstrBuilder MIB;
19589
19590   // Transfer the remainder of BB and its successor edges to sinkMBB.
19591   sinkMBB->splice(sinkMBB->begin(), MBB,
19592                   std::next(MachineBasicBlock::iterator(MI)), MBB->end());
19593   sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
19594
19595   // thisMBB:
19596   unsigned PtrStoreOpc = 0;
19597   unsigned LabelReg = 0;
19598   const int64_t LabelOffset = 1 * PVT.getStoreSize();
19599   Reloc::Model RM = MF->getTarget().getRelocationModel();
19600   bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
19601                      (RM == Reloc::Static || RM == Reloc::DynamicNoPIC);
19602
19603   // Prepare IP either in reg or imm.
19604   if (!UseImmLabel) {
19605     PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
19606     const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
19607     LabelReg = MRI.createVirtualRegister(PtrRC);
19608     if (Subtarget->is64Bit()) {
19609       MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
19610               .addReg(X86::RIP)
19611               .addImm(0)
19612               .addReg(0)
19613               .addMBB(restoreMBB)
19614               .addReg(0);
19615     } else {
19616       const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
19617       MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
19618               .addReg(XII->getGlobalBaseReg(MF))
19619               .addImm(0)
19620               .addReg(0)
19621               .addMBB(restoreMBB, Subtarget->ClassifyBlockAddressReference())
19622               .addReg(0);
19623     }
19624   } else
19625     PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
19626   // Store IP
19627   MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
19628   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
19629     if (i == X86::AddrDisp)
19630       MIB.addDisp(MI->getOperand(MemOpndSlot + i), LabelOffset);
19631     else
19632       MIB.addOperand(MI->getOperand(MemOpndSlot + i));
19633   }
19634   if (!UseImmLabel)
19635     MIB.addReg(LabelReg);
19636   else
19637     MIB.addMBB(restoreMBB);
19638   MIB.setMemRefs(MMOBegin, MMOEnd);
19639   // Setup
19640   MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
19641           .addMBB(restoreMBB);
19642
19643   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
19644   MIB.addRegMask(RegInfo->getNoPreservedMask());
19645   thisMBB->addSuccessor(mainMBB);
19646   thisMBB->addSuccessor(restoreMBB);
19647
19648   // mainMBB:
19649   //  EAX = 0
19650   BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
19651   mainMBB->addSuccessor(sinkMBB);
19652
19653   // sinkMBB:
19654   BuildMI(*sinkMBB, sinkMBB->begin(), DL,
19655           TII->get(X86::PHI), DstReg)
19656     .addReg(mainDstReg).addMBB(mainMBB)
19657     .addReg(restoreDstReg).addMBB(restoreMBB);
19658
19659   // restoreMBB:
19660   if (RegInfo->hasBasePointer(*MF)) {
19661     const bool Uses64BitFramePtr =
19662         Subtarget->isTarget64BitLP64() || Subtarget->isTargetNaCl64();
19663     X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
19664     X86FI->setRestoreBasePointer(MF);
19665     unsigned FramePtr = RegInfo->getFrameRegister(*MF);
19666     unsigned BasePtr = RegInfo->getBaseRegister();
19667     unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
19668     addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
19669                  FramePtr, true, X86FI->getRestoreBasePointerOffset())
19670       .setMIFlag(MachineInstr::FrameSetup);
19671   }
19672   BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
19673   BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
19674   restoreMBB->addSuccessor(sinkMBB);
19675
19676   MI->eraseFromParent();
19677   return sinkMBB;
19678 }
19679
19680 MachineBasicBlock *
19681 X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
19682                                      MachineBasicBlock *MBB) const {
19683   DebugLoc DL = MI->getDebugLoc();
19684   MachineFunction *MF = MBB->getParent();
19685   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
19686   MachineRegisterInfo &MRI = MF->getRegInfo();
19687
19688   // Memory Reference
19689   MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
19690   MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
19691
19692   MVT PVT = getPointerTy();
19693   assert((PVT == MVT::i64 || PVT == MVT::i32) &&
19694          "Invalid Pointer Size!");
19695
19696   const TargetRegisterClass *RC =
19697     (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
19698   unsigned Tmp = MRI.createVirtualRegister(RC);
19699   // Since FP is only updated here but NOT referenced, it's treated as GPR.
19700   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
19701   unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
19702   unsigned SP = RegInfo->getStackRegister();
19703
19704   MachineInstrBuilder MIB;
19705
19706   const int64_t LabelOffset = 1 * PVT.getStoreSize();
19707   const int64_t SPOffset = 2 * PVT.getStoreSize();
19708
19709   unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
19710   unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
19711
19712   // Reload FP
19713   MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP);
19714   for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
19715     MIB.addOperand(MI->getOperand(i));
19716   MIB.setMemRefs(MMOBegin, MMOEnd);
19717   // Reload IP
19718   MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
19719   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
19720     if (i == X86::AddrDisp)
19721       MIB.addDisp(MI->getOperand(i), LabelOffset);
19722     else
19723       MIB.addOperand(MI->getOperand(i));
19724   }
19725   MIB.setMemRefs(MMOBegin, MMOEnd);
19726   // Reload SP
19727   MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP);
19728   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
19729     if (i == X86::AddrDisp)
19730       MIB.addDisp(MI->getOperand(i), SPOffset);
19731     else
19732       MIB.addOperand(MI->getOperand(i));
19733   }
19734   MIB.setMemRefs(MMOBegin, MMOEnd);
19735   // Jump
19736   BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
19737
19738   MI->eraseFromParent();
19739   return MBB;
19740 }
19741
19742 // Replace 213-type (isel default) FMA3 instructions with 231-type for
19743 // accumulator loops. Writing back to the accumulator allows the coalescer
19744 // to remove extra copies in the loop.
19745 MachineBasicBlock *
19746 X86TargetLowering::emitFMA3Instr(MachineInstr *MI,
19747                                  MachineBasicBlock *MBB) const {
19748   MachineOperand &AddendOp = MI->getOperand(3);
19749
19750   // Bail out early if the addend isn't a register - we can't switch these.
19751   if (!AddendOp.isReg())
19752     return MBB;
19753
19754   MachineFunction &MF = *MBB->getParent();
19755   MachineRegisterInfo &MRI = MF.getRegInfo();
19756
19757   // Check whether the addend is defined by a PHI:
19758   assert(MRI.hasOneDef(AddendOp.getReg()) && "Multiple defs in SSA?");
19759   MachineInstr &AddendDef = *MRI.def_instr_begin(AddendOp.getReg());
19760   if (!AddendDef.isPHI())
19761     return MBB;
19762
19763   // Look for the following pattern:
19764   // loop:
19765   //   %addend = phi [%entry, 0], [%loop, %result]
19766   //   ...
19767   //   %result<tied1> = FMA213 %m2<tied0>, %m1, %addend
19768
19769   // Replace with:
19770   //   loop:
19771   //   %addend = phi [%entry, 0], [%loop, %result]
19772   //   ...
19773   //   %result<tied1> = FMA231 %addend<tied0>, %m1, %m2
19774
19775   for (unsigned i = 1, e = AddendDef.getNumOperands(); i < e; i += 2) {
19776     assert(AddendDef.getOperand(i).isReg());
19777     MachineOperand PHISrcOp = AddendDef.getOperand(i);
19778     MachineInstr &PHISrcInst = *MRI.def_instr_begin(PHISrcOp.getReg());
19779     if (&PHISrcInst == MI) {
19780       // Found a matching instruction.
19781       unsigned NewFMAOpc = 0;
19782       switch (MI->getOpcode()) {
19783         case X86::VFMADDPDr213r: NewFMAOpc = X86::VFMADDPDr231r; break;
19784         case X86::VFMADDPSr213r: NewFMAOpc = X86::VFMADDPSr231r; break;
19785         case X86::VFMADDSDr213r: NewFMAOpc = X86::VFMADDSDr231r; break;
19786         case X86::VFMADDSSr213r: NewFMAOpc = X86::VFMADDSSr231r; break;
19787         case X86::VFMSUBPDr213r: NewFMAOpc = X86::VFMSUBPDr231r; break;
19788         case X86::VFMSUBPSr213r: NewFMAOpc = X86::VFMSUBPSr231r; break;
19789         case X86::VFMSUBSDr213r: NewFMAOpc = X86::VFMSUBSDr231r; break;
19790         case X86::VFMSUBSSr213r: NewFMAOpc = X86::VFMSUBSSr231r; break;
19791         case X86::VFNMADDPDr213r: NewFMAOpc = X86::VFNMADDPDr231r; break;
19792         case X86::VFNMADDPSr213r: NewFMAOpc = X86::VFNMADDPSr231r; break;
19793         case X86::VFNMADDSDr213r: NewFMAOpc = X86::VFNMADDSDr231r; break;
19794         case X86::VFNMADDSSr213r: NewFMAOpc = X86::VFNMADDSSr231r; break;
19795         case X86::VFNMSUBPDr213r: NewFMAOpc = X86::VFNMSUBPDr231r; break;
19796         case X86::VFNMSUBPSr213r: NewFMAOpc = X86::VFNMSUBPSr231r; break;
19797         case X86::VFNMSUBSDr213r: NewFMAOpc = X86::VFNMSUBSDr231r; break;
19798         case X86::VFNMSUBSSr213r: NewFMAOpc = X86::VFNMSUBSSr231r; break;
19799         case X86::VFMADDSUBPDr213r: NewFMAOpc = X86::VFMADDSUBPDr231r; break;
19800         case X86::VFMADDSUBPSr213r: NewFMAOpc = X86::VFMADDSUBPSr231r; break;
19801         case X86::VFMSUBADDPDr213r: NewFMAOpc = X86::VFMSUBADDPDr231r; break;
19802         case X86::VFMSUBADDPSr213r: NewFMAOpc = X86::VFMSUBADDPSr231r; break;
19803
19804         case X86::VFMADDPDr213rY: NewFMAOpc = X86::VFMADDPDr231rY; break;
19805         case X86::VFMADDPSr213rY: NewFMAOpc = X86::VFMADDPSr231rY; break;
19806         case X86::VFMSUBPDr213rY: NewFMAOpc = X86::VFMSUBPDr231rY; break;
19807         case X86::VFMSUBPSr213rY: NewFMAOpc = X86::VFMSUBPSr231rY; break;
19808         case X86::VFNMADDPDr213rY: NewFMAOpc = X86::VFNMADDPDr231rY; break;
19809         case X86::VFNMADDPSr213rY: NewFMAOpc = X86::VFNMADDPSr231rY; break;
19810         case X86::VFNMSUBPDr213rY: NewFMAOpc = X86::VFNMSUBPDr231rY; break;
19811         case X86::VFNMSUBPSr213rY: NewFMAOpc = X86::VFNMSUBPSr231rY; break;
19812         case X86::VFMADDSUBPDr213rY: NewFMAOpc = X86::VFMADDSUBPDr231rY; break;
19813         case X86::VFMADDSUBPSr213rY: NewFMAOpc = X86::VFMADDSUBPSr231rY; break;
19814         case X86::VFMSUBADDPDr213rY: NewFMAOpc = X86::VFMSUBADDPDr231rY; break;
19815         case X86::VFMSUBADDPSr213rY: NewFMAOpc = X86::VFMSUBADDPSr231rY; break;
19816         default: llvm_unreachable("Unrecognized FMA variant.");
19817       }
19818
19819       const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
19820       MachineInstrBuilder MIB =
19821         BuildMI(MF, MI->getDebugLoc(), TII.get(NewFMAOpc))
19822         .addOperand(MI->getOperand(0))
19823         .addOperand(MI->getOperand(3))
19824         .addOperand(MI->getOperand(2))
19825         .addOperand(MI->getOperand(1));
19826       MBB->insert(MachineBasicBlock::iterator(MI), MIB);
19827       MI->eraseFromParent();
19828     }
19829   }
19830
19831   return MBB;
19832 }
19833
19834 MachineBasicBlock *
19835 X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
19836                                                MachineBasicBlock *BB) const {
19837   switch (MI->getOpcode()) {
19838   default: llvm_unreachable("Unexpected instr type to insert");
19839   case X86::TAILJMPd64:
19840   case X86::TAILJMPr64:
19841   case X86::TAILJMPm64:
19842   case X86::TAILJMPd64_REX:
19843   case X86::TAILJMPr64_REX:
19844   case X86::TAILJMPm64_REX:
19845     llvm_unreachable("TAILJMP64 would not be touched here.");
19846   case X86::TCRETURNdi64:
19847   case X86::TCRETURNri64:
19848   case X86::TCRETURNmi64:
19849     return BB;
19850   case X86::WIN_ALLOCA:
19851     return EmitLoweredWinAlloca(MI, BB);
19852   case X86::SEG_ALLOCA_32:
19853   case X86::SEG_ALLOCA_64:
19854     return EmitLoweredSegAlloca(MI, BB);
19855   case X86::TLSCall_32:
19856   case X86::TLSCall_64:
19857     return EmitLoweredTLSCall(MI, BB);
19858   case X86::CMOV_GR8:
19859   case X86::CMOV_FR32:
19860   case X86::CMOV_FR64:
19861   case X86::CMOV_V4F32:
19862   case X86::CMOV_V2F64:
19863   case X86::CMOV_V2I64:
19864   case X86::CMOV_V8F32:
19865   case X86::CMOV_V4F64:
19866   case X86::CMOV_V4I64:
19867   case X86::CMOV_V16F32:
19868   case X86::CMOV_V8F64:
19869   case X86::CMOV_V8I64:
19870   case X86::CMOV_GR16:
19871   case X86::CMOV_GR32:
19872   case X86::CMOV_RFP32:
19873   case X86::CMOV_RFP64:
19874   case X86::CMOV_RFP80:
19875   case X86::CMOV_V8I1:
19876   case X86::CMOV_V16I1:
19877   case X86::CMOV_V32I1:
19878   case X86::CMOV_V64I1:
19879     return EmitLoweredSelect(MI, BB);
19880
19881   case X86::FP32_TO_INT16_IN_MEM:
19882   case X86::FP32_TO_INT32_IN_MEM:
19883   case X86::FP32_TO_INT64_IN_MEM:
19884   case X86::FP64_TO_INT16_IN_MEM:
19885   case X86::FP64_TO_INT32_IN_MEM:
19886   case X86::FP64_TO_INT64_IN_MEM:
19887   case X86::FP80_TO_INT16_IN_MEM:
19888   case X86::FP80_TO_INT32_IN_MEM:
19889   case X86::FP80_TO_INT64_IN_MEM: {
19890     MachineFunction *F = BB->getParent();
19891     const TargetInstrInfo *TII = Subtarget->getInstrInfo();
19892     DebugLoc DL = MI->getDebugLoc();
19893
19894     // Change the floating point control register to use "round towards zero"
19895     // mode when truncating to an integer value.
19896     int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false);
19897     addFrameReference(BuildMI(*BB, MI, DL,
19898                               TII->get(X86::FNSTCW16m)), CWFrameIdx);
19899
19900     // Load the old value of the high byte of the control word...
19901     unsigned OldCW =
19902       F->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
19903     addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
19904                       CWFrameIdx);
19905
19906     // Set the high part to be round to zero...
19907     addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
19908       .addImm(0xC7F);
19909
19910     // Reload the modified control word now...
19911     addFrameReference(BuildMI(*BB, MI, DL,
19912                               TII->get(X86::FLDCW16m)), CWFrameIdx);
19913
19914     // Restore the memory image of control word to original value
19915     addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
19916       .addReg(OldCW);
19917
19918     // Get the X86 opcode to use.
19919     unsigned Opc;
19920     switch (MI->getOpcode()) {
19921     default: llvm_unreachable("illegal opcode!");
19922     case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
19923     case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
19924     case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
19925     case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
19926     case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
19927     case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
19928     case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
19929     case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
19930     case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
19931     }
19932
19933     X86AddressMode AM;
19934     MachineOperand &Op = MI->getOperand(0);
19935     if (Op.isReg()) {
19936       AM.BaseType = X86AddressMode::RegBase;
19937       AM.Base.Reg = Op.getReg();
19938     } else {
19939       AM.BaseType = X86AddressMode::FrameIndexBase;
19940       AM.Base.FrameIndex = Op.getIndex();
19941     }
19942     Op = MI->getOperand(1);
19943     if (Op.isImm())
19944       AM.Scale = Op.getImm();
19945     Op = MI->getOperand(2);
19946     if (Op.isImm())
19947       AM.IndexReg = Op.getImm();
19948     Op = MI->getOperand(3);
19949     if (Op.isGlobal()) {
19950       AM.GV = Op.getGlobal();
19951     } else {
19952       AM.Disp = Op.getImm();
19953     }
19954     addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
19955                       .addReg(MI->getOperand(X86::AddrNumOperands).getReg());
19956
19957     // Reload the original control word now.
19958     addFrameReference(BuildMI(*BB, MI, DL,
19959                               TII->get(X86::FLDCW16m)), CWFrameIdx);
19960
19961     MI->eraseFromParent();   // The pseudo instruction is gone now.
19962     return BB;
19963   }
19964     // String/text processing lowering.
19965   case X86::PCMPISTRM128REG:
19966   case X86::VPCMPISTRM128REG:
19967   case X86::PCMPISTRM128MEM:
19968   case X86::VPCMPISTRM128MEM:
19969   case X86::PCMPESTRM128REG:
19970   case X86::VPCMPESTRM128REG:
19971   case X86::PCMPESTRM128MEM:
19972   case X86::VPCMPESTRM128MEM:
19973     assert(Subtarget->hasSSE42() &&
19974            "Target must have SSE4.2 or AVX features enabled");
19975     return EmitPCMPSTRM(MI, BB, Subtarget->getInstrInfo());
19976
19977   // String/text processing lowering.
19978   case X86::PCMPISTRIREG:
19979   case X86::VPCMPISTRIREG:
19980   case X86::PCMPISTRIMEM:
19981   case X86::VPCMPISTRIMEM:
19982   case X86::PCMPESTRIREG:
19983   case X86::VPCMPESTRIREG:
19984   case X86::PCMPESTRIMEM:
19985   case X86::VPCMPESTRIMEM:
19986     assert(Subtarget->hasSSE42() &&
19987            "Target must have SSE4.2 or AVX features enabled");
19988     return EmitPCMPSTRI(MI, BB, Subtarget->getInstrInfo());
19989
19990   // Thread synchronization.
19991   case X86::MONITOR:
19992     return EmitMonitor(MI, BB, Subtarget);
19993
19994   // xbegin
19995   case X86::XBEGIN:
19996     return EmitXBegin(MI, BB, Subtarget->getInstrInfo());
19997
19998   case X86::VASTART_SAVE_XMM_REGS:
19999     return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
20000
20001   case X86::VAARG_64:
20002     return EmitVAARG64WithCustomInserter(MI, BB);
20003
20004   case X86::EH_SjLj_SetJmp32:
20005   case X86::EH_SjLj_SetJmp64:
20006     return emitEHSjLjSetJmp(MI, BB);
20007
20008   case X86::EH_SjLj_LongJmp32:
20009   case X86::EH_SjLj_LongJmp64:
20010     return emitEHSjLjLongJmp(MI, BB);
20011
20012   case TargetOpcode::STATEPOINT:
20013     // As an implementation detail, STATEPOINT shares the STACKMAP format at
20014     // this point in the process.  We diverge later.
20015     return emitPatchPoint(MI, BB);
20016
20017   case TargetOpcode::STACKMAP:
20018   case TargetOpcode::PATCHPOINT:
20019     return emitPatchPoint(MI, BB);
20020
20021   case X86::VFMADDPDr213r:
20022   case X86::VFMADDPSr213r:
20023   case X86::VFMADDSDr213r:
20024   case X86::VFMADDSSr213r:
20025   case X86::VFMSUBPDr213r:
20026   case X86::VFMSUBPSr213r:
20027   case X86::VFMSUBSDr213r:
20028   case X86::VFMSUBSSr213r:
20029   case X86::VFNMADDPDr213r:
20030   case X86::VFNMADDPSr213r:
20031   case X86::VFNMADDSDr213r:
20032   case X86::VFNMADDSSr213r:
20033   case X86::VFNMSUBPDr213r:
20034   case X86::VFNMSUBPSr213r:
20035   case X86::VFNMSUBSDr213r:
20036   case X86::VFNMSUBSSr213r:
20037   case X86::VFMADDSUBPDr213r:
20038   case X86::VFMADDSUBPSr213r:
20039   case X86::VFMSUBADDPDr213r:
20040   case X86::VFMSUBADDPSr213r:
20041   case X86::VFMADDPDr213rY:
20042   case X86::VFMADDPSr213rY:
20043   case X86::VFMSUBPDr213rY:
20044   case X86::VFMSUBPSr213rY:
20045   case X86::VFNMADDPDr213rY:
20046   case X86::VFNMADDPSr213rY:
20047   case X86::VFNMSUBPDr213rY:
20048   case X86::VFNMSUBPSr213rY:
20049   case X86::VFMADDSUBPDr213rY:
20050   case X86::VFMADDSUBPSr213rY:
20051   case X86::VFMSUBADDPDr213rY:
20052   case X86::VFMSUBADDPSr213rY:
20053     return emitFMA3Instr(MI, BB);
20054   }
20055 }
20056
20057 //===----------------------------------------------------------------------===//
20058 //                           X86 Optimization Hooks
20059 //===----------------------------------------------------------------------===//
20060
20061 void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
20062                                                       APInt &KnownZero,
20063                                                       APInt &KnownOne,
20064                                                       const SelectionDAG &DAG,
20065                                                       unsigned Depth) const {
20066   unsigned BitWidth = KnownZero.getBitWidth();
20067   unsigned Opc = Op.getOpcode();
20068   assert((Opc >= ISD::BUILTIN_OP_END ||
20069           Opc == ISD::INTRINSIC_WO_CHAIN ||
20070           Opc == ISD::INTRINSIC_W_CHAIN ||
20071           Opc == ISD::INTRINSIC_VOID) &&
20072          "Should use MaskedValueIsZero if you don't know whether Op"
20073          " is a target node!");
20074
20075   KnownZero = KnownOne = APInt(BitWidth, 0);   // Don't know anything.
20076   switch (Opc) {
20077   default: break;
20078   case X86ISD::ADD:
20079   case X86ISD::SUB:
20080   case X86ISD::ADC:
20081   case X86ISD::SBB:
20082   case X86ISD::SMUL:
20083   case X86ISD::UMUL:
20084   case X86ISD::INC:
20085   case X86ISD::DEC:
20086   case X86ISD::OR:
20087   case X86ISD::XOR:
20088   case X86ISD::AND:
20089     // These nodes' second result is a boolean.
20090     if (Op.getResNo() == 0)
20091       break;
20092     // Fallthrough
20093   case X86ISD::SETCC:
20094     KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
20095     break;
20096   case ISD::INTRINSIC_WO_CHAIN: {
20097     unsigned IntId = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
20098     unsigned NumLoBits = 0;
20099     switch (IntId) {
20100     default: break;
20101     case Intrinsic::x86_sse_movmsk_ps:
20102     case Intrinsic::x86_avx_movmsk_ps_256:
20103     case Intrinsic::x86_sse2_movmsk_pd:
20104     case Intrinsic::x86_avx_movmsk_pd_256:
20105     case Intrinsic::x86_mmx_pmovmskb:
20106     case Intrinsic::x86_sse2_pmovmskb_128:
20107     case Intrinsic::x86_avx2_pmovmskb: {
20108       // High bits of movmskp{s|d}, pmovmskb are known zero.
20109       switch (IntId) {
20110         default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
20111         case Intrinsic::x86_sse_movmsk_ps:      NumLoBits = 4; break;
20112         case Intrinsic::x86_avx_movmsk_ps_256:  NumLoBits = 8; break;
20113         case Intrinsic::x86_sse2_movmsk_pd:     NumLoBits = 2; break;
20114         case Intrinsic::x86_avx_movmsk_pd_256:  NumLoBits = 4; break;
20115         case Intrinsic::x86_mmx_pmovmskb:       NumLoBits = 8; break;
20116         case Intrinsic::x86_sse2_pmovmskb_128:  NumLoBits = 16; break;
20117         case Intrinsic::x86_avx2_pmovmskb:      NumLoBits = 32; break;
20118       }
20119       KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits);
20120       break;
20121     }
20122     }
20123     break;
20124   }
20125   }
20126 }
20127
20128 unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
20129   SDValue Op,
20130   const SelectionDAG &,
20131   unsigned Depth) const {
20132   // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
20133   if (Op.getOpcode() == X86ISD::SETCC_CARRY)
20134     return Op.getValueType().getScalarType().getSizeInBits();
20135
20136   // Fallback case.
20137   return 1;
20138 }
20139
20140 /// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the
20141 /// node is a GlobalAddress + offset.
20142 bool X86TargetLowering::isGAPlusOffset(SDNode *N,
20143                                        const GlobalValue* &GA,
20144                                        int64_t &Offset) const {
20145   if (N->getOpcode() == X86ISD::Wrapper) {
20146     if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
20147       GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
20148       Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
20149       return true;
20150     }
20151   }
20152   return TargetLowering::isGAPlusOffset(N, GA, Offset);
20153 }
20154
20155 /// isShuffleHigh128VectorInsertLow - Checks whether the shuffle node is the
20156 /// same as extracting the high 128-bit part of 256-bit vector and then
20157 /// inserting the result into the low part of a new 256-bit vector
20158 static bool isShuffleHigh128VectorInsertLow(ShuffleVectorSDNode *SVOp) {
20159   EVT VT = SVOp->getValueType(0);
20160   unsigned NumElems = VT.getVectorNumElements();
20161
20162   // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
20163   for (unsigned i = 0, j = NumElems/2; i != NumElems/2; ++i, ++j)
20164     if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
20165         SVOp->getMaskElt(j) >= 0)
20166       return false;
20167
20168   return true;
20169 }
20170
20171 /// isShuffleLow128VectorInsertHigh - Checks whether the shuffle node is the
20172 /// same as extracting the low 128-bit part of 256-bit vector and then
20173 /// inserting the result into the high part of a new 256-bit vector
20174 static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) {
20175   EVT VT = SVOp->getValueType(0);
20176   unsigned NumElems = VT.getVectorNumElements();
20177
20178   // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
20179   for (unsigned i = NumElems/2, j = 0; i != NumElems; ++i, ++j)
20180     if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
20181         SVOp->getMaskElt(j) >= 0)
20182       return false;
20183
20184   return true;
20185 }
20186
20187 /// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors.
20188 static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
20189                                         TargetLowering::DAGCombinerInfo &DCI,
20190                                         const X86Subtarget* Subtarget) {
20191   SDLoc dl(N);
20192   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
20193   SDValue V1 = SVOp->getOperand(0);
20194   SDValue V2 = SVOp->getOperand(1);
20195   EVT VT = SVOp->getValueType(0);
20196   unsigned NumElems = VT.getVectorNumElements();
20197
20198   if (V1.getOpcode() == ISD::CONCAT_VECTORS &&
20199       V2.getOpcode() == ISD::CONCAT_VECTORS) {
20200     //
20201     //                   0,0,0,...
20202     //                      |
20203     //    V      UNDEF    BUILD_VECTOR    UNDEF
20204     //     \      /           \           /
20205     //  CONCAT_VECTOR         CONCAT_VECTOR
20206     //         \                  /
20207     //          \                /
20208     //          RESULT: V + zero extended
20209     //
20210     if (V2.getOperand(0).getOpcode() != ISD::BUILD_VECTOR ||
20211         V2.getOperand(1).getOpcode() != ISD::UNDEF ||
20212         V1.getOperand(1).getOpcode() != ISD::UNDEF)
20213       return SDValue();
20214
20215     if (!ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode()))
20216       return SDValue();
20217
20218     // To match the shuffle mask, the first half of the mask should
20219     // be exactly the first vector, and all the rest a splat with the
20220     // first element of the second one.
20221     for (unsigned i = 0; i != NumElems/2; ++i)
20222       if (!isUndefOrEqual(SVOp->getMaskElt(i), i) ||
20223           !isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems))
20224         return SDValue();
20225
20226     // If V1 is coming from a vector load then just fold to a VZEXT_LOAD.
20227     if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(V1.getOperand(0))) {
20228       if (Ld->hasNUsesOfValue(1, 0)) {
20229         SDVTList Tys = DAG.getVTList(MVT::v4i64, MVT::Other);
20230         SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() };
20231         SDValue ResNode =
20232           DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
20233                                   Ld->getMemoryVT(),
20234                                   Ld->getPointerInfo(),
20235                                   Ld->getAlignment(),
20236                                   false/*isVolatile*/, true/*ReadMem*/,
20237                                   false/*WriteMem*/);
20238
20239         // Make sure the newly-created LOAD is in the same position as Ld in
20240         // terms of dependency. We create a TokenFactor for Ld and ResNode,
20241         // and update uses of Ld's output chain to use the TokenFactor.
20242         if (Ld->hasAnyUseOfValue(1)) {
20243           SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
20244                              SDValue(Ld, 1), SDValue(ResNode.getNode(), 1));
20245           DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
20246           DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(Ld, 1),
20247                                  SDValue(ResNode.getNode(), 1));
20248         }
20249
20250         return DAG.getBitcast(VT, ResNode);
20251       }
20252     }
20253
20254     // Emit a zeroed vector and insert the desired subvector on its
20255     // first half.
20256     SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
20257     SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0), 0, DAG, dl);
20258     return DCI.CombineTo(N, InsV);
20259   }
20260
20261   //===--------------------------------------------------------------------===//
20262   // Combine some shuffles into subvector extracts and inserts:
20263   //
20264
20265   // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
20266   if (isShuffleHigh128VectorInsertLow(SVOp)) {
20267     SDValue V = Extract128BitVector(V1, NumElems/2, DAG, dl);
20268     SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, 0, DAG, dl);
20269     return DCI.CombineTo(N, InsV);
20270   }
20271
20272   // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
20273   if (isShuffleLow128VectorInsertHigh(SVOp)) {
20274     SDValue V = Extract128BitVector(V1, 0, DAG, dl);
20275     SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, NumElems/2, DAG, dl);
20276     return DCI.CombineTo(N, InsV);
20277   }
20278
20279   return SDValue();
20280 }
20281
20282 /// \brief Combine an arbitrary chain of shuffles into a single instruction if
20283 /// possible.
20284 ///
20285 /// This is the leaf of the recursive combinine below. When we have found some
20286 /// chain of single-use x86 shuffle instructions and accumulated the combined
20287 /// shuffle mask represented by them, this will try to pattern match that mask
20288 /// into either a single instruction if there is a special purpose instruction
20289 /// for this operation, or into a PSHUFB instruction which is a fully general
20290 /// instruction but should only be used to replace chains over a certain depth.
20291 static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
20292                                    int Depth, bool HasPSHUFB, SelectionDAG &DAG,
20293                                    TargetLowering::DAGCombinerInfo &DCI,
20294                                    const X86Subtarget *Subtarget) {
20295   assert(!Mask.empty() && "Cannot combine an empty shuffle mask!");
20296
20297   // Find the operand that enters the chain. Note that multiple uses are OK
20298   // here, we're not going to remove the operand we find.
20299   SDValue Input = Op.getOperand(0);
20300   while (Input.getOpcode() == ISD::BITCAST)
20301     Input = Input.getOperand(0);
20302
20303   MVT VT = Input.getSimpleValueType();
20304   MVT RootVT = Root.getSimpleValueType();
20305   SDLoc DL(Root);
20306
20307   // Just remove no-op shuffle masks.
20308   if (Mask.size() == 1) {
20309     DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Input),
20310                   /*AddTo*/ true);
20311     return true;
20312   }
20313
20314   // Use the float domain if the operand type is a floating point type.
20315   bool FloatDomain = VT.isFloatingPoint();
20316
20317   // For floating point shuffles, we don't have free copies in the shuffle
20318   // instructions or the ability to load as part of the instruction, so
20319   // canonicalize their shuffles to UNPCK or MOV variants.
20320   //
20321   // Note that even with AVX we prefer the PSHUFD form of shuffle for integer
20322   // vectors because it can have a load folded into it that UNPCK cannot. This
20323   // doesn't preclude something switching to the shorter encoding post-RA.
20324   //
20325   // FIXME: Should teach these routines about AVX vector widths.
20326   if (FloatDomain && VT.getSizeInBits() == 128) {
20327     if (Mask.equals({0, 0}) || Mask.equals({1, 1})) {
20328       bool Lo = Mask.equals({0, 0});
20329       unsigned Shuffle;
20330       MVT ShuffleVT;
20331       // Check if we have SSE3 which will let us use MOVDDUP. That instruction
20332       // is no slower than UNPCKLPD but has the option to fold the input operand
20333       // into even an unaligned memory load.
20334       if (Lo && Subtarget->hasSSE3()) {
20335         Shuffle = X86ISD::MOVDDUP;
20336         ShuffleVT = MVT::v2f64;
20337       } else {
20338         // We have MOVLHPS and MOVHLPS throughout SSE and they encode smaller
20339         // than the UNPCK variants.
20340         Shuffle = Lo ? X86ISD::MOVLHPS : X86ISD::MOVHLPS;
20341         ShuffleVT = MVT::v4f32;
20342       }
20343       if (Depth == 1 && Root->getOpcode() == Shuffle)
20344         return false; // Nothing to do!
20345       Op = DAG.getBitcast(ShuffleVT, Input);
20346       DCI.AddToWorklist(Op.getNode());
20347       if (Shuffle == X86ISD::MOVDDUP)
20348         Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op);
20349       else
20350         Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
20351       DCI.AddToWorklist(Op.getNode());
20352       DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Op),
20353                     /*AddTo*/ true);
20354       return true;
20355     }
20356     if (Subtarget->hasSSE3() &&
20357         (Mask.equals({0, 0, 2, 2}) || Mask.equals({1, 1, 3, 3}))) {
20358       bool Lo = Mask.equals({0, 0, 2, 2});
20359       unsigned Shuffle = Lo ? X86ISD::MOVSLDUP : X86ISD::MOVSHDUP;
20360       MVT ShuffleVT = MVT::v4f32;
20361       if (Depth == 1 && Root->getOpcode() == Shuffle)
20362         return false; // Nothing to do!
20363       Op = DAG.getBitcast(ShuffleVT, Input);
20364       DCI.AddToWorklist(Op.getNode());
20365       Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op);
20366       DCI.AddToWorklist(Op.getNode());
20367       DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Op),
20368                     /*AddTo*/ true);
20369       return true;
20370     }
20371     if (Mask.equals({0, 0, 1, 1}) || Mask.equals({2, 2, 3, 3})) {
20372       bool Lo = Mask.equals({0, 0, 1, 1});
20373       unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
20374       MVT ShuffleVT = MVT::v4f32;
20375       if (Depth == 1 && Root->getOpcode() == Shuffle)
20376         return false; // Nothing to do!
20377       Op = DAG.getBitcast(ShuffleVT, Input);
20378       DCI.AddToWorklist(Op.getNode());
20379       Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
20380       DCI.AddToWorklist(Op.getNode());
20381       DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Op),
20382                     /*AddTo*/ true);
20383       return true;
20384     }
20385   }
20386
20387   // We always canonicalize the 8 x i16 and 16 x i8 shuffles into their UNPCK
20388   // variants as none of these have single-instruction variants that are
20389   // superior to the UNPCK formulation.
20390   if (!FloatDomain && VT.getSizeInBits() == 128 &&
20391       (Mask.equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
20392        Mask.equals({4, 4, 5, 5, 6, 6, 7, 7}) ||
20393        Mask.equals({0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7}) ||
20394        Mask.equals(
20395            {8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15}))) {
20396     bool Lo = Mask[0] == 0;
20397     unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
20398     if (Depth == 1 && Root->getOpcode() == Shuffle)
20399       return false; // Nothing to do!
20400     MVT ShuffleVT;
20401     switch (Mask.size()) {
20402     case 8:
20403       ShuffleVT = MVT::v8i16;
20404       break;
20405     case 16:
20406       ShuffleVT = MVT::v16i8;
20407       break;
20408     default:
20409       llvm_unreachable("Impossible mask size!");
20410     };
20411     Op = DAG.getBitcast(ShuffleVT, Input);
20412     DCI.AddToWorklist(Op.getNode());
20413     Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
20414     DCI.AddToWorklist(Op.getNode());
20415     DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Op),
20416                   /*AddTo*/ true);
20417     return true;
20418   }
20419
20420   // Don't try to re-form single instruction chains under any circumstances now
20421   // that we've done encoding canonicalization for them.
20422   if (Depth < 2)
20423     return false;
20424
20425   // If we have 3 or more shuffle instructions or a chain involving PSHUFB, we
20426   // can replace them with a single PSHUFB instruction profitably. Intel's
20427   // manuals suggest only using PSHUFB if doing so replacing 5 instructions, but
20428   // in practice PSHUFB tends to be *very* fast so we're more aggressive.
20429   if ((Depth >= 3 || HasPSHUFB) && Subtarget->hasSSSE3()) {
20430     SmallVector<SDValue, 16> PSHUFBMask;
20431     int NumBytes = VT.getSizeInBits() / 8;
20432     int Ratio = NumBytes / Mask.size();
20433     for (int i = 0; i < NumBytes; ++i) {
20434       if (Mask[i / Ratio] == SM_SentinelUndef) {
20435         PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
20436         continue;
20437       }
20438       int M = Mask[i / Ratio] != SM_SentinelZero
20439                   ? Ratio * Mask[i / Ratio] + i % Ratio
20440                   : 255;
20441       PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
20442     }
20443     MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
20444     Op = DAG.getBitcast(ByteVT, Input);
20445     DCI.AddToWorklist(Op.getNode());
20446     SDValue PSHUFBMaskOp =
20447         DAG.getNode(ISD::BUILD_VECTOR, DL, ByteVT, PSHUFBMask);
20448     DCI.AddToWorklist(PSHUFBMaskOp.getNode());
20449     Op = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Op, PSHUFBMaskOp);
20450     DCI.AddToWorklist(Op.getNode());
20451     DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Op),
20452                   /*AddTo*/ true);
20453     return true;
20454   }
20455
20456   // Failed to find any combines.
20457   return false;
20458 }
20459
20460 /// \brief Fully generic combining of x86 shuffle instructions.
20461 ///
20462 /// This should be the last combine run over the x86 shuffle instructions. Once
20463 /// they have been fully optimized, this will recursively consider all chains
20464 /// of single-use shuffle instructions, build a generic model of the cumulative
20465 /// shuffle operation, and check for simpler instructions which implement this
20466 /// operation. We use this primarily for two purposes:
20467 ///
20468 /// 1) Collapse generic shuffles to specialized single instructions when
20469 ///    equivalent. In most cases, this is just an encoding size win, but
20470 ///    sometimes we will collapse multiple generic shuffles into a single
20471 ///    special-purpose shuffle.
20472 /// 2) Look for sequences of shuffle instructions with 3 or more total
20473 ///    instructions, and replace them with the slightly more expensive SSSE3
20474 ///    PSHUFB instruction if available. We do this as the last combining step
20475 ///    to ensure we avoid using PSHUFB if we can implement the shuffle with
20476 ///    a suitable short sequence of other instructions. The PHUFB will either
20477 ///    use a register or have to read from memory and so is slightly (but only
20478 ///    slightly) more expensive than the other shuffle instructions.
20479 ///
20480 /// Because this is inherently a quadratic operation (for each shuffle in
20481 /// a chain, we recurse up the chain), the depth is limited to 8 instructions.
20482 /// This should never be an issue in practice as the shuffle lowering doesn't
20483 /// produce sequences of more than 8 instructions.
20484 ///
20485 /// FIXME: We will currently miss some cases where the redundant shuffling
20486 /// would simplify under the threshold for PSHUFB formation because of
20487 /// combine-ordering. To fix this, we should do the redundant instruction
20488 /// combining in this recursive walk.
20489 static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,
20490                                           ArrayRef<int> RootMask,
20491                                           int Depth, bool HasPSHUFB,
20492                                           SelectionDAG &DAG,
20493                                           TargetLowering::DAGCombinerInfo &DCI,
20494                                           const X86Subtarget *Subtarget) {
20495   // Bound the depth of our recursive combine because this is ultimately
20496   // quadratic in nature.
20497   if (Depth > 8)
20498     return false;
20499
20500   // Directly rip through bitcasts to find the underlying operand.
20501   while (Op.getOpcode() == ISD::BITCAST && Op.getOperand(0).hasOneUse())
20502     Op = Op.getOperand(0);
20503
20504   MVT VT = Op.getSimpleValueType();
20505   if (!VT.isVector())
20506     return false; // Bail if we hit a non-vector.
20507
20508   assert(Root.getSimpleValueType().isVector() &&
20509          "Shuffles operate on vector types!");
20510   assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() &&
20511          "Can only combine shuffles of the same vector register size.");
20512
20513   if (!isTargetShuffle(Op.getOpcode()))
20514     return false;
20515   SmallVector<int, 16> OpMask;
20516   bool IsUnary;
20517   bool HaveMask = getTargetShuffleMask(Op.getNode(), VT, OpMask, IsUnary);
20518   // We only can combine unary shuffles which we can decode the mask for.
20519   if (!HaveMask || !IsUnary)
20520     return false;
20521
20522   assert(VT.getVectorNumElements() == OpMask.size() &&
20523          "Different mask size from vector size!");
20524   assert(((RootMask.size() > OpMask.size() &&
20525            RootMask.size() % OpMask.size() == 0) ||
20526           (OpMask.size() > RootMask.size() &&
20527            OpMask.size() % RootMask.size() == 0) ||
20528           OpMask.size() == RootMask.size()) &&
20529          "The smaller number of elements must divide the larger.");
20530   int RootRatio = std::max<int>(1, OpMask.size() / RootMask.size());
20531   int OpRatio = std::max<int>(1, RootMask.size() / OpMask.size());
20532   assert(((RootRatio == 1 && OpRatio == 1) ||
20533           (RootRatio == 1) != (OpRatio == 1)) &&
20534          "Must not have a ratio for both incoming and op masks!");
20535
20536   SmallVector<int, 16> Mask;
20537   Mask.reserve(std::max(OpMask.size(), RootMask.size()));
20538
20539   // Merge this shuffle operation's mask into our accumulated mask. Note that
20540   // this shuffle's mask will be the first applied to the input, followed by the
20541   // root mask to get us all the way to the root value arrangement. The reason
20542   // for this order is that we are recursing up the operation chain.
20543   for (int i = 0, e = std::max(OpMask.size(), RootMask.size()); i < e; ++i) {
20544     int RootIdx = i / RootRatio;
20545     if (RootMask[RootIdx] < 0) {
20546       // This is a zero or undef lane, we're done.
20547       Mask.push_back(RootMask[RootIdx]);
20548       continue;
20549     }
20550
20551     int RootMaskedIdx = RootMask[RootIdx] * RootRatio + i % RootRatio;
20552     int OpIdx = RootMaskedIdx / OpRatio;
20553     if (OpMask[OpIdx] < 0) {
20554       // The incoming lanes are zero or undef, it doesn't matter which ones we
20555       // are using.
20556       Mask.push_back(OpMask[OpIdx]);
20557       continue;
20558     }
20559
20560     // Ok, we have non-zero lanes, map them through.
20561     Mask.push_back(OpMask[OpIdx] * OpRatio +
20562                    RootMaskedIdx % OpRatio);
20563   }
20564
20565   // See if we can recurse into the operand to combine more things.
20566   switch (Op.getOpcode()) {
20567     case X86ISD::PSHUFB:
20568       HasPSHUFB = true;
20569     case X86ISD::PSHUFD:
20570     case X86ISD::PSHUFHW:
20571     case X86ISD::PSHUFLW:
20572       if (Op.getOperand(0).hasOneUse() &&
20573           combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1,
20574                                         HasPSHUFB, DAG, DCI, Subtarget))
20575         return true;
20576       break;
20577
20578     case X86ISD::UNPCKL:
20579     case X86ISD::UNPCKH:
20580       assert(Op.getOperand(0) == Op.getOperand(1) && "We only combine unary shuffles!");
20581       // We can't check for single use, we have to check that this shuffle is the only user.
20582       if (Op->isOnlyUserOf(Op.getOperand(0).getNode()) &&
20583           combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1,
20584                                         HasPSHUFB, DAG, DCI, Subtarget))
20585           return true;
20586       break;
20587   }
20588
20589   // Minor canonicalization of the accumulated shuffle mask to make it easier
20590   // to match below. All this does is detect masks with squential pairs of
20591   // elements, and shrink them to the half-width mask. It does this in a loop
20592   // so it will reduce the size of the mask to the minimal width mask which
20593   // performs an equivalent shuffle.
20594   SmallVector<int, 16> WidenedMask;
20595   while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
20596     Mask = std::move(WidenedMask);
20597     WidenedMask.clear();
20598   }
20599
20600   return combineX86ShuffleChain(Op, Root, Mask, Depth, HasPSHUFB, DAG, DCI,
20601                                 Subtarget);
20602 }
20603
20604 /// \brief Get the PSHUF-style mask from PSHUF node.
20605 ///
20606 /// This is a very minor wrapper around getTargetShuffleMask to easy forming v4
20607 /// PSHUF-style masks that can be reused with such instructions.
20608 static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
20609   MVT VT = N.getSimpleValueType();
20610   SmallVector<int, 4> Mask;
20611   bool IsUnary;
20612   bool HaveMask = getTargetShuffleMask(N.getNode(), VT, Mask, IsUnary);
20613   (void)HaveMask;
20614   assert(HaveMask);
20615
20616   // If we have more than 128-bits, only the low 128-bits of shuffle mask
20617   // matter. Check that the upper masks are repeats and remove them.
20618   if (VT.getSizeInBits() > 128) {
20619     int LaneElts = 128 / VT.getScalarSizeInBits();
20620 #ifndef NDEBUG
20621     for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
20622       for (int j = 0; j < LaneElts; ++j)
20623         assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
20624                "Mask doesn't repeat in high 128-bit lanes!");
20625 #endif
20626     Mask.resize(LaneElts);
20627   }
20628
20629   switch (N.getOpcode()) {
20630   case X86ISD::PSHUFD:
20631     return Mask;
20632   case X86ISD::PSHUFLW:
20633     Mask.resize(4);
20634     return Mask;
20635   case X86ISD::PSHUFHW:
20636     Mask.erase(Mask.begin(), Mask.begin() + 4);
20637     for (int &M : Mask)
20638       M -= 4;
20639     return Mask;
20640   default:
20641     llvm_unreachable("No valid shuffle instruction found!");
20642   }
20643 }
20644
20645 /// \brief Search for a combinable shuffle across a chain ending in pshufd.
20646 ///
20647 /// We walk up the chain and look for a combinable shuffle, skipping over
20648 /// shuffles that we could hoist this shuffle's transformation past without
20649 /// altering anything.
20650 static SDValue
20651 combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
20652                              SelectionDAG &DAG,
20653                              TargetLowering::DAGCombinerInfo &DCI) {
20654   assert(N.getOpcode() == X86ISD::PSHUFD &&
20655          "Called with something other than an x86 128-bit half shuffle!");
20656   SDLoc DL(N);
20657
20658   // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
20659   // of the shuffles in the chain so that we can form a fresh chain to replace
20660   // this one.
20661   SmallVector<SDValue, 8> Chain;
20662   SDValue V = N.getOperand(0);
20663   for (; V.hasOneUse(); V = V.getOperand(0)) {
20664     switch (V.getOpcode()) {
20665     default:
20666       return SDValue(); // Nothing combined!
20667
20668     case ISD::BITCAST:
20669       // Skip bitcasts as we always know the type for the target specific
20670       // instructions.
20671       continue;
20672
20673     case X86ISD::PSHUFD:
20674       // Found another dword shuffle.
20675       break;
20676
20677     case X86ISD::PSHUFLW:
20678       // Check that the low words (being shuffled) are the identity in the
20679       // dword shuffle, and the high words are self-contained.
20680       if (Mask[0] != 0 || Mask[1] != 1 ||
20681           !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
20682         return SDValue();
20683
20684       Chain.push_back(V);
20685       continue;
20686
20687     case X86ISD::PSHUFHW:
20688       // Check that the high words (being shuffled) are the identity in the
20689       // dword shuffle, and the low words are self-contained.
20690       if (Mask[2] != 2 || Mask[3] != 3 ||
20691           !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
20692         return SDValue();
20693
20694       Chain.push_back(V);
20695       continue;
20696
20697     case X86ISD::UNPCKL:
20698     case X86ISD::UNPCKH:
20699       // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
20700       // shuffle into a preceding word shuffle.
20701       if (V.getSimpleValueType().getScalarType() != MVT::i8 &&
20702           V.getSimpleValueType().getScalarType() != MVT::i16)
20703         return SDValue();
20704
20705       // Search for a half-shuffle which we can combine with.
20706       unsigned CombineOp =
20707           V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
20708       if (V.getOperand(0) != V.getOperand(1) ||
20709           !V->isOnlyUserOf(V.getOperand(0).getNode()))
20710         return SDValue();
20711       Chain.push_back(V);
20712       V = V.getOperand(0);
20713       do {
20714         switch (V.getOpcode()) {
20715         default:
20716           return SDValue(); // Nothing to combine.
20717
20718         case X86ISD::PSHUFLW:
20719         case X86ISD::PSHUFHW:
20720           if (V.getOpcode() == CombineOp)
20721             break;
20722
20723           Chain.push_back(V);
20724
20725           // Fallthrough!
20726         case ISD::BITCAST:
20727           V = V.getOperand(0);
20728           continue;
20729         }
20730         break;
20731       } while (V.hasOneUse());
20732       break;
20733     }
20734     // Break out of the loop if we break out of the switch.
20735     break;
20736   }
20737
20738   if (!V.hasOneUse())
20739     // We fell out of the loop without finding a viable combining instruction.
20740     return SDValue();
20741
20742   // Merge this node's mask and our incoming mask.
20743   SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
20744   for (int &M : Mask)
20745     M = VMask[M];
20746   V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
20747                   getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
20748
20749   // Rebuild the chain around this new shuffle.
20750   while (!Chain.empty()) {
20751     SDValue W = Chain.pop_back_val();
20752
20753     if (V.getValueType() != W.getOperand(0).getValueType())
20754       V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
20755
20756     switch (W.getOpcode()) {
20757     default:
20758       llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
20759
20760     case X86ISD::UNPCKL:
20761     case X86ISD::UNPCKH:
20762       V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
20763       break;
20764
20765     case X86ISD::PSHUFD:
20766     case X86ISD::PSHUFLW:
20767     case X86ISD::PSHUFHW:
20768       V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
20769       break;
20770     }
20771   }
20772   if (V.getValueType() != N.getValueType())
20773     V = DAG.getBitcast(N.getValueType(), V);
20774
20775   // Return the new chain to replace N.
20776   return V;
20777 }
20778
20779 /// \brief Search for a combinable shuffle across a chain ending in pshuflw or pshufhw.
20780 ///
20781 /// We walk up the chain, skipping shuffles of the other half and looking
20782 /// through shuffles which switch halves trying to find a shuffle of the same
20783 /// pair of dwords.
20784 static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask,
20785                                         SelectionDAG &DAG,
20786                                         TargetLowering::DAGCombinerInfo &DCI) {
20787   assert(
20788       (N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) &&
20789       "Called with something other than an x86 128-bit half shuffle!");
20790   SDLoc DL(N);
20791   unsigned CombineOpcode = N.getOpcode();
20792
20793   // Walk up a single-use chain looking for a combinable shuffle.
20794   SDValue V = N.getOperand(0);
20795   for (; V.hasOneUse(); V = V.getOperand(0)) {
20796     switch (V.getOpcode()) {
20797     default:
20798       return false; // Nothing combined!
20799
20800     case ISD::BITCAST:
20801       // Skip bitcasts as we always know the type for the target specific
20802       // instructions.
20803       continue;
20804
20805     case X86ISD::PSHUFLW:
20806     case X86ISD::PSHUFHW:
20807       if (V.getOpcode() == CombineOpcode)
20808         break;
20809
20810       // Other-half shuffles are no-ops.
20811       continue;
20812     }
20813     // Break out of the loop if we break out of the switch.
20814     break;
20815   }
20816
20817   if (!V.hasOneUse())
20818     // We fell out of the loop without finding a viable combining instruction.
20819     return false;
20820
20821   // Combine away the bottom node as its shuffle will be accumulated into
20822   // a preceding shuffle.
20823   DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
20824
20825   // Record the old value.
20826   SDValue Old = V;
20827
20828   // Merge this node's mask and our incoming mask (adjusted to account for all
20829   // the pshufd instructions encountered).
20830   SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
20831   for (int &M : Mask)
20832     M = VMask[M];
20833   V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0),
20834                   getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
20835
20836   // Check that the shuffles didn't cancel each other out. If not, we need to
20837   // combine to the new one.
20838   if (Old != V)
20839     // Replace the combinable shuffle with the combined one, updating all users
20840     // so that we re-evaluate the chain here.
20841     DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true);
20842
20843   return true;
20844 }
20845
20846 /// \brief Try to combine x86 target specific shuffles.
20847 static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,
20848                                            TargetLowering::DAGCombinerInfo &DCI,
20849                                            const X86Subtarget *Subtarget) {
20850   SDLoc DL(N);
20851   MVT VT = N.getSimpleValueType();
20852   SmallVector<int, 4> Mask;
20853
20854   switch (N.getOpcode()) {
20855   case X86ISD::PSHUFD:
20856   case X86ISD::PSHUFLW:
20857   case X86ISD::PSHUFHW:
20858     Mask = getPSHUFShuffleMask(N);
20859     assert(Mask.size() == 4);
20860     break;
20861   default:
20862     return SDValue();
20863   }
20864
20865   // Nuke no-op shuffles that show up after combining.
20866   if (isNoopShuffleMask(Mask))
20867     return DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
20868
20869   // Look for simplifications involving one or two shuffle instructions.
20870   SDValue V = N.getOperand(0);
20871   switch (N.getOpcode()) {
20872   default:
20873     break;
20874   case X86ISD::PSHUFLW:
20875   case X86ISD::PSHUFHW:
20876     assert(VT.getScalarType() == MVT::i16 && "Bad word shuffle type!");
20877
20878     if (combineRedundantHalfShuffle(N, Mask, DAG, DCI))
20879       return SDValue(); // We combined away this shuffle, so we're done.
20880
20881     // See if this reduces to a PSHUFD which is no more expensive and can
20882     // combine with more operations. Note that it has to at least flip the
20883     // dwords as otherwise it would have been removed as a no-op.
20884     if (makeArrayRef(Mask).equals({2, 3, 0, 1})) {
20885       int DMask[] = {0, 1, 2, 3};
20886       int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
20887       DMask[DOffset + 0] = DOffset + 1;
20888       DMask[DOffset + 1] = DOffset + 0;
20889       MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
20890       V = DAG.getBitcast(DVT, V);
20891       DCI.AddToWorklist(V.getNode());
20892       V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
20893                       getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
20894       DCI.AddToWorklist(V.getNode());
20895       return DAG.getBitcast(VT, V);
20896     }
20897
20898     // Look for shuffle patterns which can be implemented as a single unpack.
20899     // FIXME: This doesn't handle the location of the PSHUFD generically, and
20900     // only works when we have a PSHUFD followed by two half-shuffles.
20901     if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
20902         (V.getOpcode() == X86ISD::PSHUFLW ||
20903          V.getOpcode() == X86ISD::PSHUFHW) &&
20904         V.getOpcode() != N.getOpcode() &&
20905         V.hasOneUse()) {
20906       SDValue D = V.getOperand(0);
20907       while (D.getOpcode() == ISD::BITCAST && D.hasOneUse())
20908         D = D.getOperand(0);
20909       if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {
20910         SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
20911         SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
20912         int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
20913         int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
20914         int WordMask[8];
20915         for (int i = 0; i < 4; ++i) {
20916           WordMask[i + NOffset] = Mask[i] + NOffset;
20917           WordMask[i + VOffset] = VMask[i] + VOffset;
20918         }
20919         // Map the word mask through the DWord mask.
20920         int MappedMask[8];
20921         for (int i = 0; i < 8; ++i)
20922           MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
20923         if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
20924             makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
20925           // We can replace all three shuffles with an unpack.
20926           V = DAG.getBitcast(VT, D.getOperand(0));
20927           DCI.AddToWorklist(V.getNode());
20928           return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
20929                                                 : X86ISD::UNPCKH,
20930                              DL, VT, V, V);
20931         }
20932       }
20933     }
20934
20935     break;
20936
20937   case X86ISD::PSHUFD:
20938     if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG, DCI))
20939       return NewN;
20940
20941     break;
20942   }
20943
20944   return SDValue();
20945 }
20946
20947 /// \brief Try to combine a shuffle into a target-specific add-sub node.
20948 ///
20949 /// We combine this directly on the abstract vector shuffle nodes so it is
20950 /// easier to generically match. We also insert dummy vector shuffle nodes for
20951 /// the operands which explicitly discard the lanes which are unused by this
20952 /// operation to try to flow through the rest of the combiner the fact that
20953 /// they're unused.
20954 static SDValue combineShuffleToAddSub(SDNode *N, SelectionDAG &DAG) {
20955   SDLoc DL(N);
20956   EVT VT = N->getValueType(0);
20957
20958   // We only handle target-independent shuffles.
20959   // FIXME: It would be easy and harmless to use the target shuffle mask
20960   // extraction tool to support more.
20961   if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
20962     return SDValue();
20963
20964   auto *SVN = cast<ShuffleVectorSDNode>(N);
20965   ArrayRef<int> Mask = SVN->getMask();
20966   SDValue V1 = N->getOperand(0);
20967   SDValue V2 = N->getOperand(1);
20968
20969   // We require the first shuffle operand to be the SUB node, and the second to
20970   // be the ADD node.
20971   // FIXME: We should support the commuted patterns.
20972   if (V1->getOpcode() != ISD::FSUB || V2->getOpcode() != ISD::FADD)
20973     return SDValue();
20974
20975   // If there are other uses of these operations we can't fold them.
20976   if (!V1->hasOneUse() || !V2->hasOneUse())
20977     return SDValue();
20978
20979   // Ensure that both operations have the same operands. Note that we can
20980   // commute the FADD operands.
20981   SDValue LHS = V1->getOperand(0), RHS = V1->getOperand(1);
20982   if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
20983       (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
20984     return SDValue();
20985
20986   // We're looking for blends between FADD and FSUB nodes. We insist on these
20987   // nodes being lined up in a specific expected pattern.
20988   if (!(isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
20989         isShuffleEquivalent(V1, V2, Mask, {0, 5, 2, 7}) ||
20990         isShuffleEquivalent(V1, V2, Mask, {0, 9, 2, 11, 4, 13, 6, 15})))
20991     return SDValue();
20992
20993   // Only specific types are legal at this point, assert so we notice if and
20994   // when these change.
20995   assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v8f32 ||
20996           VT == MVT::v4f64) &&
20997          "Unknown vector type encountered!");
20998
20999   return DAG.getNode(X86ISD::ADDSUB, DL, VT, LHS, RHS);
21000 }
21001
21002 /// PerformShuffleCombine - Performs several different shuffle combines.
21003 static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
21004                                      TargetLowering::DAGCombinerInfo &DCI,
21005                                      const X86Subtarget *Subtarget) {
21006   SDLoc dl(N);
21007   SDValue N0 = N->getOperand(0);
21008   SDValue N1 = N->getOperand(1);
21009   EVT VT = N->getValueType(0);
21010
21011   // Don't create instructions with illegal types after legalize types has run.
21012   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21013   if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType()))
21014     return SDValue();
21015
21016   // If we have legalized the vector types, look for blends of FADD and FSUB
21017   // nodes that we can fuse into an ADDSUB node.
21018   if (TLI.isTypeLegal(VT) && Subtarget->hasSSE3())
21019     if (SDValue AddSub = combineShuffleToAddSub(N, DAG))
21020       return AddSub;
21021
21022   // Combine 256-bit vector shuffles. This is only profitable when in AVX mode
21023   if (Subtarget->hasFp256() && VT.is256BitVector() &&
21024       N->getOpcode() == ISD::VECTOR_SHUFFLE)
21025     return PerformShuffleCombine256(N, DAG, DCI, Subtarget);
21026
21027   // During Type Legalization, when promoting illegal vector types,
21028   // the backend might introduce new shuffle dag nodes and bitcasts.
21029   //
21030   // This code performs the following transformation:
21031   // fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) ->
21032   //       (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>)
21033   //
21034   // We do this only if both the bitcast and the BINOP dag nodes have
21035   // one use. Also, perform this transformation only if the new binary
21036   // operation is legal. This is to avoid introducing dag nodes that
21037   // potentially need to be further expanded (or custom lowered) into a
21038   // less optimal sequence of dag nodes.
21039   if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() &&
21040       N1.getOpcode() == ISD::UNDEF && N0.hasOneUse() &&
21041       N0.getOpcode() == ISD::BITCAST) {
21042     SDValue BC0 = N0.getOperand(0);
21043     EVT SVT = BC0.getValueType();
21044     unsigned Opcode = BC0.getOpcode();
21045     unsigned NumElts = VT.getVectorNumElements();
21046
21047     if (BC0.hasOneUse() && SVT.isVector() &&
21048         SVT.getVectorNumElements() * 2 == NumElts &&
21049         TLI.isOperationLegal(Opcode, VT)) {
21050       bool CanFold = false;
21051       switch (Opcode) {
21052       default : break;
21053       case ISD::ADD :
21054       case ISD::FADD :
21055       case ISD::SUB :
21056       case ISD::FSUB :
21057       case ISD::MUL :
21058       case ISD::FMUL :
21059         CanFold = true;
21060       }
21061
21062       unsigned SVTNumElts = SVT.getVectorNumElements();
21063       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
21064       for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i)
21065         CanFold = SVOp->getMaskElt(i) == (int)(i * 2);
21066       for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i)
21067         CanFold = SVOp->getMaskElt(i) < 0;
21068
21069       if (CanFold) {
21070         SDValue BC00 = DAG.getBitcast(VT, BC0.getOperand(0));
21071         SDValue BC01 = DAG.getBitcast(VT, BC0.getOperand(1));
21072         SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01);
21073         return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, &SVOp->getMask()[0]);
21074       }
21075     }
21076   }
21077
21078   // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
21079   // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
21080   // consecutive, non-overlapping, and in the right order.
21081   SmallVector<SDValue, 16> Elts;
21082   for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
21083     Elts.push_back(getShuffleScalarElt(N, i, DAG, 0));
21084
21085   SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true);
21086   if (LD.getNode())
21087     return LD;
21088
21089   if (isTargetShuffle(N->getOpcode())) {
21090     SDValue Shuffle =
21091         PerformTargetShuffleCombine(SDValue(N, 0), DAG, DCI, Subtarget);
21092     if (Shuffle.getNode())
21093       return Shuffle;
21094
21095     // Try recursively combining arbitrary sequences of x86 shuffle
21096     // instructions into higher-order shuffles. We do this after combining
21097     // specific PSHUF instruction sequences into their minimal form so that we
21098     // can evaluate how many specialized shuffle instructions are involved in
21099     // a particular chain.
21100     SmallVector<int, 1> NonceMask; // Just a placeholder.
21101     NonceMask.push_back(0);
21102     if (combineX86ShufflesRecursively(SDValue(N, 0), SDValue(N, 0), NonceMask,
21103                                       /*Depth*/ 1, /*HasPSHUFB*/ false, DAG,
21104                                       DCI, Subtarget))
21105       return SDValue(); // This routine will use CombineTo to replace N.
21106   }
21107
21108   return SDValue();
21109 }
21110
21111 /// XFormVExtractWithShuffleIntoLoad - Check if a vector extract from a target
21112 /// specific shuffle of a load can be folded into a single element load.
21113 /// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
21114 /// shuffles have been custom lowered so we need to handle those here.
21115 static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
21116                                          TargetLowering::DAGCombinerInfo &DCI) {
21117   if (DCI.isBeforeLegalizeOps())
21118     return SDValue();
21119
21120   SDValue InVec = N->getOperand(0);
21121   SDValue EltNo = N->getOperand(1);
21122
21123   if (!isa<ConstantSDNode>(EltNo))
21124     return SDValue();
21125
21126   EVT OriginalVT = InVec.getValueType();
21127
21128   if (InVec.getOpcode() == ISD::BITCAST) {
21129     // Don't duplicate a load with other uses.
21130     if (!InVec.hasOneUse())
21131       return SDValue();
21132     EVT BCVT = InVec.getOperand(0).getValueType();
21133     if (!BCVT.isVector() ||
21134         BCVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
21135       return SDValue();
21136     InVec = InVec.getOperand(0);
21137   }
21138
21139   EVT CurrentVT = InVec.getValueType();
21140
21141   if (!isTargetShuffle(InVec.getOpcode()))
21142     return SDValue();
21143
21144   // Don't duplicate a load with other uses.
21145   if (!InVec.hasOneUse())
21146     return SDValue();
21147
21148   SmallVector<int, 16> ShuffleMask;
21149   bool UnaryShuffle;
21150   if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(),
21151                             ShuffleMask, UnaryShuffle))
21152     return SDValue();
21153
21154   // Select the input vector, guarding against out of range extract vector.
21155   unsigned NumElems = CurrentVT.getVectorNumElements();
21156   int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
21157   int Idx = (Elt > (int)NumElems) ? -1 : ShuffleMask[Elt];
21158   SDValue LdNode = (Idx < (int)NumElems) ? InVec.getOperand(0)
21159                                          : InVec.getOperand(1);
21160
21161   // If inputs to shuffle are the same for both ops, then allow 2 uses
21162   unsigned AllowedUses = InVec.getNumOperands() > 1 &&
21163                          InVec.getOperand(0) == InVec.getOperand(1) ? 2 : 1;
21164
21165   if (LdNode.getOpcode() == ISD::BITCAST) {
21166     // Don't duplicate a load with other uses.
21167     if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
21168       return SDValue();
21169
21170     AllowedUses = 1; // only allow 1 load use if we have a bitcast
21171     LdNode = LdNode.getOperand(0);
21172   }
21173
21174   if (!ISD::isNormalLoad(LdNode.getNode()))
21175     return SDValue();
21176
21177   LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);
21178
21179   if (!LN0 ||!LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
21180     return SDValue();
21181
21182   EVT EltVT = N->getValueType(0);
21183   // If there's a bitcast before the shuffle, check if the load type and
21184   // alignment is valid.
21185   unsigned Align = LN0->getAlignment();
21186   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21187   unsigned NewAlign = TLI.getDataLayout()->getABITypeAlignment(
21188       EltVT.getTypeForEVT(*DAG.getContext()));
21189
21190   if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT))
21191     return SDValue();
21192
21193   // All checks match so transform back to vector_shuffle so that DAG combiner
21194   // can finish the job
21195   SDLoc dl(N);
21196
21197   // Create shuffle node taking into account the case that its a unary shuffle
21198   SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT)
21199                                    : InVec.getOperand(1);
21200   Shuffle = DAG.getVectorShuffle(CurrentVT, dl,
21201                                  InVec.getOperand(0), Shuffle,
21202                                  &ShuffleMask[0]);
21203   Shuffle = DAG.getBitcast(OriginalVT, Shuffle);
21204   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
21205                      EltNo);
21206 }
21207
21208 /// \brief Detect bitcasts between i32 to x86mmx low word. Since MMX types are
21209 /// special and don't usually play with other vector types, it's better to
21210 /// handle them early to be sure we emit efficient code by avoiding
21211 /// store-load conversions.
21212 static SDValue PerformBITCASTCombine(SDNode *N, SelectionDAG &DAG) {
21213   if (N->getValueType(0) != MVT::x86mmx ||
21214       N->getOperand(0)->getOpcode() != ISD::BUILD_VECTOR ||
21215       N->getOperand(0)->getValueType(0) != MVT::v2i32)
21216     return SDValue();
21217
21218   SDValue V = N->getOperand(0);
21219   ConstantSDNode *C = dyn_cast<ConstantSDNode>(V.getOperand(1));
21220   if (C && C->getZExtValue() == 0 && V.getOperand(0).getValueType() == MVT::i32)
21221     return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(V.getOperand(0)),
21222                        N->getValueType(0), V.getOperand(0));
21223
21224   return SDValue();
21225 }
21226
21227 /// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index
21228 /// generation and convert it from being a bunch of shuffles and extracts
21229 /// into a somewhat faster sequence. For i686, the best sequence is apparently
21230 /// storing the value and loading scalars back, while for x64 we should
21231 /// use 64-bit extracts and shifts.
21232 static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
21233                                          TargetLowering::DAGCombinerInfo &DCI) {
21234   SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI);
21235   if (NewOp.getNode())
21236     return NewOp;
21237
21238   SDValue InputVector = N->getOperand(0);
21239   SDLoc dl(InputVector);
21240   // Detect mmx to i32 conversion through a v2i32 elt extract.
21241   if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
21242       N->getValueType(0) == MVT::i32 &&
21243       InputVector.getValueType() == MVT::v2i32) {
21244
21245     // The bitcast source is a direct mmx result.
21246     SDValue MMXSrc = InputVector.getNode()->getOperand(0);
21247     if (MMXSrc.getValueType() == MVT::x86mmx)
21248       return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector),
21249                          N->getValueType(0),
21250                          InputVector.getNode()->getOperand(0));
21251
21252     // The mmx is indirect: (i64 extract_elt (v1i64 bitcast (x86mmx ...))).
21253     SDValue MMXSrcOp = MMXSrc.getOperand(0);
21254     if (MMXSrc.getOpcode() == ISD::EXTRACT_VECTOR_ELT && MMXSrc.hasOneUse() &&
21255         MMXSrc.getValueType() == MVT::i64 && MMXSrcOp.hasOneUse() &&
21256         MMXSrcOp.getOpcode() == ISD::BITCAST &&
21257         MMXSrcOp.getValueType() == MVT::v1i64 &&
21258         MMXSrcOp.getOperand(0).getValueType() == MVT::x86mmx)
21259       return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector),
21260                          N->getValueType(0),
21261                          MMXSrcOp.getOperand(0));
21262   }
21263
21264   EVT VT = N->getValueType(0);
21265
21266   if (VT == MVT::i1 && dyn_cast<ConstantSDNode>(N->getOperand(1)) &&
21267       InputVector.getOpcode() == ISD::BITCAST &&
21268       dyn_cast<ConstantSDNode>(InputVector.getOperand(0))) {
21269     uint64_t ExtractedElt =
21270           cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
21271     uint64_t InputValue =
21272           cast<ConstantSDNode>(InputVector.getOperand(0))->getZExtValue();
21273     uint64_t Res = (InputValue >> ExtractedElt) & 1;
21274     return DAG.getConstant(Res, dl, MVT::i1);
21275   }
21276   // Only operate on vectors of 4 elements, where the alternative shuffling
21277   // gets to be more expensive.
21278   if (InputVector.getValueType() != MVT::v4i32)
21279     return SDValue();
21280
21281   // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
21282   // single use which is a sign-extend or zero-extend, and all elements are
21283   // used.
21284   SmallVector<SDNode *, 4> Uses;
21285   unsigned ExtractedElements = 0;
21286   for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
21287        UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
21288     if (UI.getUse().getResNo() != InputVector.getResNo())
21289       return SDValue();
21290
21291     SDNode *Extract = *UI;
21292     if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
21293       return SDValue();
21294
21295     if (Extract->getValueType(0) != MVT::i32)
21296       return SDValue();
21297     if (!Extract->hasOneUse())
21298       return SDValue();
21299     if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
21300         Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
21301       return SDValue();
21302     if (!isa<ConstantSDNode>(Extract->getOperand(1)))
21303       return SDValue();
21304
21305     // Record which element was extracted.
21306     ExtractedElements |=
21307       1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue();
21308
21309     Uses.push_back(Extract);
21310   }
21311
21312   // If not all the elements were used, this may not be worthwhile.
21313   if (ExtractedElements != 15)
21314     return SDValue();
21315
21316   // Ok, we've now decided to do the transformation.
21317   // If 64-bit shifts are legal, use the extract-shift sequence,
21318   // otherwise bounce the vector off the cache.
21319   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21320   SDValue Vals[4];
21321
21322   if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) {
21323     SDValue Cst = DAG.getBitcast(MVT::v2i64, InputVector);
21324     EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy();
21325     SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
21326       DAG.getConstant(0, dl, VecIdxTy));
21327     SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
21328       DAG.getConstant(1, dl, VecIdxTy));
21329
21330     SDValue ShAmt = DAG.getConstant(32, dl,
21331       DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64));
21332     Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf);
21333     Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
21334       DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt));
21335     Vals[2] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, TopHalf);
21336     Vals[3] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
21337       DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt));
21338   } else {
21339     // Store the value to a temporary stack slot.
21340     SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
21341     SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
21342       MachinePointerInfo(), false, false, 0);
21343
21344     EVT ElementType = InputVector.getValueType().getVectorElementType();
21345     unsigned EltSize = ElementType.getSizeInBits() / 8;
21346
21347     // Replace each use (extract) with a load of the appropriate element.
21348     for (unsigned i = 0; i < 4; ++i) {
21349       uint64_t Offset = EltSize * i;
21350       SDValue OffsetVal = DAG.getConstant(Offset, dl, TLI.getPointerTy());
21351
21352       SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(),
21353                                        StackPtr, OffsetVal);
21354
21355       // Load the scalar.
21356       Vals[i] = DAG.getLoad(ElementType, dl, Ch,
21357                             ScalarAddr, MachinePointerInfo(),
21358                             false, false, false, 0);
21359
21360     }
21361   }
21362
21363   // Replace the extracts
21364   for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
21365     UE = Uses.end(); UI != UE; ++UI) {
21366     SDNode *Extract = *UI;
21367
21368     SDValue Idx = Extract->getOperand(1);
21369     uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
21370     DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]);
21371   }
21372
21373   // The replacement was made in place; don't return anything.
21374   return SDValue();
21375 }
21376
21377 /// \brief Matches a VSELECT onto min/max or return 0 if the node doesn't match.
21378 static std::pair<unsigned, bool>
21379 matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, SDValue RHS,
21380                    SelectionDAG &DAG, const X86Subtarget *Subtarget) {
21381   if (!VT.isVector())
21382     return std::make_pair(0, false);
21383
21384   bool NeedSplit = false;
21385   switch (VT.getSimpleVT().SimpleTy) {
21386   default: return std::make_pair(0, false);
21387   case MVT::v4i64:
21388   case MVT::v2i64:
21389     if (!Subtarget->hasVLX())
21390       return std::make_pair(0, false);
21391     break;
21392   case MVT::v64i8:
21393   case MVT::v32i16:
21394     if (!Subtarget->hasBWI())
21395       return std::make_pair(0, false);
21396     break;
21397   case MVT::v16i32:
21398   case MVT::v8i64:
21399     if (!Subtarget->hasAVX512())
21400       return std::make_pair(0, false);
21401     break;
21402   case MVT::v32i8:
21403   case MVT::v16i16:
21404   case MVT::v8i32:
21405     if (!Subtarget->hasAVX2())
21406       NeedSplit = true;
21407     if (!Subtarget->hasAVX())
21408       return std::make_pair(0, false);
21409     break;
21410   case MVT::v16i8:
21411   case MVT::v8i16:
21412   case MVT::v4i32:
21413     if (!Subtarget->hasSSE2())
21414       return std::make_pair(0, false);
21415   }
21416
21417   // SSE2 has only a small subset of the operations.
21418   bool hasUnsigned = Subtarget->hasSSE41() ||
21419                      (Subtarget->hasSSE2() && VT == MVT::v16i8);
21420   bool hasSigned = Subtarget->hasSSE41() ||
21421                    (Subtarget->hasSSE2() && VT == MVT::v8i16);
21422
21423   ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
21424
21425   unsigned Opc = 0;
21426   // Check for x CC y ? x : y.
21427   if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
21428       DAG.isEqualTo(RHS, Cond.getOperand(1))) {
21429     switch (CC) {
21430     default: break;
21431     case ISD::SETULT:
21432     case ISD::SETULE:
21433       Opc = hasUnsigned ? X86ISD::UMIN : 0u; break;
21434     case ISD::SETUGT:
21435     case ISD::SETUGE:
21436       Opc = hasUnsigned ? X86ISD::UMAX : 0u; break;
21437     case ISD::SETLT:
21438     case ISD::SETLE:
21439       Opc = hasSigned ? X86ISD::SMIN : 0u; break;
21440     case ISD::SETGT:
21441     case ISD::SETGE:
21442       Opc = hasSigned ? X86ISD::SMAX : 0u; break;
21443     }
21444   // Check for x CC y ? y : x -- a min/max with reversed arms.
21445   } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
21446              DAG.isEqualTo(RHS, Cond.getOperand(0))) {
21447     switch (CC) {
21448     default: break;
21449     case ISD::SETULT:
21450     case ISD::SETULE:
21451       Opc = hasUnsigned ? X86ISD::UMAX : 0u; break;
21452     case ISD::SETUGT:
21453     case ISD::SETUGE:
21454       Opc = hasUnsigned ? X86ISD::UMIN : 0u; break;
21455     case ISD::SETLT:
21456     case ISD::SETLE:
21457       Opc = hasSigned ? X86ISD::SMAX : 0u; break;
21458     case ISD::SETGT:
21459     case ISD::SETGE:
21460       Opc = hasSigned ? X86ISD::SMIN : 0u; break;
21461     }
21462   }
21463
21464   return std::make_pair(Opc, NeedSplit);
21465 }
21466
21467 static SDValue
21468 transformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG,
21469                                       const X86Subtarget *Subtarget) {
21470   SDLoc dl(N);
21471   SDValue Cond = N->getOperand(0);
21472   SDValue LHS = N->getOperand(1);
21473   SDValue RHS = N->getOperand(2);
21474
21475   if (Cond.getOpcode() == ISD::SIGN_EXTEND) {
21476     SDValue CondSrc = Cond->getOperand(0);
21477     if (CondSrc->getOpcode() == ISD::SIGN_EXTEND_INREG)
21478       Cond = CondSrc->getOperand(0);
21479   }
21480
21481   if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
21482     return SDValue();
21483
21484   // A vselect where all conditions and data are constants can be optimized into
21485   // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
21486   if (ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
21487       ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
21488     return SDValue();
21489
21490   unsigned MaskValue = 0;
21491   if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue))
21492     return SDValue();
21493
21494   MVT VT = N->getSimpleValueType(0);
21495   unsigned NumElems = VT.getVectorNumElements();
21496   SmallVector<int, 8> ShuffleMask(NumElems, -1);
21497   for (unsigned i = 0; i < NumElems; ++i) {
21498     // Be sure we emit undef where we can.
21499     if (Cond.getOperand(i)->getOpcode() == ISD::UNDEF)
21500       ShuffleMask[i] = -1;
21501     else
21502       ShuffleMask[i] = i + NumElems * ((MaskValue >> i) & 1);
21503   }
21504
21505   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21506   if (!TLI.isShuffleMaskLegal(ShuffleMask, VT))
21507     return SDValue();
21508   return DAG.getVectorShuffle(VT, dl, LHS, RHS, &ShuffleMask[0]);
21509 }
21510
21511 /// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT
21512 /// nodes.
21513 static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
21514                                     TargetLowering::DAGCombinerInfo &DCI,
21515                                     const X86Subtarget *Subtarget) {
21516   SDLoc DL(N);
21517   SDValue Cond = N->getOperand(0);
21518   // Get the LHS/RHS of the select.
21519   SDValue LHS = N->getOperand(1);
21520   SDValue RHS = N->getOperand(2);
21521   EVT VT = LHS.getValueType();
21522   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21523
21524   // If we have SSE[12] support, try to form min/max nodes. SSE min/max
21525   // instructions match the semantics of the common C idiom x<y?x:y but not
21526   // x<=y?x:y, because of how they handle negative zero (which can be
21527   // ignored in unsafe-math mode).
21528   // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
21529   if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
21530       VT != MVT::f80 && (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
21531       (Subtarget->hasSSE2() ||
21532        (Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) {
21533     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
21534
21535     unsigned Opcode = 0;
21536     // Check for x CC y ? x : y.
21537     if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
21538         DAG.isEqualTo(RHS, Cond.getOperand(1))) {
21539       switch (CC) {
21540       default: break;
21541       case ISD::SETULT:
21542         // Converting this to a min would handle NaNs incorrectly, and swapping
21543         // the operands would cause it to handle comparisons between positive
21544         // and negative zero incorrectly.
21545         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
21546           if (!DAG.getTarget().Options.UnsafeFPMath &&
21547               !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
21548             break;
21549           std::swap(LHS, RHS);
21550         }
21551         Opcode = X86ISD::FMIN;
21552         break;
21553       case ISD::SETOLE:
21554         // Converting this to a min would handle comparisons between positive
21555         // and negative zero incorrectly.
21556         if (!DAG.getTarget().Options.UnsafeFPMath &&
21557             !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
21558           break;
21559         Opcode = X86ISD::FMIN;
21560         break;
21561       case ISD::SETULE:
21562         // Converting this to a min would handle both negative zeros and NaNs
21563         // incorrectly, but we can swap the operands to fix both.
21564         std::swap(LHS, RHS);
21565       case ISD::SETOLT:
21566       case ISD::SETLT:
21567       case ISD::SETLE:
21568         Opcode = X86ISD::FMIN;
21569         break;
21570
21571       case ISD::SETOGE:
21572         // Converting this to a max would handle comparisons between positive
21573         // and negative zero incorrectly.
21574         if (!DAG.getTarget().Options.UnsafeFPMath &&
21575             !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
21576           break;
21577         Opcode = X86ISD::FMAX;
21578         break;
21579       case ISD::SETUGT:
21580         // Converting this to a max would handle NaNs incorrectly, and swapping
21581         // the operands would cause it to handle comparisons between positive
21582         // and negative zero incorrectly.
21583         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
21584           if (!DAG.getTarget().Options.UnsafeFPMath &&
21585               !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
21586             break;
21587           std::swap(LHS, RHS);
21588         }
21589         Opcode = X86ISD::FMAX;
21590         break;
21591       case ISD::SETUGE:
21592         // Converting this to a max would handle both negative zeros and NaNs
21593         // incorrectly, but we can swap the operands to fix both.
21594         std::swap(LHS, RHS);
21595       case ISD::SETOGT:
21596       case ISD::SETGT:
21597       case ISD::SETGE:
21598         Opcode = X86ISD::FMAX;
21599         break;
21600       }
21601     // Check for x CC y ? y : x -- a min/max with reversed arms.
21602     } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
21603                DAG.isEqualTo(RHS, Cond.getOperand(0))) {
21604       switch (CC) {
21605       default: break;
21606       case ISD::SETOGE:
21607         // Converting this to a min would handle comparisons between positive
21608         // and negative zero incorrectly, and swapping the operands would
21609         // cause it to handle NaNs incorrectly.
21610         if (!DAG.getTarget().Options.UnsafeFPMath &&
21611             !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
21612           if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
21613             break;
21614           std::swap(LHS, RHS);
21615         }
21616         Opcode = X86ISD::FMIN;
21617         break;
21618       case ISD::SETUGT:
21619         // Converting this to a min would handle NaNs incorrectly.
21620         if (!DAG.getTarget().Options.UnsafeFPMath &&
21621             (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
21622           break;
21623         Opcode = X86ISD::FMIN;
21624         break;
21625       case ISD::SETUGE:
21626         // Converting this to a min would handle both negative zeros and NaNs
21627         // incorrectly, but we can swap the operands to fix both.
21628         std::swap(LHS, RHS);
21629       case ISD::SETOGT:
21630       case ISD::SETGT:
21631       case ISD::SETGE:
21632         Opcode = X86ISD::FMIN;
21633         break;
21634
21635       case ISD::SETULT:
21636         // Converting this to a max would handle NaNs incorrectly.
21637         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
21638           break;
21639         Opcode = X86ISD::FMAX;
21640         break;
21641       case ISD::SETOLE:
21642         // Converting this to a max would handle comparisons between positive
21643         // and negative zero incorrectly, and swapping the operands would
21644         // cause it to handle NaNs incorrectly.
21645         if (!DAG.getTarget().Options.UnsafeFPMath &&
21646             !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
21647           if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
21648             break;
21649           std::swap(LHS, RHS);
21650         }
21651         Opcode = X86ISD::FMAX;
21652         break;
21653       case ISD::SETULE:
21654         // Converting this to a max would handle both negative zeros and NaNs
21655         // incorrectly, but we can swap the operands to fix both.
21656         std::swap(LHS, RHS);
21657       case ISD::SETOLT:
21658       case ISD::SETLT:
21659       case ISD::SETLE:
21660         Opcode = X86ISD::FMAX;
21661         break;
21662       }
21663     }
21664
21665     if (Opcode)
21666       return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
21667   }
21668
21669   EVT CondVT = Cond.getValueType();
21670   if (Subtarget->hasAVX512() && VT.isVector() && CondVT.isVector() &&
21671       CondVT.getVectorElementType() == MVT::i1) {
21672     // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
21673     // lowering on KNL. In this case we convert it to
21674     // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
21675     // The same situation for all 128 and 256-bit vectors of i8 and i16.
21676     // Since SKX these selects have a proper lowering.
21677     EVT OpVT = LHS.getValueType();
21678     if ((OpVT.is128BitVector() || OpVT.is256BitVector()) &&
21679         (OpVT.getVectorElementType() == MVT::i8 ||
21680          OpVT.getVectorElementType() == MVT::i16) &&
21681         !(Subtarget->hasBWI() && Subtarget->hasVLX())) {
21682       Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, OpVT, Cond);
21683       DCI.AddToWorklist(Cond.getNode());
21684       return DAG.getNode(N->getOpcode(), DL, OpVT, Cond, LHS, RHS);
21685     }
21686   }
21687   // If this is a select between two integer constants, try to do some
21688   // optimizations.
21689   if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) {
21690     if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS))
21691       // Don't do this for crazy integer types.
21692       if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) {
21693         // If this is efficiently invertible, canonicalize the LHSC/RHSC values
21694         // so that TrueC (the true value) is larger than FalseC.
21695         bool NeedsCondInvert = false;
21696
21697         if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
21698             // Efficiently invertible.
21699             (Cond.getOpcode() == ISD::SETCC ||  // setcc -> invertible.
21700              (Cond.getOpcode() == ISD::XOR &&   // xor(X, C) -> invertible.
21701               isa<ConstantSDNode>(Cond.getOperand(1))))) {
21702           NeedsCondInvert = true;
21703           std::swap(TrueC, FalseC);
21704         }
21705
21706         // Optimize C ? 8 : 0 -> zext(C) << 3.  Likewise for any pow2/0.
21707         if (FalseC->getAPIntValue() == 0 &&
21708             TrueC->getAPIntValue().isPowerOf2()) {
21709           if (NeedsCondInvert) // Invert the condition if needed.
21710             Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
21711                                DAG.getConstant(1, DL, Cond.getValueType()));
21712
21713           // Zero extend the condition if needed.
21714           Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);
21715
21716           unsigned ShAmt = TrueC->getAPIntValue().logBase2();
21717           return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
21718                              DAG.getConstant(ShAmt, DL, MVT::i8));
21719         }
21720
21721         // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst.
21722         if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
21723           if (NeedsCondInvert) // Invert the condition if needed.
21724             Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
21725                                DAG.getConstant(1, DL, Cond.getValueType()));
21726
21727           // Zero extend the condition if needed.
21728           Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
21729                              FalseC->getValueType(0), Cond);
21730           return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
21731                              SDValue(FalseC, 0));
21732         }
21733
21734         // Optimize cases that will turn into an LEA instruction.  This requires
21735         // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
21736         if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
21737           uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
21738           if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
21739
21740           bool isFastMultiplier = false;
21741           if (Diff < 10) {
21742             switch ((unsigned char)Diff) {
21743               default: break;
21744               case 1:  // result = add base, cond
21745               case 2:  // result = lea base(    , cond*2)
21746               case 3:  // result = lea base(cond, cond*2)
21747               case 4:  // result = lea base(    , cond*4)
21748               case 5:  // result = lea base(cond, cond*4)
21749               case 8:  // result = lea base(    , cond*8)
21750               case 9:  // result = lea base(cond, cond*8)
21751                 isFastMultiplier = true;
21752                 break;
21753             }
21754           }
21755
21756           if (isFastMultiplier) {
21757             APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
21758             if (NeedsCondInvert) // Invert the condition if needed.
21759               Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
21760                                  DAG.getConstant(1, DL, Cond.getValueType()));
21761
21762             // Zero extend the condition if needed.
21763             Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
21764                                Cond);
21765             // Scale the condition by the difference.
21766             if (Diff != 1)
21767               Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
21768                                  DAG.getConstant(Diff, DL,
21769                                                  Cond.getValueType()));
21770
21771             // Add the base if non-zero.
21772             if (FalseC->getAPIntValue() != 0)
21773               Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
21774                                  SDValue(FalseC, 0));
21775             return Cond;
21776           }
21777         }
21778       }
21779   }
21780
21781   // Canonicalize max and min:
21782   // (x > y) ? x : y -> (x >= y) ? x : y
21783   // (x < y) ? x : y -> (x <= y) ? x : y
21784   // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
21785   // the need for an extra compare
21786   // against zero. e.g.
21787   // (x - y) > 0 ? (x - y) : 0 -> (x - y) >= 0 ? (x - y) : 0
21788   // subl   %esi, %edi
21789   // testl  %edi, %edi
21790   // movl   $0, %eax
21791   // cmovgl %edi, %eax
21792   // =>
21793   // xorl   %eax, %eax
21794   // subl   %esi, %edi
21795   // cmovsl %eax, %edi
21796   if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
21797       DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
21798       DAG.isEqualTo(RHS, Cond.getOperand(1))) {
21799     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
21800     switch (CC) {
21801     default: break;
21802     case ISD::SETLT:
21803     case ISD::SETGT: {
21804       ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
21805       Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
21806                           Cond.getOperand(0), Cond.getOperand(1), NewCC);
21807       return DAG.getNode(ISD::SELECT, DL, VT, Cond, LHS, RHS);
21808     }
21809     }
21810   }
21811
21812   // Early exit check
21813   if (!TLI.isTypeLegal(VT))
21814     return SDValue();
21815
21816   // Match VSELECTs into subs with unsigned saturation.
21817   if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
21818       // psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
21819       ((Subtarget->hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
21820        (Subtarget->hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
21821     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
21822
21823     // Check if one of the arms of the VSELECT is a zero vector. If it's on the
21824     // left side invert the predicate to simplify logic below.
21825     SDValue Other;
21826     if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
21827       Other = RHS;
21828       CC = ISD::getSetCCInverse(CC, true);
21829     } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
21830       Other = LHS;
21831     }
21832
21833     if (Other.getNode() && Other->getNumOperands() == 2 &&
21834         DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) {
21835       SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
21836       SDValue CondRHS = Cond->getOperand(1);
21837
21838       // Look for a general sub with unsigned saturation first.
21839       // x >= y ? x-y : 0 --> subus x, y
21840       // x >  y ? x-y : 0 --> subus x, y
21841       if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
21842           Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))
21843         return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
21844
21845       if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS))
21846         if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) {
21847           if (auto *CondRHSBV = dyn_cast<BuildVectorSDNode>(CondRHS))
21848             if (auto *CondRHSConst = CondRHSBV->getConstantSplatNode())
21849               // If the RHS is a constant we have to reverse the const
21850               // canonicalization.
21851               // x > C-1 ? x+-C : 0 --> subus x, C
21852               if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
21853                   CondRHSConst->getAPIntValue() ==
21854                       (-OpRHSConst->getAPIntValue() - 1))
21855                 return DAG.getNode(
21856                     X86ISD::SUBUS, DL, VT, OpLHS,
21857                     DAG.getConstant(-OpRHSConst->getAPIntValue(), DL, VT));
21858
21859           // Another special case: If C was a sign bit, the sub has been
21860           // canonicalized into a xor.
21861           // FIXME: Would it be better to use computeKnownBits to determine
21862           //        whether it's safe to decanonicalize the xor?
21863           // x s< 0 ? x^C : 0 --> subus x, C
21864           if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
21865               ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
21866               OpRHSConst->getAPIntValue().isSignBit())
21867             // Note that we have to rebuild the RHS constant here to ensure we
21868             // don't rely on particular values of undef lanes.
21869             return DAG.getNode(
21870                 X86ISD::SUBUS, DL, VT, OpLHS,
21871                 DAG.getConstant(OpRHSConst->getAPIntValue(), DL, VT));
21872         }
21873     }
21874   }
21875
21876   // Try to match a min/max vector operation.
21877   if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC) {
21878     std::pair<unsigned, bool> ret = matchIntegerMINMAX(Cond, VT, LHS, RHS, DAG, Subtarget);
21879     unsigned Opc = ret.first;
21880     bool NeedSplit = ret.second;
21881
21882     if (Opc && NeedSplit) {
21883       unsigned NumElems = VT.getVectorNumElements();
21884       // Extract the LHS vectors
21885       SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, DL);
21886       SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, DL);
21887
21888       // Extract the RHS vectors
21889       SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, DL);
21890       SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, DL);
21891
21892       // Create min/max for each subvector
21893       LHS = DAG.getNode(Opc, DL, LHS1.getValueType(), LHS1, RHS1);
21894       RHS = DAG.getNode(Opc, DL, LHS2.getValueType(), LHS2, RHS2);
21895
21896       // Merge the result
21897       return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LHS, RHS);
21898     } else if (Opc)
21899       return DAG.getNode(Opc, DL, VT, LHS, RHS);
21900   }
21901
21902   // Simplify vector selection if condition value type matches vselect
21903   // operand type
21904   if (N->getOpcode() == ISD::VSELECT && CondVT == VT) {
21905     assert(Cond.getValueType().isVector() &&
21906            "vector select expects a vector selector!");
21907
21908     bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
21909     bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
21910
21911     // Try invert the condition if true value is not all 1s and false value
21912     // is not all 0s.
21913     if (!TValIsAllOnes && !FValIsAllZeros &&
21914         // Check if the selector will be produced by CMPP*/PCMP*
21915         Cond.getOpcode() == ISD::SETCC &&
21916         // Check if SETCC has already been promoted
21917         TLI.getSetCCResultType(*DAG.getContext(), VT) == CondVT) {
21918       bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
21919       bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
21920
21921       if (TValIsAllZeros || FValIsAllOnes) {
21922         SDValue CC = Cond.getOperand(2);
21923         ISD::CondCode NewCC =
21924           ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
21925                                Cond.getOperand(0).getValueType().isInteger());
21926         Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1), NewCC);
21927         std::swap(LHS, RHS);
21928         TValIsAllOnes = FValIsAllOnes;
21929         FValIsAllZeros = TValIsAllZeros;
21930       }
21931     }
21932
21933     if (TValIsAllOnes || FValIsAllZeros) {
21934       SDValue Ret;
21935
21936       if (TValIsAllOnes && FValIsAllZeros)
21937         Ret = Cond;
21938       else if (TValIsAllOnes)
21939         Ret =
21940             DAG.getNode(ISD::OR, DL, CondVT, Cond, DAG.getBitcast(CondVT, RHS));
21941       else if (FValIsAllZeros)
21942         Ret = DAG.getNode(ISD::AND, DL, CondVT, Cond,
21943                           DAG.getBitcast(CondVT, LHS));
21944
21945       return DAG.getBitcast(VT, Ret);
21946     }
21947   }
21948
21949   // We should generate an X86ISD::BLENDI from a vselect if its argument
21950   // is a sign_extend_inreg of an any_extend of a BUILD_VECTOR of
21951   // constants. This specific pattern gets generated when we split a
21952   // selector for a 512 bit vector in a machine without AVX512 (but with
21953   // 256-bit vectors), during legalization:
21954   //
21955   // (vselect (sign_extend (any_extend (BUILD_VECTOR)) i1) LHS RHS)
21956   //
21957   // Iff we find this pattern and the build_vectors are built from
21958   // constants, we translate the vselect into a shuffle_vector that we
21959   // know will be matched by LowerVECTOR_SHUFFLEtoBlend.
21960   if ((N->getOpcode() == ISD::VSELECT ||
21961        N->getOpcode() == X86ISD::SHRUNKBLEND) &&
21962       !DCI.isBeforeLegalize() && !VT.is512BitVector()) {
21963     SDValue Shuffle = transformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget);
21964     if (Shuffle.getNode())
21965       return Shuffle;
21966   }
21967
21968   // If this is a *dynamic* select (non-constant condition) and we can match
21969   // this node with one of the variable blend instructions, restructure the
21970   // condition so that the blends can use the high bit of each element and use
21971   // SimplifyDemandedBits to simplify the condition operand.
21972   if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
21973       !DCI.isBeforeLegalize() &&
21974       !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
21975     unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits();
21976
21977     // Don't optimize vector selects that map to mask-registers.
21978     if (BitWidth == 1)
21979       return SDValue();
21980
21981     // We can only handle the cases where VSELECT is directly legal on the
21982     // subtarget. We custom lower VSELECT nodes with constant conditions and
21983     // this makes it hard to see whether a dynamic VSELECT will correctly
21984     // lower, so we both check the operation's status and explicitly handle the
21985     // cases where a *dynamic* blend will fail even though a constant-condition
21986     // blend could be custom lowered.
21987     // FIXME: We should find a better way to handle this class of problems.
21988     // Potentially, we should combine constant-condition vselect nodes
21989     // pre-legalization into shuffles and not mark as many types as custom
21990     // lowered.
21991     if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
21992       return SDValue();
21993     // FIXME: We don't support i16-element blends currently. We could and
21994     // should support them by making *all* the bits in the condition be set
21995     // rather than just the high bit and using an i8-element blend.
21996     if (VT.getScalarType() == MVT::i16)
21997       return SDValue();
21998     // Dynamic blending was only available from SSE4.1 onward.
21999     if (VT.getSizeInBits() == 128 && !Subtarget->hasSSE41())
22000       return SDValue();
22001     // Byte blends are only available in AVX2
22002     if (VT.getSizeInBits() == 256 && VT.getScalarType() == MVT::i8 &&
22003         !Subtarget->hasAVX2())
22004       return SDValue();
22005
22006     assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
22007     APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1);
22008
22009     APInt KnownZero, KnownOne;
22010     TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
22011                                           DCI.isBeforeLegalizeOps());
22012     if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) ||
22013         TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne,
22014                                  TLO)) {
22015       // If we changed the computation somewhere in the DAG, this change
22016       // will affect all users of Cond.
22017       // Make sure it is fine and update all the nodes so that we do not
22018       // use the generic VSELECT anymore. Otherwise, we may perform
22019       // wrong optimizations as we messed up with the actual expectation
22020       // for the vector boolean values.
22021       if (Cond != TLO.Old) {
22022         // Check all uses of that condition operand to check whether it will be
22023         // consumed by non-BLEND instructions, which may depend on all bits are
22024         // set properly.
22025         for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
22026              I != E; ++I)
22027           if (I->getOpcode() != ISD::VSELECT)
22028             // TODO: Add other opcodes eventually lowered into BLEND.
22029             return SDValue();
22030
22031         // Update all the users of the condition, before committing the change,
22032         // so that the VSELECT optimizations that expect the correct vector
22033         // boolean value will not be triggered.
22034         for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
22035              I != E; ++I)
22036           DAG.ReplaceAllUsesOfValueWith(
22037               SDValue(*I, 0),
22038               DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(*I), I->getValueType(0),
22039                           Cond, I->getOperand(1), I->getOperand(2)));
22040         DCI.CommitTargetLoweringOpt(TLO);
22041         return SDValue();
22042       }
22043       // At this point, only Cond is changed. Change the condition
22044       // just for N to keep the opportunity to optimize all other
22045       // users their own way.
22046       DAG.ReplaceAllUsesOfValueWith(
22047           SDValue(N, 0),
22048           DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(N), N->getValueType(0),
22049                       TLO.New, N->getOperand(1), N->getOperand(2)));
22050       return SDValue();
22051     }
22052   }
22053
22054   return SDValue();
22055 }
22056
22057 // Check whether a boolean test is testing a boolean value generated by
22058 // X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
22059 // code.
22060 //
22061 // Simplify the following patterns:
22062 // (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
22063 // (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
22064 // to (Op EFLAGS Cond)
22065 //
22066 // (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
22067 // (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
22068 // to (Op EFLAGS !Cond)
22069 //
22070 // where Op could be BRCOND or CMOV.
22071 //
22072 static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
22073   // Quit if not CMP and SUB with its value result used.
22074   if (Cmp.getOpcode() != X86ISD::CMP &&
22075       (Cmp.getOpcode() != X86ISD::SUB || Cmp.getNode()->hasAnyUseOfValue(0)))
22076       return SDValue();
22077
22078   // Quit if not used as a boolean value.
22079   if (CC != X86::COND_E && CC != X86::COND_NE)
22080     return SDValue();
22081
22082   // Check CMP operands. One of them should be 0 or 1 and the other should be
22083   // an SetCC or extended from it.
22084   SDValue Op1 = Cmp.getOperand(0);
22085   SDValue Op2 = Cmp.getOperand(1);
22086
22087   SDValue SetCC;
22088   const ConstantSDNode* C = nullptr;
22089   bool needOppositeCond = (CC == X86::COND_E);
22090   bool checkAgainstTrue = false; // Is it a comparison against 1?
22091
22092   if ((C = dyn_cast<ConstantSDNode>(Op1)))
22093     SetCC = Op2;
22094   else if ((C = dyn_cast<ConstantSDNode>(Op2)))
22095     SetCC = Op1;
22096   else // Quit if all operands are not constants.
22097     return SDValue();
22098
22099   if (C->getZExtValue() == 1) {
22100     needOppositeCond = !needOppositeCond;
22101     checkAgainstTrue = true;
22102   } else if (C->getZExtValue() != 0)
22103     // Quit if the constant is neither 0 or 1.
22104     return SDValue();
22105
22106   bool truncatedToBoolWithAnd = false;
22107   // Skip (zext $x), (trunc $x), or (and $x, 1) node.
22108   while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
22109          SetCC.getOpcode() == ISD::TRUNCATE ||
22110          SetCC.getOpcode() == ISD::AND) {
22111     if (SetCC.getOpcode() == ISD::AND) {
22112       int OpIdx = -1;
22113       ConstantSDNode *CS;
22114       if ((CS = dyn_cast<ConstantSDNode>(SetCC.getOperand(0))) &&
22115           CS->getZExtValue() == 1)
22116         OpIdx = 1;
22117       if ((CS = dyn_cast<ConstantSDNode>(SetCC.getOperand(1))) &&
22118           CS->getZExtValue() == 1)
22119         OpIdx = 0;
22120       if (OpIdx == -1)
22121         break;
22122       SetCC = SetCC.getOperand(OpIdx);
22123       truncatedToBoolWithAnd = true;
22124     } else
22125       SetCC = SetCC.getOperand(0);
22126   }
22127
22128   switch (SetCC.getOpcode()) {
22129   case X86ISD::SETCC_CARRY:
22130     // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
22131     // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
22132     // i.e. it's a comparison against true but the result of SETCC_CARRY is not
22133     // truncated to i1 using 'and'.
22134     if (checkAgainstTrue && !truncatedToBoolWithAnd)
22135       break;
22136     assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
22137            "Invalid use of SETCC_CARRY!");
22138     // FALL THROUGH
22139   case X86ISD::SETCC:
22140     // Set the condition code or opposite one if necessary.
22141     CC = X86::CondCode(SetCC.getConstantOperandVal(0));
22142     if (needOppositeCond)
22143       CC = X86::GetOppositeBranchCondition(CC);
22144     return SetCC.getOperand(1);
22145   case X86ISD::CMOV: {
22146     // Check whether false/true value has canonical one, i.e. 0 or 1.
22147     ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
22148     ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
22149     // Quit if true value is not a constant.
22150     if (!TVal)
22151       return SDValue();
22152     // Quit if false value is not a constant.
22153     if (!FVal) {
22154       SDValue Op = SetCC.getOperand(0);
22155       // Skip 'zext' or 'trunc' node.
22156       if (Op.getOpcode() == ISD::ZERO_EXTEND ||
22157           Op.getOpcode() == ISD::TRUNCATE)
22158         Op = Op.getOperand(0);
22159       // A special case for rdrand/rdseed, where 0 is set if false cond is
22160       // found.
22161       if ((Op.getOpcode() != X86ISD::RDRAND &&
22162            Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
22163         return SDValue();
22164     }
22165     // Quit if false value is not the constant 0 or 1.
22166     bool FValIsFalse = true;
22167     if (FVal && FVal->getZExtValue() != 0) {
22168       if (FVal->getZExtValue() != 1)
22169         return SDValue();
22170       // If FVal is 1, opposite cond is needed.
22171       needOppositeCond = !needOppositeCond;
22172       FValIsFalse = false;
22173     }
22174     // Quit if TVal is not the constant opposite of FVal.
22175     if (FValIsFalse && TVal->getZExtValue() != 1)
22176       return SDValue();
22177     if (!FValIsFalse && TVal->getZExtValue() != 0)
22178       return SDValue();
22179     CC = X86::CondCode(SetCC.getConstantOperandVal(2));
22180     if (needOppositeCond)
22181       CC = X86::GetOppositeBranchCondition(CC);
22182     return SetCC.getOperand(3);
22183   }
22184   }
22185
22186   return SDValue();
22187 }
22188
22189 /// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
22190 /// Match:
22191 ///   (X86or (X86setcc) (X86setcc))
22192 ///   (X86cmp (and (X86setcc) (X86setcc)), 0)
22193 static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
22194                                            X86::CondCode &CC1, SDValue &Flags,
22195                                            bool &isAnd) {
22196   if (Cond->getOpcode() == X86ISD::CMP) {
22197     ConstantSDNode *CondOp1C = dyn_cast<ConstantSDNode>(Cond->getOperand(1));
22198     if (!CondOp1C || !CondOp1C->isNullValue())
22199       return false;
22200
22201     Cond = Cond->getOperand(0);
22202   }
22203
22204   isAnd = false;
22205
22206   SDValue SetCC0, SetCC1;
22207   switch (Cond->getOpcode()) {
22208   default: return false;
22209   case ISD::AND:
22210   case X86ISD::AND:
22211     isAnd = true;
22212     // fallthru
22213   case ISD::OR:
22214   case X86ISD::OR:
22215     SetCC0 = Cond->getOperand(0);
22216     SetCC1 = Cond->getOperand(1);
22217     break;
22218   };
22219
22220   // Make sure we have SETCC nodes, using the same flags value.
22221   if (SetCC0.getOpcode() != X86ISD::SETCC ||
22222       SetCC1.getOpcode() != X86ISD::SETCC ||
22223       SetCC0->getOperand(1) != SetCC1->getOperand(1))
22224     return false;
22225
22226   CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
22227   CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
22228   Flags = SetCC0->getOperand(1);
22229   return true;
22230 }
22231
22232 /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
22233 static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
22234                                   TargetLowering::DAGCombinerInfo &DCI,
22235                                   const X86Subtarget *Subtarget) {
22236   SDLoc DL(N);
22237
22238   // If the flag operand isn't dead, don't touch this CMOV.
22239   if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
22240     return SDValue();
22241
22242   SDValue FalseOp = N->getOperand(0);
22243   SDValue TrueOp = N->getOperand(1);
22244   X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
22245   SDValue Cond = N->getOperand(3);
22246
22247   if (CC == X86::COND_E || CC == X86::COND_NE) {
22248     switch (Cond.getOpcode()) {
22249     default: break;
22250     case X86ISD::BSR:
22251     case X86ISD::BSF:
22252       // If operand of BSR / BSF are proven never zero, then ZF cannot be set.
22253       if (DAG.isKnownNeverZero(Cond.getOperand(0)))
22254         return (CC == X86::COND_E) ? FalseOp : TrueOp;
22255     }
22256   }
22257
22258   SDValue Flags;
22259
22260   Flags = checkBoolTestSetCCCombine(Cond, CC);
22261   if (Flags.getNode() &&
22262       // Extra check as FCMOV only supports a subset of X86 cond.
22263       (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC))) {
22264     SDValue Ops[] = { FalseOp, TrueOp,
22265                       DAG.getConstant(CC, DL, MVT::i8), Flags };
22266     return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
22267   }
22268
22269   // If this is a select between two integer constants, try to do some
22270   // optimizations.  Note that the operands are ordered the opposite of SELECT
22271   // operands.
22272   if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
22273     if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
22274       // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
22275       // larger than FalseC (the false value).
22276       if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
22277         CC = X86::GetOppositeBranchCondition(CC);
22278         std::swap(TrueC, FalseC);
22279         std::swap(TrueOp, FalseOp);
22280       }
22281
22282       // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3.  Likewise for any pow2/0.
22283       // This is efficient for any integer data type (including i8/i16) and
22284       // shift amount.
22285       if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
22286         Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
22287                            DAG.getConstant(CC, DL, MVT::i8), Cond);
22288
22289         // Zero extend the condition if needed.
22290         Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
22291
22292         unsigned ShAmt = TrueC->getAPIntValue().logBase2();
22293         Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
22294                            DAG.getConstant(ShAmt, DL, MVT::i8));
22295         if (N->getNumValues() == 2)  // Dead flag value?
22296           return DCI.CombineTo(N, Cond, SDValue());
22297         return Cond;
22298       }
22299
22300       // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst.  This is efficient
22301       // for any integer data type, including i8/i16.
22302       if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
22303         Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
22304                            DAG.getConstant(CC, DL, MVT::i8), Cond);
22305
22306         // Zero extend the condition if needed.
22307         Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
22308                            FalseC->getValueType(0), Cond);
22309         Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
22310                            SDValue(FalseC, 0));
22311
22312         if (N->getNumValues() == 2)  // Dead flag value?
22313           return DCI.CombineTo(N, Cond, SDValue());
22314         return Cond;
22315       }
22316
22317       // Optimize cases that will turn into an LEA instruction.  This requires
22318       // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
22319       if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
22320         uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
22321         if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
22322
22323         bool isFastMultiplier = false;
22324         if (Diff < 10) {
22325           switch ((unsigned char)Diff) {
22326           default: break;
22327           case 1:  // result = add base, cond
22328           case 2:  // result = lea base(    , cond*2)
22329           case 3:  // result = lea base(cond, cond*2)
22330           case 4:  // result = lea base(    , cond*4)
22331           case 5:  // result = lea base(cond, cond*4)
22332           case 8:  // result = lea base(    , cond*8)
22333           case 9:  // result = lea base(cond, cond*8)
22334             isFastMultiplier = true;
22335             break;
22336           }
22337         }
22338
22339         if (isFastMultiplier) {
22340           APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
22341           Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
22342                              DAG.getConstant(CC, DL, MVT::i8), Cond);
22343           // Zero extend the condition if needed.
22344           Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
22345                              Cond);
22346           // Scale the condition by the difference.
22347           if (Diff != 1)
22348             Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
22349                                DAG.getConstant(Diff, DL, Cond.getValueType()));
22350
22351           // Add the base if non-zero.
22352           if (FalseC->getAPIntValue() != 0)
22353             Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
22354                                SDValue(FalseC, 0));
22355           if (N->getNumValues() == 2)  // Dead flag value?
22356             return DCI.CombineTo(N, Cond, SDValue());
22357           return Cond;
22358         }
22359       }
22360     }
22361   }
22362
22363   // Handle these cases:
22364   //   (select (x != c), e, c) -> select (x != c), e, x),
22365   //   (select (x == c), c, e) -> select (x == c), x, e)
22366   // where the c is an integer constant, and the "select" is the combination
22367   // of CMOV and CMP.
22368   //
22369   // The rationale for this change is that the conditional-move from a constant
22370   // needs two instructions, however, conditional-move from a register needs
22371   // only one instruction.
22372   //
22373   // CAVEAT: By replacing a constant with a symbolic value, it may obscure
22374   //  some instruction-combining opportunities. This opt needs to be
22375   //  postponed as late as possible.
22376   //
22377   if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
22378     // the DCI.xxxx conditions are provided to postpone the optimization as
22379     // late as possible.
22380
22381     ConstantSDNode *CmpAgainst = nullptr;
22382     if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
22383         (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
22384         !isa<ConstantSDNode>(Cond.getOperand(0))) {
22385
22386       if (CC == X86::COND_NE &&
22387           CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
22388         CC = X86::GetOppositeBranchCondition(CC);
22389         std::swap(TrueOp, FalseOp);
22390       }
22391
22392       if (CC == X86::COND_E &&
22393           CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
22394         SDValue Ops[] = { FalseOp, Cond.getOperand(0),
22395                           DAG.getConstant(CC, DL, MVT::i8), Cond };
22396         return DAG.getNode(X86ISD::CMOV, DL, N->getVTList (), Ops);
22397       }
22398     }
22399   }
22400
22401   // Fold and/or of setcc's to double CMOV:
22402   //   (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
22403   //   (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
22404   //
22405   // This combine lets us generate:
22406   //   cmovcc1 (jcc1 if we don't have CMOV)
22407   //   cmovcc2 (same)
22408   // instead of:
22409   //   setcc1
22410   //   setcc2
22411   //   and/or
22412   //   cmovne (jne if we don't have CMOV)
22413   // When we can't use the CMOV instruction, it might increase branch
22414   // mispredicts.
22415   // When we can use CMOV, or when there is no mispredict, this improves
22416   // throughput and reduces register pressure.
22417   //
22418   if (CC == X86::COND_NE) {
22419     SDValue Flags;
22420     X86::CondCode CC0, CC1;
22421     bool isAndSetCC;
22422     if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
22423       if (isAndSetCC) {
22424         std::swap(FalseOp, TrueOp);
22425         CC0 = X86::GetOppositeBranchCondition(CC0);
22426         CC1 = X86::GetOppositeBranchCondition(CC1);
22427       }
22428
22429       SDValue LOps[] = {FalseOp, TrueOp, DAG.getConstant(CC0, DL, MVT::i8),
22430         Flags};
22431       SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), LOps);
22432       SDValue Ops[] = {LCMOV, TrueOp, DAG.getConstant(CC1, DL, MVT::i8), Flags};
22433       SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
22434       DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SDValue(CMOV.getNode(), 1));
22435       return CMOV;
22436     }
22437   }
22438
22439   return SDValue();
22440 }
22441
22442 static SDValue PerformINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG,
22443                                                 const X86Subtarget *Subtarget) {
22444   unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
22445   switch (IntNo) {
22446   default: return SDValue();
22447   // SSE/AVX/AVX2 blend intrinsics.
22448   case Intrinsic::x86_avx2_pblendvb:
22449     // Don't try to simplify this intrinsic if we don't have AVX2.
22450     if (!Subtarget->hasAVX2())
22451       return SDValue();
22452     // FALL-THROUGH
22453   case Intrinsic::x86_avx_blendv_pd_256:
22454   case Intrinsic::x86_avx_blendv_ps_256:
22455     // Don't try to simplify this intrinsic if we don't have AVX.
22456     if (!Subtarget->hasAVX())
22457       return SDValue();
22458     // FALL-THROUGH
22459   case Intrinsic::x86_sse41_blendvps:
22460   case Intrinsic::x86_sse41_blendvpd:
22461   case Intrinsic::x86_sse41_pblendvb: {
22462     SDValue Op0 = N->getOperand(1);
22463     SDValue Op1 = N->getOperand(2);
22464     SDValue Mask = N->getOperand(3);
22465
22466     // Don't try to simplify this intrinsic if we don't have SSE4.1.
22467     if (!Subtarget->hasSSE41())
22468       return SDValue();
22469
22470     // fold (blend A, A, Mask) -> A
22471     if (Op0 == Op1)
22472       return Op0;
22473     // fold (blend A, B, allZeros) -> A
22474     if (ISD::isBuildVectorAllZeros(Mask.getNode()))
22475       return Op0;
22476     // fold (blend A, B, allOnes) -> B
22477     if (ISD::isBuildVectorAllOnes(Mask.getNode()))
22478       return Op1;
22479
22480     // Simplify the case where the mask is a constant i32 value.
22481     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Mask)) {
22482       if (C->isNullValue())
22483         return Op0;
22484       if (C->isAllOnesValue())
22485         return Op1;
22486     }
22487
22488     return SDValue();
22489   }
22490
22491   // Packed SSE2/AVX2 arithmetic shift immediate intrinsics.
22492   case Intrinsic::x86_sse2_psrai_w:
22493   case Intrinsic::x86_sse2_psrai_d:
22494   case Intrinsic::x86_avx2_psrai_w:
22495   case Intrinsic::x86_avx2_psrai_d:
22496   case Intrinsic::x86_sse2_psra_w:
22497   case Intrinsic::x86_sse2_psra_d:
22498   case Intrinsic::x86_avx2_psra_w:
22499   case Intrinsic::x86_avx2_psra_d: {
22500     SDValue Op0 = N->getOperand(1);
22501     SDValue Op1 = N->getOperand(2);
22502     EVT VT = Op0.getValueType();
22503     assert(VT.isVector() && "Expected a vector type!");
22504
22505     if (isa<BuildVectorSDNode>(Op1))
22506       Op1 = Op1.getOperand(0);
22507
22508     if (!isa<ConstantSDNode>(Op1))
22509       return SDValue();
22510
22511     EVT SVT = VT.getVectorElementType();
22512     unsigned SVTBits = SVT.getSizeInBits();
22513
22514     ConstantSDNode *CND = cast<ConstantSDNode>(Op1);
22515     const APInt &C = APInt(SVTBits, CND->getAPIntValue().getZExtValue());
22516     uint64_t ShAmt = C.getZExtValue();
22517
22518     // Don't try to convert this shift into a ISD::SRA if the shift
22519     // count is bigger than or equal to the element size.
22520     if (ShAmt >= SVTBits)
22521       return SDValue();
22522
22523     // Trivial case: if the shift count is zero, then fold this
22524     // into the first operand.
22525     if (ShAmt == 0)
22526       return Op0;
22527
22528     // Replace this packed shift intrinsic with a target independent
22529     // shift dag node.
22530     SDLoc DL(N);
22531     SDValue Splat = DAG.getConstant(C, DL, VT);
22532     return DAG.getNode(ISD::SRA, DL, VT, Op0, Splat);
22533   }
22534   }
22535 }
22536
22537 /// PerformMulCombine - Optimize a single multiply with constant into two
22538 /// in order to implement it with two cheaper instructions, e.g.
22539 /// LEA + SHL, LEA + LEA.
22540 static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
22541                                  TargetLowering::DAGCombinerInfo &DCI) {
22542   if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
22543     return SDValue();
22544
22545   EVT VT = N->getValueType(0);
22546   if (VT != MVT::i64 && VT != MVT::i32)
22547     return SDValue();
22548
22549   ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
22550   if (!C)
22551     return SDValue();
22552   uint64_t MulAmt = C->getZExtValue();
22553   if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
22554     return SDValue();
22555
22556   uint64_t MulAmt1 = 0;
22557   uint64_t MulAmt2 = 0;
22558   if ((MulAmt % 9) == 0) {
22559     MulAmt1 = 9;
22560     MulAmt2 = MulAmt / 9;
22561   } else if ((MulAmt % 5) == 0) {
22562     MulAmt1 = 5;
22563     MulAmt2 = MulAmt / 5;
22564   } else if ((MulAmt % 3) == 0) {
22565     MulAmt1 = 3;
22566     MulAmt2 = MulAmt / 3;
22567   }
22568   if (MulAmt2 &&
22569       (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
22570     SDLoc DL(N);
22571
22572     if (isPowerOf2_64(MulAmt2) &&
22573         !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
22574       // If second multiplifer is pow2, issue it first. We want the multiply by
22575       // 3, 5, or 9 to be folded into the addressing mode unless the lone use
22576       // is an add.
22577       std::swap(MulAmt1, MulAmt2);
22578
22579     SDValue NewMul;
22580     if (isPowerOf2_64(MulAmt1))
22581       NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
22582                            DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
22583     else
22584       NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
22585                            DAG.getConstant(MulAmt1, DL, VT));
22586
22587     if (isPowerOf2_64(MulAmt2))
22588       NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
22589                            DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
22590     else
22591       NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
22592                            DAG.getConstant(MulAmt2, DL, VT));
22593
22594     // Do not add new nodes to DAG combiner worklist.
22595     DCI.CombineTo(N, NewMul, false);
22596   }
22597   return SDValue();
22598 }
22599
22600 static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
22601   SDValue N0 = N->getOperand(0);
22602   SDValue N1 = N->getOperand(1);
22603   ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
22604   EVT VT = N0.getValueType();
22605
22606   // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
22607   // since the result of setcc_c is all zero's or all ones.
22608   if (VT.isInteger() && !VT.isVector() &&
22609       N1C && N0.getOpcode() == ISD::AND &&
22610       N0.getOperand(1).getOpcode() == ISD::Constant) {
22611     SDValue N00 = N0.getOperand(0);
22612     if (N00.getOpcode() == X86ISD::SETCC_CARRY ||
22613         ((N00.getOpcode() == ISD::ANY_EXTEND ||
22614           N00.getOpcode() == ISD::ZERO_EXTEND) &&
22615          N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) {
22616       APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
22617       APInt ShAmt = N1C->getAPIntValue();
22618       Mask = Mask.shl(ShAmt);
22619       if (Mask != 0) {
22620         SDLoc DL(N);
22621         return DAG.getNode(ISD::AND, DL, VT,
22622                            N00, DAG.getConstant(Mask, DL, VT));
22623       }
22624     }
22625   }
22626
22627   // Hardware support for vector shifts is sparse which makes us scalarize the
22628   // vector operations in many cases. Also, on sandybridge ADD is faster than
22629   // shl.
22630   // (shl V, 1) -> add V,V
22631   if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
22632     if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
22633       assert(N0.getValueType().isVector() && "Invalid vector shift type");
22634       // We shift all of the values by one. In many cases we do not have
22635       // hardware support for this operation. This is better expressed as an ADD
22636       // of two values.
22637       if (N1SplatC->getZExtValue() == 1)
22638         return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
22639     }
22640
22641   return SDValue();
22642 }
22643
22644 /// \brief Returns a vector of 0s if the node in input is a vector logical
22645 /// shift by a constant amount which is known to be bigger than or equal
22646 /// to the vector element size in bits.
22647 static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
22648                                       const X86Subtarget *Subtarget) {
22649   EVT VT = N->getValueType(0);
22650
22651   if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 &&
22652       (!Subtarget->hasInt256() ||
22653        (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16)))
22654     return SDValue();
22655
22656   SDValue Amt = N->getOperand(1);
22657   SDLoc DL(N);
22658   if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Amt))
22659     if (auto *AmtSplat = AmtBV->getConstantSplatNode()) {
22660       APInt ShiftAmt = AmtSplat->getAPIntValue();
22661       unsigned MaxAmount = VT.getVectorElementType().getSizeInBits();
22662
22663       // SSE2/AVX2 logical shifts always return a vector of 0s
22664       // if the shift amount is bigger than or equal to
22665       // the element size. The constant shift amount will be
22666       // encoded as a 8-bit immediate.
22667       if (ShiftAmt.trunc(8).uge(MaxAmount))
22668         return getZeroVector(VT, Subtarget, DAG, DL);
22669     }
22670
22671   return SDValue();
22672 }
22673
22674 /// PerformShiftCombine - Combine shifts.
22675 static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
22676                                    TargetLowering::DAGCombinerInfo &DCI,
22677                                    const X86Subtarget *Subtarget) {
22678   if (N->getOpcode() == ISD::SHL) {
22679     SDValue V = PerformSHLCombine(N, DAG);
22680     if (V.getNode()) return V;
22681   }
22682
22683   if (N->getOpcode() != ISD::SRA) {
22684     // Try to fold this logical shift into a zero vector.
22685     SDValue V = performShiftToAllZeros(N, DAG, Subtarget);
22686     if (V.getNode()) return V;
22687   }
22688
22689   return SDValue();
22690 }
22691
22692 // CMPEQCombine - Recognize the distinctive  (AND (setcc ...) (setcc ..))
22693 // where both setccs reference the same FP CMP, and rewrite for CMPEQSS
22694 // and friends.  Likewise for OR -> CMPNEQSS.
22695 static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG,
22696                             TargetLowering::DAGCombinerInfo &DCI,
22697                             const X86Subtarget *Subtarget) {
22698   unsigned opcode;
22699
22700   // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
22701   // we're requiring SSE2 for both.
22702   if (Subtarget->hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
22703     SDValue N0 = N->getOperand(0);
22704     SDValue N1 = N->getOperand(1);
22705     SDValue CMP0 = N0->getOperand(1);
22706     SDValue CMP1 = N1->getOperand(1);
22707     SDLoc DL(N);
22708
22709     // The SETCCs should both refer to the same CMP.
22710     if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
22711       return SDValue();
22712
22713     SDValue CMP00 = CMP0->getOperand(0);
22714     SDValue CMP01 = CMP0->getOperand(1);
22715     EVT     VT    = CMP00.getValueType();
22716
22717     if (VT == MVT::f32 || VT == MVT::f64) {
22718       bool ExpectingFlags = false;
22719       // Check for any users that want flags:
22720       for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
22721            !ExpectingFlags && UI != UE; ++UI)
22722         switch (UI->getOpcode()) {
22723         default:
22724         case ISD::BR_CC:
22725         case ISD::BRCOND:
22726         case ISD::SELECT:
22727           ExpectingFlags = true;
22728           break;
22729         case ISD::CopyToReg:
22730         case ISD::SIGN_EXTEND:
22731         case ISD::ZERO_EXTEND:
22732         case ISD::ANY_EXTEND:
22733           break;
22734         }
22735
22736       if (!ExpectingFlags) {
22737         enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
22738         enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
22739
22740         if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
22741           X86::CondCode tmp = cc0;
22742           cc0 = cc1;
22743           cc1 = tmp;
22744         }
22745
22746         if ((cc0 == X86::COND_E  && cc1 == X86::COND_NP) ||
22747             (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
22748           // FIXME: need symbolic constants for these magic numbers.
22749           // See X86ATTInstPrinter.cpp:printSSECC().
22750           unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
22751           if (Subtarget->hasAVX512()) {
22752             SDValue FSetCC = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CMP00,
22753                                          CMP01,
22754                                          DAG.getConstant(x86cc, DL, MVT::i8));
22755             if (N->getValueType(0) != MVT::i1)
22756               return DAG.getNode(ISD::ZERO_EXTEND, DL, N->getValueType(0),
22757                                  FSetCC);
22758             return FSetCC;
22759           }
22760           SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
22761                                               CMP00.getValueType(), CMP00, CMP01,
22762                                               DAG.getConstant(x86cc, DL,
22763                                                               MVT::i8));
22764
22765           bool is64BitFP = (CMP00.getValueType() == MVT::f64);
22766           MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
22767
22768           if (is64BitFP && !Subtarget->is64Bit()) {
22769             // On a 32-bit target, we cannot bitcast the 64-bit float to a
22770             // 64-bit integer, since that's not a legal type. Since
22771             // OnesOrZeroesF is all ones of all zeroes, we don't need all the
22772             // bits, but can do this little dance to extract the lowest 32 bits
22773             // and work with those going forward.
22774             SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
22775                                            OnesOrZeroesF);
22776             SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
22777             OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
22778                                         Vector32, DAG.getIntPtrConstant(0, DL));
22779             IntVT = MVT::i32;
22780           }
22781
22782           SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
22783           SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
22784                                       DAG.getConstant(1, DL, IntVT));
22785           SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
22786                                               ANDed);
22787           return OneBitOfTruth;
22788         }
22789       }
22790     }
22791   }
22792   return SDValue();
22793 }
22794
22795 /// CanFoldXORWithAllOnes - Test whether the XOR operand is a AllOnes vector
22796 /// so it can be folded inside ANDNP.
22797 static bool CanFoldXORWithAllOnes(const SDNode *N) {
22798   EVT VT = N->getValueType(0);
22799
22800   // Match direct AllOnes for 128 and 256-bit vectors
22801   if (ISD::isBuildVectorAllOnes(N))
22802     return true;
22803
22804   // Look through a bit convert.
22805   if (N->getOpcode() == ISD::BITCAST)
22806     N = N->getOperand(0).getNode();
22807
22808   // Sometimes the operand may come from a insert_subvector building a 256-bit
22809   // allones vector
22810   if (VT.is256BitVector() &&
22811       N->getOpcode() == ISD::INSERT_SUBVECTOR) {
22812     SDValue V1 = N->getOperand(0);
22813     SDValue V2 = N->getOperand(1);
22814
22815     if (V1.getOpcode() == ISD::INSERT_SUBVECTOR &&
22816         V1.getOperand(0).getOpcode() == ISD::UNDEF &&
22817         ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) &&
22818         ISD::isBuildVectorAllOnes(V2.getNode()))
22819       return true;
22820   }
22821
22822   return false;
22823 }
22824
22825 // On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
22826 // register. In most cases we actually compare or select YMM-sized registers
22827 // and mixing the two types creates horrible code. This method optimizes
22828 // some of the transition sequences.
22829 static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
22830                                  TargetLowering::DAGCombinerInfo &DCI,
22831                                  const X86Subtarget *Subtarget) {
22832   EVT VT = N->getValueType(0);
22833   if (!VT.is256BitVector())
22834     return SDValue();
22835
22836   assert((N->getOpcode() == ISD::ANY_EXTEND ||
22837           N->getOpcode() == ISD::ZERO_EXTEND ||
22838           N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
22839
22840   SDValue Narrow = N->getOperand(0);
22841   EVT NarrowVT = Narrow->getValueType(0);
22842   if (!NarrowVT.is128BitVector())
22843     return SDValue();
22844
22845   if (Narrow->getOpcode() != ISD::XOR &&
22846       Narrow->getOpcode() != ISD::AND &&
22847       Narrow->getOpcode() != ISD::OR)
22848     return SDValue();
22849
22850   SDValue N0  = Narrow->getOperand(0);
22851   SDValue N1  = Narrow->getOperand(1);
22852   SDLoc DL(Narrow);
22853
22854   // The Left side has to be a trunc.
22855   if (N0.getOpcode() != ISD::TRUNCATE)
22856     return SDValue();
22857
22858   // The type of the truncated inputs.
22859   EVT WideVT = N0->getOperand(0)->getValueType(0);
22860   if (WideVT != VT)
22861     return SDValue();
22862
22863   // The right side has to be a 'trunc' or a constant vector.
22864   bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE;
22865   ConstantSDNode *RHSConstSplat = nullptr;
22866   if (auto *RHSBV = dyn_cast<BuildVectorSDNode>(N1))
22867     RHSConstSplat = RHSBV->getConstantSplatNode();
22868   if (!RHSTrunc && !RHSConstSplat)
22869     return SDValue();
22870
22871   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
22872
22873   if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), WideVT))
22874     return SDValue();
22875
22876   // Set N0 and N1 to hold the inputs to the new wide operation.
22877   N0 = N0->getOperand(0);
22878   if (RHSConstSplat) {
22879     N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getScalarType(),
22880                      SDValue(RHSConstSplat, 0));
22881     SmallVector<SDValue, 8> C(WideVT.getVectorNumElements(), N1);
22882     N1 = DAG.getNode(ISD::BUILD_VECTOR, DL, WideVT, C);
22883   } else if (RHSTrunc) {
22884     N1 = N1->getOperand(0);
22885   }
22886
22887   // Generate the wide operation.
22888   SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, WideVT, N0, N1);
22889   unsigned Opcode = N->getOpcode();
22890   switch (Opcode) {
22891   case ISD::ANY_EXTEND:
22892     return Op;
22893   case ISD::ZERO_EXTEND: {
22894     unsigned InBits = NarrowVT.getScalarType().getSizeInBits();
22895     APInt Mask = APInt::getAllOnesValue(InBits);
22896     Mask = Mask.zext(VT.getScalarType().getSizeInBits());
22897     return DAG.getNode(ISD::AND, DL, VT,
22898                        Op, DAG.getConstant(Mask, DL, VT));
22899   }
22900   case ISD::SIGN_EXTEND:
22901     return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
22902                        Op, DAG.getValueType(NarrowVT));
22903   default:
22904     llvm_unreachable("Unexpected opcode");
22905   }
22906 }
22907
22908 static SDValue VectorZextCombine(SDNode *N, SelectionDAG &DAG,
22909                                  TargetLowering::DAGCombinerInfo &DCI,
22910                                  const X86Subtarget *Subtarget) {
22911   SDValue N0 = N->getOperand(0);
22912   SDValue N1 = N->getOperand(1);
22913   SDLoc DL(N);
22914
22915   // A vector zext_in_reg may be represented as a shuffle,
22916   // feeding into a bitcast (this represents anyext) feeding into
22917   // an and with a mask.
22918   // We'd like to try to combine that into a shuffle with zero
22919   // plus a bitcast, removing the and.
22920   if (N0.getOpcode() != ISD::BITCAST ||
22921       N0.getOperand(0).getOpcode() != ISD::VECTOR_SHUFFLE)
22922     return SDValue();
22923
22924   // The other side of the AND should be a splat of 2^C, where C
22925   // is the number of bits in the source type.
22926   if (N1.getOpcode() == ISD::BITCAST)
22927     N1 = N1.getOperand(0);
22928   if (N1.getOpcode() != ISD::BUILD_VECTOR)
22929     return SDValue();
22930   BuildVectorSDNode *Vector = cast<BuildVectorSDNode>(N1);
22931
22932   ShuffleVectorSDNode *Shuffle = cast<ShuffleVectorSDNode>(N0.getOperand(0));
22933   EVT SrcType = Shuffle->getValueType(0);
22934
22935   // We expect a single-source shuffle
22936   if (Shuffle->getOperand(1)->getOpcode() != ISD::UNDEF)
22937     return SDValue();
22938
22939   unsigned SrcSize = SrcType.getScalarSizeInBits();
22940
22941   APInt SplatValue, SplatUndef;
22942   unsigned SplatBitSize;
22943   bool HasAnyUndefs;
22944   if (!Vector->isConstantSplat(SplatValue, SplatUndef,
22945                                 SplatBitSize, HasAnyUndefs))
22946     return SDValue();
22947
22948   unsigned ResSize = N1.getValueType().getScalarSizeInBits();
22949   // Make sure the splat matches the mask we expect
22950   if (SplatBitSize > ResSize ||
22951       (SplatValue + 1).exactLogBase2() != (int)SrcSize)
22952     return SDValue();
22953
22954   // Make sure the input and output size make sense
22955   if (SrcSize >= ResSize || ResSize % SrcSize)
22956     return SDValue();
22957
22958   // We expect a shuffle of the form <0, u, u, u, 1, u, u, u...>
22959   // The number of u's between each two values depends on the ratio between
22960   // the source and dest type.
22961   unsigned ZextRatio = ResSize / SrcSize;
22962   bool IsZext = true;
22963   for (unsigned i = 0; i < SrcType.getVectorNumElements(); ++i) {
22964     if (i % ZextRatio) {
22965       if (Shuffle->getMaskElt(i) > 0) {
22966         // Expected undef
22967         IsZext = false;
22968         break;
22969       }
22970     } else {
22971       if (Shuffle->getMaskElt(i) != (int)(i / ZextRatio)) {
22972         // Expected element number
22973         IsZext = false;
22974         break;
22975       }
22976     }
22977   }
22978
22979   if (!IsZext)
22980     return SDValue();
22981
22982   // Ok, perform the transformation - replace the shuffle with
22983   // a shuffle of the form <0, k, k, k, 1, k, k, k> with zero
22984   // (instead of undef) where the k elements come from the zero vector.
22985   SmallVector<int, 8> Mask;
22986   unsigned NumElems = SrcType.getVectorNumElements();
22987   for (unsigned i = 0; i < NumElems; ++i)
22988     if (i % ZextRatio)
22989       Mask.push_back(NumElems);
22990     else
22991       Mask.push_back(i / ZextRatio);
22992
22993   SDValue NewShuffle = DAG.getVectorShuffle(Shuffle->getValueType(0), DL,
22994     Shuffle->getOperand(0), DAG.getConstant(0, DL, SrcType), Mask);
22995   return DAG.getBitcast(N0.getValueType(), NewShuffle);
22996 }
22997
22998 static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
22999                                  TargetLowering::DAGCombinerInfo &DCI,
23000                                  const X86Subtarget *Subtarget) {
23001   if (DCI.isBeforeLegalizeOps())
23002     return SDValue();
23003
23004   if (SDValue Zext = VectorZextCombine(N, DAG, DCI, Subtarget))
23005     return Zext;
23006
23007   if (SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget))
23008     return R;
23009
23010   EVT VT = N->getValueType(0);
23011   SDValue N0 = N->getOperand(0);
23012   SDValue N1 = N->getOperand(1);
23013   SDLoc DL(N);
23014
23015   // Create BEXTR instructions
23016   // BEXTR is ((X >> imm) & (2**size-1))
23017   if (VT == MVT::i32 || VT == MVT::i64) {
23018     // Check for BEXTR.
23019     if ((Subtarget->hasBMI() || Subtarget->hasTBM()) &&
23020         (N0.getOpcode() == ISD::SRA || N0.getOpcode() == ISD::SRL)) {
23021       ConstantSDNode *MaskNode = dyn_cast<ConstantSDNode>(N1);
23022       ConstantSDNode *ShiftNode = dyn_cast<ConstantSDNode>(N0.getOperand(1));
23023       if (MaskNode && ShiftNode) {
23024         uint64_t Mask = MaskNode->getZExtValue();
23025         uint64_t Shift = ShiftNode->getZExtValue();
23026         if (isMask_64(Mask)) {
23027           uint64_t MaskSize = countPopulation(Mask);
23028           if (Shift + MaskSize <= VT.getSizeInBits())
23029             return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0),
23030                                DAG.getConstant(Shift | (MaskSize << 8), DL,
23031                                                VT));
23032         }
23033       }
23034     } // BEXTR
23035
23036     return SDValue();
23037   }
23038
23039   // Want to form ANDNP nodes:
23040   // 1) In the hopes of then easily combining them with OR and AND nodes
23041   //    to form PBLEND/PSIGN.
23042   // 2) To match ANDN packed intrinsics
23043   if (VT != MVT::v2i64 && VT != MVT::v4i64)
23044     return SDValue();
23045
23046   // Check LHS for vnot
23047   if (N0.getOpcode() == ISD::XOR &&
23048       //ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
23049       CanFoldXORWithAllOnes(N0.getOperand(1).getNode()))
23050     return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1);
23051
23052   // Check RHS for vnot
23053   if (N1.getOpcode() == ISD::XOR &&
23054       //ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
23055       CanFoldXORWithAllOnes(N1.getOperand(1).getNode()))
23056     return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0);
23057
23058   return SDValue();
23059 }
23060
23061 static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
23062                                 TargetLowering::DAGCombinerInfo &DCI,
23063                                 const X86Subtarget *Subtarget) {
23064   if (DCI.isBeforeLegalizeOps())
23065     return SDValue();
23066
23067   SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget);
23068   if (R.getNode())
23069     return R;
23070
23071   SDValue N0 = N->getOperand(0);
23072   SDValue N1 = N->getOperand(1);
23073   EVT VT = N->getValueType(0);
23074
23075   // look for psign/blend
23076   if (VT == MVT::v2i64 || VT == MVT::v4i64) {
23077     if (!Subtarget->hasSSSE3() ||
23078         (VT == MVT::v4i64 && !Subtarget->hasInt256()))
23079       return SDValue();
23080
23081     // Canonicalize pandn to RHS
23082     if (N0.getOpcode() == X86ISD::ANDNP)
23083       std::swap(N0, N1);
23084     // or (and (m, y), (pandn m, x))
23085     if (N0.getOpcode() == ISD::AND && N1.getOpcode() == X86ISD::ANDNP) {
23086       SDValue Mask = N1.getOperand(0);
23087       SDValue X    = N1.getOperand(1);
23088       SDValue Y;
23089       if (N0.getOperand(0) == Mask)
23090         Y = N0.getOperand(1);
23091       if (N0.getOperand(1) == Mask)
23092         Y = N0.getOperand(0);
23093
23094       // Check to see if the mask appeared in both the AND and ANDNP and
23095       if (!Y.getNode())
23096         return SDValue();
23097
23098       // Validate that X, Y, and Mask are BIT_CONVERTS, and see through them.
23099       // Look through mask bitcast.
23100       if (Mask.getOpcode() == ISD::BITCAST)
23101         Mask = Mask.getOperand(0);
23102       if (X.getOpcode() == ISD::BITCAST)
23103         X = X.getOperand(0);
23104       if (Y.getOpcode() == ISD::BITCAST)
23105         Y = Y.getOperand(0);
23106
23107       EVT MaskVT = Mask.getValueType();
23108
23109       // Validate that the Mask operand is a vector sra node.
23110       // FIXME: what to do for bytes, since there is a psignb/pblendvb, but
23111       // there is no psrai.b
23112       unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits();
23113       unsigned SraAmt = ~0;
23114       if (Mask.getOpcode() == ISD::SRA) {
23115         if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Mask.getOperand(1)))
23116           if (auto *AmtConst = AmtBV->getConstantSplatNode())
23117             SraAmt = AmtConst->getZExtValue();
23118       } else if (Mask.getOpcode() == X86ISD::VSRAI) {
23119         SDValue SraC = Mask.getOperand(1);
23120         SraAmt  = cast<ConstantSDNode>(SraC)->getZExtValue();
23121       }
23122       if ((SraAmt + 1) != EltBits)
23123         return SDValue();
23124
23125       SDLoc DL(N);
23126
23127       // Now we know we at least have a plendvb with the mask val.  See if
23128       // we can form a psignb/w/d.
23129       // psign = x.type == y.type == mask.type && y = sub(0, x);
23130       if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X &&
23131           ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) &&
23132           X.getValueType() == MaskVT && Y.getValueType() == MaskVT) {
23133         assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
23134                "Unsupported VT for PSIGN");
23135         Mask = DAG.getNode(X86ISD::PSIGN, DL, MaskVT, X, Mask.getOperand(0));
23136         return DAG.getBitcast(VT, Mask);
23137       }
23138       // PBLENDVB only available on SSE 4.1
23139       if (!Subtarget->hasSSE41())
23140         return SDValue();
23141
23142       EVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;
23143
23144       X = DAG.getBitcast(BlendVT, X);
23145       Y = DAG.getBitcast(BlendVT, Y);
23146       Mask = DAG.getBitcast(BlendVT, Mask);
23147       Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X);
23148       return DAG.getBitcast(VT, Mask);
23149     }
23150   }
23151
23152   if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
23153     return SDValue();
23154
23155   // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
23156   MachineFunction &MF = DAG.getMachineFunction();
23157   bool OptForSize =
23158       MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize);
23159
23160   // SHLD/SHRD instructions have lower register pressure, but on some
23161   // platforms they have higher latency than the equivalent
23162   // series of shifts/or that would otherwise be generated.
23163   // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
23164   // have higher latencies and we are not optimizing for size.
23165   if (!OptForSize && Subtarget->isSHLDSlow())
23166     return SDValue();
23167
23168   if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
23169     std::swap(N0, N1);
23170   if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
23171     return SDValue();
23172   if (!N0.hasOneUse() || !N1.hasOneUse())
23173     return SDValue();
23174
23175   SDValue ShAmt0 = N0.getOperand(1);
23176   if (ShAmt0.getValueType() != MVT::i8)
23177     return SDValue();
23178   SDValue ShAmt1 = N1.getOperand(1);
23179   if (ShAmt1.getValueType() != MVT::i8)
23180     return SDValue();
23181   if (ShAmt0.getOpcode() == ISD::TRUNCATE)
23182     ShAmt0 = ShAmt0.getOperand(0);
23183   if (ShAmt1.getOpcode() == ISD::TRUNCATE)
23184     ShAmt1 = ShAmt1.getOperand(0);
23185
23186   SDLoc DL(N);
23187   unsigned Opc = X86ISD::SHLD;
23188   SDValue Op0 = N0.getOperand(0);
23189   SDValue Op1 = N1.getOperand(0);
23190   if (ShAmt0.getOpcode() == ISD::SUB) {
23191     Opc = X86ISD::SHRD;
23192     std::swap(Op0, Op1);
23193     std::swap(ShAmt0, ShAmt1);
23194   }
23195
23196   unsigned Bits = VT.getSizeInBits();
23197   if (ShAmt1.getOpcode() == ISD::SUB) {
23198     SDValue Sum = ShAmt1.getOperand(0);
23199     if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
23200       SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
23201       if (ShAmt1Op1.getNode()->getOpcode() == ISD::TRUNCATE)
23202         ShAmt1Op1 = ShAmt1Op1.getOperand(0);
23203       if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
23204         return DAG.getNode(Opc, DL, VT,
23205                            Op0, Op1,
23206                            DAG.getNode(ISD::TRUNCATE, DL,
23207                                        MVT::i8, ShAmt0));
23208     }
23209   } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
23210     ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
23211     if (ShAmt0C &&
23212         ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == Bits)
23213       return DAG.getNode(Opc, DL, VT,
23214                          N0.getOperand(0), N1.getOperand(0),
23215                          DAG.getNode(ISD::TRUNCATE, DL,
23216                                        MVT::i8, ShAmt0));
23217   }
23218
23219   return SDValue();
23220 }
23221
23222 // Generate NEG and CMOV for integer abs.
23223 static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
23224   EVT VT = N->getValueType(0);
23225
23226   // Since X86 does not have CMOV for 8-bit integer, we don't convert
23227   // 8-bit integer abs to NEG and CMOV.
23228   if (VT.isInteger() && VT.getSizeInBits() == 8)
23229     return SDValue();
23230
23231   SDValue N0 = N->getOperand(0);
23232   SDValue N1 = N->getOperand(1);
23233   SDLoc DL(N);
23234
23235   // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
23236   // and change it to SUB and CMOV.
23237   if (VT.isInteger() && N->getOpcode() == ISD::XOR &&
23238       N0.getOpcode() == ISD::ADD &&
23239       N0.getOperand(1) == N1 &&
23240       N1.getOpcode() == ISD::SRA &&
23241       N1.getOperand(0) == N0.getOperand(0))
23242     if (ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1)))
23243       if (Y1C->getAPIntValue() == VT.getSizeInBits()-1) {
23244         // Generate SUB & CMOV.
23245         SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
23246                                   DAG.getConstant(0, DL, VT), N0.getOperand(0));
23247
23248         SDValue Ops[] = { N0.getOperand(0), Neg,
23249                           DAG.getConstant(X86::COND_GE, DL, MVT::i8),
23250                           SDValue(Neg.getNode(), 1) };
23251         return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), Ops);
23252       }
23253   return SDValue();
23254 }
23255
23256 // PerformXorCombine - Attempts to turn XOR nodes into BLSMSK nodes
23257 static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG,
23258                                  TargetLowering::DAGCombinerInfo &DCI,
23259                                  const X86Subtarget *Subtarget) {
23260   if (DCI.isBeforeLegalizeOps())
23261     return SDValue();
23262
23263   if (Subtarget->hasCMov()) {
23264     SDValue RV = performIntegerAbsCombine(N, DAG);
23265     if (RV.getNode())
23266       return RV;
23267   }
23268
23269   return SDValue();
23270 }
23271
23272 /// PerformLOADCombine - Do target-specific dag combines on LOAD nodes.
23273 static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
23274                                   TargetLowering::DAGCombinerInfo &DCI,
23275                                   const X86Subtarget *Subtarget) {
23276   LoadSDNode *Ld = cast<LoadSDNode>(N);
23277   EVT RegVT = Ld->getValueType(0);
23278   EVT MemVT = Ld->getMemoryVT();
23279   SDLoc dl(Ld);
23280   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23281
23282   // For chips with slow 32-byte unaligned loads, break the 32-byte operation
23283   // into two 16-byte operations.
23284   ISD::LoadExtType Ext = Ld->getExtensionType();
23285   unsigned Alignment = Ld->getAlignment();
23286   bool IsAligned = Alignment == 0 || Alignment >= MemVT.getSizeInBits()/8;
23287   if (RegVT.is256BitVector() && Subtarget->isUnalignedMem32Slow() &&
23288       !DCI.isBeforeLegalizeOps() && !IsAligned && Ext == ISD::NON_EXTLOAD) {
23289     unsigned NumElems = RegVT.getVectorNumElements();
23290     if (NumElems < 2)
23291       return SDValue();
23292
23293     SDValue Ptr = Ld->getBasePtr();
23294     SDValue Increment = DAG.getConstant(16, dl, TLI.getPointerTy());
23295
23296     EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
23297                                   NumElems/2);
23298     SDValue Load1 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
23299                                 Ld->getPointerInfo(), Ld->isVolatile(),
23300                                 Ld->isNonTemporal(), Ld->isInvariant(),
23301                                 Alignment);
23302     Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
23303     SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
23304                                 Ld->getPointerInfo(), Ld->isVolatile(),
23305                                 Ld->isNonTemporal(), Ld->isInvariant(),
23306                                 std::min(16U, Alignment));
23307     SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
23308                              Load1.getValue(1),
23309                              Load2.getValue(1));
23310
23311     SDValue NewVec = DAG.getUNDEF(RegVT);
23312     NewVec = Insert128BitVector(NewVec, Load1, 0, DAG, dl);
23313     NewVec = Insert128BitVector(NewVec, Load2, NumElems/2, DAG, dl);
23314     return DCI.CombineTo(N, NewVec, TF, true);
23315   }
23316
23317   return SDValue();
23318 }
23319
23320 /// PerformMLOADCombine - Resolve extending loads
23321 static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG,
23322                                    TargetLowering::DAGCombinerInfo &DCI,
23323                                    const X86Subtarget *Subtarget) {
23324   MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N);
23325   if (Mld->getExtensionType() != ISD::SEXTLOAD)
23326     return SDValue();
23327
23328   EVT VT = Mld->getValueType(0);
23329   unsigned NumElems = VT.getVectorNumElements();
23330   EVT LdVT = Mld->getMemoryVT();
23331   SDLoc dl(Mld);
23332
23333   assert(LdVT != VT && "Cannot extend to the same type");
23334   unsigned ToSz = VT.getVectorElementType().getSizeInBits();
23335   unsigned FromSz = LdVT.getVectorElementType().getSizeInBits();
23336   // From, To sizes and ElemCount must be pow of two
23337   assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
23338     "Unexpected size for extending masked load");
23339
23340   unsigned SizeRatio  = ToSz / FromSz;
23341   assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits());
23342
23343   // Create a type on which we perform the shuffle
23344   EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
23345           LdVT.getScalarType(), NumElems*SizeRatio);
23346   assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
23347
23348   // Convert Src0 value
23349   SDValue WideSrc0 = DAG.getBitcast(WideVecVT, Mld->getSrc0());
23350   if (Mld->getSrc0().getOpcode() != ISD::UNDEF) {
23351     SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
23352     for (unsigned i = 0; i != NumElems; ++i)
23353       ShuffleVec[i] = i * SizeRatio;
23354
23355     // Can't shuffle using an illegal type.
23356     assert (DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT)
23357             && "WideVecVT should be legal");
23358     WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0,
23359                                     DAG.getUNDEF(WideVecVT), &ShuffleVec[0]);
23360   }
23361   // Prepare the new mask
23362   SDValue NewMask;
23363   SDValue Mask = Mld->getMask();
23364   if (Mask.getValueType() == VT) {
23365     // Mask and original value have the same type
23366     NewMask = DAG.getBitcast(WideVecVT, Mask);
23367     SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
23368     for (unsigned i = 0; i != NumElems; ++i)
23369       ShuffleVec[i] = i * SizeRatio;
23370     for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
23371       ShuffleVec[i] = NumElems*SizeRatio;
23372     NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
23373                                    DAG.getConstant(0, dl, WideVecVT),
23374                                    &ShuffleVec[0]);
23375   }
23376   else {
23377     assert(Mask.getValueType().getVectorElementType() == MVT::i1);
23378     unsigned WidenNumElts = NumElems*SizeRatio;
23379     unsigned MaskNumElts = VT.getVectorNumElements();
23380     EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(),  MVT::i1,
23381                                      WidenNumElts);
23382
23383     unsigned NumConcat = WidenNumElts / MaskNumElts;
23384     SmallVector<SDValue, 16> Ops(NumConcat);
23385     SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
23386     Ops[0] = Mask;
23387     for (unsigned i = 1; i != NumConcat; ++i)
23388       Ops[i] = ZeroVal;
23389
23390     NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
23391   }
23392
23393   SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(),
23394                                      Mld->getBasePtr(), NewMask, WideSrc0,
23395                                      Mld->getMemoryVT(), Mld->getMemOperand(),
23396                                      ISD::NON_EXTLOAD);
23397   SDValue NewVec = DAG.getNode(X86ISD::VSEXT, dl, VT, WideLd);
23398   return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
23399
23400 }
23401 /// PerformMSTORECombine - Resolve truncating stores
23402 static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG,
23403                                     const X86Subtarget *Subtarget) {
23404   MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
23405   if (!Mst->isTruncatingStore())
23406     return SDValue();
23407
23408   EVT VT = Mst->getValue().getValueType();
23409   unsigned NumElems = VT.getVectorNumElements();
23410   EVT StVT = Mst->getMemoryVT();
23411   SDLoc dl(Mst);
23412
23413   assert(StVT != VT && "Cannot truncate to the same type");
23414   unsigned FromSz = VT.getVectorElementType().getSizeInBits();
23415   unsigned ToSz = StVT.getVectorElementType().getSizeInBits();
23416
23417   // From, To sizes and ElemCount must be pow of two
23418   assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
23419     "Unexpected size for truncating masked store");
23420   // We are going to use the original vector elt for storing.
23421   // Accumulated smaller vector elements must be a multiple of the store size.
23422   assert (((NumElems * FromSz) % ToSz) == 0 &&
23423           "Unexpected ratio for truncating masked store");
23424
23425   unsigned SizeRatio  = FromSz / ToSz;
23426   assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
23427
23428   // Create a type on which we perform the shuffle
23429   EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
23430           StVT.getScalarType(), NumElems*SizeRatio);
23431
23432   assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
23433
23434   SDValue WideVec = DAG.getBitcast(WideVecVT, Mst->getValue());
23435   SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
23436   for (unsigned i = 0; i != NumElems; ++i)
23437     ShuffleVec[i] = i * SizeRatio;
23438
23439   // Can't shuffle using an illegal type.
23440   assert (DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT)
23441           && "WideVecVT should be legal");
23442
23443   SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
23444                                         DAG.getUNDEF(WideVecVT),
23445                                         &ShuffleVec[0]);
23446
23447   SDValue NewMask;
23448   SDValue Mask = Mst->getMask();
23449   if (Mask.getValueType() == VT) {
23450     // Mask and original value have the same type
23451     NewMask = DAG.getBitcast(WideVecVT, Mask);
23452     for (unsigned i = 0; i != NumElems; ++i)
23453       ShuffleVec[i] = i * SizeRatio;
23454     for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
23455       ShuffleVec[i] = NumElems*SizeRatio;
23456     NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
23457                                    DAG.getConstant(0, dl, WideVecVT),
23458                                    &ShuffleVec[0]);
23459   }
23460   else {
23461     assert(Mask.getValueType().getVectorElementType() == MVT::i1);
23462     unsigned WidenNumElts = NumElems*SizeRatio;
23463     unsigned MaskNumElts = VT.getVectorNumElements();
23464     EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(),  MVT::i1,
23465                                      WidenNumElts);
23466
23467     unsigned NumConcat = WidenNumElts / MaskNumElts;
23468     SmallVector<SDValue, 16> Ops(NumConcat);
23469     SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
23470     Ops[0] = Mask;
23471     for (unsigned i = 1; i != NumConcat; ++i)
23472       Ops[i] = ZeroVal;
23473
23474     NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
23475   }
23476
23477   return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal, Mst->getBasePtr(),
23478                             NewMask, StVT, Mst->getMemOperand(), false);
23479 }
23480 /// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
23481 static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
23482                                    const X86Subtarget *Subtarget) {
23483   StoreSDNode *St = cast<StoreSDNode>(N);
23484   EVT VT = St->getValue().getValueType();
23485   EVT StVT = St->getMemoryVT();
23486   SDLoc dl(St);
23487   SDValue StoredVal = St->getOperand(1);
23488   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23489
23490   // If we are saving a concatenation of two XMM registers and 32-byte stores
23491   // are slow, such as on Sandy Bridge, perform two 16-byte stores.
23492   unsigned Alignment = St->getAlignment();
23493   bool IsAligned = Alignment == 0 || Alignment >= VT.getSizeInBits()/8;
23494   if (VT.is256BitVector() && Subtarget->isUnalignedMem32Slow() &&
23495       StVT == VT && !IsAligned) {
23496     unsigned NumElems = VT.getVectorNumElements();
23497     if (NumElems < 2)
23498       return SDValue();
23499
23500     SDValue Value0 = Extract128BitVector(StoredVal, 0, DAG, dl);
23501     SDValue Value1 = Extract128BitVector(StoredVal, NumElems/2, DAG, dl);
23502
23503     SDValue Stride = DAG.getConstant(16, dl, TLI.getPointerTy());
23504     SDValue Ptr0 = St->getBasePtr();
23505     SDValue Ptr1 = DAG.getNode(ISD::ADD, dl, Ptr0.getValueType(), Ptr0, Stride);
23506
23507     SDValue Ch0 = DAG.getStore(St->getChain(), dl, Value0, Ptr0,
23508                                 St->getPointerInfo(), St->isVolatile(),
23509                                 St->isNonTemporal(), Alignment);
23510     SDValue Ch1 = DAG.getStore(St->getChain(), dl, Value1, Ptr1,
23511                                 St->getPointerInfo(), St->isVolatile(),
23512                                 St->isNonTemporal(),
23513                                 std::min(16U, Alignment));
23514     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
23515   }
23516
23517   // Optimize trunc store (of multiple scalars) to shuffle and store.
23518   // First, pack all of the elements in one place. Next, store to memory
23519   // in fewer chunks.
23520   if (St->isTruncatingStore() && VT.isVector()) {
23521     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23522     unsigned NumElems = VT.getVectorNumElements();
23523     assert(StVT != VT && "Cannot truncate to the same type");
23524     unsigned FromSz = VT.getVectorElementType().getSizeInBits();
23525     unsigned ToSz = StVT.getVectorElementType().getSizeInBits();
23526
23527     // From, To sizes and ElemCount must be pow of two
23528     if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
23529     // We are going to use the original vector elt for storing.
23530     // Accumulated smaller vector elements must be a multiple of the store size.
23531     if (0 != (NumElems * FromSz) % ToSz) return SDValue();
23532
23533     unsigned SizeRatio  = FromSz / ToSz;
23534
23535     assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
23536
23537     // Create a type on which we perform the shuffle
23538     EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
23539             StVT.getScalarType(), NumElems*SizeRatio);
23540
23541     assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
23542
23543     SDValue WideVec = DAG.getBitcast(WideVecVT, St->getValue());
23544     SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
23545     for (unsigned i = 0; i != NumElems; ++i)
23546       ShuffleVec[i] = i * SizeRatio;
23547
23548     // Can't shuffle using an illegal type.
23549     if (!TLI.isTypeLegal(WideVecVT))
23550       return SDValue();
23551
23552     SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
23553                                          DAG.getUNDEF(WideVecVT),
23554                                          &ShuffleVec[0]);
23555     // At this point all of the data is stored at the bottom of the
23556     // register. We now need to save it to mem.
23557
23558     // Find the largest store unit
23559     MVT StoreType = MVT::i8;
23560     for (MVT Tp : MVT::integer_valuetypes()) {
23561       if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
23562         StoreType = Tp;
23563     }
23564
23565     // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
23566     if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
23567         (64 <= NumElems * ToSz))
23568       StoreType = MVT::f64;
23569
23570     // Bitcast the original vector into a vector of store-size units
23571     EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
23572             StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
23573     assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
23574     SDValue ShuffWide = DAG.getBitcast(StoreVecVT, Shuff);
23575     SmallVector<SDValue, 8> Chains;
23576     SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8, dl,
23577                                         TLI.getPointerTy());
23578     SDValue Ptr = St->getBasePtr();
23579
23580     // Perform one or more big stores into memory.
23581     for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) {
23582       SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
23583                                    StoreType, ShuffWide,
23584                                    DAG.getIntPtrConstant(i, dl));
23585       SDValue Ch = DAG.getStore(St->getChain(), dl, SubVec, Ptr,
23586                                 St->getPointerInfo(), St->isVolatile(),
23587                                 St->isNonTemporal(), St->getAlignment());
23588       Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
23589       Chains.push_back(Ch);
23590     }
23591
23592     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
23593   }
23594
23595   // Turn load->store of MMX types into GPR load/stores.  This avoids clobbering
23596   // the FP state in cases where an emms may be missing.
23597   // A preferable solution to the general problem is to figure out the right
23598   // places to insert EMMS.  This qualifies as a quick hack.
23599
23600   // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
23601   if (VT.getSizeInBits() != 64)
23602     return SDValue();
23603
23604   const Function *F = DAG.getMachineFunction().getFunction();
23605   bool NoImplicitFloatOps = F->hasFnAttribute(Attribute::NoImplicitFloat);
23606   bool F64IsLegal =
23607       !Subtarget->useSoftFloat() && !NoImplicitFloatOps && Subtarget->hasSSE2();
23608   if ((VT.isVector() ||
23609        (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) &&
23610       isa<LoadSDNode>(St->getValue()) &&
23611       !cast<LoadSDNode>(St->getValue())->isVolatile() &&
23612       St->getChain().hasOneUse() && !St->isVolatile()) {
23613     SDNode* LdVal = St->getValue().getNode();
23614     LoadSDNode *Ld = nullptr;
23615     int TokenFactorIndex = -1;
23616     SmallVector<SDValue, 8> Ops;
23617     SDNode* ChainVal = St->getChain().getNode();
23618     // Must be a store of a load.  We currently handle two cases:  the load
23619     // is a direct child, and it's under an intervening TokenFactor.  It is
23620     // possible to dig deeper under nested TokenFactors.
23621     if (ChainVal == LdVal)
23622       Ld = cast<LoadSDNode>(St->getChain());
23623     else if (St->getValue().hasOneUse() &&
23624              ChainVal->getOpcode() == ISD::TokenFactor) {
23625       for (unsigned i = 0, e = ChainVal->getNumOperands(); i != e; ++i) {
23626         if (ChainVal->getOperand(i).getNode() == LdVal) {
23627           TokenFactorIndex = i;
23628           Ld = cast<LoadSDNode>(St->getValue());
23629         } else
23630           Ops.push_back(ChainVal->getOperand(i));
23631       }
23632     }
23633
23634     if (!Ld || !ISD::isNormalLoad(Ld))
23635       return SDValue();
23636
23637     // If this is not the MMX case, i.e. we are just turning i64 load/store
23638     // into f64 load/store, avoid the transformation if there are multiple
23639     // uses of the loaded value.
23640     if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
23641       return SDValue();
23642
23643     SDLoc LdDL(Ld);
23644     SDLoc StDL(N);
23645     // If we are a 64-bit capable x86, lower to a single movq load/store pair.
23646     // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
23647     // pair instead.
23648     if (Subtarget->is64Bit() || F64IsLegal) {
23649       EVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64;
23650       SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
23651                                   Ld->getPointerInfo(), Ld->isVolatile(),
23652                                   Ld->isNonTemporal(), Ld->isInvariant(),
23653                                   Ld->getAlignment());
23654       SDValue NewChain = NewLd.getValue(1);
23655       if (TokenFactorIndex != -1) {
23656         Ops.push_back(NewChain);
23657         NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
23658       }
23659       return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
23660                           St->getPointerInfo(),
23661                           St->isVolatile(), St->isNonTemporal(),
23662                           St->getAlignment());
23663     }
23664
23665     // Otherwise, lower to two pairs of 32-bit loads / stores.
23666     SDValue LoAddr = Ld->getBasePtr();
23667     SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr,
23668                                  DAG.getConstant(4, LdDL, MVT::i32));
23669
23670     SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
23671                                Ld->getPointerInfo(),
23672                                Ld->isVolatile(), Ld->isNonTemporal(),
23673                                Ld->isInvariant(), Ld->getAlignment());
23674     SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
23675                                Ld->getPointerInfo().getWithOffset(4),
23676                                Ld->isVolatile(), Ld->isNonTemporal(),
23677                                Ld->isInvariant(),
23678                                MinAlign(Ld->getAlignment(), 4));
23679
23680     SDValue NewChain = LoLd.getValue(1);
23681     if (TokenFactorIndex != -1) {
23682       Ops.push_back(LoLd);
23683       Ops.push_back(HiLd);
23684       NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
23685     }
23686
23687     LoAddr = St->getBasePtr();
23688     HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr,
23689                          DAG.getConstant(4, StDL, MVT::i32));
23690
23691     SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr,
23692                                 St->getPointerInfo(),
23693                                 St->isVolatile(), St->isNonTemporal(),
23694                                 St->getAlignment());
23695     SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr,
23696                                 St->getPointerInfo().getWithOffset(4),
23697                                 St->isVolatile(),
23698                                 St->isNonTemporal(),
23699                                 MinAlign(St->getAlignment(), 4));
23700     return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
23701   }
23702
23703   // This is similar to the above case, but here we handle a scalar 64-bit
23704   // integer store that is extracted from a vector on a 32-bit target.
23705   // If we have SSE2, then we can treat it like a floating-point double
23706   // to get past legalization. The execution dependencies fixup pass will
23707   // choose the optimal machine instruction for the store if this really is
23708   // an integer or v2f32 rather than an f64.
23709   if (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit() &&
23710       St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
23711     SDValue OldExtract = St->getOperand(1);
23712     SDValue ExtOp0 = OldExtract.getOperand(0);
23713     unsigned VecSize = ExtOp0.getValueSizeInBits();
23714     EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
23715     SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
23716     SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
23717                                      BitCast, OldExtract.getOperand(1));
23718     return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
23719                         St->getPointerInfo(), St->isVolatile(),
23720                         St->isNonTemporal(), St->getAlignment());
23721   }
23722
23723   return SDValue();
23724 }
23725
23726 /// Return 'true' if this vector operation is "horizontal"
23727 /// and return the operands for the horizontal operation in LHS and RHS.  A
23728 /// horizontal operation performs the binary operation on successive elements
23729 /// of its first operand, then on successive elements of its second operand,
23730 /// returning the resulting values in a vector.  For example, if
23731 ///   A = < float a0, float a1, float a2, float a3 >
23732 /// and
23733 ///   B = < float b0, float b1, float b2, float b3 >
23734 /// then the result of doing a horizontal operation on A and B is
23735 ///   A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
23736 /// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
23737 /// A horizontal-op B, for some already available A and B, and if so then LHS is
23738 /// set to A, RHS to B, and the routine returns 'true'.
23739 /// Note that the binary operation should have the property that if one of the
23740 /// operands is UNDEF then the result is UNDEF.
23741 static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
23742   // Look for the following pattern: if
23743   //   A = < float a0, float a1, float a2, float a3 >
23744   //   B = < float b0, float b1, float b2, float b3 >
23745   // and
23746   //   LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
23747   //   RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
23748   // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
23749   // which is A horizontal-op B.
23750
23751   // At least one of the operands should be a vector shuffle.
23752   if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
23753       RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
23754     return false;
23755
23756   MVT VT = LHS.getSimpleValueType();
23757
23758   assert((VT.is128BitVector() || VT.is256BitVector()) &&
23759          "Unsupported vector type for horizontal add/sub");
23760
23761   // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to
23762   // operate independently on 128-bit lanes.
23763   unsigned NumElts = VT.getVectorNumElements();
23764   unsigned NumLanes = VT.getSizeInBits()/128;
23765   unsigned NumLaneElts = NumElts / NumLanes;
23766   assert((NumLaneElts % 2 == 0) &&
23767          "Vector type should have an even number of elements in each lane");
23768   unsigned HalfLaneElts = NumLaneElts/2;
23769
23770   // View LHS in the form
23771   //   LHS = VECTOR_SHUFFLE A, B, LMask
23772   // If LHS is not a shuffle then pretend it is the shuffle
23773   //   LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
23774   // NOTE: in what follows a default initialized SDValue represents an UNDEF of
23775   // type VT.
23776   SDValue A, B;
23777   SmallVector<int, 16> LMask(NumElts);
23778   if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
23779     if (LHS.getOperand(0).getOpcode() != ISD::UNDEF)
23780       A = LHS.getOperand(0);
23781     if (LHS.getOperand(1).getOpcode() != ISD::UNDEF)
23782       B = LHS.getOperand(1);
23783     ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
23784     std::copy(Mask.begin(), Mask.end(), LMask.begin());
23785   } else {
23786     if (LHS.getOpcode() != ISD::UNDEF)
23787       A = LHS;
23788     for (unsigned i = 0; i != NumElts; ++i)
23789       LMask[i] = i;
23790   }
23791
23792   // Likewise, view RHS in the form
23793   //   RHS = VECTOR_SHUFFLE C, D, RMask
23794   SDValue C, D;
23795   SmallVector<int, 16> RMask(NumElts);
23796   if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
23797     if (RHS.getOperand(0).getOpcode() != ISD::UNDEF)
23798       C = RHS.getOperand(0);
23799     if (RHS.getOperand(1).getOpcode() != ISD::UNDEF)
23800       D = RHS.getOperand(1);
23801     ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
23802     std::copy(Mask.begin(), Mask.end(), RMask.begin());
23803   } else {
23804     if (RHS.getOpcode() != ISD::UNDEF)
23805       C = RHS;
23806     for (unsigned i = 0; i != NumElts; ++i)
23807       RMask[i] = i;
23808   }
23809
23810   // Check that the shuffles are both shuffling the same vectors.
23811   if (!(A == C && B == D) && !(A == D && B == C))
23812     return false;
23813
23814   // If everything is UNDEF then bail out: it would be better to fold to UNDEF.
23815   if (!A.getNode() && !B.getNode())
23816     return false;
23817
23818   // If A and B occur in reverse order in RHS, then "swap" them (which means
23819   // rewriting the mask).
23820   if (A != C)
23821     ShuffleVectorSDNode::commuteMask(RMask);
23822
23823   // At this point LHS and RHS are equivalent to
23824   //   LHS = VECTOR_SHUFFLE A, B, LMask
23825   //   RHS = VECTOR_SHUFFLE A, B, RMask
23826   // Check that the masks correspond to performing a horizontal operation.
23827   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
23828     for (unsigned i = 0; i != NumLaneElts; ++i) {
23829       int LIdx = LMask[i+l], RIdx = RMask[i+l];
23830
23831       // Ignore any UNDEF components.
23832       if (LIdx < 0 || RIdx < 0 ||
23833           (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
23834           (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
23835         continue;
23836
23837       // Check that successive elements are being operated on.  If not, this is
23838       // not a horizontal operation.
23839       unsigned Src = (i/HalfLaneElts); // each lane is split between srcs
23840       int Index = 2*(i%HalfLaneElts) + NumElts*Src + l;
23841       if (!(LIdx == Index && RIdx == Index + 1) &&
23842           !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
23843         return false;
23844     }
23845   }
23846
23847   LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
23848   RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
23849   return true;
23850 }
23851
23852 /// Do target-specific dag combines on floating point adds.
23853 static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG,
23854                                   const X86Subtarget *Subtarget) {
23855   EVT VT = N->getValueType(0);
23856   SDValue LHS = N->getOperand(0);
23857   SDValue RHS = N->getOperand(1);
23858
23859   // Try to synthesize horizontal adds from adds of shuffles.
23860   if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
23861        (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
23862       isHorizontalBinOp(LHS, RHS, true))
23863     return DAG.getNode(X86ISD::FHADD, SDLoc(N), VT, LHS, RHS);
23864   return SDValue();
23865 }
23866
23867 /// Do target-specific dag combines on floating point subs.
23868 static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG,
23869                                   const X86Subtarget *Subtarget) {
23870   EVT VT = N->getValueType(0);
23871   SDValue LHS = N->getOperand(0);
23872   SDValue RHS = N->getOperand(1);
23873
23874   // Try to synthesize horizontal subs from subs of shuffles.
23875   if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
23876        (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
23877       isHorizontalBinOp(LHS, RHS, false))
23878     return DAG.getNode(X86ISD::FHSUB, SDLoc(N), VT, LHS, RHS);
23879   return SDValue();
23880 }
23881
23882 /// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
23883 static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) {
23884   assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
23885
23886   // F[X]OR(0.0, x) -> x
23887   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
23888     if (C->getValueAPF().isPosZero())
23889       return N->getOperand(1);
23890
23891   // F[X]OR(x, 0.0) -> x
23892   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
23893     if (C->getValueAPF().isPosZero())
23894       return N->getOperand(0);
23895   return SDValue();
23896 }
23897
23898 /// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
23899 static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) {
23900   assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
23901
23902   // Only perform optimizations if UnsafeMath is used.
23903   if (!DAG.getTarget().Options.UnsafeFPMath)
23904     return SDValue();
23905
23906   // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
23907   // into FMINC and FMAXC, which are Commutative operations.
23908   unsigned NewOp = 0;
23909   switch (N->getOpcode()) {
23910     default: llvm_unreachable("unknown opcode");
23911     case X86ISD::FMIN:  NewOp = X86ISD::FMINC; break;
23912     case X86ISD::FMAX:  NewOp = X86ISD::FMAXC; break;
23913   }
23914
23915   return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
23916                      N->getOperand(0), N->getOperand(1));
23917 }
23918
23919 /// Do target-specific dag combines on X86ISD::FAND nodes.
23920 static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
23921   // FAND(0.0, x) -> 0.0
23922   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
23923     if (C->getValueAPF().isPosZero())
23924       return N->getOperand(0);
23925
23926   // FAND(x, 0.0) -> 0.0
23927   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
23928     if (C->getValueAPF().isPosZero())
23929       return N->getOperand(1);
23930
23931   return SDValue();
23932 }
23933
23934 /// Do target-specific dag combines on X86ISD::FANDN nodes
23935 static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG) {
23936   // FANDN(0.0, x) -> x
23937   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
23938     if (C->getValueAPF().isPosZero())
23939       return N->getOperand(1);
23940
23941   // FANDN(x, 0.0) -> 0.0
23942   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
23943     if (C->getValueAPF().isPosZero())
23944       return N->getOperand(1);
23945
23946   return SDValue();
23947 }
23948
23949 static SDValue PerformBTCombine(SDNode *N,
23950                                 SelectionDAG &DAG,
23951                                 TargetLowering::DAGCombinerInfo &DCI) {
23952   // BT ignores high bits in the bit index operand.
23953   SDValue Op1 = N->getOperand(1);
23954   if (Op1.hasOneUse()) {
23955     unsigned BitWidth = Op1.getValueSizeInBits();
23956     APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
23957     APInt KnownZero, KnownOne;
23958     TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
23959                                           !DCI.isBeforeLegalizeOps());
23960     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23961     if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) ||
23962         TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO))
23963       DCI.CommitTargetLoweringOpt(TLO);
23964   }
23965   return SDValue();
23966 }
23967
23968 static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) {
23969   SDValue Op = N->getOperand(0);
23970   if (Op.getOpcode() == ISD::BITCAST)
23971     Op = Op.getOperand(0);
23972   EVT VT = N->getValueType(0), OpVT = Op.getValueType();
23973   if (Op.getOpcode() == X86ISD::VZEXT_LOAD &&
23974       VT.getVectorElementType().getSizeInBits() ==
23975       OpVT.getVectorElementType().getSizeInBits()) {
23976     return DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
23977   }
23978   return SDValue();
23979 }
23980
23981 static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG,
23982                                                const X86Subtarget *Subtarget) {
23983   EVT VT = N->getValueType(0);
23984   if (!VT.isVector())
23985     return SDValue();
23986
23987   SDValue N0 = N->getOperand(0);
23988   SDValue N1 = N->getOperand(1);
23989   EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
23990   SDLoc dl(N);
23991
23992   // The SIGN_EXTEND_INREG to v4i64 is expensive operation on the
23993   // both SSE and AVX2 since there is no sign-extended shift right
23994   // operation on a vector with 64-bit elements.
23995   //(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) ->
23996   // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT)))
23997   if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
23998       N0.getOpcode() == ISD::SIGN_EXTEND)) {
23999     SDValue N00 = N0.getOperand(0);
24000
24001     // EXTLOAD has a better solution on AVX2,
24002     // it may be replaced with X86ISD::VSEXT node.
24003     if (N00.getOpcode() == ISD::LOAD && Subtarget->hasInt256())
24004       if (!ISD::isNormalLoad(N00.getNode()))
24005         return SDValue();
24006
24007     if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
24008         SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
24009                                   N00, N1);
24010       return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
24011     }
24012   }
24013   return SDValue();
24014 }
24015
24016 static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
24017                                   TargetLowering::DAGCombinerInfo &DCI,
24018                                   const X86Subtarget *Subtarget) {
24019   SDValue N0 = N->getOperand(0);
24020   EVT VT = N->getValueType(0);
24021   EVT SVT = VT.getScalarType();
24022   EVT InVT = N0->getValueType(0);
24023   EVT InSVT = InVT.getScalarType();
24024   SDLoc DL(N);
24025
24026   // (i8,i32 sext (sdivrem (i8 x, i8 y)) ->
24027   // (i8,i32 (sdivrem_sext_hreg (i8 x, i8 y)
24028   // This exposes the sext to the sdivrem lowering, so that it directly extends
24029   // from AH (which we otherwise need to do contortions to access).
24030   if (N0.getOpcode() == ISD::SDIVREM && N0.getResNo() == 1 &&
24031       InVT == MVT::i8 && VT == MVT::i32) {
24032     SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
24033     SDValue R = DAG.getNode(X86ISD::SDIVREM8_SEXT_HREG, DL, NodeTys,
24034                             N0.getOperand(0), N0.getOperand(1));
24035     DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
24036     return R.getValue(1);
24037   }
24038
24039   if (!DCI.isBeforeLegalizeOps()) {
24040     if (N0.getValueType() == MVT::i1) {
24041       SDValue Zero = DAG.getConstant(0, DL, VT);
24042       SDValue AllOnes =
24043         DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), DL, VT);
24044       return DAG.getNode(ISD::SELECT, DL, VT, N0, AllOnes, Zero);
24045     }
24046     return SDValue();
24047   }
24048
24049   if (VT.isVector()) {
24050     auto ExtendToVec128 = [&DAG](SDLoc DL, SDValue N) {
24051       EVT InVT = N->getValueType(0);
24052       EVT OutVT = EVT::getVectorVT(*DAG.getContext(), InVT.getScalarType(),
24053                                    128 / InVT.getScalarSizeInBits());
24054       SmallVector<SDValue, 8> Opnds(128 / InVT.getSizeInBits(),
24055                                     DAG.getUNDEF(InVT));
24056       Opnds[0] = N;
24057       return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Opnds);
24058     };
24059
24060     // If target-size is 128-bits, then convert to ISD::SIGN_EXTEND_VECTOR_INREG
24061     // which ensures lowering to X86ISD::VSEXT (pmovsx*).
24062     if (VT.getSizeInBits() == 128 &&
24063         (SVT == MVT::i64 || SVT == MVT::i32 || SVT == MVT::i16) &&
24064         (InSVT == MVT::i32 || InSVT == MVT::i16 || InSVT == MVT::i8)) {
24065       SDValue ExOp = ExtendToVec128(DL, N0);
24066       return DAG.getSignExtendVectorInReg(ExOp, DL, VT);
24067     }
24068
24069     // On pre-AVX2 targets, split into 128-bit nodes of
24070     // ISD::SIGN_EXTEND_VECTOR_INREG.
24071     if (!Subtarget->hasInt256() && !(VT.getSizeInBits() % 128) &&
24072         (SVT == MVT::i64 || SVT == MVT::i32 || SVT == MVT::i16) &&
24073         (InSVT == MVT::i32 || InSVT == MVT::i16 || InSVT == MVT::i8)) {
24074       unsigned NumVecs = VT.getSizeInBits() / 128;
24075       unsigned NumSubElts = 128 / SVT.getSizeInBits();
24076       EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts);
24077       EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts);
24078
24079       SmallVector<SDValue, 8> Opnds;
24080       for (unsigned i = 0, Offset = 0; i != NumVecs;
24081            ++i, Offset += NumSubElts) {
24082         SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0,
24083                                      DAG.getIntPtrConstant(Offset, DL));
24084         SrcVec = ExtendToVec128(DL, SrcVec);
24085         SrcVec = DAG.getSignExtendVectorInReg(SrcVec, DL, SubVT);
24086         Opnds.push_back(SrcVec);
24087       }
24088       return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds);
24089     }
24090   }
24091
24092   if (!Subtarget->hasFp256())
24093     return SDValue();
24094
24095   if (VT.isVector() && VT.getSizeInBits() == 256) {
24096     SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget);
24097     if (R.getNode())
24098       return R;
24099   }
24100
24101   return SDValue();
24102 }
24103
24104 static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG,
24105                                  const X86Subtarget* Subtarget) {
24106   SDLoc dl(N);
24107   EVT VT = N->getValueType(0);
24108
24109   // Let legalize expand this if it isn't a legal type yet.
24110   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
24111     return SDValue();
24112
24113   EVT ScalarVT = VT.getScalarType();
24114   if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
24115       (!Subtarget->hasFMA() && !Subtarget->hasFMA4()))
24116     return SDValue();
24117
24118   SDValue A = N->getOperand(0);
24119   SDValue B = N->getOperand(1);
24120   SDValue C = N->getOperand(2);
24121
24122   bool NegA = (A.getOpcode() == ISD::FNEG);
24123   bool NegB = (B.getOpcode() == ISD::FNEG);
24124   bool NegC = (C.getOpcode() == ISD::FNEG);
24125
24126   // Negative multiplication when NegA xor NegB
24127   bool NegMul = (NegA != NegB);
24128   if (NegA)
24129     A = A.getOperand(0);
24130   if (NegB)
24131     B = B.getOperand(0);
24132   if (NegC)
24133     C = C.getOperand(0);
24134
24135   unsigned Opcode;
24136   if (!NegMul)
24137     Opcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB;
24138   else
24139     Opcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;
24140
24141   return DAG.getNode(Opcode, dl, VT, A, B, C);
24142 }
24143
24144 static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG,
24145                                   TargetLowering::DAGCombinerInfo &DCI,
24146                                   const X86Subtarget *Subtarget) {
24147   // (i32 zext (and (i8  x86isd::setcc_carry), 1)) ->
24148   //           (and (i32 x86isd::setcc_carry), 1)
24149   // This eliminates the zext. This transformation is necessary because
24150   // ISD::SETCC is always legalized to i8.
24151   SDLoc dl(N);
24152   SDValue N0 = N->getOperand(0);
24153   EVT VT = N->getValueType(0);
24154
24155   if (N0.getOpcode() == ISD::AND &&
24156       N0.hasOneUse() &&
24157       N0.getOperand(0).hasOneUse()) {
24158     SDValue N00 = N0.getOperand(0);
24159     if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
24160       ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
24161       if (!C || C->getZExtValue() != 1)
24162         return SDValue();
24163       return DAG.getNode(ISD::AND, dl, VT,
24164                          DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
24165                                      N00.getOperand(0), N00.getOperand(1)),
24166                          DAG.getConstant(1, dl, VT));
24167     }
24168   }
24169
24170   if (N0.getOpcode() == ISD::TRUNCATE &&
24171       N0.hasOneUse() &&
24172       N0.getOperand(0).hasOneUse()) {
24173     SDValue N00 = N0.getOperand(0);
24174     if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
24175       return DAG.getNode(ISD::AND, dl, VT,
24176                          DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
24177                                      N00.getOperand(0), N00.getOperand(1)),
24178                          DAG.getConstant(1, dl, VT));
24179     }
24180   }
24181   if (VT.is256BitVector()) {
24182     SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget);
24183     if (R.getNode())
24184       return R;
24185   }
24186
24187   // (i8,i32 zext (udivrem (i8 x, i8 y)) ->
24188   // (i8,i32 (udivrem_zext_hreg (i8 x, i8 y)
24189   // This exposes the zext to the udivrem lowering, so that it directly extends
24190   // from AH (which we otherwise need to do contortions to access).
24191   if (N0.getOpcode() == ISD::UDIVREM &&
24192       N0.getResNo() == 1 && N0.getValueType() == MVT::i8 &&
24193       (VT == MVT::i32 || VT == MVT::i64)) {
24194     SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
24195     SDValue R = DAG.getNode(X86ISD::UDIVREM8_ZEXT_HREG, dl, NodeTys,
24196                             N0.getOperand(0), N0.getOperand(1));
24197     DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
24198     return R.getValue(1);
24199   }
24200
24201   return SDValue();
24202 }
24203
24204 // Optimize x == -y --> x+y == 0
24205 //          x != -y --> x+y != 0
24206 static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG,
24207                                       const X86Subtarget* Subtarget) {
24208   ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
24209   SDValue LHS = N->getOperand(0);
24210   SDValue RHS = N->getOperand(1);
24211   EVT VT = N->getValueType(0);
24212   SDLoc DL(N);
24213
24214   if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB)
24215     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(LHS.getOperand(0)))
24216       if (C->getAPIntValue() == 0 && LHS.hasOneUse()) {
24217         SDValue addV = DAG.getNode(ISD::ADD, DL, LHS.getValueType(), RHS,
24218                                    LHS.getOperand(1));
24219         return DAG.getSetCC(DL, N->getValueType(0), addV,
24220                             DAG.getConstant(0, DL, addV.getValueType()), CC);
24221       }
24222   if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB)
24223     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS.getOperand(0)))
24224       if (C->getAPIntValue() == 0 && RHS.hasOneUse()) {
24225         SDValue addV = DAG.getNode(ISD::ADD, DL, RHS.getValueType(), LHS,
24226                                    RHS.getOperand(1));
24227         return DAG.getSetCC(DL, N->getValueType(0), addV,
24228                             DAG.getConstant(0, DL, addV.getValueType()), CC);
24229       }
24230
24231   if (VT.getScalarType() == MVT::i1 &&
24232       (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
24233     bool IsSEXT0 =
24234         (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
24235         (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
24236     bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
24237
24238     if (!IsSEXT0 || !IsVZero1) {
24239       // Swap the operands and update the condition code.
24240       std::swap(LHS, RHS);
24241       CC = ISD::getSetCCSwappedOperands(CC);
24242
24243       IsSEXT0 = (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
24244                 (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
24245       IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
24246     }
24247
24248     if (IsSEXT0 && IsVZero1) {
24249       assert(VT == LHS.getOperand(0).getValueType() &&
24250              "Uexpected operand type");
24251       if (CC == ISD::SETGT)
24252         return DAG.getConstant(0, DL, VT);
24253       if (CC == ISD::SETLE)
24254         return DAG.getConstant(1, DL, VT);
24255       if (CC == ISD::SETEQ || CC == ISD::SETGE)
24256         return DAG.getNOT(DL, LHS.getOperand(0), VT);
24257
24258       assert((CC == ISD::SETNE || CC == ISD::SETLT) &&
24259              "Unexpected condition code!");
24260       return LHS.getOperand(0);
24261     }
24262   }
24263
24264   return SDValue();
24265 }
24266
24267 static SDValue NarrowVectorLoadToElement(LoadSDNode *Load, unsigned Index,
24268                                          SelectionDAG &DAG) {
24269   SDLoc dl(Load);
24270   MVT VT = Load->getSimpleValueType(0);
24271   MVT EVT = VT.getVectorElementType();
24272   SDValue Addr = Load->getOperand(1);
24273   SDValue NewAddr = DAG.getNode(
24274       ISD::ADD, dl, Addr.getSimpleValueType(), Addr,
24275       DAG.getConstant(Index * EVT.getStoreSize(), dl,
24276                       Addr.getSimpleValueType()));
24277
24278   SDValue NewLoad =
24279       DAG.getLoad(EVT, dl, Load->getChain(), NewAddr,
24280                   DAG.getMachineFunction().getMachineMemOperand(
24281                       Load->getMemOperand(), 0, EVT.getStoreSize()));
24282   return NewLoad;
24283 }
24284
24285 static SDValue PerformINSERTPSCombine(SDNode *N, SelectionDAG &DAG,
24286                                       const X86Subtarget *Subtarget) {
24287   SDLoc dl(N);
24288   MVT VT = N->getOperand(1)->getSimpleValueType(0);
24289   assert((VT == MVT::v4f32 || VT == MVT::v4i32) &&
24290          "X86insertps is only defined for v4x32");
24291
24292   SDValue Ld = N->getOperand(1);
24293   if (MayFoldLoad(Ld)) {
24294     // Extract the countS bits from the immediate so we can get the proper
24295     // address when narrowing the vector load to a specific element.
24296     // When the second source op is a memory address, insertps doesn't use
24297     // countS and just gets an f32 from that address.
24298     unsigned DestIndex =
24299         cast<ConstantSDNode>(N->getOperand(2))->getZExtValue() >> 6;
24300
24301     Ld = NarrowVectorLoadToElement(cast<LoadSDNode>(Ld), DestIndex, DAG);
24302
24303     // Create this as a scalar to vector to match the instruction pattern.
24304     SDValue LoadScalarToVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Ld);
24305     // countS bits are ignored when loading from memory on insertps, which
24306     // means we don't need to explicitly set them to 0.
24307     return DAG.getNode(X86ISD::INSERTPS, dl, VT, N->getOperand(0),
24308                        LoadScalarToVector, N->getOperand(2));
24309   }
24310   return SDValue();
24311 }
24312
24313 static SDValue PerformBLENDICombine(SDNode *N, SelectionDAG &DAG) {
24314   SDValue V0 = N->getOperand(0);
24315   SDValue V1 = N->getOperand(1);
24316   SDLoc DL(N);
24317   EVT VT = N->getValueType(0);
24318
24319   // Canonicalize a v2f64 blend with a mask of 2 by swapping the vector
24320   // operands and changing the mask to 1. This saves us a bunch of
24321   // pattern-matching possibilities related to scalar math ops in SSE/AVX.
24322   // x86InstrInfo knows how to commute this back after instruction selection
24323   // if it would help register allocation.
24324
24325   // TODO: If optimizing for size or a processor that doesn't suffer from
24326   // partial register update stalls, this should be transformed into a MOVSD
24327   // instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD.
24328
24329   if (VT == MVT::v2f64)
24330     if (auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(2)))
24331       if (Mask->getZExtValue() == 2 && !isShuffleFoldableLoad(V0)) {
24332         SDValue NewMask = DAG.getConstant(1, DL, MVT::i8);
24333         return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask);
24334       }
24335
24336   return SDValue();
24337 }
24338
24339 // Helper function of PerformSETCCCombine. It is to materialize "setb reg"
24340 // as "sbb reg,reg", since it can be extended without zext and produces
24341 // an all-ones bit which is more useful than 0/1 in some cases.
24342 static SDValue MaterializeSETB(SDLoc DL, SDValue EFLAGS, SelectionDAG &DAG,
24343                                MVT VT) {
24344   if (VT == MVT::i8)
24345     return DAG.getNode(ISD::AND, DL, VT,
24346                        DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
24347                                    DAG.getConstant(X86::COND_B, DL, MVT::i8),
24348                                    EFLAGS),
24349                        DAG.getConstant(1, DL, VT));
24350   assert (VT == MVT::i1 && "Unexpected type for SECCC node");
24351   return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1,
24352                      DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
24353                                  DAG.getConstant(X86::COND_B, DL, MVT::i8),
24354                                  EFLAGS));
24355 }
24356
24357 // Optimize  RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
24358 static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG,
24359                                    TargetLowering::DAGCombinerInfo &DCI,
24360                                    const X86Subtarget *Subtarget) {
24361   SDLoc DL(N);
24362   X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
24363   SDValue EFLAGS = N->getOperand(1);
24364
24365   if (CC == X86::COND_A) {
24366     // Try to convert COND_A into COND_B in an attempt to facilitate
24367     // materializing "setb reg".
24368     //
24369     // Do not flip "e > c", where "c" is a constant, because Cmp instruction
24370     // cannot take an immediate as its first operand.
24371     //
24372     if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
24373         EFLAGS.getValueType().isInteger() &&
24374         !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
24375       SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
24376                                    EFLAGS.getNode()->getVTList(),
24377                                    EFLAGS.getOperand(1), EFLAGS.getOperand(0));
24378       SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
24379       return MaterializeSETB(DL, NewEFLAGS, DAG, N->getSimpleValueType(0));
24380     }
24381   }
24382
24383   // Materialize "setb reg" as "sbb reg,reg", since it can be extended without
24384   // a zext and produces an all-ones bit which is more useful than 0/1 in some
24385   // cases.
24386   if (CC == X86::COND_B)
24387     return MaterializeSETB(DL, EFLAGS, DAG, N->getSimpleValueType(0));
24388
24389   SDValue Flags;
24390
24391   Flags = checkBoolTestSetCCCombine(EFLAGS, CC);
24392   if (Flags.getNode()) {
24393     SDValue Cond = DAG.getConstant(CC, DL, MVT::i8);
24394     return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags);
24395   }
24396
24397   return SDValue();
24398 }
24399
24400 // Optimize branch condition evaluation.
24401 //
24402 static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG,
24403                                     TargetLowering::DAGCombinerInfo &DCI,
24404                                     const X86Subtarget *Subtarget) {
24405   SDLoc DL(N);
24406   SDValue Chain = N->getOperand(0);
24407   SDValue Dest = N->getOperand(1);
24408   SDValue EFLAGS = N->getOperand(3);
24409   X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
24410
24411   SDValue Flags;
24412
24413   Flags = checkBoolTestSetCCCombine(EFLAGS, CC);
24414   if (Flags.getNode()) {
24415     SDValue Cond = DAG.getConstant(CC, DL, MVT::i8);
24416     return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), Chain, Dest, Cond,
24417                        Flags);
24418   }
24419
24420   return SDValue();
24421 }
24422
24423 static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
24424                                                          SelectionDAG &DAG) {
24425   // Take advantage of vector comparisons producing 0 or -1 in each lane to
24426   // optimize away operation when it's from a constant.
24427   //
24428   // The general transformation is:
24429   //    UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
24430   //       AND(VECTOR_CMP(x,y), constant2)
24431   //    constant2 = UNARYOP(constant)
24432
24433   // Early exit if this isn't a vector operation, the operand of the
24434   // unary operation isn't a bitwise AND, or if the sizes of the operations
24435   // aren't the same.
24436   EVT VT = N->getValueType(0);
24437   if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
24438       N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
24439       VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
24440     return SDValue();
24441
24442   // Now check that the other operand of the AND is a constant. We could
24443   // make the transformation for non-constant splats as well, but it's unclear
24444   // that would be a benefit as it would not eliminate any operations, just
24445   // perform one more step in scalar code before moving to the vector unit.
24446   if (BuildVectorSDNode *BV =
24447           dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
24448     // Bail out if the vector isn't a constant.
24449     if (!BV->isConstant())
24450       return SDValue();
24451
24452     // Everything checks out. Build up the new and improved node.
24453     SDLoc DL(N);
24454     EVT IntVT = BV->getValueType(0);
24455     // Create a new constant of the appropriate type for the transformed
24456     // DAG.
24457     SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
24458     // The AND node needs bitcasts to/from an integer vector type around it.
24459     SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
24460     SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
24461                                  N->getOperand(0)->getOperand(0), MaskConst);
24462     SDValue Res = DAG.getBitcast(VT, NewAnd);
24463     return Res;
24464   }
24465
24466   return SDValue();
24467 }
24468
24469 static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
24470                                         const X86Subtarget *Subtarget) {
24471   // First try to optimize away the conversion entirely when it's
24472   // conditionally from a constant. Vectors only.
24473   SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG);
24474   if (Res != SDValue())
24475     return Res;
24476
24477   // Now move on to more general possibilities.
24478   SDValue Op0 = N->getOperand(0);
24479   EVT InVT = Op0->getValueType(0);
24480
24481   // SINT_TO_FP(v4i8) -> SINT_TO_FP(SEXT(v4i8 to v4i32))
24482   if (InVT == MVT::v8i8 || InVT == MVT::v4i8) {
24483     SDLoc dl(N);
24484     MVT DstVT = InVT == MVT::v4i8 ? MVT::v4i32 : MVT::v8i32;
24485     SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
24486     return DAG.getNode(ISD::SINT_TO_FP, dl, N->getValueType(0), P);
24487   }
24488
24489   // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
24490   // a 32-bit target where SSE doesn't support i64->FP operations.
24491   if (Op0.getOpcode() == ISD::LOAD) {
24492     LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
24493     EVT VT = Ld->getValueType(0);
24494
24495     // This transformation is not supported if the result type is f16
24496     if (N->getValueType(0) == MVT::f16)
24497       return SDValue();
24498
24499     if (!Ld->isVolatile() && !N->getValueType(0).isVector() &&
24500         ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
24501         !Subtarget->is64Bit() && VT == MVT::i64) {
24502       SDValue FILDChain = Subtarget->getTargetLowering()->BuildFILD(
24503           SDValue(N, 0), Ld->getValueType(0), Ld->getChain(), Op0, DAG);
24504       DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
24505       return FILDChain;
24506     }
24507   }
24508   return SDValue();
24509 }
24510
24511 // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
24512 static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG,
24513                                  X86TargetLowering::DAGCombinerInfo &DCI) {
24514   // If the LHS and RHS of the ADC node are zero, then it can't overflow and
24515   // the result is either zero or one (depending on the input carry bit).
24516   // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
24517   if (X86::isZeroNode(N->getOperand(0)) &&
24518       X86::isZeroNode(N->getOperand(1)) &&
24519       // We don't have a good way to replace an EFLAGS use, so only do this when
24520       // dead right now.
24521       SDValue(N, 1).use_empty()) {
24522     SDLoc DL(N);
24523     EVT VT = N->getValueType(0);
24524     SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
24525     SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
24526                                DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
24527                                            DAG.getConstant(X86::COND_B, DL,
24528                                                            MVT::i8),
24529                                            N->getOperand(2)),
24530                                DAG.getConstant(1, DL, VT));
24531     return DCI.CombineTo(N, Res1, CarryOut);
24532   }
24533
24534   return SDValue();
24535 }
24536
24537 // fold (add Y, (sete  X, 0)) -> adc  0, Y
24538 //      (add Y, (setne X, 0)) -> sbb -1, Y
24539 //      (sub (sete  X, 0), Y) -> sbb  0, Y
24540 //      (sub (setne X, 0), Y) -> adc -1, Y
24541 static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) {
24542   SDLoc DL(N);
24543
24544   // Look through ZExts.
24545   SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 1 : 0);
24546   if (Ext.getOpcode() != ISD::ZERO_EXTEND || !Ext.hasOneUse())
24547     return SDValue();
24548
24549   SDValue SetCC = Ext.getOperand(0);
24550   if (SetCC.getOpcode() != X86ISD::SETCC || !SetCC.hasOneUse())
24551     return SDValue();
24552
24553   X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0);
24554   if (CC != X86::COND_E && CC != X86::COND_NE)
24555     return SDValue();
24556
24557   SDValue Cmp = SetCC.getOperand(1);
24558   if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
24559       !X86::isZeroNode(Cmp.getOperand(1)) ||
24560       !Cmp.getOperand(0).getValueType().isInteger())
24561     return SDValue();
24562
24563   SDValue CmpOp0 = Cmp.getOperand(0);
24564   SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0,
24565                                DAG.getConstant(1, DL, CmpOp0.getValueType()));
24566
24567   SDValue OtherVal = N->getOperand(N->getOpcode() == ISD::SUB ? 0 : 1);
24568   if (CC == X86::COND_NE)
24569     return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::ADC : X86ISD::SBB,
24570                        DL, OtherVal.getValueType(), OtherVal,
24571                        DAG.getConstant(-1ULL, DL, OtherVal.getValueType()),
24572                        NewCmp);
24573   return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::SBB : X86ISD::ADC,
24574                      DL, OtherVal.getValueType(), OtherVal,
24575                      DAG.getConstant(0, DL, OtherVal.getValueType()), NewCmp);
24576 }
24577
24578 /// PerformADDCombine - Do target-specific dag combines on integer adds.
24579 static SDValue PerformAddCombine(SDNode *N, SelectionDAG &DAG,
24580                                  const X86Subtarget *Subtarget) {
24581   EVT VT = N->getValueType(0);
24582   SDValue Op0 = N->getOperand(0);
24583   SDValue Op1 = N->getOperand(1);
24584
24585   // Try to synthesize horizontal adds from adds of shuffles.
24586   if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
24587        (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
24588       isHorizontalBinOp(Op0, Op1, true))
24589     return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);
24590
24591   return OptimizeConditionalInDecrement(N, DAG);
24592 }
24593
24594 static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG,
24595                                  const X86Subtarget *Subtarget) {
24596   SDValue Op0 = N->getOperand(0);
24597   SDValue Op1 = N->getOperand(1);
24598
24599   // X86 can't encode an immediate LHS of a sub. See if we can push the
24600   // negation into a preceding instruction.
24601   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
24602     // If the RHS of the sub is a XOR with one use and a constant, invert the
24603     // immediate. Then add one to the LHS of the sub so we can turn
24604     // X-Y -> X+~Y+1, saving one register.
24605     if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
24606         isa<ConstantSDNode>(Op1.getOperand(1))) {
24607       APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
24608       EVT VT = Op0.getValueType();
24609       SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
24610                                    Op1.getOperand(0),
24611                                    DAG.getConstant(~XorC, SDLoc(Op1), VT));
24612       return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
24613                          DAG.getConstant(C->getAPIntValue() + 1, SDLoc(N), VT));
24614     }
24615   }
24616
24617   // Try to synthesize horizontal adds from adds of shuffles.
24618   EVT VT = N->getValueType(0);
24619   if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
24620        (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
24621       isHorizontalBinOp(Op0, Op1, true))
24622     return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);
24623
24624   return OptimizeConditionalInDecrement(N, DAG);
24625 }
24626
24627 /// performVZEXTCombine - Performs build vector combines
24628 static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG,
24629                                    TargetLowering::DAGCombinerInfo &DCI,
24630                                    const X86Subtarget *Subtarget) {
24631   SDLoc DL(N);
24632   MVT VT = N->getSimpleValueType(0);
24633   SDValue Op = N->getOperand(0);
24634   MVT OpVT = Op.getSimpleValueType();
24635   MVT OpEltVT = OpVT.getVectorElementType();
24636   unsigned InputBits = OpEltVT.getSizeInBits() * VT.getVectorNumElements();
24637
24638   // (vzext (bitcast (vzext (x)) -> (vzext x)
24639   SDValue V = Op;
24640   while (V.getOpcode() == ISD::BITCAST)
24641     V = V.getOperand(0);
24642
24643   if (V != Op && V.getOpcode() == X86ISD::VZEXT) {
24644     MVT InnerVT = V.getSimpleValueType();
24645     MVT InnerEltVT = InnerVT.getVectorElementType();
24646
24647     // If the element sizes match exactly, we can just do one larger vzext. This
24648     // is always an exact type match as vzext operates on integer types.
24649     if (OpEltVT == InnerEltVT) {
24650       assert(OpVT == InnerVT && "Types must match for vzext!");
24651       return DAG.getNode(X86ISD::VZEXT, DL, VT, V.getOperand(0));
24652     }
24653
24654     // The only other way we can combine them is if only a single element of the
24655     // inner vzext is used in the input to the outer vzext.
24656     if (InnerEltVT.getSizeInBits() < InputBits)
24657       return SDValue();
24658
24659     // In this case, the inner vzext is completely dead because we're going to
24660     // only look at bits inside of the low element. Just do the outer vzext on
24661     // a bitcast of the input to the inner.
24662     return DAG.getNode(X86ISD::VZEXT, DL, VT, DAG.getBitcast(OpVT, V));
24663   }
24664
24665   // Check if we can bypass extracting and re-inserting an element of an input
24666   // vector. Essentialy:
24667   // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
24668   if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
24669       V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
24670       V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) {
24671     SDValue ExtractedV = V.getOperand(0);
24672     SDValue OrigV = ExtractedV.getOperand(0);
24673     if (auto *ExtractIdx = dyn_cast<ConstantSDNode>(ExtractedV.getOperand(1)))
24674       if (ExtractIdx->getZExtValue() == 0) {
24675         MVT OrigVT = OrigV.getSimpleValueType();
24676         // Extract a subvector if necessary...
24677         if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) {
24678           int Ratio = OrigVT.getSizeInBits() / OpVT.getSizeInBits();
24679           OrigVT = MVT::getVectorVT(OrigVT.getVectorElementType(),
24680                                     OrigVT.getVectorNumElements() / Ratio);
24681           OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV,
24682                               DAG.getIntPtrConstant(0, DL));
24683         }
24684         Op = DAG.getBitcast(OpVT, OrigV);
24685         return DAG.getNode(X86ISD::VZEXT, DL, VT, Op);
24686       }
24687   }
24688
24689   return SDValue();
24690 }
24691
24692 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
24693                                              DAGCombinerInfo &DCI) const {
24694   SelectionDAG &DAG = DCI.DAG;
24695   switch (N->getOpcode()) {
24696   default: break;
24697   case ISD::EXTRACT_VECTOR_ELT:
24698     return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, DCI);
24699   case ISD::VSELECT:
24700   case ISD::SELECT:
24701   case X86ISD::SHRUNKBLEND:
24702     return PerformSELECTCombine(N, DAG, DCI, Subtarget);
24703   case ISD::BITCAST:        return PerformBITCASTCombine(N, DAG);
24704   case X86ISD::CMOV:        return PerformCMOVCombine(N, DAG, DCI, Subtarget);
24705   case ISD::ADD:            return PerformAddCombine(N, DAG, Subtarget);
24706   case ISD::SUB:            return PerformSubCombine(N, DAG, Subtarget);
24707   case X86ISD::ADC:         return PerformADCCombine(N, DAG, DCI);
24708   case ISD::MUL:            return PerformMulCombine(N, DAG, DCI);
24709   case ISD::SHL:
24710   case ISD::SRA:
24711   case ISD::SRL:            return PerformShiftCombine(N, DAG, DCI, Subtarget);
24712   case ISD::AND:            return PerformAndCombine(N, DAG, DCI, Subtarget);
24713   case ISD::OR:             return PerformOrCombine(N, DAG, DCI, Subtarget);
24714   case ISD::XOR:            return PerformXorCombine(N, DAG, DCI, Subtarget);
24715   case ISD::LOAD:           return PerformLOADCombine(N, DAG, DCI, Subtarget);
24716   case ISD::MLOAD:          return PerformMLOADCombine(N, DAG, DCI, Subtarget);
24717   case ISD::STORE:          return PerformSTORECombine(N, DAG, Subtarget);
24718   case ISD::MSTORE:         return PerformMSTORECombine(N, DAG, Subtarget);
24719   case ISD::SINT_TO_FP:     return PerformSINT_TO_FPCombine(N, DAG, Subtarget);
24720   case ISD::FADD:           return PerformFADDCombine(N, DAG, Subtarget);
24721   case ISD::FSUB:           return PerformFSUBCombine(N, DAG, Subtarget);
24722   case X86ISD::FXOR:
24723   case X86ISD::FOR:         return PerformFORCombine(N, DAG);
24724   case X86ISD::FMIN:
24725   case X86ISD::FMAX:        return PerformFMinFMaxCombine(N, DAG);
24726   case X86ISD::FAND:        return PerformFANDCombine(N, DAG);
24727   case X86ISD::FANDN:       return PerformFANDNCombine(N, DAG);
24728   case X86ISD::BT:          return PerformBTCombine(N, DAG, DCI);
24729   case X86ISD::VZEXT_MOVL:  return PerformVZEXT_MOVLCombine(N, DAG);
24730   case ISD::ANY_EXTEND:
24731   case ISD::ZERO_EXTEND:    return PerformZExtCombine(N, DAG, DCI, Subtarget);
24732   case ISD::SIGN_EXTEND:    return PerformSExtCombine(N, DAG, DCI, Subtarget);
24733   case ISD::SIGN_EXTEND_INREG:
24734     return PerformSIGN_EXTEND_INREGCombine(N, DAG, Subtarget);
24735   case ISD::SETCC:          return PerformISDSETCCCombine(N, DAG, Subtarget);
24736   case X86ISD::SETCC:       return PerformSETCCCombine(N, DAG, DCI, Subtarget);
24737   case X86ISD::BRCOND:      return PerformBrCondCombine(N, DAG, DCI, Subtarget);
24738   case X86ISD::VZEXT:       return performVZEXTCombine(N, DAG, DCI, Subtarget);
24739   case X86ISD::SHUFP:       // Handle all target specific shuffles
24740   case X86ISD::PALIGNR:
24741   case X86ISD::UNPCKH:
24742   case X86ISD::UNPCKL:
24743   case X86ISD::MOVHLPS:
24744   case X86ISD::MOVLHPS:
24745   case X86ISD::PSHUFB:
24746   case X86ISD::PSHUFD:
24747   case X86ISD::PSHUFHW:
24748   case X86ISD::PSHUFLW:
24749   case X86ISD::MOVSS:
24750   case X86ISD::MOVSD:
24751   case X86ISD::VPERMILPI:
24752   case X86ISD::VPERM2X128:
24753   case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget);
24754   case ISD::FMA:            return PerformFMACombine(N, DAG, Subtarget);
24755   case ISD::INTRINSIC_WO_CHAIN:
24756     return PerformINTRINSIC_WO_CHAINCombine(N, DAG, Subtarget);
24757   case X86ISD::INSERTPS: {
24758     if (getTargetMachine().getOptLevel() > CodeGenOpt::None)
24759       return PerformINSERTPSCombine(N, DAG, Subtarget);
24760     break;
24761   }
24762   case X86ISD::BLENDI:    return PerformBLENDICombine(N, DAG);
24763   }
24764
24765   return SDValue();
24766 }
24767
24768 /// isTypeDesirableForOp - Return true if the target has native support for
24769 /// the specified value type and it is 'desirable' to use the type for the
24770 /// given node type. e.g. On x86 i16 is legal, but undesirable since i16
24771 /// instruction encodings are longer and some i16 instructions are slow.
24772 bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
24773   if (!isTypeLegal(VT))
24774     return false;
24775   if (VT != MVT::i16)
24776     return true;
24777
24778   switch (Opc) {
24779   default:
24780     return true;
24781   case ISD::LOAD:
24782   case ISD::SIGN_EXTEND:
24783   case ISD::ZERO_EXTEND:
24784   case ISD::ANY_EXTEND:
24785   case ISD::SHL:
24786   case ISD::SRL:
24787   case ISD::SUB:
24788   case ISD::ADD:
24789   case ISD::MUL:
24790   case ISD::AND:
24791   case ISD::OR:
24792   case ISD::XOR:
24793     return false;
24794   }
24795 }
24796
24797 /// IsDesirableToPromoteOp - This method query the target whether it is
24798 /// beneficial for dag combiner to promote the specified node. If true, it
24799 /// should return the desired promotion type by reference.
24800 bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
24801   EVT VT = Op.getValueType();
24802   if (VT != MVT::i16)
24803     return false;
24804
24805   bool Promote = false;
24806   bool Commute = false;
24807   switch (Op.getOpcode()) {
24808   default: break;
24809   case ISD::LOAD: {
24810     LoadSDNode *LD = cast<LoadSDNode>(Op);
24811     // If the non-extending load has a single use and it's not live out, then it
24812     // might be folded.
24813     if (LD->getExtensionType() == ISD::NON_EXTLOAD /*&&
24814                                                      Op.hasOneUse()*/) {
24815       for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
24816              UE = Op.getNode()->use_end(); UI != UE; ++UI) {
24817         // The only case where we'd want to promote LOAD (rather then it being
24818         // promoted as an operand is when it's only use is liveout.
24819         if (UI->getOpcode() != ISD::CopyToReg)
24820           return false;
24821       }
24822     }
24823     Promote = true;
24824     break;
24825   }
24826   case ISD::SIGN_EXTEND:
24827   case ISD::ZERO_EXTEND:
24828   case ISD::ANY_EXTEND:
24829     Promote = true;
24830     break;
24831   case ISD::SHL:
24832   case ISD::SRL: {
24833     SDValue N0 = Op.getOperand(0);
24834     // Look out for (store (shl (load), x)).
24835     if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
24836       return false;
24837     Promote = true;
24838     break;
24839   }
24840   case ISD::ADD:
24841   case ISD::MUL:
24842   case ISD::AND:
24843   case ISD::OR:
24844   case ISD::XOR:
24845     Commute = true;
24846     // fallthrough
24847   case ISD::SUB: {
24848     SDValue N0 = Op.getOperand(0);
24849     SDValue N1 = Op.getOperand(1);
24850     if (!Commute && MayFoldLoad(N1))
24851       return false;
24852     // Avoid disabling potential load folding opportunities.
24853     if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
24854       return false;
24855     if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
24856       return false;
24857     Promote = true;
24858   }
24859   }
24860
24861   PVT = MVT::i32;
24862   return Promote;
24863 }
24864
24865 //===----------------------------------------------------------------------===//
24866 //                           X86 Inline Assembly Support
24867 //===----------------------------------------------------------------------===//
24868
24869 // Helper to match a string separated by whitespace.
24870 static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
24871   S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
24872
24873   for (StringRef Piece : Pieces) {
24874     if (!S.startswith(Piece)) // Check if the piece matches.
24875       return false;
24876
24877     S = S.substr(Piece.size());
24878     StringRef::size_type Pos = S.find_first_not_of(" \t");
24879     if (Pos == 0) // We matched a prefix.
24880       return false;
24881
24882     S = S.substr(Pos);
24883   }
24884
24885   return S.empty();
24886 }
24887
24888 static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
24889
24890   if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
24891     if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
24892         std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
24893         std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {
24894
24895       if (AsmPieces.size() == 3)
24896         return true;
24897       else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
24898         return true;
24899     }
24900   }
24901   return false;
24902 }
24903
24904 bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
24905   InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
24906
24907   std::string AsmStr = IA->getAsmString();
24908
24909   IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
24910   if (!Ty || Ty->getBitWidth() % 16 != 0)
24911     return false;
24912
24913   // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
24914   SmallVector<StringRef, 4> AsmPieces;
24915   SplitString(AsmStr, AsmPieces, ";\n");
24916
24917   switch (AsmPieces.size()) {
24918   default: return false;
24919   case 1:
24920     // FIXME: this should verify that we are targeting a 486 or better.  If not,
24921     // we will turn this bswap into something that will be lowered to logical
24922     // ops instead of emitting the bswap asm.  For now, we don't support 486 or
24923     // lower so don't worry about this.
24924     // bswap $0
24925     if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
24926         matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
24927         matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
24928         matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
24929         matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
24930         matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
24931       // No need to check constraints, nothing other than the equivalent of
24932       // "=r,0" would be valid here.
24933       return IntrinsicLowering::LowerToByteSwap(CI);
24934     }
24935
24936     // rorw $$8, ${0:w}  -->  llvm.bswap.i16
24937     if (CI->getType()->isIntegerTy(16) &&
24938         IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
24939         (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
24940          matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
24941       AsmPieces.clear();
24942       const std::string &ConstraintsStr = IA->getConstraintString();
24943       SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
24944       array_pod_sort(AsmPieces.begin(), AsmPieces.end());
24945       if (clobbersFlagRegisters(AsmPieces))
24946         return IntrinsicLowering::LowerToByteSwap(CI);
24947     }
24948     break;
24949   case 3:
24950     if (CI->getType()->isIntegerTy(32) &&
24951         IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
24952         matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
24953         matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
24954         matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
24955       AsmPieces.clear();
24956       const std::string &ConstraintsStr = IA->getConstraintString();
24957       SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
24958       array_pod_sort(AsmPieces.begin(), AsmPieces.end());
24959       if (clobbersFlagRegisters(AsmPieces))
24960         return IntrinsicLowering::LowerToByteSwap(CI);
24961     }
24962
24963     if (CI->getType()->isIntegerTy(64)) {
24964       InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
24965       if (Constraints.size() >= 2 &&
24966           Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
24967           Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
24968         // bswap %eax / bswap %edx / xchgl %eax, %edx  -> llvm.bswap.i64
24969         if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
24970             matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
24971             matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
24972           return IntrinsicLowering::LowerToByteSwap(CI);
24973       }
24974     }
24975     break;
24976   }
24977   return false;
24978 }
24979
24980 /// getConstraintType - Given a constraint letter, return the type of
24981 /// constraint it is for this target.
24982 X86TargetLowering::ConstraintType
24983 X86TargetLowering::getConstraintType(const std::string &Constraint) const {
24984   if (Constraint.size() == 1) {
24985     switch (Constraint[0]) {
24986     case 'R':
24987     case 'q':
24988     case 'Q':
24989     case 'f':
24990     case 't':
24991     case 'u':
24992     case 'y':
24993     case 'x':
24994     case 'Y':
24995     case 'l':
24996       return C_RegisterClass;
24997     case 'a':
24998     case 'b':
24999     case 'c':
25000     case 'd':
25001     case 'S':
25002     case 'D':
25003     case 'A':
25004       return C_Register;
25005     case 'I':
25006     case 'J':
25007     case 'K':
25008     case 'L':
25009     case 'M':
25010     case 'N':
25011     case 'G':
25012     case 'C':
25013     case 'e':
25014     case 'Z':
25015       return C_Other;
25016     default:
25017       break;
25018     }
25019   }
25020   return TargetLowering::getConstraintType(Constraint);
25021 }
25022
25023 /// Examine constraint type and operand type and determine a weight value.
25024 /// This object must already have been set up with the operand type
25025 /// and the current alternative constraint selected.
25026 TargetLowering::ConstraintWeight
25027   X86TargetLowering::getSingleConstraintMatchWeight(
25028     AsmOperandInfo &info, const char *constraint) const {
25029   ConstraintWeight weight = CW_Invalid;
25030   Value *CallOperandVal = info.CallOperandVal;
25031     // If we don't have a value, we can't do a match,
25032     // but allow it at the lowest weight.
25033   if (!CallOperandVal)
25034     return CW_Default;
25035   Type *type = CallOperandVal->getType();
25036   // Look at the constraint type.
25037   switch (*constraint) {
25038   default:
25039     weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
25040   case 'R':
25041   case 'q':
25042   case 'Q':
25043   case 'a':
25044   case 'b':
25045   case 'c':
25046   case 'd':
25047   case 'S':
25048   case 'D':
25049   case 'A':
25050     if (CallOperandVal->getType()->isIntegerTy())
25051       weight = CW_SpecificReg;
25052     break;
25053   case 'f':
25054   case 't':
25055   case 'u':
25056     if (type->isFloatingPointTy())
25057       weight = CW_SpecificReg;
25058     break;
25059   case 'y':
25060     if (type->isX86_MMXTy() && Subtarget->hasMMX())
25061       weight = CW_SpecificReg;
25062     break;
25063   case 'x':
25064   case 'Y':
25065     if (((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasSSE1()) ||
25066         ((type->getPrimitiveSizeInBits() == 256) && Subtarget->hasFp256()))
25067       weight = CW_Register;
25068     break;
25069   case 'I':
25070     if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
25071       if (C->getZExtValue() <= 31)
25072         weight = CW_Constant;
25073     }
25074     break;
25075   case 'J':
25076     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
25077       if (C->getZExtValue() <= 63)
25078         weight = CW_Constant;
25079     }
25080     break;
25081   case 'K':
25082     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
25083       if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
25084         weight = CW_Constant;
25085     }
25086     break;
25087   case 'L':
25088     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
25089       if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
25090         weight = CW_Constant;
25091     }
25092     break;
25093   case 'M':
25094     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
25095       if (C->getZExtValue() <= 3)
25096         weight = CW_Constant;
25097     }
25098     break;
25099   case 'N':
25100     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
25101       if (C->getZExtValue() <= 0xff)
25102         weight = CW_Constant;
25103     }
25104     break;
25105   case 'G':
25106   case 'C':
25107     if (isa<ConstantFP>(CallOperandVal)) {
25108       weight = CW_Constant;
25109     }
25110     break;
25111   case 'e':
25112     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
25113       if ((C->getSExtValue() >= -0x80000000LL) &&
25114           (C->getSExtValue() <= 0x7fffffffLL))
25115         weight = CW_Constant;
25116     }
25117     break;
25118   case 'Z':
25119     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
25120       if (C->getZExtValue() <= 0xffffffff)
25121         weight = CW_Constant;
25122     }
25123     break;
25124   }
25125   return weight;
25126 }
25127
25128 /// LowerXConstraint - try to replace an X constraint, which matches anything,
25129 /// with another that has more specific requirements based on the type of the
25130 /// corresponding operand.
25131 const char *X86TargetLowering::
25132 LowerXConstraint(EVT ConstraintVT) const {
25133   // FP X constraints get lowered to SSE1/2 registers if available, otherwise
25134   // 'f' like normal targets.
25135   if (ConstraintVT.isFloatingPoint()) {
25136     if (Subtarget->hasSSE2())
25137       return "Y";
25138     if (Subtarget->hasSSE1())
25139       return "x";
25140   }
25141
25142   return TargetLowering::LowerXConstraint(ConstraintVT);
25143 }
25144
25145 /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
25146 /// vector.  If it is invalid, don't add anything to Ops.
25147 void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
25148                                                      std::string &Constraint,
25149                                                      std::vector<SDValue>&Ops,
25150                                                      SelectionDAG &DAG) const {
25151   SDValue Result;
25152
25153   // Only support length 1 constraints for now.
25154   if (Constraint.length() > 1) return;
25155
25156   char ConstraintLetter = Constraint[0];
25157   switch (ConstraintLetter) {
25158   default: break;
25159   case 'I':
25160     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
25161       if (C->getZExtValue() <= 31) {
25162         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
25163                                        Op.getValueType());
25164         break;
25165       }
25166     }
25167     return;
25168   case 'J':
25169     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
25170       if (C->getZExtValue() <= 63) {
25171         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
25172                                        Op.getValueType());
25173         break;
25174       }
25175     }
25176     return;
25177   case 'K':
25178     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
25179       if (isInt<8>(C->getSExtValue())) {
25180         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
25181                                        Op.getValueType());
25182         break;
25183       }
25184     }
25185     return;
25186   case 'L':
25187     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
25188       if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
25189           (Subtarget->is64Bit() && C->getZExtValue() == 0xffffffff)) {
25190         Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
25191                                        Op.getValueType());
25192         break;
25193       }
25194     }
25195     return;
25196   case 'M':
25197     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
25198       if (C->getZExtValue() <= 3) {
25199         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
25200                                        Op.getValueType());
25201         break;
25202       }
25203     }
25204     return;
25205   case 'N':
25206     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
25207       if (C->getZExtValue() <= 255) {
25208         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
25209                                        Op.getValueType());
25210         break;
25211       }
25212     }
25213     return;
25214   case 'O':
25215     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
25216       if (C->getZExtValue() <= 127) {
25217         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
25218                                        Op.getValueType());
25219         break;
25220       }
25221     }
25222     return;
25223   case 'e': {
25224     // 32-bit signed value
25225     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
25226       if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
25227                                            C->getSExtValue())) {
25228         // Widen to 64 bits here to get it sign extended.
25229         Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
25230         break;
25231       }
25232     // FIXME gcc accepts some relocatable values here too, but only in certain
25233     // memory models; it's complicated.
25234     }
25235     return;
25236   }
25237   case 'Z': {
25238     // 32-bit unsigned value
25239     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
25240       if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
25241                                            C->getZExtValue())) {
25242         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
25243                                        Op.getValueType());
25244         break;
25245       }
25246     }
25247     // FIXME gcc accepts some relocatable values here too, but only in certain
25248     // memory models; it's complicated.
25249     return;
25250   }
25251   case 'i': {
25252     // Literal immediates are always ok.
25253     if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
25254       // Widen to 64 bits here to get it sign extended.
25255       Result = DAG.getTargetConstant(CST->getSExtValue(), SDLoc(Op), MVT::i64);
25256       break;
25257     }
25258
25259     // In any sort of PIC mode addresses need to be computed at runtime by
25260     // adding in a register or some sort of table lookup.  These can't
25261     // be used as immediates.
25262     if (Subtarget->isPICStyleGOT() || Subtarget->isPICStyleStubPIC())
25263       return;
25264
25265     // If we are in non-pic codegen mode, we allow the address of a global (with
25266     // an optional displacement) to be used with 'i'.
25267     GlobalAddressSDNode *GA = nullptr;
25268     int64_t Offset = 0;
25269
25270     // Match either (GA), (GA+C), (GA+C1+C2), etc.
25271     while (1) {
25272       if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
25273         Offset += GA->getOffset();
25274         break;
25275       } else if (Op.getOpcode() == ISD::ADD) {
25276         if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
25277           Offset += C->getZExtValue();
25278           Op = Op.getOperand(0);
25279           continue;
25280         }
25281       } else if (Op.getOpcode() == ISD::SUB) {
25282         if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
25283           Offset += -C->getZExtValue();
25284           Op = Op.getOperand(0);
25285           continue;
25286         }
25287       }
25288
25289       // Otherwise, this isn't something we can handle, reject it.
25290       return;
25291     }
25292
25293     const GlobalValue *GV = GA->getGlobal();
25294     // If we require an extra load to get this address, as in PIC mode, we
25295     // can't accept it.
25296     if (isGlobalStubReference(
25297             Subtarget->ClassifyGlobalReference(GV, DAG.getTarget())))
25298       return;
25299
25300     Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
25301                                         GA->getValueType(0), Offset);
25302     break;
25303   }
25304   }
25305
25306   if (Result.getNode()) {
25307     Ops.push_back(Result);
25308     return;
25309   }
25310   return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
25311 }
25312
25313 std::pair<unsigned, const TargetRegisterClass *>
25314 X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
25315                                                 const std::string &Constraint,
25316                                                 MVT VT) const {
25317   // First, see if this is a constraint that directly corresponds to an LLVM
25318   // register class.
25319   if (Constraint.size() == 1) {
25320     // GCC Constraint Letters
25321     switch (Constraint[0]) {
25322     default: break;
25323       // TODO: Slight differences here in allocation order and leaving
25324       // RIP in the class. Do they matter any more here than they do
25325       // in the normal allocation?
25326     case 'q':   // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
25327       if (Subtarget->is64Bit()) {
25328         if (VT == MVT::i32 || VT == MVT::f32)
25329           return std::make_pair(0U, &X86::GR32RegClass);
25330         if (VT == MVT::i16)
25331           return std::make_pair(0U, &X86::GR16RegClass);
25332         if (VT == MVT::i8 || VT == MVT::i1)
25333           return std::make_pair(0U, &X86::GR8RegClass);
25334         if (VT == MVT::i64 || VT == MVT::f64)
25335           return std::make_pair(0U, &X86::GR64RegClass);
25336         break;
25337       }
25338       // 32-bit fallthrough
25339     case 'Q':   // Q_REGS
25340       if (VT == MVT::i32 || VT == MVT::f32)
25341         return std::make_pair(0U, &X86::GR32_ABCDRegClass);
25342       if (VT == MVT::i16)
25343         return std::make_pair(0U, &X86::GR16_ABCDRegClass);
25344       if (VT == MVT::i8 || VT == MVT::i1)
25345         return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
25346       if (VT == MVT::i64)
25347         return std::make_pair(0U, &X86::GR64_ABCDRegClass);
25348       break;
25349     case 'r':   // GENERAL_REGS
25350     case 'l':   // INDEX_REGS
25351       if (VT == MVT::i8 || VT == MVT::i1)
25352         return std::make_pair(0U, &X86::GR8RegClass);
25353       if (VT == MVT::i16)
25354         return std::make_pair(0U, &X86::GR16RegClass);
25355       if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget->is64Bit())
25356         return std::make_pair(0U, &X86::GR32RegClass);
25357       return std::make_pair(0U, &X86::GR64RegClass);
25358     case 'R':   // LEGACY_REGS
25359       if (VT == MVT::i8 || VT == MVT::i1)
25360         return std::make_pair(0U, &X86::GR8_NOREXRegClass);
25361       if (VT == MVT::i16)
25362         return std::make_pair(0U, &X86::GR16_NOREXRegClass);
25363       if (VT == MVT::i32 || !Subtarget->is64Bit())
25364         return std::make_pair(0U, &X86::GR32_NOREXRegClass);
25365       return std::make_pair(0U, &X86::GR64_NOREXRegClass);
25366     case 'f':  // FP Stack registers.
25367       // If SSE is enabled for this VT, use f80 to ensure the isel moves the
25368       // value to the correct fpstack register class.
25369       if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
25370         return std::make_pair(0U, &X86::RFP32RegClass);
25371       if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
25372         return std::make_pair(0U, &X86::RFP64RegClass);
25373       return std::make_pair(0U, &X86::RFP80RegClass);
25374     case 'y':   // MMX_REGS if MMX allowed.
25375       if (!Subtarget->hasMMX()) break;
25376       return std::make_pair(0U, &X86::VR64RegClass);
25377     case 'Y':   // SSE_REGS if SSE2 allowed
25378       if (!Subtarget->hasSSE2()) break;
25379       // FALL THROUGH.
25380     case 'x':   // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
25381       if (!Subtarget->hasSSE1()) break;
25382
25383       switch (VT.SimpleTy) {
25384       default: break;
25385       // Scalar SSE types.
25386       case MVT::f32:
25387       case MVT::i32:
25388         return std::make_pair(0U, &X86::FR32RegClass);
25389       case MVT::f64:
25390       case MVT::i64:
25391         return std::make_pair(0U, &X86::FR64RegClass);
25392       // Vector types.
25393       case MVT::v16i8:
25394       case MVT::v8i16:
25395       case MVT::v4i32:
25396       case MVT::v2i64:
25397       case MVT::v4f32:
25398       case MVT::v2f64:
25399         return std::make_pair(0U, &X86::VR128RegClass);
25400       // AVX types.
25401       case MVT::v32i8:
25402       case MVT::v16i16:
25403       case MVT::v8i32:
25404       case MVT::v4i64:
25405       case MVT::v8f32:
25406       case MVT::v4f64:
25407         return std::make_pair(0U, &X86::VR256RegClass);
25408       case MVT::v8f64:
25409       case MVT::v16f32:
25410       case MVT::v16i32:
25411       case MVT::v8i64:
25412         return std::make_pair(0U, &X86::VR512RegClass);
25413       }
25414       break;
25415     }
25416   }
25417
25418   // Use the default implementation in TargetLowering to convert the register
25419   // constraint into a member of a register class.
25420   std::pair<unsigned, const TargetRegisterClass*> Res;
25421   Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
25422
25423   // Not found as a standard register?
25424   if (!Res.second) {
25425     // Map st(0) -> st(7) -> ST0
25426     if (Constraint.size() == 7 && Constraint[0] == '{' &&
25427         tolower(Constraint[1]) == 's' &&
25428         tolower(Constraint[2]) == 't' &&
25429         Constraint[3] == '(' &&
25430         (Constraint[4] >= '0' && Constraint[4] <= '7') &&
25431         Constraint[5] == ')' &&
25432         Constraint[6] == '}') {
25433
25434       Res.first = X86::FP0+Constraint[4]-'0';
25435       Res.second = &X86::RFP80RegClass;
25436       return Res;
25437     }
25438
25439     // GCC allows "st(0)" to be called just plain "st".
25440     if (StringRef("{st}").equals_lower(Constraint)) {
25441       Res.first = X86::FP0;
25442       Res.second = &X86::RFP80RegClass;
25443       return Res;
25444     }
25445
25446     // flags -> EFLAGS
25447     if (StringRef("{flags}").equals_lower(Constraint)) {
25448       Res.first = X86::EFLAGS;
25449       Res.second = &X86::CCRRegClass;
25450       return Res;
25451     }
25452
25453     // 'A' means EAX + EDX.
25454     if (Constraint == "A") {
25455       Res.first = X86::EAX;
25456       Res.second = &X86::GR32_ADRegClass;
25457       return Res;
25458     }
25459     return Res;
25460   }
25461
25462   // Otherwise, check to see if this is a register class of the wrong value
25463   // type.  For example, we want to map "{ax},i32" -> {eax}, we don't want it to
25464   // turn into {ax},{dx}.
25465   if (Res.second->hasType(VT))
25466     return Res;   // Correct type already, nothing to do.
25467
25468   // All of the single-register GCC register classes map their values onto
25469   // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp".  If we
25470   // really want an 8-bit or 32-bit register, map to the appropriate register
25471   // class and return the appropriate register.
25472   if (Res.second == &X86::GR16RegClass) {
25473     if (VT == MVT::i8 || VT == MVT::i1) {
25474       unsigned DestReg = 0;
25475       switch (Res.first) {
25476       default: break;
25477       case X86::AX: DestReg = X86::AL; break;
25478       case X86::DX: DestReg = X86::DL; break;
25479       case X86::CX: DestReg = X86::CL; break;
25480       case X86::BX: DestReg = X86::BL; break;
25481       }
25482       if (DestReg) {
25483         Res.first = DestReg;
25484         Res.second = &X86::GR8RegClass;
25485       }
25486     } else if (VT == MVT::i32 || VT == MVT::f32) {
25487       unsigned DestReg = 0;
25488       switch (Res.first) {
25489       default: break;
25490       case X86::AX: DestReg = X86::EAX; break;
25491       case X86::DX: DestReg = X86::EDX; break;
25492       case X86::CX: DestReg = X86::ECX; break;
25493       case X86::BX: DestReg = X86::EBX; break;
25494       case X86::SI: DestReg = X86::ESI; break;
25495       case X86::DI: DestReg = X86::EDI; break;
25496       case X86::BP: DestReg = X86::EBP; break;
25497       case X86::SP: DestReg = X86::ESP; break;
25498       }
25499       if (DestReg) {
25500         Res.first = DestReg;
25501         Res.second = &X86::GR32RegClass;
25502       }
25503     } else if (VT == MVT::i64 || VT == MVT::f64) {
25504       unsigned DestReg = 0;
25505       switch (Res.first) {
25506       default: break;
25507       case X86::AX: DestReg = X86::RAX; break;
25508       case X86::DX: DestReg = X86::RDX; break;
25509       case X86::CX: DestReg = X86::RCX; break;
25510       case X86::BX: DestReg = X86::RBX; break;
25511       case X86::SI: DestReg = X86::RSI; break;
25512       case X86::DI: DestReg = X86::RDI; break;
25513       case X86::BP: DestReg = X86::RBP; break;
25514       case X86::SP: DestReg = X86::RSP; break;
25515       }
25516       if (DestReg) {
25517         Res.first = DestReg;
25518         Res.second = &X86::GR64RegClass;
25519       }
25520     }
25521   } else if (Res.second == &X86::FR32RegClass ||
25522              Res.second == &X86::FR64RegClass ||
25523              Res.second == &X86::VR128RegClass ||
25524              Res.second == &X86::VR256RegClass ||
25525              Res.second == &X86::FR32XRegClass ||
25526              Res.second == &X86::FR64XRegClass ||
25527              Res.second == &X86::VR128XRegClass ||
25528              Res.second == &X86::VR256XRegClass ||
25529              Res.second == &X86::VR512RegClass) {
25530     // Handle references to XMM physical registers that got mapped into the
25531     // wrong class.  This can happen with constraints like {xmm0} where the
25532     // target independent register mapper will just pick the first match it can
25533     // find, ignoring the required type.
25534
25535     if (VT == MVT::f32 || VT == MVT::i32)
25536       Res.second = &X86::FR32RegClass;
25537     else if (VT == MVT::f64 || VT == MVT::i64)
25538       Res.second = &X86::FR64RegClass;
25539     else if (X86::VR128RegClass.hasType(VT))
25540       Res.second = &X86::VR128RegClass;
25541     else if (X86::VR256RegClass.hasType(VT))
25542       Res.second = &X86::VR256RegClass;
25543     else if (X86::VR512RegClass.hasType(VT))
25544       Res.second = &X86::VR512RegClass;
25545   }
25546
25547   return Res;
25548 }
25549
25550 int X86TargetLowering::getScalingFactorCost(const AddrMode &AM,
25551                                             Type *Ty,
25552                                             unsigned AS) const {
25553   // Scaling factors are not free at all.
25554   // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
25555   // will take 2 allocations in the out of order engine instead of 1
25556   // for plain addressing mode, i.e. inst (reg1).
25557   // E.g.,
25558   // vaddps (%rsi,%drx), %ymm0, %ymm1
25559   // Requires two allocations (one for the load, one for the computation)
25560   // whereas:
25561   // vaddps (%rsi), %ymm0, %ymm1
25562   // Requires just 1 allocation, i.e., freeing allocations for other operations
25563   // and having less micro operations to execute.
25564   //
25565   // For some X86 architectures, this is even worse because for instance for
25566   // stores, the complex addressing mode forces the instruction to use the
25567   // "load" ports instead of the dedicated "store" port.
25568   // E.g., on Haswell:
25569   // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
25570   // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
25571   if (isLegalAddressingMode(AM, Ty, AS))
25572     // Scale represents reg2 * scale, thus account for 1
25573     // as soon as we use a second register.
25574     return AM.Scale != 0;
25575   return -1;
25576 }
25577
25578 bool X86TargetLowering::isTargetFTOL() const {
25579   return Subtarget->isTargetKnownWindowsMSVC() && !Subtarget->is64Bit();
25580 }