AVX-512: Floating point conversions for SKX - DAG Lowering.

[oota-llvm.git] / lib / Target / X86 / X86ISelLowering.cpp
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index 3ba811574489259d55d6e2858553d395d9d9f59a..78b35010be6ea8106f171b56cc31d4efad971575 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -67,12 +67,6 @@ static cl::opt<bool> ExperimentalVectorWideningLegalization(
               "rather than promotion."),
      cl::Hidden);
  
-static cl::opt<int> ReciprocalEstimateRefinementSteps(
-    "x86-recip-refinement-steps", cl::init(1),
-    cl::desc("Specify the number of Newton-Raphson iterations applied to the "
-             "result of the hardware reciprocal estimate instruction."),
-    cl::NotHidden);
-
  // Forward declarations.
  static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
                         SDValue V2);
@@ -82,7 +76,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
      : TargetLowering(TM), Subtarget(&STI) {
    X86ScalarSSEf64 = Subtarget->hasSSE2();
    X86ScalarSSEf32 = Subtarget->hasSSE1();
-  TD = getDataLayout();
+  TD = TM.getDataLayout();
  
    // Set up the TargetLowering object.
    static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };
@@ -511,7 +505,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
    setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
    setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
  
-  setOperationAction(ISD::DYNAMIC_STACKALLOC, getPointerTy(), Custom);
+  setOperationAction(ISD::DYNAMIC_STACKALLOC, getPointerTy(*TD), Custom);
  
    // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
    setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
@@ -831,6 +825,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
      setOperationAction(ISD::FNEG,               MVT::v2f64, Custom);
      setOperationAction(ISD::FABS,               MVT::v2f64, Custom);
  
+    setOperationAction(ISD::SMAX,               MVT::v8i16, Legal);
+    setOperationAction(ISD::UMAX,               MVT::v16i8, Legal);
+    setOperationAction(ISD::SMIN,               MVT::v8i16, Legal);
+    setOperationAction(ISD::UMIN,               MVT::v16i8, Legal);
+
      setOperationAction(ISD::SETCC,              MVT::v2i64, Custom);
      setOperationAction(ISD::SETCC,              MVT::v16i8, Custom);
      setOperationAction(ISD::SETCC,              MVT::v8i16, Custom);
@@ -842,13 +841,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
      setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
      setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
  
-    // Only provide customized ctpop vector bit twiddling for vector types we
-    // know to perform better than using the popcnt instructions on each vector
-    // element. If popcnt isn't supported, always provide the custom version.
-    if (!Subtarget->hasPOPCNT()) {
-      setOperationAction(ISD::CTPOP,            MVT::v4i32, Custom);
-      setOperationAction(ISD::CTPOP,            MVT::v2i64, Custom);
-    }
+    setOperationAction(ISD::CTPOP,              MVT::v16i8, Custom);
+    setOperationAction(ISD::CTPOP,              MVT::v8i16, Custom);
+    setOperationAction(ISD::CTPOP,              MVT::v4i32, Custom);
+    setOperationAction(ISD::CTPOP,              MVT::v2i64, Custom);
  
      // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
      for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
@@ -924,6 +920,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
      setOperationAction(ISD::FP_TO_SINT,         MVT::v4i32, Legal);
      setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Legal);
  
+    setOperationAction(ISD::SINT_TO_FP,         MVT::v2i32, Custom);
+
      setOperationAction(ISD::UINT_TO_FP,         MVT::v4i8,  Custom);
      setOperationAction(ISD::UINT_TO_FP,         MVT::v4i16, Custom);
      // As there is no 64-bit GPR available, we need build a special custom
@@ -951,6 +949,15 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
        setOperationAction(ISD::FNEARBYINT,       RoundedTy,  Legal);
      }
  
+    setOperationAction(ISD::SMAX,               MVT::v16i8, Legal);
+    setOperationAction(ISD::SMAX,               MVT::v4i32, Legal);
+    setOperationAction(ISD::UMAX,               MVT::v8i16, Legal);
+    setOperationAction(ISD::UMAX,               MVT::v4i32, Legal);
+    setOperationAction(ISD::SMIN,               MVT::v16i8, Legal);
+    setOperationAction(ISD::SMIN,               MVT::v4i32, Legal);
+    setOperationAction(ISD::UMIN,               MVT::v8i16, Legal);
+    setOperationAction(ISD::UMIN,               MVT::v4i32, Legal);
+
      // FIXME: Do we need to handle scalar-to-vector here?
      setOperationAction(ISD::MUL,                MVT::v4i32, Legal);
  
@@ -1025,6 +1032,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
      setOperationAction(ISD::SHL,               MVT::v2i64, Custom);
      setOperationAction(ISD::SHL,               MVT::v4i32, Custom);
  
+    setOperationAction(ISD::SRA,               MVT::v2i64, Custom);
      setOperationAction(ISD::SRA,               MVT::v4i32, Custom);
    }
  
@@ -1113,7 +1121,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
      setOperationAction(ISD::TRUNCATE,          MVT::v8i16, Custom);
      setOperationAction(ISD::TRUNCATE,          MVT::v4i32, Custom);
  
-    if (Subtarget->hasFMA() || Subtarget->hasFMA4()) {
+    setOperationAction(ISD::CTPOP,             MVT::v32i8, Custom);
+    setOperationAction(ISD::CTPOP,             MVT::v16i16, Custom);
+    setOperationAction(ISD::CTPOP,             MVT::v8i32, Custom);
+    setOperationAction(ISD::CTPOP,             MVT::v4i64, Custom);
+
+    if (Subtarget->hasFMA() || Subtarget->hasFMA4() || Subtarget->hasAVX512()) {
        setOperationAction(ISD::FMA,             MVT::v8f32, Legal);
        setOperationAction(ISD::FMA,             MVT::v4f64, Legal);
        setOperationAction(ISD::FMA,             MVT::v4f32, Legal);
@@ -1143,20 +1156,23 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
        setOperationAction(ISD::MULHU,           MVT::v16i16, Legal);
        setOperationAction(ISD::MULHS,           MVT::v16i16, Legal);
  
+      setOperationAction(ISD::SMAX,            MVT::v32i8,  Legal);
+      setOperationAction(ISD::SMAX,            MVT::v16i16, Legal);
+      setOperationAction(ISD::SMAX,            MVT::v8i32,  Legal);
+      setOperationAction(ISD::UMAX,            MVT::v32i8,  Legal);
+      setOperationAction(ISD::UMAX,            MVT::v16i16, Legal);
+      setOperationAction(ISD::UMAX,            MVT::v8i32,  Legal);
+      setOperationAction(ISD::SMIN,            MVT::v32i8,  Legal);
+      setOperationAction(ISD::SMIN,            MVT::v16i16, Legal);
+      setOperationAction(ISD::SMIN,            MVT::v8i32,  Legal);
+      setOperationAction(ISD::UMIN,            MVT::v32i8,  Legal);
+      setOperationAction(ISD::UMIN,            MVT::v16i16, Legal);
+      setOperationAction(ISD::UMIN,            MVT::v8i32,  Legal);
+
        // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
        // when we have a 256bit-wide blend with immediate.
        setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
  
-      // Only provide customized ctpop vector bit twiddling for vector types we
-      // know to perform better than using the popcnt instructions on each
-      // vector element. If popcnt isn't supported, always provide the custom
-      // version.
-      if (!Subtarget->hasPOPCNT())
-        setOperationAction(ISD::CTPOP,           MVT::v4i64, Custom);
-
-      // Custom CTPOP always performs better on natively supported v8i32
-      setOperationAction(ISD::CTPOP,             MVT::v8i32, Custom);
-
        // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
        setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
        setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32,  MVT::v8i8,  Legal);
@@ -1196,6 +1212,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
      setOperationAction(ISD::SHL,               MVT::v4i64, Custom);
      setOperationAction(ISD::SHL,               MVT::v8i32, Custom);
  
+    setOperationAction(ISD::SRA,               MVT::v4i64, Custom);
      setOperationAction(ISD::SRA,               MVT::v8i32, Custom);
  
      // Custom lower several nodes for 256-bit types.
@@ -1273,7 +1290,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
      setLoadExtAction(ISD::SEXTLOAD, MVT::v8i64,  MVT::v8i16,  Legal);
      setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i64,  MVT::v8i32,  Legal);
      setLoadExtAction(ISD::SEXTLOAD, MVT::v8i64,  MVT::v8i32,  Legal);
-    
+
      setOperationAction(ISD::BR_CC,              MVT::i1,    Expand);
      setOperationAction(ISD::SETCC,              MVT::i1,    Custom);
      setOperationAction(ISD::XOR,                MVT::i1,    Legal);
@@ -1335,8 +1352,33 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
      setOperationAction(ISD::TRUNCATE,           MVT::v16i8, Custom);
      setOperationAction(ISD::TRUNCATE,           MVT::v8i32, Custom);
      if (Subtarget->hasDQI()) {
-      setOperationAction(ISD::TRUNCATE,           MVT::v2i1, Custom);
-      setOperationAction(ISD::TRUNCATE,           MVT::v4i1, Custom);
+      setOperationAction(ISD::TRUNCATE,         MVT::v2i1, Custom);
+      setOperationAction(ISD::TRUNCATE,         MVT::v4i1, Custom);
+
+      setOperationAction(ISD::SINT_TO_FP,       MVT::v8i64, Legal);
+      setOperationAction(ISD::UINT_TO_FP,       MVT::v8i64, Legal);
+      setOperationAction(ISD::FP_TO_SINT,       MVT::v8i64, Legal);
+      setOperationAction(ISD::FP_TO_UINT,       MVT::v8i64, Legal);
+      if (Subtarget->hasVLX()) {
+        setOperationAction(ISD::SINT_TO_FP,    MVT::v4i64, Legal);
+        setOperationAction(ISD::SINT_TO_FP,    MVT::v2i64, Legal);
+        setOperationAction(ISD::UINT_TO_FP,    MVT::v4i64, Legal);
+        setOperationAction(ISD::UINT_TO_FP,    MVT::v2i64, Legal);
+        setOperationAction(ISD::FP_TO_SINT,    MVT::v4i64, Legal);
+        setOperationAction(ISD::FP_TO_SINT,    MVT::v2i64, Legal);
+        setOperationAction(ISD::FP_TO_UINT,    MVT::v4i64, Legal);
+        setOperationAction(ISD::FP_TO_UINT,    MVT::v2i64, Legal);
+      }
+    }
+    if (Subtarget->hasVLX()) {
+      setOperationAction(ISD::SINT_TO_FP,       MVT::v8i32, Legal);
+      setOperationAction(ISD::UINT_TO_FP,       MVT::v8i32, Legal);
+      setOperationAction(ISD::FP_TO_SINT,       MVT::v8i32, Legal);
+      setOperationAction(ISD::FP_TO_UINT,       MVT::v8i32, Legal);
+      setOperationAction(ISD::SINT_TO_FP,       MVT::v4i32, Legal);
+      setOperationAction(ISD::UINT_TO_FP,       MVT::v4i32, Legal);
+      setOperationAction(ISD::FP_TO_SINT,       MVT::v4i32, Legal);
+      setOperationAction(ISD::FP_TO_UINT,       MVT::v4i32, Legal);
      }
      setOperationAction(ISD::TRUNCATE,           MVT::v8i1, Custom);
      setOperationAction(ISD::TRUNCATE,           MVT::v16i1, Custom);
@@ -1388,6 +1430,15 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
      setOperationAction(ISD::SELECT,             MVT::v16i1, Custom);
      setOperationAction(ISD::SELECT,             MVT::v8i1,  Custom);
  
+    setOperationAction(ISD::SMAX,               MVT::v16i32, Legal);
+    setOperationAction(ISD::SMAX,               MVT::v8i64, Legal);
+    setOperationAction(ISD::UMAX,               MVT::v16i32, Legal);
+    setOperationAction(ISD::UMAX,               MVT::v8i64, Legal);
+    setOperationAction(ISD::SMIN,               MVT::v16i32, Legal);
+    setOperationAction(ISD::SMIN,               MVT::v8i64, Legal);
+    setOperationAction(ISD::UMIN,               MVT::v16i32, Legal);
+    setOperationAction(ISD::UMIN,               MVT::v8i64, Legal);
+
      setOperationAction(ISD::ADD,                MVT::v8i64, Legal);
      setOperationAction(ISD::ADD,                MVT::v16i32, Legal);
  
@@ -1485,6 +1536,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
      setOperationAction(ISD::SUB,                MVT::v32i16, Legal);
      setOperationAction(ISD::SUB,                MVT::v64i8, Legal);
      setOperationAction(ISD::MUL,                MVT::v32i16, Legal);
+    setOperationAction(ISD::MULHS,              MVT::v32i16, Legal);
+    setOperationAction(ISD::MULHU,              MVT::v32i16, Legal);
      setOperationAction(ISD::CONCAT_VECTORS,     MVT::v32i1, Custom);
      setOperationAction(ISD::CONCAT_VECTORS,     MVT::v64i1, Custom);
      setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v32i1, Custom);
@@ -1504,6 +1557,15 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
      setOperationAction(ISD::TRUNCATE,           MVT::v32i1, Custom);
      setOperationAction(ISD::TRUNCATE,           MVT::v64i1, Custom);
  
+    setOperationAction(ISD::SMAX,               MVT::v64i8, Legal);
+    setOperationAction(ISD::SMAX,               MVT::v32i16, Legal);
+    setOperationAction(ISD::UMAX,               MVT::v64i8, Legal);
+    setOperationAction(ISD::UMAX,               MVT::v32i16, Legal);
+    setOperationAction(ISD::SMIN,               MVT::v64i8, Legal);
+    setOperationAction(ISD::SMIN,               MVT::v32i16, Legal);
+    setOperationAction(ISD::UMIN,               MVT::v64i8, Legal);
+    setOperationAction(ISD::UMIN,               MVT::v32i16, Legal);
+
      for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
        const MVT VT = (MVT::SimpleValueType)i;
  
@@ -1543,6 +1605,15 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
      setOperationAction(ISD::XOR,                MVT::v4i32, Legal);
      setOperationAction(ISD::SRA,                MVT::v2i64, Custom);
      setOperationAction(ISD::SRA,                MVT::v4i64, Custom);
+
+    setOperationAction(ISD::SMAX,               MVT::v2i64, Legal);
+    setOperationAction(ISD::SMAX,               MVT::v4i64, Legal);
+    setOperationAction(ISD::UMAX,               MVT::v2i64, Legal);
+    setOperationAction(ISD::UMAX,               MVT::v4i64, Legal);
+    setOperationAction(ISD::SMIN,               MVT::v2i64, Legal);
+    setOperationAction(ISD::SMIN,               MVT::v4i64, Legal);
+    setOperationAction(ISD::UMIN,               MVT::v2i64, Legal);
+    setOperationAction(ISD::UMIN,               MVT::v4i64, Legal);
    }
  
    // We want to custom lower some of our intrinsics.
@@ -1623,6 +1694,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
    setTargetDAGCombine(ISD::SIGN_EXTEND);
    setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
    setTargetDAGCombine(ISD::SINT_TO_FP);
+  setTargetDAGCombine(ISD::UINT_TO_FP);
    setTargetDAGCombine(ISD::SETCC);
    setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
    setTargetDAGCombine(ISD::BUILD_VECTOR);
@@ -1664,7 +1736,8 @@ X86TargetLowering::getPreferredVectorAction(EVT VT) const {
    return TargetLoweringBase::getPreferredVectorAction(VT);
  }
  
-EVT X86TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
+EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
+                                          EVT VT) const {
    if (!VT.isVector())
      return Subtarget->hasAVX512() ? MVT::i1: MVT::i8;
  
@@ -1736,10 +1809,11 @@ static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
  /// function arguments in the caller parameter area. For X86, aggregates
  /// that contain SSE vectors are placed at 16-byte boundaries while the rest
  /// are at 4-byte boundaries.
-unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const {
+unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
+                                                  const DataLayout &DL) const {
    if (Subtarget->is64Bit()) {
      // Max of 8 and alignment of type.
-    unsigned TyAlign = TD->getABITypeAlignment(Ty);
+    unsigned TyAlign = DL.getABITypeAlignment(Ty);
      if (TyAlign > 8)
        return TyAlign;
      return 8;
@@ -1842,7 +1916,7 @@ X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
           Subtarget->isPICStyleGOT());
    // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
    // entries.
-  return MCSymbolRefExpr::Create(MBB->getSymbol(),
+  return MCSymbolRefExpr::create(MBB->getSymbol(),
                                   MCSymbolRefExpr::VK_GOTOFF, Ctx);
  }
  
@@ -1852,7 +1926,8 @@ SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
    if (!Subtarget->is64Bit())
      // This doesn't have SDLoc associated with it, but is not really the
      // same as a Register.
-    return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy());
+    return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
+                       getPointerTy(DAG.getDataLayout()));
    return Table;
  }
  
@@ -1866,7 +1941,7 @@ getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
      return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
  
    // Otherwise, the reference is relative to the PIC base.
-  return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx);
+  return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
  }
  
  std::pair<const TargetRegisterClass *, uint8_t>
@@ -1981,7 +2056,7 @@ X86TargetLowering::LowerReturn(SDValue Chain,
          ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
      }
      else if (VA.getLocInfo() == CCValAssign::BCvt)
-      ValToCopy = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), ValToCopy);
+      ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
  
      assert(VA.getLocInfo() != CCValAssign::FPExt &&
             "Unexpected FP-extend for return value.");
@@ -2018,13 +2093,13 @@ X86TargetLowering::LowerReturn(SDValue Chain,
      if (Subtarget->is64Bit()) {
        if (ValVT == MVT::x86mmx) {
          if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
-          ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ValToCopy);
+          ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
            ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
                                    ValToCopy);
            // If we don't have SSE2 available, convert to v4f32 so the generated
            // register is legal.
            if (!Subtarget->hasSSE2())
-            ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,ValToCopy);
+            ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
          }
        }
      }
@@ -2044,7 +2119,8 @@ X86TargetLowering::LowerReturn(SDValue Chain,
    // false, then an sret argument may be implicitly inserted in the SelDAG. In
    // either case FuncInfo->setSRetReturnReg() will have been called.
    if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
-    SDValue Val = DAG.getCopyFromReg(Chain, dl, SRetReg, getPointerTy());
+    SDValue Val = DAG.getCopyFromReg(Chain, dl, SRetReg,
+                                     getPointerTy(MF.getDataLayout()));
  
      unsigned RetValReg
          = (Subtarget->is64Bit() && !Subtarget->isTarget64BitILP32()) ?
@@ -2053,7 +2129,8 @@ X86TargetLowering::LowerReturn(SDValue Chain,
      Flag = Chain.getValue(1);
  
      // RAX/EAX now acts like a return value.
-    RetOps.push_back(DAG.getRegister(RetValReg, getPointerTy()));
+    RetOps.push_back(
+        DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
    }
  
    RetOps[0] = Chain;  // Update chain.
@@ -2247,7 +2324,9 @@ static bool IsCCallConvention(CallingConv::ID CC) {
  }
  
  bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
-  if (!CI->isTailCall() || getTargetMachine().Options.DisableTailCalls)
+  auto Attr =
+      CI->getParent()->getParent()->getFnAttribute("disable-tail-calls");
+  if (!CI->isTailCall() || Attr.getValueAsString() == "true")
      return false;
  
    CallSite CS(CI);
@@ -2298,11 +2377,11 @@ X86TargetLowering::LowerMemArgument(SDValue Chain,
      unsigned Bytes = Flags.getByValSize();
      if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
      int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
-    return DAG.getFrameIndex(FI, getPointerTy());
+    return DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
    } else {
      int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
                                      VA.getLocMemOffset(), isImmutable);
-    SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
+    SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
      SDValue Val =  DAG.getLoad(ValVT, dl, Chain, FIN,
                                 MachinePointerInfo::getFixedStack(FI),
                                 false, false, false, 0);
@@ -2451,7 +2530,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
          ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
                                 DAG.getValueType(VA.getValVT()));
        else if (VA.getLocInfo() == CCValAssign::BCvt)
-        ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
+        ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
  
        if (VA.isExtInLoc()) {
          // Handle MMX values passed in XMM regs.
@@ -2481,7 +2560,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
      if (Ins[i].Flags.isSRet()) {
        unsigned Reg = FuncInfo->getSRetReturnReg();
        if (!Reg) {
-        MVT PtrTy = getPointerTy();
+        MVT PtrTy = getPointerTy(DAG.getDataLayout());
          Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
          FuncInfo->setSRetReturnReg(Reg);
        }
@@ -2509,7 +2588,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
  
    MachineModuleInfo &MMI = MF.getMMI();
    const Function *WinEHParent = nullptr;
-  if (IsWin64 && MMI.hasWinEHFuncInfo(Fn))
+  if (MMI.hasWinEHFuncInfo(Fn))
      WinEHParent = MMI.getWinEHParent(Fn);
    bool IsWinEHOutlined = WinEHParent && WinEHParent != Fn;
    bool IsWinEHParent = WinEHParent && WinEHParent == Fn;
@@ -2571,11 +2650,11 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
      // Store the integer parameter registers.
      SmallVector<SDValue, 8> MemOps;
      SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
-                                      getPointerTy());
+                                      getPointerTy(DAG.getDataLayout()));
      unsigned Offset = FuncInfo->getVarArgsGPOffset();
      for (SDValue Val : LiveGPRs) {
-      SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
-                                DAG.getIntPtrConstant(Offset, dl));
+      SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
+                                RSFIN, DAG.getIntPtrConstant(Offset, dl));
        SDValue Store =
          DAG.getStore(Val.getValue(1), dl, Val, FIN,
                       MachinePointerInfo::getFixedStack(
@@ -2602,7 +2681,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
  
      if (!MemOps.empty())
        Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
-  } else if (IsWinEHOutlined) {
+  } else if (IsWin64 && IsWinEHOutlined) {
      // Get to the caller-allocated home save location.  Add 8 to account
      // for the return address.
      int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
@@ -2615,8 +2694,8 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
  
      // Store the second integer parameter (rdx) into rsp+16 relative to the
      // stack pointer at the entry of the function.
-    SDValue RSFIN =
-        DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), getPointerTy());
+    SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
+                                      getPointerTy(DAG.getDataLayout()));
      unsigned GPR = MF.addLiveIn(X86::RDX, &X86::GR64RegClass);
      SDValue Val = DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64);
      Chain = DAG.getStore(
@@ -2690,14 +2769,21 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
    FuncInfo->setArgumentStackSize(StackSize);
  
    if (IsWinEHParent) {
-    int UnwindHelpFI = MFI->CreateStackObject(8, 8, /*isSS=*/false);
-    SDValue StackSlot = DAG.getFrameIndex(UnwindHelpFI, MVT::i64);
-    MMI.getWinEHFuncInfo(MF.getFunction()).UnwindHelpFrameIdx = UnwindHelpFI;
-    SDValue Neg2 = DAG.getConstant(-2, dl, MVT::i64);
-    Chain = DAG.getStore(Chain, dl, Neg2, StackSlot,
-                         MachinePointerInfo::getFixedStack(UnwindHelpFI),
-                         /*isVolatile=*/true,
-                         /*isNonTemporal=*/false, /*Alignment=*/0);
+    if (Is64Bit) {
+      int UnwindHelpFI = MFI->CreateStackObject(8, 8, /*isSS=*/false);
+      SDValue StackSlot = DAG.getFrameIndex(UnwindHelpFI, MVT::i64);
+      MMI.getWinEHFuncInfo(MF.getFunction()).UnwindHelpFrameIdx = UnwindHelpFI;
+      SDValue Neg2 = DAG.getConstant(-2, dl, MVT::i64);
+      Chain = DAG.getStore(Chain, dl, Neg2, StackSlot,
+                           MachinePointerInfo::getFixedStack(UnwindHelpFI),
+                           /*isVolatile=*/true,
+                           /*isNonTemporal=*/false, /*Alignment=*/0);
+    } else {
+      // Functions using Win32 EH are considered to have opaque SP adjustments
+      // to force local variables to be addressed from the frame or base
+      // pointers.
+      MFI->setHasOpaqueSPAdjustment(true);
+    }
    }
  
    return Chain;
@@ -2711,7 +2797,8 @@ X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
                                      ISD::ArgFlagsTy Flags) const {
    unsigned LocMemOffset = VA.getLocMemOffset();
    SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
-  PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
+  PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
+                       StackPtr, PtrOff);
    if (Flags.isByVal())
      return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
  
@@ -2728,7 +2815,7 @@ X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
                                             bool IsTailCall, bool Is64Bit,
                                             int FPDiff, SDLoc dl) const {
    // Adjust the Return address stack slot.
-  EVT VT = getPointerTy();
+  EVT VT = getPointerTy(DAG.getDataLayout());
    OutRetAddr = getReturnAddressFrameIndex(DAG);
  
    // Load the "old" Return address.
@@ -2776,10 +2863,24 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
    StructReturnType SR = callIsStructReturn(Outs);
    bool IsSibcall      = false;
    X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
+  auto Attr = MF.getFunction()->getFnAttribute("disable-tail-calls");
  
-  if (MF.getTarget().Options.DisableTailCalls)
+  if (Attr.getValueAsString() == "true")
      isTailCall = false;
  
+  if (Subtarget->isPICStyleGOT() &&
+      !MF.getTarget().Options.GuaranteedTailCallOpt) {
+    // If we are using a GOT, disable tail calls to external symbols with
+    // default visibility. Tail calling such a symbol requires using a GOT
+    // relocation, which forces early binding of the symbol. This breaks code
+    // that require lazy function symbol resolution. Using musttail or
+    // GuaranteedTailCallOpt will override this.
+    GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
+    if (!G || (!G->getGlobal()->hasLocalLinkage() &&
+               G->getGlobal()->hasDefaultVisibility()))
+      isTailCall = false;
+  }
+
    bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall();
    if (IsMustTail) {
      // Force this to be a tail call.  The verifier rules are enough to ensure
@@ -2898,14 +2999,14 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
          Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
        else if (RegVT.is128BitVector()) {
          // Special case: passing MMX values in XMM registers.
-        Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
+        Arg = DAG.getBitcast(MVT::i64, Arg);
          Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
          Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
        } else
          Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
        break;
      case CCValAssign::BCvt:
-      Arg = DAG.getNode(ISD::BITCAST, dl, RegVT, Arg);
+      Arg = DAG.getBitcast(RegVT, Arg);
        break;
      case CCValAssign::Indirect: {
        // Store the argument.
@@ -2938,7 +3039,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
        assert(VA.isMemLoc());
        if (!StackPtr.getNode())
          StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
-                                      getPointerTy());
+                                      getPointerTy(DAG.getDataLayout()));
        MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
                                               dl, DAG, VA, Flags));
      }
@@ -2951,8 +3052,9 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
      // ELF / PIC requires GOT in the EBX register before function calls via PLT
      // GOT pointer.
      if (!isTailCall) {
-      RegsToPass.push_back(std::make_pair(unsigned(X86::EBX),
-               DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy())));
+      RegsToPass.push_back(std::make_pair(
+          unsigned(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
+                                          getPointerTy(DAG.getDataLayout()))));
      } else {
        // If we are tail calling and generating PIC/GOT style code load the
        // address of the callee into ECX. The value in ecx is used as target of
@@ -2964,8 +3066,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
  
        // Note: The actual moving to ECX is done further down.
        GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
-      if (G && !G->getGlobal()->hasHiddenVisibility() &&
-          !G->getGlobal()->hasProtectedVisibility())
+      if (G && !G->getGlobal()->hasLocalLinkage() &&
+          G->getGlobal()->hasDefaultVisibility())
          Callee = LowerGlobalAddress(Callee, DAG);
        else if (isa<ExternalSymbolSDNode>(Callee))
          Callee = LowerExternalSymbol(Callee, DAG);
@@ -3032,16 +3134,16 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
        int32_t Offset = VA.getLocMemOffset()+FPDiff;
        uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
        FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
-      FIN = DAG.getFrameIndex(FI, getPointerTy());
+      FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
  
        if (Flags.isByVal()) {
          // Copy relative to framepointer.
          SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
          if (!StackPtr.getNode())
-          StackPtr = DAG.getCopyFromReg(Chain, dl,
-                                        RegInfo->getStackRegister(),
-                                        getPointerTy());
-        Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
+          StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
+                                        getPointerTy(DAG.getDataLayout()));
+        Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
+                             StackPtr, Source);
  
          MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
                                                           ArgChain,
@@ -3060,8 +3162,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
  
      // Store the return address to the appropriate stack slot.
      Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
-                                     getPointerTy(), RegInfo->getSlotSize(),
-                                     FPDiff, dl);
+                                     getPointerTy(DAG.getDataLayout()),
+                                     RegInfo->getSlotSize(), FPDiff, dl);
    }
  
    // Build a sequence of copy-to-reg nodes chained together with token chain
@@ -3102,7 +3204,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
            GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
          OpFlags = X86II::MO_PLT;
        } else if (Subtarget->isPICStyleStubAny() &&
-                 (GV->isDeclaration() || GV->isWeakForLinker()) &&
+                 !GV->isStrongDefinitionForLinker() &&
                   (!Subtarget->getTargetTriple().isMacOSX() ||
                    Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
          // PC-relative references to external symbols should go through $stub,
@@ -3119,17 +3221,18 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
          ExtraLoad = true;
        }
  
-      Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(),
-                                          G->getOffset(), OpFlags);
+      Callee = DAG.getTargetGlobalAddress(
+          GV, dl, getPointerTy(DAG.getDataLayout()), G->getOffset(), OpFlags);
  
        // Add a wrapper if needed.
        if (WrapperKind != ISD::DELETED_NODE)
-        Callee = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Callee);
+        Callee = DAG.getNode(X86ISD::WrapperRIP, dl,
+                             getPointerTy(DAG.getDataLayout()), Callee);
        // Add extra indirection if needed.
        if (ExtraLoad)
-        Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee,
-                             MachinePointerInfo::getGOT(),
-                             false, false, false, 0);
+        Callee = DAG.getLoad(
+            getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee,
+            MachinePointerInfo::getGOT(), false, false, false, 0);
      }
    } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
      unsigned char OpFlags = 0;
@@ -3148,8 +3251,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
        OpFlags = X86II::MO_DARWIN_STUB;
      }
  
-    Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
-                                         OpFlags);
+    Callee = DAG.getTargetExternalSymbol(
+        S->getSymbol(), getPointerTy(DAG.getDataLayout()), OpFlags);
    } else if (Subtarget->isTarget64BitILP32() &&
               Callee->getValueType(0) == MVT::i32) {
      // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI
@@ -3180,9 +3283,24 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                                    RegsToPass[i].second.getValueType()));
  
    // Add a register mask operand representing the call-preserved registers.
-  const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
-  const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
+  const uint32_t *Mask = RegInfo->getCallPreservedMask(MF, CallConv);
    assert(Mask && "Missing call preserved mask for calling convention");
+
+  // If this is an invoke in a 32-bit function using an MSVC personality, assume
+  // the function clobbers all registers. If an exception is thrown, the runtime
+  // will not restore CSRs.
+  // FIXME: Model this more precisely so that we can register allocate across
+  // the normal edge and spill and fill across the exceptional edge.
+  if (!Is64Bit && CLI.CS && CLI.CS->isInvoke()) {
+    const Function *CallerFn = MF.getFunction();
+    EHPersonality Pers =
+        CallerFn->hasPersonalityFn()
+            ? classifyEHPersonality(CallerFn->getPersonalityFn())
+            : EHPersonality::Unknown;
+    if (isMSVCEHPersonality(Pers))
+      Mask = RegInfo->getNoPreservedMask();
+  }
+
    Ops.push_back(DAG.getRegisterMask(Mask));
  
    if (InFlag.getNode())
@@ -3646,7 +3764,7 @@ SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
      FuncInfo->setRAIndex(ReturnAddrIndex);
    }
  
-  return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
+  return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
  }
  
  bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
@@ -3877,6 +3995,15 @@ bool X86TargetLowering::isCheapToSpeculateCtlz() const {
    return Subtarget->hasLZCNT();
  }
  
+/// isUndefInRange - Return true if every element in Mask, beginning
+/// from position Pos and ending in Pos+Size is undef.
+static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
+  for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
+    if (0 <= Mask[i])
+      return false;
+  return true;
+}
+
  /// isUndefOrInRange - Return true if Val is undef or if its value falls within
  /// the specified range (L, H].
  static bool isUndefOrInRange(int Val, int Low, int Hi) {
@@ -4073,7 +4200,7 @@ static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
    } else
      llvm_unreachable("Unexpected vector type");
  
-  return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
+  return DAG.getBitcast(VT, Vec);
  }
  
  static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal,
@@ -4200,9 +4327,9 @@ static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
      MVT CastVT = Subtarget.hasAVX2() ? MVT::v8i32 : MVT::v8f32;
  
      SDValue Mask = DAG.getConstant(0x0f, dl, MVT::i8);
-    Vec256 = DAG.getNode(ISD::BITCAST, dl, CastVT, Vec256);
+    Vec256 = DAG.getBitcast(CastVT, Vec256);
      Vec256 = DAG.getNode(X86ISD::BLENDI, dl, CastVT, Result, Vec256, Mask);
-    return DAG.getNode(ISD::BITCAST, dl, ResultVT, Vec256);
+    return DAG.getBitcast(ResultVT, Vec256);
    }
  
    return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
@@ -4255,7 +4382,7 @@ static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG,
    } else
      llvm_unreachable("Unexpected vector type");
  
-  return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
+  return DAG.getBitcast(VT, Vec);
  }
  
  /// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd
@@ -4318,6 +4445,7 @@ static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
  /// IsUnary to true if only uses one source. Note that this will set IsUnary for
  /// shuffles which use a single input multiple times, and in those cases it will
  /// adjust the mask to only have indices within that single input.
+/// FIXME: Add support for Decode*Mask functions that return SM_SentinelZero.
  static bool getTargetShuffleMask(SDNode *N, MVT VT,
                                   SmallVectorImpl<int> &Mask, bool &IsUnary) {
    unsigned NumElems = VT.getVectorNumElements();
@@ -4447,6 +4575,10 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT,
      ImmN = N->getOperand(N->getNumOperands()-1);
      DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
      if (Mask.empty()) return false;
+    // Mask only contains negative index if an element is zero.
+    if (std::any_of(Mask.begin(), Mask.end(),
+                    [](int M){ return M == SM_SentinelZero; }))
+      return false;
      break;
    case X86ISD::MOVSLDUP:
      DecodeMOVSLDUPMask(VT, Mask);
@@ -4611,7 +4743,7 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
      }
    }
  
-  return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V);
+  return DAG.getBitcast(MVT::v16i8, V);
  }
  
  /// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16.
@@ -4749,7 +4881,7 @@ static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
    SDLoc DL(Op);
    SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
                                 DAG.getIntPtrConstant(InsertPSMask, DL));
-  return DAG.getNode(ISD::BITCAST, DL, VT, Result);
+  return DAG.getBitcast(VT, Result);
  }
  
  /// Return a vector logical shift node.
@@ -4759,12 +4891,11 @@ static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
    assert(VT.is128BitVector() && "Unknown type for VShift");
    MVT ShVT = MVT::v2i64;
    unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
-  SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp);
-  MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(SrcOp.getValueType());
+  SrcOp = DAG.getBitcast(ShVT, SrcOp);
+  MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(DAG.getDataLayout(), VT);
    assert(NumBits % 8 == 0 && "Only support byte sized shifts");
    SDValue ShiftVal = DAG.getConstant(NumBits/8, dl, ScalarShiftTy);
-  return DAG.getNode(ISD::BITCAST, dl, VT,
-                     DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
+  return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
  }
  
  static SDValue
@@ -4949,7 +5080,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
                               SDValue(ResNode.getNode(), 1));
      }
  
-    return DAG.getNode(ISD::BITCAST, DL, VT, ResNode);
+    return DAG.getBitcast(VT, ResNode);
    }
    return SDValue();
  }
@@ -5079,7 +5210,8 @@ static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
        assert(C && "Invalid constant type");
  
        const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-      SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
+      SDValue CP =
+          DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
        unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
        Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP,
                         MachinePointerInfo::getConstantPool(),
@@ -5261,8 +5393,8 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
    if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
      SDValue Imm = ConvertI1VectorToInterger(Op, DAG);
      if (Imm.getValueSizeInBits() == VT.getSizeInBits())
-      return DAG.getNode(ISD::BITCAST, dl, VT, Imm);
-    SDValue ExtVec = DAG.getNode(ISD::BITCAST, dl, MVT::v8i1, Imm);
+      return DAG.getBitcast(VT, Imm);
+    SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
                          DAG.getIntPtrConstant(0, dl));
    }
@@ -5277,7 +5409,7 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
      SDValue In = Op.getOperand(idx);
      if (In.getOpcode() == ISD::UNDEF)
        continue;
-    if (!isa<ConstantSDNode>(In)) 
+    if (!isa<ConstantSDNode>(In))
        NonConstIdx.push_back(idx);
      else {
        Immediate |= cast<ConstantSDNode>(In)->getZExtValue() << idx;
@@ -5304,12 +5436,12 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
    }
    else if (HasConstElts)
      Imm = DAG.getConstant(0, dl, VT);
-  else 
+  else
      Imm = DAG.getUNDEF(VT);
    if (Imm.getValueSizeInBits() == VT.getSizeInBits())
-    DstVec = DAG.getNode(ISD::BITCAST, dl, VT, Imm);
+    DstVec = DAG.getBitcast(VT, Imm);
    else {
-    SDValue ExtVec = DAG.getNode(ISD::BITCAST, dl, MVT::v8i1, Imm);
+    SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
      DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
                           DAG.getIntPtrConstant(0, dl));
    }
@@ -5443,7 +5575,7 @@ static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
  ///
  /// Otherwise, the first horizontal binop dag node takes as input the lower
  /// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
-/// dag node takes the the upper 128-bit of V0 and the upper 128-bit of V1.
+/// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
  ///   Example:
  ///     HADD V0_LO, V1_LO
  ///     HADD V0_HI, V1_HI
@@ -5818,9 +5950,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
          // convert it to a vector with movd (S2V+shuffle to zero extend).
          Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
          Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
-        return DAG.getNode(
-            ISD::BITCAST, dl, VT,
-            getShuffleVectorZeroOrUndef(Item, Idx * 2, true, Subtarget, DAG));
+        return DAG.getBitcast(VT, getShuffleVectorZeroOrUndef(
+                                      Item, Idx * 2, true, Subtarget, DAG));
        }
      }
  
@@ -5866,7 +5997,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
            Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
            Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
          }
-        return DAG.getNode(ISD::BITCAST, dl, VT, Item);
+        return DAG.getBitcast(VT, Item);
        }
      }
  
@@ -6341,10 +6472,9 @@ static SDValue lowerVectorShuffleAsBitBlend(SDLoc DL, MVT VT, SDValue V1,
    V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
    // We have to cast V2 around.
    MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
-  V2 = DAG.getNode(ISD::BITCAST, DL, VT,
-                   DAG.getNode(X86ISD::ANDNP, DL, MaskVT,
-                               DAG.getNode(ISD::BITCAST, DL, MaskVT, V1Mask),
-                               DAG.getNode(ISD::BITCAST, DL, MaskVT, V2)));
+  V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::ANDNP, DL, MaskVT,
+                                      DAG.getBitcast(MaskVT, V1Mask),
+                                      DAG.getBitcast(MaskVT, V2)));
    return DAG.getNode(ISD::OR, DL, VT, V1, V2);
  }
  
@@ -6395,11 +6525,11 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
              BlendMask |= 1u << (i * Scale + j);
  
        MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
-      V1 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V1);
-      V2 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V2);
-      return DAG.getNode(ISD::BITCAST, DL, VT,
-                         DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
-                                     DAG.getConstant(BlendMask, DL, MVT::i8)));
+      V1 = DAG.getBitcast(BlendVT, V1);
+      V2 = DAG.getBitcast(BlendVT, V2);
+      return DAG.getBitcast(
+          VT, DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
+                          DAG.getConstant(BlendMask, DL, MVT::i8)));
      }
      // FALLTHROUGH
    case MVT::v8i16: {
@@ -6412,11 +6542,11 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
          for (int j = 0; j < Scale; ++j)
            BlendMask |= 1u << (i * Scale + j);
  
-    V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1);
-    V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V2);
-    return DAG.getNode(ISD::BITCAST, DL, VT,
-                       DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
-                                   DAG.getConstant(BlendMask, DL, MVT::i8)));
+    V1 = DAG.getBitcast(MVT::v8i16, V1);
+    V2 = DAG.getBitcast(MVT::v8i16, V2);
+    return DAG.getBitcast(VT,
+                          DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
+                                      DAG.getConstant(BlendMask, DL, MVT::i8)));
    }
  
    case MVT::v16i16: {
@@ -6465,13 +6595,12 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
                          : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
                                            MVT::i8));
  
-    V1 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V1);
-    V2 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V2);
-    return DAG.getNode(
-        ISD::BITCAST, DL, VT,
-        DAG.getNode(ISD::VSELECT, DL, BlendVT,
-                    DAG.getNode(ISD::BUILD_VECTOR, DL, BlendVT, VSELECTMask),
-                    V1, V2));
+    V1 = DAG.getBitcast(BlendVT, V1);
+    V2 = DAG.getBitcast(BlendVT, V2);
+    return DAG.getBitcast(VT, DAG.getNode(ISD::VSELECT, DL, BlendVT,
+                                          DAG.getNode(ISD::BUILD_VECTOR, DL,
+                                                      BlendVT, VSELECTMask),
+                                          V1, V2));
    }
  
    default:
@@ -6652,13 +6781,12 @@ static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1,
    if (Subtarget->hasSSSE3()) {
      // Cast the inputs to i8 vector of correct length to match PALIGNR.
      MVT AlignVT = MVT::getVectorVT(MVT::i8, 16 * NumLanes);
-    Lo = DAG.getNode(ISD::BITCAST, DL, AlignVT, Lo);
-    Hi = DAG.getNode(ISD::BITCAST, DL, AlignVT, Hi);
+    Lo = DAG.getBitcast(AlignVT, Lo);
+    Hi = DAG.getBitcast(AlignVT, Hi);
  
-    return DAG.getNode(ISD::BITCAST, DL, VT,
-                       DAG.getNode(X86ISD::PALIGNR, DL, AlignVT, Hi, Lo,
-                                   DAG.getConstant(Rotation * Scale, DL,
-                                                   MVT::i8)));
+    return DAG.getBitcast(
+        VT, DAG.getNode(X86ISD::PALIGNR, DL, AlignVT, Hi, Lo,
+                        DAG.getConstant(Rotation * Scale, DL, MVT::i8)));
    }
  
    assert(VT.getSizeInBits() == 128 &&
@@ -6671,15 +6799,15 @@ static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1,
    int HiByteShift = Rotation * Scale;
  
    // Cast the inputs to v2i64 to match PSLLDQ/PSRLDQ.
-  Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Lo);
-  Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Hi);
+  Lo = DAG.getBitcast(MVT::v2i64, Lo);
+  Hi = DAG.getBitcast(MVT::v2i64, Hi);
  
    SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, Lo,
                                  DAG.getConstant(LoByteShift, DL, MVT::i8));
    SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, Hi,
                                  DAG.getConstant(HiByteShift, DL, MVT::i8));
-  return DAG.getNode(ISD::BITCAST, DL, VT,
-                     DAG.getNode(ISD::OR, DL, MVT::v2i64, LoShift, HiShift));
+  return DAG.getBitcast(VT,
+                        DAG.getNode(ISD::OR, DL, MVT::v2i64, LoShift, HiShift));
  }
  
  /// \brief Compute whether each element of a shuffle is zeroable.
@@ -6740,8 +6868,8 @@ static SDValue lowerVectorShuffleAsBitMask(SDLoc DL, MVT VT, SDValue V1,
    SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), DL,
                                      IntEltVT);
    if (EltVT.isFloatingPoint()) {
-    Zero = DAG.getNode(ISD::BITCAST, DL, EltVT, Zero);
-    AllOnes = DAG.getNode(ISD::BITCAST, DL, EltVT, AllOnes);
+    Zero = DAG.getBitcast(EltVT, Zero);
+    AllOnes = DAG.getBitcast(EltVT, AllOnes);
    }
    SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
    SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
@@ -6833,11 +6961,11 @@ static SDValue lowerVectorShuffleAsShift(SDLoc DL, MVT VT, SDValue V1,
      MVT ShiftVT = MVT::getVectorVT(ShiftSVT, Size / Scale);
      assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
             "Illegal integer vector type");
-    V = DAG.getNode(ISD::BITCAST, DL, ShiftVT, V);
+    V = DAG.getBitcast(ShiftVT, V);
  
      V = DAG.getNode(OpCode, DL, ShiftVT, V,
                      DAG.getConstant(ShiftAmt, DL, MVT::i8));
-    return DAG.getNode(ISD::BITCAST, DL, VT, V);
+    return DAG.getBitcast(VT, V);
    };
  
    // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
@@ -6858,6 +6986,136 @@ static SDValue lowerVectorShuffleAsShift(SDLoc DL, MVT VT, SDValue V1,
    return SDValue();
  }
  
+/// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
+static SDValue lowerVectorShuffleWithSSE4A(SDLoc DL, MVT VT, SDValue V1,
+                                           SDValue V2, ArrayRef<int> Mask,
+                                           SelectionDAG &DAG) {
+  SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+  assert(!Zeroable.all() && "Fully zeroable shuffle mask");
+
+  int Size = Mask.size();
+  int HalfSize = Size / 2;
+  assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
+
+  // Upper half must be undefined.
+  if (!isUndefInRange(Mask, HalfSize, HalfSize))
+    return SDValue();
+
+  // EXTRQ: Extract Len elements from lower half of source, starting at Idx.
+  // Remainder of lower half result is zero and upper half is all undef.
+  auto LowerAsEXTRQ = [&]() {
+    // Determine the extraction length from the part of the
+    // lower half that isn't zeroable.
+    int Len = HalfSize;
+    for (; Len >= 0; --Len)
+      if (!Zeroable[Len - 1])
+        break;
+    assert(Len > 0 && "Zeroable shuffle mask");
+
+    // Attempt to match first Len sequential elements from the lower half.
+    SDValue Src;
+    int Idx = -1;
+    for (int i = 0; i != Len; ++i) {
+      int M = Mask[i];
+      if (M < 0)
+        continue;
+      SDValue &V = (M < Size ? V1 : V2);
+      M = M % Size;
+
+      // All mask elements must be in the lower half.
+      if (M > HalfSize)
+        return SDValue();
+
+      if (Idx < 0 || (Src == V && Idx == (M - i))) {
+        Src = V;
+        Idx = M - i;
+        continue;
+      }
+      return SDValue();
+    }
+
+    if (Idx < 0)
+      return SDValue();
+
+    assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
+    int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
+    int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
+    return DAG.getNode(X86ISD::EXTRQI, DL, VT, Src,
+                       DAG.getConstant(BitLen, DL, MVT::i8),
+                       DAG.getConstant(BitIdx, DL, MVT::i8));
+  };
+
+  if (SDValue ExtrQ = LowerAsEXTRQ())
+    return ExtrQ;
+
+  // INSERTQ: Extract lowest Len elements from lower half of second source and
+  // insert over first source, starting at Idx.
+  // { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
+  auto LowerAsInsertQ = [&]() {
+    for (int Idx = 0; Idx != HalfSize; ++Idx) {
+      SDValue Base;
+
+      // Attempt to match first source from mask before insertion point.
+      if (isUndefInRange(Mask, 0, Idx)) {
+        /* EMPTY */
+      } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
+        Base = V1;
+      } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
+        Base = V2;
+      } else {
+        continue;
+      }
+
+      // Extend the extraction length looking to match both the insertion of
+      // the second source and the remaining elements of the first.
+      for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
+        SDValue Insert;
+        int Len = Hi - Idx;
+
+        // Match insertion.
+        if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
+          Insert = V1;
+        } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
+          Insert = V2;
+        } else {
+          continue;
+        }
+
+        // Match the remaining elements of the lower half.
+        if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
+          /* EMPTY */
+        } else if ((!Base || (Base == V1)) &&
+                   isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
+          Base = V1;
+        } else if ((!Base || (Base == V2)) &&
+                   isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
+                                              Size + Hi)) {
+          Base = V2;
+        } else {
+          continue;
+        }
+
+        // We may not have a base (first source) - this can safely be undefined.
+        if (!Base)
+          Base = DAG.getUNDEF(VT);
+
+        int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
+        int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
+        return DAG.getNode(X86ISD::INSERTQI, DL, VT, Base, Insert,
+                           DAG.getConstant(BitLen, DL, MVT::i8),
+                           DAG.getConstant(BitIdx, DL, MVT::i8));
+      }
+    }
+
+    return SDValue();
+  };
+
+  if (SDValue InsertQ = LowerAsInsertQ())
+    return InsertQ;
+
+  return SDValue();
+}
+
  /// \brief Lower a vector shuffle as a zero or any extension.
  ///
  /// Given a specific number of elements, element bit width, and extension
@@ -6865,7 +7123,7 @@ static SDValue lowerVectorShuffleAsShift(SDLoc DL, MVT VT, SDValue V1,
  /// features of the subtarget.
  static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
      SDLoc DL, MVT VT, int Scale, bool AnyExt, SDValue InputV,
-    const X86Subtarget *Subtarget, SelectionDAG &DAG) {
+    ArrayRef<int> Mask, const X86Subtarget *Subtarget, SelectionDAG &DAG) {
    assert(Scale > 1 && "Need a scale to extend.");
    int NumElements = VT.getVectorNumElements();
    int EltBits = VT.getScalarSizeInBits();
@@ -6878,31 +7136,50 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
    if (Subtarget->hasSSE41()) {
      MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
                                   NumElements / Scale);
-    return DAG.getNode(ISD::BITCAST, DL, VT,
-                       DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV));
+    return DAG.getBitcast(VT, DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV));
    }
  
    // For any extends we can cheat for larger element sizes and use shuffle
    // instructions that can fold with a load and/or copy.
    if (AnyExt && EltBits == 32) {
      int PSHUFDMask[4] = {0, -1, 1, -1};
-    return DAG.getNode(
-        ISD::BITCAST, DL, VT,
-        DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
-                    DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, InputV),
-                    getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
+    return DAG.getBitcast(
+        VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
+                        DAG.getBitcast(MVT::v4i32, InputV),
+                        getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
    }
    if (AnyExt && EltBits == 16 && Scale > 2) {
      int PSHUFDMask[4] = {0, -1, 0, -1};
      InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
-                         DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, InputV),
+                         DAG.getBitcast(MVT::v4i32, InputV),
                           getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
      int PSHUFHWMask[4] = {1, -1, -1, -1};
-    return DAG.getNode(
-        ISD::BITCAST, DL, VT,
-        DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16,
-                    DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, InputV),
-                    getV4X86ShuffleImm8ForMask(PSHUFHWMask, DL, DAG)));
+    return DAG.getBitcast(
+        VT, DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16,
+                        DAG.getBitcast(MVT::v8i16, InputV),
+                        getV4X86ShuffleImm8ForMask(PSHUFHWMask, DL, DAG)));
+  }
+
+  // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
+  // to 64-bits.
+  if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget->hasSSE4A()) {
+    assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
+    assert(VT.getSizeInBits() == 128 && "Unexpected vector width!");
+
+    SDValue Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64,
+                             DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
+                                         DAG.getConstant(EltBits, DL, MVT::i8),
+                                         DAG.getConstant(0, DL, MVT::i8)));
+    if (isUndefInRange(Mask, NumElements/2, NumElements/2))
+      return DAG.getNode(ISD::BITCAST, DL, VT, Lo);
+
+    SDValue Hi =
+        DAG.getNode(ISD::BITCAST, DL, MVT::v2i64,
+                    DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
+                                DAG.getConstant(EltBits, DL, MVT::i8),
+                                DAG.getConstant(EltBits, DL, MVT::i8)));
+    return DAG.getNode(ISD::BITCAST, DL, VT,
+                       DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
    }
  
    // If this would require more than 2 unpack instructions to expand, use
@@ -6914,11 +7191,11 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
      for (int i = 0; i < 16; ++i)
        PSHUFBMask[i] =
            DAG.getConstant((i % Scale == 0) ? i / Scale : 0x80, DL, MVT::i8);
-    InputV = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, InputV);
-    return DAG.getNode(ISD::BITCAST, DL, VT,
-                       DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
-                                   DAG.getNode(ISD::BUILD_VECTOR, DL,
-                                               MVT::v16i8, PSHUFBMask)));
+    InputV = DAG.getBitcast(MVT::v16i8, InputV);
+    return DAG.getBitcast(VT,
+                          DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
+                                      DAG.getNode(ISD::BUILD_VECTOR, DL,
+                                                  MVT::v16i8, PSHUFBMask)));
    }
  
    // Otherwise emit a sequence of unpacks.
@@ -6926,13 +7203,13 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
      MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
      SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
                           : getZeroVector(InputVT, Subtarget, DAG, DL);
-    InputV = DAG.getNode(ISD::BITCAST, DL, InputVT, InputV);
+    InputV = DAG.getBitcast(InputVT, InputV);
      InputV = DAG.getNode(X86ISD::UNPCKL, DL, InputVT, InputV, Ext);
      Scale /= 2;
      EltBits *= 2;
      NumElements /= 2;
    } while (Scale > 1);
-  return DAG.getNode(ISD::BITCAST, DL, VT, InputV);
+  return DAG.getBitcast(VT, InputV);
  }
  
  /// \brief Try to lower a vector shuffle as a zero extension on any microarch.
@@ -6995,7 +7272,7 @@ static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
        return SDValue();
  
      return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
-        DL, VT, Scale, AnyExt, InputV, Subtarget, DAG);
+        DL, VT, Scale, AnyExt, InputV, Mask, Subtarget, DAG);
    };
  
    // The widest scale possible for extending is to a 64-bit integer.
@@ -7030,9 +7307,9 @@ static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
    };
  
    if (SDValue V = CanZExtLowHalf()) {
-    V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, V);
+    V = DAG.getBitcast(MVT::v2i64, V);
      V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
-    return DAG.getNode(ISD::BITCAST, DL, VT, V);
+    return DAG.getBitcast(VT, V);
    }
  
    // No viable ext lowering found.
@@ -7106,7 +7383,7 @@ static SDValue lowerVectorShuffleAsElementInsertion(
    if (SDValue V2S = getScalarValueForVectorElement(
            V2, Mask[V2Index] - Mask.size(), DAG)) {
      // We need to zext the scalar if it is smaller than an i32.
-    V2S = DAG.getNode(ISD::BITCAST, DL, EltVT, V2S);
+    V2S = DAG.getBitcast(EltVT, V2S);
      if (EltVT == MVT::i8 || EltVT == MVT::i16) {
        // Using zext to expand a narrow element won't work for non-zero
        // insertions.
@@ -7155,7 +7432,7 @@ static SDValue lowerVectorShuffleAsElementInsertion(
  
    V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
    if (ExtVT != VT)
-    V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2);
+    V2 = DAG.getBitcast(VT, V2);
  
    if (V2Index != 0) {
      // If we have 4 or fewer lanes we can cheaply shuffle the element into
@@ -7167,13 +7444,13 @@ static SDValue lowerVectorShuffleAsElementInsertion(
        V2Shuffle[V2Index] = 0;
        V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
      } else {
-      V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, V2);
+      V2 = DAG.getBitcast(MVT::v2i64, V2);
        V2 = DAG.getNode(
            X86ISD::VSHLDQ, DL, MVT::v2i64, V2,
-          DAG.getConstant(
-              V2Index * EltVT.getSizeInBits()/8, DL,
-              DAG.getTargetLoweringInfo().getScalarShiftAmountTy(MVT::v2i64)));
-      V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2);
+          DAG.getConstant(V2Index * EltVT.getSizeInBits() / 8, DL,
+                          DAG.getTargetLoweringInfo().getScalarShiftAmountTy(
+                              DAG.getDataLayout(), VT)));
+      V2 = DAG.getBitcast(VT, V2);
      }
    }
    return V2;
@@ -7396,13 +7673,13 @@ static SDValue lowerVectorShuffleAsUnpack(SDLoc DL, MVT VT, SDValue V1,
      V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
  
      // Cast the inputs to the type we will use to unpack them.
-    V1 = DAG.getNode(ISD::BITCAST, DL, UnpackVT, V1);
-    V2 = DAG.getNode(ISD::BITCAST, DL, UnpackVT, V2);
+    V1 = DAG.getBitcast(UnpackVT, V1);
+    V2 = DAG.getBitcast(UnpackVT, V2);
  
      // Unpack the inputs and cast the result back to the desired type.
-    return DAG.getNode(ISD::BITCAST, DL, VT,
-                       DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH,
-                                   DL, UnpackVT, V1, V2));
+    return DAG.getBitcast(
+        VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
+                        UnpackVT, V1, V2));
    };
  
    // We try each unpack from the largest to the smallest to try and find one
@@ -7558,12 +7835,12 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
      // Straight shuffle of a single input vector. For everything from SSE2
      // onward this has a single fast instruction with no scary immediates.
      // We have to map the mask as it is actually a v4i32 shuffle instruction.
-    V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V1);
+    V1 = DAG.getBitcast(MVT::v4i32, V1);
      int WidenedMask[4] = {
          std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
          std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
-    return DAG.getNode(
-        ISD::BITCAST, DL, MVT::v2i64,
+    return DAG.getBitcast(
+        MVT::v2i64,
          DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
                      getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
    }
@@ -7584,12 +7861,12 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
    };
    if (SDValue V1Pack = GetPackNode(V1))
      if (SDValue V2Pack = GetPackNode(V2))
-      return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64,
-                         DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8,
-                                     Mask[0] == 0 ? V1Pack.getOperand(0)
-                                                  : V1Pack.getOperand(1),
-                                     Mask[1] == 2 ? V2Pack.getOperand(0)
-                                                  : V2Pack.getOperand(1)));
+      return DAG.getBitcast(MVT::v2i64,
+                            DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8,
+                                        Mask[0] == 0 ? V1Pack.getOperand(0)
+                                                     : V1Pack.getOperand(1),
+                                        Mask[1] == 2 ? V2Pack.getOperand(0)
+                                                     : V2Pack.getOperand(1)));
  
    // Try to use shift instructions.
    if (SDValue Shift =
@@ -7639,10 +7916,10 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
    // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
    // However, all the alternatives are still more cycles and newer chips don't
    // have this problem. It would be really nice if x86 had better shuffles here.
-  V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, V1);
-  V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, V2);
-  return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64,
-                     DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
+  V1 = DAG.getBitcast(MVT::v2f64, V1);
+  V2 = DAG.getBitcast(MVT::v2f64, V2);
+  return DAG.getBitcast(MVT::v2i64,
+                        DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
  }
  
  /// \brief Test whether this can be lowered with a single SHUFPS instruction.
@@ -7941,11 +8218,10 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
    // up the inputs, bypassing domain shift penalties that we would encur if we
    // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
    // relevant.
-  return DAG.getNode(ISD::BITCAST, DL, MVT::v4i32,
-                     DAG.getVectorShuffle(
-                         MVT::v4f32, DL,
-                         DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, V1),
-                         DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, V2), Mask));
+  return DAG.getBitcast(
+      MVT::v4i32,
+      DAG.getVectorShuffle(MVT::v4f32, DL, DAG.getBitcast(MVT::v4f32, V1),
+                           DAG.getBitcast(MVT::v4f32, V2), Mask));
  }
  
  /// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
@@ -8123,11 +8399,10 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
      int PSHUFDMask[] = {0, 1, 2, 3};
      PSHUFDMask[ADWord] = BDWord;
      PSHUFDMask[BDWord] = ADWord;
-    V = DAG.getNode(ISD::BITCAST, DL, VT,
-                    DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT,
-                                DAG.getNode(ISD::BITCAST, DL, PSHUFDVT, V),
-                                getV4X86ShuffleImm8ForMask(PSHUFDMask, DL,
-                                                           DAG)));
+    V = DAG.getBitcast(
+        VT,
+        DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
+                    getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
  
      // Adjust the mask to match the new locations of A and B.
      for (int &M : Mask)
@@ -8368,11 +8643,10 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
      V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
                      getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
    if (!isNoopShuffleMask(PSHUFDMask))
-    V = DAG.getNode(ISD::BITCAST, DL, VT,
-                    DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT,
-                                DAG.getNode(ISD::BITCAST, DL, PSHUFDVT, V),
-                                getV4X86ShuffleImm8ForMask(PSHUFDMask, DL,
-                                                           DAG)));
+    V = DAG.getBitcast(
+        VT,
+        DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
+                    getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
  
    // At this point, each half should contain all its inputs, and we can then
    // just shuffle them into their final position.
@@ -8433,11 +8707,11 @@ static SDValue lowerVectorShuffleAsPSHUFB(SDLoc DL, MVT VT, SDValue V1,
  
    if (V1InUse)
      V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
-                     DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, V1),
+                     DAG.getBitcast(MVT::v16i8, V1),
                       DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V1Mask));
    if (V2InUse)
      V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
-                     DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, V2),
+                     DAG.getBitcast(MVT::v16i8, V2),
                       DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V2Mask));
  
    // If we need shuffled inputs from both, blend the two.
@@ -8448,7 +8722,7 @@ static SDValue lowerVectorShuffleAsPSHUFB(SDLoc DL, MVT VT, SDValue V1,
      V = V1InUse ? V1 : V2;
  
    // Cast the result back to the correct type.
-  return DAG.getNode(ISD::BITCAST, DL, VT, V);
+  return DAG.getBitcast(VT, V);
  }
  
  /// \brief Generic lowering of 8-lane i16 shuffles.
@@ -8525,6 +8799,11 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
            lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, DAG))
      return Shift;
  
+  // See if we can use SSE4A Extraction / Insertion.
+  if (Subtarget->hasSSE4A())
+    if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask, DAG))
+      return V;
+
    // There are special ways we can lower some single-element blends.
    if (NumV2Inputs == 1)
      if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v8i16, V1, V2,
@@ -8677,6 +8956,11 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
            DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
      return ZExt;
  
+  // See if we can use SSE4A Extraction / Insertion.
+  if (Subtarget->hasSSE4A())
+    if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask, DAG))
+      return V;
+
    int NumV2Elements =
        std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 16; });
  
@@ -8749,10 +9033,9 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
          // Update the lane map based on the mapping we ended up with.
          LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
        }
-      V1 = DAG.getNode(
-          ISD::BITCAST, DL, MVT::v16i8,
-          DAG.getVectorShuffle(MVT::v8i16, DL,
-                               DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1),
+      V1 = DAG.getBitcast(
+          MVT::v16i8,
+          DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
                                 DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
  
        // Unpack the bytes to form the i16s that will be shuffled into place.
@@ -8770,10 +9053,9 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
              assert(PostDupI16Shuffle[i / 2] == MappedMask &&
                     "Conflicting entrties in the original shuffle!");
          }
-      return DAG.getNode(
-          ISD::BITCAST, DL, MVT::v16i8,
-          DAG.getVectorShuffle(MVT::v8i16, DL,
-                               DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1),
+      return DAG.getBitcast(
+          MVT::v16i8,
+          DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
                                 DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
      };
      if (SDValue V = tryToWidenViaDuplication())
@@ -8866,19 +9148,18 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
      // We use the mask type to pick which bytes are preserved based on how many
      // elements are dropped.
      MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 };
-    SDValue ByteClearMask =
-        DAG.getNode(ISD::BITCAST, DL, MVT::v16i8,
-                    DAG.getConstant(0xFF, DL, MaskVTs[NumEvenDrops - 1]));
+    SDValue ByteClearMask = DAG.getBitcast(
+        MVT::v16i8, DAG.getConstant(0xFF, DL, MaskVTs[NumEvenDrops - 1]));
      V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);
      if (!IsSingleInput)
        V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);
  
      // Now pack things back together.
-    V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1);
-    V2 = IsSingleInput ? V1 : DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V2);
+    V1 = DAG.getBitcast(MVT::v8i16, V1);
+    V2 = IsSingleInput ? V1 : DAG.getBitcast(MVT::v8i16, V2);
      SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2);
      for (int i = 1; i < NumEvenDrops; ++i) {
-      Result = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, Result);
+      Result = DAG.getBitcast(MVT::v8i16, Result);
        Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
      }
  
@@ -8912,7 +9193,7 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
        std::none_of(std::begin(HiBlendMask), std::end(HiBlendMask),
                     [](int M) { return M >= 0 && M % 2 == 1; })) {
      // Use a mask to drop the high bytes.
-    VLoHalf = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V);
+    VLoHalf = DAG.getBitcast(MVT::v8i16, V);
      VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
                       DAG.getConstant(0x00FF, DL, MVT::v8i16));
  
@@ -8929,10 +9210,10 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
    } else {
      // Otherwise just unpack the low half of V into VLoHalf and the high half into
      // VHiHalf so that we can blend them as i16s.
-    VLoHalf = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
-                     DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
-    VHiHalf = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
-                     DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
+    VLoHalf = DAG.getBitcast(
+        MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
+    VHiHalf = DAG.getBitcast(
+        MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
    }
  
    SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
@@ -9073,8 +9354,8 @@ static SDValue splitAndLowerVectorShuffle(SDLoc DL, MVT VT, SDValue V1,
        LoV = DAG.getNode(ISD::BUILD_VECTOR, DL, OrigSplitVT, LoOps);
        HiV = DAG.getNode(ISD::BUILD_VECTOR, DL, OrigSplitVT, HiOps);
      }
-    return std::make_pair(DAG.getNode(ISD::BITCAST, DL, SplitVT, LoV),
-                          DAG.getNode(ISD::BITCAST, DL, SplitVT, HiV));
+    return std::make_pair(DAG.getBitcast(SplitVT, LoV),
+                          DAG.getBitcast(SplitVT, HiV));
    };
  
    SDValue LoV1, HiV1, LoV2, HiV2;
@@ -9407,12 +9688,12 @@ static SDValue lowerVectorShuffleByMerging128BitLanes(
        LaneMask[2 * i + 1] = 2*Lanes[i] + 1;
      }
  
-  V1 = DAG.getNode(ISD::BITCAST, DL, LaneVT, V1);
-  V2 = DAG.getNode(ISD::BITCAST, DL, LaneVT, V2);
+  V1 = DAG.getBitcast(LaneVT, V1);
+  V2 = DAG.getBitcast(LaneVT, V2);
    SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask);
  
    // Cast it back to the type we actually want.
-  LaneShuffle = DAG.getNode(ISD::BITCAST, DL, VT, LaneShuffle);
+  LaneShuffle = DAG.getBitcast(VT, LaneShuffle);
  
    // Now do a simple shuffle that isn't lane crossing.
    SmallVector<int, 8> NewMask;
@@ -9441,6 +9722,37 @@ static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
    return true;
  }
  
+static SDValue lowerVectorShuffleWithSHUFPD(SDLoc DL, MVT VT,
+                                            ArrayRef<int> Mask, SDValue V1,
+                                            SDValue V2, SelectionDAG &DAG) {
+
+  // Mask for V8F64: 0/1,  8/9,  2/3,  10/11, 4/5, ..
+  // Mask for V4F64; 0/1,  4/5,  2/3,  6/7..
+  assert(VT.getScalarSizeInBits() == 64 && "Unexpected data type for VSHUFPD");
+  int NumElts = VT.getVectorNumElements();
+  bool ShufpdMask = true;
+  bool CommutableMask = true;
+  unsigned Immediate = 0;
+  for (int i = 0; i < NumElts; ++i) {
+    if (Mask[i] < 0)
+      continue;
+    int Val = (i & 6) + NumElts * (i & 1);
+    int CommutVal = (i & 0xe) + NumElts * ((i & 1)^1);
+    if (Mask[i] < Val ||  Mask[i] > Val + 1)
+      ShufpdMask = false;
+    if (Mask[i] < CommutVal ||  Mask[i] > CommutVal + 1)
+      CommutableMask = false;
+    Immediate |= (Mask[i] % 2) << i;
+  }
+  if (ShufpdMask)
+    return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
+                       DAG.getConstant(Immediate, DL, MVT::i8));
+  if (CommutableMask)
+    return DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
+                       DAG.getConstant(Immediate, DL, MVT::i8));
+  return SDValue();
+}
+
  /// \brief Handle lowering of 4-lane 64-bit floating point shuffles.
  ///
  /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
@@ -9505,24 +9817,9 @@ static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
      return Blend;
  
    // Check if the blend happens to exactly fit that of SHUFPD.
-  if ((Mask[0] == -1 || Mask[0] < 2) &&
-      (Mask[1] == -1 || (Mask[1] >= 4 && Mask[1] < 6)) &&
-      (Mask[2] == -1 || (Mask[2] >= 2 && Mask[2] < 4)) &&
-      (Mask[3] == -1 || Mask[3] >= 6)) {
-    unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 5) << 1) |
-                          ((Mask[2] == 3) << 2) | ((Mask[3] == 7) << 3);
-    return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f64, V1, V2,
-                       DAG.getConstant(SHUFPDMask, DL, MVT::i8));
-  }
-  if ((Mask[0] == -1 || (Mask[0] >= 4 && Mask[0] < 6)) &&
-      (Mask[1] == -1 || Mask[1] < 2) &&
-      (Mask[2] == -1 || Mask[2] >= 6) &&
-      (Mask[3] == -1 || (Mask[3] >= 2 && Mask[3] < 4))) {
-    unsigned SHUFPDMask = (Mask[0] == 5) | ((Mask[1] == 1) << 1) |
-                          ((Mask[2] == 7) << 2) | ((Mask[3] == 3) << 3);
-    return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f64, V2, V1,
-                       DAG.getConstant(SHUFPDMask, DL, MVT::i8));
-  }
+  if (SDValue Op =
+      lowerVectorShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG))
+    return Op;
  
    // Try to simplify this by merging 128-bit lanes to enable a lane-based
    // shuffle. However, if we have AVX2 and either inputs are already in place,
@@ -9584,10 +9881,10 @@ static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
            PSHUFDMask[2 * i] = 2 * RepeatedMask[i];
            PSHUFDMask[2 * i + 1] = 2 * RepeatedMask[i] + 1;
          }
-      return DAG.getNode(
-          ISD::BITCAST, DL, MVT::v4i64,
+      return DAG.getBitcast(
+          MVT::v4i64,
            DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
-                      DAG.getNode(ISD::BITCAST, DL, MVT::v8i32, V1),
+                      DAG.getBitcast(MVT::v8i32, V1),
                        getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
      }
    }
@@ -9700,11 +9997,11 @@ static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
            DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask));
  
      if (Subtarget->hasAVX2())
-      return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32,
-                         DAG.getNode(ISD::BITCAST, DL, MVT::v8f32,
-                                     DAG.getNode(ISD::BUILD_VECTOR, DL,
+      return DAG.getNode(
+          X86ISD::VPERMV, DL, MVT::v8f32,
+          DAG.getBitcast(MVT::v8f32, DAG.getNode(ISD::BUILD_VECTOR, DL,
                                                   MVT::v8i32, VPermMask)),
-                         V1);
+          V1);
  
      // Otherwise, fall back.
      return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
@@ -9894,12 +10191,11 @@ static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
        PSHUFBMask[2 * i] = DAG.getConstant(2 * M, DL, MVT::i8);
        PSHUFBMask[2 * i + 1] = DAG.getConstant(2 * M + 1, DL, MVT::i8);
      }
-    return DAG.getNode(
-        ISD::BITCAST, DL, MVT::v16i16,
-        DAG.getNode(
-            X86ISD::PSHUFB, DL, MVT::v32i8,
-            DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V1),
-            DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask)));
+    return DAG.getBitcast(MVT::v16i16,
+                          DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8,
+                                      DAG.getBitcast(MVT::v32i8, V1),
+                                      DAG.getNode(ISD::BUILD_VECTOR, DL,
+                                                  MVT::v32i8, PSHUFBMask)));
    }
  
    // Try to simplify this by merging 128-bit lanes to enable a lane-based
@@ -10039,10 +10335,9 @@ static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
  
      MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
                                  VT.getVectorNumElements());
-    V1 = DAG.getNode(ISD::BITCAST, DL, FpVT, V1);
-    V2 = DAG.getNode(ISD::BITCAST, DL, FpVT, V2);
-    return DAG.getNode(ISD::BITCAST, DL, VT,
-                       DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
+    V1 = DAG.getBitcast(FpVT, V1);
+    V2 = DAG.getBitcast(FpVT, V2);
+    return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
    }
  
    switch (VT.SimpleTy) {
@@ -10311,10 +10606,10 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
      // Make sure that the new vector type is legal. For example, v2f64 isn't
      // legal on SSE1.
      if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
-      V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, V1);
-      V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, V2);
-      return DAG.getNode(ISD::BITCAST, dl, VT,
-                         DAG.getVectorShuffle(NewVT, dl, V1, V2, WidenedMask));
+      V1 = DAG.getBitcast(NewVT, V1);
+      V2 = DAG.getBitcast(NewVT, V2);
+      return DAG.getBitcast(
+          VT, DAG.getVectorShuffle(NewVT, dl, V1, V2, WidenedMask));
      }
    }
  
@@ -10509,12 +10804,11 @@ static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
      unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
      // If Idx is 0, it's cheaper to do a move instead of a pextrw.
      if (Idx == 0)
-      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
-                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
-                                     DAG.getNode(ISD::BITCAST, dl,
-                                                 MVT::v4i32,
-                                                 Op.getOperand(0)),
-                                     Op.getOperand(1)));
+      return DAG.getNode(
+          ISD::TRUNCATE, dl, MVT::i16,
+          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
+                      DAG.getBitcast(MVT::v4i32, Op.getOperand(0)),
+                      Op.getOperand(1)));
      SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
                                    Op.getOperand(0), Op.getOperand(1));
      SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
@@ -10538,10 +10832,9 @@ static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
           User->getValueType(0) != MVT::i32))
        return SDValue();
      SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
-                                  DAG.getNode(ISD::BITCAST, dl, MVT::v4i32,
-                                              Op.getOperand(0)),
-                                              Op.getOperand(1));
-    return DAG.getNode(ISD::BITCAST, dl, MVT::f32, Extract);
+                                  DAG.getBitcast(MVT::v4i32, Op.getOperand(0)),
+                                  Op.getOperand(1));
+    return DAG.getBitcast(MVT::f32, Extract);
    }
  
    if (VT == MVT::i32 || VT == MVT::i64) {
@@ -10611,12 +10904,13 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
                                      MaskEltVT.getSizeInBits());
  
        Idx = DAG.getZExtOrTrunc(Idx, dl, MaskEltVT);
+      auto PtrVT = getPointerTy(DAG.getDataLayout());
        SDValue Mask = DAG.getNode(X86ISD::VINSERT, dl, MaskVT,
-                                getZeroVector(MaskVT, Subtarget, DAG, dl),
-                                Idx, DAG.getConstant(0, dl, getPointerTy()));
+                                 getZeroVector(MaskVT, Subtarget, DAG, dl), Idx,
+                                 DAG.getConstant(0, dl, PtrVT));
        SDValue Perm = DAG.getNode(X86ISD::VPERMV, dl, VecVT, Mask, Vec);
-      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(),
-                        Perm, DAG.getConstant(0, dl, getPointerTy()));
+      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Perm,
+                         DAG.getConstant(0, dl, PtrVT));
      }
      return SDValue();
    }
@@ -10641,11 +10935,9 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
  
    assert(VecVT.is128BitVector() && "Unexpected vector length");
  
-  if (Subtarget->hasSSE41()) {
-    SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG);
-    if (Res.getNode())
+  if (Subtarget->hasSSE41())
+    if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
        return Res;
-  }
  
    MVT VT = Op.getSimpleValueType();
    // TODO: handle v16i8.
@@ -10655,8 +10947,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
      if (Idx == 0)
        return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
                           DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
-                                     DAG.getNode(ISD::BITCAST, dl,
-                                                 MVT::v4i32, Vec),
+                                     DAG.getBitcast(MVT::v4i32, Vec),
                                       Op.getOperand(1)));
      // Transform it so it match pextrw which produces a 32-bit result.
      MVT EltVT = MVT::i32;
@@ -10877,8 +11168,8 @@ static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
  
    SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
    assert(OpVT.is128BitVector() && "Expected an SSE type!");
-  return DAG.getNode(ISD::BITCAST, dl, OpVT,
-                     DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,AnyExt));
+  return DAG.getBitcast(
+      OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
  }
  
  // Lower a node with an EXTRACT_SUBVECTOR opcode.  This may result in
@@ -10939,9 +11230,8 @@ static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
      if (auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2))) {
        if (Idx2->getZExtValue() == 0) {
          SDValue Ops[] = { SubVec2, SubVec };
-        SDValue LD = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false);
-        if (LD.getNode())
-          return LD;
+        if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false))
+          return Ld;
        }
      }
    }
@@ -11011,17 +11301,16 @@ X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
    else if (Subtarget->isPICStyleStubPIC())
      OpFlag = X86II::MO_PIC_BASE_OFFSET;
  
-  SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(),
-                                             CP->getAlignment(),
-                                             CP->getOffset(), OpFlag);
+  auto PtrVT = getPointerTy(DAG.getDataLayout());
+  SDValue Result = DAG.getTargetConstantPool(
+      CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), OpFlag);
    SDLoc DL(CP);
-  Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
+  Result = DAG.getNode(WrapperKind, DL, PtrVT, Result);
    // With PIC, the address is actually $g + Offset.
    if (OpFlag) {
-    Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
-                         DAG.getNode(X86ISD::GlobalBaseReg,
-                                     SDLoc(), getPointerTy()),
-                         Result);
+    Result =
+        DAG.getNode(ISD::ADD, DL, PtrVT,
+                    DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
    }
  
    return Result;
@@ -11044,17 +11333,16 @@ SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
    else if (Subtarget->isPICStyleStubPIC())
      OpFlag = X86II::MO_PIC_BASE_OFFSET;
  
-  SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(),
-                                          OpFlag);
+  auto PtrVT = getPointerTy(DAG.getDataLayout());
+  SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
    SDLoc DL(JT);
-  Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
+  Result = DAG.getNode(WrapperKind, DL, PtrVT, Result);
  
    // With PIC, the address is actually $g + Offset.
    if (OpFlag)
-    Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
-                         DAG.getNode(X86ISD::GlobalBaseReg,
-                                     SDLoc(), getPointerTy()),
-                         Result);
+    Result =
+        DAG.getNode(ISD::ADD, DL, PtrVT,
+                    DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
  
    return Result;
  }
@@ -11082,24 +11370,24 @@ X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
      OpFlag = X86II::MO_DARWIN_NONLAZY;
    }
  
-  SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag);
+  auto PtrVT = getPointerTy(DAG.getDataLayout());
+  SDValue Result = DAG.getTargetExternalSymbol(Sym, PtrVT, OpFlag);
  
    SDLoc DL(Op);
-  Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
+  Result = DAG.getNode(WrapperKind, DL, PtrVT, Result);
  
    // With PIC, the address is actually $g + Offset.
    if (DAG.getTarget().getRelocationModel() == Reloc::PIC_ &&
        !Subtarget->is64Bit()) {
-    Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
-                         DAG.getNode(X86ISD::GlobalBaseReg,
-                                     SDLoc(), getPointerTy()),
-                         Result);
+    Result =
+        DAG.getNode(ISD::ADD, DL, PtrVT,
+                    DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
    }
  
    // For symbols that require a load from a stub to get the address, emit the
    // load.
    if (isGlobalStubReference(OpFlag))
-    Result = DAG.getLoad(getPointerTy(), DL, DAG.getEntryNode(), Result,
+    Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
                           MachinePointerInfo::getGOT(), false, false, false, 0);
  
    return Result;
@@ -11114,20 +11402,19 @@ X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
    const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
    int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
    SDLoc dl(Op);
-  SDValue Result = DAG.getTargetBlockAddress(BA, getPointerTy(), Offset,
-                                             OpFlags);
+  auto PtrVT = getPointerTy(DAG.getDataLayout());
+  SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
  
    if (Subtarget->isPICStyleRIPRel() &&
        (M == CodeModel::Small || M == CodeModel::Kernel))
-    Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
+    Result = DAG.getNode(X86ISD::WrapperRIP, dl, PtrVT, Result);
    else
-    Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
+    Result = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, Result);
  
    // With PIC, the address is actually $g + Offset.
    if (isGlobalRelativeToPICBase(OpFlags)) {
-    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
-                         DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
-                         Result);
+    Result = DAG.getNode(ISD::ADD, dl, PtrVT,
+                         DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
    }
  
    return Result;
@@ -11141,40 +11428,40 @@ X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, SDLoc dl,
    unsigned char OpFlags =
        Subtarget->ClassifyGlobalReference(GV, DAG.getTarget());
    CodeModel::Model M = DAG.getTarget().getCodeModel();
+  auto PtrVT = getPointerTy(DAG.getDataLayout());
    SDValue Result;
    if (OpFlags == X86II::MO_NO_FLAG &&
        X86::isOffsetSuitableForCodeModel(Offset, M)) {
      // A direct static reference to a global.
-    Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset);
+    Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset);
      Offset = 0;
    } else {
-    Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags);
+    Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, OpFlags);
    }
  
    if (Subtarget->isPICStyleRIPRel() &&
        (M == CodeModel::Small || M == CodeModel::Kernel))
-    Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
+    Result = DAG.getNode(X86ISD::WrapperRIP, dl, PtrVT, Result);
    else
-    Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
+    Result = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, Result);
  
    // With PIC, the address is actually $g + Offset.
    if (isGlobalRelativeToPICBase(OpFlags)) {
-    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
-                         DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
-                         Result);
+    Result = DAG.getNode(ISD::ADD, dl, PtrVT,
+                         DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
    }
  
    // For globals that require a load from a stub to get the address, emit the
    // load.
    if (isGlobalStubReference(OpFlags))
-    Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result,
+    Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
                           MachinePointerInfo::getGOT(), false, false, false, 0);
  
    // If there was a non-zero offset that we didn't fold, create an explicit
    // addition for it.
    if (Offset != 0)
-    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result,
-                         DAG.getConstant(Offset, dl, getPointerTy()));
+    Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
+                         DAG.getConstant(Offset, dl, PtrVT));
  
    return Result;
  }
@@ -11338,22 +11625,23 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
  
    GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
    const GlobalValue *GV = GA->getGlobal();
+  auto PtrVT = getPointerTy(DAG.getDataLayout());
  
    if (Subtarget->isTargetELF()) {
      TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
      switch (model) {
        case TLSModel::GeneralDynamic:
          if (Subtarget->is64Bit())
-          return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy());
-        return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy());
+          return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
+        return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
        case TLSModel::LocalDynamic:
-        return LowerToTLSLocalDynamicModel(GA, DAG, getPointerTy(),
+        return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT,
                                             Subtarget->is64Bit());
        case TLSModel::InitialExec:
        case TLSModel::LocalExec:
-        return LowerToTLSExecModel(
-            GA, DAG, getPointerTy(), model, Subtarget->is64Bit(),
-            DAG.getTarget().getRelocationModel() == Reloc::PIC_);
+        return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget->is64Bit(),
+                                   DAG.getTarget().getRelocationModel() ==
+                                       Reloc::PIC_);
      }
      llvm_unreachable("Unknown TLS model.");
    }
@@ -11376,13 +11664,12 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
      SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
                                                  GA->getValueType(0),
                                                  GA->getOffset(), OpFlag);
-    SDValue Offset = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
+    SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
  
      // With PIC32, the address is actually $g + Offset.
      if (PIC32)
-      Offset = DAG.getNode(ISD::ADD, DL, getPointerTy(),
-                           DAG.getNode(X86ISD::GlobalBaseReg,
-                                       SDLoc(), getPointerTy()),
+      Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
+                           DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
                             Offset);
  
      // Lowering the machine isd will make sure everything is in the right
@@ -11399,8 +11686,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
      // And our return value (tls address) is in the standard call return value
      // location.
      unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
-    return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy(),
-                              Chain.getValue(1));
+    return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
    }
  
    if (Subtarget->isTargetKnownWindowsMSVC() ||
@@ -11428,50 +11714,50 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
                                          : Type::getInt32PtrTy(*DAG.getContext(),
                                                                257));
  
-    SDValue TlsArray =
-        Subtarget->is64Bit()
-            ? DAG.getIntPtrConstant(0x58, dl)
-            : (Subtarget->isTargetWindowsGNU()
-                   ? DAG.getIntPtrConstant(0x2C, dl)
-                   : DAG.getExternalSymbol("_tls_array", getPointerTy()));
+    SDValue TlsArray = Subtarget->is64Bit()
+                           ? DAG.getIntPtrConstant(0x58, dl)
+                           : (Subtarget->isTargetWindowsGNU()
+                                  ? DAG.getIntPtrConstant(0x2C, dl)
+                                  : DAG.getExternalSymbol("_tls_array", PtrVT));
  
      SDValue ThreadPointer =
-        DAG.getLoad(getPointerTy(), dl, Chain, TlsArray,
-                    MachinePointerInfo(Ptr), false, false, false, 0);
+        DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr), false,
+                    false, false, 0);
  
      SDValue res;
      if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
        res = ThreadPointer;
      } else {
        // Load the _tls_index variable
-      SDValue IDX = DAG.getExternalSymbol("_tls_index", getPointerTy());
+      SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
        if (Subtarget->is64Bit())
-        IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, getPointerTy(), Chain, IDX,
+        IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
                               MachinePointerInfo(), MVT::i32, false, false,
                               false, 0);
        else
-        IDX = DAG.getLoad(getPointerTy(), dl, Chain, IDX, MachinePointerInfo(),
-                          false, false, false, 0);
+        IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo(), false,
+                          false, false, 0);
  
-      SDValue Scale = DAG.getConstant(Log2_64_Ceil(TD->getPointerSize()), dl,
-                                      getPointerTy());
-      IDX = DAG.getNode(ISD::SHL, dl, getPointerTy(), IDX, Scale);
+      auto &DL = DAG.getDataLayout();
+      SDValue Scale =
+          DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, PtrVT);
+      IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
  
-      res = DAG.getNode(ISD::ADD, dl, getPointerTy(), ThreadPointer, IDX);
+      res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
      }
  
-    res = DAG.getLoad(getPointerTy(), dl, Chain, res, MachinePointerInfo(),
-                      false, false, false, 0);
+    res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo(), false, false,
+                      false, 0);
  
      // Get the offset of start of .tls section
      SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
                                               GA->getValueType(0),
                                               GA->getOffset(), X86II::MO_SECREL);
-    SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), TGA);
+    SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
  
      // The address of the thread local variable is the add of the thread
      // pointer with the offset of the variable.
-    return DAG.getNode(ISD::ADD, dl, getPointerTy(), res, Offset);
+    return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
    }
  
    llvm_unreachable("TLS not implemented for this target.");
@@ -11533,15 +11819,21 @@ static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
  
  SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
                                             SelectionDAG &DAG) const {
-  MVT SrcVT = Op.getOperand(0).getSimpleValueType();
+  SDValue Src = Op.getOperand(0);
+  MVT SrcVT = Src.getSimpleValueType();
+  MVT VT = Op.getSimpleValueType();
    SDLoc dl(Op);
  
    if (SrcVT.isVector()) {
+    if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
+      return DAG.getNode(X86ISD::CVTDQ2PD, dl, VT,
+                         DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
+                         DAG.getUNDEF(SrcVT)));
+    }
      if (SrcVT.getVectorElementType() == MVT::i1) {
        MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
        return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
-                         DAG.getNode(ISD::SIGN_EXTEND, dl, IntegerVT,
-                                     Op.getOperand(0)));
+                         DAG.getNode(ISD::SIGN_EXTEND, dl, IntegerVT, Src));
      }
      return SDValue();
    }
@@ -11560,8 +11852,9 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
  
    unsigned Size = SrcVT.getSizeInBits()/8;
    MachineFunction &MF = DAG.getMachineFunction();
+  auto PtrVT = getPointerTy(MF.getDataLayout());
    int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false);
-  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
+  SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
    SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
                                 StackSlot,
                                 MachinePointerInfo::getFixedStack(SSFI),
@@ -11610,7 +11903,8 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
      MachineFunction &MF = DAG.getMachineFunction();
      unsigned SSFISize = Op.getValueType().getSizeInBits()/8;
      int SSFI = MF.getFrameInfo()->CreateStackObject(SSFISize, SSFISize, false);
-    SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
+    auto PtrVT = getPointerTy(MF.getDataLayout());
+    SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
      Tys = DAG.getVTList(MVT::Other);
      SDValue Ops[] = {
        Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
@@ -11652,7 +11946,8 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
    // Build some magic constants.
    static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
    Constant *C0 = ConstantDataVector::get(*Context, CV0);
-  SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16);
+  auto PtrVT = getPointerTy(DAG.getDataLayout());
+  SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, 16);
  
    SmallVector<Constant*,2> CV1;
    CV1.push_back(
@@ -11662,7 +11957,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
      ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
                                        APInt(64, 0x4530000000000000ULL))));
    Constant *C1 = ConstantVector::get(CV1);
-  SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16);
+  SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16);
  
    // Load the 64-bit value into an XMM register.
    SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
@@ -11670,14 +11965,13 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
    SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
                                MachinePointerInfo::getConstantPool(),
                                false, false, false, 16);
-  SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32,
-                              DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, XR1),
-                              CLod0);
+  SDValue Unpck1 =
+      getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
  
    SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
                                MachinePointerInfo::getConstantPool(),
                                false, false, false, 16);
-  SDValue XR2F = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Unpck1);
+  SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
    SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
    SDValue Result;
  
@@ -11685,12 +11979,11 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
      // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
      Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
    } else {
-    SDValue S2F = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Sub);
+    SDValue S2F = DAG.getBitcast(MVT::v4i32, Sub);
      SDValue Shuffle = getTargetShuffleNode(X86ISD::PSHUFD, dl, MVT::v4i32,
                                             S2F, 0x4E, DAG);
      Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
-                         DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Shuffle),
-                         Sub);
+                         DAG.getBitcast(MVT::v2f64, Shuffle), Sub);
    }
  
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
@@ -11713,20 +12006,19 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
    Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
  
    Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
-                     DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Load),
+                     DAG.getBitcast(MVT::v2f64, Load),
                       DAG.getIntPtrConstant(0, dl));
  
    // Or the load with the bias.
-  SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64,
-                           DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
-                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
-                                                   MVT::v2f64, Load)),
-                           DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
-                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
-                                                   MVT::v2f64, Bias)));
-  Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
-                   DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or),
-                   DAG.getIntPtrConstant(0, dl));
+  SDValue Or = DAG.getNode(
+      ISD::OR, dl, MVT::v2i64,
+      DAG.getBitcast(MVT::v2i64,
+                     DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Load)),
+      DAG.getBitcast(MVT::v2i64,
+                     DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
+  Or =
+      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
+                  DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
  
    // Subtract the bias.
    SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
@@ -11805,19 +12097,16 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
    if (Subtarget.hasSSE41()) {
      EVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
      //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
-    SDValue VecCstLowBitcast =
-        DAG.getNode(ISD::BITCAST, DL, VecI16VT, VecCstLow);
-    SDValue VecBitcast = DAG.getNode(ISD::BITCAST, DL, VecI16VT, V);
+    SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
+    SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
      // Low will be bitcasted right away, so do not bother bitcasting back to its
      // original type.
      Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
                        VecCstLowBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
      //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
      //                                 (uint4) 0x53000000, 0xaa);
-    SDValue VecCstHighBitcast =
-        DAG.getNode(ISD::BITCAST, DL, VecI16VT, VecCstHigh);
-    SDValue VecShiftBitcast =
-        DAG.getNode(ISD::BITCAST, DL, VecI16VT, HighShift);
+    SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
+    SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
      // High will be bitcasted right away, so do not bother bitcasting back to
      // its original type.
      High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
@@ -11843,11 +12132,11 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
                                     makeArrayRef(&CstFAddArray[0], NumElts));
  
    //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
-  SDValue HighBitcast = DAG.getNode(ISD::BITCAST, DL, VecFloatVT, High);
+  SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
    SDValue FHigh =
        DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd);
    //     return (float4) lo + fhi;
-  SDValue LowBitcast = DAG.getNode(ISD::BITCAST, DL, VecFloatVT, Low);
+  SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
    return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
  }
  
@@ -11884,6 +12173,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
                                             SelectionDAG &DAG) const {
    SDValue N0 = Op.getOperand(0);
    SDLoc dl(Op);
+  auto PtrVT = getPointerTy(DAG.getDataLayout());
  
    if (Op.getValueType().isVector())
      return lowerUINT_TO_FP_vec(Op, DAG);
@@ -11906,9 +12196,8 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
    // Make a 64-bit buffer, and use it to build an FILD.
    SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
    if (SrcVT == MVT::i32) {
-    SDValue WordOff = DAG.getConstant(4, dl, getPointerTy());
-    SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl,
-                                     getPointerTy(), StackSlot, WordOff);
+    SDValue WordOff = DAG.getConstant(4, dl, PtrVT);
+    SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl, PtrVT, StackSlot, WordOff);
      SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
                                    StackSlot, MachinePointerInfo(),
                                    false, false, 0);
@@ -11942,22 +12231,20 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
    APInt FF(32, 0x5F800000ULL);
  
    // Check whether the sign bit is set.
-  SDValue SignSet = DAG.getSetCC(dl,
-                                 getSetCCResultType(*DAG.getContext(), MVT::i64),
-                                 Op.getOperand(0),
-                                 DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
+  SDValue SignSet = DAG.getSetCC(
+      dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
+      Op.getOperand(0), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
  
    // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
    SDValue FudgePtr = DAG.getConstantPool(
-                             ConstantInt::get(*DAG.getContext(), FF.zext(64)),
-                                         getPointerTy());
+      ConstantInt::get(*DAG.getContext(), FF.zext(64)), PtrVT);
  
    // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
    SDValue Zero = DAG.getIntPtrConstant(0, dl);
    SDValue Four = DAG.getIntPtrConstant(4, dl);
    SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet,
                                 Zero, Four);
-  FudgePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(), FudgePtr, Offset);
+  FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
  
    // Load the value out, extending it from f32 to f80.
    // FIXME: Avoid the extend by constructing the right constant pool?
@@ -11976,6 +12263,7 @@ X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
    SDLoc DL(Op);
  
    EVT DstTy = Op.getValueType();
+  auto PtrVT = getPointerTy(DAG.getDataLayout());
  
    if (!IsSigned && !isIntegerTypeFTOL(DstTy)) {
      assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
@@ -12000,7 +12288,7 @@ X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
    MachineFunction &MF = DAG.getMachineFunction();
    unsigned MemSize = DstTy.getSizeInBits()/8;
    int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
-  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
+  SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
  
    unsigned Opc;
    if (!IsSigned && isIntegerTypeFTOL(DstTy))
@@ -12034,7 +12322,7 @@ X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
      Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO);
      Chain = Value.getValue(1);
      SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
-    StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
+    StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
    }
  
    MachineMemOperand *MMO =
@@ -12103,8 +12391,8 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
    MVT HVT = MVT::getVectorVT(VT.getVectorElementType(),
                               VT.getVectorNumElements()/2);
  
-  OpLo = DAG.getNode(ISD::BITCAST, dl, HVT, OpLo);
-  OpHi = DAG.getNode(ISD::BITCAST, dl, HVT, OpHi);
+  OpLo = DAG.getBitcast(HVT, OpLo);
+  OpHi = DAG.getBitcast(HVT, OpHi);
  
    return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
  }
@@ -12137,11 +12425,9 @@ static  SDValue LowerZERO_EXTEND_AVX512(SDValue Op,
  
  static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
                                 SelectionDAG &DAG) {
-  if (Subtarget->hasFp256()) {
-    SDValue Res = LowerAVXExtend(Op, DAG, Subtarget);
-    if (Res.getNode())
+  if (Subtarget->hasFp256())
+    if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
        return Res;
-  }
  
    return SDValue();
  }
@@ -12156,11 +12442,9 @@ static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
    if (VT.is512BitVector() || SVT.getVectorElementType() == MVT::i1)
      return LowerZERO_EXTEND_AVX512(Op, Subtarget, DAG);
  
-  if (Subtarget->hasFp256()) {
-    SDValue Res = LowerAVXExtend(Op, DAG, Subtarget);
-    if (Res.getNode())
+  if (Subtarget->hasFp256())
+    if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
        return Res;
-  }
  
    assert(!VT.is256BitVector() || !SVT.is128BitVector() ||
           VT.getVectorNumElements() != SVT.getVectorNumElements());
@@ -12189,14 +12473,14 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
      if (InVT.is512BitVector() && InVT.getScalarSizeInBits() <= 16 &&
          Subtarget->hasBWI())
        return Op; // legal, will go to VPMOVB2M, VPMOVW2M
-    if ((InVT.is256BitVector() || InVT.is128BitVector()) 
+    if ((InVT.is256BitVector() || InVT.is128BitVector())
          && InVT.getScalarSizeInBits() <= 16 &&
          Subtarget->hasBWI() && Subtarget->hasVLX())
        return Op; // legal, will go to VPMOVB2M, VPMOVW2M
      if (InVT.is512BitVector() && InVT.getScalarSizeInBits() >= 32 &&
          Subtarget->hasDQI())
        return Op; // legal, will go to VPMOVD2M, VPMOVQ2M
-    if ((InVT.is256BitVector() || InVT.is128BitVector()) 
+    if ((InVT.is256BitVector() || InVT.is128BitVector())
          && InVT.getScalarSizeInBits() >= 32 &&
          Subtarget->hasDQI() && Subtarget->hasVLX())
        return Op; // legal, will go to VPMOVB2M, VPMOVQ2M
@@ -12224,7 +12508,7 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
      // On AVX2, v4i64 -> v4i32 becomes VPERMD.
      if (Subtarget->hasInt256()) {
        static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
-      In = DAG.getNode(ISD::BITCAST, DL, MVT::v8i32, In);
+      In = DAG.getBitcast(MVT::v8i32, In);
        In = DAG.getVectorShuffle(MVT::v8i32, DL, In, DAG.getUNDEF(MVT::v8i32),
                                  ShufMask);
        return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
@@ -12235,8 +12519,8 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
                                 DAG.getIntPtrConstant(0, DL));
      SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
                                 DAG.getIntPtrConstant(2, DL));
-    OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpLo);
-    OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpHi);
+    OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
+    OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
      static const int ShufMask[] = {0, 2, 4, 6};
      return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
    }
@@ -12244,7 +12528,7 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
    if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
      // On AVX2, v8i32 -> v8i16 becomed PSHUFB.
      if (Subtarget->hasInt256()) {
-      In = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, In);
+      In = DAG.getBitcast(MVT::v32i8, In);
  
        SmallVector<SDValue,32> pshufbMask;
        for (unsigned i = 0; i < 2; ++i) {
@@ -12261,14 +12545,14 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
        }
        SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, pshufbMask);
        In = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, In, BV);
-      In = DAG.getNode(ISD::BITCAST, DL, MVT::v4i64, In);
+      In = DAG.getBitcast(MVT::v4i64, In);
  
        static const int ShufMask[] = {0,  2,  -1,  -1};
        In = DAG.getVectorShuffle(MVT::v4i64, DL,  In, DAG.getUNDEF(MVT::v4i64),
                                  &ShufMask[0]);
        In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
                         DAG.getIntPtrConstant(0, DL));
-      return DAG.getNode(ISD::BITCAST, DL, VT, In);
+      return DAG.getBitcast(VT, In);
      }
  
      SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
@@ -12277,8 +12561,8 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
      SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
                                 DAG.getIntPtrConstant(4, DL));
  
-    OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, OpLo);
-    OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, OpHi);
+    OpLo = DAG.getBitcast(MVT::v16i8, OpLo);
+    OpHi = DAG.getBitcast(MVT::v16i8, OpHi);
  
      // The PSHUFB mask:
      static const int ShufMask1[] = {0,  1,  4,  5,  8,  9, 12, 13,
@@ -12288,13 +12572,13 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
      OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, Undef, ShufMask1);
      OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, Undef, ShufMask1);
  
-    OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpLo);
-    OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpHi);
+    OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
+    OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
  
      // The MOVLHPS Mask:
      static const int ShufMask2[] = {0, 1, 4, 5};
      SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
-    return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, res);
+    return DAG.getBitcast(MVT::v8i16, res);
    }
  
    // Handle truncation of V256 to V128 using shuffles.
@@ -12310,8 +12594,7 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
    // Prepare truncation shuffle mask
    for (unsigned i = 0; i != NumElems; ++i)
      MaskVec[i] = i * 2;
-  SDValue V = DAG.getVectorShuffle(NVT, DL,
-                                   DAG.getNode(ISD::BITCAST, DL, NVT, In),
+  SDValue V = DAG.getVectorShuffle(NVT, DL, DAG.getBitcast(NVT, In),
                                     DAG.getUNDEF(NVT), &MaskVec[0]);
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
                       DAG.getIntPtrConstant(0, DL));
@@ -12410,7 +12693,7 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
    Constant *C = ConstantInt::get(*Context, MaskElt);
    C = ConstantVector::getSplat(NumElts, C);
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-  SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy());
+  SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
    unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
    SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
                               MachinePointerInfo::getConstantPool(),
@@ -12420,13 +12703,12 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
      // For a vector, cast operands to a vector type, perform the logic op,
      // and cast the result back to the original value type.
      MVT VecVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
-    SDValue MaskCasted = DAG.getNode(ISD::BITCAST, dl, VecVT, Mask);
-    SDValue Operand = IsFNABS ?
-      DAG.getNode(ISD::BITCAST, dl, VecVT, Op0.getOperand(0)) :
-      DAG.getNode(ISD::BITCAST, dl, VecVT, Op0);
+    SDValue MaskCasted = DAG.getBitcast(VecVT, Mask);
+    SDValue Operand = IsFNABS ? DAG.getBitcast(VecVT, Op0.getOperand(0))
+                              : DAG.getBitcast(VecVT, Op0);
      unsigned BitOp = IsFABS ? ISD::AND : IsFNABS ? ISD::OR : ISD::XOR;
-    return DAG.getNode(ISD::BITCAST, dl, VT,
-                       DAG.getNode(BitOp, dl, VecVT, Operand, MaskCasted));
+    return DAG.getBitcast(VT,
+                          DAG.getNode(BitOp, dl, VecVT, Operand, MaskCasted));
    }
  
    // If not vector, then scalar.
@@ -12470,7 +12752,8 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
    CV[0] = ConstantFP::get(*Context,
                            APFloat(Sem, APInt::getHighBitsSet(SizeInBits, 1)));
    Constant *C = ConstantVector::get(CV);
-  SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(), 16);
+  auto PtrVT = TLI.getPointerTy(DAG.getDataLayout());
+  SDValue CPIdx = DAG.getConstantPool(C, PtrVT, 16);
    SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx,
                                MachinePointerInfo::getConstantPool(),
                                false, false, false, 16);
@@ -12491,7 +12774,7 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
          APFloat(Sem, APInt::getLowBitsSet(SizeInBits, SizeInBits - 1)));
    }
    C = ConstantVector::get(CV);
-  CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(), 16);
+  CPIdx = DAG.getConstantPool(C, PtrVT, 16);
    SDValue Val = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
                              MachinePointerInfo::getConstantPool(),
                              false, false, false, 16);
@@ -12591,7 +12874,7 @@ static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget *Subtarget,
  
    // Cast all vectors into TestVT for PTEST.
    for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
-    VecIns[i] = DAG.getNode(ISD::BITCAST, DL, TestVT, VecIns[i]);
+    VecIns[i] = DAG.getBitcast(TestVT, VecIns[i]);
  
    // If more than one full vectors are evaluated, OR them first before PTEST.
    for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
@@ -12925,29 +13208,31 @@ SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op,
                                              DAGCombinerInfo &DCI,
                                              unsigned &RefinementSteps,
                                              bool &UseOneConstNR) const {
-  // FIXME: We should use instruction latency models to calculate the cost of
-  // each potential sequence, but this is very hard to do reliably because
-  // at least Intel's Core* chips have variable timing based on the number of
-  // significant digits in the divisor and/or sqrt operand.
-  if (!Subtarget->useSqrtEst())
-    return SDValue();
-
    EVT VT = Op.getValueType();
+  const char *RecipOp;
  
-  // SSE1 has rsqrtss and rsqrtps.
+  // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
    // TODO: Add support for AVX512 (v16f32).
    // It is likely not profitable to do this for f64 because a double-precision
    // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
    // instructions: convert to single, rsqrtss, convert back to double, refine
    // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
    // along with FMA, this could be a throughput win.
-  if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
-      (Subtarget->hasAVX() && VT == MVT::v8f32)) {
-    RefinementSteps = 1;
-    UseOneConstNR = false;
-    return DCI.DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
-  }
-  return SDValue();
+  if (VT == MVT::f32 && Subtarget->hasSSE1())
+    RecipOp = "sqrtf";
+  else if ((VT == MVT::v4f32 && Subtarget->hasSSE1()) ||
+           (VT == MVT::v8f32 && Subtarget->hasAVX()))
+    RecipOp = "vec-sqrtf";
+  else
+    return SDValue();
+
+  TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals;
+  if (!Recips.isEnabled(RecipOp))
+    return SDValue();
+
+  RefinementSteps = Recips.getRefinementSteps(RecipOp);
+  UseOneConstNR = false;
+  return DCI.DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
  }
  
  /// The minimum architected relative accuracy is 2^-12. We need one
@@ -12955,14 +13240,8 @@ SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op,
  SDValue X86TargetLowering::getRecipEstimate(SDValue Op,
                                              DAGCombinerInfo &DCI,
                                              unsigned &RefinementSteps) const {
-  // FIXME: We should use instruction latency models to calculate the cost of
-  // each potential sequence, but this is very hard to do reliably because
-  // at least Intel's Core* chips have variable timing based on the number of
-  // significant digits in the divisor.
-  if (!Subtarget->useReciprocalEst())
-    return SDValue();
-
    EVT VT = Op.getValueType();
+  const char *RecipOp;
  
    // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
    // TODO: Add support for AVX512 (v16f32).
@@ -12971,12 +13250,20 @@ SDValue X86TargetLowering::getRecipEstimate(SDValue Op,
    // 15 instructions: convert to single, rcpss, convert back to double, refine
    // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
    // along with FMA, this could be a throughput win.
-  if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
-      (Subtarget->hasAVX() && VT == MVT::v8f32)) {
-    RefinementSteps = ReciprocalEstimateRefinementSteps;
-    return DCI.DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
-  }
-  return SDValue();
+  if (VT == MVT::f32 && Subtarget->hasSSE1())
+    RecipOp = "divf";
+  else if ((VT == MVT::v4f32 && Subtarget->hasSSE1()) ||
+           (VT == MVT::v8f32 && Subtarget->hasAVX()))
+    RecipOp = "vec-divf";
+  else
+    return SDValue();
+
+  TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals;
+  if (!Recips.isEnabled(RecipOp))
+    return SDValue();
+
+  RefinementSteps = Recips.getRefinementSteps(RecipOp);
+  return DCI.DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
  }
  
  /// If we have at least two divisions that use the same divisor, convert to
@@ -13156,13 +13443,13 @@ static SDValue LowerBoolVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
                                 DAG.getConstant(-1, dl, VT));
    switch (SetCCOpcode) {
    default: llvm_unreachable("Unexpected SETCC condition");
-  case ISD::SETNE:
-    // (x != y) -> ~(x ^ y)
+  case ISD::SETEQ:
+    // (x == y) -> ~(x ^ y)
      return DAG.getNode(ISD::XOR, dl, VT,
                         DAG.getNode(ISD::XOR, dl, VT, Op0, Op1),
                         DAG.getConstant(-1, dl, VT));
-  case ISD::SETEQ:
-    // (x == y) -> (x ^ y)
+  case ISD::SETNE:
+    // (x != y) -> (x ^ y)
      return DAG.getNode(ISD::XOR, dl, VT, Op0, Op1);
    case ISD::SETUGT:
    case ISD::SETGT:
@@ -13356,8 +13643,8 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
    if (hasMinMax) {
      switch (SetCCOpcode) {
      default: break;
-    case ISD::SETULE: Opc = X86ISD::UMIN; MinMax = true; break;
-    case ISD::SETUGE: Opc = X86ISD::UMAX; MinMax = true; break;
+    case ISD::SETULE: Opc = ISD::UMIN; MinMax = true; break;
+    case ISD::SETUGE: Opc = ISD::UMAX; MinMax = true; break;
      }
  
      if (MinMax) { Swap = false; Invert = false; FlipSigns = false; }
@@ -13407,8 +13694,8 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
        assert(Subtarget->hasSSE2() && "Don't know how to lower!");
  
        // First cast everything to the right type.
-      Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0);
-      Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1);
+      Op0 = DAG.getBitcast(MVT::v4i32, Op0);
+      Op1 = DAG.getBitcast(MVT::v4i32, Op1);
  
        // Since SSE has no unsigned integer comparisons, we need to flip the sign
        // bits of the inputs before performing those operations. The lower
@@ -13442,7 +13729,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
        if (Invert)
          Result = DAG.getNOT(dl, Result, MVT::v4i32);
  
-      return DAG.getNode(ISD::BITCAST, dl, VT, Result);
+      return DAG.getBitcast(VT, Result);
      }
  
      if (Opc == X86ISD::PCMPEQ && !Subtarget->hasSSE41()) {
@@ -13451,8 +13738,8 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
        assert(Subtarget->hasSSE2() && !FlipSigns && "Don't know how to lower!");
  
        // First cast everything to the right type.
-      Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0);
-      Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1);
+      Op0 = DAG.getBitcast(MVT::v4i32, Op0);
+      Op1 = DAG.getBitcast(MVT::v4i32, Op1);
  
        // Do the compare.
        SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
@@ -13465,7 +13752,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
        if (Invert)
          Result = DAG.getNOT(dl, Result, MVT::v4i32);
  
-      return DAG.getNode(ISD::BITCAST, dl, VT, Result);
+      return DAG.getBitcast(VT, Result);
      }
    }
  
@@ -13662,7 +13949,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
          SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
  
          EVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
-        VCmp = DAG.getNode(ISD::BITCAST, DL, VCmpVT, VCmp);
+        VCmp = DAG.getBitcast(VCmpVT, VCmp);
  
          SDValue VSel = DAG.getNode(ISD::VSELECT, DL, VecVT, VCmp, VOp1, VOp2);
  
@@ -13687,12 +13974,12 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
        else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0))
          Op2Scalar = Op2.getOperand(0);
        if (Op1Scalar.getNode() && Op2Scalar.getNode()) {
-        SDValue newSelect = DAG.getNode(ISD::SELECT, DL, 
+        SDValue newSelect = DAG.getNode(ISD::SELECT, DL,
                                          Op1Scalar.getValueType(),
                                          Cond, Op1Scalar, Op2Scalar);
          if (newSelect.getValueSizeInBits() == VT.getSizeInBits())
-          return DAG.getNode(ISD::BITCAST, DL, VT, newSelect);
-        SDValue ExtVec = DAG.getNode(ISD::BITCAST, DL, MVT::v8i1, newSelect);
+          return DAG.getBitcast(VT, newSelect);
+        SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect);
          return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec,
                             DAG.getIntPtrConstant(0, DL));
      }
@@ -13975,7 +14262,7 @@ static SDValue LowerSIGN_EXTEND_VECTOR_INREG(SDValue Op,
      Curr = DAG.getNode(X86ISD::UNPCKL, dl, CurrVT, DAG.getUNDEF(CurrVT), Curr);
      MVT CurrSVT = MVT::getIntegerVT(CurrVT.getScalarSizeInBits() * 2);
      CurrVT = MVT::getVectorVT(CurrSVT, CurrVT.getVectorNumElements() / 2);
-    Curr = DAG.getNode(ISD::BITCAST, dl, CurrVT, Curr);
+    Curr = DAG.getBitcast(CurrVT, Curr);
    }
  
    SDValue SignExt = Curr;
@@ -13993,7 +14280,7 @@ static SDValue LowerSIGN_EXTEND_VECTOR_INREG(SDValue Op,
      SDValue Sign = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
                                 DAG.getConstant(31, dl, MVT::i8));
      SDValue Ext = DAG.getVectorShuffle(CurrVT, dl, SignExt, Sign, {0, 4, 1, 5});
-    return DAG.getNode(ISD::BITCAST, dl, VT, Ext);
+    return DAG.getBitcast(VT, Ext);
    }
  
    return SDValue();
@@ -14176,8 +14463,8 @@ static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget,
  
    SmallVector<SDValue, 8> Chains;
    SDValue Ptr = Ld->getBasePtr();
-  SDValue Increment =
-      DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, dl, TLI.getPointerTy());
+  SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, dl,
+                                      TLI.getPointerTy(DAG.getDataLayout()));
    SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
  
    for (unsigned i = 0; i < NumLoads; ++i) {
@@ -14202,7 +14489,7 @@ static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget,
  
    // Bitcast the loaded value to a vector of the original element type, in
    // the size of the target vector type.
-  SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Res);
+  SDValue SlicedVec = DAG.getBitcast(WideVecVT, Res);
    unsigned SizeRatio = RegSz / MemSz;
  
    if (Ext == ISD::SEXTLOAD) {
@@ -14227,7 +14514,7 @@ static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget,
      SDValue Shuff = DAG.getVectorShuffle(
          WideVecVT, dl, SlicedVec, DAG.getUNDEF(WideVecVT), &ShuffleVec[0]);
  
-    Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
+    Shuff = DAG.getBitcast(RegVT, Shuff);
  
      // Build the arithmetic shift.
      unsigned Amt = RegVT.getVectorElementType().getSizeInBits() -
@@ -14249,7 +14536,7 @@ static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget,
                                         DAG.getUNDEF(WideVecVT), &ShuffleVec[0]);
  
    // Bitcast to the requested type.
-  Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
+  Shuff = DAG.getBitcast(RegVT, Shuff);
    DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
    return Shuff;
  }
@@ -14617,7 +14904,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
    EVT VT = Op.getNode()->getValueType(0);
  
    bool Is64Bit = Subtarget->is64Bit();
-  EVT SPTy = getPointerTy();
+  MVT SPTy = getPointerTy(DAG.getDataLayout());
  
    if (SplitStack) {
      MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -14634,8 +14921,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
                               "have nested arguments.");
      }
  
-    const TargetRegisterClass *AddrRegClass =
-      getRegClassFor(getPointerTy());
+    const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
      unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
      Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
      SDValue Value = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
@@ -14670,6 +14956,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
  
  SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
    MachineFunction &MF = DAG.getMachineFunction();
+  auto PtrVT = getPointerTy(MF.getDataLayout());
    X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
  
    const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
@@ -14678,8 +14965,7 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
    if (!Subtarget->is64Bit() || Subtarget->isTargetWin64()) {
      // vastart just stores the address of the VarArgsFrameIndex slot into the
      // memory location argument.
-    SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
-                                   getPointerTy());
+    SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
      return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
                          MachinePointerInfo(SV), false, false, 0);
    }
@@ -14699,8 +14985,7 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
    MemOps.push_back(Store);
  
    // Store fp_offset
-  FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
-                    FIN, DAG.getIntPtrConstant(4, DL));
+  FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
    Store = DAG.getStore(Op.getOperand(0), DL,
                         DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL,
                                         MVT::i32),
@@ -14708,20 +14993,16 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
    MemOps.push_back(Store);
  
    // Store ptr to overflow_arg_area
-  FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
-                    FIN, DAG.getIntPtrConstant(4, DL));
-  SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
-                                    getPointerTy());
+  FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
+  SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
    Store = DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN,
                         MachinePointerInfo(SV, 8),
                         false, false, 0);
    MemOps.push_back(Store);
  
    // Store ptr to reg_save_area.
-  FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
-                    FIN, DAG.getIntPtrConstant(8, DL));
-  SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
-                                    getPointerTy());
+  FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(8, DL));
+  SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
    Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN,
                         MachinePointerInfo(SV, 16), false, false, 0);
    MemOps.push_back(Store);
@@ -14743,7 +15024,7 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
  
    EVT ArgVT = Op.getNode()->getValueType(0);
    Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
-  uint32_t ArgSize = getDataLayout()->getTypeAllocSize(ArgTy);
+  uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
    uint8_t ArgMode;
  
    // Decide which area this value should be read from.
@@ -14772,7 +15053,7 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
    SDValue InstOps[] = {Chain, SrcPtr, DAG.getConstant(ArgSize, dl, MVT::i32),
                         DAG.getConstant(ArgMode, dl, MVT::i8),
                         DAG.getConstant(Align, dl, MVT::i32)};
-  SDVTList VTs = DAG.getVTList(getPointerTy(), MVT::Other);
+  SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
    SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl,
                                            VTs, InstOps, MVT::i64,
                                            MachinePointerInfo(SV),
@@ -14933,7 +15214,7 @@ static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT,
    MVT EltVT = VT.getVectorElementType();
    EVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());
  
-  ShAmt = DAG.getNode(ISD::BITCAST, dl, ShVT, ShAmt);
+  ShAmt = DAG.getBitcast(ShVT, ShAmt);
    return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
  }
  
@@ -14959,8 +15240,8 @@ static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
      // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements
      // are extracted by EXTRACT_SUBVECTOR.
      SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
-                              DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
-                              DAG.getIntPtrConstant(0, dl));
+                                DAG.getBitcast(BitcastVT, Mask),
+                                DAG.getIntPtrConstant(0, dl));
  
      switch (Op.getOpcode()) {
        default: break;
@@ -14999,6 +15280,60 @@ static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
      return DAG.getNode(X86ISD::SELECT, dl, VT, IMask, Op, PreservedSrc);
  }
  
+static int getSEHRegistrationNodeSize(const Function *Fn) {
+  if (!Fn->hasPersonalityFn())
+    report_fatal_error(
+        "querying registration node size for function without personality");
+  // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
+  // WinEHStatePass for the full struct definition.
+  switch (classifyEHPersonality(Fn->getPersonalityFn())) {
+  case EHPersonality::MSVC_X86SEH: return 24;
+  case EHPersonality::MSVC_CXX: return 16;
+  default: break;
+  }
+  report_fatal_error("can only recover FP for MSVC EH personality functions");
+}
+
+/// When the 32-bit MSVC runtime transfers control to us, either to an outlined
+/// function or when returning to a parent frame after catching an exception, we
+/// recover the parent frame pointer by doing arithmetic on the incoming EBP.
+/// Here's the math:
+///   RegNodeBase = EntryEBP - RegNodeSize
+///   ParentFP = RegNodeBase - RegNodeFrameOffset
+/// Subtracting RegNodeSize takes us to the offset of the registration node, and
+/// subtracting the offset (negative on x86) takes us back to the parent FP.
+static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
+                                   SDValue EntryEBP) {
+  MachineFunction &MF = DAG.getMachineFunction();
+  SDLoc dl;
+
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
+
+  // It's possible that the parent function no longer has a personality function
+  // if the exceptional code was optimized away, in which case we just return
+  // the incoming EBP.
+  if (!Fn->hasPersonalityFn())
+    return EntryEBP;
+
+  int RegNodeSize = getSEHRegistrationNodeSize(Fn);
+
+  // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
+  // registration.
+  MCSymbol *OffsetSym =
+      MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
+          GlobalValue::getRealLinkageName(Fn->getName()));
+  SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
+  SDValue RegNodeFrameOffset =
+      DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
+
+  // RegNodeBase = EntryEBP - RegNodeSize
+  // ParentFP = RegNodeBase - RegNodeFrameOffset
+  SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
+                                    DAG.getConstant(RegNodeSize, dl, PtrVT));
+  return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, RegNodeFrameOffset);
+}
+
  static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
                                         SelectionDAG &DAG) {
    SDLoc dl(Op);
@@ -15015,14 +15350,36 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
      case INTR_TYPE_3OP:
        return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
          Op.getOperand(2), Op.getOperand(3));
+    case INTR_TYPE_4OP:
+      return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
+        Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
      case INTR_TYPE_1OP_MASK_RM: {
        SDValue Src = Op.getOperand(1);
-      SDValue Src0 = Op.getOperand(2);
+      SDValue PassThru = Op.getOperand(2);
        SDValue Mask = Op.getOperand(3);
-      SDValue RoundingMode = Op.getOperand(4);
+      SDValue RoundingMode;
+      if (Op.getNumOperands() == 4)
+        RoundingMode = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
+      else
+        RoundingMode = Op.getOperand(4);
+      unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
+      if (IntrWithRoundingModeOpcode != 0) {
+        unsigned Round = cast<ConstantSDNode>(RoundingMode)->getZExtValue();
+        if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION)
+          return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
+                                      dl, Op.getValueType(), Src, RoundingMode),
+                                      Mask, PassThru, Subtarget, DAG);
+      }
        return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
                                                RoundingMode),
-                                  Mask, Src0, Subtarget, DAG);
+                                  Mask, PassThru, Subtarget, DAG);
+    }
+    case INTR_TYPE_1OP_MASK: {
+      SDValue Src = Op.getOperand(1);
+      SDValue Passthru = Op.getOperand(2);
+      SDValue Mask = Op.getOperand(3);
+      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
+                                  Mask, Passthru, Subtarget, DAG);
      }
      case INTR_TYPE_SCALAR_MASK_RM: {
        SDValue Src1 = Op.getOperand(1);
@@ -15069,11 +15426,67 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
                                                Src1,Src2),
                                    Mask, PassThru, Subtarget, DAG);
      }
+    case INTR_TYPE_2OP_MASK_RM: {
+      SDValue Src1 = Op.getOperand(1);
+      SDValue Src2 = Op.getOperand(2);
+      SDValue PassThru = Op.getOperand(3);
+      SDValue Mask = Op.getOperand(4);
+      // We specify 2 possible modes for intrinsics, with/without rounding modes.
+      // First, we check if the intrinsic have rounding mode (6 operands),
+      // if not, we set rounding mode to "current".
+      SDValue Rnd;
+      if (Op.getNumOperands() == 6)
+        Rnd = Op.getOperand(5);
+      else
+        Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
+      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
+                                              Src1, Src2, Rnd),
+                                  Mask, PassThru, Subtarget, DAG);
+    }
+    case INTR_TYPE_3OP_MASK: {
+      SDValue Src1 = Op.getOperand(1);
+      SDValue Src2 = Op.getOperand(2);
+      SDValue Src3 = Op.getOperand(3);
+      SDValue PassThru = Op.getOperand(4);
+      SDValue Mask = Op.getOperand(5);
+      // We specify 2 possible opcodes for intrinsics with rounding modes.
+      // First, we check if the intrinsic may have non-default rounding mode,
+      // (IntrData->Opc1 != 0), then we check the rounding mode operand.
+      unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
+      if (IntrWithRoundingModeOpcode != 0) {
+        SDValue Rnd = Op.getOperand(6);
+        unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
+        if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) {
+          return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
+                                      dl, Op.getValueType(),
+                                      Src1, Src2, Src3, Rnd),
+                                      Mask, PassThru, Subtarget, DAG);
+        }
+      }
+      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
+                                              Src1, Src2, Src3),
+                                  Mask, PassThru, Subtarget, DAG);
+    }
+    case VPERM_3OP_MASKZ:
+    case VPERM_3OP_MASK:
+    case FMA_OP_MASK3:
+    case FMA_OP_MASKZ:
      case FMA_OP_MASK: {
        SDValue Src1 = Op.getOperand(1);
        SDValue Src2 = Op.getOperand(2);
        SDValue Src3 = Op.getOperand(3);
        SDValue Mask = Op.getOperand(4);
+      EVT VT = Op.getValueType();
+      SDValue PassThru = SDValue();
+
+      // set PassThru element
+      if (IntrData->Type == VPERM_3OP_MASKZ || IntrData->Type == FMA_OP_MASKZ)
+        PassThru = getZeroVector(VT, Subtarget, DAG, dl);
+      else if (IntrData->Type == FMA_OP_MASK3)
+        PassThru = Src3;
+      else
+        PassThru = Src1;
+
        // We specify 2 possible opcodes for intrinsics with rounding modes.
        // First, we check if the intrinsic may have non-default rounding mode,
        // (IntrData->Opc1 != 0), then we check the rounding mode operand.
@@ -15085,12 +15498,12 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
            return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
                                                    dl, Op.getValueType(),
                                                    Src1, Src2, Src3, Rnd),
-                                      Mask, Src1, Subtarget, DAG);
+                                      Mask, PassThru, Subtarget, DAG);
        }
        return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
                                                dl, Op.getValueType(),
                                                Src1, Src2, Src3),
-                                  Mask, Src1, Subtarget, DAG);
+                                  Mask, PassThru, Subtarget, DAG);
      }
      case CMP_MASK:
      case CMP_MASK_CC: {
@@ -15140,7 +15553,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
        SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
                                  DAG.getUNDEF(BitcastVT), CmpMask,
                                  DAG.getIntPtrConstant(0, dl));
-      return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
+      return DAG.getBitcast(Op.getValueType(), Res);
      }
      case COMI: { // Comparison intrinsics
        ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
@@ -15169,18 +15582,10 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
        SDValue PassThru = Op.getOperand(2);
        if (isAllOnes(Mask)) // return data as is
          return Op.getOperand(1);
-      EVT VT = Op.getValueType();
-      EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
-                                    VT.getVectorNumElements());
-      EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
-                                       Mask.getValueType().getSizeInBits());
-      SDLoc dl(Op);
-      SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
-                                  DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
-                                  DAG.getIntPtrConstant(0, dl));
  
-      return DAG.getNode(IntrData->Opc0, dl, VT, VMask, DataToCompress,
-                         PassThru);
+      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
+                                              DataToCompress),
+                                  Mask, PassThru, Subtarget, DAG);
      }
      case BLEND: {
        SDValue Mask = Op.getOperand(3);
@@ -15191,7 +15596,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
                                         Mask.getValueType().getSizeInBits());
        SDLoc dl(Op);
        SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
-                                  DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
+                                  DAG.getBitcast(BitcastVT, Mask),
                                    DAG.getIntPtrConstant(0, dl));
        return DAG.getNode(IntrData->Opc0, dl, VT, VMask, Op.getOperand(1),
                           Op.getOperand(2));
@@ -15211,16 +15616,6 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
      return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(),
                         Op.getOperand(2), Op.getOperand(1));
  
-  case Intrinsic::x86_avx512_mask_valign_q_512:
-  case Intrinsic::x86_avx512_mask_valign_d_512:
-    // Vector source operands are swapped.
-    return getVectorMaskingNode(DAG.getNode(X86ISD::VALIGN, dl,
-                                            Op.getValueType(), Op.getOperand(2),
-                                            Op.getOperand(1),
-                                            Op.getOperand(3)),
-                                Op.getOperand(5), Op.getOperand(4),
-                                Subtarget, DAG);
-
    // ptest and testp intrinsics. The intrinsic these come from are designed to
    // return an integer value, not just an instruction so lower it to the ptest
    // or testp pattern and a setcc for the result.
@@ -15289,8 +15684,8 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
    case Intrinsic::x86_avx512_kortestz_w:
    case Intrinsic::x86_avx512_kortestc_w: {
      unsigned X86CC = (IntNo == Intrinsic::x86_avx512_kortestz_w)? X86::COND_E: X86::COND_B;
-    SDValue LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, Op.getOperand(1));
-    SDValue RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, Op.getOperand(2));
+    SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
+    SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
      SDValue CC = DAG.getConstant(X86CC, dl, MVT::i8);
      SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
      SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i1, CC, Test);
@@ -15378,19 +15773,39 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
      // Compute the symbol for the LSDA. We know it'll get emitted later.
      MachineFunction &MF = DAG.getMachineFunction();
      SDValue Op1 = Op.getOperand(1);
-    Op1->dump();
      auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
      MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
          GlobalValue::getRealLinkageName(Fn->getName()));
-    StringRef Name = LSDASym->getName();
-    assert(Name.data()[Name.size()] == '\0' && "not null terminated");
  
      // Generate a simple absolute symbol reference. This intrinsic is only
      // supported on 32-bit Windows, which isn't PIC.
-    SDValue Result =
-        DAG.getTargetExternalSymbol(Name.data(), VT, X86II::MO_NOPREFIX);
+    SDValue Result = DAG.getMCSymbol(LSDASym, VT);
      return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
    }
+
+  case Intrinsic::x86_seh_recoverfp: {
+    SDValue FnOp = Op.getOperand(1);
+    SDValue IncomingFPOp = Op.getOperand(2);
+    GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
+    auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
+    if (!Fn)
+      report_fatal_error(
+          "llvm.x86.seh.recoverfp must take a function as the first argument");
+    return recoverFramePointer(DAG, Fn, IncomingFPOp);
+  }
+
+  case Intrinsic::localaddress: {
+    // Returns one of the stack, base, or frame pointer registers, depending on
+    // which is used to reference local variables.
+    MachineFunction &MF = DAG.getMachineFunction();
+    const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
+    unsigned Reg;
+    if (RegInfo->hasBasePointer(MF))
+      Reg = RegInfo->getBaseRegister();
+    else // This function handles the SP or FP case.
+      Reg = RegInfo->getPtrSizedFrameRegister(MF);
+    return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
+  }
    }
  }
  
@@ -15400,7 +15815,12 @@ static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
                                const X86Subtarget * Subtarget) {
    SDLoc dl(Op);
    ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
-  assert(C && "Invalid scale type");
+  if (!C)
+    llvm_unreachable("Invalid scale type");
+  unsigned ScaleVal = C->getZExtValue();
+  if (ScaleVal > 2 && ScaleVal != 4 && ScaleVal != 8)
+    llvm_unreachable("Valid scale values are 1, 2, 4, 8");
+
    SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
    EVT MaskVT = MVT::getVectorVT(MVT::i1,
                               Index.getSimpleValueType().getVectorNumElements());
@@ -15408,8 +15828,16 @@ static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
    ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
    if (MaskC)
      MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), dl, MaskVT);
-  else
-    MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
+  else {
+    EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+                                     Mask.getValueType().getSizeInBits());
+
+    // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements
+    // are extracted by EXTRACT_SUBVECTOR.
+    MaskInReg = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
+                            DAG.getBitcast(BitcastVT, Mask),
+                            DAG.getIntPtrConstant(0, dl));
+  }
    SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
    SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
    SDValue Segment = DAG.getRegister(0, MVT::i32);
@@ -15426,7 +15854,12 @@ static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
                                 SDValue Index, SDValue ScaleOp, SDValue Chain) {
    SDLoc dl(Op);
    ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
-  assert(C && "Invalid scale type");
+  if (!C)
+    llvm_unreachable("Invalid scale type");
+  unsigned ScaleVal = C->getZExtValue();
+  if (ScaleVal > 2 && ScaleVal != 4 && ScaleVal != 8)
+    llvm_unreachable("Valid scale values are 1, 2, 4, 8");
+
    SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
    SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
    SDValue Segment = DAG.getRegister(0, MVT::i32);
@@ -15436,8 +15869,16 @@ static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
    ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
    if (MaskC)
      MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), dl, MaskVT);
-  else
-    MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
+  else {
+    EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+                                     Mask.getValueType().getSizeInBits());
+
+    // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements
+    // are extracted by EXTRACT_SUBVECTOR.
+    MaskInReg = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
+                            DAG.getBitcast(BitcastVT, Mask),
+                            DAG.getIntPtrConstant(0, dl));
+  }
    SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
    SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain};
    SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
@@ -15460,7 +15901,7 @@ static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
    if (MaskC)
      MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), dl, MaskVT);
    else
-    MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
+    MaskInReg = DAG.getBitcast(MaskVT, Mask);
    //SDVTList VTs = DAG.getVTList(MVT::Other);
    SDValue Ops[] = {MaskInReg, Base, Scale, Index, Disp, Segment, Chain};
    SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
@@ -15575,14 +16016,76 @@ static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget,
    return DAG.getMergeValues(Results, DL);
  }
  
+static SDValue LowerSEHRESTOREFRAME(SDValue Op, const X86Subtarget *Subtarget,
+                                    SelectionDAG &DAG) {
+  MachineFunction &MF = DAG.getMachineFunction();
+  const Function *Fn = MF.getFunction();
+  SDLoc dl(Op);
+  SDValue Chain = Op.getOperand(0);
+
+  assert(Subtarget->getFrameLowering()->hasFP(MF) &&
+         "using llvm.x86.seh.restoreframe requires a frame pointer");
+
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  MVT VT = TLI.getPointerTy(DAG.getDataLayout());
+
+  const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
+  unsigned FrameReg =
+      RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
+  unsigned SPReg = RegInfo->getStackRegister();
+  unsigned SlotSize = RegInfo->getSlotSize();
+
+  // Get incoming EBP.
+  SDValue IncomingEBP =
+      DAG.getCopyFromReg(Chain, dl, FrameReg, VT);
+
+  // SP is saved in the first field of every registration node, so load
+  // [EBP-RegNodeSize] into SP.
+  int RegNodeSize = getSEHRegistrationNodeSize(Fn);
+  SDValue SPAddr = DAG.getNode(ISD::ADD, dl, VT, IncomingEBP,
+                               DAG.getConstant(-RegNodeSize, dl, VT));
+  SDValue NewSP =
+      DAG.getLoad(VT, dl, Chain, SPAddr, MachinePointerInfo(), false, false,
+                  false, VT.getScalarSizeInBits() / 8);
+  Chain = DAG.getCopyToReg(Chain, dl, SPReg, NewSP);
+
+  if (!RegInfo->needsStackRealignment(MF)) {
+    // Adjust EBP to point back to the original frame position.
+    SDValue NewFP = recoverFramePointer(DAG, Fn, IncomingEBP);
+    Chain = DAG.getCopyToReg(Chain, dl, FrameReg, NewFP);
+  } else {
+    assert(RegInfo->hasBasePointer(MF) &&
+           "functions with Win32 EH must use frame or base pointer register");
+
+    // Reload the base pointer (ESI) with the adjusted incoming EBP.
+    SDValue NewBP = recoverFramePointer(DAG, Fn, IncomingEBP);
+    Chain = DAG.getCopyToReg(Chain, dl, RegInfo->getBaseRegister(), NewBP);
+
+    // Reload the spilled EBP value, now that the stack and base pointers are
+    // set up.
+    X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+    X86FI->setHasSEHFramePtrSave(true);
+    int FI = MF.getFrameInfo()->CreateSpillStackObject(SlotSize, SlotSize);
+    X86FI->setSEHFramePtrSaveIndex(FI);
+    SDValue NewFP = DAG.getLoad(VT, dl, Chain, DAG.getFrameIndex(FI, VT),
+                                MachinePointerInfo(), false, false, false,
+                                VT.getScalarSizeInBits() / 8);
+    Chain = DAG.getCopyToReg(NewFP, dl, FrameReg, NewFP);
+  }
+
+  return Chain;
+}
  
  static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
                                        SelectionDAG &DAG) {
    unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
  
    const IntrinsicData* IntrData = getIntrinsicWithChain(IntNo);
-  if (!IntrData)
+  if (!IntrData) {
+    if (IntNo == llvm::Intrinsic::x86_seh_restoreframe)
+      return LowerSEHRESTOREFRAME(Op, Subtarget, DAG);
      return SDValue();
+  }
  
    SDLoc dl(Op);
    switch(IntrData->Type) {
@@ -15693,49 +16196,38 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
      SDValue Addr = Op.getOperand(2);
      SDValue Chain = Op.getOperand(0);
  
+    EVT VT = DataToCompress.getValueType();
      if (isAllOnes(Mask)) // return just a store
        return DAG.getStore(Chain, dl, DataToCompress, Addr,
-                          MachinePointerInfo(), false, false, 0);
-
-    EVT VT = DataToCompress.getValueType();
-    EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
-                                  VT.getVectorNumElements());
-    EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
-                                     Mask.getValueType().getSizeInBits());
-    SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
-                                DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
-                                DAG.getIntPtrConstant(0, dl));
+                          MachinePointerInfo(), false, false,
+                          VT.getScalarSizeInBits()/8);
  
-    SDValue Compressed =  DAG.getNode(IntrData->Opc0, dl, VT, VMask,
-                                      DataToCompress, DAG.getUNDEF(VT));
+    SDValue Compressed =
+      getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress),
+                           Mask, DAG.getUNDEF(VT), Subtarget, DAG);
      return DAG.getStore(Chain, dl, Compressed, Addr,
-                        MachinePointerInfo(), false, false, 0);
+                        MachinePointerInfo(), false, false,
+                        VT.getScalarSizeInBits()/8);
    }
    case EXPAND_FROM_MEM: {
      SDLoc dl(Op);
      SDValue Mask = Op.getOperand(4);
-    SDValue PathThru = Op.getOperand(3);
+    SDValue PassThru = Op.getOperand(3);
      SDValue Addr = Op.getOperand(2);
      SDValue Chain = Op.getOperand(0);
      EVT VT = Op.getValueType();
  
      if (isAllOnes(Mask)) // return just a load
        return DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(), false, false,
-                         false, 0);
-    EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
-                                  VT.getVectorNumElements());
-    EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
-                                     Mask.getValueType().getSizeInBits());
-    SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
-                                DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
-                                DAG.getIntPtrConstant(0, dl));
+                         false, VT.getScalarSizeInBits()/8);
  
      SDValue DataToExpand = DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(),
-                                   false, false, false, 0);
+                                       false, false, false,
+                                       VT.getScalarSizeInBits()/8);
  
      SDValue Results[] = {
-        DAG.getNode(IntrData->Opc0, dl, VT, VMask, DataToExpand, PathThru),
-        Chain};
+      getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, DataToExpand),
+                           Mask, PassThru, Subtarget, DAG), Chain};
      return DAG.getMergeValues(Results, dl);
    }
    }
@@ -15751,7 +16243,7 @@ SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
  
    unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
    SDLoc dl(Op);
-  EVT PtrVT = getPointerTy();
+  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  
    if (Depth > 0) {
      SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
@@ -15810,14 +16302,36 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
  
  // FIXME? Maybe this could be a TableGen attribute on some registers and
  // this table could be generated automatically from RegInfo.
-unsigned X86TargetLowering::getRegisterByName(const char* RegName,
-                                              EVT VT) const {
+unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT,
+                                              SelectionDAG &DAG) const {
+  const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
+  const MachineFunction &MF = DAG.getMachineFunction();
+
    unsigned Reg = StringSwitch<unsigned>(RegName)
                         .Case("esp", X86::ESP)
                         .Case("rsp", X86::RSP)
+                       .Case("ebp", X86::EBP)
+                       .Case("rbp", X86::RBP)
                         .Default(0);
+
+  if (Reg == X86::EBP || Reg == X86::RBP) {
+    if (!TFI.hasFP(MF))
+      report_fatal_error("register " + StringRef(RegName) +
+                         " is allocatable: function has no frame pointer");
+#ifndef NDEBUG
+    else {
+      const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
+      unsigned FrameReg =
+          RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
+      assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
+             "Invalid Frame Register!");
+    }
+#endif
+  }
+
    if (Reg)
      return Reg;
+
    report_fatal_error("Invalid register name global variable");
  }
  
@@ -15833,7 +16347,7 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
    SDValue Handler   = Op.getOperand(2);
    SDLoc dl      (Op);
  
-  EVT PtrVT = getPointerTy();
+  EVT PtrVT = getPointerTy(DAG.getDataLayout());
    const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
    unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
    assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
@@ -16052,7 +16566,8 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
  
    // Save FP Control Word to stack slot
    int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false);
-  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
+  SDValue StackSlot =
+      DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
  
    MachineMemOperand *MMO =
     MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
@@ -16274,8 +16789,8 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
                                -1, 4, -1, 5, -1, 6, -1, 7};
        ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
        BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
-      ALo = DAG.getNode(ISD::BITCAST, dl, ExVT, ALo);
-      BLo = DAG.getNode(ISD::BITCAST, dl, ExVT, BLo);
+      ALo = DAG.getBitcast(ExVT, ALo);
+      BLo = DAG.getBitcast(ExVT, BLo);
        ALo = DAG.getNode(ISD::SRA, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
        BLo = DAG.getNode(ISD::SRA, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
      }
@@ -16294,8 +16809,8 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
                                -1, 12, -1, 13, -1, 14, -1, 15};
        AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
        BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
-      AHi = DAG.getNode(ISD::BITCAST, dl, ExVT, AHi);
-      BHi = DAG.getNode(ISD::BITCAST, dl, ExVT, BHi);
+      AHi = DAG.getBitcast(ExVT, AHi);
+      BHi = DAG.getBitcast(ExVT, BHi);
        AHi = DAG.getNode(ISD::SRA, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
        BHi = DAG.getNode(ISD::SRA, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
      }
@@ -16323,8 +16838,8 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
      // Now multiply odd parts.
      SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds);
  
-    Evens = DAG.getNode(ISD::BITCAST, dl, VT, Evens);
-    Odds = DAG.getNode(ISD::BITCAST, dl, VT, Odds);
+    Evens = DAG.getBitcast(VT, Evens);
+    Odds = DAG.getBitcast(VT, Odds);
  
      // Merge the two vectors back together with a shuffle. This expands into 2
      // shuffles.
@@ -16349,20 +16864,26 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
    SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
    SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
  
+  SDValue AhiBlo = Ahi;
+  SDValue AloBhi = Bhi;
    // Bit cast to 32-bit vectors for MULUDQ
    EVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 :
                                    (VT == MVT::v4i64) ? MVT::v8i32 : MVT::v16i32;
-  A = DAG.getNode(ISD::BITCAST, dl, MulVT, A);
-  B = DAG.getNode(ISD::BITCAST, dl, MulVT, B);
-  Ahi = DAG.getNode(ISD::BITCAST, dl, MulVT, Ahi);
-  Bhi = DAG.getNode(ISD::BITCAST, dl, MulVT, Bhi);
+  A = DAG.getBitcast(MulVT, A);
+  B = DAG.getBitcast(MulVT, B);
+  Ahi = DAG.getBitcast(MulVT, Ahi);
+  Bhi = DAG.getBitcast(MulVT, Bhi);
  
    SDValue AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
-  SDValue AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
-  SDValue AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
-
-  AloBhi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AloBhi, 32, DAG);
-  AhiBlo = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AhiBlo, 32, DAG);
+  // After shifting right const values the result may be all-zero.
+  if (!ISD::isBuildVectorAllZeros(Ahi.getNode())) {
+    AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
+    AhiBlo = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AhiBlo, 32, DAG);
+  }
+  if (!ISD::isBuildVectorAllZeros(Bhi.getNode())) {
+    AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
+    AloBhi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AloBhi, 32, DAG);
+  }
  
    SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi);
    return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
@@ -16407,7 +16928,7 @@ SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) cons
    }
  
    SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
-                                         getPointerTy());
+                                         getPointerTy(DAG.getDataLayout()));
  
    TargetLowering::CallLoweringInfo CLI(DAG);
    CLI.setDebugLoc(dl).setChain(InChain)
@@ -16417,7 +16938,7 @@ SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) cons
      .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);
  
    std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
-  return DAG.getNode(ISD::BITCAST, dl, VT, CallInfo.first);
+  return DAG.getBitcast(VT, CallInfo.first);
  }
  
  static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget,
@@ -16455,12 +16976,10 @@ static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget,
        (!IsSigned || !Subtarget->hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
    // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
    // => <2 x i64> <ae|cg>
-  SDValue Mul1 = DAG.getNode(ISD::BITCAST, dl, VT,
-                             DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
+  SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
    // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
    // => <2 x i64> <bf|dh>
-  SDValue Mul2 = DAG.getNode(ISD::BITCAST, dl, VT,
-                             DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1));
+  SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1));
  
    // Shuffle it back into the right order.
    SDValue Highs, Lows;
@@ -16479,9 +16998,9 @@ static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget,
    // If we have a signed multiply but no PMULDQ fix up the high parts of a
    // unsigned multiply.
    if (IsSigned && !Subtarget->hasSSE41()) {
-    SDValue ShAmt =
-        DAG.getConstant(31, dl,
-                        DAG.getTargetLoweringInfo().getShiftAmountTy(VT));
+    SDValue ShAmt = DAG.getConstant(
+        31, dl,
+        DAG.getTargetLoweringInfo().getShiftAmountTy(VT, DAG.getDataLayout()));
      SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
                               DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
      SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
@@ -16497,18 +17016,18 @@ static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget,
    return DAG.getMergeValues(Ops, dl);
  }
  
-// Return true if the requred (according to Opcode) shift-imm form is natively
+// Return true if the required (according to Opcode) shift-imm form is natively
  // supported by the Subtarget
-static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget *Subtarget, 
+static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget *Subtarget,
                                          unsigned Opcode) {
    if (VT.getScalarSizeInBits() < 16)
      return false;
- 
+
    if (VT.is512BitVector() &&
        (VT.getScalarSizeInBits() > 16 || Subtarget->hasBWI()))
      return true;
  
-  bool LShift = VT.is128BitVector() || 
+  bool LShift = VT.is128BitVector() ||
      (VT.is256BitVector() && Subtarget->hasInt256());
  
    bool AShift = LShift && (Subtarget->hasVLX() ||
@@ -16517,16 +17036,16 @@ static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget *Subtarget,
  }
  
  // The shift amount is a variable, but it is the same for all vector lanes.
-// These instrcutions are defined together with shift-immediate.
-static 
-bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget *Subtarget, 
+// These instructions are defined together with shift-immediate.
+static
+bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget *Subtarget,
                                        unsigned Opcode) {
    return SupportedVectorShiftWithImm(VT, Subtarget, Opcode);
  }
  
-// Return true if the requred (according to Opcode) variable-shift form is
+// Return true if the required (according to Opcode) variable-shift form is
  // natively supported by the Subtarget
-static bool SupportedVectorVarShift(MVT VT, const X86Subtarget *Subtarget, 
+static bool SupportedVectorVarShift(MVT VT, const X86Subtarget *Subtarget,
                                      unsigned Opcode) {
  
    if (!Subtarget->hasInt256() || VT.getScalarSizeInBits() < 16)
@@ -16554,6 +17073,38 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
    unsigned X86Opc = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
      (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
  
+  auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
+    assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
+    MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
+    SDValue Ex = DAG.getBitcast(ExVT, R);
+
+    if (ShiftAmt >= 32) {
+      // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
+      SDValue Upper =
+          getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
+      SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
+                                                 ShiftAmt - 32, DAG);
+      if (VT == MVT::v2i64)
+        Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
+      if (VT == MVT::v4i64)
+        Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
+                                  {9, 1, 11, 3, 13, 5, 15, 7});
+    } else {
+      // SRA upper i32, SHL whole i64 and select lower i32.
+      SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
+                                                 ShiftAmt, DAG);
+      SDValue Lower =
+          getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
+      Lower = DAG.getBitcast(ExVT, Lower);
+      if (VT == MVT::v2i64)
+        Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
+      if (VT == MVT::v4i64)
+        Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
+                                  {8, 1, 10, 3, 12, 5, 14, 7});
+    }
+    return DAG.getBitcast(VT, Ex);
+  };
+
    // Optimize shl/srl/sra with constant shift amount.
    if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
      if (auto *ShiftConst = BVAmt->getConstantSplatNode()) {
@@ -16562,6 +17113,11 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
        if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
          return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
  
+      // i64 SRA needs to be performed as partial shifts.
+      if ((VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64)) &&
+          Op.getOpcode() == ISD::SRA)
+        return ArithmeticShiftRight64(ShiftAmt);
+
        if (VT == MVT::v16i8 || (Subtarget->hasInt256() && VT == MVT::v32i8)) {
          unsigned NumElts = VT.getVectorNumElements();
          MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
@@ -16574,7 +17130,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
            // Make a large shift.
            SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT,
                                                     R, ShiftAmt, DAG);
-          SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL);
+          SHL = DAG.getBitcast(VT, SHL);
            // Zero out the rightmost bits.
            SmallVector<SDValue, 32> V(
                NumElts, DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, MVT::i8));
@@ -16585,7 +17141,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
            // Make a large shift.
            SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT,
                                                     R, ShiftAmt, DAG);
-          SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL);
+          SRL = DAG.getBitcast(VT, SRL);
            // Zero out the leftmost bits.
            SmallVector<SDValue, 32> V(
                NumElts, DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, MVT::i8));
@@ -16645,7 +17201,12 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
        if (ShAmt != ShiftAmt)
          return SDValue();
      }
-    return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
+
+    if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
+      return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
+
+    if (Op.getOpcode() == ISD::SRA)
+      return ArithmeticShiftRight64(ShiftAmt);
    }
  
    return SDValue();
@@ -16727,7 +17288,9 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
          if (Vals[j] != Amt.getOperand(i + j))
            return SDValue();
      }
-    return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1));
+
+    if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode()))
+      return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1));
    }
    return SDValue();
  }
@@ -16801,7 +17364,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
  
      Op = DAG.getNode(ISD::ADD, dl, VT, Op,
                       DAG.getConstant(0x3f800000U, dl, VT));
-    Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, Op);
+    Op = DAG.getBitcast(MVT::v4f32, Op);
      Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
      return DAG.getNode(ISD::MUL, dl, VT, Op, R);
    }
@@ -16871,44 +17434,166 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
        SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
        if (TargetOpcode == X86ISD::MOVSD)
          CastVT = MVT::v2i64;
-      SDValue BitCast1 = DAG.getNode(ISD::BITCAST, dl, CastVT, Shift1);
-      SDValue BitCast2 = DAG.getNode(ISD::BITCAST, dl, CastVT, Shift2);
+      SDValue BitCast1 = DAG.getBitcast(CastVT, Shift1);
+      SDValue BitCast2 = DAG.getBitcast(CastVT, Shift2);
        SDValue Result = getTargetShuffleNode(TargetOpcode, dl, CastVT, BitCast2,
                                              BitCast1, DAG);
-      return DAG.getNode(ISD::BITCAST, dl, VT, Result);
+      return DAG.getBitcast(VT, Result);
      }
    }
  
-  if (VT == MVT::v16i8 && Op->getOpcode() == ISD::SHL) {
-    // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
-    Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(5, dl, VT));
-
-    SDValue VSelM = DAG.getConstant(0x80, dl, VT);
-    SDValue OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
-    OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
-
-    // r = VSELECT(r, shl(r, 4), a);
-    SDValue M = DAG.getNode(ISD::SHL, dl, VT, R, DAG.getConstant(4, dl, VT));
-    R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R);
-
-    // a += a
-    Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);
-    OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
-    OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
-
-    // r = VSELECT(r, shl(r, 2), a);
-    M = DAG.getNode(ISD::SHL, dl, VT, R, DAG.getConstant(2, dl, VT));
-    R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R);
+  // v4i32 Non Uniform Shifts.
+  // If the shift amount is constant we can shift each lane using the SSE2
+  // immediate shifts, else we need to zero-extend each lane to the lower i64
+  // and shift using the SSE2 variable shifts.
+  // The separate results can then be blended together.
+  if (VT == MVT::v4i32) {
+    unsigned Opc = Op.getOpcode();
+    SDValue Amt0, Amt1, Amt2, Amt3;
+    if (ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
+      Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
+      Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
+      Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
+      Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
+    } else {
+      // ISD::SHL is handled above but we include it here for completeness.
+      switch (Opc) {
+      default:
+        llvm_unreachable("Unknown target vector shift node");
+      case ISD::SHL:
+        Opc = X86ISD::VSHL;
+        break;
+      case ISD::SRL:
+        Opc = X86ISD::VSRL;
+        break;
+      case ISD::SRA:
+        Opc = X86ISD::VSRA;
+        break;
+      }
+      // The SSE2 shifts use the lower i64 as the same shift amount for
+      // all lanes and the upper i64 is ignored. These shuffle masks
+      // optimally zero-extend each lanes on SSE2/SSE41/AVX targets.
+      SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
+      Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
+      Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
+      Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
+      Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
+    }
+
+    SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
+    SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
+    SDValue R2 = DAG.getNode(Opc, dl, VT, R, Amt2);
+    SDValue R3 = DAG.getNode(Opc, dl, VT, R, Amt3);
+    SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
+    SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
+    return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
+  }
+
+  if (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget->hasInt256())) {
+    MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
+    unsigned ShiftOpcode = Op->getOpcode();
+
+    auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
+      // On SSE41 targets we make use of the fact that VSELECT lowers
+      // to PBLENDVB which selects bytes based just on the sign bit.
+      if (Subtarget->hasSSE41()) {
+        V0 = DAG.getBitcast(VT, V0);
+        V1 = DAG.getBitcast(VT, V1);
+        Sel = DAG.getBitcast(VT, Sel);
+        return DAG.getBitcast(SelVT,
+                              DAG.getNode(ISD::VSELECT, dl, VT, Sel, V0, V1));
+      }
+      // On pre-SSE41 targets we test for the sign bit by comparing to
+      // zero - a negative value will set all bits of the lanes to true
+      // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
+      SDValue Z = getZeroVector(SelVT, Subtarget, DAG, dl);
+      SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
+      return DAG.getNode(ISD::VSELECT, dl, SelVT, C, V0, V1);
+    };
  
-    // a += a
-    Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);
-    OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
-    OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
+    // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
+    // We can safely do this using i16 shifts as we're only interested in
+    // the 3 lower bits of each byte.
+    Amt = DAG.getBitcast(ExtVT, Amt);
+    Amt = DAG.getNode(ISD::SHL, dl, ExtVT, Amt, DAG.getConstant(5, dl, ExtVT));
+    Amt = DAG.getBitcast(VT, Amt);
+
+    if (Op->getOpcode() == ISD::SHL || Op->getOpcode() == ISD::SRL) {
+      // r = VSELECT(r, shift(r, 4), a);
+      SDValue M =
+          DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
+      R = SignBitSelect(VT, Amt, M, R);
+
+      // a += a
+      Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
+
+      // r = VSELECT(r, shift(r, 2), a);
+      M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
+      R = SignBitSelect(VT, Amt, M, R);
+
+      // a += a
+      Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
+
+      // return VSELECT(r, shift(r, 1), a);
+      M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
+      R = SignBitSelect(VT, Amt, M, R);
+      return R;
+    }
  
-    // return VSELECT(r, r+r, a);
-    R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel,
-                    DAG.getNode(ISD::ADD, dl, VT, R, R), R);
-    return R;
+    if (Op->getOpcode() == ISD::SRA) {
+      // For SRA we need to unpack each byte to the higher byte of a i16 vector
+      // so we can correctly sign extend. We don't care what happens to the
+      // lower byte.
+      SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), Amt);
+      SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), Amt);
+      SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), R);
+      SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), R);
+      ALo = DAG.getBitcast(ExtVT, ALo);
+      AHi = DAG.getBitcast(ExtVT, AHi);
+      RLo = DAG.getBitcast(ExtVT, RLo);
+      RHi = DAG.getBitcast(ExtVT, RHi);
+
+      // r = VSELECT(r, shift(r, 4), a);
+      SDValue MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
+                                DAG.getConstant(4, dl, ExtVT));
+      SDValue MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
+                                DAG.getConstant(4, dl, ExtVT));
+      RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
+      RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
+
+      // a += a
+      ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
+      AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
+
+      // r = VSELECT(r, shift(r, 2), a);
+      MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
+                        DAG.getConstant(2, dl, ExtVT));
+      MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
+                        DAG.getConstant(2, dl, ExtVT));
+      RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
+      RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
+
+      // a += a
+      ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
+      AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
+
+      // r = VSELECT(r, shift(r, 1), a);
+      MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
+                        DAG.getConstant(1, dl, ExtVT));
+      MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
+                        DAG.getConstant(1, dl, ExtVT));
+      RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
+      RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
+
+      // Logical shift the result back to the lower byte, leaving a zero upper
+      // byte
+      // meaning that we can safely pack with PACKUSWB.
+      RLo =
+          DAG.getNode(ISD::SRL, dl, ExtVT, RLo, DAG.getConstant(8, dl, ExtVT));
+      RHi =
+          DAG.getNode(ISD::SRL, dl, ExtVT, RHi, DAG.getConstant(8, dl, ExtVT));
+      return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
+    }
    }
  
    // It's worth extending once and using the v8i32 shifts for 16-bit types, but
@@ -16931,10 +17616,10 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
      SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Amt, Z);
      SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, R, R);
      SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, R, R);
-    ALo = DAG.getNode(ISD::BITCAST, dl, ExtVT, ALo);
-    AHi = DAG.getNode(ISD::BITCAST, dl, ExtVT, AHi);
-    RLo = DAG.getNode(ISD::BITCAST, dl, ExtVT, RLo);
-    RHi = DAG.getNode(ISD::BITCAST, dl, ExtVT, RHi);
+    ALo = DAG.getBitcast(ExtVT, ALo);
+    AHi = DAG.getBitcast(ExtVT, AHi);
+    RLo = DAG.getBitcast(ExtVT, RLo);
+    RHi = DAG.getBitcast(ExtVT, RHi);
      SDValue Lo = DAG.getNode(Op.getOpcode(), dl, ExtVT, RLo, ALo);
      SDValue Hi = DAG.getNode(Op.getOpcode(), dl, ExtVT, RHi, AHi);
      Lo = DAG.getNode(ISD::SRL, dl, ExtVT, Lo, DAG.getConstant(16, dl, ExtVT));
@@ -16942,6 +17627,67 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
      return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
    }
  
+  if (VT == MVT::v8i16) {
+    unsigned ShiftOpcode = Op->getOpcode();
+
+    auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
+      // On SSE41 targets we make use of the fact that VSELECT lowers
+      // to PBLENDVB which selects bytes based just on the sign bit.
+      if (Subtarget->hasSSE41()) {
+        MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
+        V0 = DAG.getBitcast(ExtVT, V0);
+        V1 = DAG.getBitcast(ExtVT, V1);
+        Sel = DAG.getBitcast(ExtVT, Sel);
+        return DAG.getBitcast(
+            VT, DAG.getNode(ISD::VSELECT, dl, ExtVT, Sel, V0, V1));
+      }
+      // On pre-SSE41 targets we splat the sign bit - a negative value will
+      // set all bits of the lanes to true and VSELECT uses that in
+      // its OR(AND(V0,C),AND(V1,~C)) lowering.
+      SDValue C =
+          DAG.getNode(ISD::SRA, dl, VT, Sel, DAG.getConstant(15, dl, VT));
+      return DAG.getNode(ISD::VSELECT, dl, VT, C, V0, V1);
+    };
+
+    // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
+    if (Subtarget->hasSSE41()) {
+      // On SSE41 targets we need to replicate the shift mask in both
+      // bytes for PBLENDVB.
+      Amt = DAG.getNode(
+          ISD::OR, dl, VT,
+          DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(4, dl, VT)),
+          DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT)));
+    } else {
+      Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT));
+    }
+
+    // r = VSELECT(r, shift(r, 8), a);
+    SDValue M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(8, dl, VT));
+    R = SignBitSelect(Amt, M, R);
+
+    // a += a
+    Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
+
+    // r = VSELECT(r, shift(r, 4), a);
+    M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
+    R = SignBitSelect(Amt, M, R);
+
+    // a += a
+    Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
+
+    // r = VSELECT(r, shift(r, 2), a);
+    M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
+    R = SignBitSelect(Amt, M, R);
+
+    // a += a
+    Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
+
+    // return VSELECT(r, shift(r, 1), a);
+    M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
+    R = SignBitSelect(Amt, M, R);
+    return R;
+  }
+
    // Decompose 256-bit shifts into smaller 128-bit shifts.
    if (VT.is256BitVector()) {
      unsigned NumElems = VT.getVectorNumElements();
@@ -17293,7 +18039,7 @@ static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget,
  
      EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
      SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Elts);
-    SDValue ToV2F64 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, BV);
+    SDValue ToV2F64 = DAG.getBitcast(MVT::v2f64, BV);
      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64,
                         DAG.getIntPtrConstant(0, dl));
    }
@@ -17315,141 +18061,241 @@ static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget,
    return SDValue();
  }
  
-static SDValue LowerCTPOP(SDValue Op, const X86Subtarget *Subtarget,
-                          SelectionDAG &DAG) {
-  SDNode *Node = Op.getNode();
-  SDLoc dl(Node);
+/// Compute the horizontal sum of bytes in V for the elements of VT.
+///
+/// Requires V to be a byte vector and VT to be an integer vector type with
+/// wider elements than V's type. The width of the elements of VT determines
+/// how many bytes of V are summed horizontally to produce each element of the
+/// result.
+static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
+                                      const X86Subtarget *Subtarget,
+                                      SelectionDAG &DAG) {
+  SDLoc DL(V);
+  MVT ByteVecVT = V.getSimpleValueType();
+  MVT EltVT = VT.getVectorElementType();
+  int NumElts = VT.getVectorNumElements();
+  assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
+         "Expected value to have byte element type.");
+  assert(EltVT != MVT::i8 &&
+         "Horizontal byte sum only makes sense for wider elements!");
+  unsigned VecSize = VT.getSizeInBits();
+  assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
+
+  // PSADBW instruction horizontally add all bytes and leave the result in i64
+  // chunks, thus directly computes the pop count for v2i64 and v4i64.
+  if (EltVT == MVT::i64) {
+    SDValue Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
+    V = DAG.getNode(X86ISD::PSADBW, DL, ByteVecVT, V, Zeros);
+    return DAG.getBitcast(VT, V);
+  }
+
+  if (EltVT == MVT::i32) {
+    // We unpack the low half and high half into i32s interleaved with zeros so
+    // that we can use PSADBW to horizontally sum them. The most useful part of
+    // this is that it lines up the results of two PSADBW instructions to be
+    // two v2i64 vectors which concatenated are the 4 population counts. We can
+    // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
+    SDValue Zeros = getZeroVector(VT, Subtarget, DAG, DL);
+    SDValue Low = DAG.getNode(X86ISD::UNPCKL, DL, VT, V, Zeros);
+    SDValue High = DAG.getNode(X86ISD::UNPCKH, DL, VT, V, Zeros);
+
+    // Do the horizontal sums into two v2i64s.
+    Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
+    Low = DAG.getNode(X86ISD::PSADBW, DL, ByteVecVT,
+                      DAG.getBitcast(ByteVecVT, Low), Zeros);
+    High = DAG.getNode(X86ISD::PSADBW, DL, ByteVecVT,
+                       DAG.getBitcast(ByteVecVT, High), Zeros);
+
+    // Merge them together.
+    MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
+    V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
+                    DAG.getBitcast(ShortVecVT, Low),
+                    DAG.getBitcast(ShortVecVT, High));
+
+    return DAG.getBitcast(VT, V);
+  }
+
+  // The only element type left is i16.
+  assert(EltVT == MVT::i16 && "Unknown how to handle type");
+
+  // To obtain pop count for each i16 element starting from the pop count for
+  // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
+  // right by 8. It is important to shift as i16s as i8 vector shift isn't
+  // directly supported.
+  SmallVector<SDValue, 16> Shifters(NumElts, DAG.getConstant(8, DL, EltVT));
+  SDValue Shifter = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Shifters);
+  SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), Shifter);
+  V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
+                  DAG.getBitcast(ByteVecVT, V));
+  return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), Shifter);
+}
+
+static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, SDLoc DL,
+                                        const X86Subtarget *Subtarget,
+                                        SelectionDAG &DAG) {
+  MVT VT = Op.getSimpleValueType();
+  MVT EltVT = VT.getVectorElementType();
+  unsigned VecSize = VT.getSizeInBits();
  
-  Op = Op.getOperand(0);
-  EVT VT = Op.getValueType();
-  assert((VT.is128BitVector() || VT.is256BitVector()) &&
-         "CTPOP lowering only implemented for 128/256-bit wide vector types");
+  // Implement a lookup table in register by using an algorithm based on:
+  // http://wm.ite.pl/articles/sse-popcount.html
+  //
+  // The general idea is that every lower byte nibble in the input vector is an
+  // index into a in-register pre-computed pop count table. We then split up the
+  // input vector in two new ones: (1) a vector with only the shifted-right
+  // higher nibbles for each byte and (2) a vector with the lower nibbles (and
+  // masked out higher ones) for each byte. PSHUB is used separately with both
+  // to index the in-register table. Next, both are added and the result is a
+  // i8 vector where each element contains the pop count for input byte.
+  //
+  // To obtain the pop count for elements != i8, we follow up with the same
+  // approach and use additional tricks as described below.
+  //
+  const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
+                       /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
+                       /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
+                       /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
+
+  int NumByteElts = VecSize / 8;
+  MVT ByteVecVT = MVT::getVectorVT(MVT::i8, NumByteElts);
+  SDValue In = DAG.getBitcast(ByteVecVT, Op);
+  SmallVector<SDValue, 16> LUTVec;
+  for (int i = 0; i < NumByteElts; ++i)
+    LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
+  SDValue InRegLUT = DAG.getNode(ISD::BUILD_VECTOR, DL, ByteVecVT, LUTVec);
+  SmallVector<SDValue, 16> Mask0F(NumByteElts,
+                                  DAG.getConstant(0x0F, DL, MVT::i8));
+  SDValue M0F = DAG.getNode(ISD::BUILD_VECTOR, DL, ByteVecVT, Mask0F);
+
+  // High nibbles
+  SmallVector<SDValue, 16> Four(NumByteElts, DAG.getConstant(4, DL, MVT::i8));
+  SDValue FourV = DAG.getNode(ISD::BUILD_VECTOR, DL, ByteVecVT, Four);
+  SDValue HighNibbles = DAG.getNode(ISD::SRL, DL, ByteVecVT, In, FourV);
+
+  // Low nibbles
+  SDValue LowNibbles = DAG.getNode(ISD::AND, DL, ByteVecVT, In, M0F);
+
+  // The input vector is used as the shuffle mask that index elements into the
+  // LUT. After counting low and high nibbles, add the vector to obtain the
+  // final pop count per i8 element.
+  SDValue HighPopCnt =
+      DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, HighNibbles);
+  SDValue LowPopCnt =
+      DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, LowNibbles);
+  SDValue PopCnt = DAG.getNode(ISD::ADD, DL, ByteVecVT, HighPopCnt, LowPopCnt);
  
-  unsigned NumElts = VT.getVectorNumElements();
-  EVT EltVT = VT.getVectorElementType();
-  unsigned Len = EltVT.getSizeInBits();
+  if (EltVT == MVT::i8)
+    return PopCnt;
+
+  return LowerHorizontalByteSum(PopCnt, VT, Subtarget, DAG);
+}
+
+static SDValue LowerVectorCTPOPBitmath(SDValue Op, SDLoc DL,
+                                       const X86Subtarget *Subtarget,
+                                       SelectionDAG &DAG) {
+  MVT VT = Op.getSimpleValueType();
+  assert(VT.is128BitVector() &&
+         "Only 128-bit vector bitmath lowering supported.");
+
+  int VecSize = VT.getSizeInBits();
+  MVT EltVT = VT.getVectorElementType();
+  int Len = EltVT.getSizeInBits();
  
    // This is the vectorized version of the "best" algorithm from
    // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
    // with a minor tweak to use a series of adds + shifts instead of vector
-  // multiplications. Implemented for the v2i64, v4i64, v4i32, v8i32 types:
-  //
-  //  v2i64, v4i64, v4i32 => Only profitable w/ popcnt disabled
-  //  v8i32 => Always profitable
-  //
-  // FIXME: There a couple of possible improvements:
-  //
-  // 1) Support for i8 and i16 vectors (needs measurements if popcnt enabled).
-  // 2) Use strategies from http://wm.ite.pl/articles/sse-popcount.html
-  //
-  assert(EltVT.isInteger() && (Len == 32 || Len == 64) && Len % 8 == 0 &&
-         "CTPOP not implemented for this vector element type.");
+  // multiplications. Implemented for all integer vector types. We only use
+  // this when we don't have SSSE3 which allows a LUT-based lowering that is
+  // much faster, even faster than using native popcnt instructions.
+
+  auto GetShift = [&](unsigned OpCode, SDValue V, int Shifter) {
+    MVT VT = V.getSimpleValueType();
+    SmallVector<SDValue, 32> Shifters(
+        VT.getVectorNumElements(),
+        DAG.getConstant(Shifter, DL, VT.getVectorElementType()));
+    return DAG.getNode(OpCode, DL, VT, V,
+                       DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Shifters));
+  };
+  auto GetMask = [&](SDValue V, APInt Mask) {
+    MVT VT = V.getSimpleValueType();
+    SmallVector<SDValue, 32> Masks(
+        VT.getVectorNumElements(),
+        DAG.getConstant(Mask, DL, VT.getVectorElementType()));
+    return DAG.getNode(ISD::AND, DL, VT, V,
+                       DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Masks));
+  };
  
-  // X86 canonicalize ANDs to vXi64, generate the appropriate bitcasts to avoid
-  // extra legalization.
-  bool NeedsBitcast = EltVT == MVT::i32;
-  MVT BitcastVT = VT.is256BitVector() ? MVT::v4i64 : MVT::v2i64;
+  // We don't want to incur the implicit masks required to SRL vNi8 vectors on
+  // x86, so set the SRL type to have elements at least i16 wide. This is
+  // correct because all of our SRLs are followed immediately by a mask anyways
+  // that handles any bits that sneak into the high bits of the byte elements.
+  MVT SrlVT = Len > 8 ? VT : MVT::getVectorVT(MVT::i16, VecSize / 16);
  
-  SDValue Cst55 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x55)), dl,
-                                  EltVT);
-  SDValue Cst33 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x33)), dl,
-                                  EltVT);
-  SDValue Cst0F = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x0F)), dl,
-                                  EltVT);
+  SDValue V = Op;
  
    // v = v - ((v >> 1) & 0x55555555...)
-  SmallVector<SDValue, 8> Ones(NumElts, DAG.getConstant(1, dl, EltVT));
-  SDValue OnesV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ones);
-  SDValue Srl = DAG.getNode(ISD::SRL, dl, VT, Op, OnesV);
-  if (NeedsBitcast)
-    Srl = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Srl);
-
-  SmallVector<SDValue, 8> Mask55(NumElts, Cst55);
-  SDValue M55 = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask55);
-  if (NeedsBitcast)
-    M55 = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M55);
-
-  SDValue And = DAG.getNode(ISD::AND, dl, Srl.getValueType(), Srl, M55);
-  if (VT != And.getValueType())
-    And = DAG.getNode(ISD::BITCAST, dl, VT, And);
-  SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, Op, And);
+  SDValue Srl =
+      DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 1));
+  SDValue And = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x55)));
+  V = DAG.getNode(ISD::SUB, DL, VT, V, And);
  
    // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
-  SmallVector<SDValue, 8> Mask33(NumElts, Cst33);
-  SDValue M33 = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask33);
-  SmallVector<SDValue, 8> Twos(NumElts, DAG.getConstant(2, dl, EltVT));
-  SDValue TwosV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Twos);
+  SDValue AndLHS = GetMask(V, APInt::getSplat(Len, APInt(8, 0x33)));
+  Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 2));
+  SDValue AndRHS = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x33)));
+  V = DAG.getNode(ISD::ADD, DL, VT, AndLHS, AndRHS);
  
-  Srl = DAG.getNode(ISD::SRL, dl, VT, Sub, TwosV);
-  if (NeedsBitcast) {
-    Srl = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Srl);
-    M33 = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M33);
-    Sub = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Sub);
-  }
+  // v = (v + (v >> 4)) & 0x0F0F0F0F...
+  Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 4));
+  SDValue Add = DAG.getNode(ISD::ADD, DL, VT, V, Srl);
+  V = GetMask(Add, APInt::getSplat(Len, APInt(8, 0x0F)));
  
-  SDValue AndRHS = DAG.getNode(ISD::AND, dl, M33.getValueType(), Srl, M33);
-  SDValue AndLHS = DAG.getNode(ISD::AND, dl, M33.getValueType(), Sub, M33);
-  if (VT != AndRHS.getValueType()) {
-    AndRHS = DAG.getNode(ISD::BITCAST, dl, VT, AndRHS);
-    AndLHS = DAG.getNode(ISD::BITCAST, dl, VT, AndLHS);
-  }
-  SDValue Add = DAG.getNode(ISD::ADD, dl, VT, AndLHS, AndRHS);
+  // At this point, V contains the byte-wise population count, and we are
+  // merely doing a horizontal sum if necessary to get the wider element
+  // counts.
+  if (EltVT == MVT::i8)
+    return V;
  
-  // v = (v + (v >> 4)) & 0x0F0F0F0F...
-  SmallVector<SDValue, 8> Fours(NumElts, DAG.getConstant(4, dl, EltVT));
-  SDValue FoursV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Fours);
-  Srl = DAG.getNode(ISD::SRL, dl, VT, Add, FoursV);
-  Add = DAG.getNode(ISD::ADD, dl, VT, Add, Srl);
-
-  SmallVector<SDValue, 8> Mask0F(NumElts, Cst0F);
-  SDValue M0F = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask0F);
-  if (NeedsBitcast) {
-    Add = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Add);
-    M0F = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M0F);
-  }
-  And = DAG.getNode(ISD::AND, dl, M0F.getValueType(), Add, M0F);
-  if (VT != And.getValueType())
-    And = DAG.getNode(ISD::BITCAST, dl, VT, And);
-
-  // The algorithm mentioned above uses:
-  //    v = (v * 0x01010101...) >> (Len - 8)
-  //
-  // Change it to use vector adds + vector shifts which yield faster results on
-  // Haswell than using vector integer multiplication.
-  //
-  // For i32 elements:
-  //    v = v + (v >> 8)
-  //    v = v + (v >> 16)
-  //
-  // For i64 elements:
-  //    v = v + (v >> 8)
-  //    v = v + (v >> 16)
-  //    v = v + (v >> 32)
-  //
-  Add = And;
-  SmallVector<SDValue, 8> Csts;
-  for (unsigned i = 8; i <= Len/2; i *= 2) {
-    Csts.assign(NumElts, DAG.getConstant(i, dl, EltVT));
-    SDValue CstsV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Csts);
-    Srl = DAG.getNode(ISD::SRL, dl, VT, Add, CstsV);
-    Add = DAG.getNode(ISD::ADD, dl, VT, Add, Srl);
-    Csts.clear();
+  return LowerHorizontalByteSum(
+      DAG.getBitcast(MVT::getVectorVT(MVT::i8, VecSize / 8), V), VT, Subtarget,
+      DAG);
+}
+
+static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget *Subtarget,
+                                SelectionDAG &DAG) {
+  MVT VT = Op.getSimpleValueType();
+  // FIXME: Need to add AVX-512 support here!
+  assert((VT.is256BitVector() || VT.is128BitVector()) &&
+         "Unknown CTPOP type to handle");
+  SDLoc DL(Op.getNode());
+  SDValue Op0 = Op.getOperand(0);
+
+  if (!Subtarget->hasSSSE3()) {
+    // We can't use the fast LUT approach, so fall back on vectorized bitmath.
+    assert(VT.is128BitVector() && "Only 128-bit vectors supported in SSE!");
+    return LowerVectorCTPOPBitmath(Op0, DL, Subtarget, DAG);
    }
  
-  // The result is on the least significant 6-bits on i32 and 7-bits on i64.
-  SDValue Cst3F = DAG.getConstant(APInt(Len, Len == 32 ? 0x3F : 0x7F), dl,
-                                  EltVT);
-  SmallVector<SDValue, 8> Cst3FV(NumElts, Cst3F);
-  SDValue M3F = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Cst3FV);
-  if (NeedsBitcast) {
-    Add = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Add);
-    M3F = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M3F);
+  if (VT.is256BitVector() && !Subtarget->hasInt256()) {
+    unsigned NumElems = VT.getVectorNumElements();
+
+    // Extract each 128-bit vector, compute pop count and concat the result.
+    SDValue LHS = Extract128BitVector(Op0, 0, DAG, DL);
+    SDValue RHS = Extract128BitVector(Op0, NumElems/2, DAG, DL);
+
+    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
+                       LowerVectorCTPOPInRegLUT(LHS, DL, Subtarget, DAG),
+                       LowerVectorCTPOPInRegLUT(RHS, DL, Subtarget, DAG));
    }
-  And = DAG.getNode(ISD::AND, dl, M3F.getValueType(), Add, M3F);
-  if (VT != And.getValueType())
-    And = DAG.getNode(ISD::BITCAST, dl, VT, And);
  
-  return And;
+  return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
+}
+
+static SDValue LowerCTPOP(SDValue Op, const X86Subtarget *Subtarget,
+                          SelectionDAG &DAG) {
+  assert(Op.getValueType().isVector() &&
+         "We only do custom lowering for vector population count.");
+  return LowerVectorCTPOP(Op, Subtarget, DAG);
  }
  
  static SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) {
@@ -17545,7 +18391,8 @@ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget *Subtarget,
    // the results are returned via SRet in memory.
    const char *LibcallName =  isF64 ? "__sincos_stret" : "__sincosf_stret";
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-  SDValue Callee = DAG.getExternalSymbol(LibcallName, TLI.getPointerTy());
+  SDValue Callee =
+      DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
  
    Type *RetTy = isF64
      ? (Type*)StructType::get(ArgTy, ArgTy, nullptr)
@@ -17840,8 +18687,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
                                       MVT::f64);
      SDValue VBias = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2f64, Bias, Bias);
      SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
-                             DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, VBias));
-    Or = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or);
+                             DAG.getBitcast(MVT::v2i64, VBias));
+    Or = DAG.getBitcast(MVT::v2f64, Or);
      SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
      Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
      return;
@@ -17964,7 +18811,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
      EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
      SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
                                     MVT::v2f64, N->getOperand(0));
-    SDValue ToVecInt = DAG.getNode(ISD::BITCAST, dl, WiderVT, Expanded);
+    SDValue ToVecInt = DAG.getBitcast(WiderVT, Expanded);
  
      if (ExperimentalVectorWideningLegalization) {
        // If we are legalizing vectors by widening, we already have the desired
@@ -17994,7 +18841,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
    case X86ISD::FANDN:              return "X86ISD::FANDN";
    case X86ISD::FOR:                return "X86ISD::FOR";
    case X86ISD::FXOR:               return "X86ISD::FXOR";
-  case X86ISD::FSRL:               return "X86ISD::FSRL";
    case X86ISD::FILD:               return "X86ISD::FILD";
    case X86ISD::FILD_FLAG:          return "X86ISD::FILD_FLAG";
    case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
@@ -18045,10 +18891,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
    case X86ISD::HSUB:               return "X86ISD::HSUB";
    case X86ISD::FHADD:              return "X86ISD::FHADD";
    case X86ISD::FHSUB:              return "X86ISD::FHSUB";
-  case X86ISD::UMAX:               return "X86ISD::UMAX";
-  case X86ISD::UMIN:               return "X86ISD::UMIN";
-  case X86ISD::SMAX:               return "X86ISD::SMAX";
-  case X86ISD::SMIN:               return "X86ISD::SMIN";
+  case X86ISD::ABS:                return "X86ISD::ABS";
    case X86ISD::FMAX:               return "X86ISD::FMAX";
    case X86ISD::FMAX_RND:           return "X86ISD::FMAX_RND";
    case X86ISD::FMIN:               return "X86ISD::FMIN";
@@ -18057,6 +18900,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
    case X86ISD::FMINC:              return "X86ISD::FMINC";
    case X86ISD::FRSQRT:             return "X86ISD::FRSQRT";
    case X86ISD::FRCP:               return "X86ISD::FRCP";
+  case X86ISD::EXTRQI:             return "X86ISD::EXTRQI";
+  case X86ISD::INSERTQI:           return "X86ISD::INSERTQI";
    case X86ISD::TLSADDR:            return "X86ISD::TLSADDR";
    case X86ISD::TLSBASEADDR:        return "X86ISD::TLSBASEADDR";
    case X86ISD::TLSCALL:            return "X86ISD::TLSCALL";
@@ -18078,6 +18923,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
    case X86ISD::VINSERT:            return "X86ISD::VINSERT";
    case X86ISD::VFPEXT:             return "X86ISD::VFPEXT";
    case X86ISD::VFPROUND:           return "X86ISD::VFPROUND";
+  case X86ISD::CVTDQ2PD:           return "X86ISD::CVTDQ2PD";
+  case X86ISD::CVTUDQ2PD:          return "X86ISD::CVTUDQ2PD";
    case X86ISD::VSHLDQ:             return "X86ISD::VSHLDQ";
    case X86ISD::VSRLDQ:             return "X86ISD::VSRLDQ";
    case X86ISD::VSHL:               return "X86ISD::VSHL";
@@ -18121,6 +18968,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
    case X86ISD::PSHUFHW:            return "X86ISD::PSHUFHW";
    case X86ISD::PSHUFLW:            return "X86ISD::PSHUFLW";
    case X86ISD::SHUFP:              return "X86ISD::SHUFP";
+  case X86ISD::SHUF128:            return "X86ISD::SHUF128";
    case X86ISD::MOVLHPS:            return "X86ISD::MOVLHPS";
    case X86ISD::MOVLHPD:            return "X86ISD::MOVLHPD";
    case X86ISD::MOVHLPS:            return "X86ISD::MOVHLPS";
@@ -18143,8 +18991,11 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
    case X86ISD::VPERMV3:            return "X86ISD::VPERMV3";
    case X86ISD::VPERMIV3:           return "X86ISD::VPERMIV3";
    case X86ISD::VPERMI:             return "X86ISD::VPERMI";
+  case X86ISD::VFIXUPIMM:          return "X86ISD::VFIXUPIMM";
+  case X86ISD::VRANGE:             return "X86ISD::VRANGE";
    case X86ISD::PMULUDQ:            return "X86ISD::PMULUDQ";
    case X86ISD::PMULDQ:             return "X86ISD::PMULDQ";
+  case X86ISD::PSADBW:             return "X86ISD::PSADBW";
    case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
    case X86ISD::VAARG_64:           return "X86ISD::VAARG_64";
    case X86ISD::WIN_ALLOCA:         return "X86ISD::WIN_ALLOCA";
@@ -18184,16 +19035,26 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
    case X86ISD::FSUB_RND:           return "X86ISD::FSUB_RND";
    case X86ISD::FMUL_RND:           return "X86ISD::FMUL_RND";
    case X86ISD::FDIV_RND:           return "X86ISD::FDIV_RND";
+  case X86ISD::FSQRT_RND:          return "X86ISD::FSQRT_RND";
+  case X86ISD::FGETEXP_RND:        return "X86ISD::FGETEXP_RND";
+  case X86ISD::SCALEF:             return "X86ISD::SCALEF";
    case X86ISD::ADDS:               return "X86ISD::ADDS";
    case X86ISD::SUBS:               return "X86ISD::SUBS";
+  case X86ISD::AVG:                return "X86ISD::AVG";
+  case X86ISD::MULHRS:             return "X86ISD::MULHRS";
+  case X86ISD::SINT_TO_FP_RND:     return "X86ISD::SINT_TO_FP_RND";
+  case X86ISD::UINT_TO_FP_RND:     return "X86ISD::UINT_TO_FP_RND";
+  case X86ISD::FP_TO_SINT_RND:     return "X86ISD::FP_TO_SINT_RND";
+  case X86ISD::FP_TO_UINT_RND:     return "X86ISD::FP_TO_UINT_RND";
    }
    return nullptr;
  }
  
  // isLegalAddressingMode - Return true if the addressing mode represented
  // by AM is legal for this target, for a load/store of the specified type.
-bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM,
-                                              Type *Ty) const {
+bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
+                                              const AddrMode &AM, Type *Ty,
+                                              unsigned AS) const {
    // X86 supports extremely general addressing modes.
    CodeModel::Model M = getTargetMachine().getCodeModel();
    Reloc::Model R = getTargetMachine().getRelocationModel();
@@ -18339,7 +19200,7 @@ bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const { return true; }
  
  bool
  X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
-  if (!(Subtarget->hasFMA() || Subtarget->hasFMA4()))
+  if (!(Subtarget->hasFMA() || Subtarget->hasFMA4() || Subtarget->hasAVX512()))
      return false;
  
    VT = VT.getScalarType();
@@ -19144,7 +20005,7 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI,
  
    MachineRegisterInfo &MRI = MF->getRegInfo();
    const TargetRegisterClass *AddrRegClass =
-    getRegClassFor(getPointerTy());
+      getRegClassFor(getPointerTy(MF->getDataLayout()));
  
    unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
      bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
@@ -19245,7 +20106,8 @@ X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI,
  
    assert(!Subtarget->isTargetMachO());
  
-  X86FrameLowering::emitStackProbeCall(*BB->getParent(), *BB, MI, DL);
+  Subtarget->getFrameLowering()->emitStackProbeCall(*BB->getParent(), *BB, MI,
+                                                    DL);
  
    MI->eraseFromParent();   // The pseudo instruction is gone now.
    return BB;
@@ -19338,7 +20200,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
  
    MemOpndSlot = CurOp;
  
-  MVT PVT = getPointerTy();
+  MVT PVT = getPointerTy(MF->getDataLayout());
    assert((PVT == MVT::i64 || PVT == MVT::i32) &&
           "Invalid Pointer Size!");
  
@@ -19470,7 +20332,7 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
    MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
    MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
  
-  MVT PVT = getPointerTy();
+  MVT PVT = getPointerTy(MF->getDataLayout());
    assert((PVT == MVT::i64 || PVT == MVT::i32) &&
           "Invalid Pointer Size!");
  
@@ -19523,6 +20385,7 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
  // Replace 213-type (isel default) FMA3 instructions with 231-type for
  // accumulator loops. Writing back to the accumulator allows the coalescer
  // to remove extra copies in the loop.
+// FIXME: Do this on AVX512.  We don't support 231 variants yet (PR23937).
  MachineBasicBlock *
  X86TargetLowering::emitFMA3Instr(MachineInstr *MI,
                                   MachineBasicBlock *MBB) const {
@@ -20028,7 +20891,7 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
                                   SDValue(ResNode.getNode(), 1));
          }
  
-        return DAG.getNode(ISD::BITCAST, dl, VT, ResNode);
+        return DAG.getBitcast(VT, ResNode);
        }
      }
  
@@ -20087,7 +20950,7 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
  
    // Just remove no-op shuffle masks.
    if (Mask.size() == 1) {
-    DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Input),
+    DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Input),
                    /*AddTo*/ true);
      return true;
    }
@@ -20123,14 +20986,14 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
        }
        if (Depth == 1 && Root->getOpcode() == Shuffle)
          return false; // Nothing to do!
-      Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
+      Op = DAG.getBitcast(ShuffleVT, Input);
        DCI.AddToWorklist(Op.getNode());
        if (Shuffle == X86ISD::MOVDDUP)
          Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op);
        else
          Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
        DCI.AddToWorklist(Op.getNode());
-      DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
+      DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Op),
                      /*AddTo*/ true);
        return true;
      }
@@ -20141,11 +21004,11 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
        MVT ShuffleVT = MVT::v4f32;
        if (Depth == 1 && Root->getOpcode() == Shuffle)
          return false; // Nothing to do!
-      Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
+      Op = DAG.getBitcast(ShuffleVT, Input);
        DCI.AddToWorklist(Op.getNode());
        Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op);
        DCI.AddToWorklist(Op.getNode());
-      DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
+      DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Op),
                      /*AddTo*/ true);
        return true;
      }
@@ -20155,11 +21018,11 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
        MVT ShuffleVT = MVT::v4f32;
        if (Depth == 1 && Root->getOpcode() == Shuffle)
          return false; // Nothing to do!
-      Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
+      Op = DAG.getBitcast(ShuffleVT, Input);
        DCI.AddToWorklist(Op.getNode());
        Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
        DCI.AddToWorklist(Op.getNode());
-      DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
+      DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Op),
                      /*AddTo*/ true);
        return true;
      }
@@ -20189,11 +21052,11 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
      default:
        llvm_unreachable("Impossible mask size!");
      };
-    Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
+    Op = DAG.getBitcast(ShuffleVT, Input);
      DCI.AddToWorklist(Op.getNode());
      Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
      DCI.AddToWorklist(Op.getNode());
-    DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
+    DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Op),
                    /*AddTo*/ true);
      return true;
    }
@@ -20222,14 +21085,14 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
        PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
      }
      MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
-    Op = DAG.getNode(ISD::BITCAST, DL, ByteVT, Input);
+    Op = DAG.getBitcast(ByteVT, Input);
      DCI.AddToWorklist(Op.getNode());
      SDValue PSHUFBMaskOp =
          DAG.getNode(ISD::BUILD_VECTOR, DL, ByteVT, PSHUFBMask);
      DCI.AddToWorklist(PSHUFBMaskOp.getNode());
      Op = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Op, PSHUFBMaskOp);
      DCI.AddToWorklist(Op.getNode());
-    DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
+    DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Op),
                    /*AddTo*/ true);
      return true;
    }
@@ -20401,7 +21264,7 @@ static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
  #ifndef NDEBUG
      for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
        for (int j = 0; j < LaneElts; ++j)
-        assert(Mask[j] == Mask[i * LaneElts + j] - LaneElts &&
+        assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
                 "Mask doesn't repeat in high 128-bit lanes!");
  #endif
      Mask.resize(LaneElts);
@@ -20532,7 +21395,7 @@ combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
      SDValue W = Chain.pop_back_val();
  
      if (V.getValueType() != W.getOperand(0).getValueType())
-      V = DAG.getNode(ISD::BITCAST, DL, W.getOperand(0).getValueType(), V);
+      V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
  
      switch (W.getOpcode()) {
      default:
@@ -20551,7 +21414,7 @@ combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
      }
    }
    if (V.getValueType() != N.getValueType())
-    V = DAG.getNode(ISD::BITCAST, DL, N.getValueType(), V);
+    V = DAG.getBitcast(N.getValueType(), V);
  
    // Return the new chain to replace N.
    return V;
@@ -20668,12 +21531,12 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,
        DMask[DOffset + 0] = DOffset + 1;
        DMask[DOffset + 1] = DOffset + 0;
        MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
-      V = DAG.getNode(ISD::BITCAST, DL, DVT, V);
+      V = DAG.getBitcast(DVT, V);
        DCI.AddToWorklist(V.getNode());
        V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
                        getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
        DCI.AddToWorklist(V.getNode());
-      return DAG.getNode(ISD::BITCAST, DL, VT, V);
+      return DAG.getBitcast(VT, V);
      }
  
      // Look for shuffle patterns which can be implemented as a single unpack.
@@ -20704,7 +21567,7 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,
          if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
              makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
            // We can replace all three shuffles with an unpack.
-          V = DAG.getNode(ISD::BITCAST, DL, VT, D.getOperand(0));
+          V = DAG.getBitcast(VT, D.getOperand(0));
            DCI.AddToWorklist(V.getNode());
            return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
                                                  : X86ISD::UNPCKH,
@@ -20848,8 +21711,8 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
          CanFold = SVOp->getMaskElt(i) < 0;
  
        if (CanFold) {
-        SDValue BC00 = DAG.getNode(ISD::BITCAST, dl, VT, BC0.getOperand(0));
-        SDValue BC01 = DAG.getNode(ISD::BITCAST, dl, VT, BC0.getOperand(1));
+        SDValue BC00 = DAG.getBitcast(VT, BC0.getOperand(0));
+        SDValue BC01 = DAG.getBitcast(VT, BC0.getOperand(1));
          SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01);
          return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, &SVOp->getMask()[0]);
        }
@@ -20863,8 +21726,7 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
    for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
      Elts.push_back(getShuffleScalarElt(N, i, DAG, 0));
  
-  SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true);
-  if (LD.getNode())
+  if (SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true))
      return LD;
  
    if (isTargetShuffle(N->getOpcode())) {
@@ -20965,7 +21827,7 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
    // alignment is valid.
    unsigned Align = LN0->getAlignment();
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-  unsigned NewAlign = TLI.getDataLayout()->getABITypeAlignment(
+  unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment(
        EltVT.getTypeForEVT(*DAG.getContext()));
  
    if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT))
@@ -20981,7 +21843,7 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
    Shuffle = DAG.getVectorShuffle(CurrentVT, dl,
                                   InVec.getOperand(0), Shuffle,
                                   &ShuffleMask[0]);
-  Shuffle = DAG.getNode(ISD::BITCAST, dl, OriginalVT, Shuffle);
+  Shuffle = DAG.getBitcast(OriginalVT, Shuffle);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
                       EltNo);
  }
@@ -21012,8 +21874,7 @@ static SDValue PerformBITCASTCombine(SDNode *N, SelectionDAG &DAG) {
  /// use 64-bit extracts and shifts.
  static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
                                           TargetLowering::DAGCombinerInfo &DCI) {
-  SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI);
-  if (NewOp.getNode())
+  if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
      return NewOp;
  
    SDValue InputVector = N->getOperand(0);
@@ -21101,15 +21962,16 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
    SDValue Vals[4];
  
    if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) {
-    SDValue Cst = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, InputVector);
-    EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy();
+    SDValue Cst = DAG.getBitcast(MVT::v2i64, InputVector);
+    auto &DL = DAG.getDataLayout();
+    EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy(DL);
      SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
        DAG.getConstant(0, dl, VecIdxTy));
      SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
        DAG.getConstant(1, dl, VecIdxTy));
  
-    SDValue ShAmt = DAG.getConstant(32, dl,
-      DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64));
+    SDValue ShAmt = DAG.getConstant(
+        32, dl, DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64, DL));
      Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf);
      Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
        DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt));
@@ -21128,10 +21990,11 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
      // Replace each use (extract) with a load of the appropriate element.
      for (unsigned i = 0; i < 4; ++i) {
        uint64_t Offset = EltSize * i;
-      SDValue OffsetVal = DAG.getConstant(Offset, dl, TLI.getPointerTy());
+      auto PtrVT = TLI.getPointerTy(DAG.getDataLayout());
+      SDValue OffsetVal = DAG.getConstant(Offset, dl, PtrVT);
  
-      SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(),
-                                       StackPtr, OffsetVal);
+      SDValue ScalarAddr =
+          DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, OffsetVal);
  
        // Load the scalar.
        Vals[i] = DAG.getLoad(ElementType, dl, Ch,
@@ -21211,16 +22074,16 @@ matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, SDValue RHS,
      default: break;
      case ISD::SETULT:
      case ISD::SETULE:
-      Opc = hasUnsigned ? X86ISD::UMIN : 0u; break;
+      Opc = hasUnsigned ? ISD::UMIN : 0; break;
      case ISD::SETUGT:
      case ISD::SETUGE:
-      Opc = hasUnsigned ? X86ISD::UMAX : 0u; break;
+      Opc = hasUnsigned ? ISD::UMAX : 0; break;
      case ISD::SETLT:
      case ISD::SETLE:
-      Opc = hasSigned ? X86ISD::SMIN : 0u; break;
+      Opc = hasSigned ? ISD::SMIN : 0; break;
      case ISD::SETGT:
      case ISD::SETGE:
-      Opc = hasSigned ? X86ISD::SMAX : 0u; break;
+      Opc = hasSigned ? ISD::SMAX : 0; break;
      }
    // Check for x CC y ? y : x -- a min/max with reversed arms.
    } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
@@ -21229,16 +22092,16 @@ matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, SDValue RHS,
      default: break;
      case ISD::SETULT:
      case ISD::SETULE:
-      Opc = hasUnsigned ? X86ISD::UMAX : 0u; break;
+      Opc = hasUnsigned ? ISD::UMAX : 0; break;
      case ISD::SETUGT:
      case ISD::SETUGE:
-      Opc = hasUnsigned ? X86ISD::UMIN : 0u; break;
+      Opc = hasUnsigned ? ISD::UMIN : 0; break;
      case ISD::SETLT:
      case ISD::SETLE:
-      Opc = hasSigned ? X86ISD::SMAX : 0u; break;
+      Opc = hasSigned ? ISD::SMAX : 0; break;
      case ISD::SETGT:
      case ISD::SETGE:
-      Opc = hasSigned ? X86ISD::SMIN : 0u; break;
+      Opc = hasSigned ? ISD::SMIN : 0; break;
      }
    }
  
@@ -21695,7 +22558,8 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
          // Check if the selector will be produced by CMPP*/PCMP*
          Cond.getOpcode() == ISD::SETCC &&
          // Check if SETCC has already been promoted
-        TLI.getSetCCResultType(*DAG.getContext(), VT) == CondVT) {
+        TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
+            CondVT) {
        bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
        bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
  
@@ -21717,13 +22581,13 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
        if (TValIsAllOnes && FValIsAllZeros)
          Ret = Cond;
        else if (TValIsAllOnes)
-        Ret = DAG.getNode(ISD::OR, DL, CondVT, Cond,
-                          DAG.getNode(ISD::BITCAST, DL, CondVT, RHS));
+        Ret =
+            DAG.getNode(ISD::OR, DL, CondVT, Cond, DAG.getBitcast(CondVT, RHS));
        else if (FValIsAllZeros)
          Ret = DAG.getNode(ISD::AND, DL, CondVT, Cond,
-                          DAG.getNode(ISD::BITCAST, DL, CondVT, LHS));
+                          DAG.getBitcast(CondVT, LHS));
  
-      return DAG.getNode(ISD::BITCAST, DL, VT, Ret);
+      return DAG.getBitcast(VT, Ret);
      }
    }
  
@@ -22415,7 +23279,7 @@ static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
        // We shift all of the values by one. In many cases we do not have
        // hardware support for this operation. This is better expressed as an ADD
        // of two values.
-      if (N1SplatC->getZExtValue() == 1)
+      if (N1SplatC->getAPIntValue() == 1)
          return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
      }
  
@@ -22456,16 +23320,14 @@ static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
  static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     const X86Subtarget *Subtarget) {
-  if (N->getOpcode() == ISD::SHL) {
-    SDValue V = PerformSHLCombine(N, DAG);
-    if (V.getNode()) return V;
-  }
+  if (N->getOpcode() == ISD::SHL)
+    if (SDValue V = PerformSHLCombine(N, DAG))
+      return V;
  
-  if (N->getOpcode() != ISD::SRA) {
-    // Try to fold this logical shift into a zero vector.
-    SDValue V = performShiftToAllZeros(N, DAG, Subtarget);
-    if (V.getNode()) return V;
-  }
+  // Try to fold this logical shift into a zero vector.
+  if (N->getOpcode() != ISD::SRA)
+    if (SDValue V = performShiftToAllZeros(N, DAG, Subtarget))
+      return V;
  
    return SDValue();
  }
@@ -22554,15 +23416,13 @@ static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG,
              // and work with those going forward.
              SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
                                             OnesOrZeroesF);
-            SDValue Vector32 = DAG.getNode(ISD::BITCAST, DL, MVT::v4f32,
-                                           Vector64);
+            SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
              OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
                                          Vector32, DAG.getIntPtrConstant(0, DL));
              IntVT = MVT::i32;
            }
  
-          SDValue OnesOrZeroesI = DAG.getNode(ISD::BITCAST, DL, IntVT,
-                                              OnesOrZeroesF);
+          SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
            SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
                                        DAG.getConstant(1, DL, IntVT));
            SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
@@ -22775,7 +23635,7 @@ static SDValue VectorZextCombine(SDNode *N, SelectionDAG &DAG,
  
    SDValue NewShuffle = DAG.getVectorShuffle(Shuffle->getValueType(0), DL,
      Shuffle->getOperand(0), DAG.getConstant(0, DL, SrcType), Mask);
-  return DAG.getNode(ISD::BITCAST, DL, N0.getValueType(), NewShuffle);
+  return DAG.getBitcast(N0.getValueType(), NewShuffle);
  }
  
  static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
@@ -22847,8 +23707,7 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
    if (DCI.isBeforeLegalizeOps())
      return SDValue();
  
-  SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget);
-  if (R.getNode())
+  if (SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget))
      return R;
  
    SDValue N0 = N->getOperand(0);
@@ -22916,7 +23775,7 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
          assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
                 "Unsupported VT for PSIGN");
          Mask = DAG.getNode(X86ISD::PSIGN, DL, MaskVT, X, Mask.getOperand(0));
-        return DAG.getNode(ISD::BITCAST, DL, VT, Mask);
+        return DAG.getBitcast(VT, Mask);
        }
        // PBLENDVB only available on SSE 4.1
        if (!Subtarget->hasSSE41())
@@ -22924,11 +23783,11 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
  
        EVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;
  
-      X = DAG.getNode(ISD::BITCAST, DL, BlendVT, X);
-      Y = DAG.getNode(ISD::BITCAST, DL, BlendVT, Y);
-      Mask = DAG.getNode(ISD::BITCAST, DL, BlendVT, Mask);
+      X = DAG.getBitcast(BlendVT, X);
+      Y = DAG.getBitcast(BlendVT, Y);
+      Mask = DAG.getBitcast(BlendVT, Mask);
        Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X);
-      return DAG.getNode(ISD::BITCAST, DL, VT, Mask);
+      return DAG.getBitcast(VT, Mask);
      }
    }
  
@@ -23043,11 +23902,9 @@ static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG,
    if (DCI.isBeforeLegalizeOps())
      return SDValue();
  
-  if (Subtarget->hasCMov()) {
-    SDValue RV = performIntegerAbsCombine(N, DAG);
-    if (RV.getNode())
+  if (Subtarget->hasCMov())
+    if (SDValue RV = performIntegerAbsCombine(N, DAG))
        return RV;
-  }
  
    return SDValue();
  }
@@ -23074,7 +23931,8 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
        return SDValue();
  
      SDValue Ptr = Ld->getBasePtr();
-    SDValue Increment = DAG.getConstant(16, dl, TLI.getPointerTy());
+    SDValue Increment =
+        DAG.getConstant(16, dl, TLI.getPointerTy(DAG.getDataLayout()));
  
      EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
                                    NumElems/2);
@@ -23129,7 +23987,7 @@ static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG,
    assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
  
    // Convert Src0 value
-  SDValue WideSrc0 = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mld->getSrc0());
+  SDValue WideSrc0 = DAG.getBitcast(WideVecVT, Mld->getSrc0());
    if (Mld->getSrc0().getOpcode() != ISD::UNDEF) {
      SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
      for (unsigned i = 0; i != NumElems; ++i)
@@ -23146,7 +24004,7 @@ static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG,
    SDValue Mask = Mld->getMask();
    if (Mask.getValueType() == VT) {
      // Mask and original value have the same type
-    NewMask = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mask);
+    NewMask = DAG.getBitcast(WideVecVT, Mask);
      SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
      for (unsigned i = 0; i != NumElems; ++i)
        ShuffleVec[i] = i * SizeRatio;
@@ -23214,7 +24072,7 @@ static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG,
  
    assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
  
-  SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mst->getValue());
+  SDValue WideVec = DAG.getBitcast(WideVecVT, Mst->getValue());
    SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
    for (unsigned i = 0; i != NumElems; ++i)
      ShuffleVec[i] = i * SizeRatio;
@@ -23231,7 +24089,7 @@ static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG,
    SDValue Mask = Mst->getMask();
    if (Mask.getValueType() == VT) {
      // Mask and original value have the same type
-    NewMask = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mask);
+    NewMask = DAG.getBitcast(WideVecVT, Mask);
      for (unsigned i = 0; i != NumElems; ++i)
        ShuffleVec[i] = i * SizeRatio;
      for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
@@ -23283,7 +24141,8 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
      SDValue Value0 = Extract128BitVector(StoredVal, 0, DAG, dl);
      SDValue Value1 = Extract128BitVector(StoredVal, NumElems/2, DAG, dl);
  
-    SDValue Stride = DAG.getConstant(16, dl, TLI.getPointerTy());
+    SDValue Stride =
+        DAG.getConstant(16, dl, TLI.getPointerTy(DAG.getDataLayout()));
      SDValue Ptr0 = St->getBasePtr();
      SDValue Ptr1 = DAG.getNode(ISD::ADD, dl, Ptr0.getValueType(), Ptr0, Stride);
  
@@ -23323,7 +24182,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
  
      assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
  
-    SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, St->getValue());
+    SDValue WideVec = DAG.getBitcast(WideVecVT, St->getValue());
      SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
      for (unsigned i = 0; i != NumElems; ++i)
        ShuffleVec[i] = i * SizeRatio;
@@ -23354,10 +24213,10 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
      EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
              StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
      assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
-    SDValue ShuffWide = DAG.getNode(ISD::BITCAST, dl, StoreVecVT, Shuff);
+    SDValue ShuffWide = DAG.getBitcast(StoreVecVT, Shuff);
      SmallVector<SDValue, 8> Chains;
-    SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8, dl,
-                                        TLI.getPointerTy());
+    SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, dl,
+                                        TLI.getPointerTy(DAG.getDataLayout()));
      SDValue Ptr = St->getBasePtr();
  
      // Perform one or more big stores into memory.
@@ -23495,7 +24354,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
      SDValue ExtOp0 = OldExtract.getOperand(0);
      unsigned VecSize = ExtOp0.getValueSizeInBits();
      EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
-    SDValue BitCast = DAG.getNode(ISD::BITCAST, dl, VecVT, ExtOp0);
+    SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
      SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
                                       BitCast, OldExtract.getOperand(1));
      return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
@@ -23802,7 +24661,7 @@ static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
    SDValue N0 = N->getOperand(0);
    EVT VT = N->getValueType(0);
    EVT SVT = VT.getScalarType();
-  EVT InVT = N0->getValueType(0);
+  EVT InVT = N0.getValueType();
    EVT InSVT = InVT.getScalarType();
    SDLoc DL(N);
  
@@ -23820,7 +24679,7 @@ static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
    }
  
    if (!DCI.isBeforeLegalizeOps()) {
-    if (N0.getValueType() == MVT::i1) {
+    if (InVT == MVT::i1) {
        SDValue Zero = DAG.getConstant(0, DL, VT);
        SDValue AllOnes =
          DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), DL, VT);
@@ -23829,23 +24688,37 @@ static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
      return SDValue();
    }
  
-  if (VT.isVector()) {
-    auto ExtendToVec128 = [&DAG](SDLoc DL, SDValue N) {
-      EVT InVT = N->getValueType(0);
+  if (VT.isVector() && Subtarget->hasSSE2()) {
+    auto ExtendVecSize = [&DAG](SDLoc DL, SDValue N, unsigned Size) {
+      EVT InVT = N.getValueType();
        EVT OutVT = EVT::getVectorVT(*DAG.getContext(), InVT.getScalarType(),
-                                   128 / InVT.getScalarSizeInBits());
-      SmallVector<SDValue, 8> Opnds(128 / InVT.getSizeInBits(),
+                                   Size / InVT.getScalarSizeInBits());
+      SmallVector<SDValue, 8> Opnds(Size / InVT.getSizeInBits(),
                                      DAG.getUNDEF(InVT));
        Opnds[0] = N;
        return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Opnds);
      };
  
+    // If target-size is less than 128-bits, extend to a type that would extend
+    // to 128 bits, extend that and extract the original target vector.
+    if (VT.getSizeInBits() < 128 && !(128 % VT.getSizeInBits()) &&
+        (SVT == MVT::i64 || SVT == MVT::i32 || SVT == MVT::i16) &&
+        (InSVT == MVT::i32 || InSVT == MVT::i16 || InSVT == MVT::i8)) {
+      unsigned Scale = 128 / VT.getSizeInBits();
+      EVT ExVT =
+          EVT::getVectorVT(*DAG.getContext(), SVT, 128 / SVT.getSizeInBits());
+      SDValue Ex = ExtendVecSize(DL, N0, Scale * InVT.getSizeInBits());
+      SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, ExVT, Ex);
+      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SExt,
+                         DAG.getIntPtrConstant(0, DL));
+    }
+
      // If target-size is 128-bits, then convert to ISD::SIGN_EXTEND_VECTOR_INREG
      // which ensures lowering to X86ISD::VSEXT (pmovsx*).
      if (VT.getSizeInBits() == 128 &&
          (SVT == MVT::i64 || SVT == MVT::i32 || SVT == MVT::i16) &&
          (InSVT == MVT::i32 || InSVT == MVT::i16 || InSVT == MVT::i8)) {
-      SDValue ExOp = ExtendToVec128(DL, N0);
+      SDValue ExOp = ExtendVecSize(DL, N0, 128);
        return DAG.getSignExtendVectorInReg(ExOp, DL, VT);
      }
  
@@ -23864,7 +24737,7 @@ static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
             ++i, Offset += NumSubElts) {
          SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0,
                                       DAG.getIntPtrConstant(Offset, DL));
-        SrcVec = ExtendToVec128(DL, SrcVec);
+        SrcVec = ExtendVecSize(DL, SrcVec, 128);
          SrcVec = DAG.getSignExtendVectorInReg(SrcVec, DL, SubVT);
          Opnds.push_back(SrcVec);
        }
@@ -23875,11 +24748,9 @@ static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
    if (!Subtarget->hasFp256())
      return SDValue();
  
-  if (VT.isVector() && VT.getSizeInBits() == 256) {
-    SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget);
-    if (R.getNode())
+  if (VT.isVector() && VT.getSizeInBits() == 256)
+    if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
        return R;
-  }
  
    return SDValue();
  }
@@ -23895,7 +24766,8 @@ static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG,
  
    EVT ScalarVT = VT.getScalarType();
    if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
-      (!Subtarget->hasFMA() && !Subtarget->hasFMA4()))
+      (!Subtarget->hasFMA() && !Subtarget->hasFMA4() &&
+       !Subtarget->hasAVX512()))
      return SDValue();
  
    SDValue A = N->getOperand(0);
@@ -23961,11 +24833,10 @@ static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG,
                           DAG.getConstant(1, dl, VT));
      }
    }
-  if (VT.is256BitVector()) {
-    SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget);
-    if (R.getNode())
+
+  if (VT.is256BitVector())
+    if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
        return R;
-  }
  
    // (i8,i32 zext (udivrem (i8 x, i8 y)) ->
    // (i8,i32 (udivrem_zext_hreg (i8 x, i8 y)
@@ -24169,10 +25040,7 @@ static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG,
    if (CC == X86::COND_B)
      return MaterializeSETB(DL, EFLAGS, DAG, N->getSimpleValueType(0));
  
-  SDValue Flags;
-
-  Flags = checkBoolTestSetCCCombine(EFLAGS, CC);
-  if (Flags.getNode()) {
+  if (SDValue Flags = checkBoolTestSetCCCombine(EFLAGS, CC)) {
      SDValue Cond = DAG.getConstant(CC, DL, MVT::i8);
      return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags);
    }
@@ -24191,10 +25059,7 @@ static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG,
    SDValue EFLAGS = N->getOperand(3);
    X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
  
-  SDValue Flags;
-
-  Flags = checkBoolTestSetCCCombine(EFLAGS, CC);
-  if (Flags.getNode()) {
+  if (SDValue Flags = checkBoolTestSetCCCombine(EFLAGS, CC)) {
      SDValue Cond = DAG.getConstant(CC, DL, MVT::i8);
      return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), Chain, Dest, Cond,
                         Flags);
@@ -24239,51 +25104,79 @@ static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
      // DAG.
      SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
      // The AND node needs bitcasts to/from an integer vector type around it.
-    SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
+    SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
      SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
                                   N->getOperand(0)->getOperand(0), MaskConst);
-    SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
+    SDValue Res = DAG.getBitcast(VT, NewAnd);
      return Res;
    }
  
    return SDValue();
  }
  
+static SDValue PerformUINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
+                                        const X86Subtarget *Subtarget) {
+  SDValue Op0 = N->getOperand(0);
+  EVT VT = N->getValueType(0);
+  EVT InVT = Op0.getValueType();
+  EVT InSVT = InVT.getScalarType();
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+  // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
+  // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
+  if (InVT.isVector() && (InSVT == MVT::i8 || InSVT == MVT::i16)) {
+    SDLoc dl(N);
+    EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
+                                 InVT.getVectorNumElements());
+    SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
+
+    if (TLI.isOperationLegal(ISD::UINT_TO_FP, DstVT))
+      return DAG.getNode(ISD::UINT_TO_FP, dl, VT, P);
+
+    return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
+  }
+
+  return SDValue();
+}
+
  static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
                                          const X86Subtarget *Subtarget) {
    // First try to optimize away the conversion entirely when it's
    // conditionally from a constant. Vectors only.
-  SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG);
-  if (Res != SDValue())
+  if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG))
      return Res;
  
    // Now move on to more general possibilities.
    SDValue Op0 = N->getOperand(0);
-  EVT InVT = Op0->getValueType(0);
+  EVT VT = N->getValueType(0);
+  EVT InVT = Op0.getValueType();
+  EVT InSVT = InVT.getScalarType();
  
-  // SINT_TO_FP(v4i8) -> SINT_TO_FP(SEXT(v4i8 to v4i32))
-  if (InVT == MVT::v8i8 || InVT == MVT::v4i8) {
+  // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
+  // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
+  if (InVT.isVector() && (InSVT == MVT::i8 || InSVT == MVT::i16)) {
      SDLoc dl(N);
-    MVT DstVT = InVT == MVT::v4i8 ? MVT::v4i32 : MVT::v8i32;
+    EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
+                                 InVT.getVectorNumElements());
      SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
-    return DAG.getNode(ISD::SINT_TO_FP, dl, N->getValueType(0), P);
+    return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
    }
  
    // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
    // a 32-bit target where SSE doesn't support i64->FP operations.
    if (Op0.getOpcode() == ISD::LOAD) {
      LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
-    EVT VT = Ld->getValueType(0);
+    EVT LdVT = Ld->getValueType(0);
  
      // This transformation is not supported if the result type is f16
-    if (N->getValueType(0) == MVT::f16)
+    if (VT == MVT::f16)
        return SDValue();
  
-    if (!Ld->isVolatile() && !N->getValueType(0).isVector() &&
+    if (!Ld->isVolatile() && !VT.isVector() &&
          ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
-        !Subtarget->is64Bit() && VT == MVT::i64) {
+        !Subtarget->is64Bit() && LdVT == MVT::i64) {
        SDValue FILDChain = Subtarget->getTargetLowering()->BuildFILD(
-          SDValue(N, 0), Ld->getValueType(0), Ld->getChain(), Op0, DAG);
+          SDValue(N, 0), LdVT, Ld->getChain(), Op0, DAG);
        DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
        return FILDChain;
      }
@@ -24442,8 +25335,7 @@ static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG,
      // In this case, the inner vzext is completely dead because we're going to
      // only look at bits inside of the low element. Just do the outer vzext on
      // a bitcast of the input to the inner.
-    return DAG.getNode(X86ISD::VZEXT, DL, VT,
-                       DAG.getNode(ISD::BITCAST, DL, OpVT, V));
+    return DAG.getNode(X86ISD::VZEXT, DL, VT, DAG.getBitcast(OpVT, V));
    }
  
    // Check if we can bypass extracting and re-inserting an element of an input
@@ -24465,7 +25357,7 @@ static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG,
            OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV,
                                DAG.getIntPtrConstant(0, DL));
          }
-        Op = DAG.getNode(ISD::BITCAST, DL, OpVT, OrigV);
+        Op = DAG.getBitcast(OpVT, OrigV);
          return DAG.getNode(X86ISD::VZEXT, DL, VT, Op);
        }
    }
@@ -24501,6 +25393,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
    case ISD::STORE:          return PerformSTORECombine(N, DAG, Subtarget);
    case ISD::MSTORE:         return PerformMSTORECombine(N, DAG, Subtarget);
    case ISD::SINT_TO_FP:     return PerformSINT_TO_FPCombine(N, DAG, Subtarget);
+  case ISD::UINT_TO_FP:     return PerformUINT_TO_FPCombine(N, DAG, Subtarget);
    case ISD::FADD:           return PerformFADDCombine(N, DAG, Subtarget);
    case ISD::FSUB:           return PerformFSUBCombine(N, DAG, Subtarget);
    case X86ISD::FXOR:
@@ -24723,7 +25616,7 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
          (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
           matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
        AsmPieces.clear();
-      const std::string &ConstraintsStr = IA->getConstraintString();
+      StringRef ConstraintsStr = IA->getConstraintString();
        SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
        array_pod_sort(AsmPieces.begin(), AsmPieces.end());
        if (clobbersFlagRegisters(AsmPieces))
@@ -24737,7 +25630,7 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
          matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
          matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
        AsmPieces.clear();
-      const std::string &ConstraintsStr = IA->getConstraintString();
+      StringRef ConstraintsStr = IA->getConstraintString();
        SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
        array_pod_sort(AsmPieces.begin(), AsmPieces.end());
        if (clobbersFlagRegisters(AsmPieces))
@@ -24764,7 +25657,7 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
  /// getConstraintType - Given a constraint letter, return the type of
  /// constraint it is for this target.
  X86TargetLowering::ConstraintType
-X86TargetLowering::getConstraintType(const std::string &Constraint) const {
+X86TargetLowering::getConstraintType(StringRef Constraint) const {
    if (Constraint.size() == 1) {
      switch (Constraint[0]) {
      case 'R':
@@ -25096,7 +25989,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
  
  std::pair<unsigned, const TargetRegisterClass *>
  X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
-                                                const std::string &Constraint,
+                                                StringRef Constraint,
                                                  MVT VT) const {
    // First, see if this is a constraint that directly corresponds to an LLVM
    // register class.
@@ -25246,71 +26139,40 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
    // Otherwise, check to see if this is a register class of the wrong value
    // type.  For example, we want to map "{ax},i32" -> {eax}, we don't want it to
    // turn into {ax},{dx}.
-  if (Res.second->hasType(VT))
+  // MVT::Other is used to specify clobber names.
+  if (Res.second->hasType(VT) || VT == MVT::Other)
      return Res;   // Correct type already, nothing to do.
  
-  // All of the single-register GCC register classes map their values onto
-  // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp".  If we
-  // really want an 8-bit or 32-bit register, map to the appropriate register
-  // class and return the appropriate register.
-  if (Res.second == &X86::GR16RegClass) {
-    if (VT == MVT::i8 || VT == MVT::i1) {
-      unsigned DestReg = 0;
-      switch (Res.first) {
-      default: break;
-      case X86::AX: DestReg = X86::AL; break;
-      case X86::DX: DestReg = X86::DL; break;
-      case X86::CX: DestReg = X86::CL; break;
-      case X86::BX: DestReg = X86::BL; break;
-      }
-      if (DestReg) {
-        Res.first = DestReg;
-        Res.second = &X86::GR8RegClass;
-      }
-    } else if (VT == MVT::i32 || VT == MVT::f32) {
-      unsigned DestReg = 0;
-      switch (Res.first) {
-      default: break;
-      case X86::AX: DestReg = X86::EAX; break;
-      case X86::DX: DestReg = X86::EDX; break;
-      case X86::CX: DestReg = X86::ECX; break;
-      case X86::BX: DestReg = X86::EBX; break;
-      case X86::SI: DestReg = X86::ESI; break;
-      case X86::DI: DestReg = X86::EDI; break;
-      case X86::BP: DestReg = X86::EBP; break;
-      case X86::SP: DestReg = X86::ESP; break;
-      }
-      if (DestReg) {
-        Res.first = DestReg;
-        Res.second = &X86::GR32RegClass;
-      }
-    } else if (VT == MVT::i64 || VT == MVT::f64) {
-      unsigned DestReg = 0;
-      switch (Res.first) {
-      default: break;
-      case X86::AX: DestReg = X86::RAX; break;
-      case X86::DX: DestReg = X86::RDX; break;
-      case X86::CX: DestReg = X86::RCX; break;
-      case X86::BX: DestReg = X86::RBX; break;
-      case X86::SI: DestReg = X86::RSI; break;
-      case X86::DI: DestReg = X86::RDI; break;
-      case X86::BP: DestReg = X86::RBP; break;
-      case X86::SP: DestReg = X86::RSP; break;
-      }
-      if (DestReg) {
-        Res.first = DestReg;
-        Res.second = &X86::GR64RegClass;
-      }
-    }
-  } else if (Res.second == &X86::FR32RegClass ||
-             Res.second == &X86::FR64RegClass ||
-             Res.second == &X86::VR128RegClass ||
-             Res.second == &X86::VR256RegClass ||
-             Res.second == &X86::FR32XRegClass ||
-             Res.second == &X86::FR64XRegClass ||
-             Res.second == &X86::VR128XRegClass ||
-             Res.second == &X86::VR256XRegClass ||
-             Res.second == &X86::VR512RegClass) {
+  // Get a matching integer of the correct size. i.e. "ax" with MVT::32 should
+  // return "eax". This should even work for things like getting 64bit integer
+  // registers when given an f64 type.
+  const TargetRegisterClass *Class = Res.second;
+  if (Class == &X86::GR8RegClass || Class == &X86::GR16RegClass ||
+      Class == &X86::GR32RegClass || Class == &X86::GR64RegClass) {
+    unsigned Size = VT.getSizeInBits();
+    MVT::SimpleValueType SimpleTy = Size == 1 || Size == 8 ? MVT::i8
+                                  : Size == 16 ? MVT::i16
+                                  : Size == 32 ? MVT::i32
+                                  : Size == 64 ? MVT::i64
+                                  : MVT::Other;
+    unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, SimpleTy);
+    if (DestReg > 0) {
+      Res.first = DestReg;
+      Res.second = SimpleTy == MVT::i8 ? &X86::GR8RegClass
+                 : SimpleTy == MVT::i16 ? &X86::GR16RegClass
+                 : SimpleTy == MVT::i32 ? &X86::GR32RegClass
+                 : &X86::GR64RegClass;
+      assert(Res.second->contains(Res.first) && "Register in register class");
+    } else {
+      // No register found/type mismatch.
+      Res.first = 0;
+      Res.second = nullptr;
+    }
+  } else if (Class == &X86::FR32RegClass || Class == &X86::FR64RegClass ||
+             Class == &X86::VR128RegClass || Class == &X86::VR256RegClass ||
+             Class == &X86::FR32XRegClass || Class == &X86::FR64XRegClass ||
+             Class == &X86::VR128XRegClass || Class == &X86::VR256XRegClass ||
+             Class == &X86::VR512RegClass) {
      // Handle references to XMM physical registers that got mapped into the
      // wrong class.  This can happen with constraints like {xmm0} where the
      // target independent register mapper will just pick the first match it can
@@ -25326,13 +26188,19 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
        Res.second = &X86::VR256RegClass;
      else if (X86::VR512RegClass.hasType(VT))
        Res.second = &X86::VR512RegClass;
+    else {
+      // Type mismatch and not a clobber: Return an error;
+      Res.first = 0;
+      Res.second = nullptr;
+    }
    }
  
    return Res;
  }
  
-int X86TargetLowering::getScalingFactorCost(const AddrMode &AM,
-                                            Type *Ty) const {
+int X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
+                                            const AddrMode &AM, Type *Ty,
+                                            unsigned AS) const {
    // Scaling factors are not free at all.
    // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
    // will take 2 allocations in the out of order engine instead of 1
@@ -25351,7 +26219,7 @@ int X86TargetLowering::getScalingFactorCost(const AddrMode &AM,
    // E.g., on Haswell:
    // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
    // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
-  if (isLegalAddressingMode(AM, Ty))
+  if (isLegalAddressingMode(DL, AM, Ty, AS))
      // Scale represents reg2 * scale, thus account for 1
      // as soon as we use a second register.
      return AM.Scale != 0;