Implementing named register intrinsics

[oota-llvm.git] / lib / Target / X86 / X86ISelLowering.cpp
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index 936699e9b5986964d5f439a0ebfbb607f075cab3..2f14d790742f2f77a8852af3c996a0492ecb86ce 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -22,6 +22,7 @@
  #include "llvm/ADT/SmallSet.h"
  #include "llvm/ADT/Statistic.h"
  #include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringSwitch.h"
  #include "llvm/ADT/VariadicFunction.h"
  #include "llvm/CodeGen/IntrinsicLowering.h"
  #include "llvm/CodeGen/MachineFrameInfo.h"
@@ -85,7 +86,8 @@ static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal,
    // If the input is a buildvector just emit a smaller one.
    if (Vec.getOpcode() == ISD::BUILD_VECTOR)
      return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
-                       Vec->op_begin()+NormalizedIdxVal, ElemsPerChunk);
+                       makeArrayRef(Vec->op_begin()+NormalizedIdxVal,
+                                    ElemsPerChunk));
  
    SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
    SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec,
@@ -266,10 +268,10 @@ void X86TargetLowering::resetOperationActions() {
  
      // The _ftol2 runtime function has an unusual calling conv, which
      // is modeled by a special pseudo-instruction.
-    setLibcallName(RTLIB::FPTOUINT_F64_I64, 0);
-    setLibcallName(RTLIB::FPTOUINT_F32_I64, 0);
-    setLibcallName(RTLIB::FPTOUINT_F64_I32, 0);
-    setLibcallName(RTLIB::FPTOUINT_F32_I32, 0);
+    setLibcallName(RTLIB::FPTOUINT_F64_I64, nullptr);
+    setLibcallName(RTLIB::FPTOUINT_F32_I64, nullptr);
+    setLibcallName(RTLIB::FPTOUINT_F64_I32, nullptr);
+    setLibcallName(RTLIB::FPTOUINT_F32_I32, nullptr);
    }
  
    if (Subtarget->isTargetDarwin()) {
@@ -826,7 +828,9 @@ void X86TargetLowering::resetOperationActions() {
      setOperationAction(ISD::FRINT, VT, Expand);
      setOperationAction(ISD::FNEARBYINT, VT, Expand);
      setOperationAction(ISD::SMUL_LOHI, VT, Expand);
+    setOperationAction(ISD::MULHS, VT, Expand);
      setOperationAction(ISD::UMUL_LOHI, VT, Expand);
+    setOperationAction(ISD::MULHU, VT, Expand);
      setOperationAction(ISD::SDIVREM, VT, Expand);
      setOperationAction(ISD::UDIVREM, VT, Expand);
      setOperationAction(ISD::FPOW, VT, Expand);
@@ -938,6 +942,10 @@ void X86TargetLowering::resetOperationActions() {
      setOperationAction(ISD::ADD,                MVT::v2i64, Legal);
      setOperationAction(ISD::MUL,                MVT::v4i32, Custom);
      setOperationAction(ISD::MUL,                MVT::v2i64, Custom);
+    setOperationAction(ISD::UMUL_LOHI,          MVT::v4i32, Custom);
+    setOperationAction(ISD::SMUL_LOHI,          MVT::v4i32, Custom);
+    setOperationAction(ISD::MULHU,              MVT::v8i16, Legal);
+    setOperationAction(ISD::MULHS,              MVT::v8i16, Legal);
      setOperationAction(ISD::SUB,                MVT::v16i8, Legal);
      setOperationAction(ISD::SUB,                MVT::v8i16, Legal);
      setOperationAction(ISD::SUB,                MVT::v4i32, Legal);
@@ -1105,9 +1113,6 @@ void X86TargetLowering::resetOperationActions() {
      setOperationAction(ISD::SHL,               MVT::v4i32, Custom);
  
      setOperationAction(ISD::SRA,               MVT::v4i32, Custom);
-
-    setOperationAction(ISD::SDIV,              MVT::v8i16, Custom);
-    setOperationAction(ISD::SDIV,              MVT::v4i32, Custom);
    }
  
    if (!TM.Options.UseSoftFloat && Subtarget->hasFp256()) {
@@ -1172,8 +1177,6 @@ void X86TargetLowering::resetOperationActions() {
      setOperationAction(ISD::SRA,               MVT::v16i16, Custom);
      setOperationAction(ISD::SRA,               MVT::v32i8, Custom);
  
-    setOperationAction(ISD::SDIV,              MVT::v16i16, Custom);
-
      setOperationAction(ISD::SETCC,             MVT::v32i8, Custom);
      setOperationAction(ISD::SETCC,             MVT::v16i16, Custom);
      setOperationAction(ISD::SETCC,             MVT::v8i32, Custom);
@@ -1226,9 +1229,12 @@ void X86TargetLowering::resetOperationActions() {
        setOperationAction(ISD::MUL,             MVT::v16i16, Legal);
        // Don't lower v32i8 because there is no 128-bit byte mul
  
-      setOperationAction(ISD::VSELECT,         MVT::v32i8, Legal);
+      setOperationAction(ISD::UMUL_LOHI,       MVT::v8i32, Custom);
+      setOperationAction(ISD::SMUL_LOHI,       MVT::v8i32, Custom);
+      setOperationAction(ISD::MULHU,           MVT::v16i16, Legal);
+      setOperationAction(ISD::MULHS,           MVT::v16i16, Legal);
  
-      setOperationAction(ISD::SDIV,            MVT::v8i32, Custom);
+      setOperationAction(ISD::VSELECT,         MVT::v32i8, Legal);
      } else {
        setOperationAction(ISD::ADD,             MVT::v4i64, Custom);
        setOperationAction(ISD::ADD,             MVT::v8i32, Custom);
@@ -1337,7 +1343,6 @@ void X86TargetLowering::resetOperationActions() {
      setOperationAction(ISD::FNEG,               MVT::v8f64, Custom);
      setOperationAction(ISD::FMA,                MVT::v8f64, Legal);
      setOperationAction(ISD::FMA,                MVT::v16f32, Legal);
-    setOperationAction(ISD::SDIV,               MVT::v16i32, Custom);
  
      setOperationAction(ISD::FP_TO_SINT,         MVT::i32, Legal);
      setOperationAction(ISD::FP_TO_UINT,         MVT::i32, Legal);
@@ -1498,9 +1503,9 @@ void X86TargetLowering::resetOperationActions() {
  
    if (!Subtarget->is64Bit()) {
      // These libcalls are not available in 32-bit.
-    setLibcallName(RTLIB::SHL_I128, 0);
-    setLibcallName(RTLIB::SRL_I128, 0);
-    setLibcallName(RTLIB::SRA_I128, 0);
+    setLibcallName(RTLIB::SHL_I128, nullptr);
+    setLibcallName(RTLIB::SRL_I128, nullptr);
+    setLibcallName(RTLIB::SRA_I128, nullptr);
    }
  
    // Combine sin / cos into one node or libcall if possible.
@@ -1516,6 +1521,15 @@ void X86TargetLowering::resetOperationActions() {
      }
    }
  
+  if (Subtarget->isTargetWin64()) {
+    setOperationAction(ISD::SDIV, MVT::i128, Custom);
+    setOperationAction(ISD::UDIV, MVT::i128, Custom);
+    setOperationAction(ISD::SREM, MVT::i128, Custom);
+    setOperationAction(ISD::UREM, MVT::i128, Custom);
+    setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
+    setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
+  }
+
    // We have target-specific dag combine patterns for the following nodes:
    setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
    setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
@@ -1738,7 +1752,7 @@ getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
  // FIXME: Why this routine is here? Move to RegInfo!
  std::pair<const TargetRegisterClass*, uint8_t>
  X86TargetLowering::findRepresentativeClass(MVT VT) const{
-  const TargetRegisterClass *RRC = 0;
+  const TargetRegisterClass *RRC = nullptr;
    uint8_t Cost = 1;
    switch (VT.SimpleTy) {
    default:
@@ -1930,8 +1944,7 @@ X86TargetLowering::LowerReturn(SDValue Chain,
    if (Flag.getNode())
      RetOps.push_back(Flag);
  
-  return DAG.getNode(X86ISD::RET_FLAG, dl,
-                     MVT::Other, &RetOps[0], RetOps.size());
+  return DAG.getNode(X86ISD::RET_FLAG, dl, MVT::Other, RetOps);
  }
  
  bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
@@ -2424,13 +2437,11 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
            SaveXMMOps.push_back(Val);
          }
          MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
-                                     MVT::Other,
-                                     &SaveXMMOps[0], SaveXMMOps.size()));
+                                     MVT::Other, SaveXMMOps));
        }
  
        if (!MemOps.empty())
-        Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
-                            &MemOps[0], MemOps.size());
+        Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
      }
    }
  
@@ -2497,10 +2508,10 @@ X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
  
  /// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call
  /// optimization is performed and it is required (FPDiff!=0).
-static SDValue
-EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF,
-                         SDValue Chain, SDValue RetAddrFrIdx, EVT PtrVT,
-                         unsigned SlotSize, int FPDiff, SDLoc dl) {
+static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
+                                        SDValue Chain, SDValue RetAddrFrIdx,
+                                        EVT PtrVT, unsigned SlotSize,
+                                        int FPDiff, SDLoc dl) {
    // Store the return address to the appropriate stack slot.
    if (!FPDiff) return Chain;
    // Calculate the new stack slot for the return address.
@@ -2537,7 +2548,13 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
    if (MF.getTarget().Options.DisableTailCalls)
      isTailCall = false;
  
-  if (isTailCall) {
+  bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall();
+  if (IsMustTail) {
+    // Force this to be a tail call.  The verifier rules are enough to ensure
+    // that we can lower this successfully without moving the return address
+    // around.
+    isTailCall = true;
+  } else if (isTailCall) {
      // Check if it's really possible to do a tail call.
      isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
                      isVarArg, SR != NotStructReturn,
@@ -2578,7 +2595,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
      NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
  
    int FPDiff = 0;
-  if (isTailCall && !IsSibcall) {
+  if (isTailCall && !IsSibcall && !IsMustTail) {
      // Lower arguments at fp - stackoffset + fpdiff.
      X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
      unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
@@ -2683,7 +2700,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
        }
      } else if (!IsSibcall && (!isTailCall || isByVal)) {
        assert(VA.isMemLoc());
-      if (StackPtr.getNode() == 0)
+      if (!StackPtr.getNode())
          StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
                                        getPointerTy());
        MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
@@ -2692,8 +2709,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
    }
  
    if (!MemOpChains.empty())
-    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
-                        &MemOpChains[0], MemOpChains.size());
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
  
    if (Subtarget->isPICStyleGOT()) {
      // ELF / PIC requires GOT in the EBX register before function calls via PLT
@@ -2742,8 +2758,10 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                                          DAG.getConstant(NumXMMRegs, MVT::i8)));
    }
  
-  // For tail calls lower the arguments to the 'real' stack slot.
-  if (isTailCall) {
+  // For tail calls lower the arguments to the 'real' stack slots.  Sibcalls
+  // don't need this because the eligibility check rejects calls that require
+  // shuffling arguments passed in memory.
+  if (!IsSibcall && isTailCall) {
      // Force all the incoming stack arguments to be loaded from the stack
      // before any new outgoing arguments are stored to the stack, because the
      // outgoing stack slots may alias the incoming argument stack slots, and
@@ -2755,45 +2773,45 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
      SmallVector<SDValue, 8> MemOpChains2;
      SDValue FIN;
      int FI = 0;
-    if (getTargetMachine().Options.GuaranteedTailCallOpt) {
-      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
-        CCValAssign &VA = ArgLocs[i];
-        if (VA.isRegLoc())
-          continue;
-        assert(VA.isMemLoc());
-        SDValue Arg = OutVals[i];
-        ISD::ArgFlagsTy Flags = Outs[i].Flags;
-        // Create frame index.
-        int32_t Offset = VA.getLocMemOffset()+FPDiff;
-        uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
-        FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
-        FIN = DAG.getFrameIndex(FI, getPointerTy());
-
-        if (Flags.isByVal()) {
-          // Copy relative to framepointer.
-          SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
-          if (StackPtr.getNode() == 0)
-            StackPtr = DAG.getCopyFromReg(Chain, dl,
-                                          RegInfo->getStackRegister(),
-                                          getPointerTy());
-          Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
-
-          MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
-                                                           ArgChain,
-                                                           Flags, DAG, dl));
-        } else {
-          // Store relative to framepointer.
-          MemOpChains2.push_back(
-            DAG.getStore(ArgChain, dl, Arg, FIN,
-                         MachinePointerInfo::getFixedStack(FI),
-                         false, false, 0));
-        }
+    for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+      CCValAssign &VA = ArgLocs[i];
+      if (VA.isRegLoc())
+        continue;
+      assert(VA.isMemLoc());
+      SDValue Arg = OutVals[i];
+      ISD::ArgFlagsTy Flags = Outs[i].Flags;
+      // Skip inalloca arguments.  They don't require any work.
+      if (Flags.isInAlloca())
+        continue;
+      // Create frame index.
+      int32_t Offset = VA.getLocMemOffset()+FPDiff;
+      uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
+      FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
+      FIN = DAG.getFrameIndex(FI, getPointerTy());
+
+      if (Flags.isByVal()) {
+        // Copy relative to framepointer.
+        SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
+        if (!StackPtr.getNode())
+          StackPtr = DAG.getCopyFromReg(Chain, dl,
+                                        RegInfo->getStackRegister(),
+                                        getPointerTy());
+        Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
+
+        MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
+                                                         ArgChain,
+                                                         Flags, DAG, dl));
+      } else {
+        // Store relative to framepointer.
+        MemOpChains2.push_back(
+          DAG.getStore(ArgChain, dl, Arg, FIN,
+                       MachinePointerInfo::getFixedStack(FI),
+                       false, false, 0));
        }
      }
  
      if (!MemOpChains2.empty())
-      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
-                          &MemOpChains2[0], MemOpChains2.size());
+      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
  
      // Store the return address to the appropriate stack slot.
      Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
@@ -2930,10 +2948,10 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
      // This isn't right, although it's probably harmless on x86; liveouts
      // should be computed from returns not tail calls.  Consider a void
      // function making a tail call to a function returning int.
-    return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, &Ops[0], Ops.size());
+    return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
    }
  
-  Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size());
+  Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
    InFlag = Chain.getValue(1);
  
    // Create the CALLSEQ_END node.
@@ -3927,6 +3945,29 @@ static bool isMOVLHPSMask(ArrayRef<int> Mask, MVT VT) {
    return true;
  }
  
+/// isINSERTPSMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to INSERTPS,
+/// i.e. exactly three lanes keep their position within one input vector.
+static bool isINSERTPSMask(ArrayRef<int> Mask, MVT VT) {
+  // TODO: Deal with AVX's VINSERTPS
+  if (!VT.is128BitVector() || (VT != MVT::v4f32 && VT != MVT::v4i32))
+    return false;
+
+  unsigned CorrectPosV1 = 0; // lanes where Mask[i] == i
+  unsigned CorrectPosV2 = 0; // lanes where Mask[i] == i + 4
+  for (int i = 0, e = (int)VT.getVectorNumElements(); i != e; ++i)
+    if (Mask[i] == i)
+      ++CorrectPosV1;
+    else if (Mask[i] == i + 4)
+      ++CorrectPosV2;
+
+  if (CorrectPosV1 == 3 || CorrectPosV2 == 3)
+    // 3 lanes in place. NOTE(review): the 4th lane's source is not checked.
+    return true;
+
+  return false;
+}
+
  //
  // Some special combinations that can be optimized.
  //
@@ -4146,6 +4187,29 @@ static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
    return true;
  }
  
+// Match masks for INSERTI64x4/INSERTF64x4: (src0[0], src1[0]) sets *Imm = 1,
+// (src1[0], src0[1]) sets *Imm = 0; operands are 256-bit halves of 512-bit VT.
+static bool isINSERT64x4Mask(ArrayRef<int> Mask, MVT VT, unsigned int *Imm) {
+  if (!VT.is512BitVector())
+    return false;
+
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned HalfSize = NumElts/2; // number of mask elements per half
+  if (isSequentialOrUndefInRange(Mask, 0, HalfSize, 0)) {
+    if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, NumElts)) {
+      *Imm = 1;
+      return true;
+    }
+  }
+  if (isSequentialOrUndefInRange(Mask, 0, HalfSize, NumElts)) {
+    if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, HalfSize)) {
+      *Imm = 0;
+      return true;
+    }
+  }
+  return false; // *Imm is left unmodified when no pattern matches
+}
+
  /// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
  /// specifies a shuffle of elements that is suitable for input to MOVSS,
  /// MOVSD, and MOVD, i.e. setting the lowest element.
@@ -4677,7 +4741,7 @@ static bool ShouldXformToMOVHLPS(ArrayRef<int> Mask, MVT VT) {
  /// isScalarLoadToVector - Returns true if the node is a scalar load that
  /// is promoted to a vector. It also returns the LoadSDNode by reference if
  /// required.
-static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) {
+static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = nullptr) {
    if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
      return false;
    N = N->getOperand(0).getNode();
@@ -4803,28 +4867,24 @@ static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
      if (Subtarget->hasInt256()) { // AVX2
        SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
        SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
-      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops,
-                        array_lengthof(Ops));
+      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
      } else {
        // 256-bit logic and arithmetic instructions in AVX are all
        // floating-point, no support for integer ops. Emit fp zeroed vectors.
        SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
        SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
-      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops,
-                        array_lengthof(Ops));
+      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops);
      }
    } else if (VT.is512BitVector()) { // AVX-512
        SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
        SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
                          Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
-      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops, 16);
+      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops);
    } else if (VT.getScalarType() == MVT::i1) {
      assert(VT.getVectorNumElements() <= 16 && "Unexpected vector type");
      SDValue Cst = DAG.getTargetConstant(0, MVT::i1);
-    SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
-                      Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
-    return DAG.getNode(ISD::BUILD_VECTOR, dl, VT,
-                       Ops, VT.getVectorNumElements());
+    SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
+    return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
    } else
      llvm_unreachable("Unexpected vector type");
  
@@ -4844,8 +4904,7 @@ static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG,
    if (VT.is256BitVector()) {
      if (HasInt256) { // AVX2
        SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
-      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops,
-                        array_lengthof(Ops));
+      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
      } else { // AVX
        Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
        Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
@@ -5307,7 +5366,7 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
      return SDValue();
  
    SDLoc dl(Op);
-  SDValue V(0, 0);
+  SDValue V;
    bool First = true;
    for (unsigned i = 0; i < 16; ++i) {
      bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
@@ -5320,7 +5379,7 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
      }
  
      if ((i & 1) != 0) {
-      SDValue ThisElt(0, 0), LastElt(0, 0);
+      SDValue ThisElt, LastElt;
        bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
        if (LastIsNonZero) {
          LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
@@ -5355,7 +5414,7 @@ static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
      return SDValue();
  
    SDLoc dl(Op);
-  SDValue V(0, 0);
+  SDValue V;
    bool First = true;
    for (unsigned i = 0; i < 8; ++i) {
      bool isNonZero = (NonZeros & (1 << i)) != 0;
@@ -5480,7 +5539,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
    EVT EltVT = VT.getVectorElementType();
    unsigned NumElems = Elts.size();
  
-  LoadSDNode *LDBase = NULL;
+  LoadSDNode *LDBase = nullptr;
    unsigned LastLoadedElt = -1U;
  
    // For each element in the initializer, see if we've found a load or an undef.
@@ -5545,8 +5604,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
      SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
      SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
      SDValue ResNode =
-        DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops,
-                                array_lengthof(Ops), MVT::i64,
+        DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, MVT::i64,
                                  LDBase->getPointerInfo(),
                                  LDBase->getAlignment(),
                                  false/*isVolatile*/, true/*ReadMem*/,
@@ -5661,7 +5719,7 @@ static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
      unsigned ScalarSize = CVT.getSizeInBits();
  
      if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)) {
-      const Constant *C = 0;
+      const Constant *C = nullptr;
        if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
          C = CI->getConstantIntValue();
        else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
@@ -5783,10 +5841,10 @@ static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
      if (ExtractedFromVec.getValueType() != VT)
        return SDValue();
  
-    if (VecIn1.getNode() == 0)
+    if (!VecIn1.getNode())
        VecIn1 = ExtractedFromVec;
      else if (VecIn1 != ExtractedFromVec) {
-      if (VecIn2.getNode() == 0)
+      if (!VecIn2.getNode())
          VecIn2 = ExtractedFromVec;
        else if (VecIn2 != ExtractedFromVec)
          // Quit if more than 2 vectors to shuffle
@@ -5799,7 +5857,7 @@ static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
        Mask[i] = Idx + NumElems;
    }
  
-  if (VecIn1.getNode() == 0)
+  if (!VecIn1.getNode())
      return SDValue();
  
    VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
@@ -5824,18 +5882,14 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
    SDLoc dl(Op);
    if (ISD::isBuildVectorAllZeros(Op.getNode())) {
      SDValue Cst = DAG.getTargetConstant(0, MVT::i1);
-    SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
-                      Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
-    return DAG.getNode(ISD::BUILD_VECTOR, dl, VT,
-                       Ops, VT.getVectorNumElements());
+    SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
+    return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
    }
  
    if (ISD::isBuildVectorAllOnes(Op.getNode())) {
      SDValue Cst = DAG.getTargetConstant(1, MVT::i1);
-    SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
-                      Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
-    return DAG.getNode(ISD::BUILD_VECTOR, dl, VT,
-                       Ops, VT.getVectorNumElements());
+    SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
+    return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
    }
  
    bool AllContants = true;
@@ -6095,9 +6149,10 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
      EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
  
      // Build both the lower and upper subvector.
-    SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[0], NumElems/2);
-    SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[NumElems / 2],
-                                NumElems/2);
+    SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
+                                makeArrayRef(&V[0], NumElems/2));
+    SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
+                                makeArrayRef(&V[NumElems / 2], NumElems/2));
  
      // Recreate the wider vector with the lower and upper part.
      if (VT.is256BitVector())
@@ -6384,8 +6439,7 @@ static SDValue getPSHUFB(ArrayRef<int> MaskVals, SDValue V1, SDLoc &dl,
    if (ShufVT != VT)
      V1 = DAG.getNode(ISD::BITCAST, dl, ShufVT, V1);
    return DAG.getNode(X86ISD::PSHUFB, dl, ShufVT, V1,
-                     DAG.getNode(ISD::BUILD_VECTOR, dl, ShufVT,
-                                 PshufbMask.data(), PshufbMask.size()));
+                     DAG.getNode(ISD::BUILD_VECTOR, dl, ShufVT, PshufbMask));
  }
  
  // v8i16 shuffles - Prefer shuffles in the following order:
@@ -6687,7 +6741,7 @@ static SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
      }
      V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
                       DAG.getNode(ISD::BUILD_VECTOR, dl,
-                                 MVT::v16i8, &pshufbMask[0], 16));
+                                 MVT::v16i8, pshufbMask));
  
      // As PSHUFB will zero elements with negative indices, it's safe to ignore
      // the 2nd operand if it's undefined or zero.
@@ -6705,7 +6759,7 @@ static SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
      }
      V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
                       DAG.getNode(ISD::BUILD_VECTOR, dl,
-                                 MVT::v16i8, &pshufbMask[0], 16));
+                                 MVT::v16i8, pshufbMask));
      return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
    }
  
@@ -6857,7 +6911,7 @@ static SDValue getVZextMovL(MVT VT, MVT OpVT,
                              SDValue SrcOp, SelectionDAG &DAG,
                              const X86Subtarget *Subtarget, SDLoc dl) {
    if (VT == MVT::v2f64 || VT == MVT::v4f32) {
-    LoadSDNode *LD = NULL;
+    LoadSDNode *LD = nullptr;
      if (!isScalarLoadToVector(SrcOp.getNode(), &LD))
        LD = dyn_cast<LoadSDNode>(SrcOp);
      if (!LD) {
@@ -6976,8 +7030,7 @@ LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
        }
  
        // Construct the output using a BUILD_VECTOR.
-      Output[l] = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, &SVOps[0],
-                              SVOps.size());
+      Output[l] = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, SVOps);
      } else if (InputUsed[0] < 0) {
        // No input vectors were used! The result is undefined.
        Output[l] = DAG.getUNDEF(NVT);
@@ -7259,6 +7312,84 @@ SDValue getMOVLP(SDValue &Op, SDLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
                                getShuffleSHUFImmediate(SVOp), DAG);
  }
  
+// It is only safe to call this function if isINSERTPSMask is true for
+// this shufflevector mask.
+static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl,
+                           SelectionDAG &DAG) {
+  // Generate an insertps instruction when inserting an f32 from memory onto a
+  // v4f32 or when copying a member from one v4f32 to another.
+  // We also use it for transferring i32 from one register to another,
+  // since it simply copies the same bits.
+  // If we're transferring an i32 from memory to a specific element in a
+  // register, we output a generic DAG that will match the PINSRD
+  // instruction.
+  // TODO: Optimize for AVX cases too (VINSERTPS)
+  MVT VT = SVOp->getSimpleValueType(0);
+  MVT EVT = VT.getVectorElementType();
+  SDValue V1 = SVOp->getOperand(0);
+  SDValue V2 = SVOp->getOperand(1);
+  auto Mask = SVOp->getMask();
+  assert((VT == MVT::v4f32 || VT == MVT::v4i32) &&
+         "unsupported vector type for insertps/pinsrd");
+
+  int FromV1 = std::count_if(Mask.begin(), Mask.end(),
+                             [](const int &i) { return i < 4; });
+  // NOTE(review): the i < 4 test also counts negative mask entries as V1.
+  SDValue From;
+  SDValue To;
+  unsigned DestIndex;
+  if (FromV1 == 1) {
+    From = V1;
+    To = V2;
+    DestIndex = std::find_if(Mask.begin(), Mask.end(),
+                             [](const int &i) { return i < 4; }) -
+                Mask.begin();
+  } else {
+    From = V2;
+    To = V1;
+    DestIndex = std::find_if(Mask.begin(), Mask.end(),
+                             [](const int &i) { return i >= 4; }) -
+                Mask.begin();
+  }
+
+  if (MayFoldLoad(From)) {
+    // Trivial case: From is a load that can be folded (used only by this
+    // shuffle), so load just the needed scalar. NOTE(review): the offset is
+    // computed from DestIndex rather than Mask[DestIndex] % 4 -- confirm.
+    SDValue Addr = From.getOperand(1);
+    SDValue NewAddr =
+        DAG.getNode(ISD::ADD, dl, Addr.getSimpleValueType(), Addr,
+                    DAG.getConstant(DestIndex * EVT.getStoreSize(),
+                                    Addr.getSimpleValueType()));
+
+    LoadSDNode *Load = cast<LoadSDNode>(From);
+    SDValue NewLoad =
+        DAG.getLoad(EVT, dl, Load->getChain(), NewAddr,
+                    DAG.getMachineFunction().getMachineMemOperand(
+                        Load->getMemOperand(), 0, EVT.getStoreSize()));
+
+    if (EVT == MVT::f32) {
+      // Create this as a scalar to vector to match the instruction pattern.
+      SDValue LoadScalarToVector =
+          DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, NewLoad);
+      SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4);
+      return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, LoadScalarToVector,
+                         InsertpsMask);
+    } else { // EVT == MVT::i32
+      // If we're getting an i32 from memory, use an INSERT_VECTOR_ELT
+      // instruction, to match the PINSRD instruction, which loads an i32 to a
+      // certain vector element.
+      return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, To, NewLoad,
+                         DAG.getConstant(DestIndex, MVT::i32));
+    }
+  }
+
+  // Vector-element-to-vector: DestIndex is placed at bit 4, SrcIndex at bit 6.
+  unsigned SrcIndex = Mask[DestIndex] % 4;
+  SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4 | SrcIndex << 6);
+  return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, From, InsertpsMask);
+}
+
  // Reduce a vector shuffle to zext.
  static SDValue LowerVectorIntExtend(SDValue Op, const X86Subtarget *Subtarget,
                                      SelectionDAG &DAG) {
@@ -7661,6 +7792,11 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
                                  getShuffleSHUFImmediate(SVOp), DAG);
    }
  
+  unsigned Idx;
+  if (VT.is512BitVector() && isINSERT64x4Mask(M, VT, &Idx))
+    return Insert256BitVector(V1, Extract256BitVector(V2, 0, DAG, dl),
+                              Idx*(NumElems/2), DAG, dl);
+
    // Handle VPERM2F128/VPERM2I128 permutations
    if (isVPERM2X128Mask(M, VT, HasFp256))
      return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1,
@@ -7670,6 +7806,9 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
    if (BlendOp.getNode())
      return BlendOp;
  
+  if (Subtarget->hasSSE41() && isINSERTPSMask(M, VT))
+    return getINSERTPS(SVOp, dl, DAG);
+
    unsigned Imm8;
    if (V2IsUndef && HasInt256 && isPermImmMask(M, VT, Imm8))
      return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1, Imm8, DAG);
@@ -7683,8 +7822,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
        permclMask.push_back(DAG.getConstant((M[i]>=0) ? M[i] : 0, MaskEltVT));
      }
  
-    SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVectorVT,
-                                &permclMask[0], NumElems);
+    SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVectorVT, permclMask);
      if (V2IsUndef)
        // Bitcast is for VPERMPS since mask is v8i32 but node takes v8f32
        return DAG.getNode(X86ISD::VPERMV, dl, VT,
@@ -8383,10 +8521,10 @@ GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
  
    if (InFlag) {
      SDValue Ops[] = { Chain,  TGA, *InFlag };
-    Chain = DAG.getNode(CallType, dl, NodeTys, Ops, array_lengthof(Ops));
+    Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
    } else {
      SDValue Ops[]  = { Chain, TGA };
-    Chain = DAG.getNode(CallType, dl, NodeTys, Ops, array_lengthof(Ops));
+    Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
    }
  
    // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
@@ -8414,7 +8552,7 @@ LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
  static SDValue
  LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
                                  const EVT PtrVT) {
-  return GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT,
+  return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
                      X86::RAX, X86II::MO_TLSGD);
  }
  
@@ -8431,7 +8569,7 @@ static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
  
    SDValue Base;
    if (is64Bit) {
-    Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT, X86::RAX,
+    Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
                        X86II::MO_TLSLD, /*LocalDynamic=*/true);
    } else {
      SDValue InFlag;
@@ -8570,7 +8708,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
      SDValue Chain = DAG.getEntryNode();
      SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
      SDValue Args[] = { Chain, Offset };
-    Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args, 2);
+    Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
  
      // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
      MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
@@ -8698,15 +8836,15 @@ static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
    SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };
  
    if (Op.getOpcode() == ISD::SHL_PARTS) {
-    Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4);
-    Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4);
+    Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
+    Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
    } else {
-    Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4);
-    Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4);
+    Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
+    Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
    }
  
    SDValue Ops[2] = { Lo, Hi };
-  return DAG.getMergeValues(Ops, array_lengthof(Ops), dl);
+  return DAG.getMergeValues(Ops, dl);
  }
  
  SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
@@ -8769,8 +8907,7 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
    SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
    SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
                                             X86ISD::FILD, DL,
-                                           Tys, Ops, array_lengthof(Ops),
-                                           SrcVT, MMO);
+                                           Tys, Ops, SrcVT, MMO);
  
    if (useSSE) {
      Chain = Result.getValue(1);
@@ -8793,8 +8930,7 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
                              MachineMemOperand::MOStore, SSFISize, SSFISize);
  
      Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
-                                    Ops, array_lengthof(Ops),
-                                    Op.getValueType(), MMO);
+                                    Ops, Op.getValueType(), MMO);
      Result = DAG.getLoad(Op.getValueType(), DL, Chain, StackSlot,
                           MachinePointerInfo::getFixedStack(SSFI),
                           false, false, false, 0);
@@ -8989,7 +9125,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
    SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
    SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
    SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
-                                         array_lengthof(Ops), MVT::i64, MMO);
+                                         MVT::i64, MMO);
  
    APInt FF(32, 0x5F800000ULL);
  
@@ -9082,8 +9218,7 @@ X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
      MachineMemOperand *MMO =
        MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
                                MachineMemOperand::MOLoad, MemSize, MemSize);
-    Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops,
-                                    array_lengthof(Ops), DstTy, MMO);
+    Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO);
      Chain = Value.getValue(1);
      SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
      StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
@@ -9097,8 +9232,7 @@ X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
      // Build the FP_TO_INT*_IN_MEM
      SDValue Ops[] = { Chain, Value, StackSlot };
      SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
-                                           Ops, array_lengthof(Ops), DstTy,
-                                           MMO);
+                                           Ops, DstTy, MMO);
      return std::make_pair(FIST, StackSlot);
    } else {
      SDValue ftol = DAG.getNode(X86ISD::WIN_FTOL, DL,
@@ -9110,8 +9244,8 @@ X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
        MVT::i32, eax.getValue(2));
      SDValue Ops[] = { eax, edx };
      SDValue pair = IsReplace
-      ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops, array_lengthof(Ops))
-      : DAG.getMergeValues(Ops, array_lengthof(Ops), DL);
+      ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops)
+      : DAG.getMergeValues(Ops, DL);
      return std::make_pair(pair, SDValue());
    }
  }
@@ -9306,8 +9440,7 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
          for (unsigned j = 0; j < 8; ++j)
            pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
        }
-      SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8,
-                               &pshufbMask[0], 32);
+      SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, pshufbMask);
        In = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, In, BV);
        In = DAG.getNode(ISD::BITCAST, DL, MVT::v4i64, In);
  
@@ -9373,7 +9506,7 @@ SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op,
      /*IsSigned=*/ true, /*IsReplace=*/ false);
    SDValue FIST = Vals.first, StackSlot = Vals.second;
    // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
-  if (FIST.getNode() == 0) return Op;
+  if (!FIST.getNode()) return Op;
  
    if (StackSlot.getNode())
      // Load the result.
@@ -9798,7 +9931,8 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl,
      // If we have a constant logical shift that's only used in a comparison
      // against zero turn it into an equivalent AND. This allows turning it into
      // a TEST instruction later.
-    if (isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) {
+    if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) &&
+        isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) {
        EVT VT = Op.getValueType();
        unsigned BitWidth = VT.getSizeInBits();
        unsigned ShAmt = Op->getConstantOperandVal(1);
@@ -9903,7 +10037,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl,
    for (unsigned i = 0; i != NumOperands; ++i)
      Ops.push_back(Op.getOperand(i));
  
-  SDValue New = DAG.getNode(Opcode, dl, VTs, &Ops[0], NumOperands);
+  SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
    DAG.ReplaceAllUsesWith(Op, New);
    return SDValue(New.getNode(), 1);
  }
@@ -10186,7 +10320,7 @@ static SDValue ChangeVSETULTtoVSETULE(SDLoc dl, SDValue Op1, SelectionDAG &DAG)
      ULTOp1.push_back(DAG.getConstant(Val - 1, EVT));
    }
  
-  return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, ULTOp1.data(), ULTOp1.size());
+  return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, ULTOp1);
  }
  
  static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
@@ -10624,7 +10758,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
          Res = DAG.getNOT(DL, Res, Res.getValueType());
  
        ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2);
-      if (N2C == 0 || !N2C->isNullValue())
+      if (!N2C || !N2C->isNullValue())
          Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
        return Res;
      }
@@ -10753,7 +10887,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
    // condition is true.
    SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
    SDValue Ops[] = { Op2, Op1, CC, Cond };
-  return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops, array_lengthof(Ops));
+  return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops);
  }
  
  static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, SelectionDAG &DAG) {
@@ -11190,7 +11324,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
          SDLoc(Node));
  
      SDValue Ops[2] = { Tmp1, Tmp2 };
-    return DAG.getMergeValues(Ops, 2, dl);
+    return DAG.getMergeValues(Ops, dl);
    }
  
    // Get the inputs.
@@ -11224,7 +11358,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
      SDValue Value = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
                                  DAG.getRegister(Vreg, SPTy));
      SDValue Ops1[2] = { Value, Chain };
-    return DAG.getMergeValues(Ops1, 2, dl);
+    return DAG.getMergeValues(Ops1, dl);
    } else {
      SDValue Flag;
      unsigned Reg = (Subtarget->is64Bit() ? X86::RAX : X86::EAX);
@@ -11248,7 +11382,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
      }
  
      SDValue Ops1[2] = { SP, Chain };
-    return DAG.getMergeValues(Ops1, 2, dl);
+    return DAG.getMergeValues(Ops1, dl);
    }
  }
  
@@ -11309,8 +11443,7 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
    Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN,
                         MachinePointerInfo(SV, 16), false, false, 0);
    MemOps.push_back(Store);
-  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
-                     &MemOps[0], MemOps.size());
+  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
  }
  
  SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
@@ -11364,8 +11497,7 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
    InstOps.push_back(DAG.getConstant(Align, MVT::i32));
    SDVTList VTs = DAG.getVTList(getPointerTy(), MVT::Other);
    SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl,
-                                          VTs, &InstOps[0], InstOps.size(),
-                                          MVT::i64,
+                                          VTs, InstOps, MVT::i64,
                                            MachinePointerInfo(SV),
                                            /*Align=*/0,
                                            /*Volatile=*/false,
@@ -11425,7 +11557,7 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, MVT VT,
      ConstantSDNode *ND;
  
      switch(Opc) {
-    default: llvm_unreachable(0);
+    default: llvm_unreachable(nullptr);
      case X86ISD::VSHLI:
        for (unsigned i=0; i!=NumElts; ++i) {
          SDValue CurrentOp = SrcOp->getOperand(i);
@@ -11464,7 +11596,7 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, MVT VT,
        break;
      }
  
-    return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Elts[0], NumElts);
+    return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Elts);
    }
  
    return DAG.getNode(Opc, dl, VT, SrcOp, DAG.getConstant(ShiftAmt, MVT::i8));
@@ -11496,7 +11628,7 @@ static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT,
    ShOps[0] = ShAmt;
    ShOps[1] = DAG.getConstant(0, MVT::i32);
    ShOps[2] = ShOps[3] = DAG.getUNDEF(MVT::i32);
-  ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, &ShOps[0], 4);
+  ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, ShOps);
  
    // The return type has to be a 128-bit type with the same element
    // type as the input type.
@@ -11619,6 +11751,21 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
      return DAG.getNode(X86ISD::PMULUDQ, dl, Op.getValueType(),
                         Op.getOperand(1), Op.getOperand(2));
  
+  case Intrinsic::x86_sse41_pmuldq:
+  case Intrinsic::x86_avx2_pmul_dq:
+    return DAG.getNode(X86ISD::PMULDQ, dl, Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2));
+
+  case Intrinsic::x86_sse2_pmulhu_w:
+  case Intrinsic::x86_avx2_pmulhu_w:
+    return DAG.getNode(ISD::MULHU, dl, Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2));
+
+  case Intrinsic::x86_sse2_pmulh_w:
+  case Intrinsic::x86_avx2_pmulh_w:
+    return DAG.getNode(ISD::MULHS, dl, Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2));
+
    // SSE2/AVX2 sub with unsigned saturation intrinsics
    case Intrinsic::x86_sse2_psubus_b:
    case Intrinsic::x86_sse2_psubus_w:
@@ -12070,7 +12217,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
      }
      SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
      SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
-    SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps.data(), NewOps.size());
+    SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps);
      SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
                                  DAG.getConstant(X86CC, MVT::i8),
                                  SDValue(PCMP.getNode(), 1));
@@ -12087,7 +12234,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
  
      SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
      SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
-    return DAG.getNode(Opcode, dl, VTs, NewOps.data(), NewOps.size());
+    return DAG.getNode(Opcode, dl, VTs, NewOps);
    }
    case Intrinsic::x86_fma_vfmadd_ps:
    case Intrinsic::x86_fma_vfmadd_pd:
@@ -12202,7 +12349,7 @@ static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
    SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain};
    SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
    SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
-  return DAG.getMergeValues(RetOps, array_lengthof(RetOps), dl);
+  return DAG.getMergeValues(RetOps, dl);
  }
  
  static SDValue getMGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
@@ -12224,7 +12371,7 @@ static SDValue getMGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
    SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain};
    SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
    SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
-  return DAG.getMergeValues(RetOps, array_lengthof(RetOps), dl);
+  return DAG.getMergeValues(RetOps, dl);
  }
  
  static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
@@ -12270,8 +12417,7 @@ static void getReadTimeStampCounter(SDNode *N, SDLoc DL, unsigned Opcode,
                                SelectionDAG &DAG, const X86Subtarget *Subtarget,
                                SmallVectorImpl<SDValue> &Results) {
    SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
-  SDValue TheChain = N->getOperand(0);
-  SDValue rd = DAG.getNode(Opcode, DL, Tys, &TheChain, 1);
+  SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0));
    SDValue LO, HI;
  
    // The processor's time-stamp counter (a 64-bit MSR) is stored into the
@@ -12313,8 +12459,7 @@ static void getReadTimeStampCounter(SDNode *N, SDLoc DL, unsigned Opcode,
  
    // Use a buildpair to merge the two 32-bit values into a 64-bit one.
    SDValue Ops[] = { LO, HI };
-  SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops,
-                             array_lengthof(Ops));
+  SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
    Results.push_back(Pair);
    Results.push_back(Chain);
  }
@@ -12325,7 +12470,7 @@ static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget,
    SDLoc DL(Op);
    getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget,
                            Results);
-  return DAG.getMergeValues(&Results[0], Results.size(), DL);
+  return DAG.getMergeValues(Results, DL);
  }
  
  static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
@@ -12358,7 +12503,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
                        SDValue(Result.getNode(), 1) };
      SDValue isValid = DAG.getNode(X86ISD::CMOV, dl,
                                    DAG.getVTList(Op->getValueType(1), MVT::Glue),
-                                  Ops, array_lengthof(Ops));
+                                  Ops);
  
      // Return { result, isValid, chain }.
      return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
@@ -12516,7 +12661,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
      }
      SmallVector<SDValue, 2> Results;
      getReadTimeStampCounter(Op.getNode(), dl, Opc, DAG, Subtarget, Results);
-    return DAG.getMergeValues(&Results[0], Results.size(), dl);
+    return DAG.getMergeValues(Results, dl);
    }
    // XTEST intrinsics.
    case Intrinsic::x86_xtest: {
@@ -12582,6 +12727,18 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
    return FrameAddr;
  }
  
+// Translate a textual register name into the matching X86 register
+// enumerator. Only "esp" and "rsp" are recognized; every other name is
+// reported as a fatal error.
+//
+// FIXME? Maybe this could be a TableGen attribute on some registers and
+// this table could be generated automatically from RegInfo.
+unsigned X86TargetLowering::getRegisterByName(const char* RegName) const {
+  // 0 doubles as the "no match" sentinel for the lookup below.
+  const unsigned Match = StringSwitch<unsigned>(RegName)
+                             .Case("esp", X86::ESP)
+                             .Case("rsp", X86::RSP)
+                             .Default(0);
+  if (!Match)
+    report_fatal_error("Invalid register name global variable");
+  return Match;
+}
+
  SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
                                                       SelectionDAG &DAG) const {
    const X86RegisterInfo *RegInfo =
@@ -12701,7 +12858,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
                                  MachinePointerInfo(TrmpAddr, 22),
                                  false, false, 0);
  
-    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6);
+    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
    } else {
      const Function *Func =
        cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
@@ -12781,7 +12938,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
                                  MachinePointerInfo(TrmpAddr, 6),
                                  false, false, 1);
  
-    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4);
+    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
    }
  }
  
@@ -12824,8 +12981,7 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
    SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
    SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
                                            DAG.getVTList(MVT::Other),
-                                          Ops, array_lengthof(Ops), MVT::i16,
-                                          MMO);
+                                          Ops, MVT::i16, MMO);
  
    // Load FP Control Word from stack slot
    SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot,
@@ -12878,7 +13034,7 @@ static SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) {
      DAG.getConstant(X86::COND_E, MVT::i8),
      Op.getValue(1)
    };
-  Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops));
+  Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
  
    // Finally xor with NumBits-1.
    Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));
@@ -12930,7 +13086,7 @@ static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
      DAG.getConstant(X86::COND_E, MVT::i8),
      Op.getValue(1)
    };
-  return DAG.getNode(X86ISD::CMOV, dl, VT, Ops, array_lengthof(Ops));
+  return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
  }
  
  // Lower256IntArith - Break a 256-bit integer operation into two new 128-bit
@@ -13048,59 +13204,104 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
    return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
  }
  
-static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) {
-  MVT VT = Op.getSimpleValueType();
-  MVT EltTy = VT.getVectorElementType();
-  unsigned NumElts = VT.getVectorNumElements();
-  SDValue N0 = Op.getOperand(0);
+/// Lower a 128-bit integer divide/remainder node on Win64 to a runtime
+/// library call.
+///
+/// Every operand is stored to a 16-byte aligned stack temporary and handed to
+/// the libcall as a pointer. The call is built with a v2i64 return type and
+/// its result is bitcast back to the node's 128-bit integer type.
+///
+/// \param Op  An SDIV/UDIV/SREM/UREM/SDIVREM/UDIVREM node with a 128-bit
+///            integer result; any other opcode is unreachable.
+/// \param DAG The selection DAG the new nodes are created in.
+/// \returns The libcall result bitcast to Op's value type.
+SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
+  assert(Subtarget->isTargetWin64() && "Unexpected target");
+  EVT VT = Op.getValueType();
+  assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
+         "Unexpected return type for lowering");
+
+  // Pick the runtime routine and signedness that match the opcode.
+  RTLIB::Libcall LC;
+  bool isSigned;
+  switch (Op->getOpcode()) {
+  default: llvm_unreachable("Unexpected request for libcall!");
+  case ISD::SDIV:      isSigned = true;  LC = RTLIB::SDIV_I128;    break;
+  case ISD::UDIV:      isSigned = false; LC = RTLIB::UDIV_I128;    break;
+  case ISD::SREM:      isSigned = true;  LC = RTLIB::SREM_I128;    break;
+  case ISD::UREM:      isSigned = false; LC = RTLIB::UREM_I128;    break;
+  case ISD::SDIVREM:   isSigned = true;  LC = RTLIB::SDIVREM_I128; break;
+  case ISD::UDIVREM:   isSigned = false; LC = RTLIB::UDIVREM_I128; break;
+  }
+
    SDLoc dl(Op);
+  // The argument stores below are threaded onto this chain, starting from the
+  // entry node, so the call is ordered after all of them.
+  SDValue InChain = DAG.getEntryNode();
  
-  // Lower sdiv X, pow2-const.
-  BuildVectorSDNode *C = dyn_cast<BuildVectorSDNode>(Op.getOperand(1));
-  if (!C)
-    return SDValue();
+  // Spill each 128-bit operand to its own stack slot and pass its address.
+  TargetLowering::ArgListTy Args;
+  TargetLowering::ArgListEntry Entry;
+  for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
+    EVT ArgVT = Op->getOperand(i).getValueType();
+    assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
+           "Unexpected argument type for lowering");
+    SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
+    Entry.Node = StackPtr;
+    InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MachinePointerInfo(),
+                           false, false, 16);
+    Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
+    // The argument is a pointer to the slot, so no integer extension applies.
+    Entry.Ty = PointerType::get(ArgTy,0);
+    Entry.isSExt = false;
+    Entry.isZExt = false;
+    Args.push_back(Entry);
+  }
+
+  SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
+                                         getPointerTy());
  
-  APInt SplatValue, SplatUndef;
-  unsigned SplatBitSize;
-  bool HasAnyUndefs;
-  if (!C->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
-                          HasAnyUndefs) ||
-      EltTy.getSizeInBits() < SplatBitSize)
-    return SDValue();
+  // The call is declared as returning v2i64 rather than the 128-bit integer
+  // type; the result is bitcast back to VT below.
+  // NOTE(review): the unnamed positional arguments after the return type are
+  // presumably retSExt, retZExt, isVarArg, isInReg and numFixedArgs -- confirm
+  // against the CallLoweringInfo constructor.
+  TargetLowering::CallLoweringInfo CLI(
+      InChain, static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()),
+      isSigned, !isSigned, false, true, 0, getLibcallCallingConv(LC),
+      /*isTailCall=*/false,
+      /*doesNotReturn=*/false, /*isReturnValueUsed=*/true, Callee, Args, DAG,
+      dl);
+  std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
+
+  return DAG.getNode(ISD::BITCAST, dl, VT, CallInfo.first);
+}
+
+/// Lower a v4i32 (SSE2) or v8i32 (AVX2) SMUL_LOHI/UMUL_LOHI node.
+///
+/// PMULUDQ/PMULDQ multiply only the even 32-bit elements of their inputs and
+/// produce full 64-bit products. The even elements are therefore multiplied
+/// directly, the odd elements are first shuffled into the even lanes and
+/// multiplied separately, and the low/high 32-bit halves of all products are
+/// then interleaved back into element order.
+///
+/// \param Op        The [SU]MUL_LOHI node to lower.
+/// \param Subtarget Used to choose PMULDQ (needs SSE4.1) over PMULUDQ.
+/// \param DAG       The selection DAG the new nodes are created in.
+/// \returns A two-result merge node: result 0 is the low half of each
+///          product, result 1 is the high half.
+static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget,
+                             SelectionDAG &DAG) {
+  SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
+  EVT VT = Op0.getValueType();
+  SDLoc dl(Op);
+
+  assert(((VT == MVT::v4i32 && Subtarget->hasSSE2()) ||
+          (VT == MVT::v8i32 && Subtarget->hasInt256())) &&
+         "Unexpected type for MUL_LOHI lowering");
+
+  // Move the odd elements into the even lanes. The odd destination lanes are
+  // ignored by the multiply, so leave them undefined (-1). Only the first
+  // NumElts entries of the mask are read for v4i32.
+  const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1};
+  SDValue Hi0 = DAG.getVectorShuffle(VT, dl, Op0, Op0, Mask);
+  SDValue Hi1 = DAG.getVectorShuffle(VT, dl, Op1, Op1, Mask);
  
-  if ((SplatValue != 0) &&
-      (SplatValue.isPowerOf2() || (-SplatValue).isPowerOf2())) {
-    unsigned Lg2 = SplatValue.countTrailingZeros();
-    // Splat the sign bit.
-    SmallVector<SDValue, 16> Sz(NumElts,
-                                DAG.getConstant(EltTy.getSizeInBits() - 1,
-                                                EltTy));
-    SDValue SGN = DAG.getNode(ISD::SRA, dl, VT, N0,
-                              DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Sz[0],
-                                          NumElts));
-    // Add (N0 < 0) ? abs2 - 1 : 0;
-    SmallVector<SDValue, 16> Amt(NumElts,
-                                 DAG.getConstant(EltTy.getSizeInBits() - Lg2,
-                                                 EltTy));
-    SDValue SRL = DAG.getNode(ISD::SRL, dl, VT, SGN,
-                              DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Amt[0],
-                                          NumElts));
-    SDValue ADD = DAG.getNode(ISD::ADD, dl, VT, N0, SRL);
-    SmallVector<SDValue, 16> Lg2Amt(NumElts, DAG.getConstant(Lg2, EltTy));
-    SDValue SRA = DAG.getNode(ISD::SRA, dl, VT, ADD,
-                              DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Lg2Amt[0],
-                                          NumElts));
-
-    // If we're dividing by a positive value, we're done.  Otherwise, we must
-    // negate the result.
-    if (SplatValue.isNonNegative())
-      return SRA;
-
-    SmallVector<SDValue, 16> V(NumElts, DAG.getConstant(0, EltTy));
-    SDValue Zero = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], NumElts);
-    return DAG.getNode(ISD::SUB, dl, VT, Zero, SRA);
+  // Emit two multiplies, one for the even elements and one for the odd
+  // elements (which now sit in the even lanes of Hi0/Hi1).
+  MVT MulVT = VT == MVT::v4i32 ? MVT::v2i64 : MVT::v4i64;
+  bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
+  unsigned Opcode =
+      (!IsSigned || !Subtarget->hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
+  SDValue Mul1 = DAG.getNode(ISD::BITCAST, dl, VT,
+                             DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
+  SDValue Mul2 = DAG.getNode(ISD::BITCAST, dl, VT,
+                             DAG.getNode(Opcode, dl, MulVT, Hi0, Hi1));
+
+  // Shuffle it back into the right order. Viewed as VT, Mul1 holds
+  // {lo0, hi0, lo2, hi2, ...} and Mul2 holds {lo1, hi1, lo3, hi3, ...}; mask
+  // indices >= NumElts select from Mul2, so the masks depend on the element
+  // count.
+  SDValue Highs, Lows;
+  if (VT == MVT::v8i32) {
+    const int HighMask[] = {1, 9, 3, 11, 5, 13, 7, 15};
+    Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
+    const int LowMask[] = {0, 8, 2, 10, 4, 12, 6, 14};
+    Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
+  } else {
+    const int HighMask[] = {1, 5, 3, 7};
+    Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
+    const int LowMask[] = {0, 4, 2, 6};
+    Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
+  }
+
+  // If we have a signed multiply but no PMULDQ, fix up the high parts of an
+  // unsigned multiply: subtract Op1 where Op0 is negative and Op0 where Op1
+  // is negative.
+  if (IsSigned && !Subtarget->hasSSE41()) {
+    SDValue ShAmt =
+        DAG.getConstant(31, DAG.getTargetLoweringInfo().getShiftAmountTy(VT));
+    SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
+                             DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
+    SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
+                             DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0);
+
+    SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
+    Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
    }
-  return SDValue();
+
+  // The first result of MUL_LOHI is the low half, followed by the high half.
+  SDValue Ops[] = {Lows, Highs};
+  return DAG.getMergeValues(Ops, dl);
  }
  
  static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
@@ -13144,7 +13345,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
                                       DAG.getConstant(uint8_t(-1U << ShiftAmt),
                                                       MVT::i8));
            return DAG.getNode(ISD::AND, dl, VT, SHL,
-                             DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16));
+                             DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
          }
          if (Op.getOpcode() == ISD::SRL) {
            // Make a large shift.
@@ -13157,7 +13358,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
                                       DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
                                                       MVT::i8));
            return DAG.getNode(ISD::AND, dl, VT, SRL,
-                             DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16));
+                             DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
          }
          if (Op.getOpcode() == ISD::SRA) {
            if (ShiftAmt == 7) {
@@ -13170,7 +13371,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
            SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
            SmallVector<SDValue, 16> V(16, DAG.getConstant(128 >> ShiftAmt,
                                                           MVT::i8));
-          SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16);
+          SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V);
            Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
            Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
            return Res;
@@ -13190,7 +13391,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
                                       DAG.getConstant(uint8_t(-1U << ShiftAmt),
                                                       MVT::i8));
            return DAG.getNode(ISD::AND, dl, VT, SHL,
-                             DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32));
+                             DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
          }
          if (Op.getOpcode() == ISD::SRL) {
            // Make a large shift.
@@ -13203,7 +13404,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
                                       DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
                                                       MVT::i8));
            return DAG.getNode(ISD::AND, dl, VT, SRL,
-                             DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32));
+                             DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
          }
          if (Op.getOpcode() == ISD::SRA) {
            if (ShiftAmt == 7) {
@@ -13216,7 +13417,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
            SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
            SmallVector<SDValue, 32> V(32, DAG.getConstant(128 >> ShiftAmt,
                                                           MVT::i8));
-          SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32);
+          SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V);
            Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
            Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
            return Res;
@@ -13238,7 +13439,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
      uint64_t ShiftAmt = 0;
      for (unsigned i = 0; i != Ratio; ++i) {
        ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i));
-      if (C == 0)
+      if (!C)
          return SDValue();
        // 6 == Log2(64)
        ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2)));
@@ -13249,7 +13450,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
        for (unsigned j = 0; j != Ratio; ++j) {
          ConstantSDNode *C =
            dyn_cast<ConstantSDNode>(Amt.getOperand(i + j));
-        if (C == 0)
+        if (!C)
            return SDValue();
          // 6 == Log2(64)
          ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2)));
@@ -13331,7 +13532,7 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
                 BaseShAmt = InVec.getOperand(1);
             }
          }
-        if (BaseShAmt.getNode() == 0)
+        if (!BaseShAmt.getNode())
            BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Amt,
                                    DAG.getIntPtrConstant(0));
        }
@@ -13484,7 +13685,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
        }
        Elts.push_back(DAG.getConstant(One.shl(ShAmt), SVT));
      }
-    SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Elts[0], NumElems);
+    SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Elts);
      return DAG.getNode(ISD::MUL, dl, VT, R, BV);
    }
  
@@ -13648,10 +13849,8 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
        for (unsigned i = NumElems/2; i != NumElems; ++i)
          Amt2Csts.push_back(Amt->getOperand(i));
  
-      Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT,
-                                 &Amt1Csts[0], NumElems/2);
-      Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT,
-                                 &Amt2Csts[0], NumElems/2);
+      Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt1Csts);
+      Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt2Csts);
      } else {
        // Variable shift amount
        Amt1 = Extract128BitVector(Amt, 0, DAG, dl);
@@ -13882,7 +14081,7 @@ static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget,
    SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
    MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
    SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
-                                           Ops, array_lengthof(Ops), T, MMO);
+                                           Ops, T, MMO);
    SDValue cpOut =
      DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
    return cpOut;
@@ -14092,6 +14291,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
    case ISD::CTLZ_ZERO_UNDEF:    return LowerCTLZ_ZERO_UNDEF(Op, DAG);
    case ISD::CTTZ:               return LowerCTTZ(Op, DAG);
    case ISD::MUL:                return LowerMUL(Op, Subtarget, DAG);
+  case ISD::UMUL_LOHI:
+  case ISD::SMUL_LOHI:          return LowerMUL_LOHI(Op, Subtarget, DAG);
    case ISD::SRA:
    case ISD::SRL:
    case ISD::SHL:                return LowerShift(Op, Subtarget, DAG);
@@ -14109,7 +14310,6 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
    case ISD::SUBE:               return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
    case ISD::ADD:                return LowerADD(Op, DAG);
    case ISD::SUB:                return LowerSUB(Op, DAG);
-  case ISD::SDIV:               return LowerSDIV(Op, DAG);
    case ISD::FSINCOS:            return LowerFSINCOS(Op, Subtarget, DAG);
    }
  }
@@ -14152,10 +14352,10 @@ ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results,
    SDValue Ops[] = { Chain, In1, In2L, In2H };
    SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
    SDValue Result =
-    DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, array_lengthof(Ops), MVT::i64,
+    DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, MVT::i64,
                              cast<MemSDNode>(Node)->getMemOperand());
    SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)};
-  Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2));
+  Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF));
    Results.push_back(Result.getValue(2));
  }
  
@@ -14176,6 +14376,16 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
    case ISD::SUBE:
      // We don't want to expand or promote these.
      return;
+  case ISD::SDIV:
+  case ISD::UDIV:
+  case ISD::SREM:
+  case ISD::UREM:
+  case ISD::SDIVREM:
+  case ISD::UDIVREM: {
+    SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
+    Results.push_back(V);
+    return;
+  }
    case ISD::FP_TO_SINT:
    case ISD::FP_TO_UINT: {
      bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
@@ -14186,10 +14396,10 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
      std::pair<SDValue,SDValue> Vals =
          FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
      SDValue FIST = Vals.first, StackSlot = Vals.second;
-    if (FIST.getNode() != 0) {
+    if (FIST.getNode()) {
        EVT VT = N->getValueType(0);
        // Return a load from the stack slot.
-      if (StackSlot.getNode() != 0)
+      if (StackSlot.getNode())
          Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot,
                                        MachinePointerInfo(),
                                        false, false, false, 0));
@@ -14273,8 +14483,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
      MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
      unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_DAG :
                                    X86ISD::LCMPXCHG8_DAG;
-    SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys,
-                                             Ops, array_lengthof(Ops), T, MMO);
+    SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
      SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
                                          Regs64bit ? X86::RAX : X86::EAX,
                                          HalfT, Result.getValue(1));
@@ -14282,7 +14491,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
                                          Regs64bit ? X86::RDX : X86::EDX,
                                          HalfT, cpOutL.getValue(2));
      SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
-    Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF, 2));
+    Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
      Results.push_back(cpOutH.getValue(1));
      return;
    }
@@ -14344,7 +14553,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
  
  const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
    switch (Opcode) {
-  default: return NULL;
+  default: return nullptr;
    case X86ISD::BSF:                return "X86ISD::BSF";
    case X86ISD::BSR:                return "X86ISD::BSR";
    case X86ISD::SHLD:               return "X86ISD::SHLD";
@@ -14489,6 +14698,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
    case X86ISD::VPERMIV3:           return "X86ISD::VPERMIV3";
    case X86ISD::VPERMI:             return "X86ISD::VPERMI";
    case X86ISD::PMULUDQ:            return "X86ISD::PMULUDQ";
+  case X86ISD::PMULDQ:             return "X86ISD::PMULDQ";
    case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
    case X86ISD::VAARG_64:           return "X86ISD::VAARG_64";
    case X86ISD::WIN_ALLOCA:         return "X86ISD::WIN_ALLOCA";
@@ -14519,7 +14729,7 @@ bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM,
    Reloc::Model R = getTargetMachine().getRelocationModel();
  
    // X86 allows a sign-extended 32-bit immediate field as a displacement.
-  if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != NULL))
+  if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
      return false;
  
    if (AM.BaseGV) {
@@ -15645,7 +15855,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(
      OffsetDestReg = 0; // unused
      OverflowDestReg = DestReg;
  
-    offsetMBB = NULL;
+    offsetMBB = nullptr;
      overflowMBB = thisMBB;
      endMBB = thisMBB;
    } else {
@@ -16960,7 +17170,6 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
          SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() };
          SDValue ResNode =
            DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
-                                  array_lengthof(Ops),
                                    Ld->getMemoryVT(),
                                    Ld->getPointerInfo(),
                                    Ld->getAlignment(),
@@ -17886,7 +18095,7 @@ static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
    SDValue Op2 = Cmp.getOperand(1);
  
    SDValue SetCC;
-  const ConstantSDNode* C = 0;
+  const ConstantSDNode* C = nullptr;
    bool needOppositeCond = (CC == X86::COND_E);
    bool checkAgainstTrue = false; // Is it a comparison against 1?
  
@@ -18021,8 +18230,7 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
        (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC))) {
      SDValue Ops[] = { FalseOp, TrueOp,
                        DAG.getConstant(CC, MVT::i8), Flags };
-    return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(),
-                       Ops, array_lengthof(Ops));
+    return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
    }
  
    // If this is a select between two integer constants, try to do some
@@ -18137,7 +18345,7 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
      // the DCI.xxxx conditions are provided to postpone the optimization as
      // late as possible.
  
-    ConstantSDNode *CmpAgainst = 0;
+    ConstantSDNode *CmpAgainst = nullptr;
      if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
          (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
          !isa<ConstantSDNode>(Cond.getOperand(0))) {
@@ -18152,8 +18360,7 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
            CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
          SDValue Ops[] = { FalseOp, Cond.getOperand(0),
                            DAG.getConstant(CC, MVT::i8), Cond };
-        return DAG.getNode(X86ISD::CMOV, DL, N->getVTList (), Ops,
-                           array_lengthof(Ops));
+        return DAG.getNode(X86ISD::CMOV, DL, N->getVTList (), Ops);
        }
      }
    }
@@ -18504,7 +18711,7 @@ static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
      N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getScalarType(),
                       N1->getOperand(0));
      SmallVector<SDValue, 8> C(WideVT.getVectorNumElements(), N1);
-    N1 = DAG.getNode(ISD::BUILD_VECTOR, DL, WideVT, &C[0], C.size());
+    N1 = DAG.getNode(ISD::BUILD_VECTOR, DL, WideVT, C);
    } else if (RHSTrunc) {
      N1 = N1->getOperand(0);
    }
@@ -18787,8 +18994,7 @@ static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
          SDValue Ops[] = { N0.getOperand(0), Neg,
                            DAG.getConstant(X86::COND_GE, MVT::i8),
                            SDValue(Neg.getNode(), 1) };
-        return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue),
-                           Ops, array_lengthof(Ops));
+        return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), Ops);
        }
    return SDValue();
  }
@@ -18945,8 +19151,7 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
        Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
      }
  
-    SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Chains[0],
-                               Chains.size());
+    SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
  
      // Bitcast the loaded value to a vector of the original element type, in
      // the size of the target vector type.
@@ -19121,8 +19326,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
        Chains.push_back(Ch);
      }
  
-    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Chains[0],
-                               Chains.size());
+    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
    }
  
    // Turn load->store of MMX types into GPR load/stores.  This avoids clobbering
@@ -19145,7 +19349,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
        !cast<LoadSDNode>(St->getValue())->isVolatile() &&
        St->getChain().hasOneUse() && !St->isVolatile()) {
      SDNode* LdVal = St->getValue().getNode();
-    LoadSDNode *Ld = 0;
+    LoadSDNode *Ld = nullptr;
      int TokenFactorIndex = -1;
      SmallVector<SDValue, 8> Ops;
      SDNode* ChainVal = St->getChain().getNode();
@@ -19188,8 +19392,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
        SDValue NewChain = NewLd.getValue(1);
        if (TokenFactorIndex != -1) {
          Ops.push_back(NewChain);
-        NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0],
-                               Ops.size());
+        NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
        }
        return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
                            St->getPointerInfo(),
@@ -19216,8 +19419,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
      if (TokenFactorIndex != -1) {
        Ops.push_back(LoLd);
        Ops.push_back(HiLd);
-      NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0],
-                             Ops.size());
+      NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
      }
  
      LoAddr = St->getBasePtr();
@@ -20260,7 +20462,7 @@ TargetLowering::ConstraintWeight
    Value *CallOperandVal = info.CallOperandVal;
      // If we don't have a value, we can't do a match,
      // but allow it at the lowest weight.
-  if (CallOperandVal == NULL)
+  if (!CallOperandVal)
      return CW_Default;
    Type *type = CallOperandVal->getType();
    // Look at the constraint type.
@@ -20378,7 +20580,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
                                                       std::string &Constraint,
                                                       std::vector<SDValue>&Ops,
                                                       SelectionDAG &DAG) const {
-  SDValue Result(0, 0);
+  SDValue Result;
  
    // Only support length 1 constraints for now.
    if (Constraint.length() > 1) return;
@@ -20461,7 +20663,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
  
      // If we are in non-pic codegen mode, we allow the address of a global (with
      // an optional displacement) to be used with 'i'.
-    GlobalAddressSDNode *GA = 0;
+    GlobalAddressSDNode *GA = nullptr;
      int64_t Offset = 0;
  
      // Match either (GA), (GA+C), (GA+C1+C2), etc.
@@ -20617,7 +20819,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
    Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
  
    // Not found as a standard register?
-  if (Res.second == 0) {
+  if (!Res.second) {
      // Map st(0) -> st(7) -> ST0
      if (Constraint.size() == 7 && Constraint[0] == '{' &&
          tolower(Constraint[1]) == 's' &&
@@ -20742,3 +20944,30 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
  
    return Res;
  }
+
+int X86TargetLowering::getScalingFactorCost(const AddrMode &AM,
+                                            Type *Ty) const {
+  // Returns 1 if AM.Scale is nonzero, 0 if it is zero, and -1 if AM is not
+  // a legal addressing mode for Ty. Scaling factors are not free at all.
+  // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
+  // will take 2 allocations in the out of order engine instead of 1
+  // for plain addressing mode, i.e. inst (reg1). E.g.,
+  // vaddps (%rsi,%rdx), %ymm0, %ymm1
+  // Requires two allocations (one for the load, one for the computation)
+  // whereas:
+  // vaddps (%rsi), %ymm0, %ymm1
+  // Requires just 1 allocation, i.e., freeing allocations for other operations
+  // and having fewer micro operations to execute.
+  //
+  // For some X86 architectures, this is even worse because for instance for
+  // stores, the complex addressing mode forces the instruction to use the
+  // "load" ports instead of the dedicated "store" port.
+  // E.g., on Haswell:
+  // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
+  // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
+  if (isLegalAddressingMode(AM, Ty))
+    // Scale represents reg2 * scale, thus account for 1
+    // as soon as we use a second register (and for 0 when Scale is zero).
+    return AM.Scale != 0;
+  return -1;
+}