setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
-
- // The _ftol2 runtime function has an unusual calling conv, which
- // is modeled by a special pseudo-instruction.
- setLibcallName(RTLIB::FPTOUINT_F64_I64, nullptr);
- setLibcallName(RTLIB::FPTOUINT_F32_I64, nullptr);
- setLibcallName(RTLIB::FPTOUINT_F64_I32, nullptr);
- setLibcallName(RTLIB::FPTOUINT_F32_I32, nullptr);
}
if (Subtarget->isTargetDarwin()) {
setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote);
if (Subtarget->is64Bit()) {
- setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand);
- setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote);
+ if (!Subtarget->useSoftFloat() && Subtarget->hasAVX512()) {
+ // FP_TO_UINT-i32/i64 is legal for f32/f64, but custom for f80.
+ setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom);
+ setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom);
+ } else {
+ setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote);
+ setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand);
+ }
} else if (!Subtarget->useSoftFloat()) {
// Since AVX is a superset of SSE3, only check for SSE here.
if (Subtarget->hasSSE1() && !Subtarget->hasSSE3())
// the optimal thing for SSE vs. the default expansion in the legalizer.
setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Expand);
else
+ // With AVX512 we can use vcvts[ds]2usi for f32/f64->i32; f80 is custom.
// With SSE3 we can use fisttpll to convert to a signed i64; without
// SSE, we're stuck with a fistpll.
setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom);
- }
- if (isTargetFTOL()) {
- // Use the _ftol2 runtime function, which has a pseudo-instruction
- // to handle its weird calling convention.
setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom);
}
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);
setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand);
- setOperationAction(ISD::FREM , MVT::f32 , Expand);
+
+ if (Subtarget->is32Bit() && Subtarget->isTargetKnownWindowsMSVC()) {
+ // On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
+ // is. We should promote the value to 64-bits to solve this.
+ // This is what the CRT headers do - `fmodf` is an inline header
+ // function casting to f64 and calling `fmod`.
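+ // With Promote, the f32 operands are extended to f64, the f64 FREM
+ // (expanded to the fmod libcall below) is used, and the result is
+ // truncated back to f32.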
+ setOperationAction(ISD::FREM , MVT::f32 , Promote);
+ } else {
+ setOperationAction(ISD::FREM , MVT::f32 , Expand);
+ }
+
setOperationAction(ISD::FREM , MVT::f64 , Expand);
setOperationAction(ISD::FREM , MVT::f80 , Expand);
setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom);
setOperationAction(ISD::SETCC , MVT::i64 , Custom);
}
setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);
+ setOperationAction(ISD::CATCHRET , MVT::Other, Custom);
+ setOperationAction(ISD::CLEANUPRET , MVT::Other, Custom);
// NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support
// SjLj exception handling but a light-weight setjmp/longjmp replacement to
// support continuation, user-level threading, etc. As a result, no
setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
}
- if (Subtarget->is64Bit()) {
+ if (Subtarget->isTarget64BitLP64()) {
setExceptionPointerRegister(X86::RAX);
setExceptionSelectorRegister(X86::RDX);
} else {
// VASTART needs to be custom lowered to use the VarArgsFrameIndex
setOperationAction(ISD::VASTART , MVT::Other, Custom);
setOperationAction(ISD::VAEND , MVT::Other, Expand);
- if (Subtarget->is64Bit() && !Subtarget->isTargetWin64()) {
- // TargetInfo::X86_64ABIBuiltinVaList
+ if (Subtarget->is64Bit()) {
setOperationAction(ISD::VAARG , MVT::Other, Custom);
setOperationAction(ISD::VACOPY , MVT::Other, Custom);
} else {
setOperationAction(ISD::FMA, MVT::v8f64, Legal);
setOperationAction(ISD::FMA, MVT::v16f32, Legal);
- setOperationAction(ISD::FP_TO_SINT, MVT::i32, Legal);
- setOperationAction(ISD::FP_TO_UINT, MVT::i32, Legal);
+ // FIXME: [US]INT_TO_FP are not legal for f80.
setOperationAction(ISD::SINT_TO_FP, MVT::i32, Legal);
setOperationAction(ISD::UINT_TO_FP, MVT::i32, Legal);
if (Subtarget->is64Bit()) {
- setOperationAction(ISD::FP_TO_UINT, MVT::i64, Legal);
- setOperationAction(ISD::FP_TO_SINT, MVT::i64, Legal);
setOperationAction(ISD::SINT_TO_FP, MVT::i64, Legal);
setOperationAction(ISD::UINT_TO_FP, MVT::i64, Legal);
}
if (Subtarget->hasCDI()) {
setOperationAction(ISD::CTLZ, MVT::v8i64, Legal);
setOperationAction(ISD::CTLZ, MVT::v16i32, Legal);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i64, Legal);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v16i32, Legal);
+ }
+ if (Subtarget->hasVLX() && Subtarget->hasCDI()) {
+ setOperationAction(ISD::CTLZ, MVT::v4i64, Legal);
+ setOperationAction(ISD::CTLZ, MVT::v8i32, Legal);
+ setOperationAction(ISD::CTLZ, MVT::v2i64, Legal);
+ setOperationAction(ISD::CTLZ, MVT::v4i32, Legal);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i64, Legal);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i32, Legal);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v2i64, Legal);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i32, Legal);
}
if (Subtarget->hasDQI()) {
setOperationAction(ISD::MUL, MVT::v2i64, Legal);
setOperationAction(ISD::MUL, MVT::v32i16, Legal);
setOperationAction(ISD::MULHS, MVT::v32i16, Legal);
setOperationAction(ISD::MULHU, MVT::v32i16, Legal);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Legal);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Legal);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom);
setOperationAction(ISD::SELECT, MVT::v32i1, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i16, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i1, Custom);
if ((!IsMemset || ZeroMemset) &&
!F->hasFnAttribute(Attribute::NoImplicitFloat)) {
if (Size >= 16 &&
- (Subtarget->isUnalignedMemAccessFast() ||
+ (!Subtarget->isUnalignedMem16Slow() ||
((DstAlign == 0 || DstAlign >= 16) &&
(SrcAlign == 0 || SrcAlign >= 16)))) {
if (Size >= 32) {
+ // FIXME: Check if unaligned 32-byte accesses are slow.
if (Subtarget->hasInt256())
return MVT::v8i32;
if (Subtarget->hasFp256())
return MVT::f64;
}
}
+ // This is a compromise. If we reach here, unaligned accesses may be slow on
+ // this target. However, creating smaller, aligned accesses could be even
+ // slower and would certainly be a lot more code.
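+ // For example, on such a target a 16-byte copy to a destination known to
+ // be only 4-byte aligned falls through to the i64/i32 cases below rather
+ // than using a single unaligned 16-byte vector access.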
if (Subtarget->is64Bit() && Size >= 8)
return MVT::i64;
return MVT::i32;
unsigned,
bool *Fast) const {
if (Fast) {
- // FIXME: We should be checking 128-bit accesses separately from smaller
- // accesses.
- if (VT.getSizeInBits() == 256)
+ switch (VT.getSizeInBits()) {
+ default:
+ // 8-byte and under are always assumed to be fast.
+ *Fast = true;
+ break;
+ case 128:
+ *Fast = !Subtarget->isUnalignedMem16Slow();
+ break;
+ case 256:
*Fast = !Subtarget->isUnalignedMem32Slow();
- else
- *Fast = Subtarget->isUnalignedMemAccessFast();
+ break;
+ // TODO: What about AVX-512 (512-bit) accesses?
+ }
}
+ // Misaligned accesses of any size are always allowed.
return true;
}
case X86ISD::VPERMILPI:
case X86ISD::VPERM2X128:
case X86ISD::VPERMI:
+ case X86ISD::VPERMV:
+ case X86ISD::VPERMV3:
return true;
}
}
case X86ISD::MOVLPS:
// Not yet implemented
return false;
+ case X86ISD::VPERMV: {
+ IsUnary = true;
+ SDValue MaskNode = N->getOperand(0);
+ while (MaskNode->getOpcode() == ISD::BITCAST)
+ MaskNode = MaskNode->getOperand(0);
+
+ unsigned MaskLoBits = Log2_64(VT.getVectorNumElements());
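+ // Only the low Log2(NumElts) bits of each mask element form the index,
+ // e.g. 4 bits per element for a v16i32 permute.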
+ SmallVector<uint64_t, 32> RawMask;
+ if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) {
+ // If we have a build-vector, then things are easy.
+ assert(MaskNode.getValueType().isInteger() &&
+ MaskNode.getValueType().getVectorNumElements() ==
+ VT.getVectorNumElements());
+
+ for (unsigned i = 0; i < MaskNode->getNumOperands(); ++i) {
+ SDValue Op = MaskNode->getOperand(i);
+ if (Op->getOpcode() == ISD::UNDEF)
+ RawMask.push_back((uint64_t)SM_SentinelUndef);
+ else if (isa<ConstantSDNode>(Op)) {
+ APInt MaskElement = cast<ConstantSDNode>(Op)->getAPIntValue();
+ RawMask.push_back(MaskElement.getLoBits(MaskLoBits).getZExtValue());
+ } else
+ return false;
+ }
+ DecodeVPERMVMask(RawMask, Mask);
+ break;
+ }
+ if (MaskNode->getOpcode() == X86ISD::VBROADCAST) {
+ unsigned NumEltsInMask = MaskNode->getValueType(0).getVectorNumElements();
+ MaskNode = MaskNode->getOperand(0);
+ auto *CN = dyn_cast<ConstantSDNode>(MaskNode);
+ if (CN) {
+ APInt MaskEltValue = CN->getAPIntValue();
+ for (unsigned i = 0; i < NumEltsInMask; ++i)
+ RawMask.push_back(MaskEltValue.getLoBits(MaskLoBits).getZExtValue());
+ DecodeVPERMVMask(RawMask, Mask);
+ break;
+ }
+ // Otherwise the mask may be a broadcast of a scalar load; fall through
+ // to the load handling below.
+ }
+
+ auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode);
+ if (!MaskLoad)
+ return false;
+
+ SDValue Ptr = MaskLoad->getBasePtr();
+ if (Ptr->getOpcode() == X86ISD::Wrapper ||
+ Ptr->getOpcode() == X86ISD::WrapperRIP)
+ Ptr = Ptr->getOperand(0);
+
+ auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr);
+ if (!MaskCP || MaskCP->isMachineConstantPoolEntry())
+ return false;
+
+ auto *C = dyn_cast<Constant>(MaskCP->getConstVal());
+ if (C) {
+ DecodeVPERMVMask(C, VT, Mask);
+ if (Mask.empty())
+ return false;
+ break;
+ }
+ return false;
+ }
+ case X86ISD::VPERMV3: {
+ IsUnary = false;
+ SDValue MaskNode = N->getOperand(1);
+ while (MaskNode->getOpcode() == ISD::BITCAST)
+ MaskNode = MaskNode->getOperand(0);
+
+ if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) {
+ // If we have a build-vector, then things are easy.
+ assert(MaskNode.getValueType().isInteger() &&
+ MaskNode.getValueType().getVectorNumElements() ==
+ VT.getVectorNumElements());
+
+ SmallVector<uint64_t, 32> RawMask;
+ unsigned MaskLoBits = Log2_64(VT.getVectorNumElements()*2);
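+ // VPERMV3 indexes into the concatenation of both source vectors, so each
+ // index needs one extra bit compared to the single-source VPERMV case.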
+
+ for (unsigned i = 0; i < MaskNode->getNumOperands(); ++i) {
+ SDValue Op = MaskNode->getOperand(i);
+ if (Op->getOpcode() == ISD::UNDEF)
+ RawMask.push_back((uint64_t)SM_SentinelUndef);
+ else {
+ auto *CN = dyn_cast<ConstantSDNode>(Op.getNode());
+ if (!CN)
+ return false;
+ APInt MaskElement = CN->getAPIntValue();
+ RawMask.push_back(MaskElement.getLoBits(MaskLoBits).getZExtValue());
+ }
+ }
+ DecodeVPERMV3Mask(RawMask, Mask);
+ break;
+ }
+
+ auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode);
+ if (!MaskLoad)
+ return false;
+
+ SDValue Ptr = MaskLoad->getBasePtr();
+ if (Ptr->getOpcode() == X86ISD::Wrapper ||
+ Ptr->getOpcode() == X86ISD::WrapperRIP)
+ Ptr = Ptr->getOperand(0);
+
+ auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr);
+ if (!MaskCP || MaskCP->isMachineConstantPoolEntry())
+ return false;
+
+ auto *C = dyn_cast<Constant>(MaskCP->getConstVal());
+ if (C) {
+ DecodeVPERMV3Mask(C, VT, Mask);
+ if (Mask.empty())
+ return false;
+ break;
+ }
+ return false;
+ }
default: llvm_unreachable("unknown target shuffle node");
}
Hi = DAG.getBitcast(AlignVT, Hi);
return DAG.getBitcast(
- VT, DAG.getNode(X86ISD::PALIGNR, DL, AlignVT, Hi, Lo,
+ VT, DAG.getNode(X86ISD::PALIGNR, DL, AlignVT, Lo, Hi,
DAG.getConstant(Rotation * Scale, DL, MVT::i8)));
}
// Check if this is a broadcast of a scalar. We special case lowering
// for scalars so that we can more effectively fold with loads.
+ // First, look through bitcast: if the original value has a larger element
+ // type than the shuffle, the broadcast element is in essence truncated.
+ // Make that explicit to ease folding.
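+ // For example, broadcasting lane 0 of (v4i32 (bitcast (v2i64
+ // scalar_to_vector X))) can instead broadcast (trunc X to i32), which
+ // folds more readily with a load of X.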
+ if (V.getOpcode() == ISD::BITCAST && VT.isInteger()) {
+ EVT EltVT = VT.getVectorElementType();
+ SDValue V0 = V.getOperand(0);
+ EVT V0VT = V0.getValueType();
+
+ if (V0VT.isInteger() && V0VT.getVectorElementType().bitsGT(EltVT) &&
+ ((V0.getOpcode() == ISD::BUILD_VECTOR ||
+ (V0.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)))) {
+ V = DAG.getNode(ISD::TRUNCATE, DL, EltVT, V0.getOperand(BroadcastIdx));
+ BroadcastIdx = 0;
+ }
+ }
+
+ // Also check the simpler case, where we can directly reuse the scalar.
if (V.getOpcode() == ISD::BUILD_VECTOR ||
(V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
V = V.getOperand(BroadcastIdx);
}
}
+static SDValue lowerVectorShuffleWithPERMV(SDLoc DL, MVT VT,
+ ArrayRef<int> Mask, SDValue V1,
+ SDValue V2, SelectionDAG &DAG) {
+
+ assert(VT.getScalarSizeInBits() >= 16 && "Unexpected data type for PERMV");
+
+ MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
+ MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements());
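+ // For example, a v16f32 shuffle builds its index vector as a v16i32
+ // build_vector of the mask constants below.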
+
+ SmallVector<SDValue, 32> VPermMask;
+ for (unsigned i = 0; i < VT.getVectorNumElements(); ++i)
+ VPermMask.push_back(Mask[i] < 0 ? DAG.getUNDEF(MaskEltVT) :
+ DAG.getConstant(Mask[i], DL, MaskEltVT));
+ SDValue MaskNode = DAG.getNode(ISD::BUILD_VECTOR, DL, MaskVecVT,
+ VPermMask);
+ if (isSingleInputShuffleMask(Mask))
+ return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1);
+
+ return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2);
+}
+
+// X86 has dedicated unpack instructions that can handle specific blend
+// operations: UNPCKH and UNPCKL.
+static SDValue lowerVectorShuffleWithUNPCK(SDLoc DL, MVT VT,
+ ArrayRef<int> Mask, SDValue V1,
+ SDValue V2, SelectionDAG &DAG) {
+ int NumElts = VT.getVectorNumElements();
+ bool Unpckl = true;
+ bool Unpckh = true;
+ bool UnpcklSwapped = true;
+ bool UnpckhSwapped = true;
+ int NumEltsInLane = 128 / VT.getScalarSizeInBits();
+
+ for (int i = 0; i < NumElts ; ++i) {
+ unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
+
+ int LoPos = (i % NumEltsInLane) / 2 + LaneStart + NumElts * (i % 2);
+ int HiPos = LoPos + NumEltsInLane / 2;
+ int LoPosSwapped = (LoPos + NumElts) % (NumElts * 2);
+ int HiPosSwapped = (HiPos + NumElts) % (NumElts * 2);
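+ // For example, for v8f64 (NumElts == 8, NumEltsInLane == 2) the UNPCKL
+ // positions are {0, 8, 2, 10, 4, 12, 6, 14} and the UNPCKH positions are
+ // {1, 9, 3, 11, 5, 13, 7, 15}.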
+
+ if (Mask[i] == -1)
+ continue;
+ if (Mask[i] != LoPos)
+ Unpckl = false;
+ if (Mask[i] != HiPos)
+ Unpckh = false;
+ if (Mask[i] != LoPosSwapped)
+ UnpcklSwapped = false;
+ if (Mask[i] != HiPosSwapped)
+ UnpckhSwapped = false;
+ if (!Unpckl && !Unpckh && !UnpcklSwapped && !UnpckhSwapped)
+ return SDValue();
+ }
+ if (Unpckl)
+ return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
+ if (Unpckh)
+ return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
+ if (UnpcklSwapped)
+ return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
+ if (UnpckhSwapped)
+ return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
+
+ llvm_unreachable("Unexpected result of UNPCK mask analysis");
+ return SDValue();
+}
+
/// \brief Handle lowering of 8-lane 64-bit floating point shuffles.
static SDValue lowerV8F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
const X86Subtarget *Subtarget,
ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
- // X86 has dedicated unpack instructions that can handle specific blend
- // operations: UNPCKH and UNPCKL.
- if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 2, 10, 4, 12, 6, 14}))
- return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f64, V1, V2);
- if (isShuffleEquivalent(V1, V2, Mask, {1, 9, 3, 11, 5, 13, 7, 15}))
- return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f64, V1, V2);
+ SDValue UnpckNode =
+ lowerVectorShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG);
+ if (UnpckNode)
+ return UnpckNode;
- // FIXME: Implement direct support for this type!
- return splitAndLowerVectorShuffle(DL, MVT::v8f64, V1, V2, Mask, DAG);
+ return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
}
/// \brief Handle lowering of 16-lane 32-bit floating point shuffles.
ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
- // Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(V1, V2, Mask,
- {// First 128-bit lane.
- 0, 16, 1, 17, 4, 20, 5, 21,
- // Second 128-bit lane.
- 8, 24, 9, 25, 12, 28, 13, 29}))
- return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16f32, V1, V2);
- if (isShuffleEquivalent(V1, V2, Mask,
- {// First 128-bit lane.
- 2, 18, 3, 19, 6, 22, 7, 23,
- // Second 128-bit lane.
- 10, 26, 11, 27, 14, 30, 15, 31}))
- return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16f32, V1, V2);
+ SDValue UnpckNode =
+ lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG);
+ if (UnpckNode)
+ return UnpckNode;
- // FIXME: Implement direct support for this type!
- return splitAndLowerVectorShuffle(DL, MVT::v16f32, V1, V2, Mask, DAG);
+ return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG);
}
/// \brief Handle lowering of 8-lane 64-bit integer shuffles.
ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
- // X86 has dedicated unpack instructions that can handle specific blend
- // operations: UNPCKH and UNPCKL.
- if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 2, 10, 4, 12, 6, 14}))
- return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i64, V1, V2);
- if (isShuffleEquivalent(V1, V2, Mask, {1, 9, 3, 11, 5, 13, 7, 15}))
- return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i64, V1, V2);
+ SDValue UnpckNode =
+ lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG);
+ if (UnpckNode)
+ return UnpckNode;
- // FIXME: Implement direct support for this type!
- return splitAndLowerVectorShuffle(DL, MVT::v8i64, V1, V2, Mask, DAG);
+ return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);
}
/// \brief Handle lowering of 16-lane 32-bit integer shuffles.
ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
- // Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(V1, V2, Mask,
- {// First 128-bit lane.
- 0, 16, 1, 17, 4, 20, 5, 21,
- // Second 128-bit lane.
- 8, 24, 9, 25, 12, 28, 13, 29}))
- return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i32, V1, V2);
- if (isShuffleEquivalent(V1, V2, Mask,
- {// First 128-bit lane.
- 2, 18, 3, 19, 6, 22, 7, 23,
- // Second 128-bit lane.
- 10, 26, 11, 27, 14, 30, 15, 31}))
- return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i32, V1, V2);
+ SDValue UnpckNode =
+ lowerVectorShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG);
+ if (UnpckNode)
+ return UnpckNode;
- // FIXME: Implement direct support for this type!
- return splitAndLowerVectorShuffle(DL, MVT::v16i32, V1, V2, Mask, DAG);
+ return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
}
/// \brief Handle lowering of 32-lane 16-bit integer shuffles.
assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
assert(Subtarget->hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
- // FIXME: Implement direct support for this type!
- return splitAndLowerVectorShuffle(DL, MVT::v32i16, V1, V2, Mask, DAG);
+ return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG);
}
/// \brief Handle lowering of 64-lane 8-bit integer shuffles.
if (VT.getSizeInBits() == 256)
return lower256BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
- // Force AVX-512 vectors to be scalarized for now.
- // FIXME: Implement AVX-512 support!
if (VT.getSizeInBits() == 512)
return lower512BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
unsigned NumLanes = (NumElems - 1) / 8 + 1;
unsigned NumElemsInLane = NumElems / NumLanes;
- // Blend for v16i16 should be symetric for the both lanes.
+ // Blend for v16i16 should be symmetric for the both lanes.
for (unsigned i = 0; i < NumElemsInLane; ++i) {
SDValue EltCond = BuildVector->getOperand(i);
SDValue SndLaneEltCond =
DAG.getIntPtrConstant(0, dl));
}
+// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
+// is legal, or has an f16 source (which needs to be promoted to f32),
+// just return an <SDValue(), SDValue()> pair.
+// Otherwise it is assumed to be a conversion from one of f32, f64 or f80
+// to i16, i32 or i64, and we lower it to a legal sequence.
+// If lowered to the final integer result we return a <result, SDValue()> pair.
+// Otherwise we lower it to a sequence ending with a FIST, return a
+// <FIST, StackSlot> pair, and the caller is responsible for loading
+// the final integer result from StackSlot.
std::pair<SDValue,SDValue>
-X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
- bool IsSigned, bool IsReplace) const {
+X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
+ bool IsSigned, bool IsReplace) const {
SDLoc DL(Op);
EVT DstTy = Op.getValueType();
+ EVT TheVT = Op.getOperand(0).getValueType();
auto PtrVT = getPointerTy(DAG.getDataLayout());
- if (!IsSigned && !isIntegerTypeFTOL(DstTy)) {
+ if (TheVT == MVT::f16)
+ // We need to promote the f16 to f32 before using the lowering
+ // in this routine.
+ return std::make_pair(SDValue(), SDValue());
+
+ assert((TheVT == MVT::f32 ||
+ TheVT == MVT::f64 ||
+ TheVT == MVT::f80) &&
+ "Unexpected FP operand type in FP_TO_INTHelper");
+
+ // If using FIST to compute an unsigned i64, we'll need some fixup
+ // to handle values above the maximum signed i64. A FIST is always
+ // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
+ bool UnsignedFixup = !IsSigned &&
+ DstTy == MVT::i64 &&
+ (!Subtarget->is64Bit() ||
+ !isScalarFPTypeInSSEReg(TheVT));
+
+ if (!IsSigned && DstTy != MVT::i64 && !Subtarget->hasAVX512()) {
+ // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
+ // The low 32 bits of the fist result will have the correct uint32 result.
assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
DstTy = MVT::i64;
}
isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
return std::make_pair(SDValue(), SDValue());
- // We lower FP->int64 either into FISTP64 followed by a load from a temporary
- // stack slot, or into the FTOL runtime function.
+ // We lower FP->int64 into FISTP64 followed by a load from a temporary
+ // stack slot.
MachineFunction &MF = DAG.getMachineFunction();
unsigned MemSize = DstTy.getSizeInBits()/8;
int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
unsigned Opc;
- if (!IsSigned && isIntegerTypeFTOL(DstTy))
- Opc = X86ISD::WIN_FTOL;
- else
- switch (DstTy.getSimpleVT().SimpleTy) {
- default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
- case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
- case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
- case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
- }
+ switch (DstTy.getSimpleVT().SimpleTy) {
+ default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
+ case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
+ case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
+ case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
+ }
SDValue Chain = DAG.getEntryNode();
SDValue Value = Op.getOperand(0);
- EVT TheVT = Op.getOperand(0).getValueType();
+ SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
+
+ if (UnsignedFixup) {
+ //
+ // Conversion to unsigned i64 is implemented with a select,
+ // depending on whether the source value fits in the range
+ // of a signed i64. Let Thresh be the FP equivalent of
+ // 0x8000000000000000ULL.
+ //
+ // Adjust i32 = (Value < Thresh) ? 0 : 0x80000000;
+ // FistSrc = (Value < Thresh) ? Value : (Value - Thresh);
+ // Fist-to-mem64 FistSrc
+ // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
+ // to XOR'ing the high 32 bits with Adjust.
+ //
+ // Being a power of 2, Thresh is exactly representable in all FP formats.
+ // For X87 we'd like to use the smallest FP type for this constant, but
+ // for DAG type consistency we have to match the FP operand type.
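+ // For example, for an input equal to 2^63: Value >= Thresh, so we
+ // convert (Value - Thresh) == 0.0 with the FIST and then XOR the high
+ // half of the result with Adjust == 0x80000000, giving
+ // 0x8000000000000000.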
+
+ APFloat Thresh(APFloat::IEEEsingle, APInt(32, 0x5f000000));
+ APFloat::opStatus Status = APFloat::opOK;
+ bool LosesInfo = false;
+ if (TheVT == MVT::f64)
+ // The rounding mode is irrelevant as the conversion should be exact.
+ Status = Thresh.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven,
+ &LosesInfo);
+ else if (TheVT == MVT::f80)
+ Status = Thresh.convert(APFloat::x87DoubleExtended,
+ APFloat::rmNearestTiesToEven, &LosesInfo);
+
+ assert(Status == APFloat::opOK && !LosesInfo &&
+ "FP conversion should have been exact");
+
+ SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
+
+ SDValue Cmp = DAG.getSetCC(DL,
+ getSetCCResultType(DAG.getDataLayout(),
+ *DAG.getContext(), TheVT),
+ Value, ThreshVal, ISD::SETLT);
+ Adjust = DAG.getSelect(DL, MVT::i32, Cmp,
+ DAG.getConstant(0, DL, MVT::i32),
+ DAG.getConstant(0x80000000, DL, MVT::i32));
+ SDValue Sub = DAG.getNode(ISD::FSUB, DL, TheVT, Value, ThreshVal);
+ Cmp = DAG.getSetCC(DL, getSetCCResultType(DAG.getDataLayout(),
+ *DAG.getContext(), TheVT),
+ Value, ThreshVal, ISD::SETLT);
+ Value = DAG.getSelect(DL, TheVT, Cmp, Value, Sub);
+ }
+
// FIXME This causes a redundant load/store if the SSE-class value is already
// in memory, such as if it is on the callstack.
if (isScalarFPTypeInSSEReg(TheVT)) {
MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
MachineMemOperand::MOStore, MemSize, MemSize);
- if (Opc != X86ISD::WIN_FTOL) {
+ if (UnsignedFixup) {
+
+ // Insert the FIST, load its result as two i32's,
+ // and XOR the high i32 with Adjust.
+
+ SDValue FistOps[] = { Chain, Value, StackSlot };
+ SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
+ FistOps, DstTy, MMO);
+
+ SDValue Low32 = DAG.getLoad(MVT::i32, DL, FIST, StackSlot,
+ MachinePointerInfo(),
+ false, false, false, 0);
+ SDValue HighAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackSlot,
+ DAG.getConstant(4, DL, PtrVT));
+
+ SDValue High32 = DAG.getLoad(MVT::i32, DL, FIST, HighAddr,
+ MachinePointerInfo(),
+ false, false, false, 0);
+ High32 = DAG.getNode(ISD::XOR, DL, MVT::i32, High32, Adjust);
+
+ if (Subtarget->is64Bit()) {
+ // Join High32 and Low32 into a 64-bit result.
+ // (High32 << 32) | Low32
+ Low32 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Low32);
+ High32 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, High32);
+ High32 = DAG.getNode(ISD::SHL, DL, MVT::i64, High32,
+ DAG.getConstant(32, DL, MVT::i8));
+ SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i64, High32, Low32);
+ return std::make_pair(Result, SDValue());
+ }
+
+ SDValue ResultOps[] = { Low32, High32 };
+
+ SDValue pair = IsReplace
+ ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResultOps)
+ : DAG.getMergeValues(ResultOps, DL);
+ return std::make_pair(pair, SDValue());
+ } else {
// Build the FP_TO_INT*_IN_MEM
SDValue Ops[] = { Chain, Value, StackSlot };
SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
Ops, DstTy, MMO);
return std::make_pair(FIST, StackSlot);
- } else {
- SDValue ftol = DAG.getNode(X86ISD::WIN_FTOL, DL,
- DAG.getVTList(MVT::Other, MVT::Glue),
- Chain, Value);
- SDValue eax = DAG.getCopyFromReg(ftol, DL, X86::EAX,
- MVT::i32, ftol.getValue(1));
- SDValue edx = DAG.getCopyFromReg(eax.getValue(1), DL, X86::EDX,
- MVT::i32, eax.getValue(2));
- SDValue Ops[] = { eax, edx };
- SDValue pair = IsReplace
- ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops)
- : DAG.getMergeValues(Ops, DL);
- return std::make_pair(pair, SDValue());
}
}
/*IsSigned=*/ true, /*IsReplace=*/ false);
SDValue FIST = Vals.first, StackSlot = Vals.second;
// If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
- if (!FIST.getNode()) return Op;
+ if (!FIST.getNode())
+ return Op;
if (StackSlot.getNode())
// Load the result.
std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
/*IsSigned=*/ false, /*IsReplace=*/ false);
SDValue FIST = Vals.first, StackSlot = Vals.second;
- assert(FIST.getNode() && "Unexpected failure");
+ // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
+ if (!FIST.getNode())
+ return Op;
if (StackSlot.getNode())
// Load the result.
const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
SDLoc DL(Op);
- if (!Subtarget->is64Bit() || Subtarget->isTargetWin64()) {
+ if (!Subtarget->is64Bit() ||
+ Subtarget->isCallingConvWin64(MF.getFunction()->getCallingConv())) {
// vastart just stores the address of the VarArgsFrameIndex slot into the
// memory location argument.
SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
MemOps.push_back(Store);
// Store ptr to reg_save_area.
- FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(8, DL));
+ FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
+ Subtarget->isTarget64BitLP64() ? 8 : 4, DL));
SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
- Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN,
- MachinePointerInfo(SV, 16), false, false, 0);
+ Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN, MachinePointerInfo(
+ SV, Subtarget->isTarget64BitLP64() ? 16 : 12), false, false, 0);
MemOps.push_back(Store);
return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
}
SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
assert(Subtarget->is64Bit() &&
"LowerVAARG only handles 64-bit va_arg!");
- assert((Subtarget->isTargetLinux() ||
- Subtarget->isTargetDarwin()) &&
- "Unhandled target in LowerVAARG");
assert(Op.getNode()->getNumOperands() == 4);
+
+ MachineFunction &MF = DAG.getMachineFunction();
+ if (Subtarget->isCallingConvWin64(MF.getFunction()->getCallingConv()))
+ // The Win64 ABI uses char* instead of a structure.
+ return DAG.expandVAArg(Op.getNode());
+
SDValue Chain = Op.getOperand(0);
SDValue SrcPtr = Op.getOperand(1);
const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
if (ArgMode == 2) {
// Sanity Check: Make sure using fp_offset makes sense.
assert(!Subtarget->useSoftFloat() &&
- !(DAG.getMachineFunction().getFunction()->hasFnAttribute(
- Attribute::NoImplicitFloat)) &&
+ !(MF.getFunction()->hasFnAttribute(Attribute::NoImplicitFloat)) &&
Subtarget->hasSSE1());
}
static SDValue LowerVACOPY(SDValue Op, const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
- // X86-64 va_list is a struct { i32, i32, i8*, i8* }.
+ // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
+ // where a va_list is still an i8*.
assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!");
+ if (Subtarget->isCallingConvWin64(
+ DAG.getMachineFunction().getFunction()->getCallingConv()))
+ // Probably a Win64 va_copy.
+ return DAG.expandVACopy(Op.getNode());
+
SDValue Chain = Op.getOperand(0);
SDValue DstPtr = Op.getOperand(1);
SDValue SrcPtr = Op.getOperand(2);
case INTR_TYPE_2OP:
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
Op.getOperand(2));
+ case INTR_TYPE_2OP_IMM8:
+ return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
+ DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(2)));
case INTR_TYPE_3OP:
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
Op.getOperand(2), Op.getOperand(3));
Src1, Src2, Rnd),
Mask, PassThru, Subtarget, DAG);
}
+ case INTR_TYPE_3OP_SCALAR_MASK_RM: {
+ SDValue Src1 = Op.getOperand(1);
+ SDValue Src2 = Op.getOperand(2);
+ SDValue Src3 = Op.getOperand(3);
+ SDValue PassThru = Op.getOperand(4);
+ SDValue Mask = Op.getOperand(5);
+ SDValue Sae = Op.getOperand(6);
+
+ return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
+ Src2, Src3, Sae),
+ Mask, PassThru, Subtarget, DAG);
+ }
case INTR_TYPE_3OP_MASK_RM: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
Src1, Src2, Imm, Rnd),
Mask, PassThru, Subtarget, DAG);
}
+ case INTR_TYPE_3OP_IMM8_MASK:
case INTR_TYPE_3OP_MASK: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue Src3 = Op.getOperand(3);
SDValue PassThru = Op.getOperand(4);
SDValue Mask = Op.getOperand(5);
+
+ if (IntrData->Type == INTR_TYPE_3OP_IMM8_MASK)
+ Src3 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src3);
// We specify 2 possible opcodes for intrinsics with rounding modes.
// First, we check if the intrinsic may have non-default rounding mode,
// (IntrData->Opc1 != 0), then we check the rounding mode operand.
DAG.getRegister(StoreAddrReg, PtrVT));
}
+SDValue X86TargetLowering::LowerCATCHRET(SDValue Op, SelectionDAG &DAG) const {
+ SDValue Chain = Op.getOperand(0);
+ SDValue Dest = Op.getOperand(1);
+ SDLoc DL(Op);
+
+ MVT PtrVT = getPointerTy(DAG.getDataLayout());
+
+ MachineFunction &MF = DAG.getMachineFunction();
+ if (isAsynchronousEHPersonality(
+ classifyEHPersonality(MF.getFunction()->getPersonalityFn()))) {
+ // For SEH, codegen catchret as a branch for now.
+ // FIXME: Insert something to restore the frame.
+ return DAG.getNode(ISD::BR, DL, MVT::Other, Chain, Dest);
+ }
+
+ unsigned ReturnReg = (PtrVT == MVT::i64 ? X86::RAX : X86::EAX);
+
+ // Load the address of the destination block.
+ // FIXME: Do this without creating a BlockAddress.
+ MachineBasicBlock *DestMBB = cast<BasicBlockSDNode>(Dest)->getBasicBlock();
+ BlockAddress *BA =
+ BlockAddress::get(const_cast<Function *>(MF.getFunction()),
+ const_cast<BasicBlock *>(DestMBB->getBasicBlock()));
+ DestMBB->setHasAddressTaken();
+ SDValue BlockPtr = DAG.getBlockAddress(BA, PtrVT);
+ Chain = DAG.getCopyToReg(Chain, DL, ReturnReg, BlockPtr);
+ return DAG.getNode(X86ISD::CATCHRET, DL, MVT::Other, Chain,
+ DAG.getRegister(ReturnReg, PtrVT));
+}
+
+SDValue X86TargetLowering::LowerCLEANUPRET(SDValue Op, SelectionDAG &DAG) const {
+ return DAG.getNode(X86ISD::CLEANUPRET, SDLoc(Op), MVT::Other,
+ Op.getOperand(0));
+}
+
SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
+ case ISD::CLEANUPRET: return LowerCLEANUPRET(Op, DAG);
+ case ISD::CATCHRET: return LowerCATCHRET(Op, DAG);
case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
return;
}
case ISD::FP_TO_SINT:
- // FP_TO_INT*_IN_MEM is not legal for f16 inputs. Do not convert
- // (FP_TO_SINT (load f16)) to FP_TO_INT*.
- if (N->getOperand(0).getValueType() == MVT::f16)
- break;
- // fallthrough
case ISD::FP_TO_UINT: {
bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
- if (!IsSigned && !isIntegerTypeFTOL(SDValue(N, 0).getValueType()))
- return;
-
std::pair<SDValue,SDValue> Vals =
FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
SDValue FIST = Vals.first, StackSlot = Vals.second;
case X86ISD::FHADD: return "X86ISD::FHADD";
case X86ISD::FHSUB: return "X86ISD::FHSUB";
case X86ISD::ABS: return "X86ISD::ABS";
+ case X86ISD::CONFLICT: return "X86ISD::CONFLICT";
case X86ISD::FMAX: return "X86ISD::FMAX";
case X86ISD::FMAX_RND: return "X86ISD::FMAX_RND";
case X86ISD::FMIN: return "X86ISD::FMIN";
case X86ISD::EH_SJLJ_SETJMP: return "X86ISD::EH_SJLJ_SETJMP";
case X86ISD::EH_SJLJ_LONGJMP: return "X86ISD::EH_SJLJ_LONGJMP";
case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN";
+ case X86ISD::CATCHRET: return "X86ISD::CATCHRET";
+ case X86ISD::CLEANUPRET: return "X86ISD::CLEANUPRET";
case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN";
case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m";
case X86ISD::FNSTSW16r: return "X86ISD::FNSTSW16r";
case X86ISD::TESTM: return "X86ISD::TESTM";
case X86ISD::TESTNM: return "X86ISD::TESTNM";
case X86ISD::KORTEST: return "X86ISD::KORTEST";
+ case X86ISD::KTEST: return "X86ISD::KTEST";
case X86ISD::PACKSS: return "X86ISD::PACKSS";
case X86ISD::PACKUS: return "X86ISD::PACKUS";
case X86ISD::PALIGNR: return "X86ISD::PALIGNR";
case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ";
case X86ISD::PMULDQ: return "X86ISD::PMULDQ";
case X86ISD::PSADBW: return "X86ISD::PSADBW";
+ case X86ISD::DBPSADBW: return "X86ISD::DBPSADBW";
case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
case X86ISD::VAARG_64: return "X86ISD::VAARG_64";
case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA";
case X86ISD::SFENCE: return "X86ISD::SFENCE";
case X86ISD::LFENCE: return "X86ISD::LFENCE";
case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA";
- case X86ISD::WIN_FTOL: return "X86ISD::WIN_FTOL";
case X86ISD::SAHF: return "X86ISD::SAHF";
case X86ISD::RDRAND: return "X86ISD::RDRAND";
case X86ISD::RDSEED: return "X86ISD::RDSEED";
case X86ISD::FMSUBADD_RND: return "X86ISD::FMSUBADD_RND";
case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE";
case X86ISD::VREDUCE: return "X86ISD::VREDUCE";
+ case X86ISD::VGETMANT: return "X86ISD::VGETMANT";
case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI";
case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI";
case X86ISD::XTEST: return "X86ISD::XTEST";
int64_t RegSaveFrameIndex = MI->getOperand(1).getImm();
int64_t VarArgsFPOffset = MI->getOperand(2).getImm();
- if (!Subtarget->isTargetWin64()) {
+ if (!Subtarget->isCallingConvWin64(F->getFunction()->getCallingConv())) {
// If %al is 0, branch around the XMM save block.
BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB);
N1C && N0.getOpcode() == ISD::AND &&
N0.getOperand(1).getOpcode() == ISD::Constant) {
SDValue N00 = N0.getOperand(0);
- if (N00.getOpcode() == X86ISD::SETCC_CARRY ||
- ((N00.getOpcode() == ISD::ANY_EXTEND ||
- N00.getOpcode() == ISD::ZERO_EXTEND) &&
- N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) {
- APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
- APInt ShAmt = N1C->getAPIntValue();
- Mask = Mask.shl(ShAmt);
- if (Mask != 0) {
- SDLoc DL(N);
- return DAG.getNode(ISD::AND, DL, VT,
- N00, DAG.getConstant(Mask, DL, VT));
- }
+ APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
+ APInt ShAmt = N1C->getAPIntValue();
+ Mask = Mask.shl(ShAmt);
+ bool MaskOK = false;
+ // We can handle cases concerning bit-widening nodes containing setcc_c if
+ // we carefully interrogate the mask to make sure we are semantics
+ // preserving.
+ // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
+ // of the underlying setcc_c operation if the setcc_c was zero extended.
+ // Consider the following example:
+ // zext(setcc_c) -> i32 0x0000FFFF
+ // c1 -> i32 0x0000FFFF
+ // c2 -> i32 0x00000001
+ // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
+ // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE
+ if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
+ MaskOK = true;
+ } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
+ N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
+ MaskOK = true;
+ } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
+ N00.getOpcode() == ISD::ANY_EXTEND) &&
+ N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
+ MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
+ }
+ if (MaskOK && Mask != 0) {
+ SDLoc DL(N);
+ return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
}
}
return SDValue();
}
-// PerformXorCombine - Attempts to turn XOR nodes into BLSMSK nodes
+// Try to turn tests against the signbit in the form of:
+// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
+// into:
+// SETGT(X, -1)
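+// For example, for an i32 value X this turns
+// XOR(TRUNCATE(SRL(X, 31)), 1)
+// into SETGT(X, -1).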
+static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
+ // This is only worth doing if the output type is i8.
+ if (N->getValueType(0) != MVT::i8)
+ return SDValue();
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+
+ // We should be performing an xor against a truncated shift.
+ if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
+ return SDValue();
+
+ // Make sure we are performing an xor against one.
+ if (!isa<ConstantSDNode>(N1) || !cast<ConstantSDNode>(N1)->isOne())
+ return SDValue();
+
+ // SetCC on x86 zero extends so only act on this if it's a logical shift.
+ SDValue Shift = N0.getOperand(0);
+ if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
+ return SDValue();
+
+ // Make sure we are truncating from one of i16, i32 or i64.
+ EVT ShiftTy = Shift.getValueType();
+ if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
+ return SDValue();
+
+ // Make sure the shift amount extracts the sign bit.
+ if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
+ Shift.getConstantOperandVal(1) != ShiftTy.getSizeInBits() - 1)
+ return SDValue();
+
+ // Create a greater-than comparison against -1.
+ // N.B. Using SETGE against 0 works, but we want a canonical-looking
+ // comparison; using SETGT matches up with what TranslateX86CC does.
+ SDLoc DL(N);
+ SDValue ShiftOp = Shift.getOperand(0);
+ EVT ShiftOpTy = ShiftOp.getValueType();
+ SDValue Cond = DAG.getSetCC(DL, MVT::i8, ShiftOp,
+ DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
+ return Cond;
+}
+
static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget *Subtarget) {
if (DCI.isBeforeLegalizeOps())
return SDValue();
+ if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
+ return RV;
+
if (Subtarget->hasCMov())
if (SDValue RV = performIntegerAbsCombine(N, DAG))
return RV;
return -1;
}
-bool X86TargetLowering::isTargetFTOL() const {
- return Subtarget->isTargetKnownWindowsMSVC() && !Subtarget->is64Bit();
+bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const {
+ // Integer division on x86 is expensive. However, when aggressively optimizing
+ // for code size, we prefer to use a div instruction, as it is usually smaller
+ // than the alternative sequence.
+ // The exception to this is vector division. Since x86 doesn't have vector
+ // integer division, leaving the division as-is is a loss even in terms of
+ // size, because it will have to be scalarized, while the alternative code
+ // sequence can be performed in vector form.
+ bool OptSize = Attr.hasAttribute(AttributeSet::FunctionIndex,
+ Attribute::MinSize);
+ return OptSize && !VT.isVector();
}