[X86] Part 2 to fix x86-64 fp128 calling convention.

[oota-llvm.git] / lib / Target / X86 / X86ISelLowering.cpp
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index 0bf5ee68d1341e5174dc71316e58f465cf73edc6..829d6302bcee65eeecbac5bcc595c27f9a8c8a29 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -25,7 +25,7 @@
  #include "llvm/ADT/Statistic.h"
  #include "llvm/ADT/StringExtras.h"
  #include "llvm/ADT/StringSwitch.h"
-#include "llvm/Analysis/LibCallSemantics.h"
+#include "llvm/Analysis/EHPersonalities.h"
  #include "llvm/CodeGen/IntrinsicLowering.h"
  #include "llvm/CodeGen/MachineFrameInfo.h"
  #include "llvm/CodeGen/MachineFunction.h"
@@ -296,6 +296,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
    setOperationAction(ISD::BR_CC            , MVT::f32,   Expand);
    setOperationAction(ISD::BR_CC            , MVT::f64,   Expand);
    setOperationAction(ISD::BR_CC            , MVT::f80,   Expand);
+  setOperationAction(ISD::BR_CC            , MVT::f128,  Expand);
    setOperationAction(ISD::BR_CC            , MVT::i8,    Expand);
    setOperationAction(ISD::BR_CC            , MVT::i16,   Expand);
    setOperationAction(ISD::BR_CC            , MVT::i32,   Expand);
@@ -303,6 +304,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
    setOperationAction(ISD::SELECT_CC        , MVT::f32,   Expand);
    setOperationAction(ISD::SELECT_CC        , MVT::f64,   Expand);
    setOperationAction(ISD::SELECT_CC        , MVT::f80,   Expand);
+  setOperationAction(ISD::SELECT_CC        , MVT::f128,  Expand);
    setOperationAction(ISD::SELECT_CC        , MVT::i8,    Expand);
    setOperationAction(ISD::SELECT_CC        , MVT::i16,   Expand);
    setOperationAction(ISD::SELECT_CC        , MVT::i32,   Expand);
@@ -415,12 +417,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
    setOperationAction(ISD::SELECT          , MVT::f32  , Custom);
    setOperationAction(ISD::SELECT          , MVT::f64  , Custom);
    setOperationAction(ISD::SELECT          , MVT::f80  , Custom);
+  setOperationAction(ISD::SELECT          , MVT::f128 , Custom);
    setOperationAction(ISD::SETCC           , MVT::i8   , Custom);
    setOperationAction(ISD::SETCC           , MVT::i16  , Custom);
    setOperationAction(ISD::SETCC           , MVT::i32  , Custom);
    setOperationAction(ISD::SETCC           , MVT::f32  , Custom);
    setOperationAction(ISD::SETCC           , MVT::f64  , Custom);
    setOperationAction(ISD::SETCC           , MVT::f80  , Custom);
+  setOperationAction(ISD::SETCC           , MVT::f128 , Custom);
    setOperationAction(ISD::SETCCE          , MVT::i8   , Custom);
    setOperationAction(ISD::SETCCE          , MVT::i16  , Custom);
    setOperationAction(ISD::SETCCE          , MVT::i32  , Custom);
@@ -619,8 +623,16 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
    setOperationAction(ISD::FMA, MVT::f64, Expand);
    setOperationAction(ISD::FMA, MVT::f32, Expand);
  
-  // Long double always uses X87.
+  // Long double always uses X87, except f128 on 64-bit targets with MMX.
    if (!Subtarget->useSoftFloat()) {
+    if (Subtarget->is64Bit() && Subtarget->hasMMX()) {
+      addRegisterClass(MVT::f128, &X86::FR128RegClass);
+      ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat);
+      setOperationAction(ISD::FABS , MVT::f128, Custom);
+      setOperationAction(ISD::FNEG , MVT::f128, Custom);
+      setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
+    }
+
      addRegisterClass(MVT::f80, &X86::RFP80RegClass);
      setOperationAction(ISD::UNDEF,     MVT::f80, Expand);
      setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
@@ -1148,7 +1160,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
      setOperationAction(ISD::CTTZ_ZERO_UNDEF,   MVT::v8i32, Custom);
      setOperationAction(ISD::CTTZ_ZERO_UNDEF,   MVT::v4i64, Custom);
  
-    if (Subtarget->hasFMA() || Subtarget->hasFMA4() || Subtarget->hasAVX512()) {
+    if (Subtarget->hasAnyFMA()) {
        setOperationAction(ISD::FMA,             MVT::v8f32, Legal);
        setOperationAction(ISD::FMA,             MVT::v4f64, Legal);
        setOperationAction(ISD::FMA,             MVT::v4f32, Legal);
@@ -1340,6 +1352,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
      setOperationAction(ISD::FDIV,               MVT::v16f32, Legal);
      setOperationAction(ISD::FSQRT,              MVT::v16f32, Legal);
      setOperationAction(ISD::FNEG,               MVT::v16f32, Custom);
+    setOperationAction(ISD::FABS,               MVT::v16f32, Custom);
  
      setOperationAction(ISD::FADD,               MVT::v8f64, Legal);
      setOperationAction(ISD::FSUB,               MVT::v8f64, Legal);
@@ -1347,6 +1360,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
      setOperationAction(ISD::FDIV,               MVT::v8f64, Legal);
      setOperationAction(ISD::FSQRT,              MVT::v8f64, Legal);
      setOperationAction(ISD::FNEG,               MVT::v8f64, Custom);
+    setOperationAction(ISD::FABS,               MVT::v8f64, Custom);
      setOperationAction(ISD::FMA,                MVT::v8f64, Legal);
      setOperationAction(ISD::FMA,                MVT::v16f32, Legal);
  
@@ -1384,6 +1398,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
        setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
        setTruncStoreAction(MVT::v4i32, MVT::v4i8,  Legal);
        setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
+    } else {
+      setOperationAction(ISD::MLOAD,    MVT::v8i32, Custom);
+      setOperationAction(ISD::MLOAD,    MVT::v8f32, Custom);
+      setOperationAction(ISD::MSTORE,   MVT::v8i32, Custom);
+      setOperationAction(ISD::MSTORE,   MVT::v8f32, Custom);
      }
      setOperationAction(ISD::TRUNCATE,           MVT::i1, Custom);
      setOperationAction(ISD::TRUNCATE,           MVT::v16i8, Custom);
@@ -1459,6 +1478,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
  
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1,  Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom);
+    setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v16i1, Custom);
      setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i1, Custom);
      setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i1, Custom);
      setOperationAction(ISD::BUILD_VECTOR,       MVT::v8i1, Custom);
@@ -1710,8 +1730,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
    setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
    setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
    setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
-  if (!Subtarget->is64Bit())
+  if (!Subtarget->is64Bit()) {
      setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
+    setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
+  }
  
    // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
    // handle type legalization for these operations here.
@@ -2353,7 +2375,7 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
      EVT CopyVT = VA.getLocVT();
  
      // If this is x86-64, and we disabled SSE, we can't return FP values
-    if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
+    if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) &&
          ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
        report_fatal_error("SSE register return with SSE disabled");
      }
@@ -2637,6 +2659,8 @@ SDValue X86TargetLowering::LowerFormalArguments(
          RC = &X86::FR32RegClass;
        else if (RegVT == MVT::f64)
          RC = &X86::FR64RegClass;
+      else if (RegVT == MVT::f128)
+        RC = &X86::FR128RegClass;
        else if (RegVT.is512BitVector())
          RC = &X86::VR512RegClass;
        else if (RegVT.is256BitVector())
@@ -4264,19 +4288,9 @@ unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
    return getInsertVINSERTImmediate(N, 256);
  }
  
-/// Returns true if V is a constant integer zero.
-static bool isZero(SDValue V) {
-  ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
-  return C && C->isNullValue();
-}
-
  /// Returns true if Elt is a constant zero or a floating point constant +0.0.
  bool X86::isZeroNode(SDValue Elt) {
-  if (isZero(Elt))
-    return true;
-  if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Elt))
-    return CFP->getValueAPF().isPosZero();
-  return false;
+  return isNullConstant(Elt) || isNullFPConstant(Elt);
  }
  
  // Build a vector of constants
@@ -4314,7 +4328,7 @@ static SDValue getConstVector(ArrayRef<int> Values, MVT VT,
  }
  
  /// Returns a vector of specified type with all zero elements.
-static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
+static SDValue getZeroVector(MVT VT, const X86Subtarget *Subtarget,
                               SelectionDAG &DAG, SDLoc dl) {
    assert(VT.isVector() && "Expected a vector type");
  
@@ -6244,7 +6258,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
  
    unsigned NumZero  = 0;
    unsigned NumNonZero = 0;
-  unsigned NonZeros = 0;
+  uint64_t NonZeros = 0;
    bool IsAllConstants = true;
    SmallSet<SDValue, 8> Values;
    for (unsigned i = 0; i < NumElems; ++i) {
@@ -6258,7 +6272,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
      if (X86::isZeroNode(Elt))
        NumZero++;
      else {
-      NonZeros |= (1 << i);
+      assert(i < sizeof(NonZeros) * 8); // Make sure the shift is within range.
+      NonZeros |= ((uint64_t)1 << i);
        NumNonZero++;
      }
    }
@@ -6440,7 +6455,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
    SmallVector<SDValue, 8> V(NumElems);
    if (NumElems == 4 && NumZero > 0) {
      for (unsigned i = 0; i < 4; ++i) {
-      bool isZero = !(NonZeros & (1 << i));
+      bool isZero = !(NonZeros & (1ULL << i));
        if (isZero)
          V[i] = getZeroVector(VT, Subtarget, DAG, dl);
        else
@@ -8032,6 +8047,7 @@ static SDValue lowerVectorShuffleAsTruncBroadcast(SDLoc DL, MVT VT, SDValue V0,
  /// For convenience, this code also bundles all of the subtarget feature set
  /// filtering. While a little annoying to re-dispatch on type here, there isn't
  /// a convenient way to factor it out.
+/// FIXME: This is very similar to LowerVectorBroadcast - can we merge them?
  static SDValue lowerVectorShuffleAsBroadcast(SDLoc DL, MVT VT, SDValue V,
                                               ArrayRef<int> Mask,
                                               const X86Subtarget *Subtarget,
@@ -8104,6 +8120,20 @@ static SDValue lowerVectorShuffleAsBroadcast(SDLoc DL, MVT VT, SDValue V,
      // Only AVX2 has register broadcasts.
      if (!Subtarget->hasAVX2() && !isShuffleFoldableLoad(V))
        return SDValue();
+  } else if (MayFoldLoad(V) && !cast<LoadSDNode>(V)->isVolatile()) {
+    // If we are broadcasting a load that is only used by the shuffle
+    // then we can reduce the vector load to the broadcasted scalar load.
+    LoadSDNode *Ld = cast<LoadSDNode>(V);
+    SDValue BaseAddr = Ld->getOperand(1);
+    EVT AddrVT = BaseAddr.getValueType();
+    EVT SVT = VT.getScalarType();
+    unsigned Offset = BroadcastIdx * SVT.getStoreSize();
+    SDValue NewAddr = DAG.getNode(
+        ISD::ADD, DL, AddrVT, BaseAddr,
+        DAG.getConstant(Offset, DL, AddrVT));
+    V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
+                    DAG.getMachineFunction().getMachineMemOperand(
+                        Ld->getMemOperand(), Offset, SVT.getStoreSize()));
    } else if (BroadcastIdx != 0 || !Subtarget->hasAVX2()) {
      // We can't broadcast from a vector register without AVX2, and we can only
      // broadcast from the zero-element of a vector register.
@@ -10548,9 +10578,7 @@ static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
      if (Subtarget->hasAVX2())
        return DAG.getNode(
            X86ISD::VPERMV, DL, MVT::v8f32,
-          DAG.getBitcast(MVT::v8f32, DAG.getNode(ISD::BUILD_VECTOR, DL,
-                                                 MVT::v8i32, VPermMask)),
-          V1);
+          DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask), V1);
  
      // Otherwise, fall back.
      return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
@@ -11309,9 +11337,9 @@ static bool BUILD_VECTORtoBlendMask(BuildVectorSDNode *BuildVector,
  
      int Lane1Cond = -1, Lane2Cond = -1;
      if (isa<ConstantSDNode>(EltCond))
-      Lane1Cond = !isZero(EltCond);
+      Lane1Cond = !isNullConstant(EltCond);
      if (isa<ConstantSDNode>(SndLaneEltCond))
-      Lane2Cond = !isZero(SndLaneEltCond);
+      Lane2Cond = !isNullConstant(SndLaneEltCond);
  
      unsigned LaneMask = 0;
      if (Lane1Cond == Lane2Cond || Lane2Cond < 0)
@@ -11352,7 +11380,8 @@ static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
    for (int i = 0, Size = VT.getVectorNumElements(); i < Size; ++i) {
      SDValue CondElt = CondBV->getOperand(i);
      Mask.push_back(
-        isa<ConstantSDNode>(CondElt) ? i + (isZero(CondElt) ? Size : 0) : -1);
+        isa<ConstantSDNode>(CondElt) ? i + (isNullConstant(CondElt) ? Size : 0)
+                                     : -1);
    }
    return DAG.getVectorShuffle(VT, dl, LHS, RHS, Mask);
  }
@@ -11417,9 +11446,8 @@ static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
    }
  
    if (VT.getSizeInBits() == 16) {
-    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
      // If Idx is 0, it's cheaper to do a move instead of a pextrw.
-    if (Idx == 0)
+    if (isNullConstant(Op.getOperand(1)))
        return DAG.getNode(
            ISD::TRUNCATE, dl, MVT::i16,
            DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
@@ -11442,8 +11470,7 @@ static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
        return SDValue();
      SDNode *User = *Op.getNode()->use_begin();
      if ((User->getOpcode() != ISD::STORE ||
-         (isa<ConstantSDNode>(Op.getOperand(1)) &&
-          cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) &&
+         isNullConstant(Op.getOperand(1))) &&
          (User->getOpcode() != ISD::BITCAST ||
           User->getValueType(0) != MVT::i32))
        return SDValue();
@@ -11560,8 +11587,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
    // TODO: handle v16i8.
    if (VT.getSizeInBits() == 16) {
      SDValue Vec = Op.getOperand(0);
-    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
-    if (Idx == 0)
+    if (isNullConstant(Op.getOperand(1)))
        return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
                           DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
                                       DAG.getBitcast(MVT::v4i32, Vec),
@@ -11593,8 +11619,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
      // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
      // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
      //        to match extract_elt for f64.
-    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
-    if (Idx == 0)
+    if (isNullConstant(Op.getOperand(1)))
        return Op;
  
      // UNPCKHPD the element to the lowest double word, then movsd.
@@ -13399,6 +13424,8 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
    SDLoc dl(Op);
    MVT VT = Op.getSimpleValueType();
  
+  bool IsF128 = (VT == MVT::f128);
+
    // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
    // decide if we should generate a 16-byte constant mask when we only need 4 or
    // 8 bytes for the scalar case.
@@ -13411,6 +13438,11 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
      LogicVT = VT;
      EltVT = VT.getVectorElementType();
      NumElts = VT.getVectorNumElements();
+  } else if (IsF128) {
+    // SSE instructions are used for optimized f128 logical operations.
+    LogicVT = MVT::f128;
+    EltVT = VT;
+    NumElts = 1;
    } else {
      // There are no scalar bitwise logical SSE/AVX instructions, so we
      // generate a 16-byte vector constant and logic op even for the scalar case.
@@ -13442,7 +13474,7 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
      IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
    SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
  
-  if (VT.isVector())
+  if (VT.isVector() || IsF128)
      return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
  
    // For the scalar case extend to a 128-bit vector, perform the logic op,
@@ -13461,6 +13493,7 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
    SDLoc dl(Op);
    MVT VT = Op.getSimpleValueType();
    MVT SrcVT = Op1.getSimpleValueType();
+  bool IsF128 = (VT == MVT::f128);
  
    // If second operand is smaller, extend it first.
    if (SrcVT.bitsLT(VT)) {
@@ -13475,13 +13508,16 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
  
    // At this point the operands and the result should have the same
    // type, and that won't be f80 since that is not custom lowered.
+  assert((VT == MVT::f64 || VT == MVT::f32 || IsF128) &&
+         "Unexpected type in LowerFCOPYSIGN");
  
    const fltSemantics &Sem =
-      VT == MVT::f64 ? APFloat::IEEEdouble : APFloat::IEEEsingle;
+      VT == MVT::f64 ? APFloat::IEEEdouble :
+          (IsF128 ? APFloat::IEEEquad : APFloat::IEEEsingle);
    const unsigned SizeInBits = VT.getSizeInBits();
  
    SmallVector<Constant *, 4> CV(
-      VT == MVT::f64 ? 2 : 4,
+      VT == MVT::f64 ? 2 : (IsF128 ? 1 : 4),
        ConstantFP::get(*Context, APFloat(Sem, APInt(SizeInBits, 0))));
  
    // First, clear all bits but the sign bit from the second operand (sign).
@@ -13494,12 +13530,13 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
    // Perform all logic operations as 16-byte vectors because there are no
    // scalar FP logic instructions in SSE. This allows load folding of the
    // constants into the logic instructions.
-  MVT LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
+  MVT LogicVT = (VT == MVT::f64) ? MVT::v2f64 : (IsF128 ? MVT::f128 : MVT::v4f32);
    SDValue Mask1 =
        DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx,
                    MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
                    false, false, false, 16);
-  Op1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op1);
+  if (!IsF128)
+    Op1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op1);
    SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Op1, Mask1);
  
    // Next, clear the sign bit from the first operand (magnitude).
@@ -13508,8 +13545,9 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
      APFloat APF = Op0CN->getValueAPF();
      // If the magnitude is a positive zero, the sign bit alone is enough.
      if (APF.isPosZero())
-      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, SignBit,
-                         DAG.getIntPtrConstant(0, dl));
+      return IsF128 ? SignBit :
+          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, SignBit,
+                      DAG.getIntPtrConstant(0, dl));
      APF.clearSign();
      CV[0] = ConstantFP::get(*Context, APF);
    } else {
@@ -13525,13 +13563,15 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
                    false, false, false, 16);
    // If the magnitude operand wasn't a constant, we need to AND out the sign.
    if (!isa<ConstantFPSDNode>(Op0)) {
-    Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op0);
+    if (!IsF128)
+      Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op0);
      Val = DAG.getNode(X86ISD::FAND, dl, LogicVT, Op0, Val);
    }
    // OR the magnitude value with the sign bit.
    Val = DAG.getNode(X86ISD::FOR, dl, LogicVT, Val, SignBit);
-  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, Val,
-                     DAG.getIntPtrConstant(0, dl));
+  return IsF128 ? Val :
+      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, Val,
+                  DAG.getIntPtrConstant(0, dl));
  }
  
  static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
@@ -13758,14 +13798,14 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl,
      if (ConstantSDNode *C =
          dyn_cast<ConstantSDNode>(ArithOp.getNode()->getOperand(1))) {
        // An add of one will be selected as an INC.
-      if (C->getAPIntValue() == 1 && !Subtarget->slowIncDec()) {
+      if (C->isOne() && !Subtarget->slowIncDec()) {
          Opcode = X86ISD::INC;
          NumOperands = 1;
          break;
        }
  
        // An add of negative one (subtract of one) will be selected as a DEC.
-      if (C->getAPIntValue().isAllOnesValue() && !Subtarget->slowIncDec()) {
+      if (C->isAllOnesValue() && !Subtarget->slowIncDec()) {
          Opcode = X86ISD::DEC;
          NumOperands = 1;
          break;
@@ -13894,13 +13934,11 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl,
  /// equivalent.
  SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
                                     SDLoc dl, SelectionDAG &DAG) const {
-  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1)) {
-    if (C->getAPIntValue() == 0)
-      return EmitTest(Op0, X86CC, dl, DAG);
+  if (isNullConstant(Op1))
+    return EmitTest(Op0, X86CC, dl, DAG);
  
-     assert(Op0.getValueType() != MVT::i1 &&
-            "Unexpected comparison operation for MVT::i1 operands");
-  }
+  assert(!(isa<ConstantSDNode>(Op1) && Op0.getValueType() == MVT::i1) &&
+         "Unexpected comparison operation for MVT::i1 operands");
  
    if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
         Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
@@ -13946,6 +13984,9 @@ SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
    SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
                              DAG.getConstant(8, dl, MVT::i8));
    SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
+
+  // Some 64-bit targets lack SAHF support, but they do support FCOMI.
+  assert(Subtarget->hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?");
    return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
  }
  
@@ -14023,11 +14064,6 @@ unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
    return 2;
  }
  
-static bool isAllOnes(SDValue V) {
-  ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
-  return C && C->isAllOnesValue();
-}
-
  /// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node
  /// if it's possible.
  SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
@@ -14043,8 +14079,7 @@ SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
    if (Op1.getOpcode() == ISD::SHL)
      std::swap(Op0, Op1);
    if (Op0.getOpcode() == ISD::SHL) {
-    if (ConstantSDNode *And00C = dyn_cast<ConstantSDNode>(Op0.getOperand(0)))
-      if (And00C->getZExtValue() == 1) {
+    if (isOneConstant(Op0.getOperand(0))) {
          // If we looked past a truncate, check that it's only truncating away
          // known zeros.
          unsigned BitWidth = Op0.getValueSizeInBits();
@@ -14603,8 +14638,7 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
    // Lower ((X >>u N) & 1) != 0 to BT(X, N).
    // Lower ((X >>s N) & 1) != 0 to BT(X, N).
    if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() &&
-      Op1.getOpcode() == ISD::Constant &&
-      cast<ConstantSDNode>(Op1)->isNullValue() &&
+      isNullConstant(Op1) &&
        (CC == ISD::SETEQ || CC == ISD::SETNE)) {
      if (SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG)) {
        if (VT == MVT::i1)
@@ -14615,17 +14649,14 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
  
    // Look for X == 0, X == 1, X != 0, or X != 1.  We can simplify some forms of
    // these.
-  if (Op1.getOpcode() == ISD::Constant &&
-      (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 ||
-       cast<ConstantSDNode>(Op1)->isNullValue()) &&
+  if ((isOneConstant(Op1) || isNullConstant(Op1)) &&
        (CC == ISD::SETEQ || CC == ISD::SETNE)) {
  
      // If the input is a setcc, then reuse the input setcc or use a new one with
      // the inverted condition.
      if (Op0.getOpcode() == X86ISD::SETCC) {
        X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
-      bool Invert = (CC == ISD::SETNE) ^
-        cast<ConstantSDNode>(Op1)->isNullValue();
+      bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
        if (!Invert)
          return Op0;
  
@@ -14638,8 +14669,7 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
        return SetCC;
      }
    }
-  if ((Op0.getValueType() == MVT::i1) && (Op1.getOpcode() == ISD::Constant) &&
-      (cast<ConstantSDNode>(Op1)->getZExtValue() == 1) &&
+  if ((Op0.getValueType() == MVT::i1) && isOneConstant(Op1) &&
        (CC == ISD::SETEQ || CC == ISD::SETNE)) {
  
      ISD::CondCode NewCC = ISD::getSetCCInverse(CC, true);
@@ -14827,22 +14857,21 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
    // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
    if (Cond.getOpcode() == X86ISD::SETCC &&
        Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
-      isZero(Cond.getOperand(1).getOperand(1))) {
+      isNullConstant(Cond.getOperand(1).getOperand(1))) {
      SDValue Cmp = Cond.getOperand(1);
  
      unsigned CondCode =cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
  
-    if ((isAllOnes(Op1) || isAllOnes(Op2)) &&
+    if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
          (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
-      SDValue Y = isAllOnes(Op2) ? Op1 : Op2;
+      SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
  
        SDValue CmpOp0 = Cmp.getOperand(0);
        // Apply further optimizations for special cases
        // (select (x != 0), -1, 0) -> neg & sbb
        // (select (x == 0), 0, -1) -> neg & sbb
-      if (ConstantSDNode *YC = dyn_cast<ConstantSDNode>(Y))
-        if (YC->isNullValue() &&
-            (isAllOnes(Op1) == (CondCode == X86::COND_NE))) {
+      if (isNullConstant(Y) &&
+            (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
            SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
            SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs,
                                      DAG.getConstant(0, DL,
@@ -14862,11 +14891,10 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
          DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
                      DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp);
  
-      if (isAllOnes(Op1) != (CondCode == X86::COND_E))
+      if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E))
          Res = DAG.getNOT(DL, Res, Res.getValueType());
  
-      ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2);
-      if (!N2C || !N2C->isNullValue())
+      if (!isNullConstant(Op2))
          Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
        return Res;
      }
@@ -14874,11 +14902,9 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
  
    // Look past (and (setcc_carry (cmp ...)), 1).
    if (Cond.getOpcode() == ISD::AND &&
-      Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
-    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
-    if (C && C->getAPIntValue() == 1)
-      Cond = Cond.getOperand(0);
-  }
+      Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
+      isOneConstant(Cond.getOperand(1)))
+    Cond = Cond.getOperand(0);
  
    // If condition flag is set by a X86ISD::CMP, then use it as the condition
    // setting operand in place of the X86ISD::SETCC.
@@ -14966,11 +14992,12 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
      unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
  
      if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
-        (isAllOnes(Op1) || isAllOnes(Op2)) && (isZero(Op1) || isZero(Op2))) {
+        (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
+        (isNullConstant(Op1) || isNullConstant(Op2))) {
        SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
                                  DAG.getConstant(X86::COND_B, DL, MVT::i8),
                                  Cond);
-      if (isAllOnes(Op1) != (CondCode == X86::COND_B))
+      if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
          return DAG.getNOT(DL, Res, Res.getValueType());
        return Res;
      }
@@ -15360,11 +15387,9 @@ static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
  static bool isXor1OfSetCC(SDValue Op) {
    if (Op.getOpcode() != ISD::XOR)
      return false;
-  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
-  if (N1C && N1C->getAPIntValue() == 1) {
+  if (isOneConstant(Op.getOperand(1)))
      return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
-      Op.getOperand(0).hasOneUse();
-  }
+           Op.getOperand(0).hasOneUse();
    return false;
  }
  
@@ -15380,8 +15405,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
    if (Cond.getOpcode() == ISD::SETCC) {
      // Check for setcc([su]{add,sub,mul}o == 0).
      if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
-        isa<ConstantSDNode>(Cond.getOperand(1)) &&
-        cast<ConstantSDNode>(Cond.getOperand(1))->isNullValue() &&
+        isNullConstant(Cond.getOperand(1)) &&
          Cond.getOperand(0).getResNo() == 1 &&
          (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
           Cond.getOperand(0).getOpcode() == ISD::UADDO ||
@@ -15408,11 +15432,9 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
  
    // Look past (and (setcc_carry (cmp ...)), 1).
    if (Cond.getOpcode() == ISD::AND &&
-      Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
-    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
-    if (C && C->getAPIntValue() == 1)
-      Cond = Cond.getOperand(0);
-  }
+      Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
+      isOneConstant(Cond.getOperand(1)))
+    Cond = Cond.getOperand(0);
  
    // If condition flag is set by a X86ISD::CMP, then use it as the condition
    // setting operand in place of the X86ISD::SETCC.
@@ -15456,16 +15478,14 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
      switch (CondOpcode) {
      case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
      case ISD::SADDO:
-      if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
-        if (C->isOne()) {
+      if (isOneConstant(RHS)) {
            X86Opcode = X86ISD::INC; X86Cond = X86::COND_O;
            break;
          }
        X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
      case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
      case ISD::SSUBO:
-      if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
-        if (C->isOne()) {
+      if (isOneConstant(RHS)) {
            X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O;
            break;
          }
@@ -15659,54 +15679,40 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
                 SplitStack;
    SDLoc dl(Op);
  
+  // Get the inputs.
+  SDNode *Node = Op.getNode();
+  SDValue Chain = Op.getOperand(0);
+  SDValue Size  = Op.getOperand(1);
+  unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
+  EVT VT = Node->getValueType(0);
+
+  // Chain the dynamic stack allocation so that it doesn't modify the stack
+  // pointer when other instructions are using the stack.
+  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, dl, true), dl);
+
+  bool Is64Bit = Subtarget->is64Bit();
+  MVT SPTy = getPointerTy(DAG.getDataLayout());
+
+  SDValue Result;
    if (!Lower) {
      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-    SDNode* Node = Op.getNode();
-
      unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
      assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
-        " not tell us which reg is the stack pointer!");
+                    " not tell us which reg is the stack pointer!");
      EVT VT = Node->getValueType(0);
-    SDValue Tmp1 = SDValue(Node, 0);
-    SDValue Tmp2 = SDValue(Node, 1);
      SDValue Tmp3 = Node->getOperand(2);
-    SDValue Chain = Tmp1.getOperand(0);
-
-    // Chain the dynamic stack allocation so that it doesn't modify the stack
-    // pointer when other instructions are using the stack.
-    Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, dl, true),
-        SDLoc(Node));
  
-    SDValue Size = Tmp2.getOperand(1);
      SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
      Chain = SP.getValue(1);
      unsigned Align = cast<ConstantSDNode>(Tmp3)->getZExtValue();
      const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
      unsigned StackAlign = TFI.getStackAlignment();
-    Tmp1 = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
+    Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
      if (Align > StackAlign)
-      Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1,
-          DAG.getConstant(-(uint64_t)Align, dl, VT));
-    Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain
-
-    Tmp2 = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
-        DAG.getIntPtrConstant(0, dl, true), SDValue(),
-        SDLoc(Node));
-
-    SDValue Ops[2] = { Tmp1, Tmp2 };
-    return DAG.getMergeValues(Ops, dl);
-  }
-
-  // Get the inputs.
-  SDValue Chain = Op.getOperand(0);
-  SDValue Size  = Op.getOperand(1);
-  unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
-  EVT VT = Op.getNode()->getValueType(0);
-
-  bool Is64Bit = Subtarget->is64Bit();
-  MVT SPTy = getPointerTy(DAG.getDataLayout());
-
-  if (SplitStack) {
+      Result = DAG.getNode(ISD::AND, dl, VT, Result,
+                         DAG.getConstant(-(uint64_t)Align, dl, VT));
+    Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
+  } else if (SplitStack) {
      MachineRegisterInfo &MRI = MF.getRegInfo();
  
      if (Is64Bit) {
@@ -15724,10 +15730,8 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
      const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
      unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
      Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
-    SDValue Value = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
+    Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
                                  DAG.getRegister(Vreg, SPTy));
-    SDValue Ops1[2] = { Value, Chain };
-    return DAG.getMergeValues(Ops1, dl);
    } else {
      SDValue Flag;
      const unsigned Reg = (Subtarget->isTarget64BitLP64() ? X86::RAX : X86::EAX);
@@ -15749,9 +15753,14 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
        Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
      }
  
-    SDValue Ops1[2] = { SP, Chain };
-    return DAG.getMergeValues(Ops1, dl);
+    Result = SP;
    }
+
+  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
+                             DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
+
+  SDValue Ops[2] = {Result, Chain};
+  return DAG.getMergeValues(Ops, dl);
  }
  
  SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
@@ -16028,57 +16037,90 @@ static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT,
    return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
  }
  
+/// \brief Return \p Mask cast or extended as needed to match \p MaskVT
+/// when lowering masking intrinsics.
+static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
+                           const X86Subtarget *Subtarget,
+                           SelectionDAG &DAG, SDLoc dl) {
+
+  if (MaskVT.bitsGT(Mask.getSimpleValueType())) {
+    // Mask should be extended
+    Mask = DAG.getNode(ISD::ANY_EXTEND, dl,
+                       MVT::getIntegerVT(MaskVT.getSizeInBits()), Mask);
+  }
+
+  if (Mask.getSimpleValueType() == MVT::i64 && Subtarget->is32Bit()) {
+    if (MaskVT == MVT::v64i1) {
+      assert(Subtarget->hasBWI() && "Expected AVX512BW target!");
+      // In 32-bit mode, bitcasting an i64 is illegal, so extend/split it.
+      SDValue Lo, Hi;
+      Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
+                          DAG.getConstant(0, dl, MVT::i32));
+      Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
+                          DAG.getConstant(1, dl, MVT::i32));
+
+      Lo = DAG.getBitcast(MVT::v32i1, Lo);
+      Hi = DAG.getBitcast(MVT::v32i1, Hi);
+
+      return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
+    } else {
+      // MaskVT requires fewer than 64 bits. Truncate the mask (should
+      // succeed in any case), and bitcast.
+      MVT TruncVT = MVT::getIntegerVT(MaskVT.getSizeInBits());
+      return DAG.getBitcast(MaskVT,
+                            DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Mask));
+    }
+
+  } else {
+    MVT BitcastVT = MVT::getVectorVT(MVT::i1,
+                                     Mask.getSimpleValueType().getSizeInBits());
+    // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements
+    // are extracted by EXTRACT_SUBVECTOR.
+    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
+                       DAG.getBitcast(BitcastVT, Mask),
+                       DAG.getIntPtrConstant(0, dl));
+  }
+}
+
  /// \brief Return (and \p Op, \p Mask) for compare instructions or
  /// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
  /// necessary casting or extending for \p Mask when lowering masking intrinsics
  static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
-                                    SDValue PreservedSrc,
-                                    const X86Subtarget *Subtarget,
-                                    SelectionDAG &DAG) {
-    MVT VT = Op.getSimpleValueType();
-    MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
-    SDValue VMask;
-    unsigned OpcodeSelect = ISD::VSELECT;
-    SDLoc dl(Op);
+                  SDValue PreservedSrc,
+                  const X86Subtarget *Subtarget,
+                  SelectionDAG &DAG) {
+  MVT VT = Op.getSimpleValueType();
+  MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
+  unsigned OpcodeSelect = ISD::VSELECT;
+  SDLoc dl(Op);
  
-    if (isAllOnes(Mask))
-      return Op;
+  if (isAllOnesConstant(Mask))
+    return Op;
  
-    if (MaskVT.bitsGT(Mask.getSimpleValueType())) {
-      MVT newMaskVT = MVT::getIntegerVT(MaskVT.getSizeInBits());
-      VMask = DAG.getBitcast(MaskVT,
-                             DAG.getNode(ISD::ANY_EXTEND, dl, newMaskVT, Mask));
-    } else {
-      MVT BitcastVT = MVT::getVectorVT(MVT::i1,
-                                       Mask.getSimpleValueType().getSizeInBits());
-      // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements
-      // are extracted by EXTRACT_SUBVECTOR.
-      VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
-                          DAG.getBitcast(BitcastVT, Mask),
-                          DAG.getIntPtrConstant(0, dl));
-    }
+  SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
  
-    switch (Op.getOpcode()) {
-    default: break;
-    case X86ISD::PCMPEQM:
-    case X86ISD::PCMPGTM:
-    case X86ISD::CMPM:
-    case X86ISD::CMPMU:
-      return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
-    case X86ISD::VFPCLASS:
-      return DAG.getNode(ISD::OR, dl, VT, Op, VMask);
-    case X86ISD::VTRUNC:
-    case X86ISD::VTRUNCS:
-    case X86ISD::VTRUNCUS:
-      // We can't use ISD::VSELECT here because it is not always "Legal"
-      // for the destination type. For example vpmovqb require only AVX512
-      // and vselect that can operate on byte element type require BWI
-      OpcodeSelect = X86ISD::SELECT;
-      break;
-    }
-    if (PreservedSrc.getOpcode() == ISD::UNDEF)
-      PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
-    return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
+  switch (Op.getOpcode()) {
+  default: break;
+  case X86ISD::PCMPEQM:
+  case X86ISD::PCMPGTM:
+  case X86ISD::CMPM:
+  case X86ISD::CMPMU:
+    return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
+  case X86ISD::VFPCLASS:
+    case X86ISD::VFPCLASSS:
+    return DAG.getNode(ISD::OR, dl, VT, Op, VMask);
+  case X86ISD::VTRUNC:
+  case X86ISD::VTRUNCS:
+  case X86ISD::VTRUNCUS:
+    // We can't use ISD::VSELECT here because it is not always "Legal"
+    // for the destination type. For example vpmovqb require only AVX512
+    // and vselect that can operate on byte element type require BWI
+    OpcodeSelect = X86ISD::SELECT;
+    break;
+  }
+  if (PreservedSrc.getOpcode() == ISD::UNDEF)
+    PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
+  return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
  }
  
  /// \brief Creates an SDNode for a predicated scalar operation.
@@ -16092,7 +16134,7 @@ static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
                                      SDValue PreservedSrc,
                                      const X86Subtarget *Subtarget,
                                      SelectionDAG &DAG) {
-  if (isAllOnes(Mask))
+  if (isAllOnesConstant(Mask))
      return Op;
  
    MVT VT = Op.getSimpleValueType();
@@ -16102,7 +16144,8 @@ static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
  
    if (Op.getOpcode() == X86ISD::FSETCC)
      return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
-  if (Op.getOpcode() == X86ISD::VFPCLASS)
+  if (Op.getOpcode() == X86ISD::VFPCLASS ||
+      Op.getOpcode() == X86ISD::VFPCLASSS)
      return DAG.getNode(ISD::OR, dl, VT, Op, IMask);
  
    if (PreservedSrc.getOpcode() == ISD::UNDEF)
@@ -16376,7 +16419,27 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
                                    Mask, PassThru, Subtarget, DAG);
      }
      case VPERM_3OP_MASKZ:
-    case VPERM_3OP_MASK:
+    case VPERM_3OP_MASK:{
+      // Src2 is the PassThru
+      SDValue Src1 = Op.getOperand(1);
+      SDValue Src2 = Op.getOperand(2);
+      SDValue Src3 = Op.getOperand(3);
+      SDValue Mask = Op.getOperand(4);
+      MVT VT = Op.getSimpleValueType();
+      SDValue PassThru = SDValue();
+
+      // set PassThru element
+      if (IntrData->Type == VPERM_3OP_MASKZ)
+        PassThru = getZeroVector(VT, Subtarget, DAG, dl);
+      else
+        PassThru = DAG.getBitcast(VT, Src2);
+
+      // Swap Src1 and Src2 in the node creation
+      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
+                                              dl, Op.getValueType(),
+                                              Src2, Src1, Src3),
+                                  Mask, PassThru, Subtarget, DAG);
+    }
      case FMA_OP_MASK3:
      case FMA_OP_MASKZ:
      case FMA_OP_MASK: {
@@ -16388,7 +16451,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
        SDValue PassThru = SDValue();
  
        // set PassThru element
-      if (IntrData->Type == VPERM_3OP_MASKZ || IntrData->Type == FMA_OP_MASKZ)
+      if (IntrData->Type == FMA_OP_MASKZ)
          PassThru = getZeroVector(VT, Subtarget, DAG, dl);
        else if (IntrData->Type == FMA_OP_MASK3)
          PassThru = Src3;
@@ -16543,6 +16606,24 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
                                    DAG.getConstant(X86CC, dl, MVT::i8), Cond);
        return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
      }
+    case COMI_RM: { // Comparison intrinsics with Sae
+      SDValue LHS = Op.getOperand(1);
+      SDValue RHS = Op.getOperand(2);
+      SDValue CC = Op.getOperand(3);
+      SDValue Sae = Op.getOperand(4);
+      auto ComiType = TranslateX86ConstCondToX86CC(CC);
+      // choose between ordered and unordered (comi/ucomi)
+      unsigned comiOp = std::get<0>(ComiType) ? IntrData->Opc0 : IntrData->Opc1;
+      SDValue Cond;
+      if (cast<ConstantSDNode>(Sae)->getZExtValue() !=
+                                           X86::STATIC_ROUNDING::CUR_DIRECTION)
+        Cond = DAG.getNode(comiOp, dl, MVT::i32, LHS, RHS, Sae);
+      else
+        Cond = DAG.getNode(comiOp, dl, MVT::i32, LHS, RHS);
+      SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
+        DAG.getConstant(std::get<1>(ComiType), dl, MVT::i8), Cond);
+      return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
+    }
      case VSHIFT:
        return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
                                   Op.getOperand(1), Op.getOperand(2), DAG);
@@ -16557,7 +16638,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
        SDValue Mask = Op.getOperand(3);
        SDValue DataToCompress = Op.getOperand(1);
        SDValue PassThru = Op.getOperand(2);
-      if (isAllOnes(Mask)) // return data as is
+      if (isAllOnesConstant(Mask)) // return data as is
          return Op.getOperand(1);
  
        return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
@@ -16574,15 +16655,22 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
        SDValue Mask = Op.getOperand(3);
        MVT VT = Op.getSimpleValueType();
        MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
-      MVT BitcastVT = MVT::getVectorVT(MVT::i1,
-                                       Mask.getSimpleValueType().getSizeInBits());
-      SDLoc dl(Op);
-      SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
-                                  DAG.getBitcast(BitcastVT, Mask),
-                                  DAG.getIntPtrConstant(0, dl));
+      SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
        return DAG.getNode(IntrData->Opc0, dl, VT, VMask, Op.getOperand(1),
                           Op.getOperand(2));
      }
+    case KUNPCK: {
+      MVT VT = Op.getSimpleValueType();
+      MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()/2);
+
+      SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl);
+      SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl);
+      // Arguments should be swapped.
+      SDValue Res = DAG.getNode(IntrData->Opc0, dl,
+                                MVT::getVectorVT(MVT::i1, VT.getSizeInBits()),
+                                Src2, Src1);
+      return DAG.getBitcast(VT, Res);
+    }
      default:
        break;
      }
@@ -16818,7 +16906,7 @@ static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
    SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
    SDValue Segment = DAG.getRegister(0, MVT::i32);
    if (Src.getOpcode() == ISD::UNDEF)
-    Src = getZeroVector(Op.getValueType(), Subtarget, DAG, dl);
+    Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
    SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain};
    SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
    SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
@@ -17077,7 +17165,7 @@ static SDValue LowerINTRINSIC_TRUNCATE_TO_MEM(const SDValue & Op,
    MVT VT  = DataToTruncate.getSimpleValueType();
    MVT SVT = MVT::getVectorVT(ElementType, VT.getVectorNumElements());
  
-  if (isAllOnes(Mask)) // return just a truncate store
+  if (isAllOnesConstant(Mask)) // return just a truncate store
      return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr,
                               MachinePointerInfo(), SVT, false, false,
                               SVT.getScalarSizeInBits()/8);
@@ -17221,7 +17309,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
      SDValue Chain = Op.getOperand(0);
  
      MVT VT = DataToCompress.getSimpleValueType();
-    if (isAllOnes(Mask)) // return just a store
+    if (isAllOnesConstant(Mask)) // return just a store
        return DAG.getStore(Chain, dl, DataToCompress, Addr,
                            MachinePointerInfo(), false, false,
                            VT.getScalarSizeInBits()/8);
@@ -17247,7 +17335,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
      SDValue Chain = Op.getOperand(0);
      MVT VT = Op.getSimpleValueType();
  
-    if (isAllOnes(Mask)) // return just a load
+    if (isAllOnesConstant(Mask)) // return just a load
        return DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(), false, false,
                           false, VT.getScalarSizeInBits()/8);
  
@@ -18971,8 +19059,7 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
    case ISD::SADDO:
      // A subtract of one will be selected as a INC. Note that INC doesn't
      // set CF, so we can't do this for UADDO.
-    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
-      if (C->isOne()) {
+    if (isOneConstant(RHS)) {
          BaseOp = X86ISD::INC;
          Cond = X86::COND_O;
          break;
@@ -18987,8 +19074,7 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
    case ISD::SSUBO:
      // A subtract of one will be selected as a DEC. Note that DEC doesn't
      // set CF, so we can't do this for USUBO.
-    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
-      if (C->isOne()) {
+    if (isOneConstant(RHS)) {
          BaseOp = X86ISD::DEC;
          Cond = X86::COND_O;
          break;
@@ -19651,6 +19737,47 @@ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget *Subtarget,
    return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
  }
  
+/// Widen a vector input to a vector of NVT.  The
+/// input vector must have the same element type as NVT.
+static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
+                            bool FillWithZeroes = false) {
+  // Check if InOp already has the right width.
+  MVT InVT = InOp.getSimpleValueType();
+  if (InVT == NVT)
+    return InOp;
+
+  if (InOp.isUndef())
+    return DAG.getUNDEF(NVT);
+
+  assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
+         "input and widen element type must match");
+
+  unsigned InNumElts = InVT.getVectorNumElements();
+  unsigned WidenNumElts = NVT.getVectorNumElements();
+  assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
+         "Unexpected request for vector widening");
+
+  EVT EltVT = NVT.getVectorElementType();
+
+  SDLoc dl(InOp);
+  if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
+      ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
+    SmallVector<SDValue, 16> Ops;
+    for (unsigned i = 0; i < InNumElts; ++i)
+      Ops.push_back(InOp.getOperand(i));
+
+    SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
+      DAG.getUNDEF(EltVT);
+    for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
+      Ops.push_back(FillVal);
+    return DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, Ops);
+  }
+  SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
+    DAG.getUNDEF(NVT);
+  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
+                     InOp, DAG.getIntPtrConstant(0, dl));
+}
+
  static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget *Subtarget,
                               SelectionDAG &DAG) {
    assert(Subtarget->hasAVX512() &&
@@ -19680,6 +19807,62 @@ static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget *Subtarget,
    return Op;
  }
  
+static SDValue LowerMLOAD(SDValue Op, const X86Subtarget *Subtarget,
+                          SelectionDAG &DAG) {
+
+  MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
+  MVT VT = Op.getSimpleValueType();
+  SDValue Mask = N->getMask();
+  SDLoc dl(Op);
+
+  if (Subtarget->hasAVX512() && !Subtarget->hasVLX() &&
+      !VT.is512BitVector() && Mask.getValueType() == MVT::v8i1) {
+    // This operation is legal for targets with VLX, but without
+    // VLX the vector should be widened to 512 bits.
+    unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
+    MVT WideDataVT = MVT::getVectorVT(VT.getScalarType(), NumEltsInWideVec);
+    MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
+    SDValue Src0 = N->getSrc0();
+    Src0 = ExtendToType(Src0, WideDataVT, DAG);
+    Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
+    SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(),
+                                        N->getBasePtr(), Mask, Src0,
+                                        N->getMemoryVT(), N->getMemOperand(),
+                                        N->getExtensionType());
+
+    SDValue Exract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
+                                 NewLoad.getValue(0),
+                                 DAG.getIntPtrConstant(0, dl));
+    SDValue RetOps[] = {Exract, NewLoad.getValue(1)};
+    return DAG.getMergeValues(RetOps, dl);
+  }
+  return Op;
+}
+
+static SDValue LowerMSTORE(SDValue Op, const X86Subtarget *Subtarget,
+                           SelectionDAG &DAG) {
+  MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
+  SDValue DataToStore = N->getValue();
+  MVT VT = DataToStore.getSimpleValueType();
+  SDValue Mask = N->getMask();
+  SDLoc dl(Op);
+
+  if (Subtarget->hasAVX512() && !Subtarget->hasVLX() &&
+      !VT.is512BitVector() && Mask.getValueType() == MVT::v8i1) {
+    // This operation is legal for targets with VLX, but without
+    // VLX the vector should be widened to 512 bits.
+    unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
+    MVT WideDataVT = MVT::getVectorVT(VT.getScalarType(), NumEltsInWideVec);
+    MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
+    DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
+    Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
+    return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
+                              Mask, N->getMemoryVT(), N->getMemOperand(),
+                              N->isTruncatingStore());
+  }
+  return Op;
+}
+
  static SDValue LowerMGATHER(SDValue Op, const X86Subtarget *Subtarget,
                              SelectionDAG &DAG) {
    assert(Subtarget->hasAVX512() &&
@@ -19839,6 +20022,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
    case ISD::UMAX:
    case ISD::UMIN:               return LowerMINMAX(Op, DAG);
    case ISD::FSINCOS:            return LowerFSINCOS(Op, Subtarget, DAG);
+  case ISD::MLOAD:              return LowerMLOAD(Op, Subtarget, DAG);
+  case ISD::MSTORE:             return LowerMSTORE(Op, Subtarget, DAG);
    case ISD::MGATHER:            return LowerMGATHER(Op, Subtarget, DAG);
    case ISD::MSCATTER:           return LowerMSCATTER(Op, Subtarget, DAG);
    case ISD::GC_TRANSITION_START:
@@ -19985,6 +20170,11 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
        return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results);
      }
    }
+  case ISD::INTRINSIC_WO_CHAIN: {
+    if (SDValue V = LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), Subtarget, DAG))
+      Results.push_back(V);
+    return;
+  }
    case ISD::READCYCLECOUNTER: {
      return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
                                     Results);
@@ -20325,6 +20515,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
    case X86ISD::FP_TO_SINT_RND:     return "X86ISD::FP_TO_SINT_RND";
    case X86ISD::FP_TO_UINT_RND:     return "X86ISD::FP_TO_UINT_RND";
    case X86ISD::VFPCLASS:           return "X86ISD::VFPCLASS";
+  case X86ISD::VFPCLASSS:          return "X86ISD::VFPCLASSS";
    }
    return nullptr;
  }
@@ -20479,7 +20670,7 @@ bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const { return true; }
  
  bool
  X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
-  if (!(Subtarget->hasFMA() || Subtarget->hasFMA4() || Subtarget->hasAVX512()))
+  if (!Subtarget->hasAnyFMA())
      return false;
  
    VT = VT.getScalarType();
@@ -21996,6 +22187,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
      return EmitLoweredTLSCall(MI, BB);
    case X86::CMOV_FR32:
    case X86::CMOV_FR64:
+  case X86::CMOV_FR128:
    case X86::CMOV_GR8:
    case X86::CMOV_GR16:
    case X86::CMOV_GR32:
@@ -22335,7 +22527,7 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
    ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
    SDValue V1 = SVOp->getOperand(0);
    SDValue V2 = SVOp->getOperand(1);
-  EVT VT = SVOp->getValueType(0);
+  MVT VT = SVOp->getSimpleValueType(0);
    unsigned NumElems = VT.getVectorNumElements();
  
    if (V1.getOpcode() == ISD::CONCAT_VECTORS &&
@@ -23155,14 +23347,19 @@ static SDValue combineShuffleToAddSub(SDNode *N, SelectionDAG &DAG) {
      return SDValue();
  
    auto *SVN = cast<ShuffleVectorSDNode>(N);
-  ArrayRef<int> Mask = SVN->getMask();
+  SmallVector<int, 8> Mask;
+  for (int M : SVN->getMask())
+    Mask.push_back(M);
+
    SDValue V1 = N->getOperand(0);
    SDValue V2 = N->getOperand(1);
  
-  // We require the first shuffle operand to be the SUB node, and the second to
-  // be the ADD node.
-  // FIXME: We should support the commuted patterns.
-  if (V1->getOpcode() != ISD::FSUB || V2->getOpcode() != ISD::FADD)
+  // We require the first shuffle operand to be the FSUB node, and the second to
+  // be the FADD node. If they arrive swapped, commute the mask and operands.
+  if (V1.getOpcode() == ISD::FADD && V2.getOpcode() == ISD::FSUB) {
+    ShuffleVectorSDNode::commuteMask(Mask);
+    std::swap(V1, V2);
+  } else if (V1.getOpcode() != ISD::FSUB || V2.getOpcode() != ISD::FADD)
      return SDValue();
  
    // If there are other uses of these operations we can't fold them.
@@ -23213,7 +23410,7 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
        return AddSub;
  
    // Combine 256-bit vector shuffles. This is only profitable when in AVX mode
-  if (Subtarget->hasFp256() && VT.is256BitVector() &&
+  if (TLI.isTypeLegal(VT) && Subtarget->hasFp256() && VT.is256BitVector() &&
        N->getOpcode() == ISD::VECTOR_SHUFFLE)
      return PerformShuffleCombine256(N, DAG, DCI, Subtarget);
  
@@ -23408,9 +23605,9 @@ static SDValue PerformBITCASTCombine(SDNode *N, SelectionDAG &DAG,
    // store-load conversions.
    if (VT == MVT::x86mmx && N0.getOpcode() == ISD::BUILD_VECTOR &&
        N0.getValueType() == MVT::v2i32 &&
-      isa<ConstantSDNode>(N0.getOperand(1))) {
+      isNullConstant(N0.getOperand(1))) {
      SDValue N00 = N0->getOperand(0);
-    if (N0.getConstantOperandVal(1) == 0 && N00.getValueType() == MVT::i32)
+    if (N00.getValueType() == MVT::i32)
        return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(N00), VT, N00);
    }
  
@@ -23654,7 +23851,8 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
    // ignored in unsafe-math mode).
    // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
    if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
-      VT != MVT::f80 && (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
+      VT != MVT::f80 && VT != MVT::f128 &&
+      (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
        (Subtarget->hasSSE2() ||
         (Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) {
      ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
@@ -24211,12 +24409,9 @@ static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
           SetCC.getOpcode() == ISD::AND) {
      if (SetCC.getOpcode() == ISD::AND) {
        int OpIdx = -1;
-      ConstantSDNode *CS;
-      if ((CS = dyn_cast<ConstantSDNode>(SetCC.getOperand(0))) &&
-          CS->getZExtValue() == 1)
+      if (isOneConstant(SetCC.getOperand(0)))
          OpIdx = 1;
-      if ((CS = dyn_cast<ConstantSDNode>(SetCC.getOperand(1))) &&
-          CS->getZExtValue() == 1)
+      if (isOneConstant(SetCC.getOperand(1)))
          OpIdx = 0;
        if (OpIdx == -1)
          break;
@@ -24295,8 +24490,7 @@ static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
                                             X86::CondCode &CC1, SDValue &Flags,
                                             bool &isAnd) {
    if (Cond->getOpcode() == X86ISD::CMP) {
-    ConstantSDNode *CondOp1C = dyn_cast<ConstantSDNode>(Cond->getOperand(1));
-    if (!CondOp1C || !CondOp1C->isNullValue())
+    if (!isNullConstant(Cond->getOperand(1)))
        return false;
  
      Cond = Cond->getOperand(0);
@@ -24575,9 +24769,11 @@ static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
      MulAmt1 = 3;
      MulAmt2 = MulAmt / 3;
    }
+
+  SDLoc DL(N);
+  SDValue NewMul;
    if (MulAmt2 &&
        (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
-    SDLoc DL(N);
  
      if (isPowerOf2_64(MulAmt2) &&
          !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
@@ -24586,7 +24782,6 @@ static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
        // is an add.
        std::swap(MulAmt1, MulAmt2);
  
-    SDValue NewMul;
      if (isPowerOf2_64(MulAmt1))
        NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
                             DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
@@ -24600,10 +24795,31 @@ static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
      else
        NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
                             DAG.getConstant(MulAmt2, DL, VT));
+  }
  
+  if (!NewMul) {
+    assert(MulAmt != 0 && MulAmt != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX)
+           && "Both cases that could cause potential overflows should have "
+              "already been handled.");
+    if (isPowerOf2_64(MulAmt - 1))
+      // (mul x, 2^N + 1) => (add (shl x, N), x)
+      NewMul = DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
+                                DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
+                                DAG.getConstant(Log2_64(MulAmt - 1), DL,
+                                MVT::i8)));
+
+    else if (isPowerOf2_64(MulAmt + 1))
+      // (mul x, 2^N - 1) => (sub (shl x, N), x)
+      NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getNode(ISD::SHL, DL, VT,
+                                N->getOperand(0),
+                                DAG.getConstant(Log2_64(MulAmt + 1),
+                                DL, MVT::i8)), N->getOperand(0));
+  }
+
+  if (NewMul)
      // Do not add new nodes to DAG combiner worklist.
      DCI.CombineTo(N, NewMul, false);
-  }
+
    return SDValue();
  }
  
@@ -24692,7 +24908,7 @@ static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
        // the element size. The constant shift amount will be
        // encoded as a 8-bit immediate.
        if (ShiftAmt.trunc(8).uge(MaxAmount))
-        return getZeroVector(VT, Subtarget, DAG, DL);
+        return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, DL);
      }
  
    return SDValue();
@@ -25333,7 +25549,7 @@ static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
      return SDValue();
  
    // Make sure we are performing an xor against one.
-  if (!isa<ConstantSDNode>(N1) || !cast<ConstantSDNode>(N1)->isOne())
+  if (!isOneConstant(N1))
      return SDValue();
  
    // SetCC on x86 zero extends so only act on this if it's a logical shift.
@@ -25609,8 +25825,8 @@ static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG,
      SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
      for (unsigned i = 0; i != NumElems; ++i)
        ShuffleVec[i] = i * SizeRatio;
-    for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
-      ShuffleVec[i] = NumElems*SizeRatio;
+    for (unsigned i = NumElems; i != NumElems * SizeRatio; ++i)
+      ShuffleVec[i] = NumElems * SizeRatio;
      NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
                                     DAG.getConstant(0, dl, WideVecVT),
                                     &ShuffleVec[0]);
@@ -25691,8 +25907,8 @@ static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG,
           "WideVecVT should be legal");
  
    SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
-                                        DAG.getUNDEF(WideVecVT),
-                                        &ShuffleVec[0]);
+                                              DAG.getUNDEF(WideVecVT),
+                                              &ShuffleVec[0]);
  
    SDValue NewMask;
    SDValue Mask = Mst->getMask();
@@ -25724,8 +25940,9 @@ static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG,
      NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
    }
  
-  return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal, Mst->getBasePtr(),
-                            NewMask, StVT, Mst->getMemOperand(), false);
+  return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal,
+                            Mst->getBasePtr(), NewMask, StVT,
+                            Mst->getMemOperand(), false);
  }
  /// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
  static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
@@ -26153,29 +26370,70 @@ static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG,
  static SDValue PerformFNEGCombine(SDNode *N, SelectionDAG &DAG,
                                    const X86Subtarget *Subtarget) {
    EVT VT = N->getValueType(0);
+  EVT SVT = VT.getScalarType();
    SDValue Arg = N->getOperand(0);
+  SDLoc DL(N);
+
+  // Let legalize expand this if it isn't a legal type yet.
+  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
+    return SDValue();
+
+  // If we're negating a FMUL node on a target with FMA, then we can avoid the
+  // use of a constant by performing (-0 - A*B) instead.
+  // FIXME: Check rounding control flags as well once they become available.
+  if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
+      Arg->getFlags()->hasNoSignedZeros() && Subtarget->hasAnyFMA()) {
+    SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
+    return DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
+                       Arg.getOperand(1), Zero);
+  }
  
    // If we're negating a FMA node, then we can adjust the
    // instruction to include the extra negation.
    if (Arg.hasOneUse()) {
      switch (Arg.getOpcode()) {
-      case X86ISD::FMADD:
-        return DAG.getNode(X86ISD::FNMSUB, SDLoc(N), VT, Arg.getOperand(0),
-                           Arg.getOperand(1), Arg.getOperand(2));
-      case X86ISD::FMSUB:
-        return DAG.getNode(X86ISD::FNMADD, SDLoc(N), VT, Arg.getOperand(0),
-                           Arg.getOperand(1), Arg.getOperand(2));
-      case X86ISD::FNMADD:
-        return DAG.getNode(X86ISD::FMSUB, SDLoc(N), VT, Arg.getOperand(0),
-                           Arg.getOperand(1), Arg.getOperand(2));
-      case X86ISD::FNMSUB:
-        return DAG.getNode(X86ISD::FMADD, SDLoc(N), VT, Arg.getOperand(0),
-                           Arg.getOperand(1), Arg.getOperand(2));
+    case X86ISD::FMADD:
+      return DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
+                         Arg.getOperand(1), Arg.getOperand(2));
+    case X86ISD::FMSUB:
+      return DAG.getNode(X86ISD::FNMADD, DL, VT, Arg.getOperand(0),
+                         Arg.getOperand(1), Arg.getOperand(2));
+    case X86ISD::FNMADD:
+      return DAG.getNode(X86ISD::FMSUB, DL, VT, Arg.getOperand(0),
+                         Arg.getOperand(1), Arg.getOperand(2));
+    case X86ISD::FNMSUB:
+      return DAG.getNode(X86ISD::FMADD, DL, VT, Arg.getOperand(0),
+                         Arg.getOperand(1), Arg.getOperand(2));
      }
    }
    return SDValue();
  }
  
+static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
+                              const X86Subtarget *Subtarget) {
+  EVT VT = N->getValueType(0);
+  if (VT.is512BitVector() && !Subtarget->hasDQI()) {
+    // VXORPS, VORPS, VANDPS, VANDNPS are supported only under DQ extension.
+    // These logic operations may be executed in the integer domain.
+    SDLoc dl(N);
+    MVT IntScalar = MVT::getIntegerVT(VT.getScalarSizeInBits());
+    MVT IntVT = MVT::getVectorVT(IntScalar, VT.getVectorNumElements());
+
+    SDValue Op0 = DAG.getNode(ISD::BITCAST, dl, IntVT, N->getOperand(0));
+    SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, IntVT, N->getOperand(1));
+    unsigned IntOpcode = 0;
+    switch (N->getOpcode()) {
+      default: llvm_unreachable("Unexpected FP logic op");
+      case X86ISD::FOR: IntOpcode = ISD::OR; break;
+      case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
+      case X86ISD::FAND: IntOpcode = ISD::AND; break;
+      case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
+    }
+    SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
+    return  DAG.getNode(ISD::BITCAST, dl, VT, IntOp);
+  }
+  return SDValue();
+}
  /// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
  static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG,
                                   const X86Subtarget *Subtarget) {
@@ -26191,19 +26449,7 @@ static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG,
      if (C->getValueAPF().isPosZero())
        return N->getOperand(0);
  
-  EVT VT = N->getValueType(0);
-  if (VT.is512BitVector() && !Subtarget->hasDQI()) {
-    SDLoc dl(N);
-    MVT IntScalar = MVT::getIntegerVT(VT.getScalarSizeInBits());
-    MVT IntVT = MVT::getVectorVT(IntScalar, VT.getVectorNumElements());
-
-    SDValue Op0 = DAG.getNode(ISD::BITCAST, dl, IntVT, N->getOperand(0));
-    SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, IntVT, N->getOperand(1));
-    unsigned IntOpcode = (N->getOpcode() == X86ISD::FOR) ? ISD::OR : ISD::XOR;
-    SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
-    return  DAG.getNode(ISD::BITCAST, dl, VT, IntOp);
-  }
-  return SDValue();
+  return lowerX86FPLogicOp(N, DAG, Subtarget);
  }
  
  /// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
@@ -26228,7 +26474,8 @@ static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) {
  }
  
  /// Do target-specific dag combines on X86ISD::FAND nodes.
-static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
+static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG,
+                                  const X86Subtarget *Subtarget) {
    // FAND(0.0, x) -> 0.0
    if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
      if (C->getValueAPF().isPosZero())
@@ -26239,11 +26486,12 @@ static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
      if (C->getValueAPF().isPosZero())
        return N->getOperand(1);
  
-  return SDValue();
+  return lowerX86FPLogicOp(N, DAG, Subtarget);
  }
  
  /// Do target-specific dag combines on X86ISD::FANDN nodes
-static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG) {
+static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG,
+                                   const X86Subtarget *Subtarget) {
    // FANDN(0.0, x) -> x
    if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
      if (C->getValueAPF().isPosZero())
@@ -26254,7 +26502,7 @@ static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG) {
      if (C->getValueAPF().isPosZero())
        return N->getOperand(1);
  
-  return SDValue();
+  return lowerX86FPLogicOp(N, DAG, Subtarget);
  }
  
  static SDValue PerformBTCombine(SDNode *N,
@@ -26485,9 +26733,7 @@ static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG,
      return SDValue();
  
    EVT ScalarVT = VT.getScalarType();
-  if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
-      (!Subtarget->hasFMA() && !Subtarget->hasFMA4() &&
-       !Subtarget->hasAVX512()))
+  if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget->hasAnyFMA())
      return SDValue();
  
    SDValue A = N->getOperand(0);
@@ -26532,8 +26778,7 @@ static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG,
        N0.getOperand(0).hasOneUse()) {
      SDValue N00 = N0.getOperand(0);
      if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
-      ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
-      if (!C || C->getZExtValue() != 1)
+      if (!isOneConstant(N0.getOperand(1)))
          return SDValue();
        return DAG.getNode(ISD::AND, dl, VT,
                           DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
@@ -26586,21 +26831,19 @@ static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG,
    SDLoc DL(N);
  
    if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB)
-    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(LHS.getOperand(0)))
-      if (C->getAPIntValue() == 0 && LHS.hasOneUse()) {
-        SDValue addV = DAG.getNode(ISD::ADD, DL, LHS.getValueType(), RHS,
-                                   LHS.getOperand(1));
-        return DAG.getSetCC(DL, N->getValueType(0), addV,
-                            DAG.getConstant(0, DL, addV.getValueType()), CC);
-      }
+    if (isNullConstant(LHS.getOperand(0)) && LHS.hasOneUse()) {
+      SDValue addV = DAG.getNode(ISD::ADD, DL, LHS.getValueType(), RHS,
+                                 LHS.getOperand(1));
+      return DAG.getSetCC(DL, N->getValueType(0), addV,
+                          DAG.getConstant(0, DL, addV.getValueType()), CC);
+    }
    if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB)
-    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS.getOperand(0)))
-      if (C->getAPIntValue() == 0 && RHS.hasOneUse()) {
-        SDValue addV = DAG.getNode(ISD::ADD, DL, RHS.getValueType(), LHS,
-                                   RHS.getOperand(1));
-        return DAG.getSetCC(DL, N->getValueType(0), addV,
-                            DAG.getConstant(0, DL, addV.getValueType()), CC);
-      }
+    if (isNullConstant(RHS.getOperand(0)) && RHS.hasOneUse()) {
+      SDValue addV = DAG.getNode(ISD::ADD, DL, RHS.getValueType(), LHS,
+                                 RHS.getOperand(1));
+      return DAG.getSetCC(DL, N->getValueType(0), addV,
+                          DAG.getConstant(0, DL, addV.getValueType()), CC);
+    }
  
    if (VT.getScalarType() == MVT::i1 &&
        (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
@@ -27020,8 +27263,7 @@ static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG,
        V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) {
      SDValue ExtractedV = V.getOperand(0);
      SDValue OrigV = ExtractedV.getOperand(0);
-    if (auto *ExtractIdx = dyn_cast<ConstantSDNode>(ExtractedV.getOperand(1)))
-      if (ExtractIdx->getZExtValue() == 0) {
+    if (isNullConstant(ExtractedV.getOperand(1))) {
          MVT OrigVT = OrigV.getSimpleValueType();
          // Extract a subvector if necessary...
          if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) {
@@ -27076,8 +27318,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
    case X86ISD::FOR:         return PerformFORCombine(N, DAG, Subtarget);
    case X86ISD::FMIN:
    case X86ISD::FMAX:        return PerformFMinFMaxCombine(N, DAG);
-  case X86ISD::FAND:        return PerformFANDCombine(N, DAG);
-  case X86ISD::FANDN:       return PerformFANDNCombine(N, DAG);
+  case X86ISD::FAND:        return PerformFANDCombine(N, DAG, Subtarget);
+  case X86ISD::FANDN:       return PerformFANDNCombine(N, DAG, Subtarget);
    case X86ISD::BT:          return PerformBTCombine(N, DAG, DCI);
    case X86ISD::VZEXT_MOVL:  return PerformVZEXT_MOVLCombine(N, DAG);
    case ISD::ANY_EXTEND:
@@ -27735,6 +27977,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
        case MVT::f64:
        case MVT::i64:
          return std::make_pair(0U, &X86::FR64RegClass);
+      // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
        // Vector types.
        case MVT::v16i8:
        case MVT::v8i16:
@@ -27847,6 +28090,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
      // target independent register mapper will just pick the first match it can
      // find, ignoring the required type.
  
+    // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
      if (VT == MVT::f32 || VT == MVT::i32)
        Res.second = &X86::FR32RegClass;
      else if (VT == MVT::f64 || VT == MVT::i64)