Revert "Revert "Mark vastart_save_xmm_regs as changing EFLAGS""

[oota-llvm.git] / lib / Target / X86 / X86ISelLowering.cpp
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index 23910a350e86066130f2a3f43bc36a36b104e71b..c780c14abbc3c19b8fa74c43da5767698af22027 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -180,7 +180,7 @@ static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
    const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
    bool is64Bit = Subtarget->is64Bit();
  
-  if (Subtarget->isTargetEnvMacho()) {
+  if (Subtarget->isTargetMacho()) {
      if (is64Bit)
        return new X86_64MachoTargetObjectFile();
      return new TargetLoweringObjectFileMachO();
@@ -190,7 +190,7 @@ static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
      return new X86LinuxTargetObjectFile();
    if (Subtarget->isTargetELF())
      return new TargetLoweringObjectFileELF();
-  if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho())
+  if (Subtarget->isTargetCOFF())
      return new TargetLoweringObjectFileCOFF();
    llvm_unreachable("unknown subtarget type");
  }
@@ -632,7 +632,7 @@ void X86TargetLowering::resetOperationActions() {
    setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
    setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
  
-  if (Subtarget->isOSWindows() && !Subtarget->isTargetEnvMacho())
+  if (Subtarget->isOSWindows() && !Subtarget->isTargetMacho())
      setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
                         MVT::i64 : MVT::i32, Custom);
    else if (TM.Options.EnableSegmentedStacks)
@@ -1306,9 +1306,13 @@ void X86TargetLowering::resetOperationActions() {
      addRegisterClass(MVT::v8i64,  &X86::VR512RegClass);
      addRegisterClass(MVT::v8f64,  &X86::VR512RegClass);
  
+    addRegisterClass(MVT::i1,     &X86::VK1RegClass);
      addRegisterClass(MVT::v8i1,   &X86::VK8RegClass);
      addRegisterClass(MVT::v16i1,  &X86::VK16RegClass);
  
+    setOperationAction(ISD::BR_CC,              MVT::i1,    Expand);
+    setOperationAction(ISD::SETCC,              MVT::i1,    Custom);
+    setOperationAction(ISD::XOR,                MVT::i1,    Legal);
      setLoadExtAction(ISD::EXTLOAD,              MVT::v8f32, Legal);
      setOperationAction(ISD::LOAD,               MVT::v16f32, Legal);
      setOperationAction(ISD::LOAD,               MVT::v8f64, Legal);
@@ -1352,7 +1356,7 @@ void X86TargetLowering::resetOperationActions() {
      setOperationAction(ISD::FP_ROUND,           MVT::v8f32, Legal);
      setOperationAction(ISD::FP_EXTEND,          MVT::v8f32, Legal);
  
-    setOperationAction(ISD::TRUNCATE,           MVT::i1, Legal);
+    setOperationAction(ISD::TRUNCATE,           MVT::i1, Custom);
      setOperationAction(ISD::TRUNCATE,           MVT::v16i8, Custom);
      setOperationAction(ISD::TRUNCATE,           MVT::v8i32, Custom);
      setOperationAction(ISD::TRUNCATE,           MVT::v8i1, Custom);
@@ -1370,12 +1374,15 @@ void X86TargetLowering::resetOperationActions() {
      setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16f32,  Custom);
      setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i32,  Custom);
      setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i1,    Custom);
+    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i1, Legal);
  
      setOperationAction(ISD::SETCC,              MVT::v16i1, Custom);
      setOperationAction(ISD::SETCC,              MVT::v8i1, Custom);
  
      setOperationAction(ISD::MUL,              MVT::v8i64, Custom);
  
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1,  Custom);
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom);
      setOperationAction(ISD::BUILD_VECTOR,       MVT::v8i1, Custom);
      setOperationAction(ISD::BUILD_VECTOR,       MVT::v16i1, Custom);
      setOperationAction(ISD::SELECT,             MVT::v8f64, Custom);
@@ -2175,7 +2182,6 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
  
    MachineFrameInfo *MFI = MF.getFrameInfo();
    bool Is64Bit = Subtarget->is64Bit();
-  bool IsWindows = Subtarget->isTargetWindows();
    bool IsWin64 = Subtarget->isCallingConvWin64(CallConv);
  
    assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
@@ -2222,6 +2228,8 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
          RC = &X86::VR128RegClass;
        else if (RegVT == MVT::x86mmx)
          RC = &X86::VR64RegClass;
+      else if (RegVT == MVT::i1)
+        RC = &X86::VK1RegClass;
        else if (RegVT == MVT::v8i1)
          RC = &X86::VK8RegClass;
        else if (RegVT == MVT::v16i1)
@@ -2420,7 +2428,8 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
    } else {
      FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
      // If this is an sret function, the return should pop the hidden pointer.
-    if (!Is64Bit && !IsTailCallConvention(CallConv) && !IsWindows &&
+    if (!Is64Bit && !IsTailCallConvention(CallConv) &&
+        !Subtarget->getTargetTriple().isOSMSVCRT() &&
          argsAreStructReturn(Ins) == StackStructReturn)
        FuncInfo->setBytesToPopOnReturn(4);
    }
@@ -2509,7 +2518,6 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
    MachineFunction &MF = DAG.getMachineFunction();
    bool Is64Bit        = Subtarget->is64Bit();
    bool IsWin64        = Subtarget->isCallingConvWin64(CallConv);
-  bool IsWindows      = Subtarget->isTargetWindows();
    StructReturnType SR = callIsStructReturn(Outs);
    bool IsSibcall      = false;
  
@@ -2665,15 +2673,21 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
        RegsToPass.push_back(std::make_pair(unsigned(X86::EBX),
                 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy())));
      } else {
-      // If we are tail calling a global or external symbol in GOT pic mode, we
-      // cannot use a direct jump, since that would make lazy dynamic linking
-      // impossible (see PR15086).  So pretend this is not a tail call, to
-      // prevent the optimization to a jump.
+      // If we are tail calling and generating PIC/GOT style code load the
+      // address of the callee into ECX. The value in ecx is used as target of
+      // the tail jump. This is done to circumvent the ebx/callee-saved problem
+      // for tail calls on PIC/GOT architectures. Normally we would just put the
+      // address of GOT into ebx and then call target@PLT. But for tail calls
+      // ebx would be restored (since ebx is callee saved) before jumping to the
+      // target@PLT.
+
+      // Note: The actual moving to ECX is done further down.
        GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
-      if ((G && !G->getGlobal()->hasHiddenVisibility() &&
-          !G->getGlobal()->hasProtectedVisibility()) ||
-          isa<ExternalSymbolSDNode>(Callee))
-        isTailCall = false;
+      if (G && !G->getGlobal()->hasHiddenVisibility() &&
+          !G->getGlobal()->hasProtectedVisibility())
+        Callee = LowerGlobalAddress(Callee, DAG);
+      else if (isa<ExternalSymbolSDNode>(Callee))
+        Callee = LowerExternalSymbol(Callee, DAG);
      }
    }
  
@@ -2897,7 +2911,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
    if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
                         getTargetMachine().Options.GuaranteedTailCallOpt))
      NumBytesForCalleeToPush = NumBytes;    // Callee pops everything
-  else if (!Is64Bit && !IsTailCallConvention(CallConv) && !IsWindows &&
+  else if (!Is64Bit && !IsTailCallConvention(CallConv) &&
+           !Subtarget->getTargetTriple().isOSMSVCRT() &&
             SR == StackStructReturn)
      // If this is a call to a struct-return function, the callee
      // pops the hidden struct pointer, so we have to push it back.
@@ -3086,9 +3101,13 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
    if (isCalleeStructRet || isCallerStructRet)
      return false;
  
-  // An stdcall caller is expected to clean up its arguments; the callee
-  // isn't going to do that.
-  if (!CCMatch && CallerCC == CallingConv::X86_StdCall)
+  // An stdcall/thiscall caller is expected to clean up its arguments; the
+  // callee isn't going to do that.
+  // FIXME: this is more restrictive than needed. We could produce a tailcall
+  // when the stack adjustment matches. For example, with a thiscall that takes
+  // only one argument.
+  if (!CCMatch && (CallerCC == CallingConv::X86_StdCall ||
+                   CallerCC == CallingConv::X86_ThisCall))
      return false;
  
    // Do not sibcall optimize vararg calls unless all arguments are passed via
@@ -3409,6 +3428,24 @@ bool X86::isCalleePop(CallingConv::ID CallingConv,
    }
  }
  
+/// \brief Return true if \p X86CC is an unsigned (or equality) condition code.
+static bool isX86CCUnsigned(unsigned X86CC) {
+  switch (X86CC) {
+  default: llvm_unreachable("Invalid integer condition!"); // any unlisted code
+  case X86::COND_E:     return true;  // equality is grouped with unsigned
+  case X86::COND_G:     return false; // signed comparisons
+  case X86::COND_GE:    return false;
+  case X86::COND_L:     return false;
+  case X86::COND_LE:    return false;
+  case X86::COND_NE:    return true;  // inequality is grouped with unsigned
+  case X86::COND_B:     return true;  // unsigned comparisons
+  case X86::COND_A:     return true;
+  case X86::COND_BE:    return true;
+  case X86::COND_AE:    return true;
+  }
+  llvm_unreachable("covered switch fell through?!"); // every case returns above
+}
+
  /// TranslateX86CC - do a one to one translation of a ISD::CondCode to the X86
  /// specific condition code, returning the condition code and the LHS/RHS of the
  /// comparison to make.
@@ -5389,7 +5426,8 @@ LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) {
  /// rather than undef via VZEXT_LOAD, but we do not detect that case today.
  /// There's even a handy isZeroNode for that purpose.
  static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
-                                        SDLoc &DL, SelectionDAG &DAG) {
+                                        SDLoc &DL, SelectionDAG &DAG,
+                                        bool isAfterLegalize) {
    EVT EltVT = VT.getVectorElementType();
    unsigned NumElems = Elts.size();
  
@@ -5425,7 +5463,13 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
    // load of the entire vector width starting at the base pointer.  If we found
    // consecutive loads for the low half, generate a vzext_load node.
    if (LastLoadedElt == NumElems - 1) {
+
+    if (isAfterLegalize &&
+        !DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, VT))
+      return SDValue();
+
      SDValue NewLd = SDValue();
+
      if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16)
        NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
                            LDBase->getPointerInfo(),
@@ -6069,7 +6113,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
        V[i] = Op.getOperand(i);
  
      // Check for elements which are consecutive loads.
-    SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG);
+    SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false);
      if (LD.getNode())
        return LD;
  
@@ -7641,6 +7685,39 @@ static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
    return SDValue();
  }
  
+/// Lower an EXTRACT_VECTOR_ELT whose result is i1, i.e. extract one bit from a
+/// mask vector such as v16i1 or v8i1 (AVX-512). Returns the replacement value.
+static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) {
+  SDValue Vec = Op.getOperand(0); // the mask vector being indexed
+  SDLoc dl(Vec);
+  MVT VecVT = Vec.getSimpleValueType();
+  SDValue Idx = Op.getOperand(1); // element index, constant or variable
+  MVT EltVT = Op.getSimpleValueType();
+
+  assert((EltVT == MVT::i1) && "Unexpected operands in ExtractBitFromMaskVector");
+
+  // A variable index can't be handled in mask registers: zero-extend the mask
+  // to a 512-bit vector (v8i64 or v16i32), extract there, and truncate to i1.
+  if (!isa<ConstantSDNode>(Idx)) {
+    MVT ExtVT = (VecVT == MVT::v8i1 ?  MVT::v8i64 : MVT::v16i32);
+    SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Vec);
+    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
+                              ExtVT.getVectorElementType(), Ext, Idx);
+    return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
+  }
+
+  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+  if (IdxVal) { // index 0 needs no shifting
+    unsigned MaxSift = VecVT.getSizeInBits() - 1; // sic: typo of "MaxShift"
+    Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
+                      DAG.getConstant(MaxSift - IdxVal, MVT::i8)); // bit -> top
+    Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec,
+                      DAG.getConstant(MaxSift, MVT::i8)); // top -> bit 0
+  }
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i1, Vec,
+                       DAG.getIntPtrConstant(0)); // requested bit is element 0
+}
+
  SDValue
  X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
                                             SelectionDAG &DAG) const {
@@ -7648,6 +7725,10 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
    SDValue Vec = Op.getOperand(0);
    MVT VecVT = Vec.getSimpleValueType();
    SDValue Idx = Op.getOperand(1);
+
+  if (Op.getSimpleValueType() == MVT::i1)
+    return ExtractBitFromMaskVector(Op, DAG);
+
    if (!isa<ConstantSDNode>(Idx)) {
      if (VecVT.is512BitVector() ||
          (VecVT.is256BitVector() && Subtarget->hasInt256() &&
@@ -8431,6 +8512,11 @@ SDValue X86TargetLowering::LowerShiftParts(SDValue Op, SelectionDAG &DAG) const{
    SDValue ShOpLo = Op.getOperand(0);
    SDValue ShOpHi = Op.getOperand(1);
    SDValue ShAmt  = Op.getOperand(2);
+  // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the
+  // generic ISD nodes do not. Insert an AND to be safe; it is optimized away
+  // during isel.
+  SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
+                                  DAG.getConstant(VTBits - 1, MVT::i8));
    SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
                                       DAG.getConstant(VTBits - 1, MVT::i8))
                         : DAG.getConstant(0, VT);
@@ -8438,12 +8524,15 @@ SDValue X86TargetLowering::LowerShiftParts(SDValue Op, SelectionDAG &DAG) const{
    SDValue Tmp2, Tmp3;
    if (Op.getOpcode() == ISD::SHL_PARTS) {
      Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
-    Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
+    Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);
    } else {
      Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
-    Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, ShAmt);
+    Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
    }
  
+  // If the shift amount is greater than or equal to the width of a part we
+  // can't rely on the results of shld/shrd. Insert a test and select the
+  // appropriate values for large shift amounts.
    SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
                                  DAG.getConstant(VTBits, MVT::i8));
    SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
@@ -8985,6 +9074,17 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
    MVT VT = Op.getSimpleValueType();
    SDValue In = Op.getOperand(0);
    MVT InVT = In.getSimpleValueType();
+
+  if (VT == MVT::i1) {
+    assert((InVT.isInteger() && (InVT.getSizeInBits() <= 64)) &&
+           "Invalid scalar TRUNCATE operation");
+    In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(1, InVT));
+    if (InVT.getSizeInBits() == 64)
+      In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::i32, In);
+    else if (InVT.getSizeInBits() < 32)
+      In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In);
+    return DAG.getNode(X86ISD::TRUNC, DL, VT, In);
+  }
    assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
           "Invalid TRUNCATE operation");
  
@@ -9645,13 +9745,30 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
  /// equivalent.
  SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
                                     SelectionDAG &DAG) const {
-  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1))
+  SDLoc dl(Op0);
+  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1)) {
      if (C->getAPIntValue() == 0)
        return EmitTest(Op0, X86CC, DAG);
  
-  SDLoc dl(Op0);
+     if (Op0.getValueType() == MVT::i1) {
+      Op0 = DAG.getNode(ISD::XOR, dl, MVT::i1, Op0, DAG.getConstant(-1, MVT::i1));
+      return DAG.getNode(X86ISD::CMP, dl, MVT::i1, Op0, Op0);
+     }
+  }
+ 
    if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
         Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
+    // Do the comparison at i32 if it's smaller. This avoids subregister
+    // aliasing issues. Keep the smaller reference if we're optimizing for
+    // size, however, as that'll allow better folding of memory operations.
+    if (Op0.getValueType() != MVT::i32 && Op0.getValueType() != MVT::i64 &&
+        !DAG.getMachineFunction().getFunction()->getAttributes().hasAttribute(
+             AttributeSet::FunctionIndex, Attribute::MinSize)) {
+      unsigned ExtendOp =
+          isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
+      Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0);
+      Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1);
+    }
      // Use SUB instead of CMP to enable CSE between SUB and CMP.
      SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
      SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs,
@@ -10074,7 +10191,8 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
  
    if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
  
-  assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
+  assert((VT == MVT::i8 || (Subtarget->hasAVX512() && VT == MVT::i1))
+         && "SetCC type must be 8-bit or 1-bit integer");
    SDValue Op0 = Op.getOperand(0);
    SDValue Op1 = Op.getOperand(1);
    SDLoc dl(Op);
@@ -10187,8 +10305,12 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
          cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);
  
      if (SSECC != 8) {
-      unsigned Opcode = VT == MVT::f32 ? X86ISD::FSETCCss : X86ISD::FSETCCsd;
-      SDValue Cmp = DAG.getNode(Opcode, DL, VT, CondOp0, CondOp1,
+      if (Subtarget->hasAVX512()) {
+        SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CondOp0, CondOp1,
+                                  DAG.getConstant(SSECC, MVT::i8));
+        return DAG.getNode(X86ISD::SELECT, DL, VT, Cmp, Op1, Op2);
+      }
+      SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
                                  DAG.getConstant(SSECC, MVT::i8));
        SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
        SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
@@ -10760,7 +10882,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
            getTargetMachine().Options.EnableSegmentedStacks) &&
           "This should be used only on Windows targets or when segmented stacks "
           "are being used");
-  assert(!Subtarget->isTargetEnvMacho() && "Not implemented");
+  assert(!Subtarget->isTargetMacho() && "Not implemented");
    SDLoc dl(Op);
  
    // Get the inputs.
@@ -11451,9 +11573,9 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
      SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test);
      return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
    }
-  case Intrinsic::x86_avx512_kortestz:
-  case Intrinsic::x86_avx512_kortestc: {
-    unsigned X86CC = (IntNo == Intrinsic::x86_avx512_kortestz)? X86::COND_E: X86::COND_B;
+  case Intrinsic::x86_avx512_kortestz_w:
+  case Intrinsic::x86_avx512_kortestc_w: {
+    unsigned X86CC = (IntNo == Intrinsic::x86_avx512_kortestz_w)? X86::COND_E: X86::COND_B;
      SDValue LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, Op.getOperand(1));
      SDValue RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, Op.getOperand(2));
      SDValue CC = DAG.getConstant(X86CC, MVT::i8);
@@ -13127,19 +13249,27 @@ SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
        // fall through
      case MVT::v4i32:
      case MVT::v8i16: {
-      // (sext (vzext x)) -> (vsext x)
        SDValue Op0 = Op.getOperand(0);
        SDValue Op00 = Op0.getOperand(0);
        SDValue Tmp1;
        // Hopefully, this VECTOR_SHUFFLE is just a VZEXT.
        if (Op0.getOpcode() == ISD::BITCAST &&
-          Op00.getOpcode() == ISD::VECTOR_SHUFFLE)
+          Op00.getOpcode() == ISD::VECTOR_SHUFFLE) {
+        // (sext (vzext x)) -> (vsext x)
          Tmp1 = LowerVectorIntExtend(Op00, Subtarget, DAG);
-      if (Tmp1.getNode()) {
-        SDValue Tmp1Op0 = Tmp1.getOperand(0);
-        assert(Tmp1Op0.getOpcode() == X86ISD::VZEXT &&
-               "This optimization is invalid without a VZEXT.");
-        return DAG.getNode(X86ISD::VSEXT, dl, VT, Tmp1Op0.getOperand(0));
+        if (Tmp1.getNode()) {
+          EVT ExtraEltVT = ExtraVT.getVectorElementType();
+          // This folding is only valid when the in-reg type is a vector of i8,
+          // i16, or i32.
+          if (ExtraEltVT == MVT::i8 || ExtraEltVT == MVT::i16 ||
+              ExtraEltVT == MVT::i32) {
+            SDValue Tmp1Op0 = Tmp1.getOperand(0);
+            assert(Tmp1Op0.getOpcode() == X86ISD::VZEXT &&
+                   "This optimization is invalid without a VZEXT.");
+            return DAG.getNode(X86ISD::VSEXT, dl, VT, Tmp1Op0.getOperand(0));
+          }
+          Op0 = Tmp1;
+        }
        }
  
        // If the above didn't work, then just use Shift-Left + Shift-Right.
@@ -13719,8 +13849,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
    case X86ISD::CMPMU:              return "X86ISD::CMPMU";
    case X86ISD::SETCC:              return "X86ISD::SETCC";
    case X86ISD::SETCC_CARRY:        return "X86ISD::SETCC_CARRY";
-  case X86ISD::FSETCCsd:           return "X86ISD::FSETCCsd";
-  case X86ISD::FSETCCss:           return "X86ISD::FSETCCss";
+  case X86ISD::FSETCC:             return "X86ISD::FSETCC";
    case X86ISD::CMOV:               return "X86ISD::CMOV";
    case X86ISD::BRCOND:             return "X86ISD::BRCOND";
    case X86ISD::RET_FLAG:           return "X86ISD::RET_FLAG";
@@ -13815,7 +13944,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
    case X86ISD::TESTP:              return "X86ISD::TESTP";
    case X86ISD::TESTM:              return "X86ISD::TESTM";
    case X86ISD::KORTEST:            return "X86ISD::KORTEST";
-  case X86ISD::KTEST:              return "X86ISD::KTEST";
    case X86ISD::PALIGNR:            return "X86ISD::PALIGNR";
    case X86ISD::PSHUFD:             return "X86ISD::PSHUFD";
    case X86ISD::PSHUFHW:            return "X86ISD::PSHUFHW";
@@ -15212,7 +15340,7 @@ X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
  
    unsigned MOVOpc = Subtarget->hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr;
    // In the XMM save block, save all the XMM argument registers.
-  for (int i = 3, e = MI->getNumOperands(); i != e; ++i) {
+  for (int i = 3, e = MI->getNumOperands() - 1; i != e; ++i) {
      int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
      MachineMemOperand *MMO =
        F->getMachineMemOperand(
@@ -15465,7 +15593,7 @@ X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI,
    const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
    DebugLoc DL = MI->getDebugLoc();
  
-  assert(!Subtarget->isTargetEnvMacho());
+  assert(!Subtarget->isTargetMacho());
  
    // The lowering is pretty easy: we're just emitting the call to _alloca.  The
    // non-trivial part is impdef of ESP.
@@ -15999,6 +16127,10 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
    case X86::EH_SjLj_LongJmp32:
    case X86::EH_SjLj_LongJmp64:
      return emitEHSjLjLongJmp(MI, BB);
+
+  case TargetOpcode::STACKMAP:
+  case TargetOpcode::PATCHPOINT:
+    return emitPatchPoint(MI, BB);
    }
  }
  
@@ -16254,7 +16386,7 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
    for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
      Elts.push_back(getShuffleScalarElt(N, i, DAG, 0));
  
-  return EltsFromConsecutiveLoads(VT, Elts, dl, DAG);
+  return EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true);
  }
  
  /// PerformTruncateCombine - Converts truncate operation to
@@ -16361,44 +16493,6 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
                       EltNo);
  }
  
-/// Extract one bit from mask vector, like v16i1 or v8i1.
-/// AVX-512 feature.
-static SDValue ExtractBitFromMaskVector(SDNode *N, SelectionDAG &DAG) {
-  SDValue Vec = N->getOperand(0);
-  SDLoc dl(Vec);
-  MVT VecVT = Vec.getSimpleValueType();
-  SDValue Idx = N->getOperand(1);
-  MVT EltVT = N->getSimpleValueType(0);
-
-  assert((VecVT.getVectorElementType() == MVT::i1 && EltVT == MVT::i8) ||
-         "Unexpected operands in ExtractBitFromMaskVector");
-
-  // variable index
-  if (!isa<ConstantSDNode>(Idx)) {
-    MVT ExtVT = (VecVT == MVT::v8i1 ?  MVT::v8i64 : MVT::v16i32);
-    SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Vec);
-    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
-                              ExtVT.getVectorElementType(), Ext);
-    return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
-  }
-
-  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
-
-  MVT ScalarVT = MVT::getIntegerVT(VecVT.getSizeInBits());
-  unsigned MaxShift = VecVT.getSizeInBits() - 1;
-  Vec = DAG.getNode(ISD::BITCAST, dl, ScalarVT, Vec);
-  Vec = DAG.getNode(ISD::SHL, dl, ScalarVT, Vec,
-              DAG.getConstant(MaxShift - IdxVal, ScalarVT));
-  Vec = DAG.getNode(ISD::SRL, dl, ScalarVT, Vec,
-    DAG.getConstant(MaxShift, ScalarVT));
-
-  if (VecVT == MVT::v16i1) {
-    Vec = DAG.getNode(ISD::BITCAST, dl, MVT::i16, Vec);
-    return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Vec);
-  }
-  return DAG.getNode(ISD::BITCAST, dl, MVT::i8, Vec);
-}
-
  /// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index
  /// generation and convert it from being a bunch of shuffles and extracts
  /// to a simple store and scalar loads to extract the elements.
@@ -16410,10 +16504,6 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
  
    SDValue InputVector = N->getOperand(0);
  
-  if (InputVector.getValueType().getVectorElementType() == MVT::i1 &&
-      !DCI.isBeforeLegalize())
-    return ExtractBitFromMaskVector(N, DAG);
-
    // Detect whether we are trying to convert from mmx to i32 and the bitcast
    // from mmx to v2i32 has a single usage.
    if (InputVector.getNode()->getOpcode() == llvm::ISD::BITCAST &&
@@ -16959,12 +17049,13 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
    // Simplify vector selection if the selector will be produced by CMPP*/PCMP*.
    if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
        // Check if SETCC has already been promoted
-      TLI.getSetCCResultType(*DAG.getContext(), VT) == Cond.getValueType()) {
+      TLI.getSetCCResultType(*DAG.getContext(), VT) == CondVT &&
+      // Check that condition value type matches vselect operand type
+      CondVT == VT) { 
  
      assert(Cond.getValueType().isVector() &&
             "vector select expects a vector selector!");
  
-    EVT IntVT = Cond.getValueType();
      bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
      bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
  
@@ -16979,7 +17070,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
          ISD::CondCode NewCC =
            ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
                                 Cond.getOperand(0).getValueType().isInteger());
-        Cond = DAG.getSetCC(DL, IntVT, Cond.getOperand(0), Cond.getOperand(1), NewCC);
+        Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1), NewCC);
          std::swap(LHS, RHS);
          TValIsAllOnes = FValIsAllOnes;
          FValIsAllZeros = TValIsAllZeros;
@@ -16992,11 +17083,11 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
        if (TValIsAllOnes && FValIsAllZeros)
          Ret = Cond;
        else if (TValIsAllOnes)
-        Ret = DAG.getNode(ISD::OR, DL, IntVT, Cond,
-                          DAG.getNode(ISD::BITCAST, DL, IntVT, RHS));
+        Ret = DAG.getNode(ISD::OR, DL, CondVT, Cond,
+                          DAG.getNode(ISD::BITCAST, DL, CondVT, RHS));
        else if (FValIsAllZeros)
-        Ret = DAG.getNode(ISD::AND, DL, IntVT, Cond,
-                          DAG.getNode(ISD::BITCAST, DL, IntVT, LHS));
+        Ret = DAG.getNode(ISD::AND, DL, CondVT, Cond,
+                          DAG.getNode(ISD::BITCAST, DL, CondVT, LHS));
  
        return DAG.getNode(ISD::BITCAST, DL, VT, Ret);
      }
@@ -17014,6 +17105,15 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
      if (BitWidth == 1)
        return SDValue();
  
+    // Check all uses of the condition operand to see whether it will be
+    // consumed by non-BLEND instructions, which may depend on all bits being
+    // set properly.
+    for (SDNode::use_iterator I = Cond->use_begin(),
+                              E = Cond->use_end(); I != E; ++I)
+      if (I->getOpcode() != ISD::VSELECT)
+        // TODO: Add other opcodes eventually lowered into BLEND.
+        return SDValue();
+
      assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
      APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1);
  
@@ -17547,17 +17647,16 @@ static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG,
          if ((cc0 == X86::COND_E  && cc1 == X86::COND_NP) ||
              (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
            bool is64BitFP = (CMP00.getValueType() == MVT::f64);
-          X86ISD::NodeType NTOperator = is64BitFP ?
-            X86ISD::FSETCCsd : X86ISD::FSETCCss;
            // FIXME: need symbolic constants for these magic numbers.
            // See X86ATTInstPrinter.cpp:printSSECC().
            unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
-          SDValue OnesOrZeroesF = DAG.getNode(NTOperator, DL, MVT::f32, CMP00, CMP01,
+          SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL, CMP00.getValueType(), CMP00, CMP01,
                                                DAG.getConstant(x86cc, MVT::i8));
-          SDValue OnesOrZeroesI = DAG.getNode(ISD::BITCAST, DL, MVT::i32,
+          MVT IntVT = (is64BitFP ? MVT::i64 : MVT::i32); 
+          SDValue OnesOrZeroesI = DAG.getNode(ISD::BITCAST, DL, IntVT,
                                                OnesOrZeroesF);
-          SDValue ANDed = DAG.getNode(ISD::AND, DL, MVT::i32, OnesOrZeroesI,
-                                      DAG.getConstant(1, MVT::i32));
+          SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
+                                      DAG.getConstant(1, IntVT));
            SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ANDed);
            return OneBitOfTruth;
          }
@@ -17895,12 +17994,12 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
    MachineFunction &MF = DAG.getMachineFunction();
    bool OptForSize = MF.getFunction()->getAttributes().
      hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
- 
-  // SHLD/SHRD instructions have lower register pressure, but on some 
-  // platforms they have higher latency than the equivalent 
-  // series of shifts/or that would otherwise be generated. 
+
+  // SHLD/SHRD instructions have lower register pressure, but on some
+  // platforms they have higher latency than the equivalent
+  // series of shifts/or that would otherwise be generated.
    // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
-  // have higer latencies and we are not optimizing for size.  
+  // have higher latencies and we are not optimizing for size.
    if (!OptForSize && Subtarget->isSHLDSlow())
      return SDValue();